Example #1
    def add_word_children(element):
        """Recursive helper for tokenize_xml_in_place()"""
        tag = etree.QName(element.tag).localname
        nsmap = (
            element.nsmap if hasattr(element, "nsmap") else element.getroot().nsmap
        )
        if tag in ["w", "teiHeader", "head"]:
            # don't do anything to existing words!
            new_element = deepcopy(element)
            new_element.tail = ""  # just take off their .tail so that it's not doubled
            return new_element  # as the calling method tends to it

        if is_do_not_align(element):
            # skip elements marked do-not-align="true"
            new_element = deepcopy(element)
            new_element.tail = ""  # don't add spurious whitespace
            return new_element

        new_element = etree.Element(element.tag, nsmap=nsmap)
        for key, value in element.attrib.items():
            new_element.attrib[key] = value

        lang = get_lang_attrib(element)
        tokenizer = get_tokenizer(lang)
        if element.text:
            new_element.text = ""
            for unit in tokenizer.tokenize_text(element.text):
                if unit["is_word"]:
                    new_child_element = etree.Element("w", nsmap=nsmap)
                    new_child_element.text = unit["text"]
                    new_element.append(new_child_element)
                    continue
                if len(new_element):  # lxml's getchildren() is deprecated
                    if not new_element[-1].tail:
                        new_element[-1].tail = ""
                    new_element[-1].tail += unit["text"]
                    continue
                new_element.text += unit["text"]

        for child in element:
            # Comments Cause Crashes so Copy them Cleanly
            if child.tag is etree.Comment:
                new_element.append(child)
                continue
            new_child_element = add_word_children(child)
            new_element.append(new_child_element)
            if child.tail:
                # new_element.tail = ''  # in case it's a copy
                for unit in tokenizer.tokenize_text(child.tail):
                    if unit["is_word"]:
                        new_child_element = etree.Element("w")
                        new_child_element.text = unit["text"]
                        new_element.append(new_child_element)
                        continue
                    if not new_element[-1].tail:
                        new_element[-1].tail = ""
                    new_element[-1].tail += unit["text"]

        return new_element
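A minimal driver sketch (hypothetical: it assumes add_word_children() is lifted to module scope along with deepcopy, lxml's etree, and the get_lang_attrib/get_tokenizer/is_do_not_align helpers it calls):

from lxml import etree

xml = etree.fromstring('<s xml:lang="eng">hello world</s>')
tokenized = add_word_children(xml)
print(etree.tostring(tokenized, encoding="unicode"))
# expected shape: <s xml:lang="eng"><w>hello</w> <w>world</w></s>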
Example #2
from unicodedata import normalize

def add_lang_ids_to_element(element, lang_identifier):
    """Set the lang attribute on element to the most likely language of its
    text content, unless element (or an ancestor) already declares one."""
    if get_lang_attrib(element):
        return
    text = "".join(text for _, text in iterate_over_text(element))
    text = normalize("NFD", text)
    lang_ids = lang_identifier.identify_text(text)
    lang_id = lang_ids[0][0]  # for now just take the most likely
    set_lang_attrib(element, lang_id)
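Any object whose identify_text(text) method returns (lang, score) pairs sorted by likelihood will do; a stub makes the contract explicit (hypothetical, assuming the helpers above are in scope):

from lxml import etree

class FixedIdentifier:
    """Stand-in identifier that always answers French."""
    def identify_text(self, text):
        return [("fra", 1.0)]

element = etree.fromstring("<p>bonjour le monde</p>")
add_lang_ids_to_element(element, FixedIdentifier())
# set_lang_attrib() is expected to have written "fra" onto the element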
Example #3
def iterate_over_text(element):
    """Yield (lang, text) pairs for every text fragment under element,
    depth-first, pairing each fragment with the language in effect there."""
    lang = get_lang_attrib(element)
    if element.text:
        yield (lang, element.text)
    for child in element:
        yield from iterate_over_text(child)
        if child.tail:
            yield (lang, child.tail)
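For example (a minimal sketch, assuming get_lang_attrib is in scope; note that a child's tail text belongs to the parent's language):

from lxml import etree

xml = etree.fromstring(
    '<p xml:lang="fra">bonjour <s xml:lang="eng">world</s> le monde</p>'
)
print(list(iterate_over_text(xml)))
# expected: [('fra', 'bonjour '), ('eng', 'world'), ('fra', ' le monde')]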
Example #4
def convert_words(  # noqa: C901
    xml, word_unit="w", output_orthography="eng-arpabet", verbose_warnings=False,
):
    """Helper for convert_xml(), with the same Args and Return values, except
    xml is modified in place returned itself, instead of making a copy.
    """

    # Defer the expensive g2p imports so they happen only if and when needed
    from g2p.mappings.langs.utils import is_arpabet

    try:
        # g2p > 0.5.20211029 uses its own exceptions for make_g2p errors
        from g2p import InvalidLanguageCode, NoPath, make_g2p
    except ImportError:
        # g2p <= 0.5.20211029 used NetworkXNoPath and FileNotFoundError
        from g2p import NetworkXNoPath as NoPath
        from g2p import make_g2p

        InvalidLanguageCode = FileNotFoundError

    # Tuck this function inside convert_words(), to share common arguments and imports
    def convert_word(word: str, lang: str):
        """Convert one individual word through the specified cascade of g2p mappings.

        Args:
            word (str): input word to map through g2p
            lang (str): the language code to use to attempt the g2p mapping

        Returns:
            g2p_text (str), valid(bool):
              - g2p_text is the word mapping from lang to output_orthography
              - valid is a flag indicating whether g2p conversion yielded valid
                output, which includes making sure IPA output was valid IPA and
                ARPABET output was valid ARPABET, at all intermediate steps as
                well as in the final output.
        """

        if lang == "eng":
            # Hack to use old English LexiconG2P
            # Note: the eng_ prefix on vars used in both branches keeps mypy
            # happy; since the two sides of the if are in the same scope, it
            # complains about the types otherwise.
            assert output_orthography == "eng-arpabet"
            eng_converter = getLexiconG2P(
                os.path.join(os.path.dirname(LEXICON_PATH), "cmu_sphinx.metadata.json")
            )
            try:
                eng_text, _ = eng_converter.convert(word)
                eng_valid = is_arpabet(eng_text)
            except KeyError as e:
                if verbose_warnings:
                    LOGGER.warning(f'Could not g2p "{word}" as English: {e.args[0]}')
                eng_text = word
                eng_valid = False
            return eng_text, eng_valid
        else:
            try:
                converter = make_g2p(lang, output_orthography)
            except InvalidLanguageCode as e:
                raise ValueError(
                    f'Could not g2p "{word}" as "{lang}": invalid language code. '
                    f"Use one of {getLangs()[0]}"
                ) from e
            except NoPath as e:
                raise ValueError(
                    f'Could not g2p "{word}" as "{lang}": no path to "{output_orthography}". '
                    f"Use one of {getLangs()[0]}"
                ) from e
            tg = converter(word)
            text = tg.output_string.strip()
            valid = converter.check(tg, shallow=True)
            if not valid and verbose_warnings:
                converter.check(tg, shallow=False, display_warnings=verbose_warnings)
            return text, valid

    all_g2p_valid = True
    for word in xml.xpath(".//" + word_unit):
        # if the word was already g2p'd, skip and keep existing ARPABET representation
        if "ARPABET" in word.attrib:
            arpabet = word.attrib["ARPABET"]
            if not is_arpabet(arpabet):
                LOGGER.warning(
                    f'Pre-g2p\'d text "{word.text}" has invalid ARPABET conversion "{arpabet}"'
                )
                all_g2p_valid = False
            continue
        # only convert text within words
        if not word.text:
            continue
        g2p_lang = get_lang_attrib(word) or "und"  # default: Undetermined
        g2p_fallbacks = get_attrib_recursive(word, "fallback-langs")
        text_to_g2p = word.text
        try:
            g2p_text, valid = convert_word(text_to_g2p, g2p_lang.strip())
            if not valid:
                # This is where we apply the g2p cascade
                for lang in re.split(r"[,:]", g2p_fallbacks) if g2p_fallbacks else []:
                    LOGGER.warning(
                        f'Could not g2p "{text_to_g2p}" as {g2p_lang}. '
                        f"Trying fallback: {lang}."
                    )
                    g2p_lang = lang.strip()
                    g2p_text, valid = convert_word(text_to_g2p, g2p_lang)
                    if valid:
                        word.attrib["effective-g2p-lang"] = g2p_lang
                        break
                else:
                    all_g2p_valid = False
                    LOGGER.warning(
                        f'No valid g2p conversion found for "{text_to_g2p}". '
                        f"Check its orthography and language code, "
                        f"or pick suitable g2p fallback languages."
                    )

            # Save the g2p_text from the last conversion attempt, even when
            # it's not valid, so it's in the g2p output if the user wants to
            # inspect it manually.
            word.attrib["ARPABET"] = g2p_text

        except ValueError as e:
            LOGGER.warning(
                f'Could not g2p "{text_to_g2p}" due to an incorrect '
                f'"xml:lang", "lang" or "fallback-langs" attribute in the XML: {e}'
            )
            all_g2p_valid = False

    return xml, all_g2p_valid
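A usage sketch (hypothetical; it assumes the module's helpers are in scope, and follows the attribute conventions visible above: lang/xml:lang selects the g2p language, and fallback-langs lists comma- or colon-separated fallbacks tried in order when a conversion fails validation):

from lxml import etree

xml = etree.fromstring(
    '<TEI><text xml:lang="fra" fallback-langs="und">'
    "<p><w>bonjour</w><w>monde</w></p></text></TEI>"
)
xml, ok = convert_words(xml, word_unit="w", output_orthography="eng-arpabet")
for w in xml.xpath(".//w"):
    # each word now carries its (possibly invalid) ARPABET conversion
    print(w.text, w.attrib.get("ARPABET"), w.attrib.get("effective-g2p-lang"))
print("all conversions valid:", ok)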
Example #5
    def test_get_attrib_recursive(self):
        """get_attrib_recursive() and get_lang_attrib() find the nearest
        matching attribute up the ancestor chain."""
        raw_xml = """<TEI>
            <text lang="text">
            <p lang="p1"><s>stuff</s><s lang="p1s2">nonsense</s></p>
            <p><s lang="p2s1">stuff</s><s>nonsense</s></p>
            </text>
            <text>
            <p xml:lang="p3"><s lang="p3s1">stuff</s><s>nonsense<s lang="p3p2c">!</s></s></p>
            </text>
            <text>
            <p><s xml:lang="p4s1" lang="not:xml:lang">stuff</s><s>nonsense<s xml:lang="p4p2c">!</s></s></p>
            </text>
            </TEI>
        """
        xml = etree.fromstring(raw_xml)
        for i, s, lang in zip(
            itertools.count(),
            xml.xpath(".//s"),
            (
                "p1",
                "p1s2",
                "p2s1",
                "text",
                "p3s1",
                None,
                "p3p2c",
                "not:xml:lang",
                None,
                None,
            ),
        ):
            self.assertEqual(
                get_attrib_recursive(s, "lang"),
                lang,
                f"expected lang={lang} for {etree.tostring(s)} (i={i})",
            )

        for i, s, get_lang in zip(
            itertools.count(),
            xml.xpath(".//s"),
            (
                "p1",
                "p1s2",
                "p2s1",
                "text",
                "p3s1",
                "p3",
                "p3p2c",
                "p4s1",
                None,
                "p4p2c",
            ),
        ):
            self.assertEqual(
                get_lang_attrib(s),
                get_lang,
                f"expected get_lang={get_lang} for {etree.tostring(s)} (i={i})",
            )

        for i, s, xml_lang in zip(
            itertools.count(),
            xml.xpath(".//s"),
            (None, None, None, None, "p3", "p3", "p3", "p4s1", None, "p4p2c"),
        ):
            self.assertEqual(
                get_attrib_recursive(s, "xml:lang"),
                xml_lang,
                f"expected xml:lang={xml_lang} for {etree.tostring(s)} (i={i})",
            )
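The expected values pin down the lookup semantics precisely enough to reconstruct the helpers; here is a minimal sketch consistent with every assertion above (the real implementation may differ in details):

XML_NS = "{http://www.w3.org/XML/1998/namespace}"

def get_attrib_recursive(element, *attribs):
    """Return the first of attribs found on element or its nearest ancestor."""
    for attrib in attribs:
        # lxml exposes xml:lang in Clark notation under the xml namespace
        key = XML_NS + attrib[4:] if attrib.startswith("xml:") else attrib
        value = element.attrib.get(key)
        if value is not None:
            return value
    parent = element.getparent()
    return get_attrib_recursive(parent, *attribs) if parent is not None else None

def get_lang_attrib(element):
    # xml:lang takes precedence over plain lang at each level of the search
    return get_attrib_recursive(element, "xml:lang", "lang")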