Esempio n. 1
0
 def test_tiered_composition(self):
     """Check the three alignment tiers and their composition for "hej".

     Runs the "dan" -> "eng-arpabet" transducer on "hej" and verifies the
     output string, the raw edges of each tier, the character-level view
     of those edges, and the single alignment produced by compose_tiers.
     """
     expected_edges = [
         [(0, 0), (1, 1), (2, 2)],
         [(0, 0), (1, 1), (2, 2)],
         [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (2, 6)],
     ]
     expected_pretty = [
         [["h", "h"], ["e", "ɛ"], ["j", "j"]],
         [["h", "h"], ["ɛ", "ɛ"], ["j", "j"]],
         [
             ["h", "H"],
             ["h", "H"],
             ["h", " "],
             ["ɛ", "E"],
             ["ɛ", "H"],
             ["ɛ", " "],
             ["j", "Y"],
         ],
     ]
     expected_composed = [(0, 2), (1, 5), (2, 6)]

     graph = make_g2p("dan", "eng-arpabet")("hej")

     self.assertEqual(graph.output_string, "HH EH Y")
     self.assertEqual(graph.edges, expected_edges)
     self.assertEqual(graph.pretty_edges(), expected_pretty)
     self.assertEqual(compose_tiers(graph.edges), expected_composed)
Esempio n. 2
0
 def test_composition_with_none(self):
     """Check tier composition when one first-tier edge has a None target.

     Runs the "ctp" -> "eng-arpabet" transducer on "Qne" followed by
     U+1D2C, whose first-tier edge is expected to be (3, None), and
     verifies the output string, the per-tier edges, their character-level
     view and the composed alignment.
     """
     expected_edges = [
         [(0, 0), (1, 1), (2, 2), (3, None)],
         [(0, 0), (1, 1), (2, 2), (2, 3)],
         [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (2, 5), (3, 6)],
     ]
     expected_pretty = [
         [["q", "ʔ"], ["n", "n"], ["e", "e"], ["ᴬ", None]],
         [["ʔ", "ʔ"], ["n", "n"], ["e", "e"], ["e", "ː"]],
         [
             ["ʔ", "H"],
             ["ʔ", "H"],
             ["ʔ", " "],
             ["n", "N"],
             ["n", " "],
             ["e", "E"],
             ["ː", "Y"],
         ],
     ]
     expected_composed = [(0, 2), (1, 4), (2, 6), (3, 6)]

     graph = make_g2p("ctp", "eng-arpabet")("Qne\u1D2C")

     self.assertEqual(graph.output_string, "HH N EY")
     self.assertEqual(graph.edges, expected_edges)
     self.assertEqual(graph.pretty_edges(), expected_pretty)
     self.assertEqual(compose_tiers(graph.edges), expected_composed)
Esempio n. 3
0
 def test_fra(self):
     """Check the composed alignment of incremented tiers for "mais".

     Runs the "fra" -> "eng-arpabet" transducer on "mais", then passes the
     edges through increment_tiers before composing them.
     """
     graph = make_g2p("fra", "eng-arpabet")("mais")
     self.assertEqual(graph.output_string, "M EH")

     shifted_tiers = increment_tiers(graph.edges)
     expected_composed = [(1, 2), (2, 4), (3, 4), (4, 4)]
     self.assertEqual(compose_tiers(shifted_tiers), expected_composed)
Esempio n. 4
0
def convert_words(xml, word_unit="w", output_orthography="eng-arpabet"):
    """Rewrite the text of every ``word_unit`` element of ``xml`` in place.

    For each element matched by ``.//<word_unit>``, the units returned by
    ``get_same_language_units`` are converted one at a time, the converted
    strings are concatenated, and the result is written back to the element
    with ``replace_text_in_node``.  Units whose ``lang`` is ``"eng"`` are
    converted with ``LexiconG2P`` (``output_orthography`` is not consulted
    for them); every other unit is converted with
    ``make_g2p(unit["lang"], output_orthography)``.

    Args:
        xml: tree object exposing ``xpath``; its word elements are modified
            in place.
        word_unit: tag name of the elements to convert.
        output_orthography: name of the target mapping handed to
            ``make_g2p``.

    Returns:
        The same ``xml`` object that was passed in.
    """
    for word in xml.xpath(".//" + word_unit):
        # only convert text within words
        same_language_units = get_same_language_units(word)
        if not same_language_units:
            # Nothing to convert in this word: skip it and keep going.  A
            # bare ``return`` here would abandon every remaining word and
            # hand ``None`` back instead of the document.
            continue

        converted_parts = []
        tg = None
        for unit in same_language_units:
            # Hack to use old English LexiconG2P
            if unit["lang"] != "eng":
                converter = make_g2p(unit["lang"], output_orthography)
                tg = converter(unit["text"])
                text = tg.output_string
                indices = tg.edges
            else:
                # No transduction graph on the lexicon path.
                tg = None
                converter = LexiconG2P(
                    os.path.join(
                        os.path.dirname(LEXICON_PATH), "cmu_sphinx.metadata.json"
                    )
                )
                text, indices = converter.convert(unit["text"])
            converted_parts.append(text)
        all_text = "".join(converted_parts)

        # NOTE(review): ``tg``, ``converter`` and ``indices`` hold the values
        # of the LAST unit only, so for a word made of several units the
        # alignment below covers just that final unit while ``all_text``
        # covers all of them.  Confirm whether multi-unit words occur.
        if tg and isinstance(tg, CompositeTransductionGraph):
            norm_form = converter._transducers[0].norm_form
            all_indices = compose_tiers(increment_tiers(indices))
        elif tg and isinstance(tg, TransductionGraph):
            norm_form = converter.norm_form
            all_indices = compose_indices([], increment_indices(indices))
        else:
            norm_form = None
            all_indices = indices

        if norm_form:
            # Normalize the source text with the converter's own form before
            # the replacement is applied.
            word.text = ud.normalize(norm_form, word.text)
        replace_text_in_node(word, all_text, all_indices)
    return xml