def test_tokenizing_transducer(self):
    """A tokenizing transducer matches the plain transducer on a lone word
    and maps that word identically when embedded in surrounding context."""
    ref_word_ipa = g2p.make_g2p("mic", "mic-ipa")("sq").output_string
    tok_transducer = g2p.make_g2p("mic", "mic-ipa", tok_lang="mic")
    # Single word: identical output to the non-tokenizing transducer.
    self.assertEqual(tok_transducer("sq").output_string, ref_word_ipa)
    # Word in context: only the token itself is mapped.
    self.assertEqual(
        tok_transducer(self.contextualize("sq")).output_string,
        self.contextualize(ref_word_ipa),
    )
def test_check_ipa(self):
    """check() accepts well-formed IPA output and rejects invalid characters."""
    fra_to_ipa = make_g2p("fra", "fra-ipa")
    self.assertTrue(fra_to_ipa.check(fra_to_ipa("ceci")))
    self.assertFalse(fra_to_ipa.check(fra_to_ipa("ñ")))
    # Same invalid input, asking check() to display its warnings.
    self.assertFalse(fra_to_ipa.check(fra_to_ipa("ñ"), display_warnings=True))
    self.assertTrue(fra_to_ipa.check(fra_to_ipa("ceci est un test été à")))
    # An IPA-to-IPA transducer rejects invalid input too.
    ipa_to_ipa = make_g2p("fra-ipa", "eng-ipa")
    self.assertFalse(ipa_to_ipa.check(ipa_to_ipa("ñ")))
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False):
    """Create a mapping from one side of *mapping*'s inventory to a minimal
    "dummy" phone inventory.

    Args:
        mapping: source Mapping whose inventory will be aligned.
        io: which side of the mapping to align, 'in' or 'out'.
        write_to_file: when True, write the generated config and mapping to file.

    Returns:
        (config, mapping): the generated config and the list of
        {"in": ..., "out": ...} rules targeting the dummy inventory.
    """
    dummy_inventory = ["ɑ", "i", "u", "t", "s", "n"]
    display_name = mapping.kwargs.get('language_name', 'No Language display name in Config')
    config = generate_config(mapping.kwargs[f'{io}_lang'], 'dummy', display_name, display_name)
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        # Already IPA: align the inventory directly.
        mapping = align_inventories(mapping.inventory(io), dummy_inventory)
    else:
        # Non-IPA inventory: transliterate each entry to IPA via the "und" mapping.
        und_g2p = make_g2p('und', 'und-ipa')
        # Bug fix: store the transduced string (.output_string), not the
        # TransductionGraph object the transducer returns.
        mapping = [{"in": unicode_escape(x),
                    "out": und_g2p(unidecode(x).lower()).output_string}
                   for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], dummy_inventory)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                # Logger.warn is a deprecated alias; use warning().
                LOGGER.warning(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char
    if write_to_file:
        write_generated_mapping_to_file(config, mapping)
    return config, mapping
def test_tiered_composition(self):
    """Indices compose correctly through a three-step dan -> eng-arpabet cascade."""
    transducer = make_g2p("dan", "eng-arpabet")
    result = transducer("hej")
    self.assertEqual(result.output_string, "HH EH Y")
    expected_edges = [
        [(0, 0), (1, 1), (2, 2)],
        [(0, 0), (1, 1), (2, 2)],
        [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (2, 6)],
    ]
    self.assertEqual(result.edges, expected_edges)
    expected_pretty = [
        [["h", "h"], ["e", "ɛ"], ["j", "j"]],
        [["h", "h"], ["ɛ", "ɛ"], ["j", "j"]],
        [
            ["h", "H"],
            ["h", "H"],
            ["h", " "],
            ["ɛ", "E"],
            ["ɛ", "H"],
            ["ɛ", " "],
            ["j", "Y"],
        ],
    ]
    self.assertEqual(result.pretty_edges(), expected_pretty)
    # Composing the tiers yields the end-to-end character alignment.
    self.assertEqual(compose_tiers(result.edges), [(0, 2), (1, 5), (2, 6)])
def get(self):
    """Handle a g2p conversion request.

    Aborts with 400 when no conversion path exists and 404 when a mapping
    file cannot be found.
    """
    args = self.parser.parse_args()
    try:
        transducer = make_g2p(args['in-lang'], args['out-lang'])
        tg = transducer(args['text'])
        return {
            'input-text': tg.input_string,
            'output-text': tg.output_string,
            # Indices and debugger output are only included when requested;
            # otherwise the caller's (falsy) flag values are echoed back.
            'index': tg.edges if args['index'] else args['index'],
            'debugger': tg.debugger if args['debugger'] else args['debugger'],
        }
    except NetworkXNoPath:
        abort(400)
    except FileNotFoundError:
        abort(404)
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False, out_dir: str = ''):
    """Create a Mapping from one side of *mapping*'s inventory to the
    minimalist dummy inventory.

    Args:
        mapping: source Mapping whose inventory will be aligned.
        io: which side of the mapping to align, 'in' or 'out'.
        write_to_file: when True, write the resulting config and mapping files.
        out_dir: target directory for the files; the default location is used
            when empty or not a directory.

    Returns:
        Mapping: the new mapping targeting the dummy inventory.
    """
    config = {'in_lang': mapping.kwargs[f'{io}_lang'], 'out_lang': 'dummy'}
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        # Already IPA: align the inventory directly.
        mapping = align_inventories(mapping.inventory(io), DUMMY_INVENTORY)
    else:
        # Non-IPA inventory: transliterate each entry to IPA via the "und" mapping.
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower()).output_string}
                   for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], DUMMY_INVENTORY)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                # Logger.warn is a deprecated alias; use warning().
                LOGGER.warning(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char
    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir and os.path.isdir(out_dir):
            mapping.config_to_file(out_dir)
            mapping.mapping_to_file(out_dir)
        else:
            if out_dir:
                # Fix: previously this branch only logged "Writing to default
                # instead" and then wrote nothing; now it actually falls back.
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
            mapping.config_to_file()
            mapping.mapping_to_file()
    return mapping
def test_composition_with_none(self):
    """Indices compose correctly when an input character is deleted (maps to None)."""
    transducer = make_g2p("ctp", "eng-arpabet")
    result = transducer("Qne\u1D2C")
    self.assertEqual(result.output_string, "HH N EY")
    # Tier 1 deletes the superscript ᴬ: its edge target is None.
    expected_edges = [
        [(0, 0), (1, 1), (2, 2), (3, None)],
        [(0, 0), (1, 1), (2, 2), (2, 3)],
        [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (2, 5), (3, 6)],
    ]
    self.assertEqual(result.edges, expected_edges)
    expected_pretty = [
        [["q", "ʔ"], ["n", "n"], ["e", "e"], ["ᴬ", None]],
        [["ʔ", "ʔ"], ["n", "n"], ["e", "e"], ["e", "ː"]],
        [
            ["ʔ", "H"],
            ["ʔ", "H"],
            ["ʔ", " "],
            ["n", "N"],
            ["n", " "],
            ["e", "E"],
            ["ː", "Y"],
        ],
    ]
    self.assertEqual(result.pretty_edges(), expected_pretty)
    self.assertEqual(compose_tiers(result.edges), [(0, 2), (1, 4), (2, 6), (3, 6)])
def test_tiered_composition(self):
    """Indices mapped through a more complex, three-step composition"""
    transducer = make_g2p("dan", "eng-arpabet")
    result = transducer("hej")
    # Note: this variant of the mapping emits a trailing space.
    self.assertEqual(result.output_string, "HH EH Y ")
    expected_edges = [
        [(0, 0), (1, 1), (2, 2)],
        [(0, 0), (1, 1), (2, 2)],
        [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (2, 6), (2, 7)],
    ]
    self.assertEqual(result.edges, expected_edges)
    expected_pretty = [
        [["h", "h"], ["e", "ɛ"], ["j", "j"]],
        [["h", "h"], ["ɛ", "ɛ"], ["j", "j"]],
        [
            ["h", "H"],
            ["h", "H"],
            ["h", " "],
            ["ɛ", "E"],
            ["ɛ", "H"],
            ["ɛ", " "],
            ["j", "Y"],
            ["j", " "],
        ],
    ]
    self.assertEqual(result.pretty_edges(), expected_pretty)
def test_tokenizing_transducer_edges(self):
    """Edges of a tokenized conversion are offset correctly per token."""
    transducer = g2p.make_g2p("fra", "fra-ipa", tok_lang="fra")
    # est -> ɛ, so edges are (0, 0), (1, 0), (2, 0) for each "est", plus the
    # space to the space, and the second set of edges being offset
    expected = [(0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 2), (6, 2)]
    self.assertEqual(transducer("est est").edges, expected)
def test_tok_and_map_mic(self):
    """tokenize_and_map yields the same mapping in context as in isolation."""
    mic_transducer = g2p.make_g2p("mic", "mic-ipa")
    mic_tokenizer = g2p.make_tokenizer("mic")
    isolated_ipa = mic_transducer("sq").output_string
    in_context = g2p.tokenize_and_map(
        mic_tokenizer, mic_transducer, self.contextualize("sq")
    )
    self.assertEqual(in_context, self.contextualize(isolated_ipa))
def convert(in_lang, out_lang, input_text):
    ''' Convert any text '''
    # When INPUT_TEXT names an existing *txt file, convert its contents instead.
    if os.path.exists(input_text) and input_text.endswith('txt'):
        with open(input_text, encoding='utf8') as f:
            input_text = f.read()
    click.echo(make_g2p(in_lang, out_lang)(input_text))
def test_fra(self):
    """Composed, incremented tiers for a French word mapped to ARPABET."""
    tg = make_g2p("fra", "eng-arpabet")("mais")
    self.assertEqual(tg.output_string, "M EH")
    self.assertEqual(
        compose_tiers(increment_tiers(tg.edges)),
        [(1, 2), (2, 4), (3, 4), (4, 4)],
    )
def test_check_with_equiv(self):
    """Intermediate IPA and final ARPABET outputs are all valid for tau input."""
    sentence = "sh'oo Jign maasee' do'eent'aa shyyyh"
    transducer = make_g2p("tau", "eng-arpabet", tok_lang="tau")
    # Each step of the cascade should produce well-formed output.
    tau_ipa = make_g2p("tau", "tau-ipa", tok_lang="tau")(sentence).output_string
    self.assertTrue(utils.is_panphon(tau_ipa))
    eng_ipa = make_g2p("tau", "eng-ipa", tok_lang="tau")(sentence).output_string
    self.assertTrue(utils.is_panphon(eng_ipa))
    eng_arpabet = make_g2p("tau", "eng-arpabet", tok_lang="tau")(sentence).output_string
    self.assertTrue(utils.is_arpabet(eng_arpabet))
    LOGGER.warning(
        f"tau-ipa {tau_ipa}\neng-ipa {eng_ipa}\n eng-arpabet {eng_arpabet}"
    )
    self.assertTrue(transducer.check(transducer(sentence)))
def get_transducer(input_language: str, output_language: str):
    """Return a transducer between the two languages.

    Raises:
        exceptions.CorrespondenceMissing: when either language code is empty
            (the exception payload is normalized to '').
    """
    if not input_language:
        raise exceptions.CorrespondenceMissing('')
    if not output_language:
        raise exceptions.CorrespondenceMissing('')
    return make_g2p(input_language, output_language)
def test_tokenizing_transducer_edge_spaces(self):
    """Leading/trailing spaces and punctuation keep correct edges through tokenization."""
    transducer = g2p.make_g2p("fra", "eng-arpabet", tok_lang="fra")
    expected = [
        # " a, " -> " a, "
        [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)],
        # " a, " -> " ɑ, "
        [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)],
        # " ɑ, " -> " AA , "
        [(0, 0), (1, 1), (2, 2), (2, 3), (2, 4), (3, 5), (4, 6)],
    ]
    self.assertEqual(transducer(" a, ").edges, expected)
def test_io(self):
    """Run every (in_lang, out_lang, input, expected) case declared in setUp.

    Instead of asserting immediately, all failures are collected and logged
    first — so running test_langs.py prints every mapping error at once —
    then assertEqual is called on the first failure so unittest registers it.
    """
    failures = []
    for test in self.langs_to_test:
        actual = make_g2p(test[0], test[1])(test[2]).output_string
        if actual != test[3]:
            LOGGER.warning("test_langs.py: mapping error: {} from {} to {} should be {}, got {}".format(test[2], test[0], test[1], test[3], actual))
            failures.append(test)
    if failures:
        first = failures[0]
        self.assertEqual(make_g2p(first[0], first[1])(first[2]).output_string, first[3])
def test_tokenizing_transducer_edge_chain(self):
    """Edges chain correctly through each tier of a tokenized composite conversion."""
    transducer = g2p.make_g2p("fra", "eng-arpabet", tok_lang="fra")
    expected = [
        # "est est" -> "ɛ ɛ"
        [(0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 2), (6, 2)],
        # "ɛ ɛ" -> "ɛ ɛ"
        [(0, 0), (1, 1), (2, 2)],
        # "ɛ ɛ" -> "EH EH "
        [(0, 0), (0, 1), (0, 2), (1, 3), (2, 4), (2, 5), (2, 6)],
    ]
    self.assertEqual(transducer("est est").edges, expected)
def test_io(self):
    """Each declared sample maps from its in_lang to the expected output string."""
    # go through each language declared in the test case set up
    for lang in self.langs_to_test:
        transducer = make_g2p(lang['in_lang'], lang['out_lang'])
        # go through each table in the current lang
        for sample in lang['samples']:
            # Fix: compare the transduced string via .output_string — the
            # transducer returns a TransductionGraph, which would never
            # compare equal to the expected string (consistent with the
            # other tests in this file).
            self.assertEqual(transducer(sample[0]).output_string, sample[1])
def test_check_tokenizing_transducer(self):
    """check() flags invalid IPA even through a tokenizing transducer."""
    tok_transducer = make_g2p("fra", "fra-ipa", tok_lang="fra")

    def run_check(text):
        return tok_transducer.check(tok_transducer(text))

    self.assertTrue(run_check("ceci est un test été à"))
    self.assertFalse(run_check("ñ oǹ"))
    # Punctuation between tokens does not invalidate the output.
    self.assertTrue(run_check("ceci, cela; c'est tokenizé: alors c'est bon!"))
    self.assertFalse(run_check("mais... c'est ñoñ, si du texte ne passe pas!"))
def test_tok_and_map_fra(self):
    """ Chaining tests: tokenize and map a string """
    fra_transducer = g2p.make_g2p("fra", "fra-ipa")
    fra_tokenizer = g2p.make_tokenizer("fra")
    # "teste" in isolation is at string and word end and beginning
    isolated_ipa = fra_transducer("teste").output_string
    # "teste" followed by space or punctuation should be mapped to the same string
    in_context = g2p.tokenize_and_map(
        fra_tokenizer, fra_transducer, self.contextualize("teste")
    )
    self.assertEqual(in_context, self.contextualize(isolated_ipa))
def convert_word(word: str, lang: str):
    """Convert one individual word through the specified cascade of g2p mappings.

    Args:
        word (str): input word to map through g2p
        lang (str): the language code to use to attempt the g2p mapping

    Returns:
        g2p_text (str), valid(bool):
            - g2p_text is the word mapping from lang to output_orthography
            - valid is a flag indicating whether g2p conversion yielded valid
              output, which includes making sure IPA output was valid IPA and
              ARPABET output was valid ARPABET, at all intermediate steps as
              well as in the final output.
    """
    if lang == "eng":
        # Hack to use old English LexiconG2P
        # Note: adding eng_ prefix to vars that are used in both blocks to make mypy
        # happy. Since the two sides of the if are in the same scope, it complains
        # about type checking otherwise.
        assert output_orthography == "eng-arpabet"
        eng_converter = getLexiconG2P(
            os.path.join(os.path.dirname(LEXICON_PATH), "cmu_sphinx.metadata.json")
        )
        try:
            eng_text, _ = eng_converter.convert(word)
            eng_valid = is_arpabet(eng_text)
        except KeyError as e:
            # Word not in the lexicon: fall back to the raw word, marked invalid.
            if verbose_warnings:
                LOGGER.warning(f'Could not g2p "{word}" as English: {e.args[0]}')
            eng_text = word
            eng_valid = False
        return eng_text, eng_valid
    else:
        try:
            converter = make_g2p(lang, output_orthography)
        except InvalidLanguageCode as e:
            raise ValueError(
                f'Could not g2p "{word}" as "{lang}": invalid language code. '
                f"Use one of {getLangs()[0]}"
            ) from e
        except NoPath as e:
            # Fixed typo in the user-facing message: "Count not" -> "Could not".
            raise ValueError(
                f'Could not g2p "{word}" as "{lang}": no path to "{output_orthography}". '
                f"Use one of {getLangs()[0]}"
            ) from e
        tg = converter(word)
        text = tg.output_string.strip()
        # Shallow check first; on failure, re-run deep with warnings for the user.
        valid = converter.check(tg, shallow=True)
        if not valid and verbose_warnings:
            converter.check(tg, shallow=False, display_warnings=verbose_warnings)
        return text, valid
def test_check_tokenizing_composite_transducer(self):
    """check() works through a tokenized composite (multi-hop) transducer."""
    composite = make_g2p("fra", "eng-arpabet", tok_lang="fra")

    def run_check(text, **kwargs):
        return composite.check(composite(text), **kwargs)

    self.assertTrue(run_check("ceci est un test été à"))
    self.assertFalse(run_check("ñ oǹ"))
    self.assertTrue(run_check("ceci, cela; c'est tokenizé: alors c'est bon!"))
    self.assertFalse(run_check("mais... c'est ñoñ, si du texte ne passe pas!"))
    # Same invalid input, with warnings displayed.
    self.assertFalse(
        run_check("mais... c'est ñoñ, si du texte ne passe pas!", display_warnings=True)
    )
def change_table(message):
    """ Change the lookup table """
    # 'custom' on either side means an editable, empty mapping set.
    if message['in_lang'] == 'custom' or message['out_lang'] == 'custom':
        mappings = Mapping(return_empty_mappings())
    else:
        transducer = make_g2p(message['in_lang'], message['out_lang'])
        if isinstance(transducer, Transducer):
            # Single-step conversion: one mapping.
            mappings = [transducer.mapping]
        elif isinstance(transducer, CompositeTransducer):
            # Multi-step conversion: one mapping per intermediate transducer.
            mappings = [x.mapping for x in transducer._transducers]
        else:
            # NOTE(review): 'mappings' is left unbound here, so the emit below
            # would raise NameError for an unexpected transducer type —
            # confirm whether this branch is actually reachable.
            pass
    # Send each mapping's rules, abbreviations and config back to the client.
    emit('table response', [{
        'mappings': x.plain_mapping(),
        'abbs': expand_abbreviations(x.abbreviations),
        'kwargs': x.kwargs
    } for x in mappings])
def convert_words(xml, word_unit="w", output_orthography="eng-arpabet"):
    """Convert the text of each *word_unit* element in *xml* through g2p,
    rewriting the element text and alignment indices in place.

    Args:
        xml: parsed XML tree whose word elements will be converted.
        word_unit (str): tag name of the elements holding words to convert.
        output_orthography (str): target orthography of the g2p cascade.

    Returns:
        The same xml tree with converted word text.
    """
    for word in xml.xpath(".//" + word_unit):
        # only convert text within words
        same_language_units = get_same_language_units(word)
        if not same_language_units:
            # NOTE(review): this bare return aborts the whole conversion and
            # yields None instead of xml; 'continue' (skip just this word)
            # looks like what was intended — confirm against callers.
            return
        all_text = ""
        all_indices = []
        for unit in same_language_units:
            # Hack to use old English LexiconG2P
            if unit["lang"] != "eng":
                converter = make_g2p(unit["lang"], output_orthography)
                tg = converter(unit["text"])
                text = tg.output_string
                indices = tg.edges
            else:
                # English uses the lexicon-based converter; tg stays False so
                # the isinstance checks below fall through to the plain branch.
                tg = False
                converter = LexiconG2P(
                    os.path.join(
                        os.path.dirname(LEXICON_PATH), "cmu_sphinx.metadata.json"
                    )
                )
                text, indices = converter.convert(unit["text"])
            all_text += text
            all_indices += indices
        if tg and isinstance(tg, CompositeTransductionGraph):
            # Multi-step conversion: compose the tiered index lists.
            norm_form = converter._transducers[0].norm_form
            indices = increment_tiers(indices)
            all_indices = compose_tiers(indices)
        elif tg and isinstance(tg, TransductionGraph):
            norm_form = converter.norm_form
            indices = increment_indices(indices)
            all_indices = compose_indices([], indices)
        else:
            norm_form = None
            all_indices = indices
        if norm_form:
            # Normalize the stored text the same way the mapping did, so the
            # computed indices line up with it.
            word.text = ud.normalize(norm_form, word.text)
        replace_text_in_node(word, all_text, all_indices)
    return xml
def convert(in_lang, out_lang, input_text, path, debugger):
    '''Convert INPUT_TEXT through g2p mapping(s) from IN_LANG to OUT_LANG.

    Visit http://g2p-studio.herokuapp.com/api/v1/langs for a list of languages.

    There must be a path from IN_LANG to OUT_LANG, possibly via some
    intermediates. For example, mapping from fra to eng-arpabet will
    successively apply fra->fra-ipa, fra-ipa->eng-ipa and eng-ipa->eng-arpabet.
    '''
    # Validate the requested conversion before doing any work.
    if in_lang == out_lang:
        raise click.UsageError(
            "Values must be different for 'IN_LANG' and 'OUT_LANG'")
    if in_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{in_lang}' is not a valid value for 'IN_LANG'")
    if out_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{out_lang}' is not a valid value for 'OUT_LANG'")
    if not has_path(LANGS_NETWORK, in_lang, out_lang):
        raise click.UsageError(
            f"Path between '{in_lang}' and '{out_lang}' does not exist")
    # When INPUT_TEXT names an existing *txt file, convert its contents.
    if os.path.exists(input_text) and input_text.endswith('txt'):
        with open(input_text, encoding='utf8') as f:
            input_text = f.read()
    if in_lang and out_lang:
        transducer = make_g2p(in_lang, out_lang)
    elif path:
        # Fall back to an explicit mapping file.
        transducer = Transducer(Mapping(path))
    tg = transducer(input_text)
    if debugger:
        PRINTER.pprint([tg.output_string, tg.edges, tg.debugger])
    else:
        click.echo(tg.output_string)
def get(self):
    """Handle a g2p conversion request.

    Aborts with 400 when no conversion path exists and 404 for an invalid
    language code.
    """
    args = self.parser.parse_args()
    try:
        transducer = make_g2p(args["in-lang"], args["out-lang"])
        tg = transducer(args["text"])
        return {
            "input-text": tg.input_string,
            "output-text": tg.output_string,
            # Indices and debugger output are included only when requested;
            # otherwise the caller's (falsy) flag values are echoed back.
            "index": tg.edges if args["index"] else args["index"],
            "debugger": tg.debugger if args["debugger"] else args["debugger"],
        }
    except NoPath:
        abort(400)
    except InvalidLanguageCode:
        abort(404)
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', distance: str = "weighted_feature_edit_distance"):
    """Create a mapping from mapping's output inventory to a minimalist dummy inventory"""
    config = {'in_lang': mapping.kwargs[f'{io}_lang'], 'out_lang': 'dummy'}
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        # Already IPA: align the inventory directly against the dummy one.
        rules = align_inventories(mapping.inventory(io), DUMMY_INVENTORY,
                                  distance=distance)
    else:
        # Non-IPA inventory: transliterate each entry to IPA via the "und" mapping.
        und_g2p = make_g2p('und', 'und-ipa')
        rules = [
            {"in": unicode_escape(item),
             "out": und_g2p(unidecode(item).lower()).output_string}
            for item in mapping.inventory(io)
        ]
        aligned = align_inventories([rule['out'] for rule in rules],
                                    DUMMY_INVENTORY, distance=distance)
        # Lookup from each transliterated form to its dummy-inventory match.
        lookup = {entry['in']: entry['out'] for entry in aligned if entry['in']}
        for rule in rules:
            if rule['out'] in lookup:
                rule['out'] = lookup[rule['out']]
            else:
                LOGGER.warning(
                    f"We couldn't guess at what {rule['in']} means, so it's being "
                    f"replaced with '{default_char}' instead.")
                rule['out'] = default_char
    config['mapping'] = rules
    return Mapping(**config)
def test_valid_transducer(self):
    """make_g2p for atj -> atj-ipa yields a plain Transducer that converts correctly."""
    atj_transducer = make_g2p('atj', 'atj-ipa')
    self.assertTrue(isinstance(atj_transducer, Transducer))
    self.assertEqual('niɡiɡw', atj_transducer('nikikw').output_string)
def test_no_path(self):
    """Requesting a conversion with no path between the languages raises."""
    with self.assertRaises(NetworkXNoPath):
        make_g2p('hei', 'git')
def test_not_found(self):
    """Unknown language codes raise FileNotFoundError on either side."""
    for bad_in, bad_out in (('foo', 'eng-ipa'), ('git', 'bar')):
        with self.assertRaises(FileNotFoundError):
            make_g2p(bad_in, bad_out)