def test_as_is(self):
    """With as_is=True, rules apply in written order instead of longest-first."""
    plain = Mapping([{'in': 'a', "out": 'b'}, {'in': 'aa', 'out': 'c'}])
    literal = Mapping([{'in': 'a', "out": 'b'}, {'in': 'aa', 'out': 'c'}], as_is=True)
    t_plain = Transducer(plain)
    t_literal = Transducer(literal)
    # Default ordering matches the longer 'aa' rule first.
    self.assertEqual(t_plain('aa'), 'c')
    # as_is applies 'a' -> 'b' to each character in turn.
    self.assertEqual(t_literal('aa'), 'bb')
def test_mapping(self):
    """Check align_to_dummy_fallback output for both the 'in' and 'out' sides,
    for a non-IPA out_lang (transliteration path) and an IPA out_lang.

    NOTE: the expected dicts compare `re.compile(...)` pattern objects; this
    works because the `re` module caches compiled patterns, so equal pattern
    strings yield the identical object and default identity-equality passes.
    """
    # Non-IPA target ('test-out'); includes duplicate inputs ('g', 'i') and
    # characters with combining marks / Arabic script.
    mapping = Mapping([{'in': 'a', 'out': 'æ'},
                       {'in': 'e', 'out': 'ɐ'},
                       {'in': 'i', 'out': 'ɑ̃'},
                       {'in': 'b', 'out': 'β'},
                       {'in': 'g', 'out': 'ɡ'},
                       {'in': 'g', 'out': 'g'},
                       {'in': 'i', 'out': 'ةُ'}],
                      in_lang='test', out_lang='test-out')
    # Same rules, but out_lang is IPA, so alignment is direct.
    ipa_mapping = Mapping([{'in': 'a', 'out': 'æ'},
                           {'in': 'e', 'out': 'ɐ'},
                           {'in': 'i', 'out': 'ɑ̃'},
                           {'in': 'b', 'out': 'β'},
                           {'in': 'g', 'out': 'ɡ'}],
                          in_lang='test', out_lang='test-ipa')
    # Align the 'in' side (default io) to the dummy inventory.
    test_in = align_to_dummy_fallback(mapping)
    self.assertEqual(
        test_in.mapping,
        [{'in': 'a', 'out': 'ɑ', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('a')},
         {'in': 'e', 'out': 'i', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('e')},
         {'in': 'i', 'out': 'i', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('i')},
         {'in': 'b', 'out': 't', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('b')},
         {'in': 'g', 'out': 't', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('g')},
         {'in': 'g', 'out': 't', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('g')},
         {'in': 'i', 'out': 'i', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('i')}])
    # Align the 'out' side of the non-IPA mapping.
    test_out = align_to_dummy_fallback(mapping, 'out')
    self.assertEqual(
        test_out.mapping,
        [{'in': 'æ', 'out': 'ɑi', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('æ')},
         {'in': 'ɐ', 'out': 'ɑ', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('ɐ')},
         {'in': 'ɑ̃', 'out': 'ɑ', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('ɑ̃')},
         {'in': 'β', 'out': 't', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('β')},
         {'in': 'ɡ', 'out': 't', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('ɡ')},
         {'in': 'g', 'out': 't', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('g')},
         {'in': 'ةُ', 'out': 'ɑu', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('ةُ')}])
    # Align the 'out' side of the IPA mapping (note β -> 's' here, not 't').
    test_ipa = align_to_dummy_fallback(ipa_mapping, 'out')
    self.assertEqual(
        test_ipa.mapping,
        [{'in': 'æ', 'out': 'ɑ', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('æ')},
         {'in': 'ɐ', 'out': 'ɑ', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('ɐ')},
         {'in': 'ɑ̃', 'out': 'ɑ', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('ɑ̃')},
         {'in': 'β', 'out': 's', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('β')},
         {'in': 'ɡ', 'out': 't', 'context_before': '', 'context_after': '', 'match_pattern': re.compile('ɡ')}])
def generate_mapping(in_lang, dummy, ipa, list_dummy, out_dir):
    ''' For specified IN_LANG, generate a mapping from IN_LANG-ipa to eng-ipa,
    or from IN_LANG-ipa to a dummy minimalist phone inventory.

    If you just modified or wrote the IN_LANG to IN_LANG-ipa mapping, don't
    forget to call "g2p update" first so "g2p generate-mapping" sees the latest
    version.

    Call "g2p update" again after calling "g2p generate-mapping" to make the
    new IN_LANG-ipa to eng-ipa mapping available.
    '''
    if not ipa and not dummy and not list_dummy:
        click.echo('You have to choose to generate either an IPA-based mapping or a dummy fallback mapping. Check the docs for more information.')
    # Refuse to clobber an existing generated config.
    # BUG FIX: the original tested os.path.exists() on the exact same
    # 'config.yaml' path twice, joined with `or`; the duplicate clause was
    # redundant and is collapsed to a single check here.
    if out_dir and os.path.exists(os.path.join(out_dir, 'config.yaml')):
        click.echo(
            f'There is already a mapping config file in \'{out_dir}\' \nPlease choose another path.')
        return
    if list_dummy:
        # Informational only; can be combined with --ipa/--dummy.
        print("Dummy phone inventory: {}".format(DUMMY_INVENTORY))
    if ipa:
        check_ipa_known_segs([f'{in_lang}-ipa'])
        eng_ipa = Mapping(in_lang='eng-ipa', out_lang='eng-arpabet')
        new_mapping = Mapping(in_lang=in_lang, out_lang=f'{in_lang}-ipa')
        click.echo(f"Writing English IPA mapping for {in_lang} to file")
        create_mapping(new_mapping, eng_ipa, write_to_file=True, out_dir=out_dir)
    if dummy:
        new_mapping = Mapping(in_lang=in_lang, out_lang=f'{in_lang}-ipa')
        click.echo(f"Writing dummy fallback mapping for {in_lang} to file")
        dummy_config, dummy_mapping = align_to_dummy_fallback(
            new_mapping, write_to_file=True, out_dir=out_dir)
def test_case_sensitive(self):
    """case_sensitive=False should match rules regardless of letter case."""
    insensitive = Transducer(Mapping([{"in": "A", "out": "b"}], case_sensitive=False))
    sensitive = Transducer(Mapping([{"in": "A", "out": "b"}]))
    # Lower-case input only matches when case is ignored.
    self.assertEqual(insensitive("a").output_string, "b")
    self.assertEqual(sensitive("a").output_string, "a")
    # Exact-case input matches either way.
    self.assertEqual(insensitive("A").output_string, "b")
def test_case_sensitive(self):
    """Rules with case_sensitive=False match input of either case."""
    folded = Transducer(Mapping([{'in': 'A', "out": 'b'}], case_sensitive=False))
    strict = Transducer(Mapping([{'in': 'A', "out": 'b'}]))
    # Case-folded matching converts lower-case input too.
    self.assertEqual(folded('a').output_string, 'b')
    # Strict matching leaves the lower-case input untouched.
    self.assertEqual(strict('a').output_string, 'a')
    # Upper-case input converts in both modes.
    self.assertEqual(folded('A').output_string, 'b')
def test_reverse(self):
    """reverse=True swaps the in/out side of every rule."""
    forward = Transducer(Mapping([{'in': 'a', "out": 'b'}]))
    backward = Transducer(Mapping([{'in': 'a', "out": 'b'}], reverse=True))
    # Forward: a -> b, b unchanged.
    self.assertEqual(forward('a').output_string, 'b')
    self.assertEqual(forward('b').output_string, 'b')
    # Reversed: b -> a, a unchanged.
    self.assertEqual(backward('a').output_string, 'a')
    self.assertEqual(backward('b').output_string, 'a')
def test_escape_special(self):
    """escape_special=True treats regex metacharacters in rules literally.

    FIX: use raw strings for the backslash-d pattern; '\\d' in a plain string
    literal is an invalid escape sequence (DeprecationWarning today, a
    SyntaxError in future Python versions). The string value is unchanged.
    """
    mapping = Mapping([{'in': r'\d', "out": 'digit'}])
    mapping_escaped = Mapping([{'in': r'\d', "out": 'b'}], escape_special=True)
    transducer = Transducer(mapping)
    transducer_escaped = Transducer(mapping_escaped)
    # Unescaped, \d acts as the regex digit class.
    self.assertEqual(transducer('1'), 'digit')
    self.assertEqual(transducer(r'\d'), r'\d')
    # Escaped, only the literal two-character sequence matches.
    self.assertEqual(transducer_escaped('1'), '1')
    self.assertEqual(transducer_escaped(r'\d'), 'b')
def test_escape_special(self):
    """escape_special=True makes regex metacharacters in rules literal."""
    plain = Transducer(Mapping([{"in": r"\d", "out": "digit"}]))
    escaped = Transducer(Mapping([{"in": r"\d", "out": "b"}], escape_special=True))
    # \d is a digit class when left unescaped...
    self.assertEqual(plain("1").output_string, "digit")
    self.assertEqual(plain(r"\d").output_string, r"\d")
    # ...and a literal backslash-d sequence when escaped.
    self.assertEqual(escaped("1").output_string, "1")
    self.assertEqual(escaped(r"\d").output_string, "b")
def test_reverse(self):
    """A reversed mapping applies each rule from its out side to its in side."""
    rule = {"in": "a", "out": "b"}
    t_fwd = Transducer(Mapping([dict(rule)]))
    t_rev = Transducer(Mapping([dict(rule)], reverse=True))
    for source, expected in (("a", "b"), ("b", "b")):
        self.assertEqual(t_fwd(source).output_string, expected)
    for source, expected in (("a", "a"), ("b", "a")):
        self.assertEqual(t_rev(source).output_string, expected)
def test_as_is(self):
    """ Test deprecated config: as_is. """
    # explicitly set as_is=False (equivalent to rule_ordering="apply-longest-first")
    log_output = io.StringIO()
    with redirect_stderr(log_output):
        mapping_sorted = Mapping(
            [{'in': 'a', "out": 'b'}, {'in': 'aa', 'out': 'c'}], as_is=False)
    self.assertTrue(mapping_sorted.wants_rules_sorted())
    self.assertIn("deprecated", log_output.getvalue(),
                  "it should warn that the feature is deprecated")
    self.assertIn("apply-longest-first", log_output.getvalue(),
                  "it should show the equivalent rule_ordering setting")

    # explicitly set as_is=True (equivalent to rule_ordering="as-written")
    log_output = io.StringIO()
    with redirect_stderr(log_output):
        mapping = Mapping(
            [{'in': 'a', "out": 'b'}, {'in': 'aa', 'out': 'c'}], as_is=True)
    self.assertFalse(mapping.wants_rules_sorted())
    self.assertIn("deprecated", log_output.getvalue(),
                  "it should warn that the feature is deprecated")
    self.assertIn("as-written", log_output.getvalue(),
                  "it should show the equivalent rule_ordering setting")

    # test the default (rule_ordering="as-written")
    mapping_as_is = Mapping(
        [{'in': 'a', "out": 'b'}, {'in': 'aa', 'out': 'c'}])
    # BUG FIX: this assertion previously re-checked `mapping` (already asserted
    # above) instead of the freshly created default `mapping_as_is`, so the
    # default ordering was never actually verified.
    self.assertFalse(mapping_as_is.wants_rules_sorted())

    # test the alternative (rule_ordering="apply-longest-first")
    transducer = Transducer(mapping_sorted)
    transducer_as_is = Transducer(mapping_as_is)
    self.assertEqual(transducer('aa').output_string, 'c')
    self.assertEqual(transducer_as_is('aa').output_string, 'bb')
def test_extend_and_deduplicate(self):
    """extend() concatenates rules; deduplicate() drops exact duplicates only.

    FIX: replaced the deprecated unittest alias assertEquals (removed in
    Python 3.12) with assertEqual.
    """
    mapping1 = Mapping(rules_from_strings("a:b", "c:d", "g:h"))
    mapping2 = Mapping(rules_from_strings("a:x", "c:d", "e:f"))
    extend_ref = Mapping(
        rules_from_strings("a:b", "c:d", "g:h", "a:x", "c:d", "e:f")
    )
    mapping1.extend(mapping2)
    self.assertEqual(mapping1.mapping, extend_ref.mapping)
    # 'a:x' stays (different out from 'a:b'); the second 'c:d' is removed.
    dedup_ref = Mapping(rules_from_strings("a:b", "c:d", "g:h", "a:x", "e:f"))
    mapping1.deduplicate()
    self.assertEqual(mapping1.mapping, dedup_ref.mapping)
def return_js_template(self, t_name_or_path: str) -> str:
    '''Given a transducer, create JavaScript string of that transducer.

    Args:
        :param str t_name_or_path: name of transducer or path to transducer.
    '''
    name = self.return_transducer_name(t_name_or_path)
    # JS for a simple one-step transducer: a regex alternation over the rule
    # inputs (longest first) and a lookup table for replacements.
    transducer_js_template = '''\n\nmtd.transducers["{name}"] = (function() {{ var correspondences = {cors}; var keys = {keys}; var regex = new RegExp("(" + keys.join("|") + ")", "g"); return function(str) {{ return str.replace(regex, function(a,b) {{ return correspondences[a]; }}); }}; }})();'''
    # JS for a composite transducer: apply each named orthography in sequence.
    composite_js_template = u'''\n\nmtd.transducers["{name}"] = (function(){{ var orths = {composite_transducers}; return function(str) {{ for (var i = 0; i < orths.length; i++) {{ transducer = mtd.transducers[orths[i]]; str = transducer(str); }} return str; }}; }})();'''
    if "composite" in t_name_or_path:
        with open(t_name_or_path, encoding='utf8') as f:
            composite_transducers = json.load(f)
        return composite_js_template.format(
            name=name, composite_transducers=composite_transducers)
    else:
        path = self.return_transducer_path(t_name_or_path)
        if not path and t_name_or_path in self.available_transducers:
            # BUG FIX: this branch previously assigned to `mapping`, leaving
            # `cors` undefined and raising NameError when used below.
            cors = Mapping(**self.available_transducers[t_name_or_path])
        elif path.endswith('yaml'):
            cors = Mapping(self.return_transducer_path(t_name_or_path))
        else:
            cors = Mapping(
                load_from_file(
                    self.return_transducer_path(t_name_or_path)))
        # Longest keys first so the regex alternation prefers longer matches.
        keys = sorted([cor['in'] for cor in cors.mapping], key=len, reverse=True)
        # Later rules do not override earlier ones: ChainMap keeps the first
        # occurrence of each key.
        js_cors = [{cor['in']: cor['out']} for cor in cors.mapping]
        js_cors = dict(ChainMap(*js_cors))
        return transducer_js_template.format(name=name, cors=js_cors, keys=keys)
def generate_mapping(in_lang, dummy, ipa):
    ''' Generate English mapping '''
    if not ipa and not dummy:
        click.echo('You have to choose to generate either an IPA-based mapping or a dummy fallback mapping. Check the docs for more information.')
    if ipa:
        # Map IN_LANG-ipa onto English IPA/ARPABET.
        eng_ipa = Mapping(in_lang='eng-ipa', out_lang='eng-arpabet')
        ipa_source = Mapping(in_lang=in_lang, out_lang=f'{in_lang}-ipa')
        click.echo(f"Writing English IPA mapping for {in_lang} to file")
        create_mapping(ipa_source, eng_ipa, write_to_file=True)
    if dummy:
        # Map IN_LANG-ipa onto the minimalist dummy inventory.
        dummy_source = Mapping(in_lang=in_lang, out_lang=f'{in_lang}-ipa')
        click.echo(f"Writing dummy fallback mapping for {in_lang} to file")
        dummy_config, dummy_mapping = align_to_dummy_fallback(
            dummy_source, write_to_file=True)
def setUp(self):
    """Build mappings with and without Unicode normalization, plus JSON data."""
    # norm_form='none' keeps the precomposed and decomposed forms distinct.
    unnormalized_rules = [
        {'in': '\u00e1', 'out': '\u00e1'},
        {'in': '\u0061\u0301', 'out': '\u0061\u0301'},
    ]
    self.test_mapping_no_norm = Mapping(unnormalized_rules, norm_form='none')
    self.test_mapping_norm = Mapping([{'in': '\u00e1', 'out': '\u00e1'}])
    json_path = os.path.join(os.path.dirname(public_data), 'git_to_ipa.json')
    with open(json_path, encoding='utf8') as f:
        self.json_map = json.load(f)
def setUp(self):
    """Prepare one non-normalizing mapping, one NFC mapping, and JSON data."""
    # With norm_form="none", the composed á and a+combining-acute stay separate.
    self.test_mapping_no_norm = Mapping(
        [{"in": "\u00e1", "out": "\u00e1"},
         {"in": "\u0061\u0301", "out": "\u0061\u0301"}],
        norm_form="none",
    )
    self.test_mapping_norm = Mapping([{"in": "\u00e1", "out": "\u00e1"}])
    data_file = os.path.join(os.path.dirname(public_data), "git_to_ipa.json")
    with open(data_file, encoding="utf8") as f:
        self.json_map = json.load(f)
def change_table(message):
    """ Change the lookup table """
    # A 'custom' language on either side means start from an empty table.
    wants_custom = message['in_lang'] == 'custom' or message['out_lang'] == 'custom'
    if wants_custom:
        mappings = Mapping(return_empty_mappings())
    else:
        mappings = Mapping(in_lang=message['in_lang'],
                           out_lang=message['out_lang'])
    payload = {
        'mappings': mappings.plain_mapping(),
        'abbs': expand_abbreviations(mappings.abbreviations),
        'kwargs': mappings.kwargs,
    }
    emit('table response', payload)
def test_norm_form(self):
    """Rules and input are normalized per norm_form: NFC (default), NFD, or not at all."""
    rule = [{"in": "a\u0301", "out": "a"}]
    t_nfc = Transducer(Mapping(list(rule)))  # Defaults to NFC
    t_nfd = Transducer(Mapping(list(rule), norm_form="NFD"))
    t_raw = Transducer(Mapping(list(rule), norm_form=False))
    # Either normalization makes composed and decomposed forms equivalent.
    self.assertEqual(t_nfc("a\u0301").output_string, "a")
    self.assertEqual(t_nfc("\u00E1").output_string, "a")
    self.assertEqual(t_nfd("a\u0301").output_string, "a")
    self.assertEqual(t_nfd("\u00E1").output_string, "a")
    # Without normalization, only the exact decomposed form matches.
    self.assertEqual(t_raw("a\u0301").output_string, "a")
    self.assertEqual(t_raw("\u00E1").output_string, "\u00E1")
def test_norm_form(self):
    """NFC/NFD normalization makes composed and decomposed input equivalent."""
    nfc_map = Mapping([{'in': 'a\u0301', "out": 'a'}])  # Defaults to NFC
    nfd_map = Mapping([{'in': 'a\u0301', "out": 'a'}], norm_form='NFD')
    raw_map = Mapping([{'in': 'a\u0301', "out": 'a'}], norm_form=False)
    t_nfc, t_nfd, t_raw = (Transducer(m) for m in (nfc_map, nfd_map, raw_map))
    # Both Unicode forms of á match under NFC and NFD.
    self.assertEqual(t_nfc('a\u0301'), 'a')
    self.assertEqual(t_nfc('\u00E1'), 'a')
    self.assertEqual(t_nfd('a\u0301'), 'a')
    self.assertEqual(t_nfd('\u00E1'), 'a')
    # Without normalization only the decomposed form matches.
    self.assertEqual(t_raw('a\u0301'), 'a')
    self.assertEqual(t_raw('\u00E1'), '\u00E1')
def create_transducer_mapping(self, t_name_or_path: str) -> Callable[[str], str]:
    """ Build a G2PTransducer from a named transducer or a mapping file.

    :param t_name_or_path: <string> path to transducer or default transducer
    """
    path = self.return_transducer_path(t_name_or_path)
    if t_name_or_path in self.available_transducers:
        # Known transducer name: construct from its stored keyword config.
        source = Mapping(**self.available_transducers[t_name_or_path])
    elif path.endswith('yaml'):
        # YAML config path: Mapping parses it directly.
        source = Mapping(self.return_transducer_path(t_name_or_path))
    else:
        # Anything else goes through the generic file loader first.
        source = Mapping(
            load_from_file(self.return_transducer_path(t_name_or_path)))
    return G2PTransducer(source)
def create_transducer_function(
        self, t_name_or_path: str) -> Callable[[str], str]:
    """ Creates function based on transducer

    :param t_name_or_path: <string> path to transducer or default transducer
    """
    path = self.return_transducer_path(t_name_or_path)
    if t_name_or_path in self.available_transducers:
        rules = Mapping(**self.available_transducers[t_name_or_path])
    elif path.endswith('yaml'):
        rules = Mapping(self.return_transducer_path(t_name_or_path))
    else:
        rules = Mapping(
            load_from_file(self.return_transducer_path(t_name_or_path)))
    transducer = G2PTransducer(rules)

    def apply(text: str) -> str:
        # Callers want the plain string, not the transduction graph.
        return transducer(text).output_string

    return apply
def test_json_map(self):
    """A mapping built from JSON data keeps its rules and metadata kwargs."""
    metadata = {k: v for k, v in self.json_map.items() if k != "map"}
    json_map = Mapping(self.json_map["map"], **metadata)
    self.assertEqual(len(json_map), 34)
    self.assertTrue(json_map.kwargs["in_metadata"]["case_insensitive"])
def create_transducer(mapping):
    """Create a Transducer from a rule list, a YAML config path, or a data file.

    :param mapping: a list of rules, a path to a .yaml/.yml config, or a path
        to a mapping data file readable by load_from_file.
    :raises exceptions.MissingFileError: if mapping is falsy or not a file.
    """
    if mapping:
        if isinstance(mapping, list):
            mapping_obj = Mapping(mapping)
        elif isinstance(mapping, str) and re.search(
                # BUG FIX: the leading '.' was previously unescaped, so ANY
                # character before 'yml'/'yaml' (e.g. 'xyml') matched as a
                # YAML path; it now matches a literal dot extension only.
                r'\.y(a)*ml\b', mapping):
            mapping_obj = Mapping(mapping)
        elif os.path.isfile(mapping):
            mapping_data = load_from_file(mapping)
            mapping_obj = Mapping(mapping_data)
        else:
            raise exceptions.MissingFileError(mapping)
        return Transducer(mapping_obj)
    else:
        # Normalize None/empty input into a string for the error message.
        mapping = str(mapping)
        raise exceptions.MissingFileError(mapping)
def test_unidecode_mapping(self):
    """A type="unidecode" mapping has no rules but transliterates to ASCII-ish."""
    uni = Mapping(type="unidecode")
    # All behaviour comes from the mapping type, not from explicit rules.
    self.assertEqual(uni.mapping, [])
    self.assertEqual(uni.kwargs["type"], "unidecode")
    graph = Transducer(uni)("été Nunavut ᓄᓇᕗᑦ")
    self.assertEqual(graph.output_string, "ete Nunavut nonafot")
def test_basic_composition(self):
    """Composing identity-shaped edges with themselves changes nothing."""
    tg = Transducer(Mapping([{"in": "a", "out": "b"}]))("abba")
    self.assertEqual(tg.output_string, "bbbb")
    identity_edges = [(0, 0), (1, 1), (2, 2), (3, 3)]
    self.assertEqual(tg.edges, identity_edges)
    # A 1:1 character mapping composed with itself is still 1:1.
    self.assertEqual(tg.edges, compose_indices(tg.edges, tg.edges))
def convert(message):
    """ Convert input text and return output """
    # Build one Transducer per mapping in the request, then chain them.
    transducers = []
    for mapping in message['data']['mappings']:
        rules = hot_to_mappings(mapping['mapping'])
        abbs = flatten_abbreviations(mapping['abbreviations'])
        mappings_obj = Mapping(rules, abbreviations=abbs, **mapping['kwargs'])
        transducers.append(Transducer(mappings_obj))
    transducer = CompositeTransducer(transducers)
    if message['data']['index']:
        # Client asked for index data: include echart nodes/links as well.
        tg = transducer(message['data']['input_string'])
        data, links = return_echart_data(tg)
        emit('conversion response', {
            'output_string': tg.output_string,
            'index_data': data,
            'index_links': links,
        })
    else:
        output_string = transducer(
            message['data']['input_string']).output_string
        emit('conversion response', {'output_string': output_string})
def create_mapping(mapping_1: Mapping, mapping_2: Mapping,
                   mapping_1_io: str = 'out', mapping_2_io: str = 'in',
                   write_to_file: bool = False) -> Mapping:
    """Align the chosen inventories of two mappings into a new Mapping.

    :param mapping_1: source mapping; its `mapping_1_io` inventory is aligned.
    :param mapping_2: target mapping; its `mapping_2_io` inventory is aligned.
    :param write_to_file: if True, also persist the generated config/mapping.
    :return: a Mapping built from the aligned inventories and generated config.
    """
    map_1_name = mapping_1.kwargs[f'{mapping_1_io}_lang']
    map_2_name = mapping_2.kwargs[f'{mapping_2_io}_lang']
    # Warn (but continue) if an inventory is neither IPA nor X-SAMPA.
    if not is_ipa(map_1_name) and not is_xsampa(map_1_name):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s"
            " (must be ipa or x-sampa)",
            map_1_name)
    if not is_ipa(map_2_name) and not is_xsampa(map_2_name):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s"
            " (must be ipa or x-sampa)",
            map_2_name)
    l1_is_xsampa = is_xsampa(map_1_name)
    l2_is_xsampa = is_xsampa(map_2_name)
    mapping = align_inventories(mapping_1.inventory(mapping_1_io),
                                mapping_2.inventory(mapping_2_io),
                                l1_is_xsampa, l2_is_xsampa)
    # Display names are optional in configs; fall back to a placeholder.
    l1_display_name = mapping_1.kwargs.get(
        'language_name', 'No Language display name in Config')
    l2_display_name = mapping_2.kwargs.get(
        'language_name', 'No Language display name in Config')
    config = generate_config(map_1_name, map_2_name,
                             l1_display_name, l2_display_name)
    if write_to_file:
        write_generated_mapping_to_file(config, mapping)
    extra_config = {k: v for k, v in config.items() if k != 'mapping'}
    return Mapping(mapping, **extra_config)
def test_json_map(self):
    """A JSON-loaded mapping retains its rules and its metadata kwargs."""
    meta = {k: v for k, v in self.json_map.items() if k != 'map'}
    json_map = Mapping(self.json_map['map'], **meta)
    self.assertEqual(len(json_map), 34)
    self.assertTrue(json_map.kwargs['in_metadata']['case_insensitive'])
def test_basic_composition(self):
    """Indices mapped through a two-step basic composition"""
    graph = Transducer(Mapping([{"in": "a", "out": "b"}]))("abba")
    self.assertEqual(graph.output_string, "bbbb")
    # Each input character maps 1:1 onto its output position.
    expected_edges = [(index, index) for index in range(4)]
    self.assertEqual(graph.edges, expected_edges)
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in',
                            write_to_file: bool = False, out_dir: str = ''):
    """Align one side of `mapping` to the dummy phone inventory.

    :param mapping: source Mapping whose `io` inventory is to be aligned.
    :param io: which side to align, 'in' or 'out'.
    :param write_to_file: if True, also write the generated config and mapping.
    :param out_dir: target directory for the files; default location if empty.
    :return: a new Mapping from the `io` inventory to the dummy inventory.
    """
    config = {'in_lang': mapping.kwargs[f'{io}_lang'], 'out_lang': 'dummy'}
    # Fallback output symbol when no dummy phone can be guessed.
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        # IPA inventories can be aligned directly.
        mapping = align_inventories(mapping.inventory(io), DUMMY_INVENTORY)
    else:
        # Otherwise transliterate each symbol to IPA first, then align.
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x),
                    "out": und_g2p(unidecode(x).lower()).output_string}
                   for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping],
                                       DUMMY_INVENTORY)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                # FIX: Logger.warn is a deprecated alias; use warning(), as
                # the out_dir branch below already does.
                LOGGER.warning(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char
    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir:
            if os.path.isdir(out_dir):
                mapping.config_to_file(out_dir)
                mapping.mapping_to_file(out_dir)
            else:
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
        else:
            mapping.config_to_file()
            mapping.mapping_to_file()
    return mapping
def test_distance_errors(self):
    """Unknown distance metrics must raise ValueError in both creators."""
    src_mappings = [{"in": "ᐃ", "out": "i"}]
    src_mapping = Mapping(src_mappings, in_lang="crj", out_lang="crj-ipa")
    # Exercise looking up distances in the known list
    with self.assertRaises(ValueError):
        mapping = create_mapping(src_mapping, self.target_mapping,
                                 distance="not_a_distance")
    with self.assertRaises(ValueError):
        mapping = create_multi_mapping(
            [(src_mapping, "out")],
            [(self.target_mapping, "in")],
            distance="not_a_distance",
        )
    # White box testing: monkey-patch an invalid distance to validate the
    # second way we make sure distances are supported
    DISTANCE_METRICS.append("not_a_real_distance")
    try:
        with self.assertRaises(ValueError):
            mapping = create_mapping(src_mapping, self.target_mapping,
                                     distance="not_a_real_distance")
        with self.assertRaises(ValueError):
            mapping = create_multi_mapping(
                [(src_mapping, "out")],
                [(self.target_mapping, "in")],
                distance="not_a_real_distance",
            )
    finally:
        # FIX: restore the patched global even if an assertion above fails,
        # so the bogus metric cannot leak into other tests.
        DISTANCE_METRICS.pop()