def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False, out_dir: str = ''):
    """Create a mapping from `mapping`'s inventory to the minimal dummy inventory.

    IPA inventories are aligned directly against DUMMY_INVENTORY; any other
    orthography is first romanized (unidecode) and converted to IPA via the
    und -> und-ipa mapping, then aligned.

    Args:
        mapping: source Mapping whose inventory is aligned to the dummy inventory
        io: which inventory of `mapping` to use, 'in' or 'out'
        write_to_file: if True, also write the resulting config and mapping files
        out_dir: directory to write to; default location is used when empty

    Returns:
        Mapping: the new mapping onto the dummy inventory
    """
    config = {'in_lang': mapping.kwargs[f'{io}_lang'], 'out_lang': 'dummy'}
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        mapping = align_inventories(mapping.inventory(io), DUMMY_INVENTORY)
    else:
        # Non-IPA inventory: romanize and convert to IPA before aligning.
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x),
                    "out": und_g2p(unidecode(x).lower()).output_string}
                   for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping],
                                       DUMMY_INVENTORY)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                # Fix: Logger.warn() is deprecated; use warning().
                LOGGER.warning(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char
    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir:
            if os.path.isdir(out_dir):
                mapping.config_to_file(out_dir)
                mapping.mapping_to_file(out_dir)
            else:
                # NOTE(review): despite the message, nothing is written in
                # this branch — confirm whether a default write is intended.
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
        else:
            mapping.config_to_file()
            mapping.mapping_to_file()
    return mapping
def check_ipa_known_segs(mappings_to_check=False) -> bool:
    """Check the given mappings, or all IPA mappings, for invalid IPA in the "out" fields

    Returns True iff no errors were found.
    """
    if not mappings_to_check:
        mappings_to_check = [x["out_lang"] for x in MAPPINGS_AVAILABLE]
    found_error = False
    for mapping in [
        x for x in MAPPINGS_AVAILABLE if x["out_lang"] in mappings_to_check
    ]:
        if is_ipa(mapping["out_lang"]):
            reverse = mapping.get("reverse", False)
            for rule in mapping["mapping_data"]:
                # For a reversed mapping, the "in" field is what ends up on
                # the output side, so that is the string to validate.
                output = rule["in"] if reverse else rule["out"]
                if not is_panphon(output):
                    # Fix: report the string actually checked (`output`),
                    # not unconditionally rule['out'].
                    LOGGER.warning(
                        f"Output '{output}' in rule {rule} in mapping between {mapping['in_lang']} "
                        f"and {mapping['out_lang']} is not recognized as valid IPA by panphon."
                    )
                    found_error = True
    if found_error:
        LOGGER.warning(
            "Please refer to https://github.com/dmort27/panphon for information about panphon."
        )
    return not found_error
def create_mapping(mapping_1: Mapping, mapping_2: Mapping, mapping_1_io: str = 'out', mapping_2_io: str = 'in', write_to_file: bool = False) -> Mapping:
    """Build a Mapping aligning mapping_1's inventory with mapping_2's.

    Inventories should be IPA or X-SAMPA; anything else only triggers a
    warning and alignment is still attempted.
    """
    lang_1 = mapping_1.kwargs[f'{mapping_1_io}_lang']
    lang_2 = mapping_2.kwargs[f'{mapping_2_io}_lang']
    xsampa_1, xsampa_2 = is_xsampa(lang_1), is_xsampa(lang_2)
    if not is_ipa(lang_1) and not xsampa_1:
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s"
            " (must be ipa or x-sampa)",
            lang_1)
    if not is_ipa(lang_2) and not xsampa_2:
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s"
            " (must be ipa or x-sampa)",
            lang_2)
    aligned = align_inventories(
        mapping_1.inventory(mapping_1_io),
        mapping_2.inventory(mapping_2_io),
        xsampa_1, xsampa_2)
    fallback_name = 'No Language display name in Config'
    display_1 = mapping_1.kwargs.get('language_name', fallback_name)
    display_2 = mapping_2.kwargs.get('language_name', fallback_name)
    config = generate_config(lang_1, lang_2, display_1, display_2)
    if write_to_file:
        write_generated_mapping_to_file(config, aligned)
    extra_kwargs = {k: v for k, v in config.items() if k != 'mapping'}
    return Mapping(aligned, **extra_kwargs)
def check(
    self,
    tg: TransductionGraph,
    shallow=False,
    display_warnings=False,
    original_input=None,
):
    """Validate tg's output string for this transducer's out_lang.

    eng-arpabet output is validated with is_arpabet, IPA output with
    is_panphon; any other output language passes trivially.
    """
    out_lang = self.mapping.kwargs["out_lang"]
    if "eng-arpabet" in out_lang:
        if is_arpabet(tg.output_string):
            return True
        if display_warnings:
            shown_input = original_input or tg.input_string
            LOGGER.warning(
                f'Transducer output "{tg.output_string}" for input "{shown_input}" is not fully valid eng-arpabet as recognized by soundswallower.'
            )
        return False
    if is_ipa(out_lang):
        if is_panphon(tg.output_string, display_warnings=display_warnings):
            return True
        if display_warnings:
            shown_input = original_input or tg.input_string
            LOGGER.warning(
                f'Transducer output "{tg.output_string}" for input "{shown_input}" is not fully valid {out_lang}.'
            )
        return False
    # No check implemented at this tier, just return True
    return True
def get_tokenizer(*args, **kwargs):
    """
    Deprecated; use make_tokenizer() instead.
    """
    global _deprecated_warning_printed
    # Emit the deprecation notice only once per process.
    if not _deprecated_warning_printed:
        _deprecated_warning_printed = True
        LOGGER.warning(
            "g2p.get_tokenizer() / g2p.mappings.tokenizer.get_tokenizer() is deprecated. Import and use g2p.make_tokenizer() instead."
        )
    return make_tokenizer(*args, **kwargs)
def rule_to_regex(self, rule: dict) -> Pattern:
    """Turns an input string (and the context) from an input/output pair
    into a regular expression pattern

    The 'in' key is the match.
    The 'context_after' key creates a lookahead.
    The 'context_before' key creates a lookbehind.

    Args:
        rule: A dictionary containing 'in', 'out', 'context_before', and 'context_after' keys

    Raises:
        Exception: This is raised when un-supported regex characters or symbols exist in the rule

    Returns:
        Pattern: returns a regex pattern (re.Pattern)
        bool: returns False if input is null
    """
    # Prevent null input. See, https://github.com/roedoejet/g2p/issues/24
    if not rule['in']:
        LOGGER.warning(
            f'Rule with input \'{rule["in"]}\' and output \'{rule["out"]}\' has no input. This is disallowed. Please check your mapping file for rules with null inputs.'
        )
        return False
    # Missing or falsy contexts are treated as empty.
    before = rule.get('context_before') or ''
    after = rule.get('context_after') or ''
    # Strip back-reference markers like {1} from the input before matching.
    input_match = re.sub(re.compile(r'{\d+}'), "", rule['in'])
    try:
        inp = create_fixed_width_lookbehind(before) + input_match
        if after:
            inp += f"(?={after})"
        if not self.kwargs['case_sensitive']:
            rule_regex = re.compile(inp, re.I)
        else:
            rule_regex = re.compile(inp)
    except re.error as e:
        # Fix: the previous bare `except:` caught everything (including
        # KeyboardInterrupt) and referenced `inp`, which is unbound when
        # the lookbehind construction itself raises (NameError).
        in_lang = self.kwargs.get('in_lang', 'und')
        out_lang = self.kwargs.get('out_lang', 'und')
        LOGGER.error(
            f'Your regex in mapping between {in_lang} and {out_lang} is malformed. \
Do you have un-escaped regex characters in your input {input_match}, contexts {before}, {after}?'
        )
        raise Exception(
            f'Your regex in mapping between {in_lang} and {out_lang} is malformed. \
Do you have un-escaped regex characters in your input {input_match}, contexts {before}, {after}?'
        ) from e
    return rule_regex
def test_convert(self):
    """Run every CLI conversion case, logging all failures, then assert on the first one."""
    failures = 0
    first_failure = None
    for case in self.langs_to_test:
        in_lang, out_lang, word, expected = case[0], case[1], case[2], case[3]
        actual = self.runner.invoke(convert, [word, in_lang, out_lang]).stdout.strip()
        if actual != expected:
            LOGGER.warning("test_cli.py: mapping error: {} from {} to {} should be {}, got {}".format(word, in_lang, out_lang, expected, actual))
            if failures == 0:
                first_failure = case
            failures += 1
    if failures > 0:
        actual = self.runner.invoke(convert, [first_failure[2], first_failure[0], first_failure[1]]).stdout.strip()
        self.assertEqual(actual, first_failure[3])
def find_good_match(p1, inventory_l2):
    """Find a good sequence in inventory_l2 matching p1.

    Greedily consumes panphon segments of p1; at each position the
    inventory_l2 entry with the lowest distance wins (longest match on ties).

    NOTE(review): uses dst, distance, p2_pseqs and get_distance_method from
    the enclosing scope.
    """
    # The proper way to do this would be with some kind of beam search
    # through a determinized/minimized FST, but in the absence of that
    # we can do a kind of heuristic greedy search. (we don't want any
    # dependencies outside of PyPI otherwise we'd just use OpenFST)
    p1_pseq = dst.fm.ipa_segs(p1)
    # Hoisted out of the loops: the distance method is loop-invariant.
    distance_method = get_distance_method(dst, distance)
    i = 0
    good_match = []
    while i < len(p1_pseq):
        best_input = ""
        best_output = -1
        best_score = 0xDEADBEEF
        for j, p2_pseq in enumerate(p2_pseqs):
            # FIXME: Should also consider the (weighted) possibility
            # of deleting input or inserting any segment (but that
            # can't be done with a greedy search)
            if len(p2_pseq) == 0:
                LOGGER.warning(
                    "No panphon mapping for %s - skipping", inventory_l2[j]
                )
                continue
            e = min(i + len(p2_pseq), len(p1_pseq))
            input_seg = p1_pseq[i:e]
            score = distance_method("".join(input_seg), "".join(p2_pseq))
            # Be very greedy and take the longest match
            if (
                score < best_score
                or score == best_score
                and len(input_seg) > len(best_input)
            ):
                best_input = input_seg
                best_output = j
                best_score = score
        if best_output < 0:
            # Fix: when every candidate was skipped (all p2_pseqs empty),
            # best_input stayed "" so i never advanced, looping forever,
            # and inventory_l2[-1] was appended spuriously.
            LOGGER.warning(
                "No match found for segment %s at position %d - skipping",
                p1_pseq[i],
                i,
            )
            i += 1
            continue
        LOGGER.debug(
            "Best match at position %d: %s => %s",
            i,
            best_input,
            inventory_l2[best_output],
        )
        good_match.append(inventory_l2[best_output])
        i += len(best_input)  # greedy!
    return "".join(good_match)
def setUp(self):
    """Collect 4-column test rows from every delimiter-separated file in the data dir."""
    DATA_DIR = os.path.dirname(data_dir)
    self.langs_to_test = []
    for fn in glob(f'{DATA_DIR}/*.*sv'):
        if fn.endswith('csv'):
            delimiter = ','
        elif fn.endswith('psv'):
            delimiter = '|'
        elif fn.endswith('tsv'):
            delimiter = '\t'
        else:
            # Fix: an unrecognized extension previously reused the previous
            # file's delimiter, or raised NameError on the first file.
            LOGGER.warning(f'Unknown delimiter for {fn}; skipping.')
            continue
        with open(fn, encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter=delimiter)
            for row in reader:
                if len(row) != 4:
                    LOGGER.warning(f'Row in {fn} containing values {row} does not have the right values. Please check your data.')
                else:
                    self.langs_to_test.append(row)
def create_mapping(
    mapping_1: Mapping,
    mapping_2: Mapping,
    mapping_1_io: str = "out",
    mapping_2_io: str = "in",
    distance: str = "weighted_feature_edit_distance",
) -> Mapping:
    """Create a mapping from mapping_1's output inventory to mapping_2's input inventory"""
    lang_1 = mapping_1.kwargs[f"{mapping_1_io}_lang"]
    lang_2 = mapping_2.kwargs[f"{mapping_2_io}_lang"]
    xsampa_1, xsampa_2 = is_xsampa(lang_1), is_xsampa(lang_2)
    if not is_ipa(lang_1) and not xsampa_1:
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s (must be ipa or x-sampa)",
            lang_1,
        )
    if not is_ipa(lang_2) and not xsampa_2:
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s (must be ipa or x-sampa)",
            lang_2,
        )
    aligned = align_inventories(
        mapping_1.inventory(mapping_1_io),
        mapping_2.inventory(mapping_2_io),
        xsampa_1,
        xsampa_2,
        distance=distance,
    )
    # Initialize mapping with input language parameters (as_is,
    # case_sensitive, prevent_feeding, etc), dropping metadata that only
    # applies to mapping_1 itself.
    config = mapping_1.kwargs.copy()
    for key in ("authors", "display_name", "language_name"):
        config.pop(key, None)
    config["prevent_feeding"] = True
    config["in_lang"] = lang_1
    config["out_lang"] = lang_2
    config["mapping"] = aligned
    return Mapping(**config)
def test_check_with_equiv(self):
    """Convert a tau phrase through each stage of the chain and validate every stage."""
    phrase = "sh'oo Jign maasee' do'eent'aa shyyyh"
    transducer = make_g2p("tau", "eng-arpabet", tok_lang="tau")
    tau_ipa = make_g2p("tau", "tau-ipa", tok_lang="tau")(phrase).output_string
    self.assertTrue(utils.is_panphon(tau_ipa))
    eng_ipa = make_g2p("tau", "eng-ipa", tok_lang="tau")(phrase).output_string
    self.assertTrue(utils.is_panphon(eng_ipa))
    eng_arpabet = make_g2p("tau", "eng-arpabet", tok_lang="tau")(phrase).output_string
    self.assertTrue(utils.is_arpabet(eng_arpabet))
    LOGGER.warning(
        f"tau-ipa {tau_ipa}\neng-ipa {eng_ipa}\n eng-arpabet {eng_arpabet}"
    )
    self.assertTrue(transducer.check(transducer(phrase)))
def test_io(self):
    # go through each language declared in the test case set up
    # Instead of asserting immediately, we go through all the cases first, so that
    # running test_langs.py prints all the errors at once, to help debugging a given g2p mapping.
    # Then we call assertEqual on the first failed case, to make unittest register the failure.
    failures = 0
    first_failure = None
    for case in self.langs_to_test:
        in_lang, out_lang, word, expected = case[0], case[1], case[2], case[3]
        result = make_g2p(in_lang, out_lang)(word).output_string
        if result != expected:
            LOGGER.warning("test_langs.py: mapping error: {} from {} to {} should be {}, got {}".format(word, in_lang, out_lang, expected, result))
            if failures == 0:
                first_failure = case
            failures += 1
    if failures > 0:
        transducer = make_g2p(first_failure[0], first_failure[1])
        self.assertEqual(transducer(first_failure[2]).output_string, first_failure[3])
def load_mapping_from_path(path_to_mapping_config, index=0):
    '''Load one mapping from a yaml config file.

    When the config declares several mappings, the one at position
    `index` is loaded (default 0).
    '''
    path = Path(path_to_mapping_config)
    # Only an existing yml/yaml mapping config is accepted.
    if not path.exists() or not (path.suffix.endswith('yml') or path.suffix.endswith('yaml')):
        raise FileNotFoundError
    # safe load it
    with open(path, encoding='utf8') as f:
        mapping = yaml.safe_load(f)
    if 'mappings' in mapping:
        # Multi-mapping config: pick the requested entry.
        try:
            LOGGER.debug(
                'Loading mapping from %s between "%s" and "%s" at index %s',
                path_to_mapping_config,
                mapping['mappings'][index].get('in_lang', 'und'),
                mapping['mappings'][index].get('out_lang', 'und'),
                index)
            mapping = mapping['mappings'][index]
        except KeyError:
            LOGGER.warning(
                'An index of %s was provided for the mapping %s but that index does not exist in the mapping. Please check your mapping.',
                index, path_to_mapping_config)
    elif index != 0:
        # Warn when a non-zero index is requested from a single-mapping config.
        LOGGER.warning(
            'An index of %s was provided for the mapping %s but that index does not exist in the mapping. Please check your mapping.',
            index, path_to_mapping_config)
    # try to load the data from the mapping data file
    if 'mapping' in mapping:
        mapping['mapping_data'] = load_from_file(
            os.path.join(path.parent, mapping['mapping']))
    else:
        # Is "mapping" key missing?
        raise exceptions.MalformedMapping
    # load any abbreviations
    if 'abbreviations' in mapping:
        mapping['abbreviations_data'] = load_abbreviations_from_file(
            os.path.join(path.parent, mapping['abbreviations']))
    return mapping
def find_good_match(p1, inventory_l2, l2_is_xsampa=False):
    """Find a good sequence in inventory_l2 matching p1.

    Greedily consumes panphon segments of p1; at each position the
    inventory_l2 entry with the lowest weighted feature edit distance wins
    (longest match on ties).
    """
    dst = panphon.distance.Distance()
    # The proper way to do this would be with some kind of beam search
    # through a determinized/minimized FST, but in the absence of that
    # we can do a kind of heuristic greedy search. (we don't want any
    # dependencies outside of PyPI otherwise we'd just use OpenFST)
    p1_pseq = dst.fm.ipa_segs(p1)
    p2_pseqs = [
        dst.fm.ipa_segs(p)
        for p in process_characters(inventory_l2, l2_is_xsampa)
    ]
    i = 0
    good_match = []
    while i < len(p1_pseq):
        best_input = ""
        best_output = -1
        best_score = 0xdeadbeef
        for j, p2_pseq in enumerate(p2_pseqs):
            # FIXME: Should also consider the (weighted) possibility
            # of deleting input or inserting any segment (but that
            # can't be done with a greedy search)
            if len(p2_pseq) == 0:
                LOGGER.warning('No panphon mapping for %s - skipping',
                               inventory_l2[j])
                continue
            e = min(i + len(p2_pseq), len(p1_pseq))
            input_seg = p1_pseq[i:e]
            score = dst.weighted_feature_edit_distance(''.join(input_seg),
                                                       ''.join(p2_pseq))
            # Be very greedy and take the longest match
            if (score < best_score
                    or score == best_score
                    and len(input_seg) > len(best_input)):
                best_input = input_seg
                best_output = j
                best_score = score
        if best_output < 0:
            # Fix: when every candidate was skipped (all p2_pseqs empty),
            # best_input stayed "" so i never advanced, looping forever,
            # and inventory_l2[-1] was appended spuriously.
            LOGGER.warning('No match found for segment %s at position %d - skipping',
                           p1_pseq[i], i)
            i += 1
            continue
        LOGGER.debug('Best match at position %d: %s => %s',
                     i, best_input, inventory_l2[best_output])
        good_match.append(inventory_l2[best_output])
        i += len(best_input)  # greedy!
    return ''.join(good_match)
def check_ipa_known_segs(mappings_to_check=False):
    """Check mappings into IPA for outputs panphon does not recognize.

    Args:
        mappings_to_check: list of out_lang codes to restrict the check to;
            all available mappings are checked when falsy.

    Returns:
        bool: True iff no errors were found. (Added for consistency with the
        newer version of this validator; previously this returned None, so
        existing callers that ignore the result are unaffected.)
    """
    dst = distance.Distance()
    if not mappings_to_check:
        mappings_to_check = [x['out_lang'] for x in MAPPINGS_AVAILABLE]
    found_error = False
    for mapping in [
            x for x in MAPPINGS_AVAILABLE if x['out_lang'] in mappings_to_check
    ]:
        if mapping['out_lang'].endswith('-ipa'):
            for rule in mapping['mapping_data']:
                # An output is valid IPA iff panphon's segmentation
                # reproduces it exactly.
                joined_ipa_segs = ''.join(dst.fm.ipa_segs(rule['out']))
                if not joined_ipa_segs == rule['out']:
                    LOGGER.warning(
                        f"Output '{rule['out']}' in rule {rule} in mapping between {mapping['in_lang']} and {mapping['out_lang']} is not recognized as valid IPA by panphon. You may ignore this warning if you know it gets remapped to IPA later."
                    )
                    found_error = True
    if found_error:
        LOGGER.warning(
            "Please refer to https://github.com/dmort27/panphon for information about panphon."
        )
    return not found_error
def setUp(self):
    """Set up the CLI runner and collect test rows from the data directory."""
    self.runner = APP.test_cli_runner()
    self.data_dir = os.path.dirname(data_dir)
    self.langs_to_test = []
    for fn in glob(os.path.join(self.data_dir, "*.*sv")):
        if fn.endswith("csv"):
            delimiter = ","
        elif fn.endswith("psv"):
            delimiter = "|"
        elif fn.endswith("tsv"):
            delimiter = "\t"
        else:
            # Fix: an unrecognized extension previously reused the previous
            # file's delimiter, or raised NameError on the first file.
            LOGGER.warning(f"Unknown delimiter for {fn}; skipping.")
            continue
        with open(fn, encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter=delimiter)
            for row in reader:
                if len(row) < 4:
                    # Fix: the two message fragments previously ran together
                    # without a separating space ("values.Please").
                    LOGGER.warning(
                        f"Row in {fn} containing values {row} does not have the right values. "
                        f"Please check your data.")
                else:
                    self.langs_to_test.append(row)
def doctor(mapping, list_all, list_ipa):
    """ Check for common errors in mappings.

    There should eventually be more checks here, but doctor currently checks for:

    1. Characters that are in IPA mappings but are not recognized by panphon library.

    You can list available mappings with --list-all or --list-ipa, or by visiting
    http://g2p-studio.herokuapp.com/api/v1/langs .
    """
    if list_all or list_ipa:
        # Listing mode: show each out_lang with the in_langs mapped into it.
        out_langs = sorted(set([x["out_lang"] for x in MAPPINGS_AVAILABLE]))
        if list_ipa:
            out_langs = [x for x in out_langs if is_ipa(x)]
        LOGGER.info("Specifying an output language will check all mappings into that language:\n")
        for m in out_langs:
            in_langs = [x["in_lang"] for x in MAPPINGS_AVAILABLE if x["out_lang"] == m]
            print(f"{m}: ", end="")
            print(("\n" + " " * len(m) + "  ").join(in_langs))
        print("")
        return
    known_out_langs = [x["out_lang"] for x in MAPPINGS_AVAILABLE]
    for m in mapping:
        if m not in known_out_langs:
            raise click.UsageError(
                f"No known mappings into '{m}'. "
                "Use --list-all or --list-ipa to list valid options."
            )
        if not is_ipa(m):
            LOGGER.warning(
                f"No checks implemented yet for non-IPA mappings: '{m}' will not be checked."
            )
    if mapping:
        LOGGER.info("Checking the following mappings: \n" + "\n".join(mapping))
    else:
        LOGGER.info("Checking all IPA mappings.")
    check_ipa_known_segs(list(mapping))
def create_mapping(l1_mapping: Mapping, l2_mapping: Mapping) -> Mapping:
    '''
    Create a mapping from the output of l1 and input of l2.
    Both must be either ipa or x-sampa.
    '''
    l1 = l1_mapping.kwargs['out_lang']
    l2 = l2_mapping.kwargs['in_lang']
    inv_l1 = l1_mapping.inventory("out")
    inv_l2 = l2_mapping.inventory()
    if not (is_ipa(l1) or is_xsampa(l1)):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s"
            " (must be ipa or x-sampa)", l1)
    if not (is_ipa(l2) or is_xsampa(l2)):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s"
            " (must be ipa or x-sampa)", l2)
    aligned = align_inventories(inv_l1["inventory"], inv_l2["inventory"],
                                is_xsampa(l1), is_xsampa(l2))
    return Mapping(aligned, in_lang=l1, out_lang=l2)
def scan(lang, path):
    """ Returns the set of non-mapped characters in a document.
    Accounts for case sensitivity in the configuration. """
    # Check input lang exists
    if not lang in LANGS_NETWORK.nodes:
        raise click.UsageError(f"'{lang}' is not a valid value for 'LANG'")
    # Retrieve the mappings for lang
    case_sensitive = True
    mappings = []
    for mapping in MAPPINGS_AVAILABLE:
        mapping_name = mapping["in_lang"]
        # Exclude mappings for converting between IPAs
        if mapping_name.startswith(lang) and "ipa" not in mapping_name:
            case_sensitive = case_sensitive and mapping.get(
                "case_sensitive", True)
            mappings.append(mapping)
    # Get input chars in mapping
    mapped_chars = set()
    for lang_mapping in mappings:
        for x in lang_mapping["mapping_data"]:
            mapped_chars.add(normalize(x["in"], "NFD"))
    # Find unmapped chars
    filter_chars = " \n"
    mapped_string = "".join(mapped_chars)
    # Fix: escape the mapped characters so regex metacharacters such as
    # ']', '\' , '^' or '-' cannot break (or silently alter) the
    # character class being built.
    pattern = "[^" + re.escape(mapped_string) + filter_chars + ".]"
    prog = re.compile(pattern)
    with open(path, "r", encoding="utf8") as file:
        data = normalize(file.read(), "NFD")
        if not case_sensitive:
            data = data.lower()
    unmapped = set(prog.findall(data))
    if unmapped:
        LOGGER.warning("The following characters are not mapped:")
        print(unmapped)
def test_convert(self):
    """Run all CLI conversion cases with and without tokenization, logging
    every failure, then assert on the first failure so unittest registers it."""
    LOGGER.info(
        f"Running {len(self.langs_to_test)} g2p convert test cases found in public/data"
    )
    error_count = 0
    for tok_option in [["--tok", "--check"], ["--no-tok"]]:
        for test in self.langs_to_test:
            output_string = self.runner.invoke(
                convert, [*tok_option, test[2], test[0], test[1]]).stdout.strip()
            if output_string != test[3].strip():
                LOGGER.warning(
                    f"test_cli.py: {test[0]}->{test[1]} mapping error: '{test[2]}' "
                    f"should map to '{test[3]}', got '{output_string}' (with {tok_option})."
                )
                if error_count == 0:
                    first_failed_test = test + [tok_option]
                error_count += 1
    if error_count > 0:
        reference_string = first_failed_test[3]
        output_string = self.runner.invoke(
            convert,
            [
                # Fix: tok_option is a list and was previously passed nested
                # inside the args list instead of being unpacked.
                *first_failed_test[4],  # tok_option
                first_failed_test[2],  # word to convert
                first_failed_test[0],  # in_lang
                first_failed_test[1],  # out_lang
            ],
        ).stdout.strip()
        self.assertEqual(
            output_string,
            reference_string.strip(),
            f"{first_failed_test[0]}->{first_failed_test[1]} mapping error "
            # Fix: this fragment was missing its f-prefix, printing literal braces.
            f"for '{first_failed_test[2]}'.\n"
            "Look for warnings in the log for any more mapping errors",
        )
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', distance: str = "weighted_feature_edit_distance"):
    """Create a mapping from mapping's output inventory to a minimalist dummy inventory"""
    in_lang = mapping.kwargs[f'{io}_lang']
    config = {'in_lang': in_lang, 'out_lang': 'dummy'}
    default_char = 't'
    if is_ipa(in_lang):
        new_mapping = align_inventories(mapping.inventory(io), DUMMY_INVENTORY,
                                        distance=distance)
    else:
        # Romanize non-IPA inventories through und -> und-ipa first.
        und_g2p = make_g2p('und', 'und-ipa')
        new_mapping = []
        for char in mapping.inventory(io):
            ipa_out = und_g2p(unidecode(char).lower()).output_string
            new_mapping.append({"in": unicode_escape(char), "out": ipa_out})
        dummy_list = align_inventories([rule['out'] for rule in new_mapping],
                                       DUMMY_INVENTORY, distance=distance)
        dummy_dict = {entry['in']: entry['out']
                      for entry in dummy_list if entry['in']}
        for rule in new_mapping:
            if rule['out'] in dummy_dict:
                rule['out'] = dummy_dict[rule['out']]
            else:
                LOGGER.warning(
                    f"We couldn't guess at what {rule['in']} means, so it's being "
                    f"replaced with '{default_char}' instead.")
                rule['out'] = default_char
    config['mapping'] = new_mapping
    return Mapping(**config)
def is_panphon(string, display_warnings=False):
    """Return True iff every word in `string` is valid panphon IPA.

    With display_warnings=True, a warning is logged for each problem found
    and all words are scanned; otherwise the function returns False at the
    first invalid word.
    """
    # Deferred importing required here, because g2p.transducer also imports this file.
    # Such circular dependency is probably bad design, maybe a reviewer of this code will
    # have a better solution to recommend?
    import g2p.transducer

    dst = getPanphonDistanceSingleton()
    # Run the input through the panphon preprocessor mapping before checking.
    panphon_preprocessor = g2p.transducer.Transducer(
        Mapping(id="panphon_preprocessor"))
    preprocessed_string = panphon_preprocessor(string).output_string
    # Use a loop that prints the warnings on all strings that are not panphon, even though
    # logically this should not be necessary to calculate the answer.
    result = True
    for word in preprocessed_string.split():
        word_ipa_segs = dst.fm.ipa_segs(word)
        word_ipa = "".join(word_ipa_segs)
        # A word is valid IPA iff panphon's segmentation reproduces it exactly.
        if word != word_ipa:
            if not display_warnings:
                return False
            LOGGER.warning(
                f'String "{word}" is not identical to its IPA segmentation: {word_ipa_segs}'
            )
            # One-time hints for common confusions; the "already printed"
            # flags are stored as attributes on the function object itself.
            if "g" in word and not is_panphon.g_warning_printed:
                LOGGER.warning(
                    f"Common IPA gotcha: the ASCII 'g' character is not IPA, use 'ɡ' (\\u0261) instead."
                )
                is_panphon.g_warning_printed = True
            if ":" in word and not is_panphon.colon_warning_printed:
                LOGGER.warning(
                    f"Common IPA gotcha: the ASCII ':' character is not IPA, use 'ː' (\\u02D0) instead."
                )
                is_panphon.colon_warning_printed = True
            # Point at the specific characters panphon did not recognize.
            for c in word:
                if c not in word_ipa:
                    LOGGER.warning(
                        f"Character '{c}' (\\u{format(ord(c), '04x')}) in word '{word}' "
                        "was not recognized as IPA by panphon.")
            result = False
    return result
def create_mapping(mapping_1: Mapping, mapping_2: Mapping, mapping_1_io: str = 'out', mapping_2_io: str = 'in', write_to_file: bool = False, out_dir: str = '') -> Mapping:
    """Align mapping_1's inventory with mapping_2's and return the new Mapping.

    Optionally writes the generated config and mapping to out_dir, or to
    the default location when out_dir is empty.
    """
    lang_1 = mapping_1.kwargs[f'{mapping_1_io}_lang']
    lang_2 = mapping_2.kwargs[f'{mapping_2_io}_lang']
    xsampa_1, xsampa_2 = is_xsampa(lang_1), is_xsampa(lang_2)
    if not is_ipa(lang_1) and not xsampa_1:
        LOGGER.warning("Unsupported orthography of inventory 1: %s"
                       " (must be ipa or x-sampa)", lang_1)
    if not is_ipa(lang_2) and not xsampa_2:
        LOGGER.warning("Unsupported orthography of inventory 2: %s"
                       " (must be ipa or x-sampa)", lang_2)
    aligned = align_inventories(mapping_1.inventory(mapping_1_io),
                                mapping_2.inventory(mapping_2_io),
                                xsampa_1, xsampa_2)
    # Initialize mapping with input language parameters (as_is,
    # case_sensitive, prevent_feeding, etc)
    config = mapping_1.kwargs.copy()
    # Fix up names, etc.
    for key in ('authors', 'display_name', 'language_name'):
        config.pop(key, None)
    config['in_lang'] = lang_1
    config['out_lang'] = lang_2
    config['mapping'] = aligned
    new_mapping = Mapping(**config)
    if write_to_file:
        if not out_dir:
            new_mapping.config_to_file()
            new_mapping.mapping_to_file()
        elif os.path.isdir(out_dir):
            new_mapping.config_to_file(out_dir)
            new_mapping.mapping_to_file(out_dir)
        else:
            LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
    return new_mapping
def make_tokenizer(self, in_lang, out_lang=None, tok_path=None):
    """Return (and cache) a tokenizer for in_lang.

    If out_lang/tok_path are not given, an output language is inferred from
    LANGS_NETWORK (preferring IPA successors, then two-hop IPA paths, then
    any direct successor); when nothing is found the default tokenizer is
    used. Tokenizers are cached by (in_lang, out_lang, tok_path) key.
    """
    tokenizer_key = self.make_tokenizer_key(in_lang, out_lang, tok_path)
    if not self.tokenizers.get(tokenizer_key):
        # This tokenizer was not created yet, initialize it now.
        if tok_path:
            # Explicit path given: derive out_lang from its first hops.
            # LOGGER.warning(f"in_lang={in_lang} tok_path={tok_path}")
            if tok_path[0] != in_lang:
                raise ValueError(
                    "calling make_tokenizer() with tok_path requires that tok_path[0] == in_lang"
                )
            assert len(tok_path) >= 2
            # NOTE(review): the `== 3` / `> 3` length tests below look like
            # they may have been intended as `<=` comparisons — confirm.
            if len(tok_path) == 2 or is_ipa(tok_path[1]):
                out_lang = tok_path[1]
            elif len(tok_path) == 3 or is_ipa(tok_path[2]):
                out_lang = tok_path[1:3]
            elif len(tok_path) > 3 and is_ipa(tok_path[3]):
                out_lang = tok_path[1:4]
            else:
                out_lang = tok_path[1:3]
        if not out_lang:
            # No explicit target: infer one from the language network.
            try:
                successors = [x for x in LANGS_NETWORK.successors(in_lang)]
            except NetworkXError:
                successors = []
            ipa_successors = [x for x in successors if is_ipa(x)]
            # LOGGER.warning(pprint.pformat([in_lang, "->", successors, ipa_successors]))
            if ipa_successors:
                # in_lang has an ipa successor, tokenize using it
                # there currently are no langs with more than 1 IPA successor, but to
                # be future-proof we'll arbitrarily take the first if there are more.
                out_lang = ipa_successors[0]
            else:
                # There is no direct IPA successor, look for a two-hop path to -ipa
                for x in successors:
                    ipa_successors_two_hops = [
                        y for y in LANGS_NETWORK.successors(x) if is_ipa(y)
                    ]
                    # LOGGER.warning(pprint.pformat([in_lang, x, "->", [ipa_successors_two_hops]]))
                    if ipa_successors_two_hops:
                        out_lang = [x, ipa_successors_two_hops[0]]
                        break
                # There is no two-hop IPA successor, use the first direct successor
                if out_lang is None and successors:
                    out_lang = successors[0]
        # LOGGER.warning(f"Tokenizer for {in_lang} is {out_lang}.")
        if out_lang is None:
            # Default tokenizer:
            self.tokenizers[tokenizer_key] = self.tokenizers[None]
        elif isinstance(out_lang, list):
            # Build a multi-hop tokenizer
            assert len(out_lang) > 1
            try:
                mappings = [Mapping(in_lang=in_lang, out_lang=out_lang[0])]
                for i in range(1, len(out_lang)):
                    mappings.append(
                        Mapping(in_lang=out_lang[i - 1], out_lang=out_lang[i]))
                self.tokenizers[tokenizer_key] = MultiHopTokenizer(
                    mappings)
            except MappingMissing:
                # Fall back to the default tokenizer on a missing mapping.
                self.tokenizers[tokenizer_key] = self.tokenizers[None]
                LOGGER.warning(
                    f"missing mapping yet we looked for mappings in graph for {in_lang}-{out_lang}."
                )
        else:
            # Build a one-hop tokenizer
            try:
                mapping = Mapping(in_lang=in_lang, out_lang=out_lang)
                self.tokenizers[tokenizer_key] = Tokenizer(mapping)
            except MappingMissing:
                self.tokenizers[tokenizer_key] = self.tokenizers[None]
                LOGGER.warning(
                    f"Cannot find mapping from '{in_lang}' to '{out_lang}'. Using default tokenizer instead"
                )
        # Hack for Tlingit using dot as a letter when non word-final
        if in_lang == "tli":
            self.tokenizers[tokenizer_key].dot_is_letter = True
    return self.tokenizers.get(tokenizer_key)
def create_multi_mapping(
    src_mappings: List[Tuple[Mapping, str]],
    tgt_mappings: List[Tuple[Mapping, str]],
    distance: str = "weighted_feature_edit_distance",
) -> Mapping:
    """Create a mapping for a set of source mappings to a set of target mappings

    Each src/tgt mappings is a (mapping: Mapping, in_or_out: str) pair
    specifying the mapping to use and whether its input ("in") or output
    ("out") inventory should be used to create the new mapping.

    The name of the mapping is infered from src_mappings[0] and
    tgt_mappings[0]'s metadata.
    """

    def compact_ipa_names(ipa_names: Iterable) -> str:
        # ["fra-ipa", "eng-ipa", "kwk-ipa"] -> "fra-eng-kwk-ipa"
        trimmed = [name[:-4] if name.endswith("-ipa") else name
                   for name in ipa_names]
        return "-".join(trimmed) + "-ipa"

    def long_ipa_names(ipa_names: Iterable) -> str:
        # ["fra-ipa", "eng-ipa", "kwk-ipa"] -> "fra-ipa and eng-ipa and kwk-ipa"
        return " and ".join(ipa_names)

    def get_sorted_unique_names(mappings: List[Tuple[Mapping, str]]) -> List[str]:
        names = {m.kwargs[f"{in_or_out}_lang"] for m, in_or_out in mappings}
        return sorted(names)

    def deduplicate(iterable: Iterable) -> List:
        # Use a dict, and not a set, to preserve the original order.
        return list({v: v for v in iterable}.values())

    src_names = get_sorted_unique_names(src_mappings)
    tgt_names = get_sorted_unique_names(tgt_mappings)

    src_inventory = []
    for m, io in src_mappings:
        lang = m.kwargs[f"{io}_lang"]
        if not is_ipa(lang):
            LOGGER.warning(
                "Unsupported orthography of src inventory: %s; must be IPA", lang
            )
        src_inventory.extend(m.inventory(io))
    src_inventory = deduplicate(src_inventory)

    tgt_inventory = []
    for m, io in tgt_mappings:
        lang = m.kwargs[f"{io}_lang"]
        if not is_ipa(lang):
            LOGGER.warning(
                "Unsupported orthography of tgt inventory: %s; must be IPA", lang
            )
        tgt_inventory.extend(m.inventory(io))
    tgt_inventory = deduplicate(tgt_inventory)

    aligned = align_inventories(src_inventory, tgt_inventory, distance=distance)

    config = {
        "in_lang": compact_ipa_names(src_names),
        "out_lang": compact_ipa_names(tgt_names),
        "language_name": "IPA",
        "rule_ordering": "apply-longest-first",
        "mapping": aligned,
        "prevent_feeding": True,
        "norm_form": "NFC",
        "display_name": (
            long_ipa_names(src_names) + " to " + long_ipa_names(tgt_names)
        ),
    }
    return Mapping(**config)
def load_mapping_from_path(path_to_mapping_config, index=0):
    """
    Loads a mapping from a path, if there is more than one mapping, then it
    loads based on the int provided to the 'index' argument. Default is 0.
    """
    path = Path(path_to_mapping_config)
    # If path leads to actual mapping config
    if path.exists() and (path.suffix.endswith("yml") or path.suffix.endswith("yaml")):
        # safe load it
        with open(path, encoding="utf8") as f:
            mapping = yaml.safe_load(f)
        # If more than one mapping in the mapping config
        if "mappings" in mapping:
            try:
                LOGGER.debug(
                    'Loading mapping from %s between "%s" and "%s" at index %s',
                    path_to_mapping_config,
                    mapping["mappings"][index].get("in_lang", "und"),
                    mapping["mappings"][index].get("out_lang", "und"),
                    index,
                )
                mapping = mapping["mappings"][index]
            except KeyError:
                # NOTE(review): an out-of-range list index raises IndexError,
                # not KeyError, so it would propagate — confirm intent.
                LOGGER.warning(
                    "An index of %s was provided for the mapping %s but that index does not exist in the mapping. "
                    "Please check your mapping.",
                    index,
                    path_to_mapping_config,
                )
        # Log the warning if an Index other than 0 was provided for a mapping config with a single mapping.
        elif index != 0:
            LOGGER.warning(
                "An index of %s was provided for the mapping %s but that index does not exist in the mapping. "
                "Please check your mapping.",
                index,
                path_to_mapping_config,
            )
        # try to load the data from the mapping data file
        if "mapping" in mapping:
            try:
                mapping["mapping_data"] = load_from_file(
                    os.path.join(path.parent, mapping["mapping"])
                )
            except (OSError, exceptions.IncorrectFileType) as e:
                # Wrap low-level load errors as MalformedMapping, keeping the cause.
                raise exceptions.MalformedMapping(
                    f"Cannot load mapping data file specified in {path}: {e}"
                ) from e
        elif mapping.get("type", "") == "unidecode":
            # This mapping is not implemented as a regular mapping, but as custom software
            pass
        else:
            # Is "mapping" key missing?
            raise exceptions.MalformedMapping(
                'Key "mapping:" missing from a mapping in {}.'.format(path)
            )
        # load any abbreviations
        if "abbreviations" in mapping:
            try:
                mapping["abbreviations_data"] = load_abbreviations_from_file(
                    os.path.join(path.parent, mapping["abbreviations"])
                )
            except (OSError, exceptions.IncorrectFileType) as e:
                raise exceptions.MalformedMapping(
                    f"Cannot load abbreviations data file specified in {path}: {e}"
                ) from e
        return mapping
    else:
        # Not an existing yml/yaml config file.
        raise FileNotFoundError
def process_kwargs(self, mapping):
    ''' Apply kwargs in the order they are provided. kwargs are ordered as of python 3.6.

    Args:
        mapping: list of rule dicts (each with at least "in" and "out" keys).

    Returns:
        The processed list of rules, each augmented with a "match_pattern"
        regex; rules whose pattern comes back empty are dropped.
    '''
    # Translate the deprecated boolean "as_is" into its "rule_ordering" alias.
    if 'as_is' in self.kwargs:
        as_is = self.kwargs['as_is']
        if as_is:
            appropriate_setting = "as-written"
        else:
            appropriate_setting = "apply-longest-first"
        self.kwargs["rule_ordering"] = appropriate_setting
        LOGGER.warning(
            f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
            'is using the deprecated parameter "as_is"; '
            f"replace `as_is: {as_is}` with `rule_ordering: {appropriate_setting}`"
        )

    # Add defaults
    if 'rule_ordering' in self.kwargs:
        # right now, "rule-ordering" is a more explict alias of the "as-is" option.
        ordering = self.kwargs["rule_ordering"]
        if ordering not in ("as-written", "apply-longest-first"):
            LOGGER.error(
                f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                f"has invalid value '{ordering}' for rule_ordering parameter; "
                "rule_ordering must be one of "
                '"as-written" or "apply-longest-first"')
    else:
        self.kwargs["rule_ordering"] = "as-written"
    if 'case_sensitive' not in self.kwargs:
        self.kwargs['case_sensitive'] = True
    if 'escape_special' not in self.kwargs:
        self.kwargs['escape_special'] = False
    if 'norm_form' not in self.kwargs:
        self.kwargs['norm_form'] = 'NFD'
    if 'reverse' not in self.kwargs:
        self.kwargs['reverse'] = False
    if 'prevent_feeding' not in self.kwargs:
        self.kwargs['prevent_feeding'] = False
    if 'in_lang' not in self.kwargs:
        self.kwargs['in_lang'] = 'und'
    if 'out_lang' not in self.kwargs:
        self.kwargs['out_lang'] = 'und'

    # Process kwargs in order received
    for kwarg, val in self.kwargs.items():
        if kwarg == 'rule_ordering' and self.wants_rules_sorted():
            # sort by reverse len
            mapping = sorted(mapping, key=lambda x: len(x["in"]), reverse=True)
        elif kwarg == 'escape_special' and val:
            mapping = [escape_special_characters(x) for x in mapping]
        elif kwarg == 'norm_form' and val:
            for io in mapping:
                for k, v in io.items():
                    if isinstance(v, str):
                        io[k] = normalize(v, self.kwargs['norm_form'])
        elif kwarg == 'reverse' and val:
            mapping = self.reverse_mappings(mapping)

    # After all processing is done, turn into regex.
    # BUG FIX: the previous code called mapping.remove(io) while iterating
    # over `mapping`, which silently skips the rule immediately following any
    # dropped rule, and used mapping.index(io), which returns the wrong
    # position when identical rules occur more than once.  Collect survivors
    # in a fresh list and take the index from enumerate instead.
    processed_mapping = []
    for i, io in enumerate(mapping):
        if self.kwargs['prevent_feeding'] or io.get('prevent_feeding'):
            io['intermediate_form'] = self._string_to_pua(io['out'], i)
        io['match_pattern'] = self.rule_to_regex(io)
        if io['match_pattern']:
            processed_mapping.append(io)
    self.processed = True
    return processed_mapping
def config_to_file(self,
                   output_path: str = os.path.join(GEN_DIR, 'config.yaml'),
                   mapping_type: str = 'json'):
    ''' Write config to file.

    If output_path is a directory, write (or update) the config.yaml inside
    it; if it is an existing file, append/update this mapping in that file.
    '''
    add_config = False
    # BUG FIX: resolve a directory target to its config.yaml *before*
    # checking for an existing file.  Previously a directory already
    # containing a config.yaml fell into the isdir branch with
    # add_config=False, so the existing config was silently overwritten
    # instead of updated (the other config_to_file variant in this file
    # already does it this way).
    if os.path.isdir(output_path):
        output_path = os.path.join(output_path, 'config.yaml')
    if os.path.exists(output_path) and os.path.isfile(output_path):
        LOGGER.warning(f'Adding mapping config to file at {output_path}')
        fn = output_path
        add_config = True
    else:
        LOGGER.warning(f'writing mapping config to file at {output_path}')
        fn = output_path
    template = {
        "mappings": [{
            "language_name": self.kwargs.get('language_name', self.kwargs.get('in_lang', 'und')),
            "display_name": self.kwargs.get(
                'display_name',
                self.kwargs.get('in_lang', 'und')
                + " " + self.mapping_type(self.kwargs.get('out_lang', 'und'))
                + " to "
                + self.kwargs.get('out_lang', 'und')
                + " " + self.mapping_type(self.kwargs.get('out_lang', 'und'))),
            "in_lang": self.kwargs.get('in_lang', 'und'),
            "out_lang": self.kwargs.get('out_lang', 'und'),
            "authors": self.kwargs.get('authors', [f'Generated {dt.datetime.now()}']),
            "as_is": not self.wants_rules_sorted(),  # TODO: rule_ordering
            "prevent_feeding": self.kwargs.get('prevent_feeding', False),
            "case_sensitive": self.kwargs.get('case_sensitive', True),
            "escape_special": self.kwargs.get('escape_special', False),
            "norm_form": self.kwargs.get('norm_form', "NFD"),
            "reverse": self.kwargs.get('reverse', False),
            "mapping": self.kwargs.get('in_lang', 'und') + "_to_"
            + self.kwargs.get('out_lang', 'und') + '.' + mapping_type
        }]
    }
    # If config file exists already, just add the mapping.
    if add_config:
        with open(fn, encoding='utf8') as f:
            existing_data = yaml.safe_load(f.read())
        updated = False
        for i, mapping in enumerate(existing_data['mappings']):
            # if the mapping exists, just update the generation data
            if mapping['in_lang'] == template['mappings'][0][
                    'in_lang'] and mapping['out_lang'] == template[
                        'mappings'][0]['out_lang']:
                existing_data['mappings'][i]['authors'] = template[
                    'mappings'][0]['authors']
                updated = True
                break
        if not updated:
            existing_data['mappings'].append(template['mappings'][0])
        template = existing_data
    with open(fn, 'w', encoding='utf8') as f:
        yaml.dump(template, f, Dumper=IndentDumper, default_flow_style=False)
def config_to_file(
    self,
    output_path: str = os.path.join(GEN_DIR, "config.yaml"),
    mapping_type: str = "json",
):
    """ Write config to file """
    # A directory target means "config.yaml inside that directory".
    if os.path.isdir(output_path):
        output_path = os.path.join(output_path, "config.yaml")
    merge_existing = os.path.exists(output_path) and os.path.isfile(output_path)
    if merge_existing:
        LOGGER.warning(f"Adding mapping config to file at {output_path}")
    else:
        LOGGER.warning(f"writing mapping config to file at {output_path}")
    target = output_path

    # Build the config entry for this mapping from kwargs, falling back to
    # generated defaults where a value was not supplied.
    in_lang = self.kwargs.get("in_lang", "und")
    out_lang = self.kwargs.get("out_lang", "und")
    default_display = (
        in_lang + " " + self.mapping_type(out_lang)
        + " to "
        + out_lang + " " + self.mapping_type(out_lang)
    )
    entry = {
        "language_name": self.kwargs.get("language_name", in_lang),
        "display_name": self.kwargs.get("display_name", default_display),
        "in_lang": in_lang,
        "out_lang": out_lang,
        "authors": self.kwargs.get("authors", [f"Generated {dt.datetime.now()}"]),
        "rule_ordering": self.kwargs.get("rule_ordering", "as-written"),
        "prevent_feeding": self.kwargs.get("prevent_feeding", False),
        "case_sensitive": self.kwargs.get("case_sensitive", True),
        "escape_special": self.kwargs.get("escape_special", False),
        "norm_form": self.kwargs.get("norm_form", "NFD"),
        "reverse": self.kwargs.get("reverse", False),
        "mapping": in_lang + "_to_" + out_lang + "." + mapping_type,
    }
    template = {"mappings": [entry]}

    # If config file exists already, just add the mapping (or refresh the
    # generation info of a matching one) instead of replacing the file.
    if merge_existing:
        with open(target, encoding="utf8") as f:
            existing_data = yaml.safe_load(f.read())
        for known in existing_data["mappings"]:
            # if the mapping exists, just update the generation data
            if known["in_lang"] == entry["in_lang"] and known["out_lang"] == entry["out_lang"]:
                known["authors"] = entry["authors"]
                break
        else:
            existing_data["mappings"].append(entry)
        template = existing_data

    with open(target, "w", encoding="utf8") as f:
        yaml.dump(template, f, Dumper=IndentDumper, default_flow_style=False)
def convert(
    in_lang,
    out_lang,
    input_text,
    path,
    tok,
    check,
    debugger,
    pretty_edges,
    tok_lang,
    config,
):
    """Convert INPUT_TEXT through g2p mapping(s) from IN_LANG to OUT_LANG.

    Visit http://g2p-studio.herokuapp.com/api/v1/langs for a list of languages.

    There must be a path from IN_LANG to OUT_LANG, possibly via some intermediates.
    For example, mapping from fra to eng-arpabet will successively apply
    fra->fra-ipa, fra-ipa->eng-ipa and eng-ipa->eng-arpabet.
    """
    # Check valid input
    # Check input != output
    if in_lang == out_lang:
        raise click.UsageError(
            "Values must be different for 'IN_LANG' and 'OUT_LANG'")
    if config:
        # Register the mappings from the user-supplied config file into the
        # module-level language network so they can be used below.
        # NOTE: this mutates the global LANGS_NETWORK and MAPPINGS_AVAILABLE.
        # This isn't that DRY - copied from g2p/mappings/langs/__init__.py
        mappings_legal_pairs = []
        with open(config, encoding="utf8") as f:
            data = yaml.safe_load(f)
        if "mappings" in data:
            # Multi-mapping config: collect every (in_lang, out_lang) pair and
            # load each mapping (with its data files) by index.
            for index, mapping in enumerate(data["mappings"]):
                mappings_legal_pairs.append((
                    data["mappings"][index]["in_lang"],
                    data["mappings"][index]["out_lang"],
                ))
                data["mappings"][index] = load_mapping_from_path(config, index)
        else:
            # Single-mapping config: normalize it into a one-element list.
            mapping = load_mapping_from_path(config)
            data["mappings"] = [mapping]
            mappings_legal_pairs.append(
                (mapping["in_lang"], mapping["out_lang"]))
        for pair in mappings_legal_pairs:
            # Only the input-language name is checked for collisions here.
            if pair[0] in LANGS_NETWORK.nodes:
                LOGGER.warning(
                    f"A mapping with the name '{pair[0]}' is already defined in g2p. "
                    "Your local mapping with the same name might not function properly."
                )
        LANGS_NETWORK.add_edges_from(mappings_legal_pairs)
        MAPPINGS_AVAILABLE.extend(data["mappings"])
    # Check input lang exists
    if in_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{in_lang}' is not a valid value for 'IN_LANG'")
    # Check output lang exists
    if out_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{out_lang}' is not a valid value for 'OUT_LANG'")
    # Check if path exists
    if not has_path(LANGS_NETWORK, in_lang, out_lang):
        raise click.UsageError(
            f"Path between '{in_lang}' and '{out_lang}' does not exist")
    # If INPUT_TEXT names an existing text file, read its contents instead.
    # NOTE(review): endswith("txt") (no dot) also matches names like "footxt";
    # presumably ".txt" was intended — confirm before changing.
    if os.path.exists(input_text) and input_text.endswith("txt"):
        with open(input_text, encoding="utf8") as f:
            input_text = f.read()
    # Determine which tokenizer to use, if any
    if tok is not None and not tok and tok_lang is not None:
        raise click.UsageError(
            "Specified conflicting --no-tok and --tok-lang options.")
    if tok and tok_lang is None:
        # "path" means: tokenize using the mappings along the conversion path.
        tok_lang = "path"
    # Transduce!!!
    # NOTE(review): if neither (in_lang and out_lang) nor path is truthy,
    # `transducer` is never bound and the call below raises NameError —
    # presumably the click argument declarations make that unreachable; verify.
    if in_lang and out_lang:
        transducer = make_g2p(in_lang, out_lang, tok_lang=tok_lang)
    elif path:
        transducer = Transducer(Mapping(path))
    tg = transducer(input_text)
    if check:
        transducer.check(tg, display_warnings=True)
    outputs = [tg.output_string]
    if pretty_edges:
        outputs += [tg.pretty_edges()]
    if debugger:
        outputs += [tg.edges, tg.debugger]
    # Pretty-print when extra diagnostics were requested; otherwise just echo
    # the converted string.
    if len(outputs) > 1:
        click.echo(pprint.pformat(outputs, indent=4))
    else:
        click.echo(tg.output_string)