def test_unicode_escape(self):
    """Check that textual backslash-u escapes (as written in CSVs) become real characters.

    Each case pairs the expected one-character string with the escaped,
    six-character text that ``utils.unicode_escape`` receives.
    """
    cases = (
        ('\u0000', '\\u0000'),
        ('\u0331', '\\u0331'),
        ('\u26F0', '\\u26F0'),
    )
    for expected, escaped in cases:
        self.assertEqual(expected, utils.unicode_escape(escaped))
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False):
    """Align one side of ``mapping`` to a small hard-coded dummy inventory.

    Args:
        mapping: Source mapping; its ``kwargs`` must contain ``f'{io}_lang'``
            and it must provide ``inventory(io)``.
        io: Which side of the mapping to align (used both for the
            ``'{io}_lang'`` lookup and for ``mapping.inventory``).
        write_to_file: When true, pass the generated config and rules to
            ``write_generated_mapping_to_file``.

    Returns:
        A ``(config, mapping)`` tuple: the config built by ``generate_config``
        and the generated rules (the result of ``align_inventories`` for IPA
        input, otherwise a list of ``{"in": ..., "out": ...}`` dicts).
    """
    dummy_inventory = ["ɑ", "i", "u", "t", "s", "n"]
    display_name = mapping.kwargs.get('language_name', 'No Language display name in Config')
    config = generate_config(mapping.kwargs[f'{io}_lang'], 'dummy', display_name, display_name)
    # Substituted whenever no dummy equivalent could be found for a character.
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        mapping = align_inventories(mapping.inventory(io), dummy_inventory)
    else:
        # Non-IPA input: romanise with unidecode, push through the und -> und-ipa
        # transducer, then align the resulting strings to the dummy inventory.
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower())}
                   for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], dummy_inventory)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                # Logger.warn is a deprecated alias; use Logger.warning.
                LOGGER.warning(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char
    if write_to_file:
        write_generated_mapping_to_file(config, mapping)
    return config, mapping
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False, out_dir: str = ''):
    """Build a ``Mapping`` from one side of ``mapping`` onto ``DUMMY_INVENTORY``.

    Args:
        mapping: Source mapping; its ``kwargs`` must contain ``f'{io}_lang'``
            and it must provide ``inventory(io)``.
        io: Which side of the mapping to align.
        write_to_file: When true, write the new mapping's config and rules
            via ``config_to_file`` / ``mapping_to_file``.
        out_dir: Directory to write into. If empty, or if it is not an
            existing directory (a warning is logged in that case), the
            mapping's default location is used instead.

    Returns:
        The newly constructed ``Mapping`` whose ``out_lang`` is ``'dummy'``.
    """
    config = {'in_lang': mapping.kwargs[f'{io}_lang'], 'out_lang': 'dummy'}
    # Substituted whenever no dummy equivalent could be found for a character.
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        mapping = align_inventories(mapping.inventory(io), DUMMY_INVENTORY)
    else:
        # Non-IPA input: romanise with unidecode, push through the und -> und-ipa
        # transducer, then align the resulting strings to the dummy inventory.
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower()).output_string}
                   for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], DUMMY_INVENTORY)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                # Logger.warn is a deprecated alias; use Logger.warning.
                LOGGER.warning(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char
    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir and os.path.isdir(out_dir):
            mapping.config_to_file(out_dir)
            mapping.mapping_to_file(out_dir)
        else:
            if out_dir:
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
            # Previously the invalid-directory case logged the message above
            # but wrote nothing; now it really falls back to the default.
            mapping.config_to_file()
            mapping.mapping_to_file()
    return mapping
def __init__(self, mapping=None, abbreviations: Union[str, DefaultDict[str, List[str]]] = False, **kwargs):
    """Load the mapping rules, expand abbreviations, and apply the keyword options.

    Args:
        mapping: Where the rules come from:
            * a list: validated and used directly;
            * a string ending in ``yaml``/``yml``: loaded with
              ``load_mapping_from_path`` and handed to ``process_loaded_config``;
            * any other string: loaded with ``load_from_file`` and validated;
            * anything else: looked up from ``in_lang``/``out_lang`` or ``id``
              in ``kwargs``.
        abbreviations: A ``defaultdict`` (or any falsy value) is stored as is;
            any other truthy value is passed to ``load_abbreviations_from_file``.
        **kwargs: Mapping options, kept in ``self.kwargs``.

    Raises:
        exceptions.MalformedLookup: If ``mapping`` is neither a list nor a
            string and ``kwargs`` has neither both language codes nor an ``id``.
    """
    # NOTE: these options should eventually be explicit parameters rather than kwargs.
    self.allowable_kwargs = [
        'language_name', 'display_name', 'mapping', 'in_lang', 'out_lang',
        'out_delimiter', 'as_is', 'case_sensitive', 'rule_ordering',
        'escape_special', 'norm_form', 'prevent_feeding', 'reverse',
    ]
    self.kwargs = OrderedDict(kwargs)
    self.processed = False

    if not isinstance(abbreviations, defaultdict) and abbreviations:
        self.abbreviations = load_abbreviations_from_file(abbreviations)
    else:
        self.abbreviations = abbreviations

    # Work out where the rules come from.
    if isinstance(mapping, list):
        # User-supplied list of rules.
        self.mapping = validate(mapping)
    elif isinstance(mapping, str):
        if mapping.endswith(('yaml', 'yml')):
            self.process_loaded_config(load_mapping_from_path(mapping))
        else:
            self.mapping = validate(load_from_file(mapping))
    elif "in_lang" in self.kwargs and "out_lang" in self.kwargs:
        found = find_mapping(self.kwargs['in_lang'], self.kwargs['out_lang'])
        self.process_loaded_config(found)
    elif 'id' in self.kwargs:
        found = self.find_mapping_by_id(self.kwargs['id'])
        self.process_loaded_config(found)
    else:
        raise exceptions.MalformedLookup()

    if self.abbreviations:
        expandable_fields = ('in', 'out', 'context_before', 'context_after')
        for abb, stands_for in self.abbreviations.items():
            pattern = re.compile(abb)
            alternatives = '|'.join(stands_for)
            # Only expand when there are rules and the first one carries no
            # 'match_pattern' key.
            if not self.mapping or 'match_pattern' in self.mapping[0]:
                continue
            for rule in self.mapping:
                for field in rule.keys():
                    if field not in expandable_fields:
                        continue
                    if pattern.search(rule[field]):
                        rule[field] = pattern.sub(unicode_escape(alternatives), rule[field])

    if not self.processed:
        self.mapping = self.process_kwargs(self.mapping)
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', distance: str = "weighted_feature_edit_distance"):
    """Return a new Mapping from the ``io`` inventory of ``mapping`` onto ``DUMMY_INVENTORY``.

    IPA inventories (per ``is_ipa``) are aligned directly. Anything else is
    first romanised with ``unidecode``, lower-cased and run through the
    ``und`` -> ``und-ipa`` transducer; the resulting strings are then aligned.
    Characters with no alignment are mapped to ``'t'`` and a warning is logged.
    ``distance`` is forwarded unchanged to ``align_inventories``.
    """
    source_lang = mapping.kwargs[f'{io}_lang']
    config = {'in_lang': source_lang, 'out_lang': 'dummy'}
    default_char = 't'

    if is_ipa(source_lang):
        rules = align_inventories(mapping.inventory(io), DUMMY_INVENTORY, distance=distance)
    else:
        und_g2p = make_g2p('und', 'und-ipa')
        rules = [
            {"in": unicode_escape(char), "out": und_g2p(unidecode(char).lower()).output_string}
            for char in mapping.inventory(io)
        ]
        aligned = align_inventories([rule['out'] for rule in rules], DUMMY_INVENTORY, distance=distance)
        # Lookup table: intermediate IPA string -> dummy symbol (empty inputs skipped).
        to_dummy = {pair['in']: pair['out'] for pair in aligned if pair['in']}
        for rule in rules:
            try:
                rule['out'] = to_dummy[rule['out']]
            except KeyError:
                LOGGER.warning(
                    f"We couldn't guess at what {rule['in']} means, so it's being "
                    f"replaced with '{default_char}' instead.")
                rule['out'] = default_char

    config['mapping'] = rules
    return Mapping(**config)
def __init__(
    self,
    mapping=None,
    abbreviations: Union[str, DefaultDict[str, List[str]]] = False,
    **kwargs,
):
    """Load the mapping rules, expand abbreviations, and apply the keyword options.

    Args:
        mapping: Where the rules come from:
            * a list: validated and used directly;
            * a string ending in ``yaml``/``yml``: loaded with
              ``load_mapping_from_path`` and handed to ``process_loaded_config``;
            * any other string: loaded with ``load_from_file`` and validated;
            * anything else: looked up from ``in_lang``/``out_lang`` or ``id``
              in ``kwargs``, or an empty rule list when ``type`` is
              ``"unidecode"``.
        abbreviations: A ``defaultdict`` (or any falsy value) is stored as is;
            any other truthy value is passed to ``load_abbreviations_from_file``.
        **kwargs: Mapping options, kept in ``self.kwargs``.

    Raises:
        exceptions.MalformedLookup: If none of the sources above applies.
    """
    # NOTE: these options should eventually be explicit parameters rather than kwargs.
    self.allowable_kwargs = [
        "language_name", "display_name", "mapping", "in_lang", "out_lang",
        "out_delimiter", "as_is", "case_sensitive", "rule_ordering",
        "escape_special", "norm_form", "prevent_feeding", "reverse", "type",
    ]
    self.kwargs = OrderedDict(kwargs)
    self.processed = False

    if not isinstance(abbreviations, defaultdict) and abbreviations:
        self.abbreviations = load_abbreviations_from_file(abbreviations)
    else:
        self.abbreviations = abbreviations

    # Work out where the rules come from.
    if isinstance(mapping, list):
        # User-supplied list of rules.
        self.mapping = validate(mapping, path="user-supplied mapping")
    elif isinstance(mapping, str):
        if mapping.endswith(("yaml", "yml")):
            self.process_loaded_config(load_mapping_from_path(mapping))
        else:
            self.mapping = validate(load_from_file(mapping), path=mapping)
    elif "in_lang" in self.kwargs and "out_lang" in self.kwargs:
        found = find_mapping(self.kwargs["in_lang"], self.kwargs["out_lang"])
        self.process_loaded_config(found)
    elif "id" in self.kwargs:
        found = self.find_mapping_by_id(self.kwargs["id"])
        self.process_loaded_config(found)
    elif self.kwargs.get("type", "") == "unidecode":
        self.mapping = []
    else:
        raise exceptions.MalformedLookup()

    if self.abbreviations:
        expandable_fields = ("in", "out", "context_before", "context_after")
        # Longest abbreviation names are substituted first.
        longest_first = sorted(
            self.abbreviations.items(),
            key=lambda item: len(item[0]),
            reverse=True,
        )
        for abb, stands_for in longest_first:
            pattern = re.compile(abb)
            alternatives = "|".join(stands_for)
            # Only expand when there are rules and the first one carries no
            # "match_pattern" key.
            if not self.mapping or "match_pattern" in self.mapping[0]:
                continue
            for rule in self.mapping:
                for field in rule.keys():
                    if field not in expandable_fields:
                        continue
                    if pattern.search(rule[field]):
                        rule[field] = pattern.sub(unicode_escape(alternatives), rule[field])

    if not self.processed:
        self.mapping = self.process_kwargs(self.mapping)