Example #1
0
 def test_unicode_escape(self):
     r"""Check that escaped codepoints are decoded into real characters.

     A literal backslash-u sequence such as \u0331 (the way it would be
     declared in a CSV) must come back from ``utils.unicode_escape`` as the
     single Unicode character for that codepoint.
     """
     for hex_digits in ('0000', '0331', '26F0'):
         expected_char = chr(int(hex_digits, 16))
         escaped_text = '\\u' + hex_digits
         self.assertEqual(expected_char, utils.unicode_escape(escaped_text))
Example #2
0
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False):
    """Align one side of ``mapping`` to a small hard-coded dummy inventory.

    Args:
        mapping: The mapping whose inventory is aligned. Its ``kwargs`` must
            contain the key ``f'{io}_lang'``.
        io: Which side of the mapping to use; it selects both the
            ``'<io>_lang'`` kwarg and the inventory passed to the aligner.
        write_to_file: When true, the generated config and mapping are handed
            to ``write_generated_mapping_to_file``.

    Returns:
        A ``(config, mapping)`` tuple: the config built by ``generate_config``
        and the list of ``{'in': ..., 'out': ...}`` rules.
    """
    dummy_inventory = ["ɑ", "i", "u", "t", "s", "n"]
    display_name = mapping.kwargs.get('language_name', 'No Language display name in Config')
    config = generate_config(mapping.kwargs[f'{io}_lang'], 'dummy', display_name, display_name)
    # Character used when no dummy equivalent can be found for a symbol.
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        # IPA inventories can be aligned to the dummy inventory directly.
        mapping = align_inventories(mapping.inventory(io), dummy_inventory)
    else:
        # Non-IPA symbols are first pushed through unidecode and the
        # 'und' -> 'und-ipa' transducer, and that result is what gets aligned.
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower())} for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], dummy_inventory)
        # Lookup from intermediate form to dummy symbol; empty inputs are skipped.
        dummy_dict = {x['in']: x['out'] for x in dummy_list if x['in']}

        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                # Logger.warn is a deprecated alias; use warning().
                LOGGER.warning(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char

    if write_to_file:
        write_generated_mapping_to_file(config, mapping)
    return config, mapping
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False, out_dir: str = ''):
    """Build a ``Mapping`` from one side of ``mapping`` to the dummy inventory.

    Args:
        mapping: The mapping whose inventory is aligned. Its ``kwargs`` must
            contain the key ``f'{io}_lang'``.
        io: Which side of the mapping to use; it selects both the
            ``'<io>_lang'`` kwarg and the inventory passed to the aligner.
        write_to_file: When true, the generated config and mapping are
            written out via ``config_to_file`` / ``mapping_to_file``.
        out_dir: Directory to write into. If it is empty, or is not an
            existing directory, the writers are called without arguments.

    Returns:
        A new ``Mapping`` whose ``in_lang`` is the selected language, whose
        ``out_lang`` is ``'dummy'`` and whose rules are the generated ones.
    """
    config = {'in_lang': mapping.kwargs[f'{io}_lang'], 'out_lang': 'dummy'}
    # Character used when no dummy equivalent can be found for a symbol.
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        # IPA inventories can be aligned to the dummy inventory directly.
        mapping = align_inventories(mapping.inventory(io), DUMMY_INVENTORY)
    else:
        # Non-IPA symbols are first pushed through unidecode and the
        # 'und' -> 'und-ipa' transducer, and that result is what gets aligned.
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower()).output_string} for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], DUMMY_INVENTORY)
        # Lookup from intermediate form to dummy symbol; empty inputs are skipped.
        dummy_dict = {x['in']: x['out'] for x in dummy_list if x['in']}

        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                # Logger.warn is a deprecated alias; use warning().
                LOGGER.warning(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char

    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir and os.path.isdir(out_dir):
            mapping.config_to_file(out_dir)
            mapping.mapping_to_file(out_dir)
        else:
            if out_dir:
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
            # Previously the invalid-directory case only logged and wrote
            # nothing; perform the default write the message promises.
            mapping.config_to_file()
            mapping.mapping_to_file()
    return mapping
Example #4
0
 def __init__(self,
              mapping=None,
              abbreviations: Union[str, DefaultDict[str,
                                                    List[str]]] = False,
              **kwargs):
     """Load the mapping rules and expand abbreviations in them.

     Args:
         mapping: A list of rules, a path ending in ``yaml``/``yml`` (loaded
             as a config), any other path string (loaded as a rules file),
             or ``None`` to look the mapping up from ``kwargs``.
         abbreviations: A ``defaultdict`` of abbreviation -> expansions, a
             value passed to ``load_abbreviations_from_file``, or a falsy
             value for no abbreviations.
         **kwargs: Mapping settings. When ``mapping`` is not a list or a
             string, either ``in_lang`` and ``out_lang`` or ``id`` must be
             present.

     Raises:
         exceptions.MalformedLookup: If ``mapping`` is not a list or string
             and ``kwargs`` has neither ``in_lang``/``out_lang`` nor ``id``.
     """
     # should these just be explicit instead of kwargs...
     # yes, they should
     self.allowable_kwargs = [
         'language_name', 'display_name', 'mapping', 'in_lang', 'out_lang',
         'out_delimiter', 'as_is', 'case_sensitive', 'rule_ordering',
         'escape_special', 'norm_form', 'prevent_feeding', 'reverse'
     ]
     self.kwargs = OrderedDict(kwargs)
     # NOTE(review): presumably flipped by process_loaded_config; confirm.
     self.processed = False
     if isinstance(abbreviations, defaultdict) or not abbreviations:
         self.abbreviations = abbreviations
     else:
         self.abbreviations = load_abbreviations_from_file(abbreviations)
     # Handle user-supplied list
     if isinstance(mapping, list):
         self.mapping = validate(mapping)
     elif isinstance(mapping, str) and mapping.endswith(('yaml', 'yml')):
         loaded_config = load_mapping_from_path(mapping)
         self.process_loaded_config(loaded_config)
     elif isinstance(mapping, str):
         self.mapping = validate(load_from_file(mapping))
     else:
         if "in_lang" in self.kwargs and "out_lang" in self.kwargs:
             loaded_config = find_mapping(self.kwargs['in_lang'],
                                          self.kwargs['out_lang'])
             self.process_loaded_config(loaded_config)
         elif 'id' in self.kwargs:
             loaded_config = self.find_mapping_by_id(self.kwargs['id'])
             self.process_loaded_config(loaded_config)
         else:
             raise exceptions.MalformedLookup()
     if self.abbreviations:
         # Expand the longest abbreviations first so that a short one which
         # is a substring of a longer one cannot rewrite part of it. The
         # sort is stable, so equal-length names keep their original order.
         for abb, stands_for in sorted(self.abbreviations.items(),
                                       key=lambda item: len(item[0]),
                                       reverse=True):
             abb_match = re.compile(abb)
             abb_repl = '|'.join(stands_for)
             # Rules that already carry a 'match_pattern' are left alone.
             if self.mapping and 'match_pattern' not in self.mapping[0]:
                 for io in self.mapping:
                     for key in io:
                         if key in ('in', 'out', 'context_before',
                                    'context_after') and abb_match.search(io[key]):
                             io[key] = abb_match.sub(unicode_escape(abb_repl),
                                                     io[key])
     if not self.processed:
         self.mapping = self.process_kwargs(self.mapping)
Example #5
0
def align_to_dummy_fallback(mapping: Mapping,
                            io: str = 'in',
                            distance: str = "weighted_feature_edit_distance"):
    """Return a Mapping from one side of ``mapping`` to the dummy inventory.

    The language named by ``mapping.kwargs[f'{io}_lang']`` becomes the new
    mapping's ``in_lang`` and ``'dummy'`` its ``out_lang``. ``distance`` is
    forwarded unchanged to ``align_inventories``.
    """
    source_lang = mapping.kwargs[f'{io}_lang']
    config = {'in_lang': source_lang, 'out_lang': 'dummy'}
    default_char = 't'

    if is_ipa(source_lang):
        # IPA symbols are aligned against the dummy inventory directly.
        rules = align_inventories(mapping.inventory(io),
                                  DUMMY_INVENTORY,
                                  distance=distance)
    else:
        # Otherwise go through unidecode and the 'und' -> 'und-ipa'
        # transducer first, then align that intermediate form.
        to_ipa = make_g2p('und', 'und-ipa')
        rules = []
        for symbol in mapping.inventory(io):
            rules.append({
                "in": unicode_escape(symbol),
                "out": to_ipa(unidecode(symbol).lower()).output_string,
            })

        aligned = align_inventories([rule['out'] for rule in rules],
                                    DUMMY_INVENTORY,
                                    distance=distance)
        # Intermediate form -> dummy symbol; entries with empty input skipped.
        nearest_dummy = {
            pair['in']: pair['out']
            for pair in aligned if pair['in']
        }

        for rule in rules:
            guess = rule['out']
            if guess in nearest_dummy:
                rule['out'] = nearest_dummy[guess]
            else:
                LOGGER.warning(
                    f"We couldn't guess at what {rule['in']} means, so it's being "
                    f"replaced with '{default_char}' instead.")
                rule['out'] = default_char

    config['mapping'] = rules
    return Mapping(**config)
Example #6
0
 def __init__(
     self,
     mapping=None,
     abbreviations: Union[str, DefaultDict[str, List[str]]] = False,
     **kwargs,
 ):
     """Load the mapping rules and expand abbreviations in them.

     ``mapping`` may be a list of rules, a path ending in ``yaml``/``yml``
     (loaded as a config), any other path string (loaded as a rules file),
     or ``None``. In the last case the mapping is looked up from ``kwargs``
     by ``in_lang``/``out_lang`` or by ``id``; with ``type="unidecode"`` the
     rule list starts empty; otherwise ``exceptions.MalformedLookup`` is
     raised.
     """
     # should these just be explicit instead of kwargs...
     # yes, they should
     self.allowable_kwargs = [
         "language_name", "display_name", "mapping", "in_lang", "out_lang",
         "out_delimiter", "as_is", "case_sensitive", "rule_ordering",
         "escape_special", "norm_form", "prevent_feeding", "reverse", "type",
     ]
     self.kwargs = OrderedDict(kwargs)
     self.processed = False

     # A truthy value that is not already a defaultdict is loaded from file.
     must_load = bool(abbreviations) and not isinstance(abbreviations, defaultdict)
     self.abbreviations = (load_abbreviations_from_file(abbreviations)
                           if must_load else abbreviations)

     if isinstance(mapping, list):
         # Handle user-supplied list
         self.mapping = validate(mapping, path="user-supplied mapping")
     elif isinstance(mapping, str):
         if mapping.endswith(("yaml", "yml")):
             self.process_loaded_config(load_mapping_from_path(mapping))
         else:
             self.mapping = validate(load_from_file(mapping), path=mapping)
     elif "in_lang" in self.kwargs and "out_lang" in self.kwargs:
         self.process_loaded_config(
             find_mapping(self.kwargs["in_lang"], self.kwargs["out_lang"]))
     elif "id" in self.kwargs:
         self.process_loaded_config(
             self.find_mapping_by_id(self.kwargs["id"]))
     elif self.kwargs.get("type", "") == "unidecode":
         self.mapping = []
     else:
         raise exceptions.MalformedLookup()

     if self.abbreviations:
         expandable_fields = ("in", "out", "context_before", "context_after")
         # Longest abbreviation names are expanded first.
         longest_first = sorted(self.abbreviations.items(),
                                key=lambda pair: len(pair[0]),
                                reverse=True)
         for abbreviation, expansions in longest_first:
             pattern = re.compile(abbreviation)
             alternation = "|".join(expansions)
             # Nothing to expand for an empty rule list or for rules that
             # already carry a "match_pattern".
             if not self.mapping or "match_pattern" in self.mapping[0]:
                 continue
             for rule in self.mapping:
                 for field in rule.keys():
                     if field not in expandable_fields:
                         continue
                     if pattern.search(rule[field]):
                         rule[field] = pattern.sub(
                             unicode_escape(alternation), rule[field])

     if not self.processed:
         self.mapping = self.process_kwargs(self.mapping)