def apply_unidecode(self, to_convert: str):
    if self.norm_form:
        to_convert = normalize(to_convert, self.norm_form)
    tg = TransductionGraph(to_convert)

    # Conversion is done character by character using unidecode
    converted = [text_unidecode.unidecode(c) for c in to_convert]
    tg.output_string = "".join(converted)

    # Edges are calculated to follow the conversion step by step
    if tg.output_string == "":
        # Some inputs get completely deleted by unidecode, in which case there are no
        # valid edges to output.
        tg.edges = []
    else:
        edges = []
        x_len, y_len = 0, 0
        for tgt in converted:
            if tgt:
                for c in tgt:
                    edges.append((x_len, y_len))
                    y_len += 1
            else:
                edges.append((x_len, max(y_len - 1, 0)))
            x_len += 1
        tg.edges = edges
    return tg
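
# A minimal, self-contained sketch of the edge bookkeeping above, assuming the
# third-party text_unidecode package is installed. Each input character maps to
# zero or more output characters; a deleted character falls back to the previous
# output index (clamped at 0) so every input position keeps at least one edge.
import text_unidecode

converted = [text_unidecode.unidecode(c) for c in "æ\u0301b"]  # typically ['ae', '', 'b']
edges, x_len, y_len = [], 0, 0
for tgt in converted:
    if tgt:
        for _ in tgt:
            edges.append((x_len, y_len))
            y_len += 1
    else:
        edges.append((x_len, max(y_len - 1, 0)))
    x_len += 1
print(edges)  # [(0, 0), (0, 1), (1, 1), (2, 2)]: 'æ' expands, the accent is deleted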
def apply_rules(self, to_convert: str):
    if self.mapping.kwargs.get("type", "") == "unidecode":
        return self.apply_unidecode(to_convert)

    # perform any normalization
    if not self.case_sensitive:
        to_convert = to_convert.lower()
    if self.norm_form:
        to_convert = normalize(to_convert, self.norm_form)
    tg = TransductionGraph(to_convert)
    tg.debugger.append([])

    # initialize values
    intermediate_forms = False

    # iterate rules
    for io in self.mapping:
        # Do not allow empty rules
        if not io["in"] and not io["out"]:
            continue
        io = copy.deepcopy(io)
        intermediate_diff = 0
        for match in io["match_pattern"].finditer(tg.output_string):
            debug_string = tg.output_string
            start = match.start() + intermediate_diff
            end = match.end() + intermediate_diff
            if "intermediate_form" in io:
                out_string = io["intermediate_form"]
                intermediate_forms = True
            else:
                out_string = io["out"]
            if self.out_delimiter:
                out_string += self.out_delimiter
            if any(self._char_match_pattern.finditer(io["in"])) and any(
                self._char_match_pattern.finditer(out_string)
            ):
                self.update_explicit_indices(tg, match, io, intermediate_diff, out_string)
            else:
                self.update_default_indices(tg, match, intermediate_diff, out_string)
            if (
                io["in"] != io["out"]
                or ("context_after" in io and io["context_after"])
                or ("context_before" in io and io["context_before"])
            ):
                tg.debugger[-1].append(
                    {
                        "input": debug_string,
                        "output": tg.output_string,
                        "rule": {k: v for k, v in io.items() if k != "match_pattern"},
                        "start": start,
                        "end": end,
                    }
                )
            out_string = re.sub(re.compile(r"{\d+}"), "", out_string)
            intermediate_diff += len(out_string) - len(match.group())
    if intermediate_forms:
        tg.output_string = self.resolve_intermediate_chars(tg.output_string)
    tg.edges = list(
        dict.fromkeys([tuple(x) for x in sorted(tg.edges, key=lambda x: x[0])])
    )
    return tg
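
# A standalone sketch of the intermediate_diff bookkeeping above: when a rule's
# output is longer or shorter than its match, match positions (computed on the
# pre-rule string) must be shifted to index into the rewritten string. The rule
# here ("a" -> "aa") is hypothetical.
import re

rule_in, rule_out = re.compile("a"), "aa"
diff = 0
for match in rule_in.finditer("banana"):
    start = match.start() + diff  # position in the rewritten string
    end = match.end() + diff
    diff += len(rule_out) - len(match.group())
    print(start, end)  # 1 2, then 4 5, then 7 8 as "banana" grows into "baanaanaa"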
def apply_rules(self, to_convert: str):
    # perform any normalization
    if not self.case_sensitive:
        to_convert = to_convert.lower()
    if self.norm_form:
        to_convert = normalize(to_convert, self.norm_form)
    tg = TransductionGraph(to_convert)

    # initialize values
    intermediate_forms = False

    # iterate rules
    for io in self.mapping:
        # Do not allow empty rules
        if not io['in'] and not io['out']:
            continue
        io = copy.deepcopy(io)
        intermediate_diff = 0
        for match in io['match_pattern'].finditer(tg.output_string):
            debug_string = tg.output_string
            start = match.start() + intermediate_diff
            end = match.end() + intermediate_diff
            if 'intermediate_form' in io:
                out_string = io['intermediate_form']
                intermediate_forms = True
            else:
                out_string = io['out']
            if self.out_delimiter:
                # if not end segment, add delimiter
                if not end >= len(tg.output_string):
                    out_string += self.out_delimiter
            if any(self._char_match_pattern.finditer(io['in'])) and any(
                    self._char_match_pattern.finditer(out_string)):
                self.update_explicit_indices(tg, match, io, intermediate_diff, out_string)
            else:
                self.update_default_indices(tg, match, intermediate_diff, out_string)
            if io['in'] != io['out']:
                tg.debugger.append({
                    'input': debug_string,
                    'output': tg.output_string,
                    'rule': {k: v for k, v in io.items() if k != 'match_pattern'},
                    'start': start,
                    'end': end
                })
            out_string = re.sub(re.compile(r'{\d+}'), '', out_string)
            intermediate_diff += len(out_string) - len(match.group())
    if intermediate_forms:
        tg.output_string = self.resolve_intermediate_chars(tg.output_string)
    tg.edges = list(
        dict.fromkeys(
            [tuple(x) for x in sorted(tg.edges, key=lambda x: x[0])]))
    return tg
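
# A small sketch of the end-of-string check above: the delimiter is appended only
# when the (shifted) match end falls short of the output, so the final segment
# carries no trailing delimiter. The segments here are illustrative.
output_string, out_delimiter = "kit", "-"
segments = []
for seg_end, out in [(1, "g"), (2, "i"), (3, "d")]:
    if not seg_end >= len(output_string):
        out += out_delimiter
    segments.append(out)
print("".join(segments))  # g-i-d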
def scan(lang, path):
    """ Returns the set of non-mapped characters in a document.
        Accounts for case sensitivity in the configuration.
    """
    # Check input lang exists
    if lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(f"'{lang}' is not a valid value for 'LANG'")

    # Retrieve the mappings for lang
    case_sensitive = True
    mappings = []
    for mapping in MAPPINGS_AVAILABLE:
        mapping_name = mapping["in_lang"]
        # Exclude mappings for converting between IPAs
        if mapping_name.startswith(lang) and "ipa" not in mapping_name:
            case_sensitive = case_sensitive and mapping.get("case_sensitive", True)
            mappings.append(mapping)

    # Get input chars in mapping
    mapped_chars = set()
    for lang_mapping in mappings:
        for x in lang_mapping["mapping_data"]:
            mapped_chars.add(normalize(x["in"], "NFD"))

    # Find unmapped chars
    filter_chars = " \n"
    mapped_string = "".join(mapped_chars)
    pattern = "[^" + mapped_string + filter_chars + ".]"
    prog = re.compile(pattern)

    with open(path, "r", encoding="utf8") as file:
        data = normalize(file.read(), "NFD")
        if not case_sensitive:
            data = data.lower()
        unmapped = set(prog.findall(data))
        if unmapped:
            LOGGER.warning("The following characters are not mapped:")
            print(unmapped)
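
# A minimal sketch of the scanner's negated character class, with hypothetical
# mapped characters. re.escape is added here because raw characters such as ']'
# or '^' would otherwise corrupt the class; note that the function above
# interpolates the mapped string without escaping.
import re
from unicodedata import normalize

mapped_chars = {normalize("NFD", c) for c in ("a", "tl", "'")}
pattern = "[^" + re.escape("".join(sorted(mapped_chars))) + " \n.]"
unmapped = set(re.compile(pattern).findall(normalize("NFD", "atlas?")))
print(unmapped)  # {'s', '?'} (set order varies): neither is covered by the mapping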
def __call__(self, to_convert: str):
    # perform normalization before tokenizing, since it can change tokenization
    if self._transducer.norm_form:
        to_convert = normalize(to_convert, self._transducer.norm_form)

    # Initialize the transducer on an empty string so we can handle inputs
    # that start with a non-token correctly.
    tg = self._transducer("")
    tg.clear_debugger()  # clear the meaningless initial debugger

    for token in self._tokenizer.tokenize_text(to_convert):
        if token["is_word"]:
            word_tg = self._transducer(token["text"])
            tg += word_tg
        else:
            non_word_tg = TransductionGraph(token["text"])
            tg += non_word_tg
    return tg
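
# A hedged sketch of the contract above: only word tokens go through the
# transducer; punctuation and whitespace are carried over verbatim. The token
# dicts mirror the "is_word"/"text" keys used above; convert_word is a
# hypothetical stand-in for self._transducer.
convert_word = str.upper  # hypothetical stand-in for the real transducer
tokens = [
    {"is_word": False, "text": "¿"},
    {"is_word": True, "text": "kwe"},
    {"is_word": False, "text": "?"},
]
print("".join(convert_word(t["text"]) if t["is_word"] else t["text"] for t in tokens))  # ¿KWE?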
def process_kwargs(self, mapping):
    ''' Apply kwargs in the order they are provided.
        kwargs are ordered as of python 3.6
    '''
    # Add defaults
    if 'as_is' not in self.kwargs:
        self.kwargs['as_is'] = False
    if 'case_sensitive' not in self.kwargs:
        self.kwargs['case_sensitive'] = True
    if 'escape_special' not in self.kwargs:
        self.kwargs['escape_special'] = False
    if 'norm_form' not in self.kwargs:
        self.kwargs['norm_form'] = 'NFD'
    if 'reverse' not in self.kwargs:
        self.kwargs['reverse'] = False

    # Process kwargs in order received
    for kwarg, val in self.kwargs.items():
        if kwarg == 'as_is' and not val:
            # sort by reverse len
            mapping = sorted(mapping, key=lambda x: len(x["in"]), reverse=True)
        elif kwarg == 'escape_special' and val:
            mapping = [escape_special_characters(x) for x in mapping]
        elif kwarg == 'case_sensitive' and not val:
            mapping = self.lower_mappings(mapping)
        elif kwarg == 'norm_form' and val:
            for io in mapping:
                for k, v in io.items():
                    if isinstance(v, str):
                        io[k] = normalize(v, self.kwargs['norm_form'])
        elif kwarg == 'reverse' and val:
            mapping = self.reverse_mappings(mapping)

    # After all processing is done, turn into regex
    for io in mapping:
        io['match_pattern'] = self.rule_to_regex(io)

    self.processed = True
    return mapping
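
# A short sketch of what the as_is=False default above buys: rules are sorted
# longest-input-first, so multi-character rules are tried before the single
# characters that prefix them. The rules are made up for illustration.
mapping = [{"in": "a", "out": "1"}, {"in": "aa", "out": "2"}]
mapping = sorted(mapping, key=lambda x: len(x["in"]), reverse=True)
print([rule["in"] for rule in mapping])  # ['aa', 'a']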
def process_kwargs(self, mapping):
    ''' Apply kwargs in the order they are provided.
        kwargs are ordered as of python 3.6
    '''
    if 'as_is' in self.kwargs:
        as_is = self.kwargs['as_is']
        if as_is:
            appropriate_setting = "as-written"
        else:
            appropriate_setting = "apply-longest-first"
        self.kwargs["rule_ordering"] = appropriate_setting
        LOGGER.warning(
            f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
            'is using the deprecated parameter "as_is"; '
            f"replace `as_is: {as_is}` with `rule_ordering: {appropriate_setting}`"
        )

    # Add defaults
    if 'rule_ordering' in self.kwargs:
        # right now, "rule-ordering" is a more explicit alias of the "as-is" option.
        ordering = self.kwargs["rule_ordering"]
        if ordering not in ("as-written", "apply-longest-first"):
            LOGGER.error(
                f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                f"has invalid value '{ordering}' for rule_ordering parameter; "
                "rule_ordering must be one of "
                '"as-written" or "apply-longest-first"')
    else:
        self.kwargs["rule_ordering"] = "as-written"
    if 'case_sensitive' not in self.kwargs:
        self.kwargs['case_sensitive'] = True
    if 'escape_special' not in self.kwargs:
        self.kwargs['escape_special'] = False
    if 'norm_form' not in self.kwargs:
        self.kwargs['norm_form'] = 'NFD'
    if 'reverse' not in self.kwargs:
        self.kwargs['reverse'] = False
    if 'prevent_feeding' not in self.kwargs:
        self.kwargs['prevent_feeding'] = False
    if 'in_lang' not in self.kwargs:
        self.kwargs['in_lang'] = 'und'
    if 'out_lang' not in self.kwargs:
        self.kwargs['out_lang'] = 'und'

    # Process kwargs in order received
    for kwarg, val in self.kwargs.items():
        if kwarg == 'rule_ordering' and self.wants_rules_sorted():
            # sort by reverse len
            mapping = sorted(mapping, key=lambda x: len(x["in"]), reverse=True)
        elif kwarg == 'escape_special' and val:
            mapping = [escape_special_characters(x) for x in mapping]
        elif kwarg == 'norm_form' and val:
            for io in mapping:
                for k, v in io.items():
                    if isinstance(v, str):
                        io[k] = normalize(v, self.kwargs['norm_form'])
        elif kwarg == 'reverse' and val:
            mapping = self.reverse_mappings(mapping)

    # After all processing is done, turn into regex
    for io in mapping:
        if self.kwargs['prevent_feeding'] or ('prevent_feeding' in io and io['prevent_feeding']):
            io['intermediate_form'] = self._string_to_pua(io['out'], mapping.index(io))
        io['match_pattern'] = self.rule_to_regex(io)
        if not io['match_pattern']:
            # caution: removing from `mapping` while iterating over it skips the
            # rule that follows each removal
            mapping.remove(io)

    self.processed = True
    return mapping
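
# A minimal sketch of the deprecation shim above: the old boolean maps onto the
# new named setting exactly as the warning message tells users to rewrite it.
def translate_as_is(as_is: bool) -> str:
    return "as-written" if as_is else "apply-longest-first"

assert translate_as_is(True) == "as-written"
assert translate_as_is(False) == "apply-longest-first"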
def apply_rules(self,
                to_convert: str,
                index: bool = False,
                debugger: bool = False) -> Union[str, Tuple[str, Indices]]:
    """ Apply all the rules in self.mapping sequentially.

    @param to_convert: str
        This is the string to convert

    @param index: bool
        This is whether to preserve indices, default is False

    @param debugger: bool
        This is whether to show intermediary steps, default is False
    """
    indices = {}
    rules_applied = []

    if not self.case_sensitive:
        to_convert = to_convert.lower()

    if self.norm_form:
        to_convert = normalize(to_convert, self.norm_form)

    # initialize converted
    converted = to_convert

    if index:
        input_index = 0
        output_index = 0
        new_index = {}
        for char in range(len(to_convert)):
            # account for many-to-many rules making the input index
            # outpace the char-by-char conversion
            if char < input_index:
                continue
            if char not in new_index or new_index[char]['input_string'] != to_convert[char]:
                input_index = char
                new_index[char] = {
                    'input_string': to_convert[char],
                    'output': {}
                }
            # intermediate form refreshes on each new char
            intermediate_conversion = to_convert
            rule_applied = False
            # go through rules
            for io in self.mapping:
                io_copy = copy.deepcopy(io)
                # find all matches.
                for match in io_copy['match_pattern'].finditer(intermediate_conversion):
                    match_index = match.start()
                    # if start index of match is equal to input index,
                    # then apply the rule and append the index-formatted tuple
                    # to the main indices list
                    if match_index == input_index:
                        if self.out_delimiter:
                            # Don't add the delimiter to the last segment
                            if not char + (len(io_copy['in']) - 1) >= len(to_convert) - 1:
                                io_copy['out'] += self.out_delimiter
                        # convert the final output
                        output_sub = re.sub(re.compile(r'{\d+}'), '', io_copy['out'])
                        intermediate_output = intermediate_conversion[:char] + re.sub(
                            io_copy["match_pattern"], output_sub,
                            intermediate_conversion[char:])
                        if debugger and intermediate_conversion != intermediate_output:
                            applied_rule = {
                                "input": intermediate_conversion,
                                "rule": io_copy,
                                "output": intermediate_output
                            }
                            rules_applied.append(applied_rule)
                        # update intermediate converted form
                        intermediate_conversion = intermediate_output
                        # get the new index tuple
                        non_null_index = self.return_index(
                            input_index, output_index, io_copy['in'],
                            io_copy['out'], to_convert, new_index)
                        # if it's not empty, then a rule has applied and it can overwrite
                        # the previous intermediate index tuple
                        if non_null_index:
                            rule_applied = True
                            new_index = {**new_index, **non_null_index}
                    # if you've gone past the input_index, you can safely break from the loop
                    elif match_index > input_index:
                        break
            # increase the index counters
            # new_index = self.convert_index_to_tuples(new_index)
            # if the rule applied
            if rule_applied and new_index[char]['output']:
                # add the new index to the list of indices
                indices = {**indices, **new_index}
                # get the length of the new index inputs and outputs
                # and increase the input counter by the length of the input
                input_index = max(new_index.keys())
                input_index += 1
                # do the same with outputs
                outputs = {}
                for v in new_index.values():
                    outputs = {**outputs, **v['output']}
                output_index = max(outputs.keys())
                output_index += 1
            else:
                # if a rule wasn't applied, just add on the input character
                # as the next input and output character
                new_index = {
                    **new_index,
                    **{
                        input_index: {
                            'input_string': to_convert[input_index],
                            'output': {
                                output_index: to_convert[input_index]
                            }
                        }
                    }
                }
                # merge it
                indices = {**indices, **new_index}
                # add one to input and output
                input_index += 1
                output_index += 1
    else:
        # if not worrying about indices, just do the conversion rule-by-rule
        for io in self.mapping:
            io_copy = copy.deepcopy(io)
            if self.out_delimiter:
                io_copy['out'] += self.out_delimiter
            output_sub = re.sub(re.compile(r'{\d+}'), '', io_copy['out'])
            if re.search(io_copy["match_pattern"], converted):
                inp = converted
                outp = re.sub(io_copy["match_pattern"], output_sub, converted)
                if debugger and inp != outp:
                    applied_rule = {
                        "input": inp,
                        "rule": io_copy,
                        "output": outp
                    }
                    rules_applied.append(applied_rule)
                converted = outp
        # Don't add the delimiter to the last segment
        converted = converted.rstrip()

    if index and debugger:
        io_states = Indices(indices)
        return (io_states.output(), io_states, rules_applied)
    if debugger:
        return (converted, rules_applied)
    if index:
        io_states = Indices(indices)
        return (io_states.output(), io_states)
    return converted
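
# A standalone sketch of the non-indexed branch above: rules rewrite the whole
# intermediate string in order, and the trailing delimiter is removed once at
# the end with rstrip(), which assumes a whitespace delimiter. The rules here
# are hypothetical.
import re

mapping = [{"match_pattern": re.compile("k"), "out": "g"},
           {"match_pattern": re.compile("i"), "out": "e"}]
converted, delimiter = "kiki", " "
for io in mapping:
    out_sub = io["out"] + delimiter
    if re.search(io["match_pattern"], converted):
        converted = re.sub(io["match_pattern"], out_sub, converted)
print(repr(converted.rstrip()))  # 'g e g e': only the trailing delimiter is stripped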
def process_kwargs(self, mapping):
    """ Apply kwargs in the order they are provided.
        kwargs are ordered as of python 3.6
    """
    if "as_is" in self.kwargs:
        as_is = self.kwargs["as_is"]
        if as_is:
            appropriate_setting = "as-written"
        else:
            appropriate_setting = "apply-longest-first"
        self.kwargs["rule_ordering"] = appropriate_setting
        del self.kwargs["as_is"]
        LOGGER.warning(
            f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
            'is using the deprecated parameter "as_is"; '
            f"replace `as_is: {as_is}` with `rule_ordering: {appropriate_setting}`"
        )

    # Add defaults
    if "rule_ordering" in self.kwargs:
        # right now, "rule-ordering" is a more explicit alias of the "as-is" option.
        ordering = self.kwargs["rule_ordering"]
        if ordering not in ("as-written", "apply-longest-first"):
            LOGGER.error(
                f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                f"has invalid value '{ordering}' for rule_ordering parameter; "
                "rule_ordering must be one of "
                '"as-written" or "apply-longest-first"')
    else:
        self.kwargs["rule_ordering"] = "as-written"
    if "case_sensitive" not in self.kwargs:
        self.kwargs["case_sensitive"] = True
    if "escape_special" not in self.kwargs:
        self.kwargs["escape_special"] = False
    if "norm_form" not in self.kwargs:
        self.kwargs["norm_form"] = "NFD"
    if "reverse" not in self.kwargs:
        self.kwargs["reverse"] = False
    if "prevent_feeding" not in self.kwargs:
        self.kwargs["prevent_feeding"] = False
    if "in_lang" not in self.kwargs:
        self.kwargs["in_lang"] = "und"
    if "out_lang" not in self.kwargs:
        self.kwargs["out_lang"] = "und"

    # Process kwargs in order received
    for kwarg, val in self.kwargs.items():
        if kwarg == "rule_ordering" and self.wants_rules_sorted():
            # sort by reverse len
            mapping = sorted(mapping, key=lambda x: len(x["in"]), reverse=True)
        elif kwarg == "escape_special" and val:
            mapping = [escape_special_characters(x) for x in mapping]
        elif kwarg == "norm_form" and val:
            for io in mapping:
                for k, v in io.items():
                    if isinstance(v, str):
                        io[k] = normalize(v, self.kwargs["norm_form"])
        elif kwarg == "reverse" and val:
            mapping = self.reverse_mappings(mapping)

    # After all processing is done, turn into regex
    for i, io in enumerate(mapping):
        if self.kwargs["prevent_feeding"] or ("prevent_feeding" in io and io["prevent_feeding"]):
            io["intermediate_form"] = self._string_to_pua(io["out"], i)
        io["match_pattern"] = self.rule_to_regex(io)

    # Finally, remove rules with an empty match pattern, typically empty rules
    mapping = [io for io in mapping if io["match_pattern"]]

    self.processed = True
    return mapping
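
# A hedged sketch of the prevent_feeding idea above: a rule's output is written
# as Private Use Area placeholders that no rule pattern can match, then resolved
# back after all rules have run. The flat offset used here is illustrative only;
# the real _string_to_pua also takes the rule index.
PUA_BASE = 0xF0000  # plane 15 Private Use Area

def string_to_pua(out: str) -> str:
    # shift every output character into the PUA so later rules cannot match it
    return "".join(chr(PUA_BASE + ord(c)) for c in out)

def resolve_pua(s: str) -> str:
    return "".join(chr(ord(c) - PUA_BASE) if ord(c) >= PUA_BASE else c for c in s)

step = string_to_pua("b")  # some rule rewrote "a" -> "b", stored as a placeholder
assert "b" not in step     # a later rule matching "b" cannot feed on this output
assert resolve_pua(step) == "b"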