Beispiel #1
0
    def apply_unidecode(self, to_convert: str):
        """Transliterate ``to_convert`` one character at a time with unidecode.

        The input is first normalized with ``self.norm_form`` when that is set.
        Each input character is converted independently, the pieces are joined
        into the graph's output string, and one edge ``(input_index,
        output_index)`` is recorded per output character produced.

        An input character whose conversion is empty gets a single edge to the
        most recently produced output position (or to 0 if nothing has been
        produced yet). If the whole output is empty, no edges are recorded.

        Returns the populated TransductionGraph.
        """
        if self.norm_form:
            to_convert = normalize(to_convert, self.norm_form)
        tg = TransductionGraph(to_convert)

        # unidecode is applied per character so that alignment can be tracked
        pieces = [text_unidecode.unidecode(char) for char in to_convert]
        tg.output_string = "".join(pieces)

        # Everything was deleted by unidecode: there is nothing to align to
        if tg.output_string == "":
            tg.edges = []
            return tg

        alignment = []
        out_pos = 0
        for in_pos, piece in enumerate(pieces):
            if not piece:
                # Deleted character: attach it to the previous output position
                alignment.append((in_pos, max(out_pos - 1, 0)))
                continue
            # One edge per output character generated by this input character
            alignment.extend(
                (in_pos, out_pos + shift) for shift in range(len(piece)))
            out_pos += len(piece)
        tg.edges = alignment

        return tg
Beispiel #2
0
    def apply_rules(self, to_convert: str):
        """Apply every rule in ``self.mapping``, in order, to ``to_convert``.

        Mappings whose ``type`` kwarg is ``"unidecode"`` are handed off to
        ``apply_unidecode``. Otherwise the input is lower-cased when the
        transducer is not case sensitive and normalized when ``norm_form`` is
        set. Each rule's ``match_pattern`` is then run over the graph's
        current output string, and every match is rewritten through
        ``update_explicit_indices`` or ``update_default_indices``.

        A rule application is logged in the last entry of ``tg.debugger``
        when the rule changes its input or carries a non-empty context.

        Returns the TransductionGraph, with intermediate forms resolved and
        edges sorted by input index and de-duplicated.
        """
        if self.mapping.kwargs.get("type", "") == "unidecode":
            return self.apply_unidecode(to_convert)

        # Pre-process the input string
        if not self.case_sensitive:
            to_convert = to_convert.lower()
        if self.norm_form:
            to_convert = normalize(to_convert, self.norm_form)

        tg = TransductionGraph(to_convert)
        tg.debugger.append([])
        used_intermediate = False
        index_marker = re.compile(r"{\d+}")

        for rule in self.mapping:
            # Rules with neither input nor output are skipped
            if not (rule["in"] or rule["out"]):
                continue
            rule = copy.deepcopy(rule)
            # Running length difference caused by earlier matches of this
            # rule; match positions refer to the string as it was when the
            # scan started.
            offset = 0
            for match in rule["match_pattern"].finditer(tg.output_string):
                before = tg.output_string
                span_start = match.start() + offset
                span_end = match.end() + offset

                if "intermediate_form" in rule:
                    replacement = rule["intermediate_form"]
                    used_intermediate = True
                else:
                    replacement = rule["out"]
                if self.out_delimiter:
                    replacement = replacement + self.out_delimiter

                # Explicit indexing applies only when both sides of the rule
                # contain something matched by _char_match_pattern
                explicit = any(
                    self._char_match_pattern.finditer(rule["in"])) and any(
                        self._char_match_pattern.finditer(replacement))
                if explicit:
                    self.update_explicit_indices(tg, match, rule, offset,
                                                 replacement)
                else:
                    self.update_default_indices(tg, match, offset,
                                                replacement)

                worth_logging = (rule["in"] != rule["out"]
                                 or rule.get("context_after")
                                 or rule.get("context_before"))
                if worth_logging:
                    rule_view = {
                        key: value
                        for key, value in rule.items()
                        if key != "match_pattern"
                    }
                    tg.debugger[-1].append({
                        "input": before,
                        "output": tg.output_string,
                        "rule": rule_view,
                        "start": span_start,
                        "end": span_end,
                    })

                # {n} markers do not count towards the replacement's length
                replacement = index_marker.sub("", replacement)
                offset += len(replacement) - len(match.group())

        if used_intermediate:
            tg.output_string = self.resolve_intermediate_chars(
                tg.output_string)

        # Sort by input index, then drop duplicate edges keeping first seen
        ordered_edges = sorted(tg.edges, key=lambda edge: edge[0])
        tg.edges = list(dict.fromkeys(tuple(edge) for edge in ordered_edges))
        return tg
Beispiel #3
0
 def apply_rules(self, to_convert: str):
     ''' Apply every rule in self.mapping, in order, to to_convert.

     The input is lower-cased when the transducer is not case sensitive and
     normalized when norm_form is set. Each rule's match_pattern is run over
     the graph's current output string and every match is rewritten through
     update_explicit_indices or update_default_indices. The out delimiter is
     appended to a replacement unless the match reaches the end of the
     current output string.

     Rule applications whose 'in' differs from 'out' are appended to
     tg.debugger. Returns the TransductionGraph with intermediate forms
     resolved and edges sorted by input index and de-duplicated.
     '''
     # Pre-process the input string
     if not self.case_sensitive:
         to_convert = to_convert.lower()
     if self.norm_form:
         to_convert = normalize(to_convert, self.norm_form)

     tg = TransductionGraph(to_convert)
     saw_intermediate = False
     marker = re.compile(r'{\d+}')

     for rule in self.mapping:
         # Rules with neither input nor output are skipped
         if not (rule['in'] or rule['out']):
             continue
         rule = copy.deepcopy(rule)
         # Length difference accumulated by earlier matches of this rule
         shift = 0
         for match in rule['match_pattern'].finditer(tg.output_string):
             snapshot = tg.output_string
             start = match.start() + shift
             end = match.end() + shift

             if 'intermediate_form' in rule:
                 replacement = rule['intermediate_form']
                 saw_intermediate = True
             else:
                 replacement = rule['out']

             # The delimiter is only added when this is not the end segment
             if self.out_delimiter and end < len(tg.output_string):
                 replacement = replacement + self.out_delimiter

             # Explicit indexing applies only when both sides of the rule
             # contain something matched by _char_match_pattern
             explicit = any(
                 self._char_match_pattern.finditer(rule['in'])) and any(
                     self._char_match_pattern.finditer(replacement))
             if explicit:
                 self.update_explicit_indices(tg, match, rule, shift,
                                              replacement)
             else:
                 self.update_default_indices(tg, match, shift, replacement)

             if rule['in'] != rule['out']:
                 rule_view = {
                     key: value
                     for key, value in rule.items()
                     if key != 'match_pattern'
                 }
                 tg.debugger.append({
                     'input': snapshot,
                     'output': tg.output_string,
                     'rule': rule_view,
                     'start': start,
                     'end': end
                 })

             # {n} markers do not count towards the replacement's length
             replacement = marker.sub('', replacement)
             shift += len(replacement) - len(match.group())

     if saw_intermediate:
         tg.output_string = self.resolve_intermediate_chars(
             tg.output_string)

     # Sort by input index, then drop duplicate edges keeping first seen
     ordered_edges = sorted(tg.edges, key=lambda edge: edge[0])
     tg.edges = list(dict.fromkeys(tuple(edge) for edge in ordered_edges))
     return tg
Beispiel #4
0
def scan(lang, path):
    """ Report the set of non-mapped characters in a document.

    Collects the ``in`` side of every rule from the available mappings whose
    ``in_lang`` starts with ``lang`` (mappings with "ipa" in their name are
    excluded), then searches the NFD-normalized file at ``path`` for
    characters that are not among them. Spaces, newlines and periods are
    never reported. If any of the selected mappings is case insensitive, the
    document is lower-cased before scanning.

    The result is not returned: when unmapped characters exist, a warning is
    logged and the set is printed.

    Raises click.UsageError if ``lang`` is not a node of LANGS_NETWORK.
    """
    # Check input lang exists
    if lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(f"'{lang}' is not a valid value for 'LANG'")

    # Retrieve the mappings for lang
    case_sensitive = True
    mappings = []
    for mapping in MAPPINGS_AVAILABLE:
        mapping_name = mapping["in_lang"]
        # Exclude mappings for converting between IPAs
        if mapping_name.startswith(lang) and "ipa" not in mapping_name:
            case_sensitive = case_sensitive and mapping.get(
                "case_sensitive", True)
            mappings.append(mapping)

    # Get input chars in mapping
    mapped_chars = {
        normalize(rule["in"], "NFD")
        for lang_mapping in mappings
        for rule in lang_mapping["mapping_data"]
    }

    # Find unmapped chars. The mapped characters are arbitrary text, so they
    # must be escaped before being placed in a character class: an unescaped
    # "]", "\\", "^" or "-" would otherwise break the pattern or silently
    # turn neighbouring characters into a range.
    filter_chars = " \n"
    mapped_string = re.escape("".join(mapped_chars))
    pattern = "[^" + mapped_string + filter_chars + ".]"
    prog = re.compile(pattern)

    with open(path, "r", encoding="utf8") as file:
        data = normalize(file.read(), "NFD")

    if not case_sensitive:
        data = data.lower()
    unmapped = set(prog.findall(data))
    if unmapped:
        LOGGER.warning("The following characters are not mapped:")
        print(unmapped)
Beispiel #5
0
    def __call__(self, to_convert: str):
        """Transduce ``to_convert`` token by token and return the combined graph.

        The text is normalized with the wrapped transducer's ``norm_form``
        (when set) before tokenizing, because normalization can change where
        token boundaries fall. Tokens flagged ``is_word`` are run through the
        transducer; all other tokens are wrapped in a bare TransductionGraph
        (presumably an identity graph -- confirm against TransductionGraph).
        The per-token graphs are accumulated with ``+=``.
        """
        norm_form = self._transducer.norm_form
        if norm_form:
            to_convert = normalize(to_convert, norm_form)

        # Start from the transduction of the empty string so that input
        # beginning with a non-word token is handled like any other, and
        # drop the debugger content produced by that dummy call.
        tg = self._transducer("")
        tg.clear_debugger()

        for token in self._tokenizer.tokenize_text(to_convert):
            piece = (self._transducer(token["text"])
                     if token["is_word"] else TransductionGraph(token["text"]))
            tg += piece
        return tg
Beispiel #6
0
 def process_kwargs(self, mapping):
     ''' Pre-process the rules in mapping according to self.kwargs.

     Missing options are given defaults first (as_is=False,
     case_sensitive=True, escape_special=False, norm_form='NFD',
     reverse=False). The options are then applied in the order in which they
     appear in self.kwargs (dicts keep insertion order as of python 3.6), and
     finally every rule receives a 'match_pattern' built by rule_to_regex.

     Sets self.processed to True and returns the processed list of rules.
     '''
     # Fill in whichever options the caller did not supply
     fallbacks = (
         ('as_is', False),
         ('case_sensitive', True),
         ('escape_special', False),
         ('norm_form', 'NFD'),
         ('reverse', False),
     )
     for option, fallback in fallbacks:
         self.kwargs.setdefault(option, fallback)

     # Apply the options in the order received
     for kwarg, val in self.kwargs.items():
         if kwarg == 'as_is':
             if not val:
                 # longest inputs first
                 mapping = sorted(mapping,
                                  key=lambda rule: len(rule["in"]),
                                  reverse=True)
         elif kwarg == 'case_sensitive':
             if not val:
                 mapping = self.lower_mappings(mapping)
         elif not val:
             # the remaining options only act when switched on
             continue
         elif kwarg == 'escape_special':
             mapping = [escape_special_characters(rule) for rule in mapping]
         elif kwarg == 'norm_form':
             for rule in mapping:
                 rule.update({
                     key: normalize(text, val)
                     for key, text in rule.items()
                     if isinstance(text, str)
                 })
         elif kwarg == 'reverse':
             mapping = self.reverse_mappings(mapping)

     # With all processing done, compile each rule into its regex
     for rule in mapping:
         rule['match_pattern'] = self.rule_to_regex(rule)
     self.processed = True
     return mapping
Beispiel #7
0
    def process_kwargs(self, mapping):
        ''' Pre-process the rules in mapping according to self.kwargs.

        The deprecated "as_is" option is translated to "rule_ordering" (with
        a warning), an invalid "rule_ordering" value is logged as an error,
        and missing options receive their defaults. The options are then
        applied in the order in which they appear in self.kwargs (dicts keep
        insertion order as of python 3.6).

        Finally every rule receives a 'match_pattern' built by rule_to_regex,
        plus an 'intermediate_form' when feeding is prevented globally or for
        that rule. Rules whose match pattern is empty are dropped.

        Sets self.processed to True and returns the processed list of rules.
        '''

        if 'as_is' in self.kwargs:
            as_is = self.kwargs['as_is']
            if as_is:
                appropriate_setting = "as-written"
            else:
                appropriate_setting = "apply-longest-first"

            self.kwargs["rule_ordering"] = appropriate_setting

            LOGGER.warning(
                f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                'is using the deprecated parameter "as_is"; '
                f"replace `as_is: {as_is}` with `rule_ordering: {appropriate_setting}`"
            )

        # Add defaults
        if 'rule_ordering' in self.kwargs:
            # right now, "rule-ordering" is a more explicit alias of the "as-is" option.
            ordering = self.kwargs["rule_ordering"]
            if ordering not in ("as-written", "apply-longest-first"):
                LOGGER.error(
                    f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                    f"has invalid value '{ordering}' for rule_ordering parameter; "
                    "rule_ordering must be one of "
                    '"as-written" or "apply-longest-first"')
        else:
            self.kwargs["rule_ordering"] = "as-written"
        if 'case_sensitive' not in self.kwargs:
            self.kwargs['case_sensitive'] = True
        if 'escape_special' not in self.kwargs:
            self.kwargs['escape_special'] = False
        if 'norm_form' not in self.kwargs:
            self.kwargs['norm_form'] = 'NFD'
        if 'reverse' not in self.kwargs:
            self.kwargs['reverse'] = False
        if 'prevent_feeding' not in self.kwargs:
            self.kwargs['prevent_feeding'] = False
        if 'in_lang' not in self.kwargs:
            self.kwargs['in_lang'] = 'und'
        if 'out_lang' not in self.kwargs:
            self.kwargs['out_lang'] = 'und'

        # Process kwargs in order received
        for kwarg, val in self.kwargs.items():
            if kwarg == 'rule_ordering' and self.wants_rules_sorted():
                # sort by reverse len
                mapping = sorted(mapping,
                                 key=lambda x: len(x["in"]),
                                 reverse=True)
            elif kwarg == 'escape_special' and val:
                mapping = [escape_special_characters(x) for x in mapping]
            elif kwarg == 'norm_form' and val:
                for io in mapping:
                    for k, v in io.items():
                        if isinstance(v, str):
                            io[k] = normalize(v, self.kwargs['norm_form'])
            elif kwarg == 'reverse' and val:
                mapping = self.reverse_mappings(mapping)

        # After all processing is done, turn into regex. The position comes
        # from enumerate rather than mapping.index(io), which is an O(n)
        # lookup per rule and shifts once earlier rules have been removed.
        for i, io in enumerate(mapping):
            if self.kwargs['prevent_feeding'] or io.get('prevent_feeding'):
                io['intermediate_form'] = self._string_to_pua(io['out'], i)
            io['match_pattern'] = self.rule_to_regex(io)

        # Drop rules with an empty match pattern only after the loop:
        # removing items from the list while iterating over it skips the
        # element following each removal, leaving it without a match_pattern.
        mapping = [io for io in mapping if io['match_pattern']]

        self.processed = True
        return mapping
Beispiel #8
0
    def apply_rules(self,
                    to_convert: str,
                    index: bool = False,
                    debugger: bool = False) -> Union[str, Tuple[str, Indices]]:
        """ Apply all the rules in self.mapping sequentially.

        The input is lower-cased when the transducer is not case sensitive
        and normalized when self.norm_form is set. Two strategies follow:

        - index=False: every rule is applied to the whole string with re.sub,
          one rule after the other.
        - index=True: the input is walked character by character; at each
          position the rules whose match starts there are applied and an
          index dictionary of the form
          {input_index: {'input_string': str, 'output': {output_index: str}}}
          is accumulated and wrapped in an Indices object.

        @param to_convert: str
            This is the string to convert

        @param index: bool
            This is whether to preserve indices, default is False

        @param debugger: bool
            This is whether to show intermediary steps, default is False

        @return:
            - index and debugger: (output string, Indices, rules applied)
            - debugger only:      (converted string, rules applied)
            - index only:         (output string, Indices)
            - neither:            the converted string
            NOTE(review): the return annotation does not cover the two
            debugger variants.

        """
        indices = {}
        rules_applied = []

        if not self.case_sensitive:
            to_convert = to_convert.lower()

        if self.norm_form:
            to_convert = normalize(to_convert, self.norm_form)

        # initialized converted
        # (only updated in the non-index branch; in index mode the output
        # string is taken from the Indices object instead)
        converted = to_convert

        if index:
            # next input/output positions that still need an index entry
            input_index = 0
            output_index = 0
            new_index = {}
            for char in range(len(to_convert)):
                # account for many-to-many rules making the input index
                # outpace the char-by-char conversion
                if char < input_index:
                    continue
                # start a fresh entry for this position unless one already
                # exists for the same input character
                if not char in new_index or new_index[char][
                        'input_string'] != to_convert[char]:
                    input_index = char
                    new_index[char] = {
                        'input_string': to_convert[char],
                        'output': {}
                    }
                # intermediate form refreshes on each new char
                intermediate_conversion = to_convert
                rule_applied = False
                # go through rules
                for io in self.mapping:
                    # work on a copy: 'out' may get the delimiter appended below
                    io_copy = copy.deepcopy(io)
                    # find all matches.
                    for match in io_copy['match_pattern'].finditer(
                            intermediate_conversion):
                        match_index = match.start()
                        # if start index of match is equal to input index,
                        # then apply the rule and append the index-formatted tuple
                        # to the main indices list
                        if match_index == input_index:
                            if self.out_delimiter:
                                # Don't add the delimiter to the last segment
                                if not char + (len(io_copy['in']) -
                                               1) >= len(to_convert) - 1:
                                    io_copy['out'] += self.out_delimiter
                            # convert the final output
                            # ({n} markers are stripped from the replacement text)
                            output_sub = re.sub(re.compile(r'{\d+}'), '',
                                                io_copy['out'])
                            # the text before the current position is kept
                            # as is; the substitution runs on the remainder
                            intermediate_output = intermediate_conversion[:char] + re.sub(
                                io_copy["match_pattern"], output_sub,
                                intermediate_conversion[char:])
                            # only rules that changed the string are recorded
                            if debugger and intermediate_conversion != intermediate_output:
                                applied_rule = {
                                    "input": intermediate_conversion,
                                    "rule": io_copy,
                                    "output": intermediate_output
                                }
                                rules_applied.append(applied_rule)
                            # update intermediate converted form
                            intermediate_conversion = intermediate_output
                            # get the new index tuple
                            non_null_index = self.return_index(
                                input_index, output_index, io_copy['in'],
                                io_copy['out'], to_convert, new_index)
                            # if it's not empty, then a rule has applied and it can overwrite
                            # the previous intermediate index tuple
                            if non_null_index:
                                rule_applied = True
                                new_index = {**new_index, **non_null_index}
                        # if you've gone past the input_index, you can safely break from the loop
                        elif match_index > input_index:
                            break
                # increase the index counters
                # new_index = self.convert_index_to_tuples(new_index)
                # if the rule applied
                if rule_applied and new_index[char]['output']:
                    # add the new index to the list of indices
                    indices = {**indices, **new_index}
                    # get the length of the new index inputs and outputs
                    # and increase the input counter by the length of the input
                    input_index = max(new_index.keys())
                    input_index += 1
                    # do the same with outputs
                    outputs = {}
                    for v in new_index.values():
                        outputs = {**outputs, **v['output']}
                    output_index = max(outputs.keys())
                    output_index += 1
                else:
                    # if a rule wasn't applied, just add on the input character
                    # as the next input and output character
                    new_index = {
                        **new_index,
                        **{
                            input_index: {
                                'input_string': to_convert[input_index],
                                'output': {
                                    output_index: to_convert[input_index]
                                }
                            }
                        }
                    }
                    # merge it
                    indices = {**indices, **new_index}
                    # add one to input and output
                    input_index += 1
                    output_index += 1
        else:
            # if not worrying about indices, just do the conversion rule-by-rule
            for io in self.mapping:
                # work on a copy: 'out' may get the delimiter appended below
                io_copy = copy.deepcopy(io)
                if self.out_delimiter:
                    io_copy['out'] += self.out_delimiter
                # {n} markers are stripped from the replacement text
                output_sub = re.sub(re.compile(r'{\d+}'), '', io_copy['out'])
                if re.search(io_copy["match_pattern"], converted):
                    inp = converted
                    outp = re.sub(io_copy["match_pattern"], output_sub,
                                  converted)
                    # only rules that changed the string are recorded
                    if debugger and inp != outp:
                        applied_rule = {
                            "input": inp,
                            "rule": io_copy,
                            "output": outp
                        }
                        rules_applied.append(applied_rule)
                    converted = outp
            # Don't add the delimiter to the last segment
            # NOTE(review): rstrip() with no argument strips trailing
            # whitespace, so this only removes a trailing delimiter when the
            # delimiter is whitespace, and it also strips trailing whitespace
            # that came from the input -- confirm this is intended.
            converted = converted.rstrip()
        # pick the return shape according to the index/debugger flags
        if index and debugger:
            io_states = Indices(indices)
            return (io_states.output(), io_states, rules_applied)
        if debugger:
            return (converted, rules_applied)
        if index:
            io_states = Indices(indices)
            return (io_states.output(), io_states)
        return converted
Beispiel #9
0
    def process_kwargs(self, mapping):
        """ Pre-process the rules in mapping according to self.kwargs.

        The deprecated "as_is" option is replaced by the equivalent
        "rule_ordering" value (with a warning), an invalid "rule_ordering" is
        logged as an error, and missing options receive their defaults. The
        options are then applied in the order in which they appear in
        self.kwargs (dicts keep insertion order as of python 3.6).

        Every rule then receives a "match_pattern" built by rule_to_regex,
        plus an "intermediate_form" when feeding is prevented globally or for
        that rule. Rules whose match pattern is empty are dropped.

        Sets self.processed to True and returns the processed list of rules.
        """
        settings = self.kwargs

        # Translate the deprecated option into its replacement
        if "as_is" in settings:
            as_is = settings.pop("as_is")
            appropriate_setting = ("as-written"
                                   if as_is else "apply-longest-first")
            settings["rule_ordering"] = appropriate_setting

            LOGGER.warning(
                f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                'is using the deprecated parameter "as_is"; '
                f"replace `as_is: {as_is}` with `rule_ordering: {appropriate_setting}`"
            )

        # Default or validate rule_ordering, which for now is simply a more
        # explicit spelling of the old "as_is" switch
        if "rule_ordering" not in settings:
            settings["rule_ordering"] = "as-written"
        else:
            ordering = settings["rule_ordering"]
            if ordering not in ("as-written", "apply-longest-first"):
                LOGGER.error(
                    f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                    f"has invalid value '{ordering}' for rule_ordering parameter; "
                    "rule_ordering must be one of "
                    '"as-written" or "apply-longest-first"')

        # Fill in whichever remaining options the caller did not supply
        fallbacks = (
            ("case_sensitive", True),
            ("escape_special", False),
            ("norm_form", "NFD"),
            ("reverse", False),
            ("prevent_feeding", False),
            ("in_lang", "und"),
            ("out_lang", "und"),
        )
        for option, fallback in fallbacks:
            settings.setdefault(option, fallback)

        # Apply the options in the order received
        for kwarg, val in settings.items():
            if kwarg == "rule_ordering":
                if self.wants_rules_sorted():
                    # longest inputs first
                    mapping = sorted(mapping,
                                     key=lambda rule: len(rule["in"]),
                                     reverse=True)
            elif not val:
                # the remaining options only act when switched on
                continue
            elif kwarg == "escape_special":
                mapping = [escape_special_characters(rule) for rule in mapping]
            elif kwarg == "norm_form":
                for rule in mapping:
                    rule.update({
                        key: normalize(text, val)
                        for key, text in rule.items()
                        if isinstance(text, str)
                    })
            elif kwarg == "reverse":
                mapping = self.reverse_mappings(mapping)

        # With all processing done, compile each rule into its regex
        for position, rule in enumerate(mapping):
            if settings["prevent_feeding"] or rule.get("prevent_feeding"):
                rule["intermediate_form"] = self._string_to_pua(
                    rule["out"], position)
            rule["match_pattern"] = self.rule_to_regex(rule)

        # Rules without a usable match pattern (typically empty rules) go
        surviving = [rule for rule in mapping if rule["match_pattern"]]

        self.processed = True
        return surviving