Exemple #1
0
 def rule_to_regex(self, rule: dict) -> Pattern:
     """Compile one mapping rule into a regular expression pattern.

     The pattern matches ``rule['in']`` with any ``{<digits>}`` tokens
     removed. A non-empty 'context_before' is passed through
     ``create_fixed_width_lookbehind`` and prepended to the match; a
     non-empty 'context_after' is appended as a ``(?=...)`` lookahead.

     Args:
         rule: mapping rule with an 'in' key and optional
             'context_before' / 'context_after' keys.

     Returns:
         The compiled pattern. It is compiled with ``re.I`` unless
         ``self.kwargs['case_sensitive']`` is truthy.

     Raises:
         Exception: if the pattern cannot be built or compiled, e.g.
             because the rule contains un-escaped regex characters. The
             underlying error is chained as ``__cause__``.
     """
     before = rule.get("context_before") or ''
     after = rule.get("context_after") or ''
     input_match = re.sub(r'{\d+}', "", rule['in'])
     # Bound before the try block so the error message can always report
     # it, even when building the lookbehind itself is what fails.
     inp = input_match
     try:
         inp = create_fixed_width_lookbehind(before) + input_match
         if after:
             inp += f"(?={after})"
         flags = 0 if self.kwargs['case_sensitive'] else re.I
         rule_regex = re.compile(inp, flags)
     except Exception as e:
         in_lang = self.kwargs.get('in_lang', 'und')
         out_lang = self.kwargs.get('out_lang', 'und')
         # NOTE: the whitespace after the backslash continuation is part of
         # the emitted message; it is kept as-is to leave the text unchanged.
         message = (
             f'Your regex in mapping between {in_lang} and {out_lang} is malformed. \
                 Do you have un-escaped regex characters in your input {inp}, contexts {before}, {after}?'
         )
         LOGGER.error(message)
         raise Exception(message) from e
     return rule_regex
Exemple #2
0
 def test_response_code(self):
     '''
     Ensure all routes return 200

     Each route in ``self.routes_no_args`` is requested through
     ``self.client()``. A failed request is logged and the route is
     skipped; a response with any status other than 200 fails the test.
     '''
     for rt in self.routes_no_args:
         # Only the request itself is guarded: the assertion must stay
         # outside the try block, otherwise its AssertionError would be
         # swallowed and the test could never fail.
         try:
             r = self.client().get(rt)
         except Exception:
             LOGGER.error("Couldn't connect. Is flask running?")
             continue
         self.assertEqual(r.status_code, 200)
         LOGGER.debug("Route " + rt + " returned " + str(r.status_code))
Exemple #3
0
    def rule_to_regex(self, rule: dict) -> Pattern:
        """Turns an input string (and the context) from an input/output pair
        into a regular expression pattern.

        The 'in' key is the match (any ``{<digits>}`` tokens are removed first).
        The 'context_after' key creates a lookahead.
        The 'context_before' key creates a lookbehind.

        Args:
            rule: A dictionary containing 'in', 'out', 'context_before', and 'context_after' keys

        Raises:
            Exception: This is raised when the pattern cannot be built or compiled,
                e.g. when un-supported regex characters or symbols exist in the rule.
                The underlying error is chained as ``__cause__``.

        Returns:
            Pattern: returns a regex pattern (re.Pattern), compiled with ``re.I``
                unless ``self.kwargs['case_sensitive']`` is truthy
            bool: returns False if input is null
        """
        # Prevent null input. See, https://github.com/roedoejet/g2p/issues/24
        if not rule['in']:
            LOGGER.warning(
                f'Rule with input \'{rule["in"]}\' and output \'{rule["out"]}\' has no input. This is disallowed. Please check your mapping file for rules with null inputs.'
            )
            return False
        before = rule.get("context_before") or ''
        after = rule.get("context_after") or ''
        input_match = re.sub(r'{\d+}', "", rule['in'])
        # Bound before the try block so the error message can always report
        # it, even when building the lookbehind itself is what fails.
        inp = input_match
        try:
            inp = create_fixed_width_lookbehind(before) + input_match
            if after:
                inp += f"(?={after})"
            flags = 0 if self.kwargs['case_sensitive'] else re.I
            rule_regex = re.compile(inp, flags)
        except Exception as e:
            in_lang = self.kwargs.get('in_lang', 'und')
            out_lang = self.kwargs.get('out_lang', 'und')
            # NOTE: the whitespace after the backslash continuation is part of
            # the emitted message; it is kept as-is to leave the text unchanged.
            message = (
                f'Your regex in mapping between {in_lang} and {out_lang} is malformed. \
                    Do you have un-escaped regex characters in your input {inp}, contexts {before}, {after}?'
            )
            LOGGER.error(message)
            raise Exception(message) from e
        return rule_regex
Exemple #4
0
 def test_response_code_with_args(self):
     '''
     Ensure all args return 200

     For every route template in ``self.routes_only_args`` the part matched
     by ``self.arg_match`` is replaced with each node of LANGS_NETWORK and
     the resulting route is requested. A failed request is logged and
     skipped; a response with any status other than 200 fails the test.
     '''
     for ep in self.routes_only_args:
         for node in LANGS_NETWORK.nodes:
             rt = re.sub(self.arg_match, node, ep)
             # Only the request itself is guarded: the assertion must stay
             # outside the try block, otherwise its AssertionError would be
             # swallowed and the test could never fail.
             try:
                 r = self.client().get(rt)
             except Exception:
                 LOGGER.error("Couldn't connect. Is flask running?")
                 continue
             self.assertEqual(r.status_code, 200)
         LOGGER.debug("Successfully tested " + str(len(LANGS_NETWORK.nodes)
                                                   ) + " node resources at route " + ep + " .")
Exemple #5
0
def get_distance_method(dst, distance: str):
    """Look up the distance function named ``distance`` on ``dst``.

    Args:
        dst: object exposing the distance functions as attributes
            (presumably a PanPhon distance object -- confirm against callers).
        distance: name of the metric; must be listed in DISTANCE_METRICS.

    Returns:
        The attribute of ``dst`` named ``distance``. For
        "dolgo_prime_distance", the legacy attribute "dogol_prime_distance"
        is returned when the current name is not available.

    Raises:
        ValueError: if ``distance`` is not in DISTANCE_METRICS, or if ``dst``
            provides neither the requested attribute nor its legacy spelling.
    """
    if distance not in DISTANCE_METRICS:
        raise ValueError(f"Distance metric {distance} not supported")
    try:
        return getattr(dst, distance)
    except AttributeError as e:
        # Older versions of panphon misspelled Dolgopolsky's name as Dogolpolsky...
        # Try again with the older name, so we stay compatible with both <=0.19
        # and >=0.19.1
        if distance == "dolgo_prime_distance":
            # Use a default so a missing legacy name falls through to the
            # ValueError below instead of leaking a raw AttributeError.
            legacy_method = getattr(dst, "dogol_prime_distance", None)
            if legacy_method is not None:
                return legacy_method

        LOGGER.error(f"The distance metric {distance} is not supported by PanPhon")
        raise ValueError(f"Distance metric {distance} not supported") from e
Exemple #6
0
def run_tests(suite):
    ''' Run the test suite selected by ``suite``.

    The names 'all', 'trans', 'langs', 'mappings' and 'dev' are replaced by
    the corresponding suite: 'all' discovers tests next to this file, the
    others wrap the matching *_TESTS list. Any other string is rejected with
    an error log; a non-string value is handed to the runner unchanged.
    '''
    if suite == 'all':
        suite = LOADER.discover(os.path.dirname(__file__))
    elif suite == 'trans':
        suite = TestSuite(TRANSDUCER_TESTS)
    elif suite == 'langs':
        suite = TestSuite(LANGS_TESTS)
    elif suite == 'mappings':
        suite = TestSuite(MAPPINGS_TESTS)
    elif suite == 'dev':
        suite = TestSuite(DEV_TESTS)

    runner = TextTestRunner(verbosity=3)
    # A value that is still a string was not one of the known suite names.
    if isinstance(suite, str):
        LOGGER.error("Please specify a test suite to run: i.e. 'dev' or 'all'")
        return
    runner.run(suite)
Exemple #7
0
def make_g2p(in_lang: str, out_lang: str):
    """Build a transducer from in_lang to out_lang along the shortest path
    between them in LANGS_NETWORK.

    Args:
        in_lang: input language code; must be a node of LANGS_NETWORK
        out_lang: output language code; must be a node of LANGS_NETWORK

    Returns:
        A Transducer when exactly one mapping is needed, otherwise a
        CompositeTransducer chaining one Transducer per step of the path.

    Raises:
        FileNotFoundError: if in_lang or out_lang is not a node of LANGS_NETWORK
        NetworkXNoPath: if there is no path between in_lang and out_lang
    """
    # Check that both languages are nodes in the network (in_lang first)
    for lang in (in_lang, out_lang):
        if lang not in LANGS_NETWORK.nodes:
            LOGGER.error(f"No lang called {lang}. Please try again.")
            raise FileNotFoundError(f"No lang called {lang}.")

    # Try to find the shortest path between the nodes
    try:
        path = shortest_path(LANGS_NETWORK, in_lang, out_lang)
    except NetworkXNoPath:
        LOGGER.error(
            f"Sorry, we couldn't find a way to convert {in_lang} to {out_lang}. Please update your langs by running `g2p update` and try again."
        )
        # Re-raise the original exception so its message and traceback survive.
        raise

    # Find all mappings needed: one per pair of consecutive languages on the
    # path. Pairing with zip avoids indexing past the end of the path, so an
    # IndexError raised inside Mapping() is no longer silently swallowed.
    mappings_needed = []
    for lang1, lang2 in zip(path, path[1:]):
        mapping = Mapping(in_lang=lang1, out_lang=lang2)
        LOGGER.debug(
            f"Adding mapping between {lang1} and {lang2} to composite transducer."
        )
        mappings_needed.append(mapping)

    # Either return Transducer or Composite Transducer
    if len(mappings_needed) == 1:
        return Transducer(mappings_needed[0])
    return CompositeTransducer([Transducer(x) for x in mappings_needed])
Exemple #8
0
def make_g2p(in_lang: str, out_lang: str, tok_lang=None):
    """Make a g2p Transducer for mapping text from in_lang to out_lang via the
    shortest path between them.

    Results are memoized in _g2p_cache, keyed by the three arguments.

    Args:
        in_lang (str): input language code
        out_lang (str): output language code
        tok_lang: when truthy, the result is wrapped in a TokenizingTransducer.
            The value "path" builds the tokenizer from in_lang and the
            conversion path; any other value is used as the tokenizer's
            language.

    Returns:
        Transducer from in_lang to out_lang

    Raises:
        InvalidLanguageCode: if in_lang or out_lang don't exist
        NoPath: if in_lang equals out_lang, or if there is no path between
            in_lang and out_lang
    """
    cache_key = (in_lang, out_lang, tok_lang)
    if cache_key in _g2p_cache:
        return _g2p_cache[cache_key]

    # Both codes must be nodes in the network; in_lang is checked first
    for code in (in_lang, out_lang):
        if code not in LANGS_NETWORK.nodes:
            LOGGER.error(f"No lang called '{code}'. Please try again.")
            raise InvalidLanguageCode(code)

    if in_lang == out_lang:
        LOGGER.error(
            "Sorry, you can't transduce between the same language. Please select a different output language code."
        )
        raise NoPath(in_lang, out_lang)

    # Try to find the shortest path between the nodes
    try:
        path = shortest_path(LANGS_NETWORK, in_lang, out_lang)
    except NetworkXNoPath as e:
        LOGGER.error(
            f"Sorry, we couldn't find a way to convert {in_lang} to {out_lang}. Please update your langs by running `g2p update` and try again."
        )
        raise NoPath(in_lang, out_lang) from e

    # One mapping per pair of consecutive languages on the path
    steps = []
    for source, target in zip(path, path[1:]):
        step = Mapping(in_lang=source, out_lang=target)
        LOGGER.debug(
            f"Adding mapping between {source} and {target} to composite transducer."
        )
        steps.append(step)

    # A single step needs no composition
    if len(steps) == 1:
        transducer = Transducer(steps[0])
    else:
        transducer = CompositeTransducer([Transducer(step) for step in steps])

    # If tokenization was requested, wrap the result in a TokenizingTransducer
    if tok_lang:
        tokenizer = (
            make_tokenizer(in_lang=in_lang, tok_path=path)
            if tok_lang == "path"
            else make_tokenizer(in_lang=tok_lang)
        )
        transducer = TokenizingTransducer(transducer, tokenizer)

    _g2p_cache[cache_key] = transducer
    return transducer
Exemple #9
0
]

# Concatenation of the individual test groups; run_tests() wraps this list in
# a TestSuite when the 'dev' suite is requested.
DEV_TESTS = TRANSDUCER_TESTS + MAPPINGS_TESTS + LANGS_TESTS + INTEGRATION_TESTS


def run_tests(suite):
    ''' Resolve ``suite`` to a test suite and run it.

    'all' discovers every test in this file's directory; 'trans', 'langs',
    'mappings' and 'dev' wrap the matching *_TESTS list in a TestSuite.
    An unrecognized string is reported through LOGGER.error and nothing is
    run; a non-string value is passed to the runner as-is.
    '''
    if suite == 'all':
        suite = LOADER.discover(os.path.dirname(__file__))
    elif suite == 'trans':
        suite = TestSuite(TRANSDUCER_TESTS)
    elif suite == 'langs':
        suite = TestSuite(LANGS_TESTS)
    elif suite == 'mappings':
        suite = TestSuite(MAPPINGS_TESTS)
    elif suite == 'dev':
        suite = TestSuite(DEV_TESTS)

    runner = TextTestRunner(verbosity=3)
    # Anything still a string here did not match a known suite name.
    if isinstance(suite, str):
        LOGGER.error("Please specify a test suite to run: i.e. 'dev' or 'all'")
        return
    runner.run(suite)


if __name__ == "__main__":
    # Guard only the argv lookup, so an IndexError raised while the tests
    # themselves are running is not misreported as a missing argument.
    try:
        requested_suite = sys.argv[1]
    except IndexError:
        LOGGER.error("Please specify a test suite to run: i.e. 'dev' or 'all'")
    else:
        run_tests(requested_suite)
Exemple #10
0
    def process_kwargs(self, mapping):
        ''' Apply kwargs in the order they are provided. kwargs are ordered as of python 3.6

        Steps:
          1. Translate the deprecated "as_is" option into "rule_ordering"
             (with a warning).
          2. Fill in defaults for every option that was not supplied.
          3. Walk self.kwargs in insertion order and apply sorting, escaping,
             normalization and reversal to the rules as each option is met.
          4. Give every rule a 'match_pattern' (and, when feeding prevention
             is on globally or for that rule, an 'intermediate_form').
          5. Drop rules whose 'match_pattern' is falsy.

        Args:
            mapping: list of rule dictionaries; each needs at least the 'in'
                and 'out' keys read here.

        Returns:
            The processed list of rules. self.processed is set to True.
        '''

        if 'as_is' in self.kwargs:
            as_is = self.kwargs['as_is']
            if as_is:
                appropriate_setting = "as-written"
            else:
                appropriate_setting = "apply-longest-first"

            self.kwargs["rule_ordering"] = appropriate_setting

            LOGGER.warning(
                f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                'is using the deprecated parameter "as_is"; '
                f"replace `as_is: {as_is}` with `rule_ordering: {appropriate_setting}`"
            )

        # Add defaults
        if 'rule_ordering' in self.kwargs:
            # right now, "rule-ordering" is a more explicit alias of the "as-is" option.
            ordering = self.kwargs["rule_ordering"]
            if ordering not in ("as-written", "apply-longest-first"):
                LOGGER.error(
                    f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                    f"has invalid value '{ordering}' for rule_ordering parameter; "
                    "rule_ordering must be one of "
                    '"as-written" or "apply-longest-first"')
        else:
            self.kwargs["rule_ordering"] = "as-written"
        if 'case_sensitive' not in self.kwargs:
            self.kwargs['case_sensitive'] = True
        if 'escape_special' not in self.kwargs:
            self.kwargs['escape_special'] = False
        if 'norm_form' not in self.kwargs:
            self.kwargs['norm_form'] = 'NFD'
        if 'reverse' not in self.kwargs:
            self.kwargs['reverse'] = False
        if 'prevent_feeding' not in self.kwargs:
            self.kwargs['prevent_feeding'] = False
        if 'in_lang' not in self.kwargs:
            self.kwargs['in_lang'] = 'und'
        if 'out_lang' not in self.kwargs:
            self.kwargs['out_lang'] = 'und'

        # Process kwargs in order received
        for kwarg, val in self.kwargs.items():
            if kwarg == 'rule_ordering' and self.wants_rules_sorted():
                # sort by reverse len
                mapping = sorted(mapping,
                                 key=lambda x: len(x["in"]),
                                 reverse=True)
            elif kwarg == 'escape_special' and val:
                mapping = [escape_special_characters(x) for x in mapping]
            elif kwarg == 'norm_form' and val:
                for io in mapping:
                    for k, v in io.items():
                        if isinstance(v, str):
                            io[k] = normalize(v, self.kwargs['norm_form'])
            elif kwarg == 'reverse' and val:
                mapping = self.reverse_mappings(mapping)

        # After all processing is done, turn into regex.
        # enumerate() gives each rule its own position directly; list.index()
        # would rescan the list and return the first *equal* rule's position.
        for i, io in enumerate(mapping):
            if self.kwargs['prevent_feeding'] or ('prevent_feeding' in io
                                                  and io['prevent_feeding']):
                io['intermediate_form'] = self._string_to_pua(io['out'], i)
            io['match_pattern'] = self.rule_to_regex(io)

        # Finally, drop rules without a usable match pattern. This is done in
        # a separate pass because removing items from a list while iterating
        # over it makes the loop skip the element after each removal.
        mapping = [io for io in mapping if io['match_pattern']]

        self.processed = True
        return mapping
Exemple #11
0
    def process_kwargs(self, mapping):
        """ Apply kwargs in the order they are provided. kwargs are ordered as of python 3.6

        The deprecated "as_is" option is first rewritten as "rule_ordering",
        then missing options get their defaults. The options are then applied
        to the rules in the order they appear in self.kwargs. Finally every
        rule gets a 'match_pattern', and rules whose pattern is falsy are
        dropped.

        Args:
            mapping: list of rule dictionaries

        Returns:
            The processed list of rules. self.processed is set to True.
        """

        if "as_is" in self.kwargs:
            legacy_flag = self.kwargs["as_is"]
            ordering_name = "as-written" if legacy_flag else "apply-longest-first"

            self.kwargs["rule_ordering"] = ordering_name
            del self.kwargs["as_is"]

            LOGGER.warning(
                f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                'is using the deprecated parameter "as_is"; '
                f"replace `as_is: {legacy_flag}` with `rule_ordering: {ordering_name}`"
            )

        # Fill in defaults; insertion order matters for the loop further down
        if "rule_ordering" not in self.kwargs:
            self.kwargs["rule_ordering"] = "as-written"
        else:
            # right now, "rule-ordering" is a more explicit alias of the "as-is" option.
            requested = self.kwargs["rule_ordering"]
            if requested not in ("as-written", "apply-longest-first"):
                LOGGER.error(
                    f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                    f"has invalid value '{requested}' for rule_ordering parameter; "
                    "rule_ordering must be one of "
                    '"as-written" or "apply-longest-first"')
        fallbacks = (
            ("case_sensitive", True),
            ("escape_special", False),
            ("norm_form", "NFD"),
            ("reverse", False),
            ("prevent_feeding", False),
            ("in_lang", "und"),
            ("out_lang", "und"),
        )
        for option, fallback in fallbacks:
            self.kwargs.setdefault(option, fallback)

        # Apply each option in the order it was received
        for option, value in self.kwargs.items():
            if option == "rule_ordering" and self.wants_rules_sorted():
                # longest inputs first
                mapping = sorted(mapping,
                                 key=lambda rule: len(rule["in"]),
                                 reverse=True)
            elif option == "escape_special" and value:
                mapping = [escape_special_characters(rule) for rule in mapping]
            elif option == "norm_form" and value:
                for rule in mapping:
                    for field, text in rule.items():
                        if isinstance(text, str):
                            rule[field] = normalize(text, value)
            elif option == "reverse" and value:
                mapping = self.reverse_mappings(mapping)

        # With all other processing done, attach the regex to every rule
        for position, rule in enumerate(mapping):
            if self.kwargs["prevent_feeding"] or rule.get("prevent_feeding"):
                rule["intermediate_form"] = self._string_to_pua(
                    rule["out"], position)
            rule["match_pattern"] = self.rule_to_regex(rule)

        # Keep only the rules that ended up with a usable match pattern
        mapping = [rule for rule in mapping if rule["match_pattern"]]

        self.processed = True
        return mapping