def rule_to_regex(self, rule: dict) -> Pattern:
    """Turn one input/output rule (and its context) into a compiled regex.

    The rule's 'in' value is the text to match, with any ``{digits}`` index
    markers stripped out. A truthy 'context_before' value is turned into a
    lookbehind by ``create_fixed_width_lookbehind`` and a truthy
    'context_after' value into a ``(?=...)`` lookahead.

    Args:
        rule: A dictionary with an 'in' key and, optionally,
            'context_before' and 'context_after' keys.

    Raises:
        Exception: If the pattern cannot be assembled or compiled, typically
            because the rule contains un-escaped regex characters. The
            underlying error is attached as ``__cause__``.

    Returns:
        Pattern: The compiled pattern. It is compiled with ``re.I`` unless
        ``self.kwargs['case_sensitive']`` is truthy.
    """
    before = rule.get("context_before") or ''
    after = rule.get("context_after") or ''
    # Strip explicit index markers such as "{1}" from the input.
    input_match = re.sub(r'{\d+}', "", rule['in'])
    # Bind `inp` before the try block so that the error message below can
    # always be formatted, even when building the lookbehind itself fails.
    inp = input_match
    try:
        inp = create_fixed_width_lookbehind(before) + input_match
        if after:
            inp += f"(?={after})"
        if not self.kwargs['case_sensitive']:
            rule_regex = re.compile(inp, re.I)
        else:
            rule_regex = re.compile(inp)
    except Exception as e:
        # `.get` keeps a missing language code from masking the real error.
        in_lang = self.kwargs.get('in_lang', 'und')
        out_lang = self.kwargs.get('out_lang', 'und')
        message = (
            f'Your regex in mapping between {in_lang} and {out_lang} is malformed. '
            f'Do you have un-escaped regex characters in your input {inp}, contexts {before}, {after}?'
        )
        LOGGER.error(message)
        raise Exception(message) from e
    return rule_regex
def test_response_code(self):
    """Ensure all routes return 200.

    Every route in ``self.routes_no_args`` is requested through
    ``self.client()``. A request that raises is logged and skipped, but a
    response with the wrong status code fails the test.
    """
    for rt in self.routes_no_args:
        # Only the request itself is guarded: the assertion must stay outside
        # the try block, otherwise a failed assertion would be swallowed and
        # misreported as a connection problem.
        try:
            r = self.client().get(rt)
        except Exception:
            LOGGER.error("Couldn't connect. Is flask running?")
            continue
        self.assertEqual(r.status_code, 200)
        LOGGER.debug("Route " + rt + " returned " + str(r.status_code))
def rule_to_regex(self, rule: dict) -> Pattern:
    """Turns an input string (and the context) from an input/output pair
    into a regular expression pattern.

    The 'in' key is the match.
    The 'context_after' key creates a lookahead.
    The 'context_before' key creates a lookbehind.

    Args:
        rule: A dictionary containing 'in', 'out', 'context_before', and
            'context_after' keys

    Raises:
        Exception: This is raised when un-supported regex characters or
            symbols exist in the rule. The underlying error is attached as
            ``__cause__``.

    Returns:
        Pattern: a compiled regex pattern (re.Pattern), or ``False`` when
        the rule's input is empty.
    """
    # Prevent null input. See, https://github.com/roedoejet/g2p/issues/24
    if not rule['in']:
        LOGGER.warning(
            f'Rule with input \'{rule["in"]}\' and output \'{rule["out"]}\' has no input. This is disallowed. Please check your mapping file for rules with null inputs.'
        )
        return False
    before = rule.get("context_before") or ''
    after = rule.get("context_after") or ''
    # Strip explicit index markers such as "{1}" from the input.
    input_match = re.sub(r'{\d+}', "", rule['in'])
    # Bind `inp` before the try block so that the error message below can
    # always be formatted, even when building the lookbehind itself fails.
    inp = input_match
    try:
        inp = create_fixed_width_lookbehind(before) + input_match
        if after:
            inp += f"(?={after})"
        if not self.kwargs['case_sensitive']:
            rule_regex = re.compile(inp, re.I)
        else:
            rule_regex = re.compile(inp)
    except Exception as e:
        in_lang = self.kwargs.get('in_lang', 'und')
        out_lang = self.kwargs.get('out_lang', 'und')
        message = (
            f'Your regex in mapping between {in_lang} and {out_lang} is malformed. '
            f'Do you have un-escaped regex characters in your input {inp}, contexts {before}, {after}?'
        )
        LOGGER.error(message)
        raise Exception(message) from e
    return rule_regex
def test_response_code_with_args(self):
    """Ensure all args return 200.

    For every route template in ``self.routes_only_args``, the part matched
    by ``self.arg_match`` is replaced by each node of ``LANGS_NETWORK`` in
    turn and the resulting route is requested. A request that raises is
    logged and skipped, but a response with the wrong status code fails the
    test.
    """
    for ep in self.routes_only_args:
        for node in LANGS_NETWORK.nodes:
            rt = re.sub(self.arg_match, node, ep)
            # Only the request itself is guarded: the assertion must stay
            # outside the try block, otherwise a failed assertion would be
            # swallowed and misreported as a connection problem.
            try:
                r = self.client().get(rt)
            except Exception:
                LOGGER.error("Couldn't connect. Is flask running?")
                continue
            self.assertEqual(r.status_code, 200)
        LOGGER.debug("Successfully tested " + str(len(LANGS_NETWORK.nodes))
                     + " node resources at route " + ep + " .")
def get_distance_method(dst, distance: str):
    """Look up the distance method called ``distance`` on ``dst``.

    Args:
        dst: The object whose attribute named ``distance`` is returned.
        distance: Name of the metric; it must be listed in DISTANCE_METRICS.

    Returns:
        The attribute of ``dst`` named ``distance``, or its
        ``dogol_prime_distance`` attribute when ``dolgo_prime_distance`` was
        requested but is missing.

    Raises:
        ValueError: If ``distance`` is not in DISTANCE_METRICS, or ``dst``
            has no such attribute.
    """
    if distance not in DISTANCE_METRICS:
        raise ValueError(f"Distance metric {distance} not supported")

    try:
        return getattr(dst, distance)
    except AttributeError as e:
        # Older versions of panphon misspelled Dolgopolsky's name as
        # Dogolpolsky. Fall back to the old spelling so that both <=0.19 and
        # >=0.19.1 keep working.
        if distance == "dolgo_prime_distance":
            return getattr(dst, "dogol_prime_distance")
        LOGGER.error(f"The distance metric {distance} is not supported by PanPhon")
        raise ValueError(f"Distance metric {distance} not supported") from e
def run_tests(suite):
    """Run the test suite selected by name.

    Recognised names are 'all' (discover everything next to this file),
    'trans', 'langs', 'mappings' and 'dev'. Any other string is reported as
    an error and nothing is run.
    """
    if suite == 'all':
        suite = LOADER.discover(os.path.dirname(__file__))
    elif suite == 'trans':
        suite = TestSuite(TRANSDUCER_TESTS)
    elif suite == 'langs':
        suite = TestSuite(LANGS_TESTS)
    elif suite == 'mappings':
        suite = TestSuite(MAPPINGS_TESTS)
    elif suite == 'dev':
        suite = TestSuite(DEV_TESTS)

    runner = TextTestRunner(verbosity=3)
    # A value that is still a string was not one of the known suite names.
    if isinstance(suite, str):
        LOGGER.error("Please specify a test suite to run: i.e. 'dev' or 'all'")
        return
    runner.run(suite)
def make_g2p(in_lang: str, out_lang: str):
    """Build a transducer from in_lang to out_lang along the shortest path.

    Args:
        in_lang: Input language code; must be a node of LANGS_NETWORK.
        out_lang: Output language code; must be a node of LANGS_NETWORK.

    Returns:
        A Transducer when exactly one mapping is needed, otherwise a
        CompositeTransducer wrapping one Transducer per step of the path.

    Raises:
        FileNotFoundError: If in_lang or out_lang is not a node of
            LANGS_NETWORK.
        NetworkXNoPath: If no path connects in_lang to out_lang.
    """
    # Check in_lang is a node in network
    if in_lang not in LANGS_NETWORK.nodes:
        LOGGER.error(f"No lang called {in_lang}. Please try again.")
        raise FileNotFoundError(f"No lang called {in_lang}.")
    # Check out_lang is a node in network
    if out_lang not in LANGS_NETWORK.nodes:
        LOGGER.error(f"No lang called {out_lang}. Please try again.")
        raise FileNotFoundError(f"No lang called {out_lang}.")
    # Try to find the shortest path between the nodes
    try:
        path = shortest_path(LANGS_NETWORK, in_lang, out_lang)
    except NetworkXNoPath as e:
        LOGGER.error(
            f"Sorry, we couldn't find a way to convert {in_lang} to {out_lang}. Please update your langs by running `g2p update` and try again."
        )
        raise NetworkXNoPath(
            f"No path from {in_lang} to {out_lang}.") from e
    # Find all mappings needed: one per consecutive pair of nodes on the path.
    # Pairing with zip avoids indexing past the end of the path, so an
    # IndexError raised inside Mapping is no longer silently swallowed.
    mappings_needed = []
    for lang1, lang2 in zip(path[:-1], path[1:]):
        mapping = Mapping(in_lang=lang1, out_lang=lang2)
        LOGGER.debug(
            f"Adding mapping between {lang1} and {lang2} to composite transducer."
        )
        mappings_needed.append(mapping)
    # Either return Transducer or Composite Transducer
    if len(mappings_needed) == 1:
        return Transducer(mappings_needed[0])
    return CompositeTransducer([Transducer(x) for x in mappings_needed])
def make_g2p(in_lang: str, out_lang: str, tok_lang=None):
    """Build, or fetch from the cache, a transducer from in_lang to out_lang.

    The transducer follows the shortest path between the two language codes
    in LANGS_NETWORK. Results are cached per (in_lang, out_lang, tok_lang).

    Args:
        in_lang (str): input language code
        out_lang (str): output language code
        tok_lang: falsy for no tokenization; "path" to build the tokenizer
            from in_lang and the conversion path; any other value is passed
            to make_tokenizer as its in_lang.

    Returns:
        Transducer from in_lang to out_lang, wrapped in a
        TokenizingTransducer when tok_lang is truthy

    Raises:
        InvalidLanguageCode: if in_lang or out_lang don't exist
        NoPath: if in_lang equals out_lang, or there is no path between them
    """
    cache_key = (in_lang, out_lang, tok_lang)
    if cache_key in _g2p_cache:
        return _g2p_cache[cache_key]

    # Both codes must be nodes of the network; the input code is checked first.
    for code in (in_lang, out_lang):
        if code not in LANGS_NETWORK.nodes:
            LOGGER.error(f"No lang called '{code}'. Please try again.")
            raise InvalidLanguageCode(code)

    if in_lang == out_lang:
        LOGGER.error(
            f"Sorry, you can't transduce between the same language. Please select a different output language code."
        )
        raise NoPath(in_lang, out_lang)

    # Try to find the shortest path between the nodes
    try:
        path = shortest_path(LANGS_NETWORK, in_lang, out_lang)
    except NetworkXNoPath as e:
        LOGGER.error(
            f"Sorry, we couldn't find a way to convert {in_lang} to {out_lang}. Please update your langs by running `g2p update` and try again."
        )
        raise NoPath(in_lang, out_lang) from e

    # One mapping per hop: pair every node with its successor on the path.
    hops = []
    for source, target in zip(path, path[1:]):
        hop = Mapping(in_lang=source, out_lang=target)
        LOGGER.debug(
            f"Adding mapping between {source} and {target} to composite transducer."
        )
        hops.append(hop)

    # A single hop needs no composite wrapper.
    if len(hops) != 1:
        transducer = CompositeTransducer([Transducer(hop) for hop in hops])
    else:
        transducer = Transducer(hops[0])

    # If tokenization was requested, return a TokenizingTransducer
    if tok_lang:
        tokenizer = (
            make_tokenizer(in_lang=in_lang, tok_path=path)
            if tok_lang == "path"
            else make_tokenizer(in_lang=tok_lang)
        )
        transducer = TokenizingTransducer(transducer, tokenizer)

    _g2p_cache[cache_key] = transducer
    return transducer
]

# The 'dev' suite is the concatenation of the four individual test groups.
DEV_TESTS = TRANSDUCER_TESTS + MAPPINGS_TESTS + LANGS_TESTS + INTEGRATION_TESTS


def run_tests(suite):
    ''' Decide which Test Suite to run.

    `suite` is a name: 'all' discovers tests in this file's directory, while
    'trans', 'langs', 'mappings' and 'dev' select the corresponding list of
    tests. If `suite` is still a string after these checks (i.e. the name was
    not recognised), an error is logged and nothing is run.
    '''
    if suite == 'all':
        suite = LOADER.discover(os.path.dirname(__file__))
    if suite == 'trans':
        suite = TestSuite(TRANSDUCER_TESTS)
    if suite == 'langs':
        suite = TestSuite(LANGS_TESTS)
    if suite == 'mappings':
        suite = TestSuite(MAPPINGS_TESTS)
    elif suite == 'dev':
        suite = TestSuite(DEV_TESTS)
    runner = TextTestRunner(verbosity=3)
    # An unrecognised name leaves `suite` as the original string.
    if isinstance(suite, str):
        LOGGER.error("Please specify a test suite to run: i.e. 'dev' or 'all'")
    else:
        runner.run(suite)


if __name__ == "__main__":
    # The suite name is the first command-line argument; a missing argument
    # surfaces as an IndexError on sys.argv[1].
    try:
        run_tests(sys.argv[1])
    except IndexError:
        LOGGER.error("Please specify a test suite to run: i.e. 'dev' or 'all'")
def process_kwargs(self, mapping):
    ''' Apply kwargs in the order they are provided. kwargs are ordered as of python 3.6

    Missing settings in ``self.kwargs`` are filled with defaults, the
    settings are then applied to ``mapping`` in the order they appear in
    ``self.kwargs``, and finally every rule gets a ``match_pattern`` built by
    ``self.rule_to_regex``.

    Args:
        mapping: A list of rule dictionaries (each has at least 'in' and
            'out' keys). The dictionaries are updated in place.

    Returns:
        The processed list of rules, without the rules whose match pattern
        is falsy (typically rules with empty input).
    '''
    if 'as_is' in self.kwargs:
        as_is = self.kwargs['as_is']
        if as_is:
            appropriate_setting = "as-written"
        else:
            appropriate_setting = "apply-longest-first"

        self.kwargs["rule_ordering"] = appropriate_setting

        LOGGER.warning(
            f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
            'is using the deprecated parameter "as_is"; '
            f"replace `as_is: {as_is}` with `rule_ordering: {appropriate_setting}`"
        )

    # Add defaults
    if 'rule_ordering' in self.kwargs:
        # right now, "rule-ordering" is a more explicit alias of the "as-is" option.
        ordering = self.kwargs["rule_ordering"]
        if ordering not in ("as-written", "apply-longest-first"):
            LOGGER.error(
                f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                f"has invalid value '{ordering}' for rule_ordering parameter; "
                "rule_ordering must be one of "
                '"as-written" or "apply-longest-first"')
    else:
        self.kwargs["rule_ordering"] = "as-written"
    if 'case_sensitive' not in self.kwargs:
        self.kwargs['case_sensitive'] = True
    if 'escape_special' not in self.kwargs:
        self.kwargs['escape_special'] = False
    if 'norm_form' not in self.kwargs:
        self.kwargs['norm_form'] = 'NFD'
    if 'reverse' not in self.kwargs:
        self.kwargs['reverse'] = False
    if 'prevent_feeding' not in self.kwargs:
        self.kwargs['prevent_feeding'] = False
    if 'in_lang' not in self.kwargs:
        self.kwargs['in_lang'] = 'und'
    if 'out_lang' not in self.kwargs:
        self.kwargs['out_lang'] = 'und'

    # Process kwargs in order received
    for kwarg, val in self.kwargs.items():
        if kwarg == 'rule_ordering' and self.wants_rules_sorted():
            # sort by reverse len
            mapping = sorted(mapping, key=lambda x: len(x["in"]),
                             reverse=True)
        elif kwarg == 'escape_special' and val:
            mapping = [escape_special_characters(x) for x in mapping]
        elif kwarg == 'norm_form' and val:
            for io in mapping:
                for k, v in io.items():
                    if isinstance(v, str):
                        io[k] = normalize(v, self.kwargs['norm_form'])
        elif kwarg == 'reverse' and val:
            mapping = self.reverse_mappings(mapping)

    # After all processing is done, turn into regex. enumerate supplies each
    # rule's position directly instead of searching the list for it.
    for i, io in enumerate(mapping):
        if self.kwargs['prevent_feeding'] or io.get('prevent_feeding'):
            io['intermediate_form'] = self._string_to_pua(io['out'], i)
        io['match_pattern'] = self.rule_to_regex(io)

    # Drop rules without a match pattern only after the loop: removing items
    # from a list while iterating over it skips the element that follows each
    # removal, which would leave that rule without a match pattern.
    mapping = [io for io in mapping if io['match_pattern']]

    self.processed = True
    return mapping
def process_kwargs(self, mapping):
    """
    Fill in default settings, apply them to the rules, and compile the rules.

    Settings are applied in the order in which they appear in ``self.kwargs``
    (dictionaries preserve insertion order as of python 3.6). Afterwards each
    rule gets a ``match_pattern`` from ``self.rule_to_regex``; rules whose
    pattern is falsy are dropped from the returned list.
    """
    if "as_is" in self.kwargs:
        # Translate the deprecated boolean into its rule_ordering equivalent.
        as_is = self.kwargs["as_is"]
        appropriate_setting = "as-written" if as_is else "apply-longest-first"
        self.kwargs["rule_ordering"] = appropriate_setting
        del self.kwargs["as_is"]

        LOGGER.warning(
            f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
            'is using the deprecated parameter "as_is"; '
            f"replace `as_is: {as_is}` with `rule_ordering: {appropriate_setting}`"
        )

    # Add defaults
    if "rule_ordering" not in self.kwargs:
        self.kwargs["rule_ordering"] = "as-written"
    else:
        # right now, "rule-ordering" is a more explicit alias of the "as-is" option.
        ordering = self.kwargs["rule_ordering"]
        if ordering not in ("as-written", "apply-longest-first"):
            LOGGER.error(
                f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                f"has invalid value '{ordering}' for rule_ordering parameter; "
                "rule_ordering must be one of "
                '"as-written" or "apply-longest-first"')

    # Insertion order matters here, because it decides the order in which
    # newly-defaulted settings are processed below.
    fallbacks = (
        ("case_sensitive", True),
        ("escape_special", False),
        ("norm_form", "NFD"),
        ("reverse", False),
        ("prevent_feeding", False),
        ("in_lang", "und"),
        ("out_lang", "und"),
    )
    for setting, fallback in fallbacks:
        self.kwargs.setdefault(setting, fallback)

    # Process kwargs in order received
    for kwarg, val in self.kwargs.items():
        if kwarg == "rule_ordering":
            if self.wants_rules_sorted():
                # longest input first
                mapping = sorted(mapping, key=lambda rule: len(rule["in"]),
                                 reverse=True)
        elif not val:
            continue
        elif kwarg == "escape_special":
            mapping = [escape_special_characters(rule) for rule in mapping]
        elif kwarg == "norm_form":
            form = self.kwargs["norm_form"]
            for rule in mapping:
                for field, text in rule.items():
                    if isinstance(text, str):
                        rule[field] = normalize(text, form)
        elif kwarg == "reverse":
            mapping = self.reverse_mappings(mapping)

    # After all processing is done, turn into regex
    feeding_blocked = self.kwargs["prevent_feeding"]
    for position, rule in enumerate(mapping):
        if feeding_blocked or rule.get("prevent_feeding"):
            rule["intermediate_form"] = self._string_to_pua(rule["out"], position)
        rule["match_pattern"] = self.rule_to_regex(rule)

    # Finally, remove rules with an empty match pattern, typically empty rules
    mapping = [rule for rule in mapping if rule["match_pattern"]]

    self.processed = True
    return mapping