コード例 #1
0
 def test_response_code(self):
     '''
     Ensure all routes with no arguments return HTTP 200.

     Bug fix: the original wrapped ``assertEqual`` in a bare ``except``,
     which swallowed the ``AssertionError`` raised on a failing status
     check and mislabelled it as a connection error.  Only the HTTP call
     itself is guarded now, and the assertion runs outside the guard.
     '''
     for rt in self.routes_no_args:
         try:
             r = self.client().get(rt)
         except Exception:
             # Connection-level failure only; assertion errors propagate.
             LOGGER.error("Couldn't connect. Is flask running?")
             continue
         self.assertEqual(r.status_code, 200)
         LOGGER.debug("Route " + rt + " returned " + str(r.status_code))
コード例 #2
0
def escape_special_characters(to_escape: Dict[str, str]) -> Dict[str, str]:
    """Escape regex special characters in the string values of *to_escape*.

    Non-string values are left untouched.  The dict is modified in place
    and also returned for convenience.

    Args:
        to_escape: mapping whose string values may contain regex
            metacharacters.

    Returns:
        The same dict, with string values passed through ``re.escape``.
    """
    for key, value in to_escape.items():
        # Only strings can contain regex metacharacters worth escaping.
        escaped = re.escape(value) if isinstance(value, str) else value
        if escaped != value:
            # Bug fix: the message previously ended with a stray doubled
            # quote ("'{escaped}''").
            LOGGER.debug(
                f"Escaped special characters in '{value}' with '{escaped}'. Set 'escape_special' to False in your Mapping configuration to disable this."
            )
        to_escape[key] = escaped
    return to_escape
コード例 #3
0
 def test_response_code_with_args(self):
     '''
     Ensure all argument-taking routes return HTTP 200 for every node.

     Bug fix: the original wrapped ``assertEqual`` in a bare ``except``,
     which swallowed the ``AssertionError`` raised on a failing status
     check and mislabelled it as a connection error.  Only the HTTP call
     itself is guarded now.
     '''
     for ep in self.routes_only_args:
         for node in LANGS_NETWORK.nodes:
             rt = re.sub(self.arg_match, node, ep)
             try:
                 r = self.client().get(rt)
             except Exception:
                 # Connection-level failure only; assertion errors propagate.
                 LOGGER.error("Couldn't connect. Is flask running?")
                 continue
             self.assertEqual(r.status_code, 200)
         LOGGER.debug("Successfully tested " + str(len(LANGS_NETWORK.nodes)
                                                   ) + " node resources at route " + ep + " .")
コード例 #4
0
def normalize(inp: str, norm_form: str):
    ''' Normalize to NFC(omposed) or NFD(ecomposed).
        Also, find any Unicode Escapes & decode 'em!

        Bug fixes relative to the original:
        * the ``None``/``'none'`` branch was unreachable because the
          validity check ran first (``None`` is not in the allowed list,
          so it raised instead of falling through);
        * ``InvalidNormalization`` was constructed with the function
          object ``normalize`` instead of the offending ``norm_form``.
    '''
    # Handle "no normalization" before validating against the NF* forms.
    if norm_form is None or norm_form == 'none':
        return unicode_escape(inp)
    if norm_form not in ['NFC', 'NFD', 'NFKC', 'NFKD']:
        raise exceptions.InvalidNormalization(norm_form)
    normalized = ud.normalize(norm_form, unicode_escape(inp))
    if normalized != inp:
        LOGGER.debug(
            'The string %s was normalized to %s using the %s standard and by decoding any Unicode escapes. Note that this is not necessarily the final stage of normalization.',
            inp, normalized, norm_form)
    return normalized
コード例 #5
0
ファイル: create_ipa_mapping.py プロジェクト: deltork/g2p
    def find_good_match(p1, inventory_l2):
        """Find a good sequence in inventory_l2 matching p1.

        Greedily consumes the panphon segmentation of ``p1``; at each
        position it picks the ``inventory_l2`` entry whose segments are
        closest (by the configured distance) to the corresponding slice
        of ``p1``, preferring longer matches on ties.
        """

        # The proper way to do this would be with some kind of beam search
        # through a determinized/minimized FST, but in the absence of that
        # we can do a kind of heuristic greedy search.  (we don't want any
        # dependencies outside of PyPI otherwise we'd just use OpenFST)

        p1_pseq = dst.fm.ipa_segs(p1)

        # Hoisted out of the loops: the distance method does not depend on
        # the loop variables.
        distance_method = get_distance_method(dst, distance)

        i = 0
        good_match = []
        while i < len(p1_pseq):
            best_input = ""
            best_output = -1
            best_score = 0xDEADBEEF
            for j, p2_pseq in enumerate(p2_pseqs):
                # FIXME: Should also consider the (weighted) possibility
                # of deleting input or inserting any segment (but that
                # can't be done with a greedy search)
                if len(p2_pseq) == 0:
                    LOGGER.warning(
                        "No panphon mapping for %s - skipping", inventory_l2[j]
                    )
                    continue
                e = min(i + len(p2_pseq), len(p1_pseq))
                input_seg = p1_pseq[i:e]
                score = distance_method("".join(input_seg), "".join(p2_pseq))
                # Be very greedy and take the longest match
                if (
                    score < best_score
                    or score == best_score
                    and len(input_seg) > len(best_input)
                ):
                    best_input = input_seg
                    best_output = j
                    best_score = score
            if best_output < 0:
                # Bug fix: if every l2 entry had an empty panphon mapping,
                # best_input stayed "" so i never advanced (infinite loop)
                # and inventory_l2[-1] was appended spuriously.  Bail out.
                LOGGER.warning(
                    "No valid match found at position %d in %s - truncating match",
                    i,
                    p1,
                )
                break
            LOGGER.debug(
                "Best match at position %d: %s => %s",
                i,
                best_input,
                inventory_l2[best_output],
            )
            good_match.append(inventory_l2[best_output])
            i += len(best_input)  # greedy!
        return "".join(good_match)
コード例 #6
0
ファイル: utils.py プロジェクト: deltork/g2p
def normalize(inp: str, norm_form: str):
    """ Normalize to NFC(omposed) or NFD(ecomposed).
        Also, find any Unicode Escapes & decode 'em!

        Bug fixes relative to the original:
        * the ``None``/``"none"`` branch was unreachable because the
          validity check ran first (``None`` is not in the allowed list,
          so it raised instead of falling through);
        * ``InvalidNormalization`` was constructed with the function
          object ``normalize`` instead of the offending ``norm_form``.
    """
    # Handle "no normalization" before validating against the NF* forms.
    if norm_form is None or norm_form == "none":
        return unicode_escape(inp)
    if norm_form not in ["NFC", "NFD", "NFKC", "NFKD"]:
        raise exceptions.InvalidNormalization(norm_form)
    normalized = ud.normalize(norm_form, unicode_escape(inp))
    if normalized != inp:
        LOGGER.debug(
            "The string %s was normalized to %s using the %s standard and by decoding any Unicode escapes. "
            "Note that this is not necessarily the final stage of normalization.",
            inp,
            normalized,
            norm_form,
        )
    return normalized
コード例 #7
0
def load_mapping_from_path(path_to_mapping_config, index=0):
    ''' Loads a mapping from a path, if there is more than one mapping, then it loads based on the int
        provided to the 'index' argument. Default is 0.

        Raises FileNotFoundError if the path does not point at a yml/yaml
        file, and exceptions.MalformedMapping if the config lacks a
        "mapping" key.
    '''
    path = Path(path_to_mapping_config)
    # If path leads to actual mapping config
    if path.exists() and (path.suffix.endswith('yml')
                          or path.suffix.endswith('yaml')):
        # safe load it
        with open(path, encoding='utf8') as f:
            mapping = yaml.safe_load(f)
        # If more than one mapping in the mapping config
        if 'mappings' in mapping:
            try:
                LOGGER.debug(
                    'Loading mapping from %s between "%s" and "%s" at index %s',
                    path_to_mapping_config,
                    mapping['mappings'][index].get('in_lang', 'und'),
                    mapping['mappings'][index].get('out_lang', 'und'), index)
                mapping = mapping['mappings'][index]
            # Bug fix: 'mappings' is a list, so an out-of-range index
            # raises IndexError, which the original KeyError-only clause
            # failed to catch despite the warning text describing exactly
            # that situation.
            except (KeyError, IndexError):
                LOGGER.warning(
                    'An index of %s was provided for the mapping %s but that index does not exist in the mapping. Please check your mapping.',
                    index, path_to_mapping_config)
        # Log the warning if an Index other than 0 was provided for a mapping config with a single mapping.
        elif index != 0:
            LOGGER.warning(
                'An index of %s was provided for the mapping %s but that index does not exist in the mapping. Please check your mapping.',
                index, path_to_mapping_config)
        # try to load the data from the mapping data file
        if 'mapping' in mapping:
            mapping['mapping_data'] = load_from_file(
                os.path.join(path.parent, mapping['mapping']))
        else:
            # Is "mapping" key missing?
            raise exceptions.MalformedMapping
        # load any abbreviations
        if 'abbreviations' in mapping:
            mapping['abbreviations_data'] = load_abbreviations_from_file(
                os.path.join(path.parent, mapping['abbreviations']))
        return mapping
    else:
        raise FileNotFoundError
コード例 #8
0
ファイル: create_ipa_mapping.py プロジェクト: joanise/g2p
def find_good_match(p1, inventory_l2, l2_is_xsampa=False):
    """Find a good sequence in inventory_l2 matching p1.

    Greedily consumes the panphon segmentation of ``p1``; at each
    position it picks the ``inventory_l2`` entry whose segments have the
    smallest weighted feature edit distance to the corresponding slice
    of ``p1``, preferring longer matches on ties.
    """

    dst = panphon.distance.Distance()
    # The proper way to do this would be with some kind of beam search
    # through a determinized/minimized FST, but in the absence of that
    # we can do a kind of heuristic greedy search.  (we don't want any
    # dependencies outside of PyPI otherwise we'd just use OpenFST)
    p1_pseq = dst.fm.ipa_segs(p1)
    p2_pseqs = [
        dst.fm.ipa_segs(p)
        for p in process_characters(inventory_l2, l2_is_xsampa)
    ]
    i = 0
    good_match = []
    while i < len(p1_pseq):
        best_input = ""
        best_output = -1
        best_score = 0xdeadbeef
        for j, p2_pseq in enumerate(p2_pseqs):
            # FIXME: Should also consider the (weighted) possibility
            # of deleting input or inserting any segment (but that
            # can't be done with a greedy search)
            if len(p2_pseq) == 0:
                LOGGER.warning('No panphon mapping for %s - skipping',
                               inventory_l2[j])
                continue
            e = min(i + len(p2_pseq), len(p1_pseq))
            input_seg = p1_pseq[i:e]
            score = dst.weighted_feature_edit_distance(''.join(input_seg),
                                                       ''.join(p2_pseq))
            # Be very greedy and take the longest match
            if (score < best_score or score == best_score
                    and len(input_seg) > len(best_input)):
                best_input = input_seg
                best_output = j
                best_score = score
        if best_output < 0:
            # Bug fix: when every l2 entry has an empty panphon mapping,
            # best_input stays "" so i never advances (infinite loop) and
            # inventory_l2[-1] is appended spuriously.  Bail out instead.
            LOGGER.warning(
                'No valid match found at position %d in %s - truncating match',
                i, p1)
            break
        LOGGER.debug('Best match at position %d: %s => %s', i, best_input,
                     inventory_l2[best_output])
        good_match.append(inventory_l2[best_output])
        i += len(best_input)  # greedy!
    return ''.join(good_match)
コード例 #9
0
ファイル: __init__.py プロジェクト: bradley-ellert/g2p
def make_g2p(in_lang: str, out_lang: str):
    """Create a transducer mapping in_lang to out_lang via the shortest
    path of mappings between them.

    Args:
        in_lang (str): input language code
        out_lang (str): output language code

    Returns:
        Transducer (single hop) or CompositeTransducer (multiple hops)

    Raises:
        FileNotFoundError: if either language code is unknown
        NetworkXNoPath: if no conversion path exists
    """
    # Check in_lang is a node in network
    if in_lang not in LANGS_NETWORK.nodes:
        LOGGER.error(f"No lang called {in_lang}. Please try again.")
        # Bug fix: the message was not an f-string, so the literal text
        # "{in_lang}" was raised instead of the language code.
        raise FileNotFoundError(f"No lang called {in_lang}.")

    # Check out_lang is a node in network
    if out_lang not in LANGS_NETWORK.nodes:
        LOGGER.error(f"No lang called {out_lang}. Please try again.")
        # Bug fix: same missing f-prefix as above.
        raise FileNotFoundError(f"No lang called {out_lang}.")

    # Try to find the shortest path between the nodes
    try:
        path = shortest_path(LANGS_NETWORK, in_lang, out_lang)
    except NetworkXNoPath:
        LOGGER.error(
            f"Sorry, we couldn't find a way to convert {in_lang} to {out_lang}. Please update your langs by running `g2p update` and try again."
        )
        # Re-raise the original exception (preserving its traceback)
        # instead of constructing a fresh, argument-less NetworkXNoPath.
        raise

    # Find all mappings needed, one per edge along the path.  Iterating
    # adjacent pairs replaces the original enumerate + IndexError dance.
    mappings_needed = []
    for lang1, lang2 in zip(path[:-1], path[1:]):
        mapping = Mapping(in_lang=lang1, out_lang=lang2)
        LOGGER.debug(
            f"Adding mapping between {lang1} and {lang2} to composite transducer."
        )
        mappings_needed.append(mapping)

    # Either return Transducer or Composite Transducer
    if len(mappings_needed) == 1:
        return Transducer(mappings_needed[0])
    else:
        return CompositeTransducer([Transducer(x) for x in mappings_needed])
コード例 #10
0
ファイル: __init__.py プロジェクト: deltork/g2p
def make_g2p(in_lang: str, out_lang: str, tok_lang=None):
    """Build (and memoize) a transducer that converts text from in_lang
    to out_lang along the shortest conversion path between them.

    Args:
        in_lang (str): input language code
        out_lang (str): output language code
        tok_lang: optional tokenizer language code; the special value
            "path" tokenizes using the whole conversion path

    Returns:
        Transducer from in_lang to out_lang

    Raises:
        InvalidLanguageCode: if in_lang or out_lang don't exist
        NoPath: if there is no path between in_lang and out_lang
    """
    cache_key = (in_lang, out_lang, tok_lang)
    if cache_key in _g2p_cache:
        return _g2p_cache[cache_key]

    # Both endpoints must be known language nodes (input checked first).
    for lang_code in (in_lang, out_lang):
        if lang_code not in LANGS_NETWORK.nodes:
            LOGGER.error(f"No lang called '{lang_code}'. Please try again.")
            raise InvalidLanguageCode(lang_code)

    # Identity conversions are rejected outright.
    if in_lang == out_lang:
        LOGGER.error(
            "Sorry, you can't transduce between the same language. Please select a different output language code."
        )
        raise NoPath(in_lang, out_lang)

    # Find the shortest chain of mappings linking the two languages.
    try:
        hops = shortest_path(LANGS_NETWORK, in_lang, out_lang)
    except NetworkXNoPath as err:
        LOGGER.error(
            f"Sorry, we couldn't find a way to convert {in_lang} to {out_lang}. Please update your langs by running `g2p update` and try again."
        )
        raise NoPath(in_lang, out_lang) from err

    # One Mapping per edge along the path.
    steps = []
    for src, tgt in zip(hops, hops[1:]):
        LOGGER.debug(
            f"Adding mapping between {src} and {tgt} to composite transducer."
        )
        steps.append(Mapping(in_lang=src, out_lang=tgt))

    # Single hop -> plain Transducer; otherwise compose them.
    if len(steps) == 1:
        result = Transducer(steps[0])
    else:
        result = CompositeTransducer([Transducer(step) for step in steps])

    # Wrap in a TokenizingTransducer when tokenization was requested.
    if tok_lang:
        if tok_lang == "path":
            tokenizer = make_tokenizer(in_lang=in_lang, tok_path=hops)
        else:
            tokenizer = make_tokenizer(in_lang=tok_lang)
        result = TokenizingTransducer(result, tokenizer)

    _g2p_cache[cache_key] = result
    return result
コード例 #11
0
ファイル: utils.py プロジェクト: deltork/g2p
def load_mapping_from_path(path_to_mapping_config, index=0):
    """ Loads a mapping from a path, if there is more than one mapping, then it loads based on the int
        provided to the 'index' argument. Default is 0.

        Raises FileNotFoundError if the path does not point at a yml/yaml
        file, and exceptions.MalformedMapping if the config lacks a
        "mapping" key or its data files cannot be loaded.
    """
    path = Path(path_to_mapping_config)
    # If path leads to actual mapping config
    if path.exists() and (path.suffix.endswith("yml") or path.suffix.endswith("yaml")):
        # safe load it
        with open(path, encoding="utf8") as f:
            mapping = yaml.safe_load(f)
        # If more than one mapping in the mapping config
        if "mappings" in mapping:
            try:
                LOGGER.debug(
                    'Loading mapping from %s between "%s" and "%s" at index %s',
                    path_to_mapping_config,
                    mapping["mappings"][index].get("in_lang", "und"),
                    mapping["mappings"][index].get("out_lang", "und"),
                    index,
                )
                mapping = mapping["mappings"][index]
            # Bug fix: "mappings" is a list, so an out-of-range index
            # raises IndexError, which the original KeyError-only clause
            # failed to catch despite the warning text describing exactly
            # that situation.
            except (KeyError, IndexError):
                LOGGER.warning(
                    "An index of %s was provided for the mapping %s but that index does not exist in the mapping. "
                    "Please check your mapping.",
                    index,
                    path_to_mapping_config,
                )
        # Log the warning if an Index other than 0 was provided for a mapping config with a single mapping.
        elif index != 0:
            LOGGER.warning(
                "An index of %s was provided for the mapping %s but that index does not exist in the mapping. "
                "Please check your mapping.",
                index,
                path_to_mapping_config,
            )
        # try to load the data from the mapping data file
        if "mapping" in mapping:
            try:
                mapping["mapping_data"] = load_from_file(
                    os.path.join(path.parent, mapping["mapping"])
                )
            except (OSError, exceptions.IncorrectFileType) as e:
                raise exceptions.MalformedMapping(
                    f"Cannot load mapping data file specified in {path}: {e}"
                ) from e
        elif mapping.get("type", "") == "unidecode":
            # This mapping is not implemented as a regular mapping, but as custom software
            pass
        else:
            # Is "mapping" key missing?
            raise exceptions.MalformedMapping(
                'Key "mapping:" missing from a mapping in {}.'.format(path)
            )
        # load any abbreviations
        if "abbreviations" in mapping:
            try:
                mapping["abbreviations_data"] = load_abbreviations_from_file(
                    os.path.join(path.parent, mapping["abbreviations"])
                )
            except (OSError, exceptions.IncorrectFileType) as e:
                raise exceptions.MalformedMapping(
                    f"Cannot load abbreviations data file specified in {path}: {e}"
                ) from e
        return mapping
    else:
        raise FileNotFoundError