Example #1
0
def network_to_echart(write_to_file: bool = False, layout: bool = False):
    """Convert LANGS_NETWORK into node and edge lists for an ECharts graph.

    Every node becomes a dict with 'name', 'symbolSize', 'id' and 'category'
    (the part of the node name before the first '-'). The symbol size is the
    percentage of the network that is an ancestor of the node plus the
    percentage that is a descendant, clamped to the range [2, 20]. Nodes are
    returned sorted by name; edges are {'source', 'target'} dicts.

    When ``write_to_file`` is true, the same data is also written to
    ``languages-network.json`` in the directory of ``static_file``.
    ``layout`` is accepted but not used by this function.

    Returns:
        A ``(nodes, edges)`` tuple of lists.
    """
    node_count = len(LANGS_NETWORK.nodes)

    def _to_echart_node(name):
        category = name.split('-')[0]
        upstream = len(ancestors(LANGS_NETWORK, name))
        downstream = len(descendants(LANGS_NETWORK, name))
        share = (upstream / node_count) * 100 + (downstream / node_count) * 100
        return {
            'name': name,
            'symbolSize': min(20, max(2, share)),
            'id': name,
            'category': category
        }

    nodes = sorted(map(_to_echart_node, LANGS_NETWORK.nodes),
                   key=lambda entry: entry['name'])
    edges = [{'source': link[0], 'target': link[1]}
             for link in LANGS_NETWORK.edges]

    if write_to_file:
        out_path = os.path.join(os.path.dirname(static_file),
                                'languages-network.json')
        with open(out_path, 'w') as out:
            out.write(json.dumps({'nodes': nodes, 'edges': edges}))
        LOGGER.info('Wrote network nodes and edges to static file.')
    return nodes, edges
Example #2
0
def write_generated_mapping_to_file(config: dict, mapping: List[dict]):
    """Write a generated mapping to disk and register it in GEN_CONFIG.

    The mapping rules are dumped as JSON to ``GEN_DIR/<config['mapping']>``,
    overwriting any file already there. The generated-mappings config file
    (``GEN_CONFIG``) is then updated and rewritten: the first entry with the
    same ``in_lang``/``out_lang`` pair as ``config`` is replaced, otherwise
    ``config`` is appended. This includes the case where the config file
    lists no mappings yet.

    Args:
        config: configuration of the generated mapping; must contain the
            keys 'mapping', 'in_lang' and 'out_lang'.
        mapping: the mapping rules to serialise as JSON.
    """
    # read config (same encoding as it is written with below)
    with open(GEN_CONFIG, 'r', encoding='utf8') as f:
        data = yaml.safe_load(f) or {}
    map_output_path = os.path.join(GEN_DIR, config['mapping'])
    # write mapping
    if os.path.exists(map_output_path):
        LOGGER.info(f"Overwriting file at {map_output_path}")
    with open(map_output_path, 'w', encoding='utf8') as f:
        json.dump(mapping, f, indent=4)
    # An empty or missing 'mappings' entry is treated as "nothing generated yet"
    known_mappings = data.get('mappings') or []
    for i, cfg in enumerate(known_mappings):
        if cfg['in_lang'] == config['in_lang'] and cfg['out_lang'] == config['out_lang']:
            # replace the existing entry for this language pair
            known_mappings[i] = config
            break
    else:
        # no entry for this language pair yet
        known_mappings.append(config)
    data['mappings'] = known_mappings
    # rewrite config
    with open(GEN_CONFIG, 'w', encoding='utf8') as f:
        yaml.dump(data, f, Dumper=IndentDumper, default_flow_style=False)
Example #3
0
def escape_special_characters(to_escape: Dict[str, str]) -> Dict[str, str]:
    """Regex-escape every string value of ``to_escape`` in place.

    Non-string values are left untouched. Whenever escaping changes a value,
    an info message is logged. The same dict object that was passed in is
    returned.
    """
    for key in to_escape:
        v = to_escape[key]
        escaped = re.escape(v) if isinstance(v, str) else v
        if escaped != v:
            LOGGER.info(
                f"Escaped special characters in '{v}' with '{escaped}''. Set 'escape_special' to False in your Mapping configuration to disable this.")
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over it.
        to_escape[key] = escaped
    return to_escape
Example #4
0
File: api.py Project: deltork/g2p
def update_docs():
    """ Rewrite swagger.json so that the "Langs" schema enum lists the
        sorted nodes of LANGS_NETWORK
    """
    swagger_path = os.path.join(os.path.dirname(static_file), "swagger.json")
    with open(swagger_path) as swagger_in:
        spec = json.load(swagger_in)
    lang_names = sorted(LANGS_NETWORK.nodes)
    spec["components"]["schemas"]["Langs"]["enum"] = lang_names
    serialized = json.dumps(spec)
    with open(swagger_path, "w") as swagger_out:
        swagger_out.write(serialized)
    LOGGER.info("Updated API documentation")
Example #5
0
def update_docs():
    ''' Rewrite swagger.json so that the 'Langs' schema enum lists the
        sorted nodes of LANGS_NETWORK
    '''
    swagger_path = os.path.join(os.path.dirname(static_file), 'swagger.json')
    with open(swagger_path) as swagger_in:
        spec = json.load(swagger_in)
    lang_names = list(LANGS_NETWORK.nodes)
    lang_names.sort()
    spec['components']['schemas']['Langs']['enum'] = lang_names
    serialized = json.dumps(spec)
    with open(swagger_path, 'w') as swagger_out:
        swagger_out.write(serialized)
    LOGGER.info('Updated API documentation')
Example #6
0
def normalize(inp: str, norm_form: str):
    ''' Normalize to NFC(omposed) or NFD(ecomposed).
        Also, find any Unicode Escapes & decode 'em!

        A norm_form of None or 'none' returns inp unchanged. Any other value
        must be one of 'NFC', 'NFD', 'NFKC' or 'NFKD'; otherwise
        exceptions.InvalidNormalization is raised with the offending value.
    '''
    # The "no normalization" cases must be checked before validation:
    # None is not a member of the valid forms and would otherwise be rejected.
    if norm_form is None or norm_form == 'none':
        return inp
    if norm_form not in ('NFC', 'NFD', 'NFKC', 'NFKD'):
        # Report the invalid value itself
        raise exceptions.InvalidNormalization(norm_form)
    normalized = ud.normalize(norm_form, unicode_escape(inp))
    if normalized != inp:
        LOGGER.info(
            'The string %s was normalized to %s using the %s standard and by decoding any Unicode escapes. Note that this is not necessarily the final stage of normalization.',
            inp, normalized, norm_form)
    return normalized
Example #7
0
File: utils.py Project: joanise/g2p
def load_mapping_from_path(path_to_mapping_config, index=0):
    ''' Loads a mapping from a path, if there is more than one mapping, then it loads based on the int
        provided to the 'index' argument. Default is 0.

        The path must exist and have a suffix ending in 'yml' or 'yaml',
        otherwise FileNotFoundError is raised. The selected mapping must have
        a 'mapping' key naming its data file (resolved relative to the config
        file), otherwise exceptions.MalformedMapping is raised. The loaded
        rules are stored under 'mapping_data' and, when an 'abbreviations'
        key is present, the loaded abbreviations under 'abbreviations_data'.
    '''
    path = Path(path_to_mapping_config)
    # Guard clause: only an existing YAML file can be a mapping config
    if not (path.exists() and path.suffix.endswith(('yml', 'yaml'))):
        raise FileNotFoundError(
            f'No YAML mapping config found at {path_to_mapping_config}')
    # safe load it
    with open(path, encoding='utf8') as f:
        mapping = yaml.safe_load(f)
    # If more than one mapping in the mapping config
    if 'mappings' in mapping:
        try:
            selected = mapping['mappings'][index]
            LOGGER.info(
                'Loading mapping from %s between "%s" and "%s" at index %s',
                path_to_mapping_config, selected['in_lang'],
                selected['out_lang'], index)
            mapping = selected
        except (KeyError, IndexError):
            # An out-of-range index on a list raises IndexError, not KeyError
            LOGGER.warning(
                'An index of %s was provided for the mapping %s but that index does not exist in the mapping. Please check your mapping.',
                index, path_to_mapping_config)
    # Log the warning if an Index other than 0 was provided for a mapping config with a single mapping.
    elif index != 0:
        LOGGER.warning(
            'An index of %s was provided for the mapping %s but that index does not exist in the mapping. Please check your mapping.',
            index, path_to_mapping_config)
    # try to load the data from the mapping data file
    if 'mapping' not in mapping:
        # Is "mapping" key missing?
        raise exceptions.MalformedMapping
    mapping['mapping_data'] = load_from_file(
        os.path.join(path.parent, mapping['mapping']))
    # load any abbreviations
    if 'abbreviations' in mapping:
        mapping['abbreviations_data'] = load_abbreviations_from_file(
            os.path.join(path.parent, mapping['abbreviations']))
    return mapping
Example #8
0
    def test_convert(self):
        """Run every g2p convert test case, with and without tokenization.

        Each entry of ``self.langs_to_test`` is indexed as
        ``[in_lang, out_lang, word to convert, expected output]``. Every
        mismatch is logged as a warning so that all failures are visible in
        the log; the first failing case is then re-run and asserted on so the
        test itself fails.
        """
        LOGGER.info(
            f"Running {len(self.langs_to_test)} g2p convert test cases found in public/data"
        )
        error_count = 0
        first_failed_test = None
        first_failed_tok_option = None
        for tok_option in [["--tok", "--check"], ["--no-tok"]]:
            for test in self.langs_to_test:
                output_string = self.runner.invoke(
                    convert,
                    [*tok_option, test[2], test[0], test[1]]).stdout.strip()
                if output_string != test[3].strip():
                    LOGGER.warning(
                        f"test_cli.py: {test[0]}->{test[1]} mapping error: '{test[2]}' "
                        f"should map to '{test[3]}', got '{output_string}' (with {tok_option})."
                    )
                    if error_count == 0:
                        first_failed_test = test
                        first_failed_tok_option = tok_option
                    error_count += 1

        if error_count > 0:
            reference_string = first_failed_test[3]
            output_string = self.runner.invoke(
                convert,
                [
                    # tok_option is a list of flags: unpack it exactly as in
                    # the loop above
                    *first_failed_tok_option,
                    first_failed_test[2],  # word to convert
                    first_failed_test[0],  # in_lang
                    first_failed_test[1],  # out_lang
                ],
            ).stdout.strip()
            self.assertEqual(
                output_string,
                reference_string.strip(),
                f"{first_failed_test[0]}->{first_failed_test[1]} mapping error "
                f"for '{first_failed_test[2]}'.\n"
                "Look for warnings in the log for any more mapping errors",
            )
Example #9
0
def make_g2p(in_lang: str, out_lang: str):
    """Build a transducer converting ``in_lang`` to ``out_lang``.

    The shortest path between the two nodes of LANGS_NETWORK is looked up
    and a Mapping is created for each consecutive pair of nodes on it.

    Returns:
        A Transducer when exactly one mapping is needed, otherwise a
        CompositeTransducer over one Transducer per mapping.

    Raises:
        FileNotFoundError: if ``in_lang`` or ``out_lang`` is not a node in
            the network.
        NetworkXNoPath: if no path connects the two nodes.
    """
    # Check in_lang is a node in network
    if in_lang not in LANGS_NETWORK.nodes:
        LOGGER.error(f"No lang called {in_lang}. Please try again.")
        raise FileNotFoundError(f"No lang called {in_lang}.")

    # Check out_lang is a node in network
    if out_lang not in LANGS_NETWORK.nodes:
        LOGGER.error(f"No lang called {out_lang}. Please try again.")
        raise FileNotFoundError(f"No lang called {out_lang}.")

    # Try to find the shortest path between the nodes
    try:
        path = shortest_path(LANGS_NETWORK, in_lang, out_lang)
    except NetworkXNoPath:
        LOGGER.error(
            f"Sorry, we couldn't find a way to convert {in_lang} to {out_lang}. Please update your langs by running `g2p update` and try again."
        )
        # Re-raise the original exception so its message is preserved
        raise

    # Find all mappings needed: one per consecutive pair of nodes on the path.
    # Pairing with zip avoids indexing past the end of the path, so no
    # IndexError needs to be caught (and none raised by Mapping is hidden).
    mappings_needed = []
    for step_in, step_out in zip(path, path[1:]):
        mapping = Mapping(in_lang=step_in, out_lang=step_out)
        LOGGER.info(
            f"Adding mapping between {step_in} and {step_out} to composite transducer."
        )
        mappings_needed.append(mapping)

    # Either return Transducer or Composite Transducer
    if len(mappings_needed) == 1:
        return Transducer(mappings_needed[0])
    return CompositeTransducer([Transducer(x) for x in mappings_needed])
Example #10
0
def doctor(mapping, list_all, list_ipa):
    """ Check for common errors in mappings.
        There should eventually be more checks here, but doctor currently checks for:

        1. Characters that are in IPA mappings but are not recognized by panphon library.

        You can list available mappings with --list-all or --list-ipa, or by visiting
        http://g2p-studio.herokuapp.com/api/v1/langs .
    """
    # Listing mode: print every output language with its input languages.
    if list_all or list_ipa:
        out_langs = sorted({entry["out_lang"] for entry in MAPPINGS_AVAILABLE})
        if list_ipa:
            out_langs = [lang for lang in out_langs if is_ipa(lang)]
        LOGGER.info("Specifying an output language will check all mappings into that language:\n")
        for m in out_langs:
            sources = [
                entry["in_lang"]
                for entry in MAPPINGS_AVAILABLE
                if entry["out_lang"] == m
            ]
            # Continuation lines are indented to line up after "<out_lang>: "
            separator = "\n" + " " * len(m) + "  "
            print(f"{m}: ", end="")
            print(separator.join(sources))
            print("")
        return

    # Checking mode: every requested name must be a known output language.
    for m in mapping:
        known = any(entry["out_lang"] == m for entry in MAPPINGS_AVAILABLE)
        if not known:
            raise click.UsageError(
                f"No known mappings into '{m}'. "
                "Use --list-all or --list-ipa to list valid options."
            )
        if not is_ipa(m):
            LOGGER.warning(
                f"No checks implemented yet for non-IPA mappings: '{m}' will not be checked."
            )

    if mapping:
        LOGGER.info("Checking the following mappings: \n" + "\n".join(mapping))
    else:
        LOGGER.info("Checking all IPA mappings.")

    check_ipa_known_segs(list(mapping))
Example #11
0
File: cli.py Project: deltork/g2p
def generate_mapping(
    in_lang,
    out_lang,
    dummy,
    ipa,
    list_dummy,
    out_dir,
    merge,
    from_langs,
    to_langs,
    distance,
):
    """ Generate a new mapping from existing mappings in the g2p system.

        This command has different modes of operation.

        Standard mode:

          g2p generate-mapping [--dummy|--ipa] IN_LANG [OUT_LANG]

          For specified IN_LANG, generate a mapping from IN_LANG-ipa to eng-ipa,
          or from IN_LANG-ipa to a dummy minimalist phone inventory. This assumes
          the mapping IN_LANG -> IN_LANG-ipa exists and creates a mapping from its
          output inventory.

          To generate a mapping from IN_LANG-ipa to eng-ipa from a mapping
          following a different patterns, e.g., from crl-equiv -> crl-ipa, specify
          both IN_LANG (crl-equiv in this example) and OUT_LANG (crl-ipa in this
          example).

          \b
          Sample usage:
            Generate Algonquin IPA to English IPA from alq -> alq-ipa:
                g2p generate-mapping --ipa alq
            Generate Mohawk IPA to English IPA from moh-equiv -> moh-ipa:
                g2p generate-mapping --ipa moh-equiv moh-ipa
            Generate Michif IPA to English IPA from the union of crg-dv -> crg-ipa
            and crg-tmd -> crg-ipa:
                g2p generate-mapping --ipa --merge crg-dv:crg-tmd crg-ipa

        List the dummy inventory used by --dummy:

          g2p generate-mapping --list-dummy

        From/to IPA mode:

        \b
          g2p generate-mapping --from FROM_L1 --to TO_L1
          g2p generate-mapping --from FROM_L1:FROM_L2:... --to TO_L1:TO_L2:...

          Generate an IPA mapping from the union of FROM_L1-ipa, FROM-L2-ipa, etc to
          the union of TO_L1-ipa, TO-L2-ipa, etc. One or more from/to language
          code(s) can be specified in colon- or comma-separated lists. Note, by default
          we use Panphon's weighted_feature_edit_distance, but you can change this with
          the --distance argument

        \b
          Sample usage:
            Generate a mapping from kwk-ipa to moh-ipa based on all mappings into
            kwk-ipa and moh-ipa:
                g2p generate-mapping --from kwk --to moh
            Generate a mapping from eng-ipa to crg-ipa based only on crg-dv -> crg-ipa:
                g2p generate-mapping --from eng --to crg-dv_to_crg-ipa
            Generate a mapping from kwk-ipa to moh-ipa+crg-ipa+eng-ipa based on
            all mappings into kwk-ipa (from side) and the union of all mappings
            into moh-ipa and crg-ipa plus eng-ipa_to_eng-arpabet (to side):
                g2p generate-mapping --from kwk --to moh:crg:eng

          Full syntax for specifying FROM_Ln and TO_Ln:

          \b
            lang (i.e., 3-letter code):
             - If there is only one mapping into lang-ipa, "lang" refers to the
               output of that mapping, e.g., "fra" means "fra_to_fra-ipa[out]".
             - If there are several mappings into lang-ipa, "lang" refers to the
               union of the outputs of those mappings, e.g., "moh" means the union
               of "moh-equiv_to_moh-ipa[out]" and "moh-festival_to_moh-ipa[out]".
             - It is an error if there are no mappings into lang-ipa.
             - Only mappings from non-IPA to IPA are considered (i.e., IPA-to-IPA
               mappings created by this command will not be included: use the
               longer syntax below if you want to use them).
             - Special case: "eng" refers to "eng-ipa_to_eng-arpabet[in]".

          \b
            in-lang_to_out-lang[[in]|[out]]:
             - This expanded syntax is used to avoid the union when it is not
               desired, e.g., "moh-equiv_to_moh-ipa" refers only to
               "moh-equiv_to_moh-ipa,out" rather than the union "moh" represents.
             - If out-lang is IPA, the output inventory is used; else if in-lang
               is IPA, the input inventory is used; it is an error if neither
               language is IPA.
             - Specify "[in]" or "[out]" to override the above default.
             - "_to_" is the joiner used to specify "the mapping from 'in-lang' to
               'out-lang'" in the g2p network, regardless of the name of the file
               it is stored in.

        If you just modified or created the mappings from which the new mapping is
        to be generated, don't forget to call "g2p update" first, so that "g2p
        generate-mapping" can see the latest version.

        Call "g2p update" again after calling "g2p generate-mapping" to compile
        the newly generated mapping and make it available.

        Note: exactly one of --ipa, --dummy, --from/--to, or --list-dummy is
        required.

        You can list available mappings with "g2p doctor --list-ipa", or by
        visiting http://g2p-studio.herokuapp.com/api/v1/langs .
    """

    # Make sure only one mode was specified on the command line.
    # --from and --to together count as a single mode.
    mode_count = sum(
        bool(flag) for flag in (ipa, dummy, list_dummy, from_langs or to_langs))
    if mode_count == 0:
        raise click.UsageError(
            "Nothing to do! Please specify at least one of --ipa, --dummy, "
            "--list-dummy, or --from/--to.")
    if mode_count > 1:
        raise click.UsageError(
            "Multiple modes selected. Choose only one of --ipa, --dummy, "
            "--list-dummy, or --from/--to.")

    if list_dummy or from_langs is not None or to_langs is not None:
        if in_lang is not None:
            raise click.UsageError(
                "IN_LANG is not allowed with --list-dummy or --from/--to")

    if from_langs is not None or to_langs is not None:
        if from_langs is None or to_langs is None:
            raise click.UsageError("--from and --to must be used together")

    if merge:
        if not ipa and not dummy:
            raise click.UsageError(
                "--merge is only compatible with --ipa and --dummy.")
        if out_lang is None:
            raise click.UsageError("OUT_LANG is required with --merge.")

    if out_dir and not os.path.isdir(out_dir):
        raise click.BadParameter(
            f'Output directory "{out_dir}" does not exist. Cannot write mapping.',
            param_hint="--out-dir",
        )

    if list_dummy:
        # --list-dummy mode: nothing is generated or written
        print("Dummy phone inventory: {}".format(DUMMY_INVENTORY))
        return

    if ipa or dummy:
        # --ipa and --dummy modes
        if in_lang is None:
            raise click.UsageError("Missing argument 'IN_LANG'.")
        # With --merge, IN_LANG is a colon-separated list of input languages
        in_langs = in_lang.split(":") if merge else [in_lang]

        in_lang_choices = [
            x for x in LANGS_NETWORK.nodes
            if not is_ipa(x) and not is_xsampa(x)
        ]
        for lang in in_langs:
            if lang not in in_lang_choices:
                raise click.UsageError(
                    f'Invalid value for IN_LANG: "{lang}".\n'
                    "IN_LANG must be a non-IPA language code with an existing IPA mapping, "
                    f"i.e., one of:\n{', '.join(in_lang_choices)}.")

        out_lang_choices = [x for x in LANGS_NETWORK.nodes if is_ipa(x)]
        if out_lang is None:
            out_lang = f"{in_lang}-ipa"
        elif out_lang not in out_lang_choices:
            raise click.UsageError(
                f'Invalid value for OUT_LANG: "{out_lang}".\n'
                "OUT_LANG must be an IPA language code with an existing mapping from IN_LANG, "
                f"i.e., one of:\n{', '.join(out_lang_choices)}")

        source_mappings = []
        for lang in in_langs:
            try:
                source_mapping = Mapping(in_lang=lang, out_lang=out_lang)
            except MappingMissing as e:
                raise click.BadParameter(
                    f'Cannot find IPA mapping from "{lang}" to "{out_lang}": {e}',
                    param_hint=["IN_LANG", "OUT_LANG"],
                ) from e
            source_mappings.append(source_mapping)

        if ipa:
            # Check the IPA language actually used as the source inventory.
            # Deriving the name from IN_LANG is wrong when OUT_LANG is given
            # explicitly or when IN_LANG is a --merge list.
            check_ipa_known_segs([out_lang])
            eng_ipa = Mapping(in_lang="eng-ipa", out_lang="eng-arpabet")
            click.echo(f"Writing English IPA mapping for {out_lang} to file")
            new_mapping = create_mapping(source_mappings[0],
                                         eng_ipa,
                                         distance=distance)
            for m in source_mappings[1:]:
                new_mapping.extend(
                    create_mapping(m, eng_ipa, distance=distance))
        else:  # dummy
            click.echo(
                f"Writing dummy fallback mapping for {out_lang} to file")
            new_mapping = align_to_dummy_fallback(source_mappings[0],
                                                  distance=distance)
            for m in source_mappings[1:]:
                new_mapping.extend(
                    align_to_dummy_fallback(m, distance=distance))

        new_mapping.deduplicate()

    else:
        # --from/--to mode: exactly one mode is selected and it is none of
        # the others, and the checks above guarantee that both --from and
        # --to were provided.
        from_mappings = []
        for from_lang in re.split(r"[:,]", from_langs):
            from_mappings.extend(parse_from_or_to_lang_spec(from_lang))
        to_mappings = []
        for to_lang in re.split(r"[:,]", to_langs):
            to_mappings.extend(parse_from_or_to_lang_spec(to_lang))

        if not from_mappings:
            raise click.UsageError(
                f'Invalid --from value "{from_langs}": no mappings found.')
        if not to_mappings:
            raise click.UsageError(
                f'Invalid --to value "{to_langs}": no mappings found.')

        for from_mapping, in_or_out in from_mappings:
            LOGGER.info(
                f'From mapping: {from_mapping.kwargs["in_lang"]}_to_{from_mapping.kwargs["out_lang"]}[{in_or_out}]'
            )
        for to_mapping, in_or_out in to_mappings:
            LOGGER.info(
                f'To mapping: {to_mapping.kwargs["in_lang"]}_to_{to_mapping.kwargs["out_lang"]}[{in_or_out}]'
            )

        new_mapping = create_multi_mapping(from_mappings,
                                           to_mappings,
                                           distance=distance)

    # Write the config and the mapping, to --out-dir when given, otherwise to
    # the default location chosen by the Mapping methods.
    if out_dir:
        new_mapping.config_to_file(out_dir)
        new_mapping.mapping_to_file(out_dir)
    else:
        new_mapping.config_to_file()
        new_mapping.mapping_to_file()