Beispiel #1
0
def create_mapping(mapping_1: Mapping,
                   mapping_2: Mapping,
                   mapping_1_io: str = 'out',
                   mapping_2_io: str = 'in',
                   write_to_file: bool = False) -> Mapping:
    """Align an inventory of ``mapping_1`` with an inventory of ``mapping_2``.

    Args:
        mapping_1: mapping providing the first inventory.
        mapping_2: mapping providing the second inventory.
        mapping_1_io: which side of ``mapping_1`` to use ('in' or 'out').
        mapping_2_io: which side of ``mapping_2`` to use ('in' or 'out').
        write_to_file: when true, pass the generated config and rules to
            ``write_generated_mapping_to_file``.

    Returns:
        A new ``Mapping`` built from the aligned rules and the generated
        config (minus any 'mapping' entry of that config).
    """
    first_lang = mapping_1.kwargs[f'{mapping_1_io}_lang']
    second_lang = mapping_2.kwargs[f'{mapping_2_io}_lang']

    # Unsupported orthographies only produce a warning; alignment still runs.
    if not (is_ipa(first_lang) or is_xsampa(first_lang)):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s (must be ipa or x-sampa)",
            first_lang)
    if not (is_ipa(second_lang) or is_xsampa(second_lang)):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s (must be ipa or x-sampa)",
            second_lang)

    first_is_xsampa = is_xsampa(first_lang)
    second_is_xsampa = is_xsampa(second_lang)
    aligned_rules = align_inventories(
        mapping_1.inventory(mapping_1_io),
        mapping_2.inventory(mapping_2_io),
        first_is_xsampa,
        second_is_xsampa,
    )

    missing_name = 'No Language display name in Config'
    first_display = mapping_1.kwargs.get('language_name', missing_name)
    second_display = mapping_2.kwargs.get('language_name', missing_name)
    config = generate_config(first_lang, second_lang, first_display,
                             second_display)

    if write_to_file:
        write_generated_mapping_to_file(config, aligned_rules)

    # The rules are passed positionally, so drop any 'mapping' key from the
    # keyword arguments to avoid supplying them twice.
    mapping_kwargs = dict(config)
    mapping_kwargs.pop('mapping', None)
    return Mapping(aligned_rules, **mapping_kwargs)
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False, out_dir: str = ''):
    """Create a mapping from one inventory of ``mapping`` to ``DUMMY_INVENTORY``.

    Args:
        mapping: the mapping whose inventory is aligned.
        io: which inventory of ``mapping`` to use ('in' or 'out').
        write_to_file: when true, write the resulting config and mapping files.
        out_dir: directory to write to; when empty or not an existing
            directory, the default location of ``config_to_file`` /
            ``mapping_to_file`` is used.

    Returns:
        The new ``Mapping`` whose ``out_lang`` is 'dummy'.
    """
    lang = mapping.kwargs[f'{io}_lang']
    config = {'in_lang': lang, 'out_lang': 'dummy'}
    default_char = 't'
    if is_ipa(lang):
        mapping = align_inventories(mapping.inventory(io), DUMMY_INVENTORY)
    else:
        # Non-IPA inventory: first run each symbol (ASCII-folded, lowercased)
        # through the 'und' -> 'und-ipa' transducer, then align that output
        # with the dummy inventory.
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower()).output_string} for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], DUMMY_INVENTORY)
        dummy_dict = {x['in']: x['out'] for x in dummy_list if x['in']}

        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                # Logger.warn is a deprecated alias; use Logger.warning.
                LOGGER.warning(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char

    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir and os.path.isdir(out_dir):
            mapping.config_to_file(out_dir)
            mapping.mapping_to_file(out_dir)
        else:
            if out_dir:
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
            # Honour the warning above: actually fall back to the default
            # location instead of silently writing nothing.
            mapping.config_to_file()
            mapping.mapping_to_file()
    return mapping
Beispiel #3
0
def check_ipa_known_segs(mappings_to_check=False) -> bool:
    """Check the given mappings, or all IPA mappings, for invalid IPA in the "out" fields

    Args:
        mappings_to_check: collection of ``out_lang`` names to check; when
            falsy, every mapping in ``MAPPINGS_AVAILABLE`` is considered.
            Only mappings whose ``out_lang`` is IPA are actually checked.

    Returns True iff no errors were found.
    """
    if not mappings_to_check:
        mappings_to_check = [x["out_lang"] for x in MAPPINGS_AVAILABLE]
    found_error = False
    for mapping in MAPPINGS_AVAILABLE:
        if mapping["out_lang"] not in mappings_to_check:
            continue
        if not is_ipa(mapping["out_lang"]):
            continue
        # For reversed mappings the rule's "in" field is the effective output.
        reverse = mapping.get("reverse", False)
        for rule in mapping["mapping_data"]:
            output = rule["in"] if reverse else rule["out"]
            if not is_panphon(output):
                # Report the segment that was actually checked, which differs
                # from rule['out'] when the mapping is reversed.
                LOGGER.warning(
                    f"Output '{output}' in rule {rule} in mapping between {mapping['in_lang']} "
                    f"and {mapping['out_lang']} is not recognized as valid IPA by panphon."
                )
                found_error = True
    if found_error:
        LOGGER.warning(
            "Please refer to https://github.com/dmort27/panphon for information about panphon."
        )
    return not found_error
Beispiel #4
0
 def check(
     self,
     tg: TransductionGraph,
     shallow=False,
     display_warnings=False,
     original_input=None,
 ):
     """Validate the output string of ``tg`` against this mapping's out_lang.

     Args:
         tg: transduction result whose ``output_string`` is validated.
         shallow: accepted but not used by this method.
         display_warnings: when true, log a warning if validation fails.
         original_input: text to show as the input in warnings; falls back
             to ``tg.input_string`` when falsy.

     Returns:
         False when the output fails the eng-arpabet or IPA check that
         applies to ``out_lang``; True otherwise, including when no check
         exists for ``out_lang``.
     """
     out_lang = self.mapping.kwargs["out_lang"]

     if "eng-arpabet" in out_lang:
         if is_arpabet(tg.output_string):
             return True
         if display_warnings:
             display_input = original_input or tg.input_string
             LOGGER.warning(
                 f'Transducer output "{tg.output_string}" for input "{display_input}" is not fully valid eng-arpabet as recognized by soundswallower.'
             )
         return False

     if is_ipa(out_lang):
         if is_panphon(tg.output_string, display_warnings=display_warnings):
             return True
         if display_warnings:
             display_input = original_input or tg.input_string
             LOGGER.warning(
                 f'Transducer output "{tg.output_string}" for input "{display_input}" is not fully valid {out_lang}.'
             )
         return False

     # No check implemented at this tier, just return True
     return True
Beispiel #5
0
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False):
    """Align one inventory of ``mapping`` with a small dummy phone inventory.

    Args:
        mapping: the mapping whose inventory is aligned.
        io: which inventory of ``mapping`` to use ('in' or 'out').
        write_to_file: when true, pass the config and rules to
            ``write_generated_mapping_to_file``.

    Returns:
        A ``(config, mapping)`` pair: the generated config and the list of
        aligned rules.
    """
    dummy_inventory = ["ɑ", "i", "u", "t", "s", "n"]
    lang = mapping.kwargs[f'{io}_lang']
    display_name = mapping.kwargs.get('language_name', 'No Language display name in Config')
    config = generate_config(lang, 'dummy', display_name, display_name)
    default_char = 't'
    if is_ipa(lang):
        mapping = align_inventories(mapping.inventory(io), dummy_inventory)
    else:
        # Non-IPA inventory: pass each symbol (ASCII-folded, lowercased)
        # through the 'und' -> 'und-ipa' transducer, then align the results
        # with the dummy inventory.
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower())} for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], dummy_inventory)
        dummy_dict = {x['in']: x['out'] for x in dummy_list if x['in']}

        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                # Logger.warn is a deprecated alias; use Logger.warning.
                LOGGER.warning(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char

    if write_to_file:
        write_generated_mapping_to_file(config, mapping)
    return config, mapping
Beispiel #6
0
def create_mapping(
    mapping_1: Mapping,
    mapping_2: Mapping,
    mapping_1_io: str = "out",
    mapping_2_io: str = "in",
    distance: str = "weighted_feature_edit_distance",
) -> Mapping:
    """Build a mapping between an inventory of mapping_1 and one of mapping_2.

    By default the output inventory of ``mapping_1`` is aligned with the input
    inventory of ``mapping_2``; ``mapping_1_io`` / ``mapping_2_io`` select the
    sides. ``distance`` is forwarded to ``align_inventories``.

    The result inherits the configuration of ``mapping_1`` (minus authorship
    and display names), with ``prevent_feeding`` forced to True.
    """
    src_lang = mapping_1.kwargs[f"{mapping_1_io}_lang"]
    tgt_lang = mapping_2.kwargs[f"{mapping_2_io}_lang"]

    # Unsupported orthographies are reported but do not stop the alignment.
    if not (is_ipa(src_lang) or is_xsampa(src_lang)):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s (must be ipa or x-sampa)",
            src_lang,
        )
    if not (is_ipa(tgt_lang) or is_xsampa(tgt_lang)):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s (must be ipa or x-sampa)",
            tgt_lang,
        )

    src_is_xsampa = is_xsampa(src_lang)
    tgt_is_xsampa = is_xsampa(tgt_lang)
    aligned_rules = align_inventories(
        mapping_1.inventory(mapping_1_io),
        mapping_2.inventory(mapping_2_io),
        src_is_xsampa,
        tgt_is_xsampa,
        distance=distance,
    )

    # Start from the first mapping's settings (as_is, case_sensitive,
    # prevent_feeding, etc.) and strip what describes the source mapping only.
    config = mapping_1.kwargs.copy()
    for source_only_key in ("authors", "display_name", "language_name"):
        config.pop(source_only_key, None)

    config["prevent_feeding"] = True
    config["in_lang"] = src_lang
    config["out_lang"] = tgt_lang
    config["mapping"] = aligned_rules
    return Mapping(**config)
Beispiel #7
0
 def mapping_type(name):
     """Classify a language name as 'IPA', 'XSAMPA', 'dummy' or 'custom'.

     The predicates are tried in that order; the first one that matches wins.
     """
     if is_ipa(name):
         return 'IPA'
     if is_xsampa(name):
         return 'XSAMPA'
     if is_dummy(name):
         return 'dummy'
     return 'custom'
Beispiel #8
0
 def mapping_type(name):
     """Return the type label for ``name``: "IPA", "XSAMPA", "dummy" or "custom"."""
     # Ordered (predicate, label) pairs; the first matching predicate decides.
     labelled_checks = (
         (is_ipa, "IPA"),
         (is_xsampa, "XSAMPA"),
         (is_dummy, "dummy"),
     )
     for matches, label in labelled_checks:
         if matches(name):
             return label
     return "custom"
Beispiel #9
0
def doctor(mapping, list_all, list_ipa):
    """ Check for common errors in mappings.
        There should eventually be more checks here, but doctor currently checks for:

        1. Characters that are in IPA mappings but are not recognized by panphon library.

        You can list available mappings with --list-all or --list-ipa, or by visiting
        http://g2p-studio.herokuapp.com/api/v1/langs .
    """
    # Listing mode: print every output language with the input languages that
    # map into it, then stop without checking anything.
    if list_all or list_ipa:
        out_langs = sorted({entry["out_lang"] for entry in MAPPINGS_AVAILABLE})
        if list_ipa:
            out_langs = [lang for lang in out_langs if is_ipa(lang)]
        LOGGER.info("Specifying an output language will check all mappings into that language:\n")
        for m in out_langs:
            sources = [
                entry["in_lang"]
                for entry in MAPPINGS_AVAILABLE
                if entry["out_lang"] == m
            ]
            # Continuation lines are indented to line up after "<lang>: ".
            joiner = "\n" + " " * len(m) + "  "
            print(f"{m}: ", end="")
            print(joiner.join(sources))
            print("")
        return

    # Checking mode: validate the requested output languages first.
    known_out_langs = {entry["out_lang"] for entry in MAPPINGS_AVAILABLE}
    for m in mapping:
        if m not in known_out_langs:
            raise click.UsageError(
                f"No known mappings into '{m}'. "
                "Use --list-all or --list-ipa to list valid options."
            )
        if not is_ipa(m):
            LOGGER.warning(
                f"No checks implemented yet for non-IPA mappings: '{m}' will not be checked."
            )

    if mapping:
        LOGGER.info("Checking the following mappings: \n" + "\n".join(mapping))
    else:
        LOGGER.info("Checking all IPA mappings.")

    check_ipa_known_segs(list(mapping))
def create_mapping(mapping_1: Mapping, mapping_2: Mapping, mapping_1_io: str = 'out', mapping_2_io: str = 'in', write_to_file: bool = False, out_dir: str = '') -> Mapping:
    """Create a mapping between an inventory of mapping_1 and one of mapping_2.

    Args:
        mapping_1: mapping providing the first inventory.
        mapping_2: mapping providing the second inventory.
        mapping_1_io: which side of ``mapping_1`` to use ('in' or 'out').
        mapping_2_io: which side of ``mapping_2`` to use ('in' or 'out').
        write_to_file: when true, write the resulting config and mapping files.
        out_dir: directory to write to; when empty or not an existing
            directory, the default location of ``config_to_file`` /
            ``mapping_to_file`` is used.

    Returns:
        The new ``Mapping``, configured from ``mapping_1``'s settings.
    """
    map_1_name = mapping_1.kwargs[f'{mapping_1_io}_lang']
    map_2_name = mapping_2.kwargs[f'{mapping_2_io}_lang']
    if not is_ipa(map_1_name) and not is_xsampa(map_1_name):
        LOGGER.warning("Unsupported orthography of inventory 1: %s"
                       " (must be ipa or x-sampa)",
                       map_1_name)
    if not is_ipa(map_2_name) and not is_xsampa(map_2_name):
        LOGGER.warning("Unsupported orthography of inventory 2: %s"
                       " (must be ipa or x-sampa)",
                       map_2_name)
    l1_is_xsampa, l2_is_xsampa = is_xsampa(map_1_name), is_xsampa(map_2_name)
    mapping = align_inventories(mapping_1.inventory(mapping_1_io), mapping_2.inventory(mapping_2_io),
                                l1_is_xsampa, l2_is_xsampa)

    # Initialize mapping with input language parameters (as_is,
    # case_sensitive, prevent_feeding, etc)
    config = mapping_1.kwargs.copy()
    # Drop metadata that describes the source mapping rather than the new one.
    for key in ('authors', 'display_name', 'language_name'):
        config.pop(key, None)
    config['in_lang'] = map_1_name
    config['out_lang'] = map_2_name
    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir and os.path.isdir(out_dir):
            mapping.config_to_file(out_dir)
            mapping.mapping_to_file(out_dir)
        else:
            if out_dir:
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
            # Honour the warning above: actually fall back to the default
            # location instead of silently writing nothing.
            mapping.config_to_file()
            mapping.mapping_to_file()

    return mapping
Beispiel #11
0
def create_mapping(l1_mapping: Mapping, l2_mapping: Mapping) -> Mapping:
    """Map the output inventory of ``l1_mapping`` onto the input of ``l2_mapping``.

    Both languages are expected to be IPA or X-SAMPA; anything else only
    triggers a warning and the alignment is attempted regardless.
    """
    src_lang = l1_mapping.kwargs['out_lang']
    tgt_lang = l2_mapping.kwargs['in_lang']
    src_inventory = l1_mapping.inventory("out")
    tgt_inventory = l2_mapping.inventory()

    if not (is_ipa(src_lang) or is_xsampa(src_lang)):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s (must be ipa or x-sampa)",
            src_lang)
    if not (is_ipa(tgt_lang) or is_xsampa(tgt_lang)):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s (must be ipa or x-sampa)",
            tgt_lang)

    # Here inventory() results are indexed by the "inventory" key.
    aligned_rules = align_inventories(
        src_inventory["inventory"],
        tgt_inventory["inventory"],
        is_xsampa(src_lang),
        is_xsampa(tgt_lang),
    )
    return Mapping(aligned_rules, in_lang=src_lang, out_lang=tgt_lang)
Beispiel #12
0
    def __init__(self, seen_factor=1.0, unseen_factor=0.01):
        """Index the input inventories of all non-IPA/X-SAMPA/dummy mappings.

        Populates:
            self.inventories: in_lang -> set of that mapping's input symbols.
            self.chars: character -> index, in order of first appearance.
            self.langs: in_lang -> index, in insertion order of inventories.

        ``seen_factor`` and ``unseen_factor`` are stored as given, and
        ``calculate_prior_probs()`` is called once everything is indexed.
        """
        self.seen_factor = seen_factor
        self.unseen_factor = unseen_factor

        self.chars = {}
        self.inventories = {}
        for entry in MAPPINGS_AVAILABLE:
            in_lang = entry["in_lang"]
            if is_ipa(in_lang) or is_xsampa(in_lang) or is_dummy(in_lang):
                continue
            loaded = Mapping(in_lang=in_lang, out_lang=entry["out_lang"])
            symbols = set(loaded.inventory("in"))
            self.inventories[in_lang] = symbols
            for symbol in symbols:
                for char in symbol:
                    # Only characters not seen in any language get a new index.
                    self.chars.setdefault(char, len(self.chars))
        self.langs = {lang: idx for idx, lang in enumerate(self.inventories)}
        self.calculate_prior_probs()
Beispiel #13
0
def align_to_dummy_fallback(mapping: Mapping,
                            io: str = 'in',
                            distance: str = "weighted_feature_edit_distance"):
    """Create a mapping from one inventory of ``mapping`` to DUMMY_INVENTORY.

    ``io`` selects which inventory of ``mapping`` is used ('in' or 'out') and
    ``distance`` is forwarded to ``align_inventories``. Symbols for which no
    dummy equivalent is found are mapped to 't', with a warning.
    """
    source_lang = mapping.kwargs[f'{io}_lang']
    default_char = 't'

    if is_ipa(source_lang):
        rules = align_inventories(mapping.inventory(io),
                                  DUMMY_INVENTORY,
                                  distance=distance)
    else:
        # Non-IPA inventory: pass each symbol (ASCII-folded, lowercased)
        # through the 'und' -> 'und-ipa' transducer, then align its output.
        und_g2p = make_g2p('und', 'und-ipa')
        rules = []
        for symbol in mapping.inventory(io):
            rules.append({
                "in": unicode_escape(symbol),
                "out": und_g2p(unidecode(symbol).lower()).output_string,
            })
        aligned = align_inventories([rule['out'] for rule in rules],
                                    DUMMY_INVENTORY,
                                    distance=distance)
        dummy_for = {pair['in']: pair['out'] for pair in aligned if pair['in']}

        for x in rules:
            if x['out'] in dummy_for:
                x['out'] = dummy_for[x['out']]
            else:
                LOGGER.warning(
                    f"We couldn't guess at what {x['in']} means, so it's being "
                    f"replaced with '{default_char}' instead.")
                x['out'] = default_char

    return Mapping(in_lang=source_lang, out_lang='dummy', mapping=rules)
Beispiel #14
0
def generate_mapping(
    in_lang,
    out_lang,
    dummy,
    ipa,
    list_dummy,
    out_dir,
    merge,
    from_langs,
    to_langs,
    distance,
):
    """ Generate a new mapping from existing mappings in the g2p system.

        This command has different modes of operation.

        Standard mode:

          g2p generate-mapping [--dummy|--ipa] IN_LANG [OUT_LANG]

          For specified IN_LANG, generate a mapping from IN_LANG-ipa to eng-ipa,
          or from IN_LANG-ipa to a dummy minimalist phone inventory. This assumes
          the mapping IN_LANG -> IN_LANG-ipa exists and creates a mapping from its
          output inventory.

          To generate a mapping from IN_LANG-ipa to eng-ipa from a mapping
          following a different patterns, e.g., from crl-equiv -> crl-ipa, specify
          both IN_LANG (crl-equiv in this example) and OUT_LANG (crl-ipa in this
          example).

          \b
          Sample usage:
            Generate Algonquin IPA to English IPA from alq -> alq-ipa:
                g2p generate-mapping --ipa alq
            Generate Mohawk IPA to English IPA from moh-equiv -> moh-ipa:
                g2p generate-mapping --ipa moh-equiv moh-ipa
            Generate Michif IPA to English IPA from the union of crg-dv -> crg-ipa
            and crg-tmd -> crg-ipa:
                g2p generate-mapping --ipa --merge crg-dv:crg-tmd crg-ipa

        List the dummy inventory used by --dummy:

          g2p generate-mapping --list-dummy

        From/to IPA mode:

        \b
          g2p generate-mapping --from FROM_L1 --to TO_L1
          g2p generate-mapping --from FROM_L1:FROM_L2:... --to TO_L1:TO_L2:...

          Generate an IPA mapping from the union of FROM_L1-ipa, FROM-L2-ipa, etc to
          the union of TO_L1-ipa, TO-L2-ipa, etc. One or more from/to language
          code(s) can be specified in colon- or comma-separated lists. Note, by default
          we use Panphon's weighted_feature_edit_distance, but you can change this with
          the --distance argument

        \b
          Sample usage:
            Generate a mapping from kwk-ipa to moh-ipa based on all mappings into
            kwk-ipa and moh-ipa:
                g2p generate-mapping --from kwk --to moh
            Generate a mapping from eng-ipa to crg-ipa based only on crg-dv -> crg-ipa:
                g2p generate-mapping --from eng --to crg-dv_to_crg-ipa
            Generate a mapping from kwk-ipa to moh-ipa+crg-ipa+eng-ipa based on
            all mappings into kwk-ipa (from side) and the union of all mappings
            into moh-ipa and crg-ipa plus eng-ipa_to_eng-arpabet (to side):
                g2p generate-mapping --from kwk --to moh:crg:eng

          Full syntax for specifying FROM_Ln and TO_Ln:

          \b
            lang (i.e., 3-letter code):
             - If there is only one mapping into lang-ipa, "lang" refers to the
               output of that mapping, e.g., "fra" means "fra_to_fra-ipa[out]".
             - If there are several mappings into lang-ipa, "lang" refers to the
               union of the outputs of those mappings, e.g., "moh" means the union
               of "moh-equiv_to_moh-ipa[out]" and "moh-festival_to_moh-ipa[out]".
             - It is an error if there are no mappings into lang-ipa.
             - Only mappings from non-IPA to IPA are considered (i.e., IPA-to-IPA
               mappings created by this command will not be included: use the
               longer syntax below if you want to use them).
             - Special case: "eng" refers to "eng-ipa_to_eng-arpabet[in]".

          \b
            in-lang_to_out-lang[[in]|[out]]:
             - This expanded syntax is used to avoid the union when it is not
               desired, e.g., "moh-equiv_to_moh-ipa" refers only to
               "moh-equiv_to_moh-ipa,out" rather than the union "moh" represents.
             - If out-lang is IPA, the output inventory is used; else if in-lang
               is IPA, the input inventory is used; it is an error if neither
               language is IPA.
             - Specify "[in]" or "[out]" to override the above default.
             - "_to_" is the joiner used to specify "the mapping from 'in-lang' to
               'out-lang'" in the g2p network, regardless of the name of the file
               it is stored in.

        If you just modified or created the mappings from which the new mapping is
        to be generated, don't forget to call "g2p update" first, so that "g2p
        generate-mapping" can see the latest version.

        Call "g2p update" again after calling "g2p generate-mapping" to compile
        the newly generated mapping and make it available.

        Note: exactly one of --ipa, --dummy, --from/--to, or --list-dummy is
        required.

        You can list available mappings with "g2p doctor --list-ipa", or by
        visiting http://g2p-studio.herokuapp.com/api/v1/langs .
    """
    # NOTE: the docstring above is presumably rendered as the command's help
    # text (it uses click's "\b" no-rewrap markers), so it is left untouched.
    #
    # Flow: validate the option combination, then run exactly one mode:
    #   --list-dummy : print DUMMY_INVENTORY
    #   --ipa/--dummy: build a mapping from each source mapping and merge them
    #   --from/--to  : build one mapping between unions of inventories
    # Raises click.UsageError / click.BadParameter on invalid arguments.

    # Make sure only one mode was specified on the command line
    mode_count = sum(
        bool(flag) for flag in (ipa, dummy, list_dummy, from_langs or to_langs))
    if mode_count == 0:
        raise click.UsageError(
            "Nothing to do! Please specify at least one of --ipa, --dummy, "
            "--list-dummy, or --from/--to.")
    if mode_count > 1:
        raise click.UsageError(
            "Multiple modes selected. Choose only one of --ipa, --dummy, "
            "--list-dummy, or --from/--to.")

    if list_dummy or from_langs is not None or to_langs is not None:
        if in_lang is not None:
            raise click.UsageError(
                "IN_LANG is not allowed with --list-dummy or --from/--to")

    if from_langs is not None or to_langs is not None:
        if from_langs is None or to_langs is None:
            raise click.UsageError("--from and --to must be used together")

    if merge:
        if not ipa and not dummy:
            raise click.UsageError(
                "--merge is only compatible with --ipa and --dummy.")
        if out_lang is None:
            raise click.UsageError("OUT_LANG is required with --merge.")

    if out_dir and not os.path.isdir(out_dir):
        raise click.BadParameter(
            f'Output directory "{out_dir}" does not exist. Cannot write mapping.',
            param_hint="--out-dir",
        )

    if list_dummy:
        # --list-dummy mode
        print("Dummy phone inventory: {}".format(DUMMY_INVENTORY))

    elif ipa or dummy:
        # --ipa and --dummy modes
        if in_lang is None:
            raise click.UsageError("Missing argument 'IN_LANG'.")
        # With --merge, IN_LANG is a colon-separated list of source languages.
        if merge:
            in_langs = in_lang.split(":")
        else:
            in_langs = [in_lang]

        in_lang_choices = [
            x for x in LANGS_NETWORK.nodes
            if not is_ipa(x) and not is_xsampa(x)
        ]
        for lang in in_langs:
            if lang not in in_lang_choices:
                raise click.UsageError(
                    f'Invalid value for IN_LANG: "{lang}".\n'
                    "IN_LANG must be a non-IPA language code with an existing IPA mapping, "
                    f"i.e., one of:\n{', '.join(in_lang_choices)}.")

        out_lang_choices = [x for x in LANGS_NETWORK.nodes if is_ipa(x)]
        if out_lang is None:
            out_lang = f"{in_lang}-ipa"
        elif out_lang not in out_lang_choices:
            raise click.UsageError(
                f'Invalid value for OUT_LANG: "{out_lang}".\n'
                "OUT_LANG must be an IPA language code with an existing mapping from IN_LANG, "
                f"i.e., one of:\n{', '.join(out_lang_choices)}")

        source_mappings = []
        for lang in in_langs:
            try:
                source_mapping = Mapping(in_lang=lang, out_lang=out_lang)
            except MappingMissing as e:
                # Chain the original error so the root cause stays visible.
                raise click.BadParameter(
                    f'Cannot find IPA mapping from "{lang}" to "{out_lang}": {e}',
                    param_hint=["IN_LANG", "OUT_LANG"],
                ) from e
            source_mappings.append(source_mapping)

        if ipa:
            # Check the IPA language the source mappings actually produce.
            # f"{in_lang}-ipa" is wrong when OUT_LANG is given explicitly or
            # when IN_LANG is a --merge list.
            check_ipa_known_segs([out_lang])
            eng_ipa = Mapping(in_lang="eng-ipa", out_lang="eng-arpabet")
            click.echo(f"Writing English IPA mapping for {out_lang} to file")
            new_mapping = create_mapping(source_mappings[0],
                                         eng_ipa,
                                         distance=distance)
            for m in source_mappings[1:]:
                new_mapping.extend(
                    create_mapping(m, eng_ipa, distance=distance))
        else:  # dummy
            click.echo(
                f"Writing dummy fallback mapping for {out_lang} to file")
            new_mapping = align_to_dummy_fallback(source_mappings[0],
                                                  distance=distance)
            for m in source_mappings[1:]:
                new_mapping.extend(
                    align_to_dummy_fallback(m, distance=distance))

        new_mapping.deduplicate()
        _write_mapping_files(new_mapping, out_dir)

    elif from_langs is not None:
        # --from/--to mode
        assert to_langs is not None

        from_mappings = []
        for from_lang in re.split(r"[:,]", from_langs):
            from_mappings.extend(parse_from_or_to_lang_spec(from_lang))
        to_mappings = []
        for to_lang in re.split(r"[:,]", to_langs):
            to_mappings.extend(parse_from_or_to_lang_spec(to_lang))

        if not from_mappings:
            raise click.UsageError(
                f'Invalid --from value "{from_langs}": no mappings found.')
        if not to_mappings:
            raise click.UsageError(
                f'Invalid --to value "{to_langs}": no mappings found.')

        for from_mapping, in_or_out in from_mappings:
            LOGGER.info(
                f'From mapping: {from_mapping.kwargs["in_lang"]}_to_{from_mapping.kwargs["out_lang"]}[{in_or_out}]'
            )
        for to_mapping, in_or_out in to_mappings:
            LOGGER.info(
                f'To mapping: {to_mapping.kwargs["in_lang"]}_to_{to_mapping.kwargs["out_lang"]}[{in_or_out}]'
            )

        new_mapping = create_multi_mapping(from_mappings,
                                           to_mappings,
                                           distance=distance)
        _write_mapping_files(new_mapping, out_dir)


def _write_mapping_files(new_mapping, out_dir):
    """Write a mapping's config and rules to out_dir, or to the default location."""
    if out_dir:
        new_mapping.config_to_file(out_dir)
        new_mapping.mapping_to_file(out_dir)
    else:
        new_mapping.config_to_file()
        new_mapping.mapping_to_file()
Beispiel #15
0
# Context settings shared by the CLI group and its commands: lists '-h'
# alongside '--help' as help option names.
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])


# Root command group of the CLI. It is built as a FlaskGroup around
# create_app and carries a version option reporting VERSION under the
# program name "g2p".
@click.version_option(version=VERSION, prog_name="g2p")
@click.group(cls=FlaskGroup, create_app=create_app, context_settings=CONTEXT_SETTINGS)
def cli():
    '''Management script for G2P'''


@click.option('--out-dir', type=click.Path(exists=True, file_okay=False, dir_okay=True),
    help='Output results in DIRECTORY instead of the global "generated" directory.')
@click.option('--list-dummy', default=False, is_flag=True, help="List the dummy phone inventory.")
@click.option('--dummy/--no-dummy', default=False, help="Generate dummy fallback mapping to minimalist phone inventory.")
@click.option('--ipa/--no-ipa', default=False, help="Generate mapping from LANG-ipa to eng-ipa.")
@click.argument('in_lang', type=click.Choice([x for x in LANGS_NETWORK.nodes if not is_ipa(x) and not is_xsampa(x)]))
@cli.command(context_settings=CONTEXT_SETTINGS, short_help="Generate English IPA or dummy mapping.")
def generate_mapping(in_lang, dummy, ipa, list_dummy, out_dir):
    ''' For specified IN_LANG, generate a mapping from IN_LANG-ipa to eng-ipa,
        or from IN_LANG-ipa to a dummy minimalist phone inventory.

        If you just modified or wrote the IN_LANG to IN_LANG-ipa mapping, don't forget
        to call "g2p update" first so "g2p generate-mapping" sees the latest version.

        Call "g2p update" again after calling "g2p generate-mapping" to make the new
        IN_LANG-ipa to eng-ipa mapping available.
    '''
    if not ipa and not dummy and not list_dummy:
        click.echo('You have to choose to generate either an IPA-based mapping or a dummy fallback mapping. Check the docs for more information.')
    if out_dir and (os.path.exists(os.path.join(out_dir, 'config.yaml')) or os.path.exists(os.path.join(out_dir, 'config.yaml'))):
        click.echo(
Beispiel #16
0
def create_multi_mapping(
    src_mappings: List[Tuple[Mapping, str]],
    tgt_mappings: List[Tuple[Mapping, str]],
    distance: str = "weighted_feature_edit_distance",
) -> Mapping:
    """Build one mapping from a group of source mappings to a group of targets.

    Every element of ``src_mappings`` / ``tgt_mappings`` is a
    ``(mapping, in_or_out)`` pair naming a mapping and which of its
    inventories ("in" or "out") takes part in the alignment. The inventories
    on each side are concatenated and de-duplicated (first occurrence kept)
    before being aligned with ``distance``.

    The in/out language names and display name of the result are derived from
    the sorted, unique language names found on each side.
    """

    def get_sorted_unique_names(mappings: List[Tuple[Mapping, str]]) -> List[str]:
        names = set()
        for mapping, in_or_out in mappings:
            names.add(mapping.kwargs[f"{in_or_out}_lang"])
        return sorted(names)

    def compact_ipa_names(ipa_names: Iterable) -> str:
        # ["fra-ipa", "eng-ipa", "kwk-ipa"] -> "fra-eng-kwk-ipa"
        stems = []
        for name in ipa_names:
            stems.append(name[:-4] if name.endswith("-ipa") else name)
        return "-".join(stems) + "-ipa"

    def merged_inventory(mappings: List[Tuple[Mapping, str]], warning: str) -> List:
        # Concatenate the selected inventories, warning about non-IPA ones.
        symbols = []
        for mapping, io in mappings:
            name = mapping.kwargs[f"{io}_lang"]
            if not is_ipa(name):
                LOGGER.warning(warning, name)
            symbols.extend(mapping.inventory(io))
        # Use a dict, and not a set, to preserve the original order.
        return list({v: v for v in symbols}.values())

    map_1_names = get_sorted_unique_names(src_mappings)
    map_2_names = get_sorted_unique_names(tgt_mappings)

    src_inventory = merged_inventory(
        src_mappings, "Unsupported orthography of src inventory: %s; must be IPA"
    )
    tgt_inventory = merged_inventory(
        tgt_mappings, "Unsupported orthography of tgt inventory: %s; must be IPA"
    )

    aligned_rules = align_inventories(src_inventory, tgt_inventory, distance=distance)

    # ["fra-ipa", "eng-ipa"] -> "fra-ipa and eng-ipa"
    src_long_name = " and ".join(map_1_names)
    tgt_long_name = " and ".join(map_2_names)

    return Mapping(
        in_lang=compact_ipa_names(map_1_names),
        out_lang=compact_ipa_names(map_2_names),
        language_name="IPA",
        rule_ordering="apply-longest-first",
        mapping=aligned_rules,
        prevent_feeding=True,
        norm_form="NFC",
        display_name=src_long_name + " to " + tgt_long_name,
    )
Beispiel #17
0
def create_app():
    """Return the module-level ``APP`` object.

    Passed as the ``create_app`` factory to the ``click.group`` that defines
    ``cli`` below.
    """
    return APP


# Root command group of the CLI. It is built as a FlaskGroup around
# create_app and carries a version option reporting VERSION under the
# program name "g2p".
@click.version_option(version=VERSION, prog_name="g2p")
@click.group(cls=FlaskGroup, create_app=create_app)
def cli():
    '''Management script for G2P'''


@click.option('--ipa/--no-ipa', default=False)
@click.option('--dummy/--no-dummy', default=False)
@click.argument('in_lang',
                type=click.Choice([
                    x for x in LANGS_NETWORK.nodes
                    if not is_ipa(x) and not is_xsampa(x)
                ]))
@cli.command()
def generate_mapping(in_lang, dummy, ipa):
    ''' Generate English mapping
    '''
    if not ipa and not dummy:
        click.echo(
            'You have to choose to generate either an IPA-based mapping or a dummy fallback mapping. Check the docs for more information.'
        )
    if ipa:
        eng_ipa = Mapping(in_lang='eng-ipa', out_lang='eng-arpabet')
        new_mapping = Mapping(in_lang=in_lang, out_lang=f'{in_lang}-ipa')
        click.echo(f"Writing English IPA mapping for {in_lang} to file")
        create_mapping(new_mapping, eng_ipa, write_to_file=True)
    if dummy:
Beispiel #18
0
def parse_from_or_to_lang_spec(lang_spec):
    """Parse a value given to g2p generate-mapping --from or --to.

    See the documentation of generate_mapping() for the syntax of lang_spec.

    Forms handled here:
      - "IN_to_OUT", optionally followed by "[in]" or "[out]": the single
        mapping IN -> OUT. Without a qualifier, "out" is chosen when OUT is
        IPA, otherwise "in" when IN is IPA.
      - "eng": the eng-ipa -> eng-arpabet mapping, using its "in" inventory.
      - "LANG": every available mapping whose out_lang is "LANG-ipa" and whose
        in_lang is not IPA, each using its "out" inventory.

    Returns list[tuple[Mapping, io (str)]]:
        the mapping(s) lang_spec refers to, and "in" or "out", to indicate if the
        relevant inventory is the mapping's in_lang or out_lang.

    Raises:
        click.BadParameter if lang_spec is not valid
    """
    mapping_spec, _, in_or_out = lang_spec.partition("[")
    # str.rstrip() returns a new string; the result has to be re-bound,
    # otherwise "[in]" is left as "in]" and always rejected below.
    in_or_out = in_or_out.rstrip("]")
    in_lang, _, out_lang = mapping_spec.partition("_to_")

    if out_lang:
        # Full "IN_to_OUT[...]" syntax: exactly one mapping.
        try:
            mapping = Mapping(in_lang=in_lang, out_lang=out_lang)
        except MappingMissing as e:
            raise click.BadParameter(
                f'Cannot find mapping {in_lang}->{out_lang} for --from or --to spec "{lang_spec}": {e}'
            ) from e
        if not in_or_out:
            # No explicit qualifier: pick whichever side is IPA, preferring
            # the output side.
            if is_ipa(out_lang):
                in_or_out = "out"
            elif is_ipa(in_lang):
                in_or_out = "in"
            else:
                raise click.BadParameter(
                    f'Cannot guess in/out for IPA lang spec "{lang_spec}" because neither {in_lang} '
                    f'nor {out_lang} is IPA. Specify "[in]" or "[out]" if you are sure it is correct.'
                )
        if in_or_out not in ("in", "out"):
            raise click.BadParameter(
                f'Invalid IPA language specification "{lang_spec}": only "in" or "out" '
                "is allowed in square brackets, to disambiguate between input or output "
                "inventory when necessary.")
        return [(mapping, in_or_out)]

    # Short syntax: a bare language code, which takes no qualifier.
    if in_or_out:
        raise click.BadParameter(
            f'Bad IPA lang spec "{lang_spec}": the [in]/[out] qualifier is only '
            "supported with the full in-lang_to_out-lang[[in]|[out]] syntax."
        )

    if in_lang == "eng":
        # English is special-cased to the input side of eng-ipa -> eng-arpabet.
        mapping = Mapping(in_lang="eng-ipa", out_lang="eng-arpabet")
        return [(mapping, "in")]

    out_lang = in_lang + "-ipa"
    # check_ipa_known_segs([out_lang])  # this outputs a lot of spurious noise...
    mappings = [
        (Mapping(in_lang=m["in_lang"], out_lang=m["out_lang"]), "out")
        for m in MAPPINGS_AVAILABLE
        if m["out_lang"] == out_lang and not is_ipa(m["in_lang"])
    ]
    if not mappings:
        raise click.BadParameter(
            f'No IPA mappings found for "{lang_spec}".')
    return mappings
Beispiel #19
0
    def make_tokenizer(self, in_lang, out_lang=None, tok_path=None):
        """Return the tokenizer for in_lang, building and caching it on first use.

        The tokenizer is cached in self.tokenizers under the key returned by
        self.make_tokenizer_key(in_lang, out_lang, tok_path).

        Args:
            in_lang: input language code.
            out_lang: optional output language; a single code for a one-hop
                tokenizer or a list of codes for a multi-hop tokenizer. When
                falsy, it is derived from tok_path or from LANGS_NETWORK.
            tok_path: optional conversion path starting at in_lang; when
                given, it determines out_lang (at most the first three hops
                are used, stopping at the first IPA node among them).

        Returns:
            The cached tokenizer. The default tokenizer, self.tokenizers[None],
            is used when no output language can be found or when a required
            mapping is missing.

        Raises:
            ValueError: if tok_path is given and tok_path[0] != in_lang.
        """
        tokenizer_key = self.make_tokenizer_key(in_lang, out_lang, tok_path)
        if not self.tokenizers.get(tokenizer_key):
            # This tokenizer was not created yet, initialize it now.
            if tok_path:
                if tok_path[0] != in_lang:
                    raise ValueError(
                        "calling make_tokenizer() with tok_path requires that tok_path[0] == in_lang"
                    )
                assert len(tok_path) >= 2
                # Use the path up to its first IPA node, capped at three hops.
                if len(tok_path) == 2 or is_ipa(tok_path[1]):
                    out_lang = tok_path[1]
                elif len(tok_path) == 3 or is_ipa(tok_path[2]):
                    out_lang = tok_path[1:3]
                elif len(tok_path) > 3 and is_ipa(tok_path[3]):
                    out_lang = tok_path[1:4]
                else:
                    out_lang = tok_path[1:3]

            if not out_lang:
                try:
                    successors = list(LANGS_NETWORK.successors(in_lang))
                except NetworkXError:
                    # in_lang is not a node of the language graph.
                    successors = []
                ipa_successors = [x for x in successors if is_ipa(x)]
                if ipa_successors:
                    # in_lang has an ipa successor, tokenize using it
                    # there currently are no langs with more than 1 IPA successor, but to
                    # be future-proof we'll arbitrarily take the first if there are more.
                    out_lang = ipa_successors[0]
                else:
                    # There is no direct IPA successor, look for a two-hop path to -ipa.
                    # Every successor is examined; the search stops at the first
                    # one that has an IPA successor of its own.
                    for x in successors:
                        ipa_successors_two_hops = [
                            y for y in LANGS_NETWORK.successors(x) if is_ipa(y)
                        ]
                        if ipa_successors_two_hops:
                            out_lang = [x, ipa_successors_two_hops[0]]
                            break
                    # There is no two-hop IPA successor, use the first direct successor
                    if not out_lang and successors:
                        out_lang = successors[0]

            if not out_lang:
                # Default tokenizer:
                self.tokenizers[tokenizer_key] = self.tokenizers[None]
            elif isinstance(out_lang, list):
                # Build a multi-hop tokenizer
                assert len(out_lang) > 1
                hops = [in_lang, *out_lang]
                try:
                    mappings = [
                        Mapping(in_lang=hop_in, out_lang=hop_out)
                        for hop_in, hop_out in zip(hops, hops[1:])
                    ]
                    self.tokenizers[tokenizer_key] = MultiHopTokenizer(mappings)
                except MappingMissing:
                    self.tokenizers[tokenizer_key] = self.tokenizers[None]
                    LOGGER.warning(
                        "missing mapping yet we looked for mappings in graph for %s-%s.",
                        in_lang, out_lang,
                    )
            else:
                # Build a one-hop tokenizer
                try:
                    mapping = Mapping(in_lang=in_lang, out_lang=out_lang)
                    self.tokenizers[tokenizer_key] = Tokenizer(mapping)
                except MappingMissing:
                    self.tokenizers[tokenizer_key] = self.tokenizers[None]
                    LOGGER.warning(
                        "Cannot find mapping from '%s' to '%s'. Using default tokenizer instead",
                        in_lang, out_lang,
                    )

            # Hack for Tlingit using dot as a letter when non word-final
            # NOTE(review): if the code above fell back to self.tokenizers[None],
            # this sets the flag on that shared default tokenizer object.
            if in_lang == "tli":
                self.tokenizers[tokenizer_key].dot_is_letter = True

        return self.tokenizers.get(tokenizer_key)