def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False, out_dir: str = ''):
    """Create a mapping from the given mapping's `io` inventory to a minimal 'dummy' inventory.

    Non-IPA inventories are first romanized with unidecode and converted via the
    und -> und-ipa mapping before being aligned with the dummy inventory.

    Args:
        mapping: source Mapping whose inventory will be aligned.
        io: which side of the mapping to use, 'in' or 'out'.
        write_to_file: when True, also write the resulting config and mapping files.
        out_dir: directory to write to; falls back to the default location when
            empty or not a directory.

    Returns:
        Mapping: the new mapping onto the dummy inventory.
    """
    # Fix: removed the unused `display_name` local that was computed but never read.
    config = {'in_lang': mapping.kwargs[f'{io}_lang'], 'out_lang': 'dummy'}
    # Fallback output when no IPA equivalent can be guessed for an input character.
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        mapping = align_inventories(mapping.inventory(io), DUMMY_INVENTORY)
    else:
        # Not IPA: romanize each character and run it through und -> und-ipa first.
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower()).output_string} for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], DUMMY_INVENTORY)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']

        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                # Fix: Logger.warn() is a deprecated alias; use warning().
                LOGGER.warning(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char

    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir:
            if os.path.isdir(out_dir):
                mapping.config_to_file(out_dir)
                mapping.mapping_to_file(out_dir)
            else:
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
        else:
            mapping.config_to_file()
            mapping.mapping_to_file()
    return mapping
Example #2
0
File: utils.py Project: deltork/g2p
def check_ipa_known_segs(mappings_to_check=False) -> bool:
    """Check the given mappings, or all IPA mappings, for invalid IPA in the "out" fields

    Returns True iff no errors were found.
    """
    if not mappings_to_check:
        mappings_to_check = [x["out_lang"] for x in MAPPINGS_AVAILABLE]
    found_error = False
    for mapping in [
            x for x in MAPPINGS_AVAILABLE if x["out_lang"] in mappings_to_check
    ]:
        if is_ipa(mapping["out_lang"]):
            reverse = mapping.get("reverse", False)
            for rule in mapping["mapping_data"]:
                # For reversed mappings the IPA side is the "in" field.
                output = rule["in"] if reverse else rule["out"]
                if not is_panphon(output):
                    # Fix: report the field that was actually checked
                    # (previously this always printed rule['out'], even
                    # when the reversed "in" field was validated).
                    LOGGER.warning(
                        f"Output '{output}' in rule {rule} in mapping between {mapping['in_lang']} "
                        f"and {mapping['out_lang']} is not recognized as valid IPA by panphon."
                    )
                    found_error = True
    if found_error:
        LOGGER.warning(
            "Please refer to https://github.com/dmort27/panphon for information about panphon."
        )
    return not found_error
Example #3
0
def create_mapping(mapping_1: Mapping,
                   mapping_2: Mapping,
                   mapping_1_io: str = 'out',
                   mapping_2_io: str = 'in',
                   write_to_file: bool = False) -> Mapping:
    """Align mapping_1's inventory with mapping_2's and return the connecting Mapping.

    Both inventories should be IPA or X-SAMPA; anything else only triggers a
    warning. Optionally writes the generated config and mapping to file.
    """
    map_1_name = mapping_1.kwargs[f'{mapping_1_io}_lang']
    map_2_name = mapping_2.kwargs[f'{mapping_2_io}_lang']
    l1_is_xsampa, l2_is_xsampa = is_xsampa(map_1_name), is_xsampa(map_2_name)
    if not (is_ipa(map_1_name) or l1_is_xsampa):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s"
            " (must be ipa or x-sampa)", map_1_name)
    if not (is_ipa(map_2_name) or l2_is_xsampa):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s"
            " (must be ipa or x-sampa)", map_2_name)
    mapping = align_inventories(mapping_1.inventory(mapping_1_io),
                                mapping_2.inventory(mapping_2_io),
                                l1_is_xsampa, l2_is_xsampa)

    l1_display_name = mapping_1.kwargs.get(
        'language_name', 'No Language display name in Config')
    l2_display_name = mapping_2.kwargs.get(
        'language_name', 'No Language display name in Config')
    config = generate_config(map_1_name, map_2_name, l1_display_name,
                             l2_display_name)

    if write_to_file:
        write_generated_mapping_to_file(config, mapping)

    # Everything but the rules themselves comes from the generated config.
    extra_kwargs = {k: v for k, v in config.items() if k != 'mapping'}
    return Mapping(mapping, **extra_kwargs)
Example #4
0
 def check(
     self,
     tg: TransductionGraph,
     shallow=False,
     display_warnings=False,
     original_input=None,
 ):
     """Validate tg.output_string against this tier's output language.

     Returns True when the output passes the check (or when no check is
     implemented for out_lang), False otherwise. Warnings are logged only
     when display_warnings is set.
     """
     out_lang = self.mapping.kwargs["out_lang"]
     if "eng-arpabet" in out_lang:
         if is_arpabet(tg.output_string):
             return True
         if display_warnings:
             display_input = original_input if original_input else tg.input_string
             LOGGER.warning(
                 f'Transducer output "{tg.output_string}" for input "{display_input}" is not fully valid eng-arpabet as recognized by soundswallower.'
             )
         return False
     if is_ipa(out_lang):
         if is_panphon(tg.output_string, display_warnings=display_warnings):
             return True
         if display_warnings:
             display_input = original_input if original_input else tg.input_string
             LOGGER.warning(
                 f'Transducer output "{tg.output_string}" for input "{display_input}" is not fully valid {out_lang}.'
             )
         return False
     # No check implemented at this tier, just return True
     return True
Example #5
0
def get_tokenizer(*args, **kwargs):
    """ Deprecated; use make_tokenizer() instead. """

    global _deprecated_warning_printed
    if not _deprecated_warning_printed:
        # Emit the deprecation notice only once per process.
        _deprecated_warning_printed = True
        LOGGER.warning(
            "g2p.get_tokenizer() / g2p.mappings.tokenizer.get_tokenizer() is deprecated. Import and use g2p.make_tokenizer() instead."
        )
    return make_tokenizer(*args, **kwargs)
Example #6
0
    def rule_to_regex(self, rule: dict) -> Pattern:
        """Turns an input string (and the context) from an input/output pair
        into a regular expression pattern.

        The 'in' key is the match.
        The 'context_after' key creates a lookahead.
        The 'context_before' key creates a lookbehind.

        Args:
            rule: A dictionary containing 'in', 'out', 'context_before', and 'context_after' keys

        Raises:
            Exception: This is raised when un-supported regex characters or symbols exist in the rule

        Returns:
            Pattern: returns a regex pattern (re.Pattern)
            bool: returns False if input is null
        """
        # Prevent null input. See, https://github.com/roedoejet/g2p/issues/24
        if not rule['in']:
            LOGGER.warning(
                f'Rule with input \'{rule["in"]}\' and output \'{rule["out"]}\' has no input. This is disallowed. Please check your mapping file for rules with null inputs.'
            )
            return False
        if "context_before" in rule and rule['context_before']:
            before = rule["context_before"]
        else:
            before = ''
        if 'context_after' in rule and rule['context_after']:
            after = rule["context_after"]
        else:
            after = ''
        # Strip intermediate-form markers like {1} from the input.
        input_match = re.sub(re.compile(r'{\d+}'), "", rule['in'])
        # Fix: pre-seed `inp` so the error path below cannot hit an unbound
        # name (NameError masking the real error) when
        # create_fixed_width_lookbehind() itself raises.
        inp = input_match
        try:
            inp = create_fixed_width_lookbehind(before) + input_match
            if after:
                inp += f"(?={after})"
            if not self.kwargs['case_sensitive']:
                rule_regex = re.compile(inp, re.I)
            else:
                rule_regex = re.compile(inp)
        except Exception:
            # Fix: narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            in_lang = self.kwargs.get('in_lang', 'und')
            out_lang = self.kwargs.get('out_lang', 'und')
            LOGGER.error(
                f'Your regex in mapping between {in_lang} and {out_lang} is malformed. \
                    Do you have un-escaped regex characters in your input {inp}, contexts {before}, {after}?'
            )
            raise Exception(
                f'Your regex in mapping between {in_lang} and {out_lang} is malformed. \
                    Do you have un-escaped regex characters in your input {inp}, contexts {before}, {after}?'
            )
        return rule_regex
Example #7
0
    def test_convert(self):
        """Run every CLI conversion case, logging all mapping errors, then
        assert on the first failure so unittest records it."""
        error_count = 0
        first_failed_test = None
        for test in self.langs_to_test:
            result = self.runner.invoke(convert, [test[2], test[0], test[1]])
            output_string = result.stdout.strip()
            if output_string != test[3]:
                LOGGER.warning("test_cli.py: mapping error: {} from {} to {} should be {}, got {}".format(test[2], test[0], test[1], test[3], output_string))
                if error_count == 0:
                    first_failed_test = test
                error_count += 1

        if error_count > 0:
            rerun = self.runner.invoke(
                convert,
                [first_failed_test[2], first_failed_test[0], first_failed_test[1]])
            self.assertEqual(rerun.stdout.strip(), first_failed_test[3])
Example #8
0
    def find_good_match(p1, inventory_l2):
        """Find a good sequence in inventory_l2 matching p1.

        Greedily covers the panphon segmentation of p1 from left to right: at
        each position it scores every candidate in `p2_pseqs` against the next
        slice of p1's segments and keeps the closest one, preferring the
        longest match on score ties.

        NOTE(review): relies on `dst`, `p2_pseqs`, `distance`, and
        `get_distance_method` from the enclosing scope. Assumes at least one
        entry of `p2_pseqs` is non-empty; otherwise `best_input` stays empty
        and `i` never advances — TODO confirm callers guarantee this.
        """

        # The proper way to do this would be with some kind of beam search
        # through a determinized/minimized FST, but in the absence of that
        # we can do a kind of heuristic greedy search.  (we don't want any
        # dependencies outside of PyPI otherwise we'd just use OpenFST)

        # Panphon IPA segmentation of the source sequence.
        p1_pseq = dst.fm.ipa_segs(p1)

        i = 0
        good_match = []
        while i < len(p1_pseq):
            best_input = ""
            best_output = -1
            # Sentinel "infinity": any real distance should be smaller.
            best_score = 0xDEADBEEF
            for j, p2_pseq in enumerate(p2_pseqs):
                # FIXME: Should also consider the (weighted) possibility
                # of deleting input or inserting any segment (but that
                # can't be done with a greedy search)
                if len(p2_pseq) == 0:
                    LOGGER.warning(
                        "No panphon mapping for %s - skipping", inventory_l2[j]
                    )
                    continue
                # Compare the candidate against an equally long slice of p1's
                # segments, clamped to the end of the sequence.
                e = min(i + len(p2_pseq), len(p1_pseq))
                input_seg = p1_pseq[i:e]
                distance_method = get_distance_method(dst, distance)
                score = distance_method("".join(input_seg), "".join(p2_pseq))
                # Be very greedy and take the longest match
                if (
                    score < best_score
                    or score == best_score
                    and len(input_seg) > len(best_input)
                ):
                    best_input = input_seg
                    best_output = j
                    best_score = score
            LOGGER.debug(
                "Best match at position %d: %s => %s",
                i,
                best_input,
                inventory_l2[best_output],
            )
            good_match.append(inventory_l2[best_output])
            i += len(best_input)  # greedy!
        return "".join(good_match)
Example #9
0
 def setUp(self):
     """Collect 4-column test rows from the *.csv/*.psv/*.tsv files in the data dir."""
     DATA_DIR = os.path.dirname(data_dir)
     self.langs_to_test = []
     for fn in glob(f'{DATA_DIR}/*.*sv'):
         # Pick the delimiter from the extension. Fix: skip anything that is
         # not csv/psv/tsv — previously `delimiter` could be unbound
         # (NameError) or silently reused from the previous file when the
         # glob matched an unexpected extension such as *.xsv.
         if fn.endswith('csv'):
             delimiter = ','
         elif fn.endswith('psv'):
             delimiter = '|'
         elif fn.endswith('tsv'):
             delimiter = '\t'
         else:
             continue
         with open(fn, encoding="utf-8") as csvfile:
             reader = csv.reader(csvfile, delimiter=delimiter)
             for row in reader:
                 if len(row) != 4:
                     LOGGER.warning(f'Row in {fn} containing values {row} does not have the right values. Please check your data.')
                 else:
                     self.langs_to_test.append(row)
Example #10
0
def create_mapping(
    mapping_1: Mapping,
    mapping_2: Mapping,
    mapping_1_io: str = "out",
    mapping_2_io: str = "in",
    distance: str = "weighted_feature_edit_distance",
) -> Mapping:
    """Create a mapping from mapping_1's output inventory to mapping_2's input inventory"""

    map_1_name = mapping_1.kwargs[f"{mapping_1_io}_lang"]
    map_2_name = mapping_2.kwargs[f"{mapping_2_io}_lang"]
    l1_is_xsampa, l2_is_xsampa = is_xsampa(map_1_name), is_xsampa(map_2_name)
    if not (is_ipa(map_1_name) or l1_is_xsampa):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s (must be ipa or x-sampa)",
            map_1_name,
        )
    if not (is_ipa(map_2_name) or l2_is_xsampa):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s (must be ipa or x-sampa)",
            map_2_name,
        )
    aligned = align_inventories(
        mapping_1.inventory(mapping_1_io),
        mapping_2.inventory(mapping_2_io),
        l1_is_xsampa,
        l2_is_xsampa,
        distance=distance,
    )

    # Start from the input language's parameters (as_is, case_sensitive,
    # prevent_feeding, etc.), then drop naming fields that no longer apply.
    config = mapping_1.kwargs.copy()
    for key in ("authors", "display_name", "language_name"):
        config.pop(key, None)
    config["prevent_feeding"] = True
    config["in_lang"] = map_1_name
    config["out_lang"] = map_2_name
    config["mapping"] = aligned
    return Mapping(**config)
Example #11
0
 def test_check_with_equiv(self):
     """End-to-end check of tau -> eng-arpabet through every IPA tier."""
     phrase = "sh'oo Jign maasee' do'eent'aa shyyyh"
     transducer = make_g2p("tau", "eng-arpabet", tok_lang="tau")
     tau_ipa = make_g2p("tau", "tau-ipa", tok_lang="tau")(phrase).output_string
     self.assertTrue(utils.is_panphon(tau_ipa))
     eng_ipa = make_g2p("tau", "eng-ipa", tok_lang="tau")(phrase).output_string
     self.assertTrue(utils.is_panphon(eng_ipa))
     eng_arpabet = make_g2p("tau", "eng-arpabet", tok_lang="tau")(phrase).output_string
     self.assertTrue(utils.is_arpabet(eng_arpabet))
     # Leave a trace of the intermediate tiers in the log for debugging.
     LOGGER.warning(
         f"tau-ipa {tau_ipa}\neng-ipa {eng_ipa}\n eng-arpabet {eng_arpabet}"
     )
     self.assertTrue(transducer.check(transducer(phrase)))
Example #12
0
    def test_io(self):
        """Run every declared mapping case, logging all failures at once, then
        re-run the first failure through assertEqual so unittest records it."""
        error_count = 0
        first_failed_test = None
        for test in self.langs_to_test:
            in_lang, out_lang, word, expected = test[0], test[1], test[2], test[3]
            output_string = make_g2p(in_lang, out_lang)(word).output_string
            if output_string != expected:
                LOGGER.warning("test_langs.py: mapping error: {} from {} to {} should be {}, got {}".format(word, in_lang, out_lang, expected, output_string))
                if error_count == 0:
                    first_failed_test = test
                error_count += 1

        if error_count > 0:
            transducer = make_g2p(first_failed_test[0], first_failed_test[1])
            self.assertEqual(transducer(first_failed_test[2]).output_string,
                             first_failed_test[3])
Example #13
0
def load_mapping_from_path(path_to_mapping_config, index=0):
    ''' Loads a mapping from a path, if there is more than one mapping, then it loads based on the int
        provided to the 'index' argument. Default is 0.
    '''
    path = Path(path_to_mapping_config)
    # If path leads to actual mapping config
    if path.exists() and (path.suffix.endswith('yml')
                          or path.suffix.endswith('yaml')):
        # safe load it
        with open(path, encoding='utf8') as f:
            mapping = yaml.safe_load(f)
        # If more than one mapping in the mapping config
        if 'mappings' in mapping:
            try:
                LOGGER.debug(
                    'Loading mapping from %s between "%s" and "%s" at index %s',
                    path_to_mapping_config,
                    mapping['mappings'][index].get('in_lang', 'und'),
                    mapping['mappings'][index].get('out_lang', 'und'), index)
                mapping = mapping['mappings'][index]
            except (KeyError, IndexError):
                # Fix: mapping['mappings'] is normally a list, so an
                # out-of-range index raises IndexError, which the previous
                # `except KeyError` never caught.
                LOGGER.warning(
                    'An index of %s was provided for the mapping %s but that index does not exist in the mapping. Please check your mapping.',
                    index, path_to_mapping_config)
        # Log the warning if an Index other than 0 was provided for a mapping config with a single mapping.
        elif index != 0:
            LOGGER.warning(
                'An index of %s was provided for the mapping %s but that index does not exist in the mapping. Please check your mapping.',
                index, path_to_mapping_config)
        # try to load the data from the mapping data file
        if 'mapping' in mapping:
            mapping['mapping_data'] = load_from_file(
                os.path.join(path.parent, mapping['mapping']))
        else:
            # Is "mapping" key missing?
            raise exceptions.MalformedMapping
        # load any abbreviations
        if 'abbreviations' in mapping:
            mapping['abbreviations_data'] = load_abbreviations_from_file(
                os.path.join(path.parent, mapping['abbreviations']))
        return mapping
    else:
        raise FileNotFoundError
Example #14
0
def find_good_match(p1, inventory_l2, l2_is_xsampa=False):
    """Find a good sequence in inventory_l2 matching p1.

    Greedily covers the panphon segmentation of p1 left to right, at each
    position choosing the inventory_l2 entry whose segments are closest by
    weighted feature edit distance, preferring longer matches on ties.
    """

    dst = panphon.distance.Distance()
    # The proper way to do this would be with some kind of beam search
    # through a determinized/minimized FST, but in the absence of that
    # we can do a kind of heurstic greedy search.  (we don't want any
    # dependencies outside of PyPI otherwise we'd just use OpenFST)
    p1_pseq = dst.fm.ipa_segs(p1)
    p2_pseqs = [
        dst.fm.ipa_segs(p)
        for p in process_characters(inventory_l2, l2_is_xsampa)
    ]
    i = 0
    good_match = []
    while i < len(p1_pseq):
        best_input = ""
        best_output = -1
        best_score = 0xdeadbeef
        for j, p2_pseq in enumerate(p2_pseqs):
            # FIXME: Should also consider the (weighted) possibility
            # of deleting input or inserting any segment (but that
            # can't be done with a greedy search)
            if len(p2_pseq) == 0:
                LOGGER.warning('No panphon mapping for %s - skipping',
                               inventory_l2[j])
                continue
            e = min(i + len(p2_pseq), len(p1_pseq))
            input_seg = p1_pseq[i:e]
            score = dst.weighted_feature_edit_distance(''.join(input_seg),
                                                       ''.join(p2_pseq))
            # Be very greedy and take the longest match
            if (score < best_score or score == best_score
                    and len(input_seg) > len(best_input)):
                best_input = input_seg
                best_output = j
                best_score = score
        if best_output < 0:
            # Fix: when every candidate was skipped (all panphon mappings
            # empty), best_input stayed "" so `i` never advanced (infinite
            # loop) and inventory_l2[-1] was appended spuriously. Skip the
            # unmatched segment instead.
            LOGGER.warning('No match found for segment %s - skipping',
                           p1_pseq[i])
            i += 1
            continue
        LOGGER.debug('Best match at position %d: %s => %s', i, best_input,
                     inventory_l2[best_output])
        good_match.append(inventory_l2[best_output])
        i += len(best_input)  # greedy!
    return ''.join(good_match)
Example #15
0
def check_ipa_known_segs(mappings_to_check=False):
    """Warn about 'out' fields in *-ipa mappings that panphon cannot fully segment."""
    dst = distance.Distance()
    if not mappings_to_check:
        mappings_to_check = [m['out_lang'] for m in MAPPINGS_AVAILABLE]
    found_error = False
    for mapping in MAPPINGS_AVAILABLE:
        if mapping['out_lang'] not in mappings_to_check:
            continue
        if not mapping['out_lang'].endswith('-ipa'):
            continue
        for rule in mapping['mapping_data']:
            # A string is valid IPA iff re-joining its panphon segments
            # reproduces it exactly.
            joined_ipa_segs = ''.join(dst.fm.ipa_segs(rule['out']))
            if joined_ipa_segs != rule['out']:
                LOGGER.warning(
                    f"Output '{rule['out']}' in rule {rule} in mapping between {mapping['in_lang']} and {mapping['out_lang']} is not recognized as valid IPA by panphon. You may ignore this warning if you know it gets remapped to IPA later."
                )
                found_error = True
    if found_error:
        LOGGER.warning(
            "Please refer to https://github.com/dmort27/panphon for information about panphon."
        )
Example #16
0
 def setUp(self):
     """Collect CLI test rows from the *.csv/*.psv/*.tsv files in the data dir."""
     self.runner = APP.test_cli_runner()
     self.data_dir = os.path.dirname(data_dir)
     self.langs_to_test = []
     for fn in glob(os.path.join(self.data_dir, "*.*sv")):
         if fn.endswith("csv"):
             delimiter = ","
         elif fn.endswith("psv"):
             delimiter = "|"
         elif fn.endswith("tsv"):
             delimiter = "\t"
         else:
             # Fix: the glob *.*sv can match other extensions; previously
             # `delimiter` would then be unbound (NameError) or silently
             # reused from the previous file.
             continue
         with open(fn, encoding="utf-8") as csvfile:
             reader = csv.reader(csvfile, delimiter=delimiter)
             for row in reader:
                 if len(row) < 4:
                     LOGGER.warning(
                         f"Row in {fn} containing values {row} does not have the right values."
                         f"Please check your data.")
                 else:
                     self.langs_to_test.append(row)
Example #17
0
def doctor(mapping, list_all, list_ipa):
    """ Check for common errors in mappings.
        There should eventually be more checks here, but doctor currently checks for:

        1. Characters that are in IPA mappings but are not recognized by panphon library.

        You can list available mappings with --list-all or --list-ipa, or by visiting
        http://g2p-studio.herokuapp.com/api/v1/langs .
    """
    if list_all or list_ipa:
        out_langs = sorted({x["out_lang"] for x in MAPPINGS_AVAILABLE})
        if list_ipa:
            out_langs = [lang for lang in out_langs if is_ipa(lang)]
        LOGGER.info("Specifying an output language will check all mappings into that language:\n")
        for m in out_langs:
            in_langs = [x["in_lang"] for x in MAPPINGS_AVAILABLE if x["out_lang"] == m]
            # Continuation lines are indented to line up under the out_lang.
            separator = "\n" + " " * len(m) + "  "
            print(f"{m}: ", end="")
            print(separator.join(in_langs))
            print("")
        return

    known_out_langs = [x["out_lang"] for x in MAPPINGS_AVAILABLE]
    for m in mapping:
        if m not in known_out_langs:
            raise click.UsageError(
                f"No known mappings into '{m}'. "
                "Use --list-all or --list-ipa to list valid options."
            )
        if not is_ipa(m):
            LOGGER.warning(
                f"No checks implemented yet for non-IPA mappings: '{m}' will not be checked."
            )

    if mapping:
        LOGGER.info("Checking the following mappings: \n" + "\n".join(mapping))
    else:
        LOGGER.info("Checking all IPA mappings.")

    check_ipa_known_segs(list(mapping))
Example #18
0
def create_mapping(l1_mapping: Mapping, l2_mapping: Mapping) -> Mapping:
    ''' Create a mapping from the output of l1 and input of l2.
        Both must be either ipa or x-sampa.
    '''
    l1 = l1_mapping.kwargs['out_lang']
    l2 = l2_mapping.kwargs['in_lang']
    inv_l1 = l1_mapping.inventory("out")
    inv_l2 = l2_mapping.inventory()
    if not (is_ipa(l1) or is_xsampa(l1)):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s"
            " (must be ipa or x-sampa)", l1)
    if not (is_ipa(l2) or is_xsampa(l2)):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s"
            " (must be ipa or x-sampa)", l2)
    aligned = align_inventories(inv_l1["inventory"], inv_l2["inventory"],
                                is_xsampa(l1), is_xsampa(l2))
    return Mapping(aligned, in_lang=l1, out_lang=l2)
Example #19
0
File: cli.py Project: deltork/g2p
def scan(lang, path):
    """ Returns the set of non-mapped characters in a document.
        Accounts for case sensitivity in the configuration.
    """
    # Check input lang exists
    if lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(f"'{lang}' is not a valid value for 'LANG'")

    # Retrieve the mappings for lang
    case_sensitive = True
    mappings = []
    for mapping in MAPPINGS_AVAILABLE:
        mapping_name = mapping["in_lang"]
        # Exclude mappings for converting between IPAs
        if mapping_name.startswith(lang) and "ipa" not in mapping_name:
            case_sensitive = case_sensitive and mapping.get(
                "case_sensitive", True)
            mappings.append(mapping)

    # Get input chars in mapping
    mapped_chars = set()
    for lang_mapping in mappings:
        for x in lang_mapping["mapping_data"]:
            mapped_chars.add(normalize(x["in"], "NFD"))
    # Find unmapped chars
    filter_chars = " \n"
    # Fix: escape the mapped characters so regex metacharacters in a mapping
    # (e.g. '-', ']', '\\', '^') cannot break the character class or turn
    # into unintended ranges.
    mapped_string = re.escape("".join(mapped_chars))
    pattern = "[^" + mapped_string + filter_chars + ".]"
    prog = re.compile(pattern)

    with open(path, "r", encoding="utf8") as file:
        data = normalize(file.read(), "NFD")
        if not case_sensitive:
            data = data.lower()
        unmapped = set(prog.findall(data))
        if unmapped:
            LOGGER.warning("The following characters are not mapped:")
            print(unmapped)
Example #20
0
    def test_convert(self):
        """Run all public/data convert cases with and without tokenization,
        logging every failure, then assert on the first failed case."""
        LOGGER.info(
            f"Running {len(self.langs_to_test)} g2p convert test cases found in public/data"
        )
        error_count = 0
        first_failed_test = None
        for tok_option in [["--tok", "--check"], ["--no-tok"]]:
            for test in self.langs_to_test:
                output_string = self.runner.invoke(
                    convert,
                    [*tok_option, test[2], test[0], test[1]]).stdout.strip()
                if output_string != test[3].strip():
                    LOGGER.warning(
                        f"test_cli.py: {test[0]}->{test[1]} mapping error: '{test[2]}' "
                        f"should map to '{test[3]}', got '{output_string}' (with {tok_option})."
                    )
                    if error_count == 0:
                        first_failed_test = test + [tok_option]
                    error_count += 1

        if error_count > 0:
            reference_string = first_failed_test[3]
            output_string = self.runner.invoke(
                convert,
                [
                    # Fix: tok_option is a list of flags and must be
                    # unpacked; passing it as a single element gave click a
                    # nested list instead of the flags.
                    *first_failed_test[4],  # tok_option
                    first_failed_test[2],  # word to convert
                    first_failed_test[0],  # in_lang
                    first_failed_test[1],  # out_lang
                ],
            ).stdout.strip()
            self.assertEqual(
                output_string,
                reference_string.strip(),
                f"{first_failed_test[0]}->{first_failed_test[1]} mapping error "
                # Fix: this segment was missing its f prefix, so the
                # placeholder was printed literally.
                f"for '{first_failed_test[2]}'.\n"
                "Look for warnings in the log for any more mapping errors",
            )
Example #21
0
def align_to_dummy_fallback(mapping: Mapping,
                            io: str = 'in',
                            distance: str = "weighted_feature_edit_distance"):
    """Create a mapping from mapping's output inventory to a minimalist dummy inventory"""
    lang = mapping.kwargs[f'{io}_lang']
    config = {'in_lang': lang, 'out_lang': 'dummy'}
    # Fallback output when no IPA equivalent can be guessed.
    default_char = 't'
    if is_ipa(lang):
        rules = align_inventories(mapping.inventory(io),
                                  DUMMY_INVENTORY,
                                  distance=distance)
    else:
        # Romanize first, then align the romanized forms with the dummy set.
        und_g2p = make_g2p('und', 'und-ipa')
        rules = []
        for char in mapping.inventory(io):
            rules.append({
                "in": unicode_escape(char),
                "out": und_g2p(unidecode(char).lower()).output_string
            })
        dummy_list = align_inventories([r['out'] for r in rules],
                                       DUMMY_INVENTORY,
                                       distance=distance)
        dummy_dict = {d['in']: d['out'] for d in dummy_list if d['in']}
        for r in rules:
            if r['out'] in dummy_dict:
                r['out'] = dummy_dict[r['out']]
            else:
                LOGGER.warning(
                    f"We couldn't guess at what {r['in']} means, so it's being "
                    f"replaced with '{default_char}' instead.")
                r['out'] = default_char

    config['mapping'] = rules
    return Mapping(**config)
Example #22
0
File: utils.py Project: deltork/g2p
def is_panphon(string, display_warnings=False):
    """Return True iff every whitespace-separated word in `string` is valid
    panphon IPA; with display_warnings, log details about everything that fails."""
    # Deferred importing required here, because g2p.transducer also imports this file.
    # Such circular dependency is probably bad design, maybe a reviewer of this code will
    # have a better solution to recommend?
    import g2p.transducer

    dst = getPanphonDistanceSingleton()
    panphon_preprocessor = g2p.transducer.Transducer(
        Mapping(id="panphon_preprocessor"))
    preprocessed_string = panphon_preprocessor(string).output_string
    # Use a loop that prints the warnings on all strings that are not panphon, even though
    # logically this should not be necessary to calculate the answer.
    result = True
    for word in preprocessed_string.split():
        word_ipa_segs = dst.fm.ipa_segs(word)
        word_ipa = "".join(word_ipa_segs)
        if word != word_ipa:
            if not display_warnings:
                return False
            LOGGER.warning(
                f'String "{word}" is not identical to its IPA segmentation: {word_ipa_segs}'
            )
            # Fix: read the one-shot flags with getattr and a default so a
            # missing attribute cannot raise AttributeError on first use.
            # Also dropped the pointless f prefixes on the constant messages.
            if "g" in word and not getattr(is_panphon, "g_warning_printed", False):
                LOGGER.warning(
                    "Common IPA gotcha: the ASCII 'g' character is not IPA, use 'ɡ' (\\u0261) instead."
                )
                is_panphon.g_warning_printed = True
            if ":" in word and not getattr(is_panphon, "colon_warning_printed", False):
                LOGGER.warning(
                    "Common IPA gotcha: the ASCII ':' character is not IPA, use 'ː' (\\u02D0) instead."
                )
                is_panphon.colon_warning_printed = True
            for c in word:
                if c not in word_ipa:
                    LOGGER.warning(
                        f"Character '{c}' (\\u{format(ord(c), '04x')}) in word '{word}' "
                        "was not recognized as IPA by panphon.")
            result = False
    return result
Example #23
0
def create_mapping(mapping_1: Mapping, mapping_2: Mapping, mapping_1_io: str = 'out', mapping_2_io: str = 'in', write_to_file: bool = False, out_dir: str = '') -> Mapping:
    """Create a Mapping aligning one mapping's inventory to another's.

    The chosen side of mapping_1 (mapping_1_io: "in" or "out") is aligned
    phonetically against the chosen side of mapping_2 (mapping_2_io). Both
    inventories should be IPA or X-SAMPA; a warning is logged otherwise.

    Args:
        mapping_1: source Mapping; its kwargs seed the new mapping's config.
        mapping_2: target Mapping.
        mapping_1_io: which inventory of mapping_1 to use ("in" or "out").
        mapping_2_io: which inventory of mapping_2 to use ("in" or "out").
        write_to_file: when True, write the config and mapping files.
        out_dir: directory to write into; falls back to the default location
            if empty or not a directory.

    Returns:
        Mapping: the new mapping from mapping_1's inventory to mapping_2's.
    """
    map_1_name = mapping_1.kwargs[f'{mapping_1_io}_lang']
    map_2_name = mapping_2.kwargs[f'{mapping_2_io}_lang']
    if not is_ipa(map_1_name) and not is_xsampa(map_1_name):
        LOGGER.warning("Unsupported orthography of inventory 1: %s"
                       " (must be ipa or x-sampa)",
                       map_1_name)
    if not is_ipa(map_2_name) and not is_xsampa(map_2_name):
        LOGGER.warning("Unsupported orthography of inventory 2: %s"
                       " (must be ipa or x-sampa)",
                       map_2_name)
    l1_is_xsampa, l2_is_xsampa = is_xsampa(map_1_name), is_xsampa(map_2_name)
    mapping = align_inventories(mapping_1.inventory(mapping_1_io), mapping_2.inventory(mapping_2_io),
                                l1_is_xsampa, l2_is_xsampa)

    # Initialize mapping with input language parameters (as_is,
    # case_sensitive, prevent_feeding, etc)
    config = mapping_1.kwargs.copy()
    # Fix up names, etc.: metadata inherited from mapping_1 does not describe
    # the generated mapping, so drop it.
    for key in ('authors', 'display_name', 'language_name'):
        config.pop(key, None)
    config['in_lang'] = map_1_name
    config['out_lang'] = map_2_name
    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir and os.path.isdir(out_dir):
            mapping.config_to_file(out_dir)
            mapping.mapping_to_file(out_dir)
        else:
            if out_dir:
                # Bug fix: previously this branch only warned "Writing to
                # default instead." but never actually wrote anything.
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
            mapping.config_to_file()
            mapping.mapping_to_file()

    return mapping
Example #24
0
    def make_tokenizer(self, in_lang, out_lang=None, tok_path=None):
        """Return (and cache) a tokenizer for in_lang.

        The tokenizer is built from the mapping(s) leading from in_lang toward
        an IPA form: an explicit tok_path may be given, otherwise one- and
        two-hop IPA successors of in_lang are searched in LANGS_NETWORK.
        Falls back to the default tokenizer (self.tokenizers[None]) when no
        suitable mapping exists.
        """
        tokenizer_key = self.make_tokenizer_key(in_lang, out_lang, tok_path)
        if not self.tokenizers.get(tokenizer_key):
            # This tokenizer was not created yet, initialize it now.
            if tok_path:
                if tok_path[0] != in_lang:
                    raise ValueError(
                        "calling make_tokenizer() with tok_path requires that tok_path[0] == in_lang"
                    )
                assert len(tok_path) >= 2
                # Truncate the path at the first IPA node (up to 3 hops).
                if len(tok_path) == 2 or is_ipa(tok_path[1]):
                    out_lang = tok_path[1]
                elif len(tok_path) == 3 or is_ipa(tok_path[2]):
                    out_lang = tok_path[1:3]
                elif len(tok_path) > 3 and is_ipa(tok_path[3]):
                    out_lang = tok_path[1:4]
                else:
                    out_lang = tok_path[1:3]
            if not out_lang:
                try:
                    successors = [x for x in LANGS_NETWORK.successors(in_lang)]
                except NetworkXError:
                    successors = []
                ipa_successors = [x for x in successors if is_ipa(x)]
                if ipa_successors:
                    # in_lang has an ipa successor, tokenize using it
                    # there currently are no langs with more than 1 IPA successor, but to
                    # be future-proof we'll arbitrarily take the first if there are more.
                    out_lang = ipa_successors[0]
                else:
                    # There is no direct IPA successor, look for a two-hop path to -ipa
                    for x in successors:
                        ipa_successors_two_hops = [
                            y for y in LANGS_NETWORK.successors(x) if is_ipa(y)
                        ]
                        if ipa_successors_two_hops:
                            out_lang = [x, ipa_successors_two_hops[0]]
                            # Bug fix: break only once a two-hop path is found.
                            # Previously the break sat outside this if, so only
                            # the FIRST successor was ever examined.
                            break
                    # There is no two-hop IPA successor, use the first direct successor
                    if out_lang is None and successors:
                        out_lang = successors[0]
            if out_lang is None:
                # Default tokenizer:
                self.tokenizers[tokenizer_key] = self.tokenizers[None]
            elif isinstance(out_lang, list):
                # Build a multi-hop tokenizer
                assert len(out_lang) > 1
                try:
                    mappings = [Mapping(in_lang=in_lang, out_lang=out_lang[0])]
                    for i in range(1, len(out_lang)):
                        mappings.append(
                            Mapping(in_lang=out_lang[i - 1],
                                    out_lang=out_lang[i]))
                    self.tokenizers[tokenizer_key] = MultiHopTokenizer(
                        mappings)
                except MappingMissing:
                    self.tokenizers[tokenizer_key] = self.tokenizers[None]
                    LOGGER.warning(
                        f"missing mapping yet we looked for mappings in graph for {in_lang}-{out_lang}."
                    )
            else:
                # Build a one-hop tokenizer
                try:
                    mapping = Mapping(in_lang=in_lang, out_lang=out_lang)
                    self.tokenizers[tokenizer_key] = Tokenizer(mapping)
                except MappingMissing:
                    self.tokenizers[tokenizer_key] = self.tokenizers[None]
                    LOGGER.warning(
                        f"Cannot find mapping from '{in_lang}' to '{out_lang}'. Using default tokenizer instead"
                    )

            # Hack for Tlingit using dot as a letter when non word-final
            if in_lang == "tli":
                self.tokenizers[tokenizer_key].dot_is_letter = True

        return self.tokenizers.get(tokenizer_key)
Example #25
0
def create_multi_mapping(
    src_mappings: List[Tuple[Mapping, str]],
    tgt_mappings: List[Tuple[Mapping, str]],
    distance: str = "weighted_feature_edit_distance",
) -> Mapping:
    """Create a mapping from a union of source inventories to a union of targets.

    Each entry in src_mappings / tgt_mappings is a (mapping, in_or_out) pair:
    the Mapping whose inventory to draw from, and whether to use its input
    ("in") or output ("out") side.

    The new mapping's in/out names and display name are derived from the
    metadata of the given mappings.
    """

    def compact_ipa_names(ipa_names: Iterable) -> str:
        # ["fra-ipa", "eng-ipa", "kwk-ipa"] -> "fra-eng-kwk-ipa"
        stems = (n[:-4] if n.endswith("-ipa") else n for n in ipa_names)
        return "-".join(stems) + "-ipa"

    def long_ipa_names(ipa_names: Iterable) -> str:
        # ["fra-ipa", "eng-ipa", "kwk-ipa"] -> "fra-ipa and eng-ipa and kwk-ipa"
        return " and ".join(ipa_names)

    def get_sorted_unique_names(mappings: List[Tuple[Mapping, str]]) -> List[str]:
        unique = {m.kwargs[f"{side}_lang"] for m, side in mappings}
        return sorted(unique)

    def deduplicate(iterable: Iterable) -> List:
        # dict.fromkeys preserves first-seen order, unlike a set.
        return list(dict.fromkeys(iterable))

    map_1_names = get_sorted_unique_names(src_mappings)
    map_2_names = get_sorted_unique_names(tgt_mappings)

    src_inventory = []
    for mapping, io in src_mappings:
        lang = mapping.kwargs[f"{io}_lang"]
        if not is_ipa(lang):
            LOGGER.warning(
                "Unsupported orthography of src inventory: %s; must be IPA", lang
            )
        src_inventory.extend(mapping.inventory(io))
    src_inventory = deduplicate(src_inventory)

    tgt_inventory = []
    for mapping, io in tgt_mappings:
        lang = mapping.kwargs[f"{io}_lang"]
        if not is_ipa(lang):
            LOGGER.warning(
                "Unsupported orthography of tgt inventory: %s; must be IPA", lang
            )
        tgt_inventory.extend(mapping.inventory(io))
    tgt_inventory = deduplicate(tgt_inventory)

    mapping = align_inventories(src_inventory, tgt_inventory, distance=distance)

    config = dict(
        in_lang=compact_ipa_names(map_1_names),
        out_lang=compact_ipa_names(map_2_names),
        language_name="IPA",
        rule_ordering="apply-longest-first",
        mapping=mapping,
        prevent_feeding=True,
        norm_form="NFC",
        display_name=long_ipa_names(map_1_names) + " to " + long_ipa_names(map_2_names),
    )

    return Mapping(**config)
Example #26
0
File: utils.py Project: deltork/g2p
def load_mapping_from_path(path_to_mapping_config, index=0):
    """ Loads a mapping from a path, if there is more than one mapping, then it loads based on the int
        provided to the 'index' argument. Default is 0.

    Raises:
        FileNotFoundError: if the path does not exist or is not a yml/yaml file.
        exceptions.MalformedMapping: if the mapping or abbreviations data file
            cannot be loaded, or the "mapping" key is missing.
    """
    path = Path(path_to_mapping_config)
    # If path leads to actual mapping config
    if path.exists() and (path.suffix.endswith("yml") or path.suffix.endswith("yaml")):
        # safe load it
        with open(path, encoding="utf8") as f:
            mapping = yaml.safe_load(f)
        # If more than one mapping in the mapping config
        if "mappings" in mapping:
            try:
                LOGGER.debug(
                    'Loading mapping from %s between "%s" and "%s" at index %s',
                    path_to_mapping_config,
                    mapping["mappings"][index].get("in_lang", "und"),
                    mapping["mappings"][index].get("out_lang", "und"),
                    index,
                )
                mapping = mapping["mappings"][index]
            # Bug fix: indexing a list out of range raises IndexError, not
            # KeyError, so the warning below was previously unreachable for a
            # bad index and the call crashed instead.
            except (KeyError, IndexError):
                LOGGER.warning(
                    "An index of %s was provided for the mapping %s but that index does not exist in the mapping. "
                    "Please check your mapping.",
                    index,
                    path_to_mapping_config,
                )
        # Log the warning if an Index other than 0 was provided for a mapping config with a single mapping.
        elif index != 0:
            LOGGER.warning(
                "An index of %s was provided for the mapping %s but that index does not exist in the mapping. "
                "Please check your mapping.",
                index,
                path_to_mapping_config,
            )
        # try to load the data from the mapping data file
        if "mapping" in mapping:
            try:
                mapping["mapping_data"] = load_from_file(
                    os.path.join(path.parent, mapping["mapping"])
                )
            except (OSError, exceptions.IncorrectFileType) as e:
                raise exceptions.MalformedMapping(
                    f"Cannot load mapping data file specified in {path}: {e}"
                ) from e
        elif mapping.get("type", "") == "unidecode":
            # This mapping is not implemented as a regular mapping, but as custom software
            pass
        else:
            # Is "mapping" key missing?
            raise exceptions.MalformedMapping(
                'Key "mapping:" missing from a mapping in {}.'.format(path)
            )
        # load any abbreviations
        if "abbreviations" in mapping:
            try:
                mapping["abbreviations_data"] = load_abbreviations_from_file(
                    os.path.join(path.parent, mapping["abbreviations"])
                )
            except (OSError, exceptions.IncorrectFileType) as e:
                raise exceptions.MalformedMapping(
                    f"Cannot load abbreviations data file specified in {path}: {e}"
                ) from e
        return mapping
    else:
        # Include the offending path in the error instead of a bare raise.
        raise FileNotFoundError(
            f"Cannot find mapping config file at {path_to_mapping_config}"
        )
Example #27
0
    def process_kwargs(self, mapping):
        ''' Apply kwargs in the order they are provided. kwargs are ordered as of python 3.6

        Normalizes deprecated/missing options in self.kwargs, applies
        sorting/escaping/normalization/reversal to the rules, then compiles
        each rule's match_pattern regex, dropping rules with no pattern.
        '''
        # Translate the deprecated "as_is" option into its "rule_ordering" equivalent.
        if 'as_is' in self.kwargs:
            as_is = self.kwargs['as_is']
            if as_is:
                appropriate_setting = "as-written"
            else:
                appropriate_setting = "apply-longest-first"

            self.kwargs["rule_ordering"] = appropriate_setting

            LOGGER.warning(
                f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                'is using the deprecated parameter "as_is"; '
                f"replace `as_is: {as_is}` with `rule_ordering: {appropriate_setting}`"
            )

        # Add defaults
        if 'rule_ordering' in self.kwargs:
            # right now, "rule-ordering" is a more explict alias of the "as-is" option.
            ordering = self.kwargs["rule_ordering"]
            if ordering not in ("as-written", "apply-longest-first"):
                LOGGER.error(
                    f"mapping from {self.kwargs.get('in_lang')} to {self.kwargs.get('out_lang')} "
                    f"has invalid value '{ordering}' for rule_ordering parameter; "
                    "rule_ordering must be one of "
                    '"as-written" or "apply-longest-first"')
        else:
            self.kwargs["rule_ordering"] = "as-written"
        if 'case_sensitive' not in self.kwargs:
            self.kwargs['case_sensitive'] = True
        if 'escape_special' not in self.kwargs:
            self.kwargs['escape_special'] = False
        if 'norm_form' not in self.kwargs:
            self.kwargs['norm_form'] = 'NFD'
        if 'reverse' not in self.kwargs:
            self.kwargs['reverse'] = False
        if 'prevent_feeding' not in self.kwargs:
            self.kwargs['prevent_feeding'] = False
        if 'in_lang' not in self.kwargs:
            self.kwargs['in_lang'] = 'und'
        if 'out_lang' not in self.kwargs:
            self.kwargs['out_lang'] = 'und'

        # Process kwargs in order received
        for kwarg, val in self.kwargs.items():
            if kwarg == 'rule_ordering' and self.wants_rules_sorted():
                # sort by reverse len
                mapping = sorted(mapping,
                                 key=lambda x: len(x["in"]),
                                 reverse=True)
            elif kwarg == 'escape_special' and val:
                mapping = [escape_special_characters(x) for x in mapping]
            elif kwarg == 'norm_form' and val:
                for io in mapping:
                    for k, v in io.items():
                        if isinstance(v, str):
                            io[k] = normalize(v, self.kwargs['norm_form'])
            elif kwarg == 'reverse' and val:
                mapping = self.reverse_mappings(mapping)
        # After all processing is done, turn into regex.
        # Bug fix: iterate over a snapshot, since rules without a valid
        # match_pattern are removed from `mapping` inside the loop; removing
        # from the list being iterated silently skips the rule immediately
        # following each removed one.
        for io in list(mapping):
            if self.kwargs['prevent_feeding'] or ('prevent_feeding' in io
                                                  and io['prevent_feeding']):
                io['intermediate_form'] = self._string_to_pua(
                    io['out'], mapping.index(io))
            io['match_pattern'] = self.rule_to_regex(io)
            if not io['match_pattern']:
                mapping.remove(io)
        self.processed = True
        return mapping
Example #28
0
 def config_to_file(self,
                    output_path: str = os.path.join(GEN_DIR, 'config.yaml'),
                    mapping_type: str = 'json'):
     ''' Write config to file.

     If output_path is an existing file, this mapping's config is merged into
     it (replacing the authors of a matching in_lang/out_lang entry, or
     appending a new entry). If it is a directory, config.yaml is written
     inside it. Otherwise output_path itself is written as a new file.

     Args:
         output_path: file or directory to write to (default GEN_DIR/config.yaml)
         mapping_type: file extension used for the generated "mapping" filename
     '''
     add_config = False
     if os.path.exists(output_path) and os.path.isfile(output_path):
         LOGGER.warning(f'Adding mapping config to file at {output_path}')
         fn = output_path
         add_config = True
     elif os.path.isdir(output_path):
         fn = os.path.join(output_path, 'config.yaml')
     else:
         LOGGER.warning(f'writing mapping config to file at {output_path}')
         fn = output_path
     # Template for this mapping's config entry, built from self.kwargs with
     # the same defaults used elsewhere in the class.
     template = {
         "mappings": [{
             "language_name":
             self.kwargs.get('language_name',
                             self.kwargs.get('in_lang', 'und')),
             # NOTE(review): both self.mapping_type(...) calls below pass
             # out_lang; the first looks like it should pass in_lang to build
             # "<in> <type> to <out> <type>" — confirm against upstream g2p.
             "display_name":
             self.kwargs.get(
                 'display_name',
                 self.kwargs.get('in_lang', 'und') + " " +
                 self.mapping_type(self.kwargs.get('out_lang', 'und')) +
                 " to " + self.kwargs.get('out_lang', 'und') + " " +
                 self.mapping_type(self.kwargs.get('out_lang', 'und'))),
             "in_lang":
             self.kwargs.get('in_lang', 'und'),
             "out_lang":
             self.kwargs.get('out_lang', 'und'),
             "authors":
             self.kwargs.get('authors', [f'Generated {dt.datetime.now()}']),
             "as_is":
             not self.wants_rules_sorted(),
             # TODO: rule_ordering
             "prevent_feeding":
             self.kwargs.get('prevent_feeding', False),
             "case_sensitive":
             self.kwargs.get('case_sensitive', True),
             "escape_special":
             self.kwargs.get('escape_special', False),
             "norm_form":
             self.kwargs.get('norm_form', "NFD"),
             "reverse":
             self.kwargs.get('reverse', False),
             "mapping":
             self.kwargs.get('in_lang', 'und') + "_to_" +
             self.kwargs.get('out_lang', 'und') + '.' + mapping_type
         }]
     }
     # If config file exists already, just add the mapping.
     if add_config:
         with open(fn, encoding='utf8') as f:
             existing_data = yaml.safe_load(f.read())
         updated = False
         for i, mapping in enumerate(existing_data['mappings']):
             # if the mapping exists, just update the generation data
             if mapping['in_lang'] == template['mappings'][0][
                     'in_lang'] and mapping['out_lang'] == template[
                         'mappings'][0]['out_lang']:
                 existing_data['mappings'][i]['authors'] = template[
                     'mappings'][0]['authors']
                 updated = True
                 break
         if not updated:
             existing_data['mappings'].append(template['mappings'][0])
         template = existing_data
     with open(fn, 'w', encoding='utf8') as f:
         yaml.dump(template,
                   f,
                   Dumper=IndentDumper,
                   default_flow_style=False)
Example #29
0
 def config_to_file(
     self,
     output_path: str = os.path.join(GEN_DIR, "config.yaml"),
     mapping_type: str = "json",
 ):
     """ Write config to file.

     If output_path is a directory, config.yaml inside it is targeted. If the
     resulting path is an existing file, this mapping's config is merged into
     it (replacing the authors of a matching in_lang/out_lang entry, or
     appending a new entry); otherwise a new file is written.

     Args:
         output_path: file or directory to write to (default GEN_DIR/config.yaml)
         mapping_type: file extension used for the generated "mapping" filename
     """
     add_config = False
     # Normalize a directory target to its config.yaml before the file check,
     # so merging also works when a directory is passed.
     if os.path.isdir(output_path):
         output_path = os.path.join(output_path, "config.yaml")
     if os.path.exists(output_path) and os.path.isfile(output_path):
         LOGGER.warning(f"Adding mapping config to file at {output_path}")
         fn = output_path
         add_config = True
     else:
         LOGGER.warning(f"writing mapping config to file at {output_path}")
         fn = output_path
     # Template for this mapping's config entry, built from self.kwargs with
     # the same defaults used elsewhere in the class.
     template = {
         "mappings": [{
             "language_name":
             self.kwargs.get("language_name",
                             self.kwargs.get("in_lang", "und")),
             # NOTE(review): both self.mapping_type(...) calls below pass
             # out_lang; the first looks like it should pass in_lang to build
             # "<in> <type> to <out> <type>" — confirm against upstream g2p.
             "display_name":
             self.kwargs.get(
                 "display_name",
                 self.kwargs.get("in_lang", "und") + " " +
                 self.mapping_type(self.kwargs.get("out_lang", "und")) +
                 " to " + self.kwargs.get("out_lang", "und") + " " +
                 self.mapping_type(self.kwargs.get("out_lang", "und")),
             ),
             "in_lang":
             self.kwargs.get("in_lang", "und"),
             "out_lang":
             self.kwargs.get("out_lang", "und"),
             "authors":
             self.kwargs.get("authors", [f"Generated {dt.datetime.now()}"]),
             "rule_ordering":
             self.kwargs.get("rule_ordering", "as-written"),
             "prevent_feeding":
             self.kwargs.get("prevent_feeding", False),
             "case_sensitive":
             self.kwargs.get("case_sensitive", True),
             "escape_special":
             self.kwargs.get("escape_special", False),
             "norm_form":
             self.kwargs.get("norm_form", "NFD"),
             "reverse":
             self.kwargs.get("reverse", False),
             "mapping":
             self.kwargs.get("in_lang", "und") + "_to_" +
             self.kwargs.get("out_lang", "und") + "." + mapping_type,
         }]
     }
     # If config file exists already, just add the mapping.
     if add_config:
         with open(fn, encoding="utf8") as f:
             existing_data = yaml.safe_load(f.read())
         updated = False
         for i, mapping in enumerate(existing_data["mappings"]):
             # if the mapping exists, just update the generation data
             if (mapping["in_lang"] == template["mappings"][0]["in_lang"]
                     and mapping["out_lang"]
                     == template["mappings"][0]["out_lang"]):
                 existing_data["mappings"][i]["authors"] = template[
                     "mappings"][0]["authors"]
                 updated = True
                 break
         if not updated:
             existing_data["mappings"].append(template["mappings"][0])
         template = existing_data
     with open(fn, "w", encoding="utf8") as f:
         yaml.dump(template,
                   f,
                   Dumper=IndentDumper,
                   default_flow_style=False)
Example #30
0
File: cli.py Project: deltork/g2p
def convert(
    in_lang,
    out_lang,
    input_text,
    path,
    tok,
    check,
    debugger,
    pretty_edges,
    tok_lang,
    config,
):
    """Convert INPUT_TEXT through g2p mapping(s) from IN_LANG to OUT_LANG.

       Visit http://g2p-studio.herokuapp.com/api/v1/langs for a list of languages.

       There must be a path from IN_LANG to OUT_LANG, possibly via some intermediates.
       For example, mapping from fra to eng-arpabet will successively apply
       fra->fra-ipa, fra-ipa->eng-ipa and eng-ipa->eng-arpabet.
    """
    # Check valid input
    # Check input != output
    if in_lang == out_lang:
        raise click.UsageError(
            "Values must be different for 'IN_LANG' and 'OUT_LANG'")
    if config:
        # Load a user-supplied mapping config and register its mappings in the
        # module-level LANGS_NETWORK / MAPPINGS_AVAILABLE globals, so the
        # lookup below can route through them (mutates global state).
        # This isn't that DRY - copied from g2p/mappings/langs/__init__.py
        mappings_legal_pairs = []
        with open(config, encoding="utf8") as f:
            data = yaml.safe_load(f)
        if "mappings" in data:
            for index, mapping in enumerate(data["mappings"]):
                mappings_legal_pairs.append((
                    data["mappings"][index]["in_lang"],
                    data["mappings"][index]["out_lang"],
                ))
                data["mappings"][index] = load_mapping_from_path(config, index)
        else:
            mapping = load_mapping_from_path(config)
            data["mappings"] = [mapping]
            mappings_legal_pairs.append(
                (mapping["in_lang"], mapping["out_lang"]))
        for pair in mappings_legal_pairs:
            # Warn about shadowing a language name g2p already knows about.
            if pair[0] in LANGS_NETWORK.nodes:
                LOGGER.warning(
                    f"A mapping with the name '{pair[0]}' is already defined in g2p. "
                    "Your local mapping with the same name might not function properly."
                )
        LANGS_NETWORK.add_edges_from(mappings_legal_pairs)
        MAPPINGS_AVAILABLE.extend(data["mappings"])
    # Check input lang exists
    if in_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{in_lang}' is not a valid value for 'IN_LANG'")
    # Check output lang exists
    if out_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{out_lang}' is not a valid value for 'OUT_LANG'")
    # Check if path exists
    if not has_path(LANGS_NETWORK, in_lang, out_lang):
        raise click.UsageError(
            f"Path between '{in_lang}' and '{out_lang}' does not exist")
    # INPUT_TEXT may also name a .txt file, in which case its contents are
    # converted instead of the literal argument.
    if os.path.exists(input_text) and input_text.endswith("txt"):
        with open(input_text, encoding="utf8") as f:
            input_text = f.read()
    # Determine which tokenizer to use, if any
    # "tok is not None and not tok" means --no-tok was given explicitly.
    if tok is not None and not tok and tok_lang is not None:
        raise click.UsageError(
            "Specified conflicting --no-tok and --tok-lang options.")
    if tok and tok_lang is None:
        tok_lang = "path"
    # Transduce!!!
    # NOTE(review): if neither (in_lang and out_lang) nor path is truthy,
    # `transducer` is unbound below — presumably the CLI declaration makes
    # in_lang/out_lang required; confirm against the click decorators.
    if in_lang and out_lang:
        transducer = make_g2p(in_lang, out_lang, tok_lang=tok_lang)
    elif path:
        transducer = Transducer(Mapping(path))
    tg = transducer(input_text)
    if check:
        transducer.check(tg, display_warnings=True)
    outputs = [tg.output_string]
    if pretty_edges:
        outputs += [tg.pretty_edges()]
    if debugger:
        outputs += [tg.edges, tg.debugger]
    # Pretty-print when more than one output was requested, else echo plainly.
    if len(outputs) > 1:
        click.echo(pprint.pformat(outputs, indent=4))
    else:
        click.echo(tg.output_string)