Example 1
def get_alternative_formats():
    """
    Utils to get alternative formats for numbers.
    """
    one_alternatives = load_labels(
        get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    one_thousand_map = []
    for k in one_alternatives:
        default, alternative = k
        one_thousand_map.append((alternative.split()[1], alternative))
    one_thousand_map = pynini.string_map(one_thousand_map)

    one_thousand_alternative = pynini.cdrewrite(one_thousand_map, "[BOS]", "",
                                                NEMO_SIGMA)

    # Adapted from
    # https://github.com/google/TextNormalizationCoveringGrammars/blob/master/src/universal/thousands_punct.grm
    # Specifies common ways of delimiting thousands in digit strings.
    t = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    separators = (pynutil.add_weight(t['dot_thousands'], 0.1)
                  | pynutil.add_weight(t['no_delimiter'], -0.1)
                  | pynutil.add_weight(t['space_thousands'], 0.1))
    alternative_formats = {}
    alternative_formats[
        'one_thousand_alternative'] = one_thousand_alternative.optimize()
    alternative_formats['separators'] = separators.optimize()
    return alternative_formats
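
A minimal, self-contained sketch of the same FAR read-by-key pattern (the file name, key, and rule here are illustrative, not part of the grammar above):

import pynini
from pynini.lib import rewrite

# Write a toy FAR holding one keyed rewrite rule.
sigma_star = pynini.closure(pynini.union(*"0123456789,")).optimize()
strip_commas = pynini.cdrewrite(pynini.cross(",", ""), "", "", sigma_star)
with pynini.Far("/tmp/toy.far", "w") as far:
    far["NO_DELIMITER"] = strip_commas.optimize()

# Read an FST back by key, as get_alternative_formats() does with t[...].
t = pynini.Far("/tmp/toy.far")  # the default mode is "r"
print(rewrite.top_rewrite("1,000,000", t["NO_DELIMITER"]))  # -> "1000000"
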
Example 2
    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify", kind="classify")

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_en_itn.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info(f"Creating ClassifyFst grammars.")
            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal)
            decimal_graph = decimal.fst

            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
            date_graph = DateFst(ordinal=ordinal).fst
            word_graph = WordFst().fst
            time_graph = TimeFst().fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
            whitelist_graph = WhiteListFst().fst
            punct_graph = PunctuationFst().fst
            electronic_graph = ElectronicFst().fst
            telephone_graph = TelephoneFst(cardinal).fst

            classify = (pynutil.add_weight(whitelist_graph, 1.01)
                        | pynutil.add_weight(time_graph, 1.1)
                        | pynutil.add_weight(date_graph, 1.09)
                        | pynutil.add_weight(decimal_graph, 1.1)
                        | pynutil.add_weight(measure_graph, 1.1)
                        | pynutil.add_weight(cardinal_graph, 1.1)
                        | pynutil.add_weight(ordinal_graph, 1.1)
                        | pynutil.add_weight(money_graph, 1.1)
                        | pynutil.add_weight(telephone_graph, 1.1)
                        | pynutil.add_weight(electronic_graph, 1.1)
                        | pynutil.add_weight(word_graph, 100))

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                      token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Example 3
    def __init__(self,
                 deterministic: bool = True,
                 cache_dir: str = None,
                 overwrite_cache: bool = False):
        super().__init__(name="verbalize_final",
                         kind="verbalize",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(
                cache_dir,
                f"de_tn_{deterministic}_deterministic_verbalizer.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["verbalize"]
            logging.info(
                f'VerbalizeFinalFst graph was restored from {far_file}.')
        else:
            verbalize = VerbalizeFst(deterministic=deterministic).fst
            word = WordFst(deterministic=deterministic).fst

            types = verbalize | word
            graph = (pynutil.delete("tokens") + delete_space +
                     pynutil.delete("{") + delete_space + types +
                     delete_space + pynutil.delete("}"))
            graph = delete_space + pynini.closure(
                graph + delete_extra_space) + graph + delete_space

            self.fst = graph.optimize()
            if far_file:
                generator_main(far_file, {"verbalize": self.fst})
                logging.info(
                    f"VerbalizeFinalFst grammars are saved to {far_file}.")
Example 4
    def __init__(
        self,
        input_case: str,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        deterministic: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f"Creating ClassifyFst grammars.")

            word_graph = WordFst(deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(input_case=input_case, deterministic=deterministic).fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

            classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(word_graph, 100)

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                | (pynutil.insert(" ") + punct),
                1,
            )
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )

            graph = (
                token_plus_punct
                + pynini.closure(
                    (
                        pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                        | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                    )
                    + token_plus_punct
                ).optimize()
            )

            graph = delete_space + graph + delete_space
            graph |= punct

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Example 5
 def setUpClass(cls):
     super().setUpClass()
     fold = pynini.string_map((("A", "a"), ("B", "b"))).optimize()
     cls.far_path = tempfile.mkstemp(suffix=".far")[1]
     with pynini.Far(cls.far_path, "w") as far:
         far["DOWNCASE"] = fold
         far["UPCASE"] = fold.invert()
     cls.cascade = rule_cascade.RuleCascade(cls.far_path)
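
Note that fold.invert() above is destructive: it inverts fold in place and returns it, which is harmless here because fold is not reused afterwards. A sketch using the constructive pynini.invert(), which leaves the original intact:

import pynini
from pynini.lib import rewrite

fold = pynini.string_map((("A", "a"), ("B", "b"))).optimize()
downcase = pynini.closure(fold)               # accept whole sequences
upcase = pynini.closure(pynini.invert(fold))  # constructive; fold is unchanged
print(rewrite.top_rewrite("AB", downcase))  # -> "ab"
print(rewrite.top_rewrite("ab", upcase))    # -> "AB"
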
Example 6
 def test_iso_roundtrip(self, tag: str):
   tag = tag.upper()
   far_path = u.FAR_DIR / 'iso.far'
   with pynini.Far(file.AsResourcePath(far_path), 'r') as far:
     natv_to_iso = far[f'FROM_{tag}']
     iso_to_natv = far[f'TO_{tag}']
     self.assertFstProbablyIdentity([natv_to_iso, iso_to_natv],
                                    token_type='byte',
                                    samples=test_util.NUM_TEST_SAMPLES)
Example 7
 def test_romanization_roundtrip(self):
     far_path = u.FAR_DIR / 'reversible_roman.far'
     with pynini.Far(uf.AsResourcePath(far_path), 'r') as far:
         natv_to_latin = far['FROM_ARAB']
         latin_to_natv = far['TO_ARAB']
         round_trip = natv_to_latin @ latin_to_natv
         self.assertFstProbablyFunctional(round_trip,
                                          token_type='byte',
                                          samples=ut.NUM_TEST_SAMPLES)
Example 8
    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal",
                         kind="classify",
                         deterministic=deterministic)

        graph = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3)
            | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
        self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
            pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT +
            NEMO_DIGIT + NEMO_DIGIT)) @ graph

        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        single_digits_graph = pynutil.add_weight(
            pynini.invert(graph_digit | graph_zero), 1.2) | pynutil.add_weight(
                pynini.cross("0", "oh"), 1.1)
        self.single_digits_graph = single_digits_graph + pynini.closure(
            pynutil.insert(" ") + single_digits_graph)

        if not deterministic:
            single_digits_graph_with_commas = pynini.closure(
                self.single_digits_graph + pynutil.insert(" "), 1,
                3) + pynini.closure(
                    pynutil.delete(",") + single_digits_graph +
                    pynutil.insert(" ") + single_digits_graph +
                    pynutil.insert(" ") + single_digits_graph,
                    1,
                )
            self.graph |= self.single_digits_graph | get_hundreds_graph(
            ) | single_digits_graph_with_commas
            self.range_graph = (
                pynini.closure(pynutil.insert("from "), 0, 1) + self.graph +
                (pynini.cross("-", " to ") | pynini.cross("-", " ")) +
                self.graph)

            self.range_graph |= self.graph + (pynini.cross(
                "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
            self.range_graph = self.range_graph.optimize()

        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)
        final_graph = self.graph | pynutil.add_weight(self.get_serial_graph(),
                                                      1.2)

        if not deterministic:
            final_graph |= self.range_graph

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + final_graph + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Example 9
 def testFilledExporter(self):
     """Export two FSTs and check that they are stored in the file."""
     filename = os.path.join(FLAGS.test_tmpdir, 'test.far')
     FLAGS.output = filename
     with self.assertRaises(SystemExit):
         grm.run(generator_method)
     with pynini.Far(filename, 'r') as far:
         stored_fsts = dict(far)
     self.assertLen(stored_fsts, 2)
     self.assertTrue(stored_fsts['FST1'])
     self.assertTrue(stored_fsts['FST2'])
Example 10
def OpenFstFromFarSafe(far_dir: pathlib.Path, far_name: str, token_type: str,
                       fst_name: str, default: pynini.Fst) -> pynini.Fst:
    """Returns FST from a given FAR; returns default if FST is not found."""
    tt_suffix = {"byte": "", "utf8": "_utf8"}[token_type]
    far_path = far_dir / f"{far_name}{tt_suffix}.far"
    if not IsFileExist(far_path):
        return default
    with pynini.Far(AsResourcePath(far_path), "r") as far:
        try:
            return far[fst_name.upper()]
        except KeyError:
            return default
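
A hypothetical call (the directory and names are illustrative); if the FAR file or the key is missing, the caller's default FST comes back unchanged:

import pathlib
import pynini

default = pynini.accep("")  # harmless fallback
fst = OpenFstFromFarSafe(pathlib.Path("/tmp/grammars"), "roman", "byte",
                         "from_arab", default)
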
Example 11
  def setUp(self):
    super().setUp()
    self._letters_proto = letter_languages.read_textproto(
        u.LANG_DIR / 'letter_languages.textproto')
    self._roman_proto = unicode_strings_util.read_textproto(
        u.LANG_DIR / 'reversible_roman.textproto')

    far_path = u.FAR_DIR / 'reversible_roman.far'
    with pynini.Far(uf.AsResourcePath(far_path), 'r') as far:
      natv_to_roman = far['FROM_ARAB']
      roman_to_natv = far['TO_ARAB']
      self._round_trip = natv_to_roman @ roman_to_natv
Example 12
    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "en_tn_post_processing.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
            logging.info(
                f'Post processing graph was restored from {far_file}.')
        else:
            self.set_punct_dict()
            self.fst = self.get_punct_postprocess_graph()

            if far_file:
                generator_main(far_file, {"post_process_graph": self.fst})
Example 13
def run_experiments(roots1, roots2):
    """Runs FLAGS.number_of_experiments experiments.

  Args:
    roots1: A Roots class instance
    roots2: A Roots class instance
  """
    mapping_rule = py.Far(FLAGS.far)[FLAGS.mapping_rule]
    for i in range(FLAGS.number_of_experiments):
        zipped = produce_paired_etyma(roots1, roots2)
        success = 0
        for (e1, e2) in zipped:
            if best_score(
                    e1 * mapping_rule * e2) <= FLAGS.levenshtein_threshold:
                print("{}\t{}".format(e1, e2))
                success += 1
        print("RUN:\t{}\t{}".format(i, success))
        sys.stdout.flush()
Example 14
def main(unused_argv):
    far = py.Far(FLAGS.far)
    fst = far[FLAGS.rule]
    # Note that we tried to push weights to the beginning so that we don't get
    # spurious selection of "free" cases where the first byte of a UTF8 character
    # has no weight.
    #
    #   fst = py.push(fst, push_weights=True, to_final=False)
    #
    # However this seems to produce artifacts of its own like endless series of
    # Greek roots starting with "drai".  On the other hand without it PAN gets
    # endless roots starting with ñ.
    if FLAGS.push:
        fst = py.push(fst, push_weights=True, to_final=False)
    rand = py.randgen(fst,
                      npath=FLAGS.npaths,
                      seed=int(time.time()),
                      select="log_prob",
                      weighted=True)
    print(Counter([p for p in rand.paths().ostrings()]))
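
A self-contained sketch of the randgen() sampling in main(), with a toy weighted acceptor standing in for the FAR rule (weighted=True is dropped so that Counter sees one entry per sampled path):

import pynini
from collections import Counter

# Two strings with different tropical weights; log_prob selection samples
# arcs with probability proportional to exp(-weight).
fst = (pynini.accep("aa", weight=0.5)
       | pynini.accep("bb", weight=2.0)).optimize()
rand = pynini.randgen(fst, npath=100, seed=42, select="log_prob")
print(Counter(rand.paths().ostrings()))  # "aa" should dominate
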
Example 15
    def __init__(self):
        super().__init__(name="cardinal", kind="classify")

        graph = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3)
            | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
        self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
            pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT +
            NEMO_DIGIT + NEMO_DIGIT)) @ graph

        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + self.graph + pynutil.insert("\"")

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Example 16
def get_alternative_formats():
    """
    Utils to get alternative formats for numbers.
    """
    one_alternatives = load_labels(
        get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    one_thousand_map = []
    for k in one_alternatives:
        default, alternative = k
        one_thousand_map.append((alternative.split()[1], alternative))
    one_thousand_map = pynini.string_map(one_thousand_map)

    one_thousand_alternative = pynini.cdrewrite(one_thousand_map, "[BOS]", "",
                                                NEMO_SIGMA)

    t = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    separators = (pynutil.add_weight(t['dot_thousands'], 0.1)
                  | pynutil.add_weight(t['no_delimiter'], -0.1)
                  | pynutil.add_weight(t['space_thousands'], 0.1))
    alternative_formats = {}
    alternative_formats['one_thousand_alternative'] = one_thousand_alternative
    alternative_formats['separators'] = separators
    return alternative_formats
Example 17
 def close(self) -> None:
     """Writes the registered FSTs into the given file and closes it."""
     assert self._is_open
     logging.info('Writing FSTs into \'%s\'.', self._filename)
     # TODO(b/123775699): Currently pytype is unable to resolve
     # the usage of typing.Literal for pynini.Far.__init__'s far_type, producing
     # the error:
     #
     #  Expected: (self, filename, mode, arc_type, far_type: Literal[str] = ...)
     #  Actually passed: (self, filename, mode, arc_type, far_type: Literal[str])
     #
     # Once typing.Literal support no longer triggers this error, drop
     # the below pytype disable comment.
     with pynini.Far(self._filename,
                     'w',
                     arc_type=self._arc_type,
                     far_type=self._far_type) as sink:  # pytype: disable=wrong-arg-types
         for name in sorted(self._fsts):
             logging.info('Writing FST \'%s\' into \'%s\'.', name,
                          self._filename)
             sink[name] = self._fsts[name]
     logging.info('Writing FSTs into \'%s\' done.', self._filename)
     self._is_open = False
Example 18
 def _LoadFar(self) -> pynini.Far:
     return pynini.Far(uf.AsResourcePath(self._path_to_far))
Example 19
def get_number_names():
    """
    Creates numbers names.

    Based on: 1) Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
    Transactions of the Association for Computational Linguistics 4: 507-519.
    and 2) Ng, A. H., Gorman, K., and Sproat, R. 2017.
    Minimally supervised written-to-spoken text normalization. In ASRU, pages 665-670.
    """
    a = pynini.Far(get_abs_path('data/utils/util_arithmetic.far'), mode='r')
    d = a['DELTA_STAR']
    f = a['IARITHMETIC_RESTRICTED']
    g = pynini.Fst.read(get_abs_path('data/utils/g.fst'))
    fg = (d @ (f @ (f @ (f @ g).optimize()).optimize()).optimize()).optimize()
    assert rewrite.top_rewrite("230", fg) == "(+ 200 30 +)"

    # Compiles lexicon transducers (L).
    cardinal_name_nominative = pynini.string_file(
        get_abs_path("data/numbers/1_cardinals_nominative_именительный.tsv")
    ).optimize()
    cardinal_name_genitive = pynini.string_file(
        get_abs_path(
            "data/numbers/2_cardinals_genitive_родительный.tsv")).optimize()
    cardinal_name_dative = pynini.string_file(
        get_abs_path(
            "data/numbers/3_cardinals_dative_датильный.tsv")).optimize()
    cardinal_name_accusative = pynini.string_file(
        get_abs_path(
            "data/numbers/4_cardinals_accusative_винительный.tsv")).optimize()
    cardinal_name_instrumental = pynini.string_file(
        get_abs_path("data/numbers/5_cardinals_instrumental_творительный.tsv")
    ).optimize()
    cardinal_name_prepositional = pynini.string_file(
        get_abs_path("data/numbers/6_cardinals_prepositional_предложный.tsv")
    ).optimize()

    cardinal_l = (
        pynini.closure(cardinal_name_nominative + pynini.accep(" ")) +
        cardinal_name_nominative).optimize()
    for case in [
            cardinal_name_genitive,
            cardinal_name_dative,
            cardinal_name_accusative,
            cardinal_name_instrumental,
            cardinal_name_prepositional,
    ]:
        cardinal_l |= (pynini.closure(case + pynini.accep(" ")) +
                       case).optimize()

    # Numbers up to 1000 in nominative case (to use, for example, with telephone)
    nominative_up_to_thousand_name = pynini.string_file(
        get_abs_path("data/numbers/cardinals_nominative_case.tsv"))
    nominative_up_to_thousand_name_l = (
        pynini.closure(nominative_up_to_thousand_name + pynini.accep(" ")) +
        nominative_up_to_thousand_name).optimize()

    # Convert e.g. "(* 5 1000 *)" back to "5000" so complex ordinals will be formed correctly,
    # e.g. "пятитысячный" will eventually be formed. (If we didn't do this, the incorrect
    # phrase "пять тысячный" would be formed.)
    # We do this for all thousands from "(* 2 1000 *)" -> "2000" to "(* 20 1000 *)" -> "20000".
    # We do not go higher, to keep the WFST graph from growing even larger.
    complex_numbers = pynini.cross("(* 2 1000 *)", "2000")
    for number in range(3, 21):
        complex_numbers |= pynini.cross(f"(* {number} 1000 *)", f"{number}000")

    complex_numbers = (NEMO_SIGMA + pynutil.add_weight(complex_numbers, -1) +
                       pynini.closure(pynini.union(" ", ")", "(", "+", "*")))
    fg_ordinal = pynutil.add_weight(pynini.compose(fg, complex_numbers),
                                    -1) | fg
    ordinal_name = pynini.string_file(
        get_abs_path("data/numbers/ordinals.tsv"))
    ordinal_l = (pynini.closure(cardinal_name_nominative + pynini.accep(" ")) +
                 ordinal_name).optimize()

    # Composes L with the leaf transducer (P), then composes that with FG.
    p = a['LEAVES']
    number_names = {}
    number_names['ordinal_number_names'] = (
        fg_ordinal @ (p @ ordinal_l)).optimize()
    number_names['cardinal_number_names'] = (fg @ (p @ cardinal_l)).optimize()
    number_names['nominative_up_to_thousand_names'] = (
        fg @ (p @ nominative_up_to_thousand_name_l)).optimize()
    return number_names
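
The assert near the top shows the factorization format: "230" becomes "(+ 200 30 +)" before the lexicon maps each factor to a word. A tiny standalone sketch of the cross()-union trick used for complex_numbers:

import pynini
from pynini.lib import rewrite

# Map "(* N 1000 *)" back to the digit string "N000" for N in 2..20.
refold = pynini.cross("(* 2 1000 *)", "2000")
for n in range(3, 21):
    refold |= pynini.cross(f"(* {n} 1000 *)", f"{n}000")
print(rewrite.top_rewrite("(* 5 1000 *)", refold.optimize()))  # -> "5000"
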
Example 20
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f"Creating ClassifyFst grammars.")
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).fst
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(input_case=input_case,
                                           deterministic=deterministic,
                                           input_file=whitelist).fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst
            serial_graph = SerialFst(cardinal=cardinal,
                                     ordinal=ordinal,
                                     deterministic=deterministic).fst

            v_time_graph = vTimeFst(deterministic=deterministic).fst
            v_ordinal_graph = vOrdinalFst(deterministic=deterministic)
            v_date_graph = vDateFst(ordinal=v_ordinal_graph,
                                    deterministic=deterministic).fst
            time_final = pynini.compose(time_graph, v_time_graph)
            date_final = pynini.compose(date_graph, v_date_graph)
            range_graph = RangeFst(time=time_final,
                                   date=date_final,
                                   cardinal=cardinal,
                                   deterministic=deterministic).fst

            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.1)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.1)
                | pynutil.add_weight(range_graph, 1.1)
                | pynutil.add_weight(
                    serial_graph,
                    1.1001)  # should be higher than the rest of the classes
            )

            # roman_graph = RomanFst(deterministic=deterministic).fst
            # classify |= pynutil.add_weight(roman_graph, 1.1)

            if not deterministic:
                abbreviation_graph = AbbreviationFst(
                    deterministic=deterministic).fst
                classify |= pynutil.add_weight(abbreviation_graph, 100)

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=2.1) + pynutil.insert(" }")
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                               delete_extra_space)
                | (pynutil.insert(" ") + punct),
                1,
            )

            classify |= pynutil.add_weight(word_graph, 100)
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(
                (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                                delete_extra_space)
                 | (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
                token_plus_punct)

            graph = delete_space + graph + delete_space
            graph |= punct

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Example 21
def _read_fst_map(filename):
    with pynini.Far(filename) as far:
        stored_fsts = dict(far)
    return stored_fsts
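
dict(far) drains the FAR's (key, FST) pairs into a plain dict, so later lookups no longer touch the file. Hypothetical usage against the toy FAR written in the sketch under Example 1:

fsts = _read_fst_map("/tmp/toy.far")
print(sorted(fsts))  # -> ['NO_DELIMITER']
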
Example 22
                                       closure).optimize()

# Vowel insertion to break consonant clusters caused by suffixes
insertion = pynini.cdrewrite(pynini.transducer("", "e"), consonants, suffixes,
                             closure).optimize()

# Finnish seems to preserve mora count with /s/ as a syllable end: a stop is
# generated that assimilates the 'highness' of the vowel and becomes /k/.
# If this generated stop occurs after VV, it instead assimilates /s/ and
# becomes /t/; gradation then occurs due to the /e/ insertion.
# A similar situation seems to occur with /s/ -> /a/ / /a/ _ + suffix, so it
# was added to the transducer.
final_stress_preservation = pynini.cdrewrite(
    pynini.transducer("s", "t"), vowels +
    (pynini.acceptor("y") | "u"), suffixes, closure) * pynini.cdrewrite(
        pynini.transducer("", "k"),
        pynini.acceptor("y") | "u",
        "s" + suffixes, closure) * pynini.cdrewrite(
            pynini.transducer("s", "a"), "a", suffixes, closure)
final_stress_preservation.optimize()

# Rule for /nt/ assimilation.
nt_assimilation = pynini.cdrewrite(pynini.transducer("t", "n"), "n",
                                   vowels + suffixes, closure).optimize()

# Composition of rules
transducer_adessive = regularize * transducer_adessive_base * nt_assimilation * final_stress_preservation * insertion * consonant_reduction * rvregularize
transducer_inessive = regularize * transducer_inessive_base * nt_assimilation * final_stress_preservation * insertion * consonant_reduction * rvregularize

######################### Generates FAR #########################
with pynini.Far("finnish.far", "w") as sink:
    sink["ADESSIVE"] = transducer_adessive
    sink["INESSIVE"] = transducer_inessive
Example 23
 def __init__(self, far_path: str):
     self.far = pynini.Far(far_path, "r")
     self.rules = []
Example 24
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(
                f'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            whitelist = WhiteListFst(input_case=input_case,
                                     deterministic=deterministic,
                                     input_file=whitelist)
            whitelist_graph = whitelist.graph
            punct_graph = PunctuationFst(deterministic=deterministic).graph

            # VERBALIZERS
            cardinal = vCardinal(deterministic=deterministic)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=deterministic)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=deterministic)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=deterministic).fst
            v_electronic_graph = vElectronic(deterministic=deterministic).fst
            measure = vMeasure(decimal=decimal,
                               cardinal=cardinal,
                               fraction=fraction,
                               deterministic=deterministic)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=deterministic).fst
            v_date_graph = vDate(ordinal=ordinal,
                                 deterministic=deterministic).fst
            v_money_graph = vMoney(decimal=decimal,
                                   deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_abbreviation = vAbbreviation(deterministic=deterministic).fst

            classify_and_verbalize = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(pynini.compose(time_graph, v_time_graph),
                                     1.1)
                | pynutil.add_weight(
                    pynini.compose(decimal_graph, v_decimal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(measure_graph, v_measure_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(cardinal_graph, v_cardinal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(ordinal_graph, v_ordinal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(telephone_graph, v_telephone_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(electronic_graph, v_electronic_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(fraction_graph, v_fraction_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(money_graph, v_money_graph), 1.1)
                | pynutil.add_weight(word_graph, 100)
                | pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                                     1.09)).optimize()

            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(roman_graph, v_roman_graph), 100)

                abbreviation_graph = AbbreviationFst(
                    whitelist=whitelist, deterministic=deterministic).fst
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(abbreviation_graph, v_abbreviation), 100)

            punct = pynutil.add_weight(punct_graph, weight=1.1)
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                classify_and_verbalize +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                      token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()
            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')
Example 25
    def __init__(self,
                 input_case: str,
                 deterministic: bool = True,
                 cache_dir: str = None,
                 overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f"Creating ClassifyFst grammars.")
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).fst
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(input_case=input_case,
                                           deterministic=deterministic).fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

            classify = (pynutil.add_weight(whitelist_graph, 1.01)
                        | pynutil.add_weight(time_graph, 1.1)
                        | pynutil.add_weight(date_graph, 1.09)
                        | pynutil.add_weight(decimal_graph, 1.1)
                        | pynutil.add_weight(measure_graph, 1.1)
                        | pynutil.add_weight(cardinal_graph, 1.1)
                        | pynutil.add_weight(ordinal_graph, 1.1)
                        | pynutil.add_weight(money_graph, 1.1)
                        | pynutil.add_weight(telephone_graph, 1.1)
                        | pynutil.add_weight(electronic_graph, 1.1)
                        | pynutil.add_weight(fraction_graph, 1.1)
                        | pynutil.add_weight(word_graph, 100))

            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify |= pynutil.add_weight(roman_graph, 100)

                abbreviation_graph = AbbreviationFst(
                    deterministic=deterministic).fst
                classify |= pynutil.add_weight(abbreviation_graph, 100)

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                      token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Example 26
    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal",
                         kind="classify",
                         deterministic=deterministic)
        # TODO: replace to have "oh" as a default for "0"
        graph = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3)
            | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
        self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
            pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT +
            NEMO_DIGIT + NEMO_DIGIT)) @ graph

        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

        single_digits_graph = pynini.invert(graph_digit | graph_zero)
        self.single_digits_graph = single_digits_graph + pynini.closure(
            insert_space + single_digits_graph)

        if not deterministic:
            # for a single token allow only the same normalization
            # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
            single_digits_graph_zero = pynini.invert(graph_digit | graph_zero)
            single_digits_graph_oh = pynini.invert(graph_digit) | pynini.cross(
                "0", "oh")

            self.single_digits_graph = single_digits_graph_zero + pynini.closure(
                insert_space + single_digits_graph_zero)
            self.single_digits_graph |= single_digits_graph_oh + pynini.closure(
                insert_space + single_digits_graph_oh)

            single_digits_graph_with_commas = pynini.closure(
                self.single_digits_graph + insert_space, 1,
                3) + pynini.closure(
                    pynutil.delete(",") + single_digits_graph + insert_space +
                    single_digits_graph + insert_space + single_digits_graph,
                    1,
                )

            self.range_graph = pynutil.insert(
                "from ") + self.graph + pynini.cross("-", " to ") + self.graph
            self.range_graph |= self.graph + (pynini.cross(
                "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
            self.range_graph |= (pynutil.insert("from ") +
                                 get_hundreds_graph() +
                                 pynini.cross("-", " to ") +
                                 get_hundreds_graph())
            self.range_graph = self.range_graph.optimize()

        serial_graph = self.get_serial_graph()
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        if deterministic:
            long_numbers = pynini.compose(NEMO_DIGIT**(5, ...),
                                          self.single_digits_graph).optimize()
            final_graph = self.graph | serial_graph | pynutil.add_weight(
                long_numbers, -0.001)
            cardinal_with_leading_zeros = pynini.compose(
                pynini.accep("0") + pynini.closure(NEMO_DIGIT),
                self.single_digits_graph)
            final_graph |= cardinal_with_leading_zeros
        else:

            leading_zeros = pynini.compose(
                pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
            cardinal_with_leading_zeros = (
                leading_zeros + pynutil.insert(" ") +
                pynini.compose(pynini.closure(NEMO_DIGIT), self.graph))

            final_graph = (self.graph
                           | serial_graph
                           | self.range_graph
                           | self.single_digits_graph
                           | get_hundreds_graph()
                           | pynutil.add_weight(
                               single_digits_graph_with_commas, 0.001)
                           | cardinal_with_leading_zeros)

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + final_graph + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)

        self.fst = final_graph.optimize()
Example 27
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            no_digits = pynini.closure(pynini.difference(
                NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(
                f'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            deterministic_ordinal = OrdinalFst(cardinal=cardinal,
                                               deterministic=True)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            whitelist = WhiteListFst(input_case=input_case,
                                     deterministic=deterministic,
                                     input_file=whitelist)
            whitelist_graph = whitelist.graph
            punct_graph = PunctuationFst(deterministic=deterministic).graph
            serial_graph = SerialFst(cardinal=cardinal,
                                     ordinal=deterministic_ordinal,
                                     deterministic=deterministic).fst

            # VERBALIZERS
            cardinal = vCardinal(deterministic=deterministic)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=deterministic)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=deterministic)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=deterministic).fst
            v_electronic_graph = vElectronic(deterministic=deterministic).fst
            measure = vMeasure(decimal=decimal,
                               cardinal=cardinal,
                               fraction=fraction,
                               deterministic=deterministic)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=deterministic).fst
            v_date_graph = vDate(ordinal=ordinal,
                                 deterministic=deterministic).fst
            v_money_graph = vMoney(decimal=decimal,
                                   deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_abbreviation = vAbbreviation(deterministic=deterministic).fst

            det_v_time_graph = vTime(deterministic=True).fst
            det_v_date_graph = vDate(ordinal=vOrdinal(deterministic=True),
                                     deterministic=True).fst
            time_final = pynini.compose(time_graph, det_v_time_graph)
            date_final = pynini.compose(date_graph, det_v_date_graph)
            range_graph = RangeFst(time=time_final,
                                   date=date_final,
                                   cardinal=CardinalFst(deterministic=True),
                                   deterministic=deterministic).fst
            v_word_graph = vWord(deterministic=deterministic).fst

            sem_w = 1
            word_w = 100
            punct_w = 2
            classify_and_verbalize = (
                pynutil.add_weight(whitelist_graph, sem_w)
                | pynutil.add_weight(pynini.compose(time_graph, v_time_graph),
                                     sem_w)
                | pynutil.add_weight(
                    pynini.compose(decimal_graph, v_decimal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(measure_graph, v_measure_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(cardinal_graph, v_cardinal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(ordinal_graph, v_ordinal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(telephone_graph, v_telephone_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(electronic_graph, v_electronic_graph),
                    sem_w)
                | pynutil.add_weight(
                    pynini.compose(fraction_graph, v_fraction_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(money_graph, v_money_graph), sem_w)
                | pynutil.add_weight(word_graph, word_w)
                | pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                                     sem_w - 0.01)
                | pynutil.add_weight(pynini.compose(range_graph, v_word_graph),
                                     sem_w)
                | pynutil.add_weight(
                    pynini.compose(serial_graph, v_word_graph),
                    1.1001)  # should be higher than the rest of the classes
            ).optimize()

            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(roman_graph, v_roman_graph), word_w)

                abbreviation_graph = AbbreviationFst(
                    whitelist=whitelist, deterministic=deterministic).fst
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(abbreviation_graph, v_abbreviation), word_w)

            punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                               delete_extra_space)
                | (pynutil.insert(" ") + punct_only),
                1,
            )

            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                classify_and_verbalize +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(
                (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                                delete_extra_space)
                 | (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
                token_plus_punct)

            graph |= punct_only + pynini.closure(punct)
            graph = delete_space + graph + delete_space

            remove_extra_spaces = pynini.closure(
                NEMO_NOT_SPACE,
                1) + pynini.closure(delete_extra_space +
                                    pynini.closure(NEMO_NOT_SPACE, 1))
            remove_extra_spaces |= (
                pynini.closure(pynutil.delete(" "), 1) +
                pynini.closure(NEMO_NOT_SPACE, 1) +
                pynini.closure(delete_extra_space +
                               pynini.closure(NEMO_NOT_SPACE, 1)))

            graph = pynini.compose(graph.optimize(),
                                   remove_extra_spaces).optimize()
            self.fst = graph
            no_digits = pynini.closure(pynini.difference(
                NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(graph, no_digits).optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')
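A minimal usage sketch, assuming the class above has been instantiated as classifier (the variable name and the sample sentence are illustrative, not from the source): its fst, or fst_no_digits when digits must not appear in the output, can be applied to raw text with plain pynini by composing the input with the grammar and taking the lowest-weight path.

import pynini

def apply_fst(text: str, fst: pynini.Fst) -> str:
    # Escape pynini's reserved characters ([, ], \) in the raw input,
    # compose it with the grammar, and return the cheapest output string.
    lattice = pynini.escape(text) @ fst
    return pynini.shortestpath(lattice).string()

# classifier = ClassifyFst(...)  # the class the snippet above belongs to
# print(apply_fst("call me at 12:30", classifier.fst))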
Example n. 28
import argparse
import logging
import os
import subprocess
import tempfile

import pynini


def main(args: argparse.Namespace) -> None:
    with pynini.Far(args.far_path, mode="r") as far_reader:
        encoder = pynini.EncodeMapper(
            far_reader.arc_type(), encode_labels=True
        )
        fsa_path = tempfile.mkstemp(text=True)[1]
        with pynini.Far(
            fsa_path,
            mode="w",
            arc_type=far_reader.arc_type(),
            far_type="default",
        ) as far_writer:
            while not far_reader.done():
                fst = far_reader.get_fst()
                assert fst.verify(), "FST is ill-formed"
                fst.encode(encoder)
                far_writer.add(far_reader.get_key(), fst)
                far_reader.next()
        count_path = tempfile.mkstemp(text=True)[1]
        lm_path = tempfile.mkstemp(text=True)[1]
        logging.info(
            "alignment.far has been encoded as FSAs; starting n-gram training."
        )
        cmd = [
            "ngramcount",
            "--require_symbols=false",
            f"--order={args.order}",
            fsa_path,
            count_path,
        ]
        subprocess.check_call(cmd)
        os.remove(fsa_path)
        cmd1 = [
            "ngrammake",
            f"--method={args.smoothing_method}",
            count_path,
            lm_path,
        ]
        subprocess.check_call(cmd1)
        os.remove(count_path)
        if args.shrinking_method:
            shrunk_lm_path = tempfile.mkstemp(text=True)[1]
            cmd = [
                "ngramshrink",
                f"--method={args.shrinking_method}",
                f"--target_number_of_ngrams={args.target_number_of_ngrams}",
                lm_path,
                shrunk_lm_path,
            ]
            subprocess.check_call(cmd)
            lm_path = shrunk_lm_path
        logging.info(
            "%s-gram %s language model is trained.",
            args.order,
            args.smoothing_method,
        )
        # Decoding the LM
        model = pynini.Fst.read(lm_path)
        os.remove(lm_path)
        model.decode(encoder)
        model.write(args.model_path)

    logging.info(
        "%s-gram %s language model is built.",
        args.order,
        args.smoothing_method,
    )
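For context, a sketch of the command-line wiring this main() expects. The flag names are inferred from the args attributes used above; the defaults are illustrative assumptions, not values from the original script.

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(
        description="Train an n-gram LM over an encoded alignment FAR.")
    parser.add_argument("--far_path", required=True,
                        help="path to the input alignment FAR")
    parser.add_argument("--model_path", required=True,
                        help="where to write the decoded LM FST")
    parser.add_argument("--order", type=int, default=6,
                        help="n-gram order")
    parser.add_argument("--smoothing_method", default="kneser_ney",
                        help="smoothing method passed to ngrammake")
    parser.add_argument("--shrinking_method", default="",
                        help="ngramshrink method; empty string disables shrinking")
    parser.add_argument("--target_number_of_ngrams", type=int, default=100000,
                        help="size target used when shrinking")
    main(parser.parse_args())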
Example n. 29
    def __init__(
        self,
        input_case: str,
        deterministic: bool = False,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir, f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info("Creating ClassifyFst grammars. This might take some time...")

            self.cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = self.cardinal.fst

            self.ordinal = OrdinalFst(cardinal=self.cardinal, deterministic=deterministic)
            ordinal_graph = self.ordinal.fst

            self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
            decimal_graph = self.decimal.fst

            self.fraction = FractionFst(cardinal=self.cardinal, deterministic=deterministic)
            fraction_graph = self.fraction.fst
            self.measure = MeasureFst(
                cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic
            )
            measure_graph = self.measure.fst
            self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic)
            date_graph = self.date.fst
            word_graph = WordFst(deterministic=deterministic).fst
            self.time = TimeFst(deterministic=deterministic)
            time_graph = self.time.fst
            self.telephone = TelephoneFst(cardinal=self.cardinal, deterministic=deterministic)
            telephone_graph = self.telephone.fst
            self.electronic = ElectronicFst(deterministic=deterministic)
            electronic_graph = self.electronic.fst
            self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
            money_graph = self.money.fst
            self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
            whitelist_graph = self.whitelist.fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

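            # Lower weight wins during shortest-path decoding: the whitelist
            # (1.01) takes priority over the other semiotic classes (1.1), and
            # the generic word fallback (100) fires only when nothing matches.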
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(decimal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(word_graph, 100)
            )

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )

            graph = token_plus_punct + pynini.closure(pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
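A usage sketch for the German grammar above; the instantiation arguments and the sample input are assumptions. The shortest path through the classifier yields the "tokens { ... }" string that a downstream verbalizer consumes.

import pynini

de_classifier = ClassifyFst(input_case="cased", cache_dir=None)
lattice = pynini.escape("Am 1. Januar 2022") @ de_classifier.fst
# Prints one classification such as: tokens { date { ... } } ...
print(pynini.shortestpath(lattice).string())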
Example n. 30
    def __init__(self,
                 input_case: str,
                 deterministic: bool = False,
                 cache_dir: str = None,
                 overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)
        if deterministic:
            raise ValueError(
                'Ru TN only supports non-deterministic cases and produces multiple normalization options.'
            )
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_ru_tn_{deterministic}_deterministic.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info(
                "Creating ClassifyFst grammars. This might take some time...")
            number_names = get_number_names()
            alternative_formats = get_alternative_formats()

            self.cardinal = CardinalFst(
                number_names=number_names,
                alternative_formats=alternative_formats,
                deterministic=deterministic)
            cardinal_graph = self.cardinal.fst

            self.ordinal = OrdinalFst(number_names=number_names,
                                      alternative_formats=alternative_formats,
                                      deterministic=deterministic)
            ordinal_graph = self.ordinal.fst

            self.decimal = DecimalFst(cardinal=self.cardinal,
                                      deterministic=deterministic)
            decimal_graph = self.decimal.fst

            self.measure = MeasureFst(cardinal=self.cardinal,
                                      decimal=self.decimal,
                                      deterministic=deterministic)
            measure_graph = self.measure.fst
            self.date = DateFst(number_names=number_names,
                                deterministic=deterministic)
            date_graph = self.date.fst
            word_graph = WordFst(deterministic=deterministic).fst
            self.time = TimeFst(number_names=number_names,
                                deterministic=deterministic)
            time_graph = self.time.fst
            self.telephone = TelephoneFst(number_names=number_names,
                                          deterministic=deterministic)
            telephone_graph = self.telephone.fst
            self.electronic = ElectronicFst(deterministic=deterministic)
            electronic_graph = self.electronic.fst
            self.money = MoneyFst(cardinal=self.cardinal,
                                  decimal=self.decimal,
                                  deterministic=deterministic)
            money_graph = self.money.fst
            self.whitelist = WhiteListFst(input_case=input_case,
                                          deterministic=deterministic)
            whitelist_graph = self.whitelist.fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

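            # Weight scheme for shortest-path decoding: measures (0.9) and
            # dates (1.09) are preferred over the remaining classes (1.1);
            # plain words are the last-resort fallback (100).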
            classify = (pynutil.add_weight(whitelist_graph, 1.01)
                        | pynutil.add_weight(time_graph, 1.1)
                        | pynutil.add_weight(date_graph, 1.09)
                        | pynutil.add_weight(decimal_graph, 1.1)
                        | pynutil.add_weight(measure_graph, 0.9)
                        | pynutil.add_weight(cardinal_graph, 1.1)
                        | pynutil.add_weight(ordinal_graph, 1.1)
                        | pynutil.add_weight(money_graph, 1.1)
                        | pynutil.add_weight(telephone_graph, 1.1)
                        | pynutil.add_weight(electronic_graph, 1.1)
                        | pynutil.add_weight(word_graph, 100))

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(
                pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
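Because the Russian grammar is explicitly non-deterministic, a natural usage sketch (assumed, not from the source) is to enumerate several candidate classifications with n-shortest paths:

import pynini

ru_classifier = ClassifyFst(input_case="cased")
lattice = pynini.escape("123") @ ru_classifier.fst
# Up to three distinct lowest-weight classifications for the same input.
for candidate in pynini.shortestpath(lattice, nshortest=3,
                                     unique=True).paths().ostrings():
    print(candidate)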