Esempio n. 1
0
 def testMandatoryRewrite(self):
     """Verifies that the deletion rule maps "fist" to the single output "fis".

     The rule deletes `self.td` when it follows `self.consonant` and precedes
     the "[EOS]" marker, over `self.sigstar`. Every rewrite helper is expected
     to agree on the result, and the reverse mapping must not match.
     """
     source = "fist"
     expected = "fis"
     deletion = pynutil.delete(self.td)
     rule = pynini.cdrewrite(deletion, self.consonant, "[EOS]", self.sigstar)
     rule = rule.optimize()
     outputs = tuple(rewrite.rewrites(source, rule))
     # pylint: disable=g-generic-assert
     self.assertEqual(len(outputs), 1)
     # pylint: enable=g-generic-assert
     self.assertEqual(expected, outputs[0])
     self.assertEqual(expected, rewrite.top_rewrite(source, rule))
     self.assertEqual(expected, rewrite.one_top_rewrite(source, rule))
     self.assertTrue(rewrite.matches(source, expected, rule))
     self.assertFalse(rewrite.matches(expected, source, rule))
Esempio n. 2
0
    def post_process(self, normalized_text: 'pynini.FstLike') -> str:
        """
        Applies the post processing graph to already normalized text.

        Surrounding whitespace is stripped first. If nothing remains, the
        stripped (empty) text is returned as is. Otherwise the text is escaped
        with `pynini.escape` and, when `self.post_processor` is set, rewritten
        with `top_rewrite` over `self.post_processor.fst`.

        Args:
            normalized_text: normalized text

        Returns: the post processed text, or the escaped text when no post
            processor is configured
        """
        text = normalized_text.strip()
        if text:
            text = pynini.escape(text)
            if self.post_processor is None:
                return text
            return top_rewrite(text, self.post_processor.fst)
        return text
Esempio n. 3
0
def get_number_names():
    """
    Builds the number name transducers.

    Based on: 1) Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
    Transactions of the Association for Computational Linguistics 4: 507-519.
    and 2) Ng, A. H., Gorman, K., and Sproat, R. 2017.
    Minimally supervised written-to-spoken text normalization. In ASRU, pages 665-670.

    Returns:
        A dict with the keys 'ordinal_number_names', 'cardinal_number_names' and
        'nominative_up_to_thousand_names', each mapped to an optimized FST.
    """

    def spaced_sequence(word, last_word):
        # Any number of "<word> " repetitions followed by one closing word.
        return (pynini.closure(word + pynini.accep(" ")) + last_word).optimize()

    far = pynini.Far(get_abs_path('data/utils/util_arithmetic.far'), mode='r')
    delta_star = far['DELTA_STAR']
    restricted = far['IARITHMETIC_RESTRICTED']

    # Factorization grammar: compose the restricted arithmetic FST onto G three
    # times, optimizing after every step, then put DELTA_STAR in front.
    fg = pynini.Fst.read(get_abs_path('data/utils/g.fst'))
    for _ in range(3):
        fg = (restricted @ fg).optimize()
    fg = (delta_star @ fg).optimize()
    # Sanity check of the loaded grammar on a known factorization.
    assert rewrite.top_rewrite("230", fg) == "(+ 200 30 +)"

    # Compiles lexicon transducers (L), one per grammatical case; the
    # nominative lexicon comes first.
    case_files = (
        "data/numbers/1_cardinals_nominative_именительный.tsv",
        "data/numbers/2_cardinals_genitive_родительный.tsv",
        "data/numbers/3_cardinals_dative_датильный.tsv",
        "data/numbers/4_cardinals_accusative_винительный.tsv",
        "data/numbers/5_cardinals_instrumental_творительный.tsv",
        "data/numbers/6_cardinals_prepositional_предложный.tsv",
    )
    case_lexicons = [
        pynini.string_file(get_abs_path(path)).optimize() for path in case_files
    ]
    nominative_lexicon = case_lexicons[0]

    # A cardinal is a space separated sequence of words drawn from one case.
    cardinal_l = spaced_sequence(nominative_lexicon, nominative_lexicon)
    for lexicon in case_lexicons[1:]:
        cardinal_l |= spaced_sequence(lexicon, lexicon)

    # Numbers up to 1000 in nominative case (to use, for example, with telephone)
    up_to_thousand_lexicon = pynini.string_file(
        get_abs_path("data/numbers/cardinals_nominative_case.tsv"))
    up_to_thousand_l = spaced_sequence(up_to_thousand_lexicon,
                                       up_to_thousand_lexicon)

    # Convert e.g. "(* 5 1000 *)" back to "5000" so complex ordinals are formed
    # correctly, e.g. "пятитысячный" will eventually be formed (otherwise the
    # incorrect phrase "пять тысячный" would be formed).
    # This covers all thousands from "(* 2 1000 *)" -> "2000" up to
    # "(* 20 1000 *)" -> "20000". It does not go higher, in order to prevent
    # the WFST graph becoming even larger.
    thousands = pynini.cross("(* 2 1000 *)", "2000")
    for multiplier in range(3, 21):
        thousands |= pynini.cross(f"(* {multiplier} 1000 *)",
                                  f"{multiplier}000")
    weighted_thousands = pynutil.add_weight(thousands, -1)
    trailing_symbols = pynini.closure(pynini.union(" ", ")", "(", "+", "*"))
    thousands = NEMO_SIGMA + weighted_thousands + trailing_symbols

    # The collapsed path gets weight -1; the plain FG path stays available.
    collapsed = pynini.compose(fg, thousands)
    fg_ordinal = pynutil.add_weight(collapsed, -1) | fg

    ordinal_lexicon = pynini.string_file(
        get_abs_path("data/numbers/ordinals.tsv"))
    # An ordinal is a run of nominative cardinal words closed by an ordinal word.
    ordinal_l = spaced_sequence(nominative_lexicon, ordinal_lexicon)

    # Composes L with the leaf transducer (P), then composes that with FG.
    leaves = far['LEAVES']
    return {
        'ordinal_number_names': (fg_ordinal @ (leaves @ ordinal_l)).optimize(),
        'cardinal_number_names': (fg @ (leaves @ cardinal_l)).optimize(),
        'nominative_up_to_thousand_names':
            (fg @ (leaves @ up_to_thousand_l)).optimize(),
    }
Esempio n. 4
0
 def encode(self, text: pynini.FstLike) -> str:
     """Returns the top rewrite of `text` under `self._encoder`."""
     encoded = rewrite.top_rewrite(text, self._encoder)
     return encoded