def _lexicon(path: str, token_type: pynini.TokenType) -> pynini.Fst:
    """Compiles a lexicon FST from a string file.

    Args:
      path: path to the input file.
      token_type: token type, or the path to a symbol table.

    Returns:
      A lexicon FST accepting space-separated sequences of lexicon entries.
    """
    entries = pynini.string_file(path,
                                 input_token_type=token_type,
                                 output_token_type=token_type)
    return pynutil.join(entries, " ").optimize()
Ejemplo n.º 2
0
 def __init__(self, chat_lexicon_path: str, lm_path: str) -> None:
     """Loads the language model and builds the chatspeak components.

     Args:
       chat_lexicon_path: path to the chatspeak lexicon file.
       lm_path: path to a serialized language-model FST.
     """
     self._lm = pynini.Fst.read(lm_path)
     assert self._lm.output_symbols(), "No LM output symbol table found"
     self._lm_syms = self._lm.output_symbols()
     # Every non-epsilon entry (label > 0) in the LM's output symbol table.
     words = [symbol for (label, symbol) in self._lm_syms if label > 0]
     words_fsa = pynini.string_map(words).optimize()
     self._deduplicator = chatspeak.Deduplicator(words_fsa)
     self._deabbreviator = chatspeak.Deabbreviator(words_fsa)
     self._regexps = chatspeak.Regexps()
     self._lexicon = chatspeak.Lexicon(chat_lexicon_path)
     # Maps space-separated byte-string words onto LM symbol-table labels;
     # the inverse direction maps LM labels back to byte strings.
     word_mapper = pynini.string_map(words,
                                     input_token_type="byte",
                                     output_token_type=self._lm_syms)
     self._bytes_to_lm_mapper = pynutil.join(word_mapper, " ").optimize()
     self._lm_to_bytes_mapper = pynini.invert(self._bytes_to_lm_mapper)
Ejemplo n.º 3
0
                              _sigma_star)

# The actual factorizer: pads the digit string, applies the raw
# factorization into digit/power-of-ten symbols, deletes spurious zeros,
# and repairs the teens.
# NOTE(review): _pad_zeros, _raw_factorizer, _del_zeros, _fix_teens, and
# _sigma_star are defined earlier in the file (outside this view).
_phi = (_pad_zeros @ _raw_factorizer @ _del_zeros @ _fix_teens).optimize()

# Verbalization map from factorized symbols to English number words,
# e.g. "2[E1]" -> "twenty", "[E3]" -> "thousand". The "[E1*]" symbol is
# presumably the teens marker emitted by _fix_teens — confirm upstream.
_lambda = pynini.string_map([("1", "one"), ("2", "two"), ("3", "three"),
                             ("4", "four"), ("5", "five"), ("6", "six"),
                             ("7", "seven"), ("8", "eight"), ("9", "nine"),
                             ("1[E1]", "ten"), ("1[E1*]1", "eleven"),
                             ("1[E1*]2", "twelve"), ("1[E1*]3", "thirteen"),
                             ("1[E1*]4", "fourteen"), ("1[E1*]5", "fifteen"),
                             ("1[E1*]6", "sixteen"), ("1[E1*]7", "seventeen"),
                             ("1[E1*]8", "eighteen"), ("1[E1*]9", "nineteen"),
                             ("2[E1]", "twenty"), ("3[E1]", "thirty"),
                             ("4[E1]", "forty"), ("5[E1]", "fifty"),
                             ("6[E1]", "sixty"), ("7[E1]", "seventy"),
                             ("8[E1]", "eighty"), ("9[E1]", "ninety"),
                             ("[E2]", "hundred"), ("[E3]", "thousand"),
                             ("[E6]", "million")]).optimize()
# One or more verbalized words joined by single spaces.
_lambda_star = pynutil.join(_lambda, pynutil.insert(" ")).optimize()


def number(token: str) -> str:
    """Verbalizes a digit-string token as English number words."""
    verbalizer = _phi @ _lambda_star
    return rewrite.one_top_rewrite(token, verbalizer)


# TODO(kbg): Remove this once weather.py no longer requires it.
# I would like to leave the "verbalized weather report" as an exercise to
# the reader.
VERBALIZE = _phi @ _lambda_star
Ejemplo n.º 4
0
 def _make_lexicon(self, lexicon: Iterable[str]) -> None:
     """Compiles the lexicon into an FST accepting space-separated entries."""
     entries_fst = pynini.string_map(lexicon)
     self._lexicon = pynutil.join(entries_fst, " ").optimize()
Ejemplo n.º 5
0
 def testJoin(self):
     """Verifies that the join accepts 1-9 space-separated copies of "a"."""
     repeated = pynutil.join("a", " ")
     for count in range(1, 10):
         candidate = " ".join(["a"] * count)
         lattice = pynini.intersect(repeated, candidate)
         # A non-empty intersection has a valid start state.
         self.assertNotEqual(lattice.start(), pynini.NO_STATE_ID)