def _lexicon(path: str, token_type: pynini.TokenType) -> pynini.Fst:
    """Compiles a lexicon FST from a word list file.

    Args:
      path: path to input file.
      token_type: token type, or the path to a symbol table.

    Returns:
      A lexicon FST accepting space-separated sequences of lexicon words.
    """
    word_fst = pynini.string_file(
        path, input_token_type=token_type, output_token_type=token_type)
    # Accept one or more words joined by single spaces.
    sentence_fst = pynutil.join(word_fst, " ")
    return sentence_fst.optimize()
def __init__(self, chat_lexicon_path: str, lm_path: str) -> None:
    """Builds the chatspeak model components.

    Args:
      chat_lexicon_path: path to the chatspeak lexicon file.
      lm_path: path to a serialized language model FST.
    """
    self._lm = pynini.Fst.read(lm_path)
    assert self._lm.output_symbols(), "No LM output symbol table found"
    self._lm_syms = self._lm.output_symbols()
    # Collect the LM vocabulary, skipping label 0 (epsilon).
    words = [symbol for (label, symbol) in self._lm_syms if label > 0]
    vocab_fsa = pynini.string_map(words).optimize()
    self._deduplicator = chatspeak.Deduplicator(vocab_fsa)
    self._deabbreviator = chatspeak.Deabbreviator(vocab_fsa)
    self._regexps = chatspeak.Regexps()
    self._lexicon = chatspeak.Lexicon(chat_lexicon_path)
    # Transduces byte strings to LM symbols, one word at a time,
    # over space-separated word sequences.
    word_mapper = pynini.string_map(
        words, input_token_type="byte", output_token_type=self._lm_syms)
    self._bytes_to_lm_mapper = pynutil.join(word_mapper, " ").optimize()
    self._lm_to_bytes_mapper = pynini.invert(self._bytes_to_lm_mapper)
_sigma_star)

# The actual factorizer: zero-pad, factor into digit/power-of-ten tokens,
# delete the padding zeros, then repair the teens.
_phi = (_pad_zeros @ _raw_factorizer @ _del_zeros @ _fix_teens).optimize()

# Verbalization table mapping each factored token (digits, teens, decades,
# and the [En] power-of-ten markers) to its English word.
_lambda = pynini.string_map([
    ("1", "one"),
    ("2", "two"),
    ("3", "three"),
    ("4", "four"),
    ("5", "five"),
    ("6", "six"),
    ("7", "seven"),
    ("8", "eight"),
    ("9", "nine"),
    ("1[E1]", "ten"),
    ("1[E1*]1", "eleven"),
    ("1[E1*]2", "twelve"),
    ("1[E1*]3", "thirteen"),
    ("1[E1*]4", "fourteen"),
    ("1[E1*]5", "fifteen"),
    ("1[E1*]6", "sixteen"),
    ("1[E1*]7", "seventeen"),
    ("1[E1*]8", "eighteen"),
    ("1[E1*]9", "nineteen"),
    ("2[E1]", "twenty"),
    ("3[E1]", "thirty"),
    ("4[E1]", "forty"),
    ("5[E1]", "fifty"),
    ("6[E1]", "sixty"),
    ("7[E1]", "seventy"),
    ("8[E1]", "eighty"),
    ("9[E1]", "ninety"),
    ("[E2]", "hundred"),
    ("[E3]", "thousand"),
    ("[E6]", "million"),
]).optimize()

# One or more verbalized tokens, separated by single spaces.
_lambda_star = pynutil.join(_lambda, pynutil.insert(" ")).optimize()


def number(token: str) -> str:
    """Verbalizes a digit-string token as English number words.

    Args:
      token: the digit string to verbalize.

    Returns:
      The single top rewrite of `token` under factorization plus
      verbalization.
    """
    return rewrite.one_top_rewrite(token, _phi @ _lambda_star)


# TODO(kbg): Remove this once weather.py no longer requires it.
# I would like to leave the "verbalized weather report" as an exercise to
# the reader.
VERBALIZE = _phi @ _lambda_star
def _make_lexicon(self, lexicon: Iterable[str]) -> None:
    """Compiles and stores the lexicon FST.

    Args:
      lexicon: an iterable of lexicon entries.
    """
    entries = pynini.string_map(lexicon)
    # Accept space-separated sequences of lexicon entries.
    self._lexicon = pynutil.join(entries, " ").optimize()
def testJoin(self):
    joined = pynutil.join("a", " ")
    # Strings of 1 through 9 space-separated "a"s must all be accepted.
    for count in range(1, 10):
        candidate = " ".join(["a"] * count)
        lattice = pynini.intersect(joined, candidate)
        self.assertNotEqual(lattice.start(), pynini.NO_STATE_ID)