def __construct_compound_filter(self):
    '''
    Construct the compound filter: restrict which category sequences may
    form a compound, deleting the internal category/origin markup along
    the way.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Symbols allowed inside a compound member; category and origin
        # features and "<NoPref>" are deleted on the output side.
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>"]).project("input"),
            self.__syms.stem_types,
            pynini.cross(self.__syms.categories, ""),
            pynini.cross(self.__syms.origin_features, ""),
            pynini.cross("<NoPref>", "")
        )
        return pynini.concat(
            # Optional initial marker: "<Initial>" is deleted, "<NoHy>" and
            # "<NoDef>" are passed through.
            pynini.union(
                pynini.cross("<Initial>", ""),
                pynini.accep("<NoHy>"),
                pynini.accep("<NoDef>")
            ).closure(0, 1),
            pynini.concat(
                pynini.union(
                    # First alternative: delete one of the listed category
                    # markers after arbitrary alphabet material.
                    pynini.concat(
                        alphabet.closure(),
                        pynini.cross(pynini.string_map(["<ABK>", "<ADV>", "<CARD>", "<NE>", "<PRO>", "<V>", "<ORD>", "<OTHER>"]).project("input"), "")
                    ),
                    # Second alternative: insert "<VADJ>" and require a
                    # "<kompos>"-marked stretch ending in a deleted "<V>".
                    pynini.concat(
                        pynini.cross("", "<VADJ>"),
                        pynini.concat(
                            pynini.union(
                                alphabet,
                                pynini.cross("<kompos>", "")
                            ).closure(),
                            pynini.concat(
                                pynini.cross("<kompos>", ""),
                                pynini.concat(
                                    alphabet.closure(),
                                    pynini.cross("<V>", "")
                                )
                            )
                        )
                    ),
                    # Third alternative: material (with "<kompos>" markers
                    # deleted) ending in a deleted "<ADJ>" or "<NN>".
                    pynini.concat(
                        pynini.union(
                            alphabet,
                            pynini.cross("<kompos>", "")
                        ).closure(),
                        pynini.cross(pynini.string_map(["<ADJ>", "<NN>"]).project("input"), "")
                    )
                ),
                # Tail: delete "<base>" and the origin feature, keep the
                # inflection class.
                pynini.concat(
                    pynini.cross("<base>", ""),
                    pynini.concat(
                        pynini.cross(self.__syms.origin_features, ""),
                        self.__syms.inflection_classes
                    )
                )
            )
        ).optimize()
def __construct_tail(self):
    '''
    Define possible final sequences of a derivation.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # C1: symbols admissible before the base stem type marker.
        initial_stuff = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>", "<Pref_Stems>"]).project("input")
        ).closure()
        # C2: symbols admissible between the base stem type marker and the
        # final feature sequence.
        intermediate_stuff = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>", "<FB>", "<ge>", "<Suff_Stems>"]).project("input")
        ).closure()
        # C3: symbols admissible at the very end (categories, stem type and
        # origin features, and a fixed set of noun inflection markers).
        final_stuff = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>", "<FB>"]).project("input"),
            self.__syms.categories,
            self.__syms.stem_type_features,
            self.__syms.origin_features,
            pynini.string_map(["<NSNeut_es_e>", "<NSFem_0_n>", "<NSFem_0_en>", "<NSMasc_es_e>", "<NSMasc_es_$e>", "<NSMasc-s/$sse>"]).project("input")
        ).closure()
        # TAIL: optional (C1 base-stem-type C2) prefix, then C3, then an
        # optional inflection class.
        tail1 = initial_stuff + self.__syms.base_stem_types + intermediate_stuff
        return pynini.concat(
            tail1.closure(0, 1) + final_stuff,
            self.__syms.inflection_classes.closure(0, 1)
        ).optimize()
def __construct_r21(self):
    '''
    Low to up: map characters to upper case after one or more deleted
    "<^UC>" markers (the markers themselves are removed).
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())
    # BUG FIX: removed leftover debug call
    # `self.__syms.to_upper.draw("to_upper.dot")`, which wrote a DOT file
    # to the working directory every time this transducer was built.
    #
    # Construction in SFST involves negation (which is expensive).
    # It looks like we can do better:
    return pynini.push(pynini.union(
        alphabet.closure(),
        pynini.concat(
            # Delete at least one "<^UC>" marker ...
            pynini.transducer(
                "<^UC>", "",
                input_token_type=self.__syms.alphabet).closure(1),
            # ... then pass a marker through or upcase the character.
            pynini.union(
                pynini.string_map(
                    ["<NoHy>", "<NoDef>"],
                    input_token_type=self.__syms.alphabet,
                    output_token_type=self.__syms.alphabet).project(),
                self.__syms.to_upper))).closure(),
        push_labels=True).optimize()
def __construct_r20(self):
    '''
    Up to low: map characters to lower case after one or more deleted
    "<CB>" markers (the markers themselves are removed).
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<^UC>", "<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())
    #
    # SFST uses a rewrite rule here
    return pynini.push(pynini.union(
        alphabet.closure(),
        pynini.concat(
            # Delete at least one "<CB>" marker ...
            pynini.transducer(
                "<CB>", "",
                input_token_type=self.__syms.alphabet).closure(1),
            # ... then pass a marker through or lowercase the character.
            pynini.union(
                pynini.string_map(
                    ["<^UC>", "<NoHy>", "<NoDef>"],
                    input_token_type=self.__syms.alphabet,
                    output_token_type=self.__syms.alphabet).project(),
                self.__syms.to_lower))).closure(),
        push_labels=True).optimize()
def __construct_r14(self):
    '''
    e-epenthesis 2: rewrite "<DEL-S>" as "e" after "d"/"t" (optionally
    followed by "m") or after "t w".
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>", "<^Ax>",
             "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())
    # The rewrite itself: "<DEL-S>" -> "e".
    tau = pynini.transducer("<DEL-S>", "e",
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet)
    return pynini.cdrewrite(
        tau,
        # Left context: "d"/"t" + optional "m", or the sequence "t w".
        pynini.union(
            pynini.concat(
                pynini.string_map(
                    ["d", "t"],
                    input_token_type=self.__syms.alphabet,
                    output_token_type=self.__syms.alphabet).project(),
                pynini.acceptor("m", token_type=self.__syms.alphabet).closure(0, 1)),
            pynini.acceptor("t w", token_type=self.__syms.alphabet)),
        "",
        alphabet.closure()).optimize()
def __construct_r13(self):
    '''
    e-epenthesis 1: pass the alphabet through while optionally deleting the
    listed boundary/feature markers anywhere in the string.
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>", "<^Ax>",
             "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())
    return pynini.union(
        alphabet,
        # Markers that may be deleted.
        pynini.transducer(
            pynini.string_map(
                ["<DEL-S>", "<SS>", "<FB>", "<^Gen>", "<^Del>", "<^pl>",
                 "<^Ax>", "<WB>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            "")).closure().optimize()
def __construct_r1(self):
    '''
    Umlaut: Apfel$ ==> Äpfel

    Composed of three rewrite passes: r1a umlauts the vowel, r1c deletes a
    following "a" after "ä"/"Ä", and r1d rewrites the "<UL>" trigger into
    "<FB>".
    '''
    # NOTE: "<UL>" and "<FB>" were listed twice in the original; union
    # semantics make the duplicates redundant, so they are removed here.
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<CB>", "<FB>", "<UL>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
             "<^Ax>", "<e>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>",
             "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())
    # r1a: a/o/u (and upper-case variants) -> umlauted vowel, when preceded
    # by a consonant or boundary marker and followed by material ending in
    # "<UL>".
    tau = pynini.push(pynini.string_map(
        [("a", "ä"), ("o", "ö"), ("u", "ü"), ("A", "Ä"), ("O", "Ö"), ("U", "Ü")],
        input_token_type=self.__syms.alphabet,
        output_token_type=self.__syms.alphabet), push_labels=True)
    lc = pynini.union(
        self.__syms.consonants,
        pynini.string_map(
            ["<CB>", "<WB>", "<NoHy>", "<NoDef>", "<^UC>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).optimize()
    r1a = pynini.cdrewrite(
        tau,
        lc,
        pynini.concat(
            alphabet.closure(),
            pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
        alphabet.closure())
    # r1c: delete "a" directly after "ä"/"Ä" when followed by a lower-case
    # consonant and material ending in "<UL>".
    tau = pynini.transducer("a", "", input_token_type=self.__syms.alphabet)
    # BUG FIX: pynini.concat is binary (as used everywhere else in this
    # file); the original passed three FSTs to one call. Chain with "+".
    r1c = pynini.cdrewrite(
        tau,
        pynini.string_map(
            ["ä", "Ä"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        self.__syms.consonants_lower
        + alphabet.closure()
        + pynini.acceptor("<UL>", token_type=self.__syms.alphabet),
        alphabet.closure()).optimize()
    # r1d: unconditionally rewrite the "<UL>" trigger into "<FB>".
    r1d = pynini.cdrewrite(
        pynini.transducer("<UL>", "<FB>",
                          input_token_type=self.__syms.alphabet,
                          output_token_type=self.__syms.alphabet),
        "", "",
        alphabet.closure())
    return pynini.compose(r1a, pynini.compose(r1c, r1d)).optimize()
def get_names():
    """
    Returns the graph that matched common male and female names.
    """
    # Load both name lists and extend each with capitalized variants.
    male_labels = load_labels(get_abs_path("data/roman/male.tsv"))
    female_labels = load_labels(get_abs_path("data/roman/female.tsv"))
    male_labels.extend([[entry[0].upper()] for entry in male_labels])
    female_labels.extend([[entry[0].upper()] for entry in female_labels])
    # Union of both name sets.
    names = pynini.string_map(male_labels).optimize()
    names |= pynini.string_map(female_labels).optimize()
    return names
def __construct_umlautung(self):
    '''
    Map "a", "o" and "u" onto "ä", "ö" and "ü", corresp., if the umlaut
    marker "<UL>" is present.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # NOTE(review): "<Ge>" appears twice in this list — harmless since
        # union de-duplicates, but likely unintended.
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>"]).project("input"),
            self.__syms.stem_types,
            self.__syms.categories,
        ).closure()
        return pynini.concat(
            pynini.concat(
                alphabet,
                # Optional umlauting sequence: consonant + vowel change +
                # consonants + optional "e"+"l"/"r" + "<Suff_Stems>" with a
                # deleted "<UL>" trigger.
                pynini.concat(
                    self.__syms.consonants,
                    pynini.concat(
                        pynini.union(
                            # Simple vowel change.
                            pynini.union(
                                pynini.cross("a", "ä"),
                                pynini.cross("o", "ö"),
                                pynini.cross("u", "ü")
                            ),
                            # "aa" -> "ä" or "au" -> "äu".
                            pynini.concat(
                                pynini.cross("a", "ä"),
                                pynini.union(
                                    pynini.cross("a", ""),
                                    pynini.accep("u")
                                )
                            )
                        ),
                        pynini.concat(
                            self.__syms.consonants.closure(),
                            pynini.concat(
                                pynini.concat(
                                    pynini.accep("e"),
                                    pynini.string_map(["l", "r"]).project("input")
                                ).closure(0, 1),
                                pynini.concat(
                                    pynini.accep("<Suff_Stems>"),
                                    pynini.cross("<UL>", "")
                                )
                            )
                        )
                    )
                ).closure(0, 1)
            ),
            self.__tail
        ).optimize()
def __init__(self, chat_lexicon_path: str, lm_path: str) -> None:
    # Load the language model and require an output symbol table.
    self._lm = pynini.Fst.read(lm_path)
    assert self._lm.output_symbols(), "No LM output symbol table found"
    self._lm_syms = self._lm.output_symbols()
    # The LM vocabulary: every non-epsilon symbol in the table.
    words = [word for (label, word) in self._lm_syms if label > 0]
    vocab_fsa = pynini.string_map(words).optimize()
    # Chatspeak expansion components built over the LM vocabulary.
    self._deduplicator = chatspeak.Deduplicator(vocab_fsa)
    self._deabbreviator = chatspeak.Deabbreviator(vocab_fsa)
    self._regexps = chatspeak.Regexps()
    self._lexicon = chatspeak.Lexicon(chat_lexicon_path)
    # Mappers between byte strings and LM symbol labels, joined on spaces.
    word_mapper = pynini.string_map(words,
                                    input_token_type="byte",
                                    output_token_type=self._lm_syms)
    self._bytes_to_lm_mapper = pynutil.join(word_mapper, " ").optimize()
    self._lm_to_bytes_mapper = pynini.invert(self._bytes_to_lm_mapper)
def get_serial_graph(self):
    """
    Finite state transducer for classifying serial. The serial is a combination of digits, letters and dashes, e.g.:
    c325-b -> tokens { serial { value: "c three two five b" } }
    """
    # Deterministic mode verbalizes digits one by one.
    num_graph = self.single_digits_graph if self.deterministic else self.graph

    # Letters, optionally read via their listed pronunciations.
    alpha = NEMO_ALPHA
    alpha |= pynini.string_map(
        load_labels(get_abs_path("data/letter_pronunciation.tsv")))

    # Dashes and slashes are spoken as spaces; otherwise a space is inserted.
    delimiter = insert_space | pynini.cross("-", " ") | pynini.cross(
        "/", " ")

    # A serial must mix letters and digits in some order.
    letter_num = pynini.closure(alpha + delimiter, 1) + num_graph
    num_letter = pynini.closure(num_graph + delimiter, 1) + alpha
    serial_graph = (letter_num | num_letter) + pynini.closure(
        delimiter + (alpha | num_graph))

    # Non-deterministic mode also allows an optional plural "s"/"es".
    if not self.deterministic:
        serial_graph += pynini.closure(
            pynini.accep("s") | pynini.cross("s", "es"), 0, 1)
    return serial_graph
def get_alternative_formats():
    """
    Utils to get alternative formats for numbers.
    """
    one_alternatives = load_labels(
        get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    # Map the second token of each alternative back onto the full phrase.
    one_thousand_map = pynini.string_map(
        [(alternative.split()[1], alternative)
         for _, alternative in one_alternatives])
    one_thousand_alternative = pynini.cdrewrite(one_thousand_map, "[BOS]",
                                                "", NEMO_SIGMA)

    # Adapted from
    # https://github.com/google/TextNormalizationCoveringGrammars/blob/master/src/universal/thousands_punct.grm
    # Specifies common ways of delimiting thousands in digit strings.
    t = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    separators = (pynutil.add_weight(t['dot_thousands'], 0.1)
                  | pynutil.add_weight(t['no_delimiter'], -0.1)
                  | pynutil.add_weight(t['space_thousands'], 0.1))

    return {
        'one_thousand_alternative': one_thousand_alternative.optimize(),
        'separators': separators.optimize(),
    }
def testVerifyAsciiDefinition(self):
    # UTF-8 ASCII uses all the single-byte characters with the most
    # significant bit set to 0, barring NUL, which we ignore.
    single_bytes = (pynini.escape(chr(cp)) for cp in range(1, 128))
    ascii_char = pynini.string_map(single_bytes).optimize()
    self.assertFsasEquivalent(ascii_char, utf8.SINGLE_BYTE)
def _get_whitelist_non_deterministic_graph(
        file="data/whitelist_alternatives.tsv"):
    # Accept both the original casing and fully lowercased variants.
    whitelist = load_labels(get_abs_path(file))
    lowered = [(written.lower(), spoken.lower()) for written, spoken in whitelist]
    cased = [(written, spoken) for written, spoken in whitelist]
    return pynini.string_map(lowered + cased)
def __construct_del_ge(self):
    '''
    Case-dependent deletion of the ge marker.
    '''
    # delete <ge> at certain suffixes like 'ver'
    # NOTE(review): this string_map lacks the explicit token_type arguments
    # used everywhere else in this method, so it falls back to the default
    # token type — confirm this is intended.
    # BUG FIX: pynini.concat is binary (as used everywhere else in this
    # file); the original passed six FSTs to one call. Chain with "+".
    return pynini.concat(
        pynini.transducer("<no-ge>", "",
                          input_token_type=self.__syms.alphabet),
        pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet)
        + pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>"]).project()).closure()
        + pynini.transducer("<V> <nativ>", "",
                            input_token_type=self.__syms.alphabet)
        + pynini.acceptor("<NoDef>",
                          token_type=self.__syms.alphabet).closure(0, 1)
        + pynini.transducer("<ge>", "",
                            input_token_type=self.__syms.alphabet)
        + self.__prefix_filter_helper
        + self.__syms.stem_type_features
        + pynini.acceptor("<nativ>", token_type=self.__syms.alphabet)
        ).optimize()
def __construct_category_filter(self):
    '''
    Filter-out non-matching category sequences.
    '''
    # Symbols allowed between suffix-category checks.
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
             "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>",
             "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        self.__syms.stem_types,
        self.__syms.categories,
    ).closure()
    # Suffix-stem filter over the listed categories.
    filtering = self.__suff_stems_filter([
        "<ABK>", "<ADJ>", "<ADV>", "<CARD>", "<DIGCARD>", "<NE>", "<NN>",
        "<PRO>", "<V>", "<ORD>"
    ])
    return pynini.concat(
        pynini.concat(alphabet, filtering).closure(),
        self.__tail).optimize()
def __construct_suff_phon(self):
    '''
    Phonological rule at suffix boundaries: delete "i" when preceded by
    ("i" or consonant + "y") followed by "<Suff_Stems>".
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
             "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>",
             "<NoDef>", "<NN>", "<ADJ>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        self.__syms.stem_types,
    ).closure()
    # The rewrite: delete "i".
    Tau = pynini.transducer("i", "", input_token_type=self.__syms.alphabet)
    # Left context: "i", or a consonant followed by "y", then "<Suff_Stems>".
    Lambda = pynini.concat(
        pynini.union(
            pynini.acceptor("i", token_type=self.__syms.alphabet),
            pynini.concat(
                self.__syms.consonants.project(),
                pynini.acceptor("y", token_type=self.__syms.alphabet))),
        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet))
    return pynini.concat(
        pynini.cdrewrite(Tau, Lambda, "", alphabet.project()),
        self.__tail).optimize()
def __construct_suff_phon(self):
    '''
    Phonological rule at suffix boundaries: delete "i" when preceded by
    ("i" or consonant + "y") followed by "<Suff_Stems>".
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>", "<NN>", "<ADJ>"]).project("input"),
            self.__syms.stem_types,
        ).closure()
        # The rewrite: delete "i".
        Tau = pynini.cross("i", "")
        # Left context: "i", or a consonant followed by "y", then
        # "<Suff_Stems>".
        Lambda = pynini.concat(
            pynini.union(
                pynini.accep("i"),
                pynini.concat(
                    self.__syms.consonants.project("input"),
                    pynini.accep("y")
                )
            ),
            pynini.accep("<Suff_Stems>")
        )
        return pynini.concat(
            pynini.cdrewrite(
                Tau, Lambda, "", alphabet.project("input")
            ),
            self.__tail
        ).optimize()
def __construct_insert_zu(self):
    '''
    Inserts "zu" into infinitives with separable prefixes.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>", "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>", "<Up#>", "<Fix#>", "<^imp>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>", "<^Del>"]).project("input")
        ).optimize()
        # Any sequence of alphabet symbols or stem type markers.
        c2 = pynini.union(
            alphabet,
            self.__syms.stem_types
        ).closure().optimize()
        # From deko.fst:
        # insert "zu" after verbal prefixes if followed by infinitive marker
        return pynini.union(
            c2,
            #pynini.concat(
            #  pynini.accep("<Base_Stems>"),
            #  alphabet.closure(),
            #  pynini.cross("<^zz>", ""),
            #  alphabet.closure()
            #  ),
            # Prefixed infinitive: insert "z u" after "<Base_Stems>" and
            # delete the "<^zz>" trigger.
            c2
            + pynini.accep("<Pref_Stems>")
            + alphabet.closure()
            + pynini.accep("<Base_Stems>")
            + pynini.cross("", "z u")
            + alphabet.closure()
            + pynini.cross("<^zz>", "")
            + alphabet.closure()
        ).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """
    Classifier FST for decimal numbers; restores apocope forms
    ("un"/"ún" -> "uno") and strips thousands formatting before tagging
    integer and fractional parts.
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)
    graph_digit = digit | zero
    if not deterministic:
        graph = pynini.union(graph_digit, cardinal.hundreds, cardinal.tens)
        graph += pynini.closure(insert_space + graph)
    else:
        # General pattern seems to be 1-3 digits: map as cardinal,
        # default to digits otherwise.
        graph = pynini.union(
            graph_digit,
            cardinal.tens,
            cardinal.hundreds,
            graph_digit + pynini.closure(insert_space + graph_digit, 3),
            # For cases such as "1,010"
            zero + pynini.closure(insert_space + zero) + pynini.closure(insert_space + graph_digit),
        )
    # Need to strip apocope everywhere BUT end of string
    reverse_apocope = pynini.string_map([("un", "uno"), ("ún", "uno")])
    apply_reverse_apocope = pynini.cdrewrite(reverse_apocope, "", NEMO_SPACE, NEMO_SIGMA)
    graph @= apply_reverse_apocope
    # Technically decimals should be space delineated groups of three,
    # e.g. (1,333 333). This removes any possible spaces.
    strip_formatting = pynini.cdrewrite(delete_space, "", "", NEMO_SIGMA)
    graph = strip_formatting @ graph
    self.graph = graph.optimize()
    graph_separator = pynutil.delete(decimal_separator)
    optional_graph_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
    self.graph_fractional = pynutil.insert(
        "fractional_part: \"") + self.graph + pynutil.insert("\"")
    # Integer graph maintains apocope except for ones place
    graph_integer = (strip_cardinal_apocope(
        cardinal.graph) if deterministic else pynini.union(
            cardinal.graph, strip_cardinal_apocope(cardinal.graph))
                     )  # Gives us forms w/ and w/o apocope
    self.graph_integer = pynutil.insert(
        "integer_part: \"") + graph_integer + pynutil.insert("\"")
    final_graph_wo_sign = self.graph_integer + graph_separator + insert_space + self.graph_fractional
    self.final_graph_wo_negative = (final_graph_wo_sign | get_quantity(
        final_graph_wo_sign, cardinal.graph).optimize())
    final_graph = optional_graph_negative + self.final_graph_wo_negative
    final_graph += pynutil.insert(" preserve_order: true")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def _get_whitelist_graph(input_case, file):
    # Lowercase the written side only when the input case demands it.
    normalize = str.lower if input_case == "lower_cased" else (lambda s: s)
    pairs = [(normalize(written), spoken)
             for written, spoken in load_labels(file)]
    return pynini.string_map(pairs)
def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
    # Lowercase the written side only when the input case demands it.
    entries = load_labels(get_abs_path(file))
    lower = input_case == "lower_cased"
    pairs = [((written.lower() if lower else written), spoken)
             for written, spoken in entries]
    return pynini.string_map(pairs)
def __split_disjunctive_feats(self, disjunctive_feat_list):
    # Map each disjunctive feature "<a,b,c>" onto the union of the
    # individual features "<a>", "<b>", "<c>".
    mappings = []
    for feat in disjunctive_feat_list:
        cats = ["<" + cat + ">" for cat in feat[1:-1].split(",")]
        mappings.append(
            pynini.transducer(
                feat,
                pynini.string_map(cats,
                                  input_token_type=self.__syms.alphabet,
                                  output_token_type=self.__syms.alphabet),
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet))
    return pynini.union(*mappings).optimize()
def testVerifyUtf8CharRegionalIndicatorSymbolDefinition(self):
    # Regional indicator symbols have codepoints in the range 0x1F1E6
    # through 0x1F1FF.
    symbols = (pynini.escape(chr(cp))
               for cp in range(0x1F1E6, 0x1F1FF + 1))
    regional_indicator = pynini.string_map(symbols).optimize()
    self.assertFsasEquivalent(
        regional_indicator, utf8.VALID_UTF8_CHAR_REGIONAL_INDICATOR_SYMBOL)
def setUpClass(cls):
    """Build a tiny case-folding FAR and a rule cascade over it."""
    super().setUpClass()
    # DOWNCASE maps A->a and B->b; UPCASE is its inverse.
    fold = pynini.string_map((("A", "a"), ("B", "b"))).optimize()
    # BUG FIX: mkstemp returns an open OS-level file descriptor which the
    # original discarded (leaking it); close it — the FAR writer reopens
    # the path itself.
    import os
    fd, cls.far_path = tempfile.mkstemp(suffix=".far")
    os.close(fd)
    with pynini.Far(cls.far_path, "w") as far:
        far["DOWNCASE"] = fold
        far["UPCASE"] = fold.invert()
    cls.cascade = rule_cascade.RuleCascade(cls.far_path)
def _get_whitelist_graph(input_case, file):
    """
    Build a whitelist graph, lowercasing the first column only when
    input_case is "lower_cased".
    """
    whitelist = load_labels(file)
    if input_case == "lower_cased":
        whitelist = [[x[0].lower()] + x[1:] for x in whitelist]
    # BUG FIX: the else-branch previously also lowercased x[0], making it
    # identical to the "lower_cased" branch and the input_case parameter
    # dead; the cased variant now preserves the original casing (matching
    # the sibling _get_whitelist_graph implementations).
    graph = pynini.string_map(whitelist)
    return graph
def __construct_r19(self):
    '''
    Eliminate markers: pass characters and the remaining boundary symbols
    through, deleting the listed feature markers anywhere in the string.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<CB>", "<^UC>", "<NoHy>", "<NoDef>"]).project("input"))
        return pynini.union(
            alphabet,
            # Markers that are deleted.
            pynini.cross(
                pynini.string_map([
                    "<DEL-S>", "<SS>", "<FB>", "<^Gen>", "<^Del>", "<^pl>",
                    "<^Ax>", "<WB>"
                ]).project("input"),
                "")).closure().optimize()
def transducerOfRule(mapping, leftContext, rightContext, alphabet):
    # Sigma* over the bare alphabet; the rewrite language additionally
    # admits the '.' symbol.
    valid = sandwich.union(*alphabet).closure()
    language = sandwich.union(*(['.'] + alphabet)).closure()
    rule = sandwich.cdrewrite(sandwich.string_map(mapping),
                              leftContext,
                              rightContext,
                              language,
                              direction="sim")
    # Restrict the rule's output to the valid alphabet.
    return rule * valid
def generate_formula(in_to_out: List[Tuple[str]], envs: Tuple[Set[str]]): env_formulas = list() # for env in envs: # env_formula = pynini.accep("0").star # if env != [""]: # for j in range(0, len(env)): # env_formula = env_formula + pynini.accep("0").star + pynini.union(*env[j]) # env_formulas.append(env_formula + pynini.accep("0").star) # for env in envs: # env_formula = pynini,env[0] for i in range(len(envs)): if envs[i] != [""] and (len(envs[i]) > 1 or in_to_out[0][0] != '' or i > 0): env_formula = pynini.union(*envs[i][0]) for j in range( 1, len(envs[i]) - int(in_to_out[0][0] == '' and i == 0)): env_formula = (env_formula + pynini.union(*envs[i][j])).optimize() else: env_formula = pynini.accep("") env_formulas.append(pynini.rmepsilon(env_formula.optimize())) if in_to_out[0][0] == '': str_map: pynini.Fst = pynini.string_map( (e_v, e_v + in_to_out[0][1]) for e_v in envs[0][len(envs[0]) - 1]) return pynini.cdrewrite(str_map.ques, env_formulas[0], env_formulas[1], self.sigma_star, direction="ltr").optimize() else: str_map: pynini.Fst = pynini.string_map( in_to_out).ques.rmepsilon().optimize() return pynini.cdrewrite(str_map.ques, env_formulas[0], env_formulas[1], self.sigma_star, direction="ltr").optimize()
def testVerifyUtf8Rfc3629Definition(self):
    # UTF-8 encoded strings can store codepoints in U+0000 through
    # U+10FFFF, excluding the surrogate halves in U+D800 through U+DFFF,
    # but we exclude U+0000 as it would be strange to match NUL and that
    # label is reserved for epsilon.
    codepoints = [
        cp for cp in range(1, 0x10FFFF + 1)
        if not 0xD800 <= cp <= 0xDFFF
    ]
    utf8_rfc3629_char = pynini.string_map(
        pynini.escape(chr(cp)) for cp in codepoints).optimize()
    self.assertFsasEquivalent(utf8_rfc3629_char, utf8.VALID_UTF8_CHAR)