def dedup_rule(letter: str) -> pynini.Fst:
    """Compiles transducer that optionally deletes multiple letters.

    One or two of the same letter must be encountered beforehand.

    Args:
      letter: a letter.

    Returns:
      An FST deleting that in an appropriate sequence.
    """
    # Any lowercase byte other than `letter`; used to delimit the run.
    not_letter = byte.LOWER - letter
    # Optionally shrink a run of `letter` (one or more) down to at most one,
    # but only when the run is preceded by a single `letter` that itself
    # follows the string start or a different letter, and is followed by the
    # string end or a different letter.
    return pynini.cdrewrite(
        pynini.cross(_plus(letter), _ques(letter)),
        ("[BOS]" | not_letter) + letter,
        ("[EOS]" | not_letter),
        _sigma_star)
def __init__(self, ordinal: GraphFst, deterministic: bool = True):
    """Finite state transducer for verbalizing serialized date tokens.

    Reads `day: "..."`, `month: "..."` and `year: "..."` fields, turns day
    (and unknown month) values into ordinals by applying `ordinal.ordinal_stem`
    and appending "ter", and expands month abbreviations to full names.

    Args:
        ordinal: ordinal GraphFst; its `ordinal_stem` rewrite supplies the
            ordinal stem used for days and non-abbreviated months.
        deterministic: if True will provide a single transduction option.
    """
    super().__init__(name="date", kind="verbalize", deterministic=deterministic)
    # Strip the `day: "..."` wrapper, keeping the quoted content.
    day_cardinal = pynutil.delete("day: \"") + pynini.closure(
        NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
    # Ordinalize the day: apply the stem rewrite at end of string, add "ter".
    day = day_cardinal @ pynini.cdrewrite(
        ordinal.ordinal_stem, "", "[EOS]", NEMO_SIGMA) + pynutil.insert("ter")
    # Full month names (second column of the abbreviation-to-name TSV).
    months_names = pynini.union(*[
        x[1] for x in load_labels(get_abs_path("data/months/abbr_to_name.tsv"))
    ])
    month = pynutil.delete("month: \"") + pynini.closure(
        NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
    # Months that are already full names pass through unchanged...
    final_month = month @ months_names
    # ...anything else (e.g. a numeric month) is ordinalized like the day.
    final_month |= month @ pynini.difference(
        NEMO_SIGMA, months_names) @ pynini.cdrewrite(
            ordinal.ordinal_stem, "", "[EOS]", NEMO_SIGMA) + pynutil.insert("ter")
    year = pynutil.delete("year: \"") + pynini.closure(
        NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
    # day month year
    graph_dmy = day + pynini.accep(" ") + final_month + pynini.closure(
        pynini.accep(" ") + year, 0, 1)
    # month year (no day)
    graph_dmy |= final_month + pynini.accep(" ") + year
    # Bare year is also accepted.
    self.graph = graph_dmy | year
    final_graph = self.graph + delete_preserve_order
    delete_tokens = self.delete_tokens(final_graph)
    self.fst = delete_tokens.optimize()
def delete_tokens(self, fst) -> 'pynini.FstLike':
    """
    Deletes class name wrap around output of given fst

    i.e. strips the leading `<name> {` and trailing `}` wrapper (with any
    surrounding whitespace) from the serialized token, then normalizes
    non-breaking spaces in the result.

    Args:
        fst: input fst

    Returns:
        Fst: fst
    """
    res = (pynutil.delete(f"{self.name}") + delete_space +
           pynutil.delete("{") + delete_space + fst + delete_space +
           pynutil.delete("}"))
    # Replace every non-breaking space (U+00A0) in the output with a plain space.
    return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
def strip_cardinal_apocope(fst: 'pynini.FstLike') -> 'pynini.FstLike':
    """Reverts apocope on cardinal strings in line with formation rules.

    e.g. "un" -> "uno". Due to cardinal formation rules, this in effect only
    affects strings where the final value is a variation of "un".
    e.g.
        "un" -> "uno"
        "veintiún" -> "veintiuno"

    Args:
        fst: Any fst. Composes conversion onto fst's output strings
    """
    # Cardinals use apocope by default for large values (e.g. "millón"), so
    # only the last instance of "un"/"ún" -- right before end-of-string or a
    # closing quote -- needs restoring.
    restore = pynini.union(pynini.cross("un", "uno"), pynini.cross("ún", "uno"))
    right_edge = pynini.union("[EOS]", "\"")
    rewrite = pynini.cdrewrite(restore, "", right_edge, NEMO_SIGMA)
    return fst @ rewrite
def add_cardinal_apocope_fem(fst: 'pynini.FstLike') -> 'pynini.FstLike': """ Adds apocope on cardinal strings in line with stressing rules. e.g. "una" -> "un". This only occurs when "una" precedes a stressed "a" sound in formal speech. This is not predictable with text string, so is included for non-deterministic cases. e.g. "una" -> "un" "veintiuna" -> "veintiun" Args: fst: Any fst. Composes conversion onto fst's output strings """ # Since the stress trigger follows the cardinal string and only affects the preceding sound, this only needs to act on the last instance of one strip = pynini.cross("una", "un") | pynini.cross("veintiuna", "veintiún") strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", "\""), NEMO_SIGMA) return fst @ strip
def __init__(self, tag_label: str, matcher: pynini.FstLike, sigma_star: pynini.FstLike) -> None: """Constructor. Args: tag_label: String used as a tag. It must be in-alphabet when processed by the specified token type. matcher: an acceptor matching the strings to be tagged. sigma_star: an unweighted cyclic acceptor over the vocabulary. Raises: Error: Tag is not in the alphabet. """ # Builds tag transducer. ltag = pynutil.insert(self.LTAG_TEMPLATE.format(tag_label)) rtag = pynutil.insert(self.RTAG_TEMPLATE.format(tag_label)) self._tagger = pynini.cdrewrite(ltag + matcher + rtag, "", "", sigma_star).optimize()
def get_four_digit_year_graph(deterministic: bool = True):
    """
    Returns a four digit transducer which is combination of ties/teen or digits
    (using hundred instead of thousand format), e.g.
    1219 -> twelve nineteen
    3900 -> thirty nine hundred
    """
    graph_ties = get_ties_graph(deterministic)
    # Decade forms ending in "0s", e.g. "1950s" -> "nineteen fifty" + plural.
    graph_with_s = (
        (graph_ties + insert_space + graph_ties)
        | (graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten")))
    ) + pynutil.delete("0s")
    # "##00s" -> "<teen/ties> hundred" + plural.
    graph_with_s |= (graph_teen | graph_ties) + insert_space + pynini.cross("00", "hundred") + pynutil.delete("s")
    # Pluralize the final word: "...y" -> "...ies", otherwise append "s".
    graph_with_s = graph_with_s @ pynini.cdrewrite(
        pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA
    )
    # Plain years as two two-digit groups, e.g. "1219" -> "twelve nineteen".
    graph = graph_ties + insert_space + graph_ties
    # "##00" -> "<teen/ties> hundred", e.g. "3900" -> "thirty nine hundred".
    graph |= (graph_teen | graph_ties) + insert_space + pynini.cross("00", "hundred")
    # "#00#" -> "<digit> thousand [<digit>]", e.g. "2005" -> "two thousand five".
    thousand_graph = (
        graph_digit
        + insert_space
        + pynini.cross("00", "thousand")
        + (pynutil.delete("0") | insert_space + graph_digit)
    )
    # "#000s" -> "<digit> thousands", optional space before the "s".
    thousand_graph |= (
        graph_digit
        + insert_space
        + pynini.cross("000", "thousand")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynini.accep("s")
    )
    graph |= graph_with_s
    if deterministic:
        # Prefer the "thousand" reading whenever both readings apply.
        graph = plurals._priority_union(thousand_graph, graph, NEMO_SIGMA)
    else:
        graph |= thousand_graph
    return graph.optimize()
def __init__(self, deterministic: bool = True):
    """Finite state transducer for verbalizing serialized time tokens.

    Consumes `hours:`, `minutes:`, `seconds:`, `suffix:` and `zone:` fields
    and renders readings such as "h m", "h o'clock", "h suffix" or the full
    "h hours m minutes and s seconds" form.

    Args:
        deterministic: if True will provide a single transduction option.
    """
    super().__init__(name="time", kind="verbalize", deterministic=deterministic)
    # Each field extractor strips `<field>: "` ... `"`, keeping the content.
    hour = (pynutil.delete("hours:") + delete_space + pynutil.delete("\"") +
            pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
    minute = (pynutil.delete("minutes:") + delete_space + pynutil.delete("\"") +
              pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
    suffix = (pynutil.delete("suffix:") + delete_space + pynutil.delete("\"") +
              pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
    optional_suffix = pynini.closure(delete_space + insert_space + suffix, 0, 1)
    zone = (pynutil.delete("zone:") + delete_space + pynutil.delete("\"") +
            pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
    optional_zone = pynini.closure(delete_space + insert_space + zone, 0, 1)
    second = (pynutil.delete("seconds:") + delete_space + pynutil.delete("\"") +
              pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
    # Full hours/minutes/seconds reading.
    graph_hms = (hour + pynutil.insert(" hours ") + delete_space + minute +
                 pynutil.insert(" minutes and ") + delete_space + second +
                 pynutil.insert(" seconds") + optional_suffix + optional_zone)
    # Clean-up pass: drop a leading "o " and fix singular unit agreement
    # ("one minutes" -> "one minute", etc.) at word boundaries.
    graph_hms @= pynini.cdrewrite(
        pynutil.delete("o ")
        | pynini.cross("one minutes", "one minute")
        | pynini.cross("one seconds", "one second")
        | pynini.cross("one hours", "one hour"),
        pynini.union(" ", "[BOS]"),
        "",
        NEMO_SIGMA,
    )
    # "h m [suffix] [zone]"
    graph = hour + delete_space + insert_space + minute + optional_suffix + optional_zone
    # "h o'clock [zone]"
    graph |= hour + insert_space + pynutil.insert("o'clock") + optional_zone
    # "h suffix [zone]"
    graph |= hour + delete_space + insert_space + suffix + optional_zone
    graph |= graph_hms
    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def __init__(self, deterministic: bool = True):
    """Finite state transducer for verbalizing serialized date tokens (Spanish).

    Handles `day:`, `month:` and `year:` fields, restoring apocope on the day,
    optionally rendering day one as "primero", and inserting "de" between the
    fields as required.

    Args:
        deterministic: if True will provide a single transduction option.
    """
    super().__init__(name="date", kind="verbalize", deterministic=deterministic)
    day_cardinal = pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
    # Restore full cardinal form, e.g. "un" -> "uno".
    day = strip_cardinal_apocope(day_cardinal)
    primero = pynini.cdrewrite(pynini.cross("uno", "primero"), "[BOS]", "[EOS]", NEMO_SIGMA)
    day = (
        (day @ primero) if deterministic else pynini.union(day, day @ primero)
    )  # Primero for first day is traditional, but will vary depending on region
    month = pynutil.delete("month: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
    # Year already carrying an article keeps it.
    year = (
        pynutil.delete("year: \"")
        + articles
        + NEMO_SPACE
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )  # Insert preposition if wasn't originally with the year. This would mean a space was present
    # Slightly prefer the article-bearing variant when both match.
    year = pynutil.add_weight(year, -0.001)
    year |= (
        pynutil.delete("year: \"")
        + pynutil.insert("de ")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    # day month year
    graph_dmy = day + pynini.cross(NEMO_SPACE, " de ") + month + pynini.closure(pynini.accep(" ") + year, 0, 1)
    graph_mdy = month + NEMO_SPACE + day + pynini.closure(NEMO_SPACE + year, 0, 1)
    if deterministic:
        graph_mdy += pynutil.delete(" preserve_order: true")  # Only accepts this if was explicitly passed
    self.graph = graph_dmy | graph_mdy
    final_graph = self.graph + delete_preserve_order
    delete_tokens = self.delete_tokens(final_graph)
    self.fst = delete_tokens.optimize()
def get_hundreds_graph(deterministic: bool = True):
    """
    Returns a four digit transducer which is combination of ties/teen or digits
    (using hundred instead of thousand format), e.g.
    1219 -> twelve nineteen
    3900 -> thirty nine hundred
    """
    graph_ties = get_ties_graph(deterministic)
    graph = (
        # Two two-digit groups, e.g. "1219" -> "twelve nineteen".
        graph_ties + insert_space + graph_ties
        # "##00" -> "<teen> hundred".
        | graph_teen + insert_space + pynini.cross("00", "hundred")
        # Decades ending in "0s", pluralized ("y" -> "ies", else append "s").
        | (graph_teen + insert_space +
           (ties_graph | pynini.cross("1", "ten")) +
           pynutil.delete("0s")) @ pynini.cdrewrite(
               pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA)
        # "#00#" -> "<digit> thousand [<digit>]"; negative weight prefers
        # this reading when it applies.
        | pynutil.add_weight(
            graph_digit + insert_space + pynini.cross("00", "thousand") +
            (pynutil.delete("0") | insert_space + graph_digit),
            weight=-0.001,
        ))
    return graph
def __construct_r14(self):
    '''
    e-epenthesis 2

    Rewrites the <DEL-S> marker as an epenthetic "e" after stems ending in
    "d"/"t" (optionally followed by "m"), or after "t w".
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Working alphabet: all characters plus the grammar's control symbols.
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map([
                "<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>", "<^Ax>",
                "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"
            ]).project("input"))
        # The rewrite itself: <DEL-S> -> "e".
        tau = pynini.cross("<DEL-S>", "e")
        # Left context: "d"/"t" with an optional following "m", or "t w".
        return pynini.cdrewrite(
            tau,
            pynini.union(
                pynini.concat(
                    pynini.string_map(["d", "t"]).project("input"),
                    pynini.accep("m").closure(0, 1)),
                pynini.accep("t w")),
            "",
            alphabet.closure()).optimize()
def __init__(self): super().__init__(name="telephone", kind="classify") # country code, number_part, extension digit_to_str = pynini.invert( pynini.string_file(get_abs_path("data/numbers/digit.tsv")) ).optimize() | pynini.cross("0", pynini.union("o", "oh", "zero")) double_digit = pynini.union( *[ pynini.cross( pynini.project(str(i) @ digit_to_str, "output") + pynini.accep(" ") + pynini.project(str(i) @ digit_to_str, "output"), pynutil.insert("double ") + pynini.project(str(i) @ digit_to_str, "output"), ) for i in range(10) ] ) double_digit.invert() number_part = ( pynini.closure(digit_to_str + insert_space, 2, 2) + digit_to_str + pynutil.delete("-") + insert_space + pynini.closure(digit_to_str + insert_space, 2, 2) + digit_to_str + pynutil.delete("-") + insert_space + pynini.closure(digit_to_str + insert_space, 3, 3) + digit_to_str ) number_part = ( pynutil.insert("number_part: \"") + pynini.cdrewrite(double_digit, "", "", NEMO_SIGMA) @ pynini.invert(number_part) + pynutil.insert("\"") ) graph = number_part final_graph = self.add_tokens(graph) self.fst = final_graph.optimize()
def _rw_ph_context(gr, in_ph, out_ph, post_str=c.EPSILON, pre_str=c.EPSILON) -> p.Fst:
    """Rewrites the phoneme assigned to a grapheme in context.

    In an alignment (gr=ph_in), changes the assigned phoneme, resulting in
    (gr=ph_out), based on the context such as post-nasal or word final.

    Following call
    ```
    _rw_ph_context('ans', 'nsl', 'm', pre_str=labial)
    ```
    would return:
    ```
    p.cdrewrite(p.cross('(ans=nsl)', '(ans=m)'),
                '',
                c.L_SIDE + labial + c.PH_END,
                c.SIGMA_STAR).optimize()
    ```

    Args:
      gr: grapheme, ans in the example
      in_ph: input phoneme, nsl in the example
      out_ph: output phoneme, m in the example
      post_str: preceding context, eg. if nasal, the rule is post-nasal
      pre_str: following context, eg. if labial, the rule is pre-labial

    Returns:
      Rewrite rule FST.
    """
    # Build the bracketed alignment strings, e.g. "(ans=nsl)" -> "(ans=m)".
    in_algn = c.L_BOUND + gr + c.ASSIGN + in_ph + c.R_BOUND
    out_algn = c.L_BOUND + gr + c.ASSIGN + out_ph + c.R_BOUND
    return p.cdrewrite(p.cross(in_algn, out_algn), post_str, pre_str,
                       c.SIGMA_STAR).optimize()
def get_alternative_formats():
    """
    Utils to get alternative formats for numbers.

    Returns:
        dict with two FSTs:
          'one_thousand_alternative': rewrites the leading word of an
              alternative cardinal phrasing at the start of the string;
          'separators': accepts/normalizes thousands separators
              (dot, space, or none), with weights preferring no delimiter.
    """
    one_alternatives = load_labels(
        get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    one_thousand_map = []
    for k in one_alternatives:
        default, alternative = k
        # Map the second word of the alternative phrasing to the full phrase.
        one_thousand_map.append((alternative.split()[1], alternative))
    one_thousand_map = pynini.string_map(one_thousand_map)
    # Only applies at the beginning of the string.
    one_thousand_alternative = pynini.cdrewrite(one_thousand_map, "[BOS]", "",
                                                NEMO_SIGMA)
    t = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    # Negative weight makes the no-delimiter reading the preferred one.
    separators = (pynutil.add_weight(t['dot_thousands'], 0.1)
                  | pynutil.add_weight(t['no_delimiter'], -0.1)
                  | pynutil.add_weight(t['space_thousands'], 0.1))
    alternative_formats = {}
    alternative_formats['one_thousand_alternative'] = one_thousand_alternative
    alternative_formats['separators'] = separators
    return alternative_formats
def _get_year_graph():
    """
    Transducer for year, only from 1000 - 2999
    e.g. 1290-> twelve nineteen
    2000 - 2009 will be verbalized as two thousand..
    """
    graph_ties = _get_ties_graph()
    graph = (
        # Two two-digit groups, e.g. "1290" -> "twelve ninety".
        graph_ties + insert_space + graph_ties
        # "##00" -> "<teen> hundred".
        | graph_teen + insert_space + pynini.cross("00", "hundred")
        # Decades ending in "0s", pluralized ("y" -> "ies", else append "s").
        | (graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten")) + pynutil.delete("0s"))
        @ pynini.cdrewrite(pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA)
        # "#00#" -> "<digit> thousand [<digit>]"; negative weight prefers
        # this reading when it applies (e.g. 2000-2009).
        | pynutil.add_weight(
            graph_digit
            + insert_space
            + pynini.cross("00", "thousand")
            + (pynutil.delete("0") | insert_space + graph_digit),
            weight=-0.001,
        )
    )
    # Restrict to 1###/2### (optionally with a trailing "s").
    graph = (pynini.union("1", "2") + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT +
             pynini.closure("s", 0, 1)) @ graph
    return graph
def get_quantity(decimal_graph: 'pynini.FstLike',
                 cardinal_graph: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
    e.g. 2 millones -> integer_part: "dos" quantity: "millones"
    e.g. 2,4 millones -> integer_part: "dos" fractional_part: "quatro" quantity: "millones"
    e.g. 2,400 millones -> integer_part: "dos mil cuatrocientos" fractional_part: "quatro" quantity: "millones"

    Args:
        decimal_graph: DecimalFST
        cardinal_graph: CardinalFST
    """
    # Accept up to six digits (strip any thousands separators first).
    numbers = pynini.closure(NEMO_DIGIT, 1, 6) @ cardinal_graph
    numbers = pynini.cdrewrite(pynutil.delete(cardinal_separator), "", "",
                               NEMO_SIGMA) @ numbers
    # Cardinal + quantity, e.g. `integer_part: "dos" quantity: "millones"`.
    res = (
        pynutil.insert("integer_part: \"")
        + numbers  # The cardinal we're passing only produces 'un' for one, so gender agreement is safe (all quantities are masculine). Limit to 10^6 power.
        + pynutil.insert("\"")
        + NEMO_SPACE
        + pynutil.insert("quantity: \"")
        + quantities
        + pynutil.insert("\""))
    # Decimal + quantity.
    res |= decimal_graph + NEMO_SPACE + pynutil.insert(
        "quantity: \"") + quantities + pynutil.insert("\"")
    return res
def __init__(self):
    """Builds cdrewrite rules that tag numeric cable attributes in text.

    Each `attr_map_N` maps a matched span to an attribute label; each
    `attr_map_N_s` applies it in a specific left/right context over
    `self.alphabet`. The composed rules are exposed via `self.rules`.
    """
    super().__init__()
    # Span-to-label transducers.
    attr_map_0 = pynini.transducer(self.cable_digits, '#жил')
    attr_map_1 = pynini.transducer(self.cable_digits, 'Длина_кабеля')
    attr_map_2 = pynini.transducer(self.cable_digits, 'Диаметр')
    attr_map_3 = pynini.transducer(self.cable_digits, '#соединительных_проводов')
    attr_map_4 = pynini.transducer(' . ', ' Диаметр ')
    # Wire count: digits between an n-gram and a splitter.
    attr_map_0_lc = self.ngram_comb
    attr_map_0_rc = self.cable_splitters
    attr_map_0_s = pynini.cdrewrite(attr_map_0, attr_map_0_lc, attr_map_0_rc, self.alphabet).optimize()
    # Connecting-wire count: digits between two splitters.
    attr_map_3_lc = self.cable_splitters
    attr_map_3_rc = self.cable_splitters
    attr_map_3_s = pynini.cdrewrite(attr_map_3, attr_map_3_lc, attr_map_3_rc, self.alphabet).optimize()
    # Diameter: digits between a float and a length marker.
    attr_map_2_lc = self.cable_floats
    attr_map_2_rc = self.cable_length_0
    attr_map_2_s = pynini.cdrewrite(attr_map_2, attr_map_2_lc, attr_map_2_rc, self.alphabet).optimize()
    # NOTE(review): this rule applies attr_map_2 (Диаметр) with the
    # attr_map_4 contexts rather than attr_map_4 itself -- looks like a
    # copy/paste slip, but may be intentional; confirm before changing.
    attr_map_4_lc = self.cable_splitters
    attr_map_4_rc = self.cable_floats
    attr_map_4_s = pynini.cdrewrite(attr_map_2, attr_map_4_lc, attr_map_4_rc, self.alphabet).optimize()
    # " . " between digits becomes " Диаметр ".
    attr_map_5_lc = self.cable_digits
    attr_map_5_rc = self.cable_digits
    attr_map_5_s = pynini.cdrewrite(attr_map_4, attr_map_5_lc, attr_map_5_rc, self.alphabet).optimize()
    # Cable length: digits between the two length markers.
    attr_map_6_rc = self.cable_length_0
    attr_map_6_lc = self.cable_length_1
    attr_map_6_s = pynini.cdrewrite(attr_map_1, attr_map_6_rc, attr_map_6_lc, self.alphabet).optimize()
    # Diameter pipeline: apply the three diameter-related rules in sequence.
    attr_map_comp_0 = pynini.compose(pynini.compose(attr_map_2_s, attr_map_4_s).optimize(), attr_map_5_s).optimize()
    self.rules = {
        'жилы': attr_map_0_s,
        'соединительные_провода': attr_map_3_s,
        'диаметр': attr_map_comp_0,
        'длина_кабеля': attr_map_6_s
    }
def __init__(self):
    """Builds cdrewrite rules tagging cable cross-section/length attributes.

    Each `attr_map_N` maps a matched span to an attribute label; each
    `attr_map_N_s` applies it in a specific left/right context over
    `self.alphabet`. The composed rules are exposed via `self.rules`.
    """
    super().__init__()
    # Span-to-label transducers.
    attr_map_0 = pynini.transducer(self.cable_digits, '#жил')
    attr_map_1 = pynini.transducer(self.cable_digits, 'Сечение_кабеля')
    attr_map_2 = pynini.transducer(self.cable_digits, 'Длина_кабеля')
    attr_map_3 = pynini.transducer('.', 'Сечение_кабеля')
    # Wire count: digits between a space and a splitter.
    attr_map_0_rc = self.cable_splitters
    attr_map_0_lc = pynini.union(" ")
    attr_map_0_s = pynini.cdrewrite(attr_map_0, attr_map_0_lc, attr_map_0_rc, self.alphabet).optimize()
    # Cross-section: digits between a splitter and a float.
    attr_map_1_rc = self.cable_splitters
    attr_map_1_lc = self.cable_floats
    attr_map_1_s = pynini.cdrewrite(attr_map_1, attr_map_1_rc, attr_map_1_lc, self.alphabet).optimize()
    # NOTE(review): this rule applies attr_map_1 (Сечение_кабеля) with the
    # attr_map_2 contexts, and attr_map_5_s below also applies attr_map_1 --
    # possibly intentional reuse, possibly a copy/paste slip; confirm.
    attr_map_2_rc = self.cable_floats
    attr_map_2_lc = self.cable_length_0
    attr_map_2_s = pynini.cdrewrite(attr_map_1, attr_map_2_rc, attr_map_2_lc, self.alphabet).optimize()
    # Cable length: digits between the two length markers.
    attr_map_3_rc = self.cable_length_0
    attr_map_3_lc = self.cable_length_1
    attr_map_3_s = pynini.cdrewrite(attr_map_2, attr_map_3_rc, attr_map_3_lc, self.alphabet).optimize()
    # "." between digits becomes the cross-section label.
    attr_map_4_rc = self.cable_digits
    attr_map_4_lc = self.cable_digits
    attr_map_4_s = pynini.cdrewrite(attr_map_3, attr_map_4_rc, attr_map_4_lc, self.alphabet).optimize()
    # Secondary cross-section rule (splitter / length-marker contexts).
    attr_map_5_rc = self.cable_splitters
    attr_map_5_lc = self.cable_length_0
    attr_map_5_s = pynini.cdrewrite(attr_map_1, attr_map_5_rc, attr_map_5_lc, self.alphabet).optimize()
    # Cross-section pipeline: apply the three related rules in sequence.
    attr_map_comp_0 = pynini.compose(pynini.compose(attr_map_1_s, attr_map_2_s).optimize(), attr_map_4_s).optimize()
    self.rules = {
        'жилы': attr_map_0_s,
        'сечение_кабеля_0': attr_map_comp_0,
        'длина_кабеля': attr_map_3_s,
        'сечение_кабеля_1': attr_map_5_s
    }
def _composed_typ() -> p.Fst:
    """Maps multiple ISO characters to single native characters.

    Each sub-rule rewrites a pair of adjacent typed tokens like '(a)(i)' into
    a single composed token like '(ai)', everywhere in the string; the final
    result composes all the sub-rules in sequence.
    """
    # Diphthongs: a+i / a+u.
    compose_diphthong_aux = (p.cross('(a)(i)', '(ai)')
                             | p.cross('(a)(u)', '(au)'))
    compose_diphthong = p.cdrewrite(compose_diphthong_aux, '', '',
                                    c.SIGMA_STAR).optimize()
    # Vocalic l/r.
    compose_vocalic_aux = (p.cross('(l)(vocal)', '(l_vocal)')
                           | p.cross('(r)(vocal)', '(r_vocal)'))
    compose_vocalic = p.cdrewrite(compose_vocalic_aux, '', '',
                                  c.SIGMA_STAR).optimize()
    # Long (retroflex) vocalics.
    compose_retroflex_vocalic_aux = (p.cross('(l_vocal)(long)', '(ll_vocal)')
                                     | p.cross('(r_vocal)(long)', '(rr_vocal)'))
    compose_retroflex_vocalic = p.cdrewrite(compose_retroflex_vocalic_aux,
                                            '', '', c.SIGMA_STAR).optimize()
    # Independent vowels: (ind) + vowel -> independent-vowel token.
    compose_ind_vowel_aux = (p.cross('(ind)(a)', '(a_i)')
                             | p.cross('(ind)(aa)', '(aa_i)')
                             | p.cross('(ind)(ac)', '(ac_i)')
                             | p.cross('(ind)(e)', '(e_i)')
                             | p.cross('(ind)(ee)', '(ee_i)')
                             | p.cross('(ind)(ec)', '(ec_i)')
                             | p.cross('(ind)(i)', '(i_i)')
                             | p.cross('(ind)(ii)', '(ii_i)')
                             | p.cross('(ind)(o)', '(o_i)')
                             | p.cross('(ind)(oo)', '(oo_i)')
                             | p.cross('(ind)(oc)', '(oc_i)')
                             | p.cross('(ind)(u)', '(u_i)')
                             | p.cross('(ind)(uu)', '(uu_i)')
                             | p.cross('(ind)(ai)', '(ai_i)')
                             | p.cross('(ind)(au)', '(au_i)')
                             | p.cross('(ind)(l_vocal)', '(l_vocal_i)')
                             | p.cross('(ind)(ll_vocal)', '(ll_vocal_i)')
                             | p.cross('(ind)(r_vocal)', '(r_vocal_i)')
                             | p.cross('(ind)(rr_vocal)', '(rr_vocal_i)'))
    compose_ind_vowel = p.cdrewrite(compose_ind_vowel_aux, '', '',
                                    c.SIGMA_STAR).optimize()
    # Aspirated consonants: consonant + (asp).
    compose_aspiration_aux = (p.cross('(b)(asp)', '(bh)')
                              | p.cross('(c)(asp)', '(ch)')
                              | p.cross('(d)(asp)', '(dh)')
                              | p.cross('(dd)(asp)', '(ddh)')
                              | p.cross('(g)(asp)', '(gh)')
                              | p.cross('(j)(asp)', '(jh)')
                              | p.cross('(k)(asp)', '(kh)')
                              | p.cross('(p)(asp)', '(ph)')
                              | p.cross('(rd)(asp)', '(rdh)')
                              | p.cross('(t)(asp)', '(th)')
                              | p.cross('(tt)(asp)', '(tth)'))
    compose_aspiration = p.cdrewrite(compose_aspiration_aux, '', '',
                                     c.SIGMA_STAR).optimize()
    # Candrabindu.
    compose_candra = p.cdrewrite(p.cross('(m)(candra)', '(cnd)'), '', '',
                                 c.SIGMA_STAR).optimize()
    # Malayalam chillu characters
    compose_chillu_aux = (p.cross('(k)(chl)', '(k_chl)')
                          | p.cross('(l)(chl)', '(l_chl)')
                          | p.cross('(ll)(chl)', '(ll_chl)')
                          | p.cross('(n)(chl)', '(n_chl)')
                          | p.cross('(nn)(chl)', '(nn_chl)')
                          | p.cross('(rr)(chl)', '(rr_chl)')
                          | p.cross('(r)(chl)', '(reph)'))
    compose_chillu = p.cdrewrite(compose_chillu_aux, '', '',
                                 c.SIGMA_STAR).optimize()
    # Marathi eyelash ra
    compose_eyelash = p.cdrewrite(p.cross('(r)(eye)', '(reye)'), '', '',
                                  c.SIGMA_STAR).optimize()
    # Om ligature.
    compose_om = p.cdrewrite(p.cross('(ot)(m)', '(om)'), '', '',
                             c.SIGMA_STAR).optimize()
    # Order matters: e.g. vocalics must compose before their long forms and
    # before independent-vowel composition.
    return (compose_diphthong @ compose_vocalic @ compose_retroflex_vocalic
            @ compose_ind_vowel @ compose_aspiration @ compose_candra
            @ compose_chillu @ compose_eyelash @ compose_om).optimize()
# Byte-level alphabet: all printable/extended bytes, with "[", "]" and "\"
# escaped because they are special in pynini string compilation.
chars = ([chr(i) for i in range(1, 91)] + ["\\[", "\\]", "\\\\"] +
         [chr(i) for i in range(94, 256)])
sigma_star = pynini.union(*chars).closure()
sigma_star.optimize()

input_string = "Do you have Camembert or Edam?"
# Do you have <cheese>Camembert</cheese> or <cheese>Edam</cheese>?
cheeses = ("Boursin", "Camembert", "Cheddar", "Edam", "Gruyere", "Ilchester",
           "Jarlsberg", "Red Leicester", "Stilton")
# Fixed: the expected output must keep the trailing "?" -- the rewrite only
# wraps cheese names and leaves all other text (including punctuation) intact,
# as the comment above also shows.
output_string = "Do you have <cheese>Camembert</cheese> or <cheese>Edam</cheese>?"

# Insert <cheese>...</cheese> tags around every cheese-name match.
fst_target = pynini.string_map(cheeses)
ltag = pynini.transducer("", "<cheese>")
rtag = pynini.transducer("", "</cheese>")
substitution = ltag + fst_target + rtag
rewrite = pynini.cdrewrite(substitution, "", "", sigma_star)
output = pynini.compose(input_string, rewrite).stringify()

#######################################################################################################################

# Maps plural measure words to singular; later (more general) alternatives
# only fire when the specific ones do not match.
singular_map = pynini.union(
    pynini.transducer("feet", "foot"),
    pynini.transducer("pence", "penny"),
    # Any sequence of bytes ending in "ches" strips the "es";
    # the last argument -1 is a "weight" that gives this analysis a higher priority, if it matches the input.
    sigma_star + pynini.transducer("ches", "ch", -1),
    # Any sequence of bytes ending in "s" strips the "s".
    sigma_star + pynini.transducer("s", ""))
def __init__(self, deterministic: bool = True):
    """Finite state transducer for classifying telephone numbers (Spanish).

    Accepts hyphen-separated 8/9/10-digit numbers spoken digit-by-digit
    (and, non-deterministically, in single/double-digit groupings), and tags
    the result as `number_part: "..."`.

    Args:
        deterministic: if True, only the all-single-digit readings are used.
    """
    super().__init__(name="telephone", kind="classify")
    # create `single_digits` and `double_digits` graphs as these will be
    # the building blocks of possible telephone numbers
    single_digits = pynini.invert(graph_digit).optimize() | pynini.cross(
        "0", "cero")
    # Two-digit readings: twenties, teens, round ties, or "<ties> y <digit>".
    double_digits = pynini.union(
        graph_twenties,
        graph_teen,
        (graph_ties + pynutil.delete("0")),
        (graph_ties + insert_space + pynutil.insert("y") + insert_space +
         graph_digit),
    )
    double_digits = pynini.invert(double_digits)
    # define `ten_digit_graph`, `nine_digit_graph`, `eight_digit_graph`
    # which produces telephone numbers spoken (1) only with single digits,
    # or (2) spoken with double digits (and sometimes single digits)
    # 10-digit option (1): all single digits
    ten_digit_graph = (pynini.closure(single_digits + insert_space, 3, 3) +
                       pynutil.delete("-") +
                       pynini.closure(single_digits + insert_space, 3, 3) +
                       pynutil.delete("-") +
                       pynini.closure(single_digits + insert_space, 3, 3) +
                       single_digits)
    # 9-digit option (1): all single digits
    nine_digit_graph = (
        pynini.closure(single_digits + insert_space, 3, 3) +
        pynutil.delete("-") +
        pynini.closure(single_digits + insert_space, 3, 3) +
        pynutil.delete("-") +
        pynini.closure(single_digits + insert_space, 2, 2) + single_digits)
    # 8-digit option (1): all single digits
    eight_digit_graph = (
        pynini.closure(single_digits + insert_space, 4, 4) +
        pynutil.delete("-") +
        pynini.closure(single_digits + insert_space, 3, 3) + single_digits)
    if not deterministic:
        # 10-digit option (2): (1+2) + (1+2) + (2+2) digits
        ten_digit_graph |= (single_digits + insert_space + double_digits +
                            insert_space + pynutil.delete("-") +
                            single_digits + insert_space + double_digits +
                            insert_space + pynutil.delete("-") +
                            double_digits + insert_space + double_digits)
        # 9-digit option (2): (1+2) + (1+2) + (1+2) digits
        nine_digit_graph |= (single_digits + insert_space + double_digits +
                             insert_space + pynutil.delete("-") +
                             single_digits + insert_space + double_digits +
                             insert_space + pynutil.delete("-") +
                             single_digits + insert_space + double_digits)
        # 8-digit option (2): (2+2) + (2+2) digits
        eight_digit_graph |= (double_digits + insert_space + double_digits +
                              insert_space + pynutil.delete("-") +
                              double_digits + insert_space + double_digits)
    number_part = pynini.union(ten_digit_graph, nine_digit_graph,
                               eight_digit_graph)
    # Normalize any apocopated "one" form back to "uno" in the spoken output.
    number_part @= pynini.cdrewrite(pynini.cross(ones, "uno"), "", "",
                                    NEMO_SIGMA)
    number_part = pynutil.insert(
        "number_part: \"") + number_part + pynutil.insert("\"")
    graph = number_part
    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
def Rewrite(rule: pynini.FstLike,
            sigma: pynini.Fst = byte.BYTE,
            left: pynini.FstLike = "",
            right: pynini.FstLike = "") -> pynini.Fst:
    """Convenience wrapper building an optimized context-dependent rewrite.

    Args:
      rule: the rewrite transducer to apply.
      sigma: acceptor whose closure serves as the rule's alphabet.
      left: left context acceptor (default: empty, i.e. anywhere).
      right: right context acceptor (default: empty, i.e. anywhere).

    Returns:
      An optimized cdrewrite FST over sigma.star.
    """
    rewrite_fst = pynini.cdrewrite(rule, left, right, sigma.star)
    return pynini.optimize(rewrite_fst)
def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst,
             deterministic: bool):
    """Finite state transducer for verbalizing measure tokens (Spanish).

    Combines cardinal/decimal/fraction sub-verbalizers with gendered units,
    inserting "de" where fraction-plus-unit phrasing requires it, and falls
    back to a weighted default path for alphanumeric units.

    Args:
        decimal: DecimalFst (provides graph_masc / graph_fem).
        cardinal: CardinalFst (provides graph_masc / graph_fem).
        fraction: FractionFst (provides graph_masc / graph_fem).
        deterministic: if True will provide a single transduction option.
    """
    super().__init__(name="measure", kind="verbalize", deterministic=deterministic)
    # Unwrap the gendered sub-graphs from their token wrappers.
    graph_decimal_masc = decimal.delete_tokens(decimal.graph_masc)
    graph_decimal_fem = decimal.delete_tokens(decimal.graph_fem)
    graph_cardinal_masc = cardinal.delete_tokens(cardinal.graph_masc)
    graph_cardinal_fem = cardinal.delete_tokens(cardinal.graph_fem)
    graph_fraction_fem = fraction.delete_tokens(fraction.graph_fem)
    graph_fraction_masc = fraction.delete_tokens(fraction.graph_masc)
    # Masculine units, optionally followed by a "por ..." rate phrase.
    unit_masc = (unit_plural_masc | unit_singular_masc) + pynini.closure(
        NEMO_WHITE_SPACE + "por" + pynini.closure(NEMO_NOT_QUOTE, 1), 0, 1)
    unit_masc |= "por" + pynini.closure(NEMO_NOT_QUOTE, 1)
    unit_masc = pynutil.delete("units: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) @ unit_masc) + pynutil.delete("\"")
    # Feminine units, same structure (no bare "por ..." variant).
    unit_fem = (unit_plural_fem | unit_singular_fem) + pynini.closure(
        NEMO_WHITE_SPACE + "por" + pynini.closure(NEMO_NOT_QUOTE, 1), 0, 1)
    unit_fem = pynutil.delete("units: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) @ unit_fem) + pynutil.delete("\"")
    # number + unit; fraction + "de" + unit.
    graph_masc = (graph_cardinal_masc | graph_decimal_masc) + NEMO_WHITE_SPACE + unit_masc
    graph_masc |= graph_fraction_masc + NEMO_WHITE_SPACE + pynutil.insert(
        "de ") + unit_masc
    graph_masc |= pynutil.add_weight(
        graph_fraction_masc @ (NEMO_SIGMA + pynini.union("medio", "medios")) +
        NEMO_WHITE_SPACE + unit_masc,
        -0.001)  # "medio litro" not "medio de litro"
    graph_fem = (graph_cardinal_fem | graph_decimal_fem) + NEMO_WHITE_SPACE + unit_fem
    graph_fem |= graph_fraction_fem + NEMO_WHITE_SPACE + pynutil.insert(
        "de ") + unit_fem
    graph_fem |= pynutil.add_weight(
        graph_fraction_fem @ (NEMO_SIGMA + pynini.union("media", "medias")) +
        NEMO_WHITE_SPACE + unit_fem,
        -0.001)
    graph = graph_masc | graph_fem
    # Append "de" inside a quantity field, e.g. "billones" -> "billones de".
    graph = (pynini.cdrewrite(
        pynutil.insert(" de"),
        "quantity: \"" + pynini.closure(NEMO_NOT_QUOTE, 1), "\"",
        NEMO_SIGMA) @ graph
    )  # billones de xyz
    # Restore "uno" before a "por" phrase.
    graph @= pynini.cdrewrite(pynini.cross(ones, "uno"), "",
                              NEMO_WHITE_SPACE + "por", NEMO_SIGMA)
    # To manage alphanumeric combinations ("a-8, 5x"), we let them use a weighted default path.
    alpha_num_unit = pynutil.delete("units: \"") + pynini.closure(
        NEMO_NOT_QUOTE) + pynutil.delete("\"")
    graph_alpha_num = pynini.union(
        (graph_cardinal_masc | graph_decimal_masc) + NEMO_SPACE + alpha_num_unit,
        alpha_num_unit + delete_extra_space +
        (graph_cardinal_masc | graph_decimal_masc),
    )
    graph |= pynutil.add_weight(graph_alpha_num, 0.01)
    graph += delete_preserve_order
    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
####Ensures regular path is default transducer_adessive_base.set_start(0) transducer_inessive_base.set_start(0) transducer_adessive_base.optimize() transducer_inessive_base.optimize() ######Morphophonemic Rules################ #Consonant Gradation rules. double_consonants_reduce = pynini.string_map([["kk", "k"], ["pp", "p"], ["tt", "t"], ["lk", "l"], ["t", "d"]]) ##Courtesy of http://www.lysator.liu.se/language/Languages/Finnish/Grammar.html and https://web.stanford.edu/~kiparsky/Papers/finnish.article.pdf consonant_reduction = pynini.cdrewrite(double_consonants_reduce, "l" | vowels | "n", vowels + suffixes, closure).optimize() #Vowel insertion to break consonant clusters caused by suffixes insertion = pynini.cdrewrite(pynini.transducer("", "e"), consonants, suffixes, closure).optimize() #Finnish seems to attempt preserving morae count with /s/ as a syllabic end. Generates a stop that assimilates 'highness' of vowel and becomes /k/ #In case this generated stop occurs after VV, it instead assimilates /s/ and becomes /t/. Then gradation occurs due to /e/ insertion #Similar situation seemed to occur with /s/ -> /a/ / /a/_ + suffix. So was added to transducer. final_stress_preservation = pynini.cdrewrite( pynini.transducer("s", "t"), vowels + (pynini.acceptor("y") | "u"), suffixes, closure) * pynini.cdrewrite( pynini.transducer("", "k"), pynini.acceptor("y") | "u", "s" + suffixes, closure) * pynini.cdrewrite(
def __init__(self, deterministic: bool = True):
    """Builds the verbalizer FST for time tokens.

    Consumes serialized fields such as `hours: "..." minutes: "..."
    suffix: "..." zone: "..."` and renders them as Spanish phrases, e.g.
    mapping an "am"/"pm" suffix onto day-period wordings like
    "de la mañana" / "de la tarde".

    Args:
        deterministic: if True, keep a single output per input; otherwise
            also admit alternative readings (the "menos" and "para las"
            styles, extra midday/midnight phrases).
    """
    super().__init__(name="time", kind="verbalize", deterministic=deterministic)
    # Rewrites minute wordings to their alternate forms over the whole string.
    # NOTE(review): `alt_minutes` is defined elsewhere in this module — its exact
    # contents (e.g. "media"/"cuarto" substitutions) are not visible here.
    change_minutes = pynini.cdrewrite(alt_minutes, pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA)
    # Day-period phrases keyed off the am/pm suffix value.
    morning_phrases = pynini.cross("am", "de la mañana")
    afternoon_phrases = pynini.cross("pm", "de la tarde")
    evening_phrases = pynini.cross("pm", "de la noche")
    # For the 12's
    mid_times = pynini.accep("doce")
    mid_phrases = (
        pynini.string_map([("pm", "del mediodía"), ("am", "de la noche")])
        if deterministic
        else pynini.string_map(
            [
                ("pm", "de la mañana"),
                ("pm", "del día"),
                ("pm", "del mediodía"),
                ("am", "de la noche"),
                ("am", "de la medianoche"),
            ]
        )
    )
    # Field extractors: delete the `field: "` wrapper and keep the quoted value.
    hour = (
        pynutil.delete("hours:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    minute = (
        pynutil.delete("minutes:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    # Deterministic mode forces the rewritten minute form; otherwise both the
    # literal and rewritten forms are accepted.
    minute = (minute @ change_minutes) if deterministic else pynini.union(minute, minute @ change_minutes)
    suffix = (
        pynutil.delete("suffix:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    zone = (
        pynutil.delete("zone:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    optional_zone = pynini.closure(delete_space + insert_space + zone, 0, 1)
    second = (
        pynutil.delete("seconds:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    # hours + minutes + seconds: "<h> horas <m> minutos y <s> segundos".
    graph_hms = (
        hour
        + pynutil.insert(" horas ")
        + delete_space
        + minute
        + pynutil.insert(" minutos y ")
        + delete_space
        + second
        + pynutil.insert(" segundos")
    )
    # hours + minutes ("<h> y <m>"), optionally followed by a day-period phrase
    # when the hour composes with the matching time-of-day acceptor.
    graph_hm = hour + delete_space + pynutil.insert(" y ") + minute
    graph_hm |= pynini.union(
        (hour @ morning_times)
        + delete_space
        + pynutil.insert(" y ")
        + minute
        + delete_space
        + insert_space
        + (suffix @ morning_phrases),
        (hour @ afternoon_times)
        + delete_space
        + pynutil.insert(" y ")
        + minute
        + delete_space
        + insert_space
        + (suffix @ afternoon_phrases),
        (hour @ evening_times)
        + delete_space
        + pynutil.insert(" y ")
        + minute
        + delete_space
        + insert_space
        + (suffix @ evening_phrases),
        (hour @ mid_times)
        + delete_space
        + pynutil.insert(" y ")
        + minute
        + delete_space
        + insert_space
        + (suffix @ mid_phrases),
    )
    # Bare hour, optionally followed by a day-period phrase.
    graph_h = pynini.union(
        hour,
        (hour @ morning_times) + delete_space + insert_space + (suffix @ morning_phrases),
        (hour @ afternoon_times) + delete_space + insert_space + (suffix @ afternoon_phrases),
        (hour @ evening_times) + delete_space + insert_space + (suffix @ evening_phrases),
        (hour @ mid_times) + delete_space + insert_space + (suffix @ mid_phrases),
    )
    graph = (graph_hms | graph_hm | graph_h) + optional_zone
    if not deterministic:
        # style "1": subtractive reading ("<h> menos <m>");
        # style "2": "<m> para las <h>" reading. The style field is deleted.
        graph_style_1 = pynutil.delete(" style: \"1\"")
        graph_style_2 = pynutil.delete(" style: \"2\"")
        graph_menos = hour + delete_space + pynutil.insert(" menos ") + minute + graph_style_1
        graph_menos |= (
            (hour @ morning_times)
            + delete_space
            + pynutil.insert(" menos ")
            + minute
            + delete_space
            + insert_space
            + (suffix @ morning_phrases)
            + graph_style_1
        )
        graph_menos |= (
            (hour @ afternoon_times)
            + delete_space
            + pynutil.insert(" menos ")
            + minute
            + delete_space
            + insert_space
            + (suffix @ afternoon_phrases)
            + graph_style_1
        )
        graph_menos |= (
            (hour @ evening_times)
            + delete_space
            + pynutil.insert(" menos ")
            + minute
            + delete_space
            + insert_space
            + (suffix @ evening_phrases)
            + graph_style_1
        )
        graph_menos |= (
            (hour @ mid_times)
            + delete_space
            + pynutil.insert(" menos ")
            + minute
            + delete_space
            + insert_space
            + (suffix @ mid_phrases)
            + graph_style_1
        )
        graph_menos += optional_zone
        graph_para = minute + pynutil.insert(" para las ") + delete_space + hour + graph_style_2
        graph_para |= (
            minute
            + pynutil.insert(" para las ")
            + delete_space
            + (hour @ morning_times)
            + delete_space
            + insert_space
            + (suffix @ morning_phrases)
            + graph_style_2
        )
        graph_para |= (
            minute
            + pynutil.insert(" para las ")
            + delete_space
            + (hour @ afternoon_times)
            + delete_space
            + insert_space
            + (suffix @ afternoon_phrases)
            + graph_style_2
        )
        graph_para |= (
            minute
            + pynutil.insert(" para las ")
            + delete_space
            + (hour @ evening_times)
            + delete_space
            + insert_space
            + (suffix @ evening_phrases)
            + graph_style_2
        )
        graph_para |= (
            minute
            + pynutil.insert(" para las ")
            + delete_space
            + (hour @ mid_times)
            + delete_space
            + insert_space
            + (suffix @ mid_phrases)
            + graph_style_2
        )
        graph_para += optional_zone
        graph_para @= pynini.cdrewrite(
            pynini.cross(" las ", " la "), "para", "una", NEMO_SIGMA
        )  # Need agreement with one
        graph |= graph_menos | graph_para
    delete_tokens = self.delete_tokens(graph + delete_preserve_order)
    self.fst = delete_tokens.optimize()
] gemination_map = [ ("s", "ss"), ("k", "kk"), ("t", "tt"), ] monograph_bos_map = [ ("じ", "dʑi"), ("ず", "dzɯ"), ("ぞ", "dzo"), ] G2P = ( cdrewrite(cross("は", "ɰɑ"), "[BOS]", "[EOS]", SIGMA_STAR) @ cdrewrite(string_map(digraph_bos_map), "[BOS]", "", SIGMA_STAR) @ cdrewrite(string_map(digraph_map), "", "", SIGMA_STAR) @ cdrewrite(string_map(monograph_bos_map), "[BOS]", "", SIGMA_STAR) @ cdrewrite(cross("ん", "n"), "ː", "", SIGMA_STAR) @ cdrewrite(cross("ん", "ɴ"), "", "", SIGMA_STAR) @ cdrewrite(string_map(long_vowel_map), "", "", SIGMA_STAR) @ cdrewrite(string_map(context_free_map), "", "", SIGMA_STAR) @ cdrewrite(string_map(nasalization_map), "", "ɴ", SIGMA_STAR) @ cdrewrite( string_map(devoicing_map), union(voiceless_consonants), union(voiceless_consonants, "[EOS]"), SIGMA_STAR, ) @ cdrewrite(string_map(gemination_map), sokuon, "", SIGMA_STAR)
#measure back_vowel = pynini.union("u", "o", "a") neutral_vowel = pynini.union("i", "e") front_vowel = pynini.union("y", "ö", "ä") vowel = pynini.union(back_vowel, neutral_vowel, front_vowel) archiphoneme = pynini.union("A", "I", "E", "O", "U") consonant = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z") sigma_star = pynini.union(vowel, consonant, archiphoneme).closure().optimize() adessive = "llA" intervener = pynini.union(consonant, neutral_vowel).closure() adessive_harmony = ( pynini.cdrewrite(pynini.transducer("A", "a"), back_vowel + intervener, "", sigma_star) * pynini.cdrewrite(pynini.t("A", "ä"), "", "", sigma_star)).optimize() def make_adessive(stem): return ((stem + adessive) * adessive_harmony).stringify() make_adessive("training") singular_map = pynini.union( pynini.transducer("feet", "foot"), pynini.transducer("pence", "penny"), # Any sequence of bytes ending in "ches" strips the "es"; # the last argument -1 is a "weight" that gives this analysis # a higher priority, if it matches the input.
def __init__(self, deterministic: bool = True):
    """Builds the verbalizer FST for fraction tokens.

    Consumes serialized `integer_part` / `numerator` / `denominator` fields
    (with optional `morphosyntactic_features`) and renders Spanish fraction
    phrases, handling "avo"-stem denominators, ordinal denominators, the
    plain "<num> sobre <den>" cardinal reading, and masculine/feminine
    agreement.

    Args:
        deterministic: if True, keep a single output per input; otherwise
            also admit alternative renderings (ordinal exceptions, the
            feminine 'parte(s)' construction, alternative spacing).
    """
    super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)
    # Derivational strings append 'avo' as a suffix. Adding space for processing aid
    fraction_stem = pynutil.insert(" avo")
    plural = pynutil.insert("s")
    conjunction = pynutil.insert(" y ")
    # Field extractors: delete the `field: "` wrapper and keep the quoted value.
    integer = (pynutil.delete("integer_part: \"") +
               strip_cardinal_apocope(pynini.closure(NEMO_NOT_QUOTE)) +
               pynutil.delete("\""))
    # Numerator "un" is kept separate so singular/plural agreement can differ.
    numerator_one = pynutil.delete("numerator: \"") + pynini.accep(
        "un") + pynutil.delete("\" ")
    numerator = (pynutil.delete("numerator: \"") +
                 pynini.difference(pynini.closure(NEMO_NOT_QUOTE), "un") +
                 pynutil.delete("\" "))
    # Denominator variants, distinguished by the morphosyntactic_features tag.
    denominator_add_stem = pynutil.delete("denominator: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) + fraction_stem +
        pynutil.delete("\" morphosyntactic_features: \"add_root\""))
    denominator_ordinal = pynutil.delete("denominator: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) +
        pynutil.delete("\" morphosyntactic_features: \"ordinal\""))
    denominator_cardinal = pynutil.delete("denominator: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\""))
    denominator_singular = pynini.union(denominator_add_stem, denominator_ordinal)
    if not deterministic:
        # Occasional exceptions
        denominator_singular |= denominator_add_stem @ pynini.string_map(
            [("once avo", "undécimo"), ("doce avo", "duodécimo")])
    denominator_plural = denominator_singular + plural
    # Merging operations
    merge = pynini.cdrewrite(
        pynini.cross(" y ", "i"), "", "", NEMO_SIGMA
    )  # The denominator must be a single word, with the conjunction "y" replaced by i
    merge @= pynini.cdrewrite(delete_space, "",
                              pynini.difference(NEMO_CHAR, "parte"), NEMO_SIGMA)
    # The merger can produce duplicate vowels. This is not allowed in orthography
    delete_duplicates = pynini.string_map([("aa", "a"), ("oo", "o")])  # Removes vowels
    delete_duplicates = pynini.cdrewrite(delete_duplicates, "", "", NEMO_SIGMA)
    # Drop written accents on the word preceding an "avo"/"ésimo"-type ending.
    # NOTE(review): `accents` is defined elsewhere in this module — presumably an
    # accented→unaccented vowel map; confirm against its definition.
    remove_accents = pynini.cdrewrite(
        accents,
        pynini.union(NEMO_SPACE, pynini.accep("[BOS]")) + pynini.closure(NEMO_NOT_SPACE),
        pynini.closure(NEMO_NOT_SPACE) + pynini.union("avo", "ava", "ésimo", "ésima"),
        NEMO_SIGMA,
    )
    merge_into_single_word = merge @ remove_accents @ delete_duplicates
    fraction_default = numerator + delete_space + insert_space + (
        denominator_plural @ merge_into_single_word)
    fraction_with_one = (numerator_one + delete_space + insert_space +
                         (denominator_singular @ merge_into_single_word))
    # Plain cardinal reading: "<num> sobre <den>".
    fraction_with_cardinal = strip_cardinal_apocope(numerator | numerator_one)
    fraction_with_cardinal += (
        delete_space + pynutil.insert(" sobre ") +
        strip_cardinal_apocope(denominator_cardinal))
    if not deterministic:
        # There is an alternative rendering where ordinals act as adjectives for 'parte'. This requires use of the feminine
        # Other rules will manage use of "un" at end, so just worry about endings
        exceptions = pynini.string_map([("tercia", "tercera")])
        apply_exceptions = pynini.cdrewrite(exceptions, "", "", NEMO_SIGMA)
        vowel_change = pynini.cdrewrite(pynini.cross("o", "a"), "",
                                        pynini.accep("[EOS]"), NEMO_SIGMA)
        denominator_singular_fem = shift_cardinal_gender(
            denominator_singular) @ vowel_change @ apply_exceptions
        denominator_plural_fem = denominator_singular_fem + plural
        numerator_one_fem = shift_cardinal_gender(numerator_one)
        numerator_fem = shift_cardinal_gender(numerator)
        fraction_with_cardinal |= (
            (numerator_one_fem | numerator_fem) + delete_space +
            pynutil.insert(" sobre ") + shift_cardinal_gender(denominator_cardinal))
        # Still need to manage stems
        merge_stem = pynini.cdrewrite(
            delete_space, "", pynini.union("avo", "ava", "avos", "avas"),
            NEMO_SIGMA)  # For managing alternative spacing
        merge_stem @= remove_accents @ delete_duplicates
        fraction_with_one_fem = numerator_one_fem + delete_space + insert_space
        fraction_with_one_fem += pynini.union(
            denominator_singular_fem @ merge_stem,
            denominator_singular_fem @ merge_into_single_word)  # Both forms exist
        fraction_with_one_fem += pynutil.insert(" parte")
        fraction_with_one_fem @= pynini.cdrewrite(
            pynini.cross("una media", "media"), "", "",
            NEMO_SIGMA)  # "media" not "una media"
        fraction_default_fem = numerator_fem + delete_space + insert_space
        fraction_default_fem += pynini.union(
            denominator_plural_fem @ merge_stem,
            denominator_plural_fem @ merge_into_single_word)
        fraction_default_fem += pynutil.insert(" partes")
        fraction_default |= (numerator + delete_space + insert_space +
                             denominator_plural @ merge_stem
                             )  # Case of no merger
        fraction_default |= fraction_default_fem
        fraction_with_one |= numerator_one + delete_space + insert_space + denominator_singular @ merge_stem
        fraction_with_one |= fraction_with_one_fem
    fraction_with_one @= pynini.cdrewrite(pynini.cross(
        "un medio", "medio"), "", "", NEMO_SIGMA)  # "medio" not "un medio"
    fraction = fraction_with_one | fraction_default | fraction_with_cardinal
    graph_masc = pynini.closure(integer + delete_space + conjunction, 0, 1) + fraction
    # Manage cases of fem gender (only shows on integer except for "medio")
    integer_fem = shift_cardinal_gender(integer)
    fraction_default |= (
        shift_cardinal_gender(numerator) + delete_space + insert_space +
        (denominator_plural @ pynini.cross("medios", "medias")))
    fraction_with_one |= (
        pynutil.delete(numerator_one) + delete_space +
        (denominator_singular @ pynini.cross("medio", "media")))
    fraction_fem = fraction_with_one | fraction_default | fraction_with_cardinal
    graph_fem = pynini.closure(integer_fem + delete_space + conjunction, 0, 1) + fraction_fem
    self.graph_masc = pynini.optimize(graph_masc)
    self.graph_fem = pynini.optimize(graph_fem)
    self.graph = graph_masc | graph_fem
    delete_tokens = self.delete_tokens(self.graph)
    self.fst = delete_tokens.optimize()
def __init__(self):
    """Builds the classifier FST for English cardinals.

    Transduces spoken cardinal expressions into serialized tokens of the
    form `negative: "-" integer: "<digits>"` (the negative field only when
    the input starts with "minus"), covering magnitudes up to sextillions.
    """
    super().__init__(name="cardinal", kind="classify")
    # Base lexicons: number word -> digit(s), loaded from TSV data files.
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
    # "hundred" itself contributes no digits; the digit before it does.
    graph_hundred = pynini.cross("hundred", "")
    # Three-digit group: hundreds digit (or pad "0"), then teens ("00" pad)
    # or ties+units (each padded with "0" when absent).
    graph_hundred_component = pynini.union(graph_digit + delete_space + graph_hundred, pynutil.insert("0"))
    graph_hundred_component += delete_space
    graph_hundred_component += pynini.union(
        graph_teen | pynutil.insert("00"),
        (graph_ties | pynutil.insert("0")) + delete_space + (graph_digit | pynutil.insert("0")),
    )
    # Restrict to groups containing at least one non-zero digit, so empty
    # magnitude slots cannot masquerade as real groups.
    graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
    )
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        graph_hundred_component_at_least_one_none_zero_digit
    )
    # Each magnitude either consumes its scale word or pads "000"; the pad
    # carries a small weight so spelled-out magnitudes are preferred.
    graph_thousands = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("thousand"),
        pynutil.insert("000", weight=0.1),
    )
    graph_million = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("million"),
        pynutil.insert("000", weight=0.1),
    )
    graph_billion = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("billion"),
        pynutil.insert("000", weight=0.1),
    )
    graph_trillion = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("trillion"),
        pynutil.insert("000", weight=0.1),
    )
    graph_quadrillion = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("quadrillion"),
        pynutil.insert("000", weight=0.1),
    )
    graph_quintillion = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("quintillion"),
        pynutil.insert("000", weight=0.1),
    )
    graph_sextillion = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("sextillion"),
        pynutil.insert("000", weight=0.1),
    )
    # Chain magnitudes from largest to smallest; "zero" is a standalone case.
    graph = pynini.union(
        graph_sextillion
        + delete_space
        + graph_quintillion
        + delete_space
        + graph_quadrillion
        + delete_space
        + graph_trillion
        + delete_space
        + graph_billion
        + delete_space
        + graph_million
        + delete_space
        + graph_thousands
        + delete_space
        + graph_hundred_component,
        graph_zero,
    )
    # Strip leading zeros introduced by the padding, keeping a lone "0".
    graph = graph @ pynini.union(
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"
    )
    # Words for 0-12 are excluded from the default graph below.
    labels_exception = [num_to_word(x) for x in range(0, 13)]
    graph_exception = pynini.union(*labels_exception)
    # Delete connective "and" between words, and require the input to start
    # with a letter before applying the number graph.
    graph = (
        pynini.cdrewrite(pynutil.delete("and"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA)
        @ (NEMO_ALPHA + NEMO_SIGMA)
        @ graph
    )
    self.graph_no_exception = graph
    # Subtract the 0-12 exception words from the accepted inputs.
    self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph
    # Optional leading "minus" becomes the negative field.
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"-\"") + NEMO_SPACE, 0, 1
    )
    final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
"n", "ɲ", "o", "p", "r", "ɾ", "s", "ʃ", "t", "u", "w", "x", "z") _sigma_star = pynini.union(_g, _p).closure().optimize() # Rules. _r1 = pynini.cdrewrite( pynini.string_map([ ("ch", "tʃ"), ("ll", "ʝ"), ("qu", "k"), ("j", "x"), ("ñ", "ɲ"), ("v", "b"), ("x", "s"), ("y", "j"), ("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u"), ("ü", "w"), ]), "", "", _sigma_star, ).optimize() _r2 = pynini.cdrewrite(pynutil.delete("h"), "", "", _sigma_star).optimize() _v = pynini.union("a", "e", "i", "o", "u") _r3 = pynini.cdrewrite(pynini.cross("r", "ɾ"), _v, _v, _sigma_star).optimize() _r4 = pynini.cdrewrite(pynini.cross("rr", "r"), "", "", _sigma_star).optimize() _r5 = pynini.cdrewrite(pynini.string_map([("c", "s"), ("g", "x")]), "",