def __init__(
    self,
    itn_cardinal_tagger: GraphFst,
    itn_decimal_tagger: GraphFst,
    itn_fraction_tagger: GraphFst,
    deterministic: bool = True,
):
    """Assemble the "measure" classifier and store it in ``self.fst``.

    Three alternatives are unioned: a cardinal, a decimal or a fraction, each
    optionally preceded by "minus" and followed by a unit expression.

    Args:
        itn_cardinal_tagger: supplies ``graph_no_exception``.
        itn_decimal_tagger: supplies ``final_graph_wo_negative``.
        itn_fraction_tagger: supplies ``graph``.
        deterministic: passed through to the parent constructor.
    """
    super().__init__(name="measure", kind="classify", deterministic=deterministic)

    # An input that is exactly "ein"/"eine" is rewritten to "eins" before the
    # cardinal grammar is applied.
    ein_to_eins = pynini.cdrewrite(
        pynini.cross(pynini.union("ein", "eine"), "eins"), "[BOS]", "[EOS]", NEMO_SIGMA
    )
    number = ein_to_eins @ itn_cardinal_tagger.graph_no_exception

    # singular -> abbr
    singular_abbr = pynini.invert(unit_singular)
    # plural -> abbr (via the inverted singular_to_plural mapping), or singular -> abbr
    any_abbr = (pynini.invert(singular_to_plural()) @ singular_abbr) | singular_abbr
    any_abbr = convert_space(any_abbr)
    singular_abbr = convert_space(singular_abbr)

    negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"true\"") + delete_extra_space,
        0,
        1,
    )

    # "pro <unit>" is emitted as "/<abbr>".
    per_unit = pynutil.insert("/") + pynutil.delete("pro") + delete_space + singular_abbr
    # "<unit> pro <unit>" carries a small extra weight.
    compound_unit = pynutil.add_weight(any_abbr + delete_space + per_unit, 0.01)
    units_field = (
        pynutil.insert("units: \"")
        + (any_abbr | per_unit | compound_unit)
        + pynutil.insert("\"")
    )

    decimal_measure = (
        pynutil.insert("decimal { ")
        + negative
        + itn_decimal_tagger.final_graph_wo_negative
        + pynutil.insert(" }")
        + delete_extra_space
        + units_field
    )

    # Fractions are emitted inside a "decimal { integer_part: ... }" wrapper.
    fraction_measure = (
        pynutil.insert("decimal { ")
        + negative
        + pynutil.insert("integer_part: \"")
        + itn_fraction_tagger.graph
        + pynutil.insert("\" }")
        + delete_extra_space
        + units_field
    )

    cardinal_measure = (
        pynutil.insert("cardinal { ")
        + negative
        + pynutil.insert("integer: \"")
        + number
        + pynutil.insert("\"")
        + pynutil.insert(" }")
        + delete_extra_space
        + units_field
    )

    tagged = self.add_tokens(cardinal_measure | decimal_measure | fraction_measure)
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Assemble the "measure" classifier and store it in ``self.fst``.

    Decimals and every cardinal other than "un"/"una"/"uno" are followed by a
    plural unit; "un"/"una"/"uno" is emitted as "1" and followed by a singular
    unit.

    Args:
        cardinal: supplies ``graph_no_exception``.
        decimal: supplies ``final_graph_wo_negative``.
    """
    super().__init__(name="measure", kind="classify")

    number = cardinal.graph_no_exception

    # singular -> abbr
    singular_abbr = pynini.invert(
        pynini.string_file(get_abs_path("data/measurements_singular.tsv"))
    )
    # plural -> abbr
    plural_abbr = pynini.invert(
        pynini.string_file(get_abs_path("data/measurements_plural.tsv"))
    )

    negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("menos", "\"true\"") + delete_extra_space,
        0,
        1,
    )

    singular_base = convert_space(singular_abbr)
    plural_base = convert_space(plural_abbr)
    # "por <unit>" is emitted as "/<abbr>".
    per_unit = (
        pynutil.insert("/")
        + pynutil.delete("por")
        + delete_space
        + convert_space(singular_abbr)
    )

    def units_field(base):
        # units: "<base>" | "/<abbr>" | "<base>/<abbr>" (the last one slightly penalised)
        compound = pynutil.add_weight(base + delete_space + per_unit, 0.01)
        return pynutil.insert("units: \"") + (base | per_unit | compound) + pynutil.insert("\"")

    singular_units = units_field(singular_base)
    plural_units = units_field(plural_base)

    def cardinal_measure(integer, units):
        # cardinal { [negative] integer: "<integer>" } <units>
        return (
            pynutil.insert("cardinal { ")
            + negative
            + pynutil.insert("integer: \"")
            + integer
            + pynutil.insert("\"")
            + pynutil.insert(" }")
            + delete_extra_space
            + units
        )

    decimal_measure = (
        pynutil.insert("decimal { ")
        + negative
        + decimal.final_graph_wo_negative
        + pynutil.insert(" }")
        + delete_extra_space
        + plural_units
    )

    many = cardinal_measure((NEMO_SIGMA - "un" - "una" - "uno") @ number, plural_units)
    one = cardinal_measure(
        pynini.cross("un", "1") | pynini.cross("una", "1") | pynini.cross("uno", "1"),
        singular_units,
    )
    cardinal_measures = many | one

    tagged = self.add_tokens(decimal_measure | cardinal_measures)
    self.fst = tagged.optimize()
def __init__(self, input_case: str, deterministic: bool = True):
    """Build the whitelist classifier; sets ``self.final_graph`` and ``self.fst``.

    Args:
        input_case: forwarded to the label loader (see the note inside it).
        deterministic: passed through to the parent constructor.
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
        rows = load_labels(get_abs_path(file))
        # NOTE(review): the first column is lower-cased for every value of
        # `input_case`; the parameter is kept only for call compatibility.
        lowered = [[row[0].lower()] + row[1:] for row in rows]
        return pynini.string_map(lowered)

    whitelist = _get_whitelist_graph(input_case)

    units = _get_whitelist_graph(input_case, file="data/measurements.tsv")
    # do not replace single letter units, like `м` or `°`:
    # inputs consisting of exactly one NEMO_ALPHA symbol are filtered out
    allowed_inputs = pynini.difference(pynini.project(units, "input"), NEMO_ALPHA)
    units = pynini.compose(allowed_inputs, units)

    whitelist |= units.optimize()
    # one or more TO_LATIN items, with a space inserted between consecutive items
    whitelist |= TO_LATIN + pynini.closure(pynutil.insert(" ") + TO_LATIN)

    self.final_graph = convert_space(whitelist)
    named = pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self, itn_cardinal_tagger: GraphFst, tn_fraction_verbalizer: GraphFst, deterministic: bool = True):
    """Build the "fraction" classifier from an inverted TN fraction verbalizer.

    Sets ``self.graph`` (inverted verbalizer composed with a field-stripping
    grammar) and ``self.fst`` (that graph wrapped in ``name: "..."``).

    Args:
        itn_cardinal_tagger: supplies ``graph_no_exception``.
        tn_fraction_verbalizer: supplies ``graph``, which is inverted here.
        deterministic: passed through to the parent constructor.
    """
    super().__init__(name="fraction", kind="classify", deterministic=deterministic)

    # NOTE(review): Fst.invert() is the in-place method, so this also changes
    # tn_fraction_verbalizer.graph itself — TODO confirm that is intended.
    tagger = tn_fraction_verbalizer.graph.invert().optimize()

    numbers = itn_cardinal_tagger.graph_no_exception

    def quoted_value(opening):
        # Strip `<opening>...\"` and keep the non-empty, quote-free content.
        return pynutil.delete(opening) + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    # `negative: "true" ` becomes a leading "-"
    sign = pynini.closure(
        pynutil.delete("negative: ") + pynini.cross("\"true\" ", "-"), 0, 1
    )
    integer = quoted_value("integer_part: \"") @ numbers
    numerator = quoted_value("numerator: \"") @ numbers
    # the denominator is prefixed with "/"
    denominator = pynutil.insert('/') + quoted_value("denominator: \"") @ numbers

    body = (
        pynini.closure(integer + pynini.accep(" "), 0, 1)
        + numerator
        + delete_space
        + denominator
    ).optimize()

    verbalizer = sign + body
    self.graph = tagger @ verbalizer

    named = pynutil.insert("name: \"") + convert_space(self.graph) + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self, tn_cardinal_tagger: GraphFst, deterministic: bool = True):
    """Build the "telephone" classifier and store it in ``self.fst``.

    The grammar is written in the written -> spoken direction for the pattern
    ``(0ddd) dddd-dddd`` (d = 1..9) and then inverted.

    Args:
        tn_cardinal_tagger: supplies ``two_digit_non_zero``.
        deterministic: passed through to the parent constructor.
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    separator = pynini.accep(" ")  # between components
    non_zero_digits = [str(d) for d in range(1, 10)]
    digit = pynini.union(*non_zero_digits) @ tn_cardinal_tagger.two_digit_non_zero
    zero = pynini.cross("0", "null")

    # "(0ddd)": the leading zero is spoken as "null", followed by three digits
    area_code = (
        pynutil.delete("(")
        + zero
        + insert_space
        + pynini.closure(digit + insert_space, 2, 2)
        + digit
        + pynutil.delete(")")
    )
    # two groups of four digits, joined by "-" in the written form
    first_group = pynini.closure(digit + insert_space, 3, 3) + digit
    second_group = pynini.closure(digit + insert_space, 3, 3) + digit

    written_to_spoken = (
        area_code
        + separator
        + first_group
        + pynutil.delete("-")
        + insert_space
        + second_group
    )

    spoken_to_written = convert_space(pynini.invert(written_to_spoken))
    named = pynutil.insert("name: \"") + spoken_to_written + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """Build the whitelist classifier; sets ``self.graph``, ``self.final_graph`` and ``self.fst``.

    Args:
        input_case: when equal to "lower_cased" the first column of every
            loaded row is lower-cased.
        deterministic: when False, extra alternatives are unioned in.
        input_file: optional path to a user-provided whitelist. In
            deterministic mode it replaces the default whitelist; otherwise
            it is added to it.
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file):
        rows = load_labels(file)
        if input_case == "lower_cased":
            rows = [[row[0].lower()] + row[1:] for row in rows]
        return pynini.string_map(rows)

    default_path = get_abs_path("data/whitelist.tsv")
    graph = _get_whitelist_graph(input_case, default_path)

    if not deterministic and input_case != "lower_cased":
        # also accept the lower-cased keys, at a small cost
        lowered = _get_whitelist_graph("lower_cased", get_abs_path("data/whitelist.tsv"))
        graph |= pynutil.add_weight(lowered, weight=0.0001)

    if input_file:
        provided = _get_whitelist_graph(input_case, input_file)
        graph = provided if deterministic else graph | provided

    if not deterministic:
        graph |= _get_whitelist_graph(
            input_case, file=get_abs_path("data/measure/measurements.tsv")
        )

    self.graph = graph
    self.final_graph = convert_space(self.graph).optimize()
    named = pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self, input_case: str, deterministic: bool = True):
    """Build the whitelist classifier; sets ``self.graph`` and ``self.fst``.

    Args:
        input_case: when equal to "lower_cased" the keys of the default
            whitelist are lower-cased.
        deterministic: when False, the lower-cased whitelist and the
            alternatives file are unioned in as well.
    """
    super().__init__(name="whitelist", kind="classify")

    def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
        rows = load_labels(get_abs_path(file))
        lower_keys = input_case == "lower_cased"
        # rows are unpacked as (key, value) pairs
        pairs = [(key.lower() if lower_keys else key, value) for key, value in rows]
        return pynini.string_map(pairs)

    def _get_whitelist_non_deterministic_graph(file="data/whitelist_alternatives.tsv"):
        rows = load_labels(get_abs_path(file))
        # each pair is offered fully lower-cased and as written
        lowered = [(key.lower(), value.lower()) for key, value in rows]
        as_written = [(key, value) for key, value in rows]
        return pynini.string_map(lowered + as_written)

    graph = _get_whitelist_graph(input_case)
    if not deterministic:
        extras = _get_whitelist_graph("lower_cased") | _get_whitelist_non_deterministic_graph()
        graph |= extras

    self.graph = convert_space(graph).optimize()
    named = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self, deterministic: bool = True):
    """Build the "word" classifier; sets ``self.graph`` and ``self.fst``.

    A word is one or more non-space symbols that are neither digits nor one
    of the listed currency/special symbols. Bracketed phoneme sequences take
    priority over the plain word reading.

    Args:
        deterministic: when False, one optional space is also accepted right
            after "[" and right before "]".
    """
    super().__init__(name="word", kind="classify", deterministic=deterministic)

    excluded = (pynini.union("$", "€", "₩", "£", "¥", "#", "%") | NEMO_DIGIT).optimize()
    plain_word = pynini.closure(pynini.difference(NEMO_NOT_SPACE, excluded), 1)

    # leave phones of format [HH AH0 L OW1] untouched
    phone = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
    open_bracket = pynini.accep(pynini.escape("["))
    close_bracket = pynini.accep(pynini.escape("]"))
    phones = pynini.closure(phone + pynini.accep(" ")) + phone

    if deterministic:
        phoneme = open_bracket + phones + close_bracket
    else:
        phoneme = (
            open_bracket
            + pynini.closure(pynini.accep(" "), 0, 1)
            + phones
            + pynini.closure(pynini.accep(" "), 0, 1)
            + close_bracket
        )

    self.graph = plurals._priority_union(convert_space(phoneme), plain_word, NEMO_SIGMA)
    named = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """Build the whitelist classifier; sets ``self.final_graph`` and ``self.fst``.

    Args:
        input_case: forwarded to the label loader (see the note inside it).
        deterministic: passed through to the parent constructor.
        input_file: optional path to a whitelist that replaces the default one.
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file):
        rows = load_labels(file)
        # NOTE(review): the first column is lower-cased for every value of
        # `input_case`; the parameter is kept only for call compatibility.
        lowered = [[row[0].lower()] + row[1:] for row in rows]
        return pynini.string_map(lowered)

    whitelist = _get_whitelist_graph(input_case, get_abs_path("data/whitelist.tsv"))
    if input_file:
        whitelist = _get_whitelist_graph(input_case, input_file)

    units = _get_whitelist_graph(input_case, file=get_abs_path("data/measurements.tsv"))
    # Single-letter units like `м` are not replaced, while `°` and `%` are:
    # only inputs of two or more characters, or a single character outside
    # RU_ALPHA, pass the filter.
    allowed_inputs = NEMO_CHAR ** (2, ...) | pynini.difference(NEMO_CHAR, RU_ALPHA)
    units = pynini.compose(allowed_inputs, units)

    whitelist |= units.optimize()
    # one or more TO_CYRILLIC items, with a space inserted between consecutive items
    whitelist |= TO_CYRILLIC + pynini.closure(pynutil.insert(" ") + TO_CYRILLIC)

    self.final_graph = convert_space(whitelist)
    named = pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self):
    """Build the whitelist classifier from the inverted ``data/whitelist.tsv`` mapping."""
    super().__init__(name="whitelist", kind="classify")

    # The file mapping is inverted so its second column becomes the input side.
    mapping = pynini.invert(pynini.string_file(get_abs_path("data/whitelist.tsv")))

    named = pynutil.insert("name: \"") + convert_space(mapping) + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """Build the "money" classifier and store it in ``self.fst``.

    The currency symbol is read first; the amount "1" takes the singular
    currency name, every other integer and all decimals take the plural one
    (a decimal is additionally accepted with the singular "1" reading in front).

    Args:
        cardinal: supplies ``graph``.
        decimal: supplies ``final_graph_wo_negative``.
        deterministic: when False, ``get_hundreds_graph`` readings are also
            accepted for the integer part.
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)

    number = cardinal.graph
    decimal_number = decimal.final_graph_wo_negative

    currency_map = pynini.string_file(get_abs_path("data/currency/currency.tsv"))
    plural_name = convert_space(currency_map @ SINGULAR_TO_PLURAL)
    singular_name = convert_space(currency_map)

    singular_currency = pynutil.insert("currency: \"") + singular_name + pynutil.insert("\"")
    plural_currency = pynutil.insert("currency: \"") + plural_name + pynutil.insert("\"")

    # exactly "1" -> singular currency, integer_part "one"
    one_unit = (
        singular_currency
        + pynutil.insert(" integer_part: \"")
        + pynini.cross("1", "one")
        + pynutil.insert("\"")
    )

    if deterministic:
        integer_reader = number
    else:
        integer_reader = get_hundreds_graph(deterministic) | number

    # anything but the bare "1" -> plural currency
    many_units = (
        plural_currency
        + pynutil.insert(" integer_part: \"")
        + ((NEMO_SIGMA - "1") @ integer_reader)
        + pynutil.insert("\"")
    )

    decimal_amount = plural_currency + insert_space + decimal_number
    decimal_amount |= one_unit + insert_space + decimal_number
    integer_amount = many_units | one_unit

    tagged = self.add_tokens(integer_amount | decimal_amount)
    self.fst = tagged.optimize()
def __init__(self, tn_whitelist_tagger: GraphFst, deterministic: bool = True):
    """Build the whitelist classifier by inverting the TN whitelist graph.

    Args:
        tn_whitelist_tagger: supplies ``graph``; an inverted copy is used, the
            attribute itself is not modified.
        deterministic: passed through to the parent constructor.
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    inverted = pynini.invert(tn_whitelist_tagger.graph)
    named = pynutil.insert("name: \"") + convert_space(inverted) + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self, itn_cardinal_tagger: GraphFst, itn_decimal_tagger: GraphFst, deterministic: bool = True):
    """Build the "money" classifier and store it in ``self.fst``.

    Accepted shapes: integer + currency with an optional cents part,
    decimal + currency, and cents on their own (emitted with currency "€"
    and integer part "0").

    Args:
        itn_cardinal_tagger: supplies ``graph_no_exception``.
        itn_decimal_tagger: supplies ``final_graph_wo_negative``.
        deterministic: passed through to the parent constructor.
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)

    # An input that is exactly "ein"/"eine" is rewritten to "eins" before the
    # cardinal grammar is applied.
    ein_to_eins = pynini.cdrewrite(
        pynini.cross(pynini.union("ein", "eine"), "eins"), "[BOS]", "[EOS]", NEMO_SIGMA
    )
    number = ein_to_eins @ itn_cardinal_tagger.graph_no_exception
    decimal_number = itn_decimal_tagger.final_graph_wo_negative

    currency = (
        pynutil.insert("currency: \"")
        + convert_space(pynini.invert(maj_singular))
        + pynutil.insert("\"")
    )

    # two digits pass through, a single digit gets a leading "0"
    two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
    minor_unit = pynini.project(min_singular | min_plural, "output")

    # elf euro (und) vier cent, vier cent
    cents_standalone = (
        pynutil.insert("fractional_part: \"")
        + number @ two_digits
        + delete_space
        + pynutil.delete(minor_unit)
        + pynutil.insert("\"")
    )
    optional_und = pynini.closure(pynutil.delete("und") + delete_space, 0, 1)
    optional_cents_standalone = pynini.closure(
        delete_space + optional_und + insert_space + cents_standalone, 0, 1
    )

    # elf euro vierzig, only after integer
    optional_cents_suffix = pynini.closure(
        delete_extra_space
        + pynutil.insert("fractional_part: \"")
        + pynutil.add_weight(number @ two_digits, -0.7)
        + pynutil.insert("\""),
        0,
        1,
    )

    integer_amount = (
        pynutil.insert("integer_part: \"")
        + number
        + pynutil.insert("\"")
        + delete_extra_space
        + currency
        + (optional_cents_standalone | optional_cents_suffix)
    )

    decimal_amount = decimal_number + delete_extra_space + currency
    decimal_amount |= (
        pynutil.insert("currency: \"€\" integer_part: \"0\" ") + cents_standalone
    )

    tagged = self.add_tokens(integer_amount | decimal_amount)
    self.fst = tagged.optimize()
def __init__(
    self,
    itn_cardinal_tagger: GraphFst,
    tn_date_tagger: GraphFst,
    tn_date_verbalizer: GraphFst,
    deterministic: bool = True,
):
    """Build the "date" classifier from inverted TN date grammars; sets ``self.fst``.

    Args:
        itn_cardinal_tagger: supplies ``graph_no_exception``.
        tn_date_tagger: supplies ``month_abbr`` and ``year``.
        tn_date_verbalizer: supplies ``graph``.
        deterministic: passed through to the parent constructor.
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    numbers = itn_cardinal_tagger.graph_no_exception

    # two digits pass through, a single digit gets a leading "0"
    two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
    # any symbol is kept; a space may instead be deleted at a small cost
    optional_delete_space = pynini.closure(NEMO_SIGMA | pynutil.delete(" ", weight=0.0001))

    # NOTE(review): Fst.invert() is the in-place method, so the .invert() calls
    # on tn_date_verbalizer.graph and tn_date_tagger.month_abbr also change
    # those attributes — TODO confirm that is intended.
    tagger = tn_date_verbalizer.graph.invert().optimize()

    def quoted_value(opening):
        # Strip `<opening>...\"` and keep the non-empty, quote-free content.
        return pynutil.delete(opening) + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    day = quoted_value("day: \"") @ numbers
    month_as_number = pynutil.delete("month: \"") + numbers + pynutil.delete("\"")
    month_as_string = (
        pynutil.delete("month: \"") + tn_date_tagger.month_abbr.invert() + pynutil.delete("\"")
    )

    convert_year = (tn_date_tagger.year @ optional_delete_space).invert().optimize()
    year = quoted_value("year: \"") @ convert_year

    # day. month as string (year)
    with_month_name = (
        pynini.closure(day + pynutil.insert(".") + pynini.accep(" "), 0, 1)
        + month_as_string
        + pynini.closure(pynini.accep(" ") + year, 0, 1)
    )

    # day. month as number (year)
    with_month_number = (
        day @ two_digits
        + pynutil.insert(".")
        + pynutil.delete(" ")
        + month_as_number @ two_digits
        + pynutil.insert(".")
        + pynini.closure(pynutil.delete(" ") + year, 0, 1)
    )

    verbalizer = with_month_name
    verbalizer |= with_month_number
    # year
    verbalizer |= year

    date_graph = tagger @ verbalizer
    named = pynutil.insert("name: \"") + convert_space(date_graph) + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self, deterministic: bool = True):
    """Build the "word" classifier; sets ``self.graph`` and ``self.fst``.

    A word is one or more non-space symbols that are not inputs of the
    punctuation grammar. Bracketed phoneme sequences take priority over the
    plain word reading.

    Args:
        deterministic: when False, strings made of a currency/special symbol
            followed by one or more digits are removed from the word language
            before it is closed again.
    """
    super().__init__(name="word", kind="classify", deterministic=deterministic)

    punct = PunctuationFst().graph
    self.graph = pynini.closure(pynini.difference(NEMO_NOT_SPACE, punct.project("input")), 1)

    if not deterministic:
        # "$" used to be listed twice in this union; a union is idempotent, so
        # dropping the duplicate leaves the accepted language unchanged.
        symbol_then_digits = pynini.union("$", "€", "₩", "£", "¥", "#", "%") + pynini.closure(
            NEMO_DIGIT, 1
        )
        self.graph = pynini.closure(pynini.difference(self.graph, symbol_then_digits), 1)

    # leave phones of format [HH AH0 L OW1] untouched
    phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
    phoneme = (
        pynini.accep(pynini.escape("["))
        + pynini.closure(phoneme_unit + pynini.accep(" "))
        + phoneme_unit
        + pynini.accep(pynini.escape("]"))
    )

    self.graph = plurals._priority_union(convert_space(phoneme), self.graph, NEMO_SIGMA)
    self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
def __init__(
    self, time: GraphFst, date: GraphFst, cardinal: GraphFst, deterministic: bool = True, lm: bool = False
):
    """Build the "range" classifier; sets ``self.graph`` and ``self.fst``.

    Covers time ranges, year ranges and simple arithmetic-like expressions
    between cardinals ("+", "...", and in non-deterministic/LM mode also
    "-", ":", "x", "*", "/" and the "No." prefix).

    Args:
        time: used directly with FST operators (concatenation).
        date: used directly with FST operators (composition).
        cardinal: supplies ``graph_with_and``.
        deterministic: passed to the parent constructor; when False the
            extended set of expressions is enabled.
        lm: when True the extended set of expressions is enabled as well.
    """
    super().__init__(name="range", kind="classify", deterministic=deterministic)

    optional_space = pynini.closure(pynutil.delete(" "), 0, 1)
    number = cardinal.graph_with_and
    approx = pynini.cross("~", "approximately")
    dash_to = optional_space + pynini.cross("-", " to ") + optional_space

    # TIME
    time_range = time + dash_to + time
    self.graph = time_range | (approx + time)

    # YEAR
    optional_s = pynini.closure(pynini.accep("s"), 0, 1)
    year_four_digit = (NEMO_DIGIT ** 4 + optional_s) @ date
    year_two_digit = (NEMO_DIGIT ** 2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
    year_range = (
        year_four_digit
        + dash_to
        + (year_four_digit | year_two_digit | (NEMO_DIGIT ** 2 @ number))
    )
    self.graph |= year_range

    def chained(written, spoken):
        # number (<written -> spoken> number)+
        return number + pynini.closure(pynini.cross(written, spoken) + number, 1)

    # ADDITION
    expressions = chained("+", " plus ")
    expressions |= chained(" + ", " plus ")
    expressions |= approx + number
    expressions |= number + (pynini.cross("...", " ... ") | pynini.accep(" ... ")) + number

    if not deterministic or lm:
        # cardinal ----
        number_dash_number = (
            number
            + optional_space
            + pynini.cross("-", pynini.union(" to ", " minus "))
            + optional_space
            + number
        )
        number_colon_number = (
            number + optional_space + pynini.cross(":", " to ") + optional_space + number
        )
        expressions |= number_dash_number | number_colon_number

        # MULTIPLY
        for sign in (" x ", "x"):
            expressions |= chained(sign, pynini.union(" by ", " times "))

        for sign in ("*", " * "):
            expressions |= chained(sign, " times ")

        # supports "No. 12" -> "Number 12"
        number_word = pynini.cross(pynini.union("NO", "No"), "Number") | pynini.cross(
            "no", "number"
        )
        expressions |= number_word + pynini.closure(pynini.union(". ", " "), 0, 1) + number

        for sign in ("/", " / "):
            expressions |= chained(sign, " divided by ")

    self.graph |= expressions
    self.graph = self.graph.optimize()

    named = pynutil.insert("name: \"") + convert_space(self.graph).optimize() + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self):
    """Build the "time" classifier and store it in ``self.fst``.

    Accepted shapes (spoken Spanish -> tagged fields):
      * hour + minutes            ("las nueve y veinticinco")
      * minutes-to + hour         ("un cuarto para las cinco")
      * hour + "menos" + minutes  ("las diez menos diez")
      * hour + suffix             ("las nueve a eme"), emitted with minutes "00"

    The first three may be followed by an optional suffix; an on-the-hour
    time is only accepted when a suffix is present.
    """
    super().__init__(name="time", kind="classify")

    suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
    time_to_graph = pynini.string_file(get_abs_path("data/time/time_to.tsv"))

    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
    graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv"))

    graph_1_to_100 = pynini.union(
        graph_digit,
        graph_twenties,
        graph_teen,
        (graph_ties + pynutil.insert("0")),
        (graph_ties + pynutil.delete(" y ") + graph_digit),
    )

    # note that graph_hour will start from 2 hours
    # "1 o'clock" will be treated differently because it
    # is singular
    digits_2_to_23 = [str(digits) for digits in range(2, 24)]
    digits_1_to_59 = [str(digits) for digits in range(1, 60)]

    graph_1oclock = pynini.cross("la una", "la 1")
    graph_hour = pynini.cross("las ", "las ") + graph_1_to_100 @ pynini.union(*digits_2_to_23)
    graph_minute = graph_1_to_100 @ pynini.union(*digits_1_to_59)
    graph_minute_verbose = pynini.cross("media", "30") | pynini.cross("cuarto", "15")

    final_graph_hour = (
        pynutil.insert("hours: \"") + (graph_1oclock | graph_hour) + pynutil.insert("\"")
    )

    # an optional connective "y"/"con" before the minutes is dropped
    final_graph_minute = (
        pynutil.insert("minutes: \"")
        + pynini.closure((pynutil.delete("y") | pynutil.delete("con")) + delete_space, 0, 1)
        + (graph_minute | graph_minute_verbose)
        + pynutil.insert("\"")
    )

    final_suffix = pynutil.insert("suffix: \"") + convert_space(suffix_graph) + pynutil.insert("\"")
    final_suffix_optional = pynini.closure(delete_space + insert_space + final_suffix, 0, 1)

    # las nueve a eme (only convert on-the-hour times if they are followed by a suffix)
    graph_hsuffix = (
        final_graph_hour
        + delete_extra_space
        + pynutil.insert("minutes: \"00\"")
        + insert_space
        + final_suffix
    )

    # las nueve y veinticinco
    graph_hm = final_graph_hour + delete_extra_space + final_graph_minute

    # un cuarto para las cinco
    graph_mh = (
        pynutil.insert("minutes: \"")
        + pynini.union(
            pynini.cross("un cuarto para", "45"),
            pynini.cross("cuarto para", "45"),
        )
        + pynutil.insert("\"")
        + delete_extra_space
        + pynutil.insert("hours: \"")
        + time_to_graph
        + pynutil.insert("\"")
    )

    # las diez menos diez
    # The emitted minutes are 60 minus the spoken amount, so
    # "menos veinticinco" is 35 (this entry previously emitted "30").
    graph_time_to = (
        pynutil.insert("hours: \"")
        + time_to_graph
        + pynutil.insert("\"")
        + delete_extra_space
        + pynutil.insert("minutes: \"")
        + delete_space
        + pynutil.delete("menos")
        + delete_space
        + pynini.union(
            pynini.cross("cinco", "55"),
            pynini.cross("diez", "50"),
            pynini.cross("cuarto", "45"),
            pynini.cross("veinte", "40"),
            pynini.cross("veinticinco", "35"),
        )
        + pynutil.insert("\"")
    )

    final_graph = pynini.union(
        (graph_hm | graph_mh | graph_time_to) + final_suffix_optional, graph_hsuffix
    ).optimize()

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Build the "money" classifier and store it in ``self.fst``.

    Emitted fields: integer_part, fractional_part, currency and, whenever a
    cents part is produced, morphosyntactic_features set to ",".

    Args:
        cardinal: supplies ``graph_no_exception``.
        decimal: supplies ``final_graph_wo_negative``.
    """
    super().__init__(name="money", kind="classify")

    number = cardinal.graph_no_exception
    decimal_number = decimal.final_graph_wo_negative

    singular_names = pynini.invert(pynini.string_file(get_abs_path("data/currency_singular.tsv")))
    plural_names = pynini.invert(pynini.string_file(get_abs_path("data/currency_plural.tsv")))

    singular_currency = (
        pynutil.insert("currency: \"") + convert_space(singular_names) + pynutil.insert("\"")
    )
    plural_currency = (
        pynutil.insert("currency: \"") + convert_space(plural_names) + pynutil.insert("\"")
    )

    # two digits pass through, a single digit gets a leading "0"
    two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

    # twelve dollars (and) fifty cents, zero cents
    plural_cents = (
        pynutil.add_weight(((NEMO_SIGMA - "un") @ number), -0.7) @ two_digits
        + delete_space
        + pynutil.delete(pynini.union("centavos", "céntimos"))
    )
    single_cent = (
        pynini.cross("un", "01")
        + delete_space
        + pynutil.delete(pynini.union("centavo", "céntimo"))
    )
    cents_standalone = (
        pynutil.insert("morphosyntactic_features: \",\"")  # always use a comma in the decimal
        + insert_space
        + pynutil.insert("fractional_part: \"")
        + pynini.union(plural_cents, single_cent)
        + pynutil.insert("\"")
    )

    optional_cents_standalone = pynini.closure(
        delete_space
        + pynini.closure((pynutil.delete("con") | pynutil.delete('y')) + delete_space, 0, 1)
        + insert_space
        + cents_standalone,
        0,
        1,
    )

    # twelve dollars fifty, only after integer
    # setenta y cinco dólares con sesenta y tres~$75,63
    optional_cents_suffix = pynini.closure(
        delete_extra_space
        + pynutil.insert("morphosyntactic_features: \",\"")  # always use a comma in the decimal
        + insert_space
        + pynutil.insert("fractional_part: \"")
        + pynini.closure((pynutil.delete("con") | pynutil.delete('y')) + delete_space, 0, 1)
        + pynutil.add_weight(number @ two_digits, -0.7)
        + pynutil.insert("\""),
        0,
        1,
    )

    def integer_amount(integer, currency):
        # integer_part: "<integer>" <currency> [cents]
        return (
            pynutil.insert("integer_part: \"")
            + integer
            + pynutil.insert("\"")
            + delete_extra_space
            + currency
            + (optional_cents_standalone | optional_cents_suffix)
        )

    # every amount except "un"/"una" takes the plural currency name
    graph_integer = integer_amount((NEMO_SIGMA - "un" - "una") @ number, plural_currency)
    graph_integer |= integer_amount(
        pynini.cross("un", "1") | pynini.cross("una", "1"), singular_currency
    )

    graph_decimal = decimal_number + delete_extra_space + plural_currency
    # cents on their own are emitted with currency "$" and integer part "0"
    graph_decimal |= (
        pynutil.insert("currency: \"$\" integer_part: \"0\" ") + cents_standalone
    )

    tagged = self.add_tokens(graph_integer | graph_decimal)
    self.fst = tagged.optimize()
def __init__(self, deterministic: bool = True):
    """Build the "time" classifier and store it in ``self.fst``.

    Accepted written shapes, each followed by the "Uhr" suffix and an
    optional time zone:
      * ``HH:MM``     (minutes "00" are dropped)
      * ``HH:MM:SS``  (emitted with ``preserve_order: true``)
      * ``H`` / ``HH`` on its own

    Args:
        deterministic: passed through to the parent constructor.
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    # Parenthesised explicitly: `+` binds tighter than `|`, so without the
    # parentheses the space would only be consumed before the capitalised
    # spelling and "2 uhr" would be rejected. The bare "uhr" alternative
    # (no preceding space) is kept so that every previously accepted input
    # is still accepted.
    uhr = pynutil.delete("Uhr") | pynutil.delete("uhr")
    final_suffix = (pynutil.delete(" ") + uhr) | pynutil.delete("uhr")

    time_zone_graph = pynini.string_file(get_abs_path("data/time/time_zone.tsv"))

    labels_hour = [str(x) for x in range(0, 25)]
    labels_minute_single = [str(x) for x in range(1, 10)]
    labels_minute_double = [str(x) for x in range(10, 60)]

    # exactly two digits in; a leading "0" is removed
    delete_leading_zero_to_double_digit = (pynutil.delete("0") | (NEMO_DIGIT - "0")) + NEMO_DIGIT

    graph_hour = pynini.union(*labels_hour)
    graph_minute_single = pynini.union(*labels_minute_single)
    graph_minute_double = pynini.union(*labels_minute_double)

    final_graph_hour_only = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"")
    final_graph_hour = (
        pynutil.insert("hours: \"")
        + delete_leading_zero_to_double_digit @ graph_hour
        + pynutil.insert("\"")
    )
    # "0d" loses its leading zero; 10..59 pass through
    final_graph_minute = (
        pynutil.insert("minutes: \"")
        + ((pynutil.delete("0") + graph_minute_single) | graph_minute_double)
        + pynutil.insert("\"")
    )
    # seconds use the same value ranges as minutes
    final_graph_second = (
        pynutil.insert("seconds: \"")
        + ((pynutil.delete("0") + graph_minute_single) | graph_minute_double)
        + pynutil.insert("\"")
    )
    final_time_zone_optional = pynini.closure(
        pynini.accep(" ")
        + pynutil.insert("zone: \"")
        + convert_space(time_zone_graph)
        + pynutil.insert("\""),
        0,
        1,
    )

    # 02:30 Uhr
    graph_hm = (
        final_graph_hour
        + pynutil.delete(":")
        + (pynutil.delete("00") | (insert_space + final_graph_minute))
        + final_suffix
        + final_time_zone_optional
    )

    # 10:30:05 Uhr,
    graph_hms = (
        final_graph_hour
        + pynutil.delete(":")
        + (pynini.cross("00", " minutes: \"0\"") | (insert_space + final_graph_minute))
        + pynutil.delete(":")
        + (pynini.cross("00", " seconds: \"0\"") | (insert_space + final_graph_second))
        + final_suffix
        + final_time_zone_optional
        + pynutil.insert(" preserve_order: true")
    )

    # 2 Uhr est
    graph_h = final_graph_hour_only + final_suffix + final_time_zone_optional

    final_graph = (graph_hm | graph_h | graph_hms).optimize()
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """Build the "money" classifier and store it in ``self.fst``.

    In deterministic mode the grammar reads the currency symbol first and
    pairs "1" with the singular currency name and everything else with the
    plural one. In non-deterministic mode that grammar is only used as an
    intermediate step: a per-currency grammar is built for every row of the
    currency file and the union of those replaces the final graph.

    Args:
        cardinal: supplies ``graph``.
        decimal: supplies ``final_graph_wo_negative``, ``graph_integer`` and
            ``graph_fractional``.
        deterministic: selects between the two modes described above.
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)

    number = cardinal.graph
    decimal_number = decimal.final_graph_wo_negative

    currency_map = pynini.string_file(get_abs_path("data/currency/currency.tsv"))
    unit_plural = convert_space(currency_map @ SINGULAR_TO_PLURAL)
    unit_singular = convert_space(currency_map)

    singular_currency = pynutil.insert("currency: \"") + unit_singular + pynutil.insert("\"")
    plural_currency = pynutil.insert("currency: \"") + unit_plural + pynutil.insert("\"")

    # exactly "1" -> singular currency, integer_part "one"
    one_unit = (
        singular_currency
        + pynutil.insert(" integer_part: \"")
        + pynini.cross("1", "one")
        + pynutil.insert("\"")
    )

    graph_decimal = plural_currency + insert_space + decimal_number

    if deterministic:
        integer_reader = number
    else:
        integer_reader = get_hundreds_graph(deterministic) | number
    graph_integer = (
        plural_currency
        + pynutil.insert(" integer_part: \"")
        + ((NEMO_SIGMA - "1") @ integer_reader)
        + pynutil.insert("\"")
    )

    graph_decimal |= one_unit + insert_space + decimal_number
    graph_integer |= one_unit
    final_graph = graph_integer | graph_decimal

    if not deterministic:
        currencies = load_labels(get_abs_path("data/currency/currency.tsv"))
        zero_graph = pynini.cross("0", "") | pynini.accep("0")
        # add minor currency part only when there are two digits after the point
        # .01 -> {zero one cent, one cent}, .05 -> {oh five, five cents}
        two_digits_fractional_part = (
            NEMO_SIGMA
            + pynini.closure(NEMO_DIGIT)
            + (
                (pynini.accep(".") + (NEMO_DIGIT ** (2) | zero_graph + (NEMO_DIGIT - "0")))
                | pynutil.delete(".") + pynini.cross(pynini.closure("0", 1), "")
            )
        )

        # running unions over all currencies; None until the first row is processed
        integer_graph = None
        decimal_graph_with_minor = None
        decimal_graph_default = None

        for curr_symbol, curr_name in currencies:
            strip_symbol = pynutil.delete(curr_symbol)
            append_currency = pynutil.insert(" currency: \"" + curr_symbol + "\"")
            preserve_order = pynutil.insert(" preserve_order: True")
            integer_part = decimal.graph_integer + append_currency + preserve_order

            # "$4" -> 'integer_part: "four" currency: "$" preserve_order: True' -> four dollars
            integer_graph_curr = strip_symbol + integer_part
            # remove fractional part if it contains only zeros
            # "$4.00" -> 'integer_part: "four" currency: "$" preserve_order: True' -> four dollars
            integer_graph_curr |= pynini.compose(two_digits_fractional_part, integer_graph_curr)

            decimal_graph_with_minor_curr = (
                strip_symbol
                + pynini.closure(integer_part, 0, 1)
                + pynini.cross(".", " ")
                + decimal.graph_fractional
                + append_currency
            )

            # "$.5" -> 'fractional_part: "five" currency: "dollars"' -> point five dollars
            decimal_graph_default_curr = (
                pynutil.delete("currency: \"" + pynini.compose(curr_symbol, unit_plural) + "\"")
                + delete_space
                + pynini.accep("fractional_part")
                + NEMO_SIGMA
                + pynutil.insert(" currency: \"" + pynini.compose(curr_symbol, unit_plural) + "\"")
            )

            # "$4.5" -> 'integer_part: "four" fractional_part: "five" currency: "dollars"' -> "four point five dollars"
            decimal_graph_default_curr |= (
                pynutil.delete("currency: \"" + curr_name + pynini.closure(NEMO_NOT_QUOTE) + "\"")
                + delete_space
                + pynini.accep("integer_part")
                + NEMO_SIGMA
                + pynini.accep("fractional_part")
                + NEMO_SIGMA
                + pynutil.insert(" currency: \"" + pynini.compose(curr_symbol, unit_plural) + "\"")
            )

            # "£4 billion" -> 'integer_part: "four" quantity: "billion" currency: "pounds"' -> "four billion pounds"
            decimal_graph_default_curr |= (
                pynutil.delete("currency: \"")
                + pynutil.delete(
                    rewrite.rewrite_lattice(curr_symbol, pynini.compose(curr_symbol, unit_plural))
                    + "\" "
                )
                + pynini.difference(NEMO_SIGMA, "fractional_part")
                + pynutil.insert(" currency: \"" + pynini.compose(curr_symbol, unit_plural) + "\"")
            )

            decimal_graph_with_minor_curr = pynini.compose(
                two_digits_fractional_part, decimal_graph_with_minor_curr
            )
            # reorder the fields produced by the currency-first decimal grammar
            decimal_graph_default_curr = pynini.compose(graph_decimal, decimal_graph_default_curr)

            if integer_graph is None:
                integer_graph = integer_graph_curr
            else:
                integer_graph = pynini.union(integer_graph, integer_graph_curr)

            if decimal_graph_with_minor is None:
                decimal_graph_with_minor = decimal_graph_with_minor_curr
            else:
                decimal_graph_with_minor = pynini.union(
                    decimal_graph_with_minor, decimal_graph_with_minor_curr
                )

            if decimal_graph_default is None:
                decimal_graph_default = decimal_graph_default_curr
            else:
                decimal_graph_default = pynini.union(
                    decimal_graph_default, decimal_graph_default_curr
                )

        final_graph = decimal_graph_with_minor | decimal_graph_default | integer_graph

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = True, lm: bool = False):
    """
    Finite state transducer for classifying serial (handles only cases without delimiters,
    values with delimiters are handled by default).
    The serial is a combination of digits, letters and dashes, e.g.:
        c325b -> tokens { cardinal { integer: "c three two five b" } }

    Args:
        cardinal: tagger providing `graph`, `single_digits_graph` and
            `graph_hundred_component_at_least_one_none_zero_digit`
        ordinal: tagger whose `graph` inputs are excluded from the serial options
        deterministic: forwarded to the base class; the extra digit readings below
            are only added when `self.deterministic` is falsy
        lm: when True, the extra non-deterministic digit readings are skipped
    """
    super().__init__(name="integer", kind="classify", deterministic=deterministic)

    symbol_path = get_abs_path("data/whitelist/symbol.tsv")

    # 6+ digits go through single_digits_graph, 1-5 digits through the regular cardinal graph
    spoken = pynini.compose(NEMO_DIGIT ** (6, ...), cardinal.single_digits_graph).optimize()
    spoken |= pynini.compose(NEMO_DIGIT ** (1, 5), cardinal.graph).optimize()
    # to handle numbers starting with zero
    zero_led = pynini.accep("0") + pynini.closure(NEMO_DIGIT)
    spoken |= pynini.compose(zero_led, cardinal.single_digits_graph).optimize()
    # TODO: "#" doesn't work from the file
    spoken |= pynini.string_file(symbol_path).optimize() | pynini.cross("#", "hash")

    if not self.deterministic and not lm:
        spoken |= cardinal.single_digits_graph
        # also allow double digits to be pronounced as integer in serial number
        two_digit_reading = NEMO_DIGIT ** 2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
        spoken |= pynutil.add_weight(two_digit_reading, weight=0.0001)

    # add space between letter and digit/symbol
    symbol_chars = pynini.union(*(row[0] for row in load_labels(symbol_path)))
    digit_or_symbol = NEMO_DIGIT | symbol_chars
    alpha_or_symbol = NEMO_ALPHA | symbol_chars
    space_inserter = pynini.compose(
        pynini.cdrewrite(pynutil.insert(" "), alpha_or_symbol, digit_or_symbol, NEMO_SIGMA),
        pynini.cdrewrite(pynutil.insert(" "), digit_or_symbol, alpha_or_symbol, NEMO_SIGMA),
    )

    # serial graph with delimiter
    sep = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
    letters = pynini.closure(NEMO_ALPHA, 1)
    letters_then_number = letters + sep + spoken
    numbers_then_letters = pynini.closure(spoken + sep, 1) + letters

    keep_or_add_space = plurals._priority_union(pynini.accep(" "), pynutil.insert(" "), NEMO_SIGMA).optimize()
    continuation = pynini.closure(sep + (letters | spoken))
    continuation |= pynini.closure(sep + spoken + keep_or_add_space + letters)

    serial = letters_then_number + continuation
    serial |= numbers_then_letters + continuation
    # numbers only with 2+ delimiters
    serial |= spoken + sep + spoken + sep + spoken + pynini.closure(sep + spoken)
    # 2+ symbols
    serial |= pynini.compose(NEMO_SIGMA + symbol_chars + NEMO_SIGMA, spoken + sep + spoken)

    # exclude ordinal numbers from serial options
    not_an_ordinal = pynini.difference(NEMO_SIGMA, pynini.project(ordinal.graph, "input"))
    serial = pynini.compose(not_an_ordinal, serial).optimize()
    serial = pynutil.add_weight(serial, 0.0001)

    power_suffix = (pynini.cross("^2", " squared") | pynini.cross("^3", " cubed")).optimize()
    serial |= pynini.closure(NEMO_NOT_SPACE, 1) + power_suffix

    # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
    any_piece = serial | spoken | letters
    serial = pynini.closure(any_piece + sep) + serial + pynini.closure(sep + any_piece)

    # also accept the same inputs after spaces were inserted at letter/digit/symbol boundaries
    serial |= pynini.compose(space_inserter, serial.optimize()).optimize()

    # restrict inputs to at least two non-space characters
    serial = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2), serial).optimize()

    self.graph = serial.optimize()
    tagged = pynutil.insert('name: "') + convert_space(self.graph).optimize() + pynutil.insert('"')
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
    """
    Build the measure classifier: a number (cardinal, decimal or fraction) followed by a unit,
    plus dash-joined, "x"-suffixed, address and simple math-expression variants.

    Args:
        cardinal: tagger providing `graph` (and `range_graph` when not deterministic)
        decimal: tagger providing `final_graph_wo_negative`
        fraction: tagger providing `graph`
        deterministic: when False, `cardinal.range_graph` is added to the number graph
    """
    super().__init__(name="measure", kind="classify", deterministic=deterministic)

    def _units(inner):
        # wrap a unit graph into the `units: "..."` field
        return pynutil.insert('units: "') + inner + pynutil.insert('"')

    number = cardinal.graph
    if not deterministic:
        number |= cardinal.range_graph

    unit_map = pynini.string_file(get_abs_path("data/measurements.tsv"))
    # additionally accept inputs that TO_LOWER rewrites into a spelling listed in the file
    unit_map |= pynini.compose(pynini.closure(TO_LOWER, 1) + pynini.closure(NEMO_ALPHA), unit_map)
    plural_unit = convert_space(unit_map @ SINGULAR_TO_PLURAL)
    singular_unit = convert_space(unit_map)

    minus_sign = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", '"true" '), 0, 1)

    # "/unit" -> "per unit"
    nbsp = pynutil.insert(NEMO_NON_BREAKING_SPACE)
    per_unit = pynini.cross("/", "per") + delete_space + nbsp + singular_unit
    optional_per_unit = pynini.closure(delete_space + nbsp + per_unit, 0, 1)

    units_plural = _units((plural_unit + optional_per_unit) | per_unit)
    units_singular = _units((singular_unit + optional_per_unit) | per_unit)

    decimal_measure = (
        pynutil.insert("decimal { ")
        + minus_sign
        + decimal.final_graph_wo_negative
        + delete_space
        + pynutil.insert(" } ")
        + units_plural
    )

    # every cardinal except "1" takes the plural unit, "1" takes the singular unit
    cardinal_head = pynutil.insert("cardinal { ") + minus_sign + pynutil.insert('integer: "')
    cardinal_tail = delete_space + pynutil.insert('"') + pynutil.insert(" } ")
    cardinal_measure = cardinal_head + ((NEMO_SIGMA - "1") @ number) + cardinal_tail + units_plural
    cardinal_measure |= cardinal_head + pynini.cross("1", "one") + cardinal_tail + units_singular

    letters = pynini.closure(NEMO_ALPHA, 1)

    cardinal_dash_alpha = (
        pynutil.insert('cardinal { integer: "')
        + number
        + pynini.accep("-")
        + pynutil.insert('" } units: "')
        + letters
        + pynutil.insert('"')
    )
    alpha_dash_cardinal = (
        _units(letters + pynini.accep("-"))
        + pynutil.insert(' cardinal { integer: "')
        + number
        + pynutil.insert('" } preserve_order: true')
    )
    decimal_dash_alpha = (
        pynutil.insert("decimal { ")
        + decimal.final_graph_wo_negative
        + pynini.cross("-", "")
        + pynutil.insert(' } units: "')
        + letters
        + pynutil.insert('"')
    )
    decimal_times = (
        pynutil.insert("decimal { ")
        + decimal.final_graph_wo_negative
        + pynutil.insert(' } units: "')
        + pynini.cross(pynini.union("x", "X"), "x")
        + pynutil.insert('"')
    )
    alpha_dash_decimal = (
        _units(letters + pynini.accep("-"))
        + pynutil.insert(" decimal { ")
        + decimal.final_graph_wo_negative
        + pynutil.insert(" } preserve_order: true")
    )

    fraction_measure = (
        pynutil.insert("fraction { ") + fraction.graph + delete_space + pynutil.insert(" } ") + units_plural
    )

    address = (
        pynutil.insert('units: "address" cardinal { integer: "')
        + self.get_address_graph(cardinal)
        + pynutil.insert('" } preserve_order: true')
    )

    # "<number> <operation> <number> = <number>", spaces optional in the input
    operations = pynini.string_file(get_abs_path("data/math_operations.tsv"))
    gap = pynini.accep(" ") | pynutil.insert(" ")
    equation = number + gap + operations + gap + number + gap + pynini.cross("=", "equals") + gap + number
    math = (
        pynutil.insert('units: "math" cardinal { integer: "')
        + equation
        + pynutil.insert('" } preserve_order: true')
    )

    final_graph = (
        decimal_measure
        | cardinal_measure
        | cardinal_dash_alpha
        | alpha_dash_cardinal
        | decimal_dash_alpha
        | decimal_times
        | alpha_dash_decimal
        | fraction_measure
        | address
        | math
    )
    self.fst = self.add_tokens(final_graph).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
    """
    Build the measure classifier for the variant whose "/" denominator is spoken as "pro".

    Number-plus-unit inputs are tagged through the `fst` graphs of the given taggers; the
    dash-joined and "x"-suffixed variants are assembled by hand. Every alternative gets
    ` preserve_order: true` appended at the end.

    Args:
        cardinal: tagger providing `graph` and `fst`
        decimal: tagger providing `fst` and `final_graph_wo_negative`
        fraction: tagger providing `fst`
        deterministic: forwarded to the base class
    """
    super().__init__(name="measure", kind="classify", deterministic=deterministic)

    def _units(inner):
        # wrap a unit graph into the `units: "..."` field
        return pynutil.insert('units: "') + inner + pynutil.insert('"')

    number = cardinal.graph

    singular_unit = convert_space(unit_singular)
    # plural form: apply the `suppletive` rewrite at the end of the singular unit
    plural_unit = singular_unit @ pynini.cdrewrite(convert_space(suppletive), "", "[EOS]", NEMO_SIGMA)

    maybe_minus = pynini.closure("-", 0, 1)

    # "/unit" -> "pro unit"
    nbsp = pynutil.insert(NEMO_NON_BREAKING_SPACE)
    denominator = pynini.cross("/", "pro") + nbsp + singular_unit
    optional_denominator = pynini.closure(nbsp + denominator, 0, 1)

    units_plural = _units((plural_unit + optional_denominator) | denominator)
    units_singular = _units((singular_unit + optional_denominator) | denominator)

    # exactly one output space between the number token and the unit; an input space is optional
    def _gap():
        return insert_space + pynini.closure(pynutil.delete(" "), 0, 1)

    decimal_measure = decimal.fst + _gap() + units_plural

    not_one = (maybe_minus + (pynini.closure(NEMO_DIGIT) - "1")) @ cardinal.fst
    cardinal_measure = not_one + _gap() + units_plural

    # "1" takes the singular unit and "eins" is rewritten to "ein"
    one = (
        (maybe_minus + pynini.accep("1"))
        @ cardinal.fst
        @ pynini.cdrewrite(pynini.cross("eins", "ein"), "", "", NEMO_SIGMA)
    )
    cardinal_measure |= one + _gap() + units_singular

    fraction_measure = fraction.fst + _gap() + units_plural

    letters = pynini.closure(NEMO_ALPHA, 1)
    times_sign = pynini.union("x", "X")

    cardinal_dash_alpha = (
        pynutil.insert('cardinal { integer: "')
        + number
        + pynutil.delete("-")
        + pynutil.insert('" } units: "')
        + letters
        + pynutil.insert('"')
    )
    alpha_dash_cardinal = (
        _units(letters + pynutil.delete("-"))
        + pynutil.insert(' cardinal { integer: "')
        + number
        + pynutil.insert('" }')
    )
    decimal_dash_alpha = (
        pynutil.insert("decimal { ")
        + decimal.final_graph_wo_negative
        + pynutil.delete("-")
        + pynutil.insert(' } units: "')
        + letters
        + pynutil.insert('"')
    )
    decimal_times = (
        pynutil.insert("decimal { ")
        + decimal.final_graph_wo_negative
        + pynutil.insert(' } units: "')
        + times_sign
        + pynutil.insert('"')
    )
    cardinal_times = (
        pynutil.insert('cardinal { integer: "')
        + number
        + pynutil.insert('" } units: "')
        + times_sign
        + pynutil.insert('"')
    )
    alpha_dash_decimal = (
        _units(letters + pynutil.delete("-"))
        + pynutil.insert(" decimal { ")
        + decimal.final_graph_wo_negative
        + pynutil.insert(" }")
    )

    final_graph = (
        decimal_measure
        | cardinal_measure
        | cardinal_dash_alpha
        | alpha_dash_cardinal
        | decimal_dash_alpha
        | decimal_times
        | alpha_dash_decimal
        | fraction_measure
        | cardinal_times
    )
    final_graph += pynutil.insert(" preserve_order: true")
    self.fst = self.add_tokens(final_graph).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
    """
    Build the measure classifier for the variant whose "/" denominator is spoken as "por".

    Number-plus-unit inputs are tagged through the `fst` graphs of the given taggers; the
    dash-joined and "x"-suffixed variants are assembled by hand. Every alternative gets
    ` preserve_order: true` appended at the end.

    Args:
        cardinal: tagger providing `graph` and `fst`
        decimal: tagger providing `fst` and `final_graph_wo_negative`
        fraction: tagger providing `fst`
        deterministic: forwarded to the base class
    """
    super().__init__(name="measure", kind="classify", deterministic=deterministic)

    def _units(inner):
        # wrap a unit graph into the `units: "..."` field
        return pynutil.insert('units: "') + inner + pynutil.insert('"')

    number = cardinal.graph

    singular_unit = convert_space(unit)
    plural_unit = convert_space(unit @ (unit_plural_fem | unit_plural_masc))

    maybe_minus = pynini.closure("-", 0, 1)

    # "/unit" -> "por unit"
    nbsp = pynutil.insert(NEMO_NON_BREAKING_SPACE)
    denominator = pynini.cross("/", "por") + nbsp + singular_unit
    optional_denominator = pynini.closure(nbsp + denominator, 0, 1)

    units_plural = _units((plural_unit + optional_denominator) | denominator)
    units_singular = _units((singular_unit + optional_denominator) | denominator)

    # the decimal branch accepts (keeps) an optional input space, the others delete it
    decimal_measure = decimal.fst + insert_space + pynini.closure(NEMO_SPACE, 0, 1) + units_plural

    def _gap():
        return insert_space + pynini.closure(delete_space, 0, 1)

    # everything except "1" takes the plural unit, "1" takes the singular unit
    cardinal_measure = (maybe_minus + (NEMO_SIGMA - "1")) @ cardinal.fst + _gap() + units_plural
    cardinal_measure |= (maybe_minus + pynini.accep("1")) @ cardinal.fst + _gap() + units_singular

    # fractions are paired with the singular unit
    fraction_measure = fraction.fst + _gap() + units_singular

    letters = pynini.closure(NEMO_ALPHA, 1)
    times_sign = pynini.union("x", "X")

    decimal_times = (
        pynutil.insert("decimal { ")
        + decimal.final_graph_wo_negative
        + pynutil.insert(' } units: "')
        + times_sign
        + pynutil.insert('"')
    )
    cardinal_times = (
        pynutil.insert('cardinal { integer: "')
        + strip_cardinal_apocope(number)
        + pynutil.insert('" } units: "')
        + times_sign
        + pynutil.insert('"')
    )
    cardinal_dash_alpha = (
        pynutil.insert('cardinal { integer: "')
        + strip_cardinal_apocope(number)
        + pynutil.delete("-")
        + pynutil.insert('" } units: "')
        + letters
        + pynutil.insert('"')
    )
    decimal_dash_alpha = (
        pynutil.insert("decimal { ")
        + decimal.final_graph_wo_negative
        + pynutil.delete("-")
        + pynutil.insert(' } units: "')
        + letters
        + pynutil.insert('"')
    )
    # NOTE(review): the two alpha-dash branches already insert "preserve_order: true" and the
    # final graph appends it once more below, so these paths emit the field twice -
    # confirm whether the duplicate is intended.
    alpha_dash_cardinal = (
        _units(letters + pynutil.delete("-"))
        + pynutil.insert(' cardinal { integer: "')
        + number
        + pynutil.insert('" } preserve_order: true')
    )
    alpha_dash_decimal = (
        _units(letters + pynutil.delete("-"))
        + pynutil.insert(" decimal { ")
        + decimal.final_graph_wo_negative
        + pynutil.insert(" } preserve_order: true")
    )

    final_graph = (
        decimal_measure
        | cardinal_measure
        | cardinal_dash_alpha
        | alpha_dash_cardinal
        | decimal_dash_alpha
        | fraction_measure
        | decimal_times
        | cardinal_times
        | alpha_dash_decimal
    )
    final_graph += pynutil.insert(" preserve_order: true")
    self.fst = self.add_tokens(final_graph).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """
    Build the money classifier for spoken input such as "twelve dollars (and) fifty cents".

    Emitted fields: integer_part, fractional_part, currency (the decimal branch reuses
    whatever `decimal.final_graph_wo_negative` emits).

    Args:
        cardinal: tagger providing `graph_no_exception`
        decimal: tagger providing `final_graph_wo_negative`
    """
    super().__init__(name="money", kind="classify")

    def _currency(names):
        # wrap a currency-name graph into the `currency: "..."` field
        return pynutil.insert('currency: "') + convert_space(names) + pynutil.insert('"')

    def _fractional(inner):
        # wrap a cents graph into the `fractional_part: "..."` field
        return pynutil.insert('fractional_part: "') + inner + pynutil.insert('"')

    number = cardinal.graph_no_exception

    currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
    singular_names = pynini.invert(currency_table)
    # NOTE(review): the plural graph is produced by a helper called get_singulars - confirm naming
    plural_names = get_singulars(singular_names)
    currency_singular = _currency(singular_names)
    currency_plural = _currency(plural_names)

    # two digits pass through unchanged, a single digit gets a leading "0"
    two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

    # twelve dollars (and) fifty cents, zero cents
    cents = _fractional(
        pynini.union(
            pynutil.add_weight(((NEMO_SIGMA - "one") @ number), -0.7) @ two_digits
            + delete_space
            + pynutil.delete("cents"),
            pynini.cross("one", "01") + delete_space + pynutil.delete("cent"),
        )
    )
    optional_and = pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
    optional_cents = pynini.closure(delete_space + optional_and + insert_space + cents, 0, 1)

    # twelve dollars fifty, only after integer
    optional_bare_cents = pynini.closure(
        delete_extra_space + _fractional(pynutil.add_weight(number @ two_digits, -0.7)),
        0,
        1,
    )

    def _integer_part(inner):
        return pynutil.insert('integer_part: "') + inner + pynutil.insert('"')

    # anything but "one" takes the plural currency name, "one" takes the singular
    integer_money = (
        _integer_part((NEMO_SIGMA - "one") @ number)
        + delete_extra_space
        + currency_plural
        + (optional_cents | optional_bare_cents)
    )
    integer_money |= (
        _integer_part(pynini.cross("one", "1"))
        + delete_extra_space
        + currency_singular
        + (optional_cents | optional_bare_cents)
    )

    decimal_money = decimal.final_graph_wo_negative + delete_extra_space + currency_plural
    # cents on their own are tagged with a "$" currency and a zero integer part
    decimal_money |= pynutil.insert('currency: "$" integer_part: "0" ') + cents

    self.fst = self.add_tokens(integer_money | decimal_money).optimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """
    Build the whitelist classifier from the bundled label files and, optionally, a user file.

    Args:
        input_case: when equal to "lower_cased", the written side of every label is lower-cased
        deterministic: selects between the "Saint" + names rule (True) and the alternative /
            multi-format / unit whitelists (False)
        input_file: optional label file; in deterministic mode its graph replaces everything
            built here, otherwise it is added as another alternative
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file, keep_punct_add_end: bool = False):
        # load [written, spoken] pairs, optionally lower-casing the written side
        fold = input_case == "lower_cased"
        entries = [[written.lower() if fold else written, spoken] for written, spoken in load_labels(file)]
        if keep_punct_add_end:
            entries.extend(augment_labels_with_punct_at_end(entries))
        return pynini.string_map(entries)

    graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist/tts.tsv"))
    graph |= _get_whitelist_graph(input_case, get_abs_path("data/whitelist/symbol.tsv"))

    if not deterministic:
        graph |= _get_whitelist_graph(
            input_case, get_abs_path("data/whitelist/alternatives.tsv"), keep_punct_add_end=True
        )
    else:
        # "St"/"st"/"ST" (dots removed) followed by a space and a known name -> "Saint <name>"
        saint = pynini.cross(pynini.union("st", "St", "ST"), "Saint")
        graph |= saint + pynini.closure(pynutil.delete(".")) + pynini.accep(" ") + get_names()

    # upper-case letters separated by "." or ". ": the separators are dropped
    for separator in (".", ". "):
        graph |= (
            NEMO_UPPER
            + pynini.closure(pynutil.delete(separator) + NEMO_UPPER, 2)
            + pynini.closure(pynutil.delete("."), 0, 1)
        )

    if not deterministic:
        graph |= get_formats(get_abs_path("data/whitelist/alternatives_all_format.tsv"))

        unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) | pynini.string_file(
            get_abs_path("data/measure/unit_alternatives.tsv")
        )
        unit_graph_plural = unit_graph @ SINGULAR_TO_PLURAL
        # only inputs of at least three characters
        graph |= pynini.compose(NEMO_CHAR ** (3, ...), convert_space(unit_graph | unit_graph_plural))

    # convert to states only if comma is present before the abbreviation to avoid converting all caps words,
    # e.g. "IN", "OH", "OK"
    # TODO or only exclude above?
    states = load_labels(get_abs_path("data/address/state.tsv"))
    lower_cased = input_case == "lower_cased"
    dotted_variants = []
    for expansion, abbr in states:
        if lower_cased:
            expansion = expansion.lower()
        # same entry with a dot after the first character of the second column
        dotted = f"{abbr[0]}.{abbr[1:]}"
        dotted_variants.append((expansion, dotted))
        if not deterministic:
            dotted_variants.append((expansion, f"{dotted}."))
    states.extend(dotted_variants)

    state_graph = pynini.string_map(states)
    graph |= pynini.closure(NEMO_NOT_SPACE, 1) + pynini.union(", ", ",") + pynini.invert(state_graph).optimize()

    if input_file:
        user_graph = _get_whitelist_graph(input_case, input_file)
        if deterministic:
            graph = user_graph
        else:
            graph |= user_graph

    self.graph = convert_space(graph).optimize()
    self.fst = (pynutil.insert('name: "') + self.graph + pynutil.insert('"')).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """
    Build the money classifier for written input that starts with a currency symbol.

    Three families of graphs are combined:
      * integer amounts (optionally followed by ".0..." which is deleted) and default decimals,
        tagged as `currency_maj` + `integer_part` / decimal fields;
      * per-symbol "major + minor" graphs, tagged with `integer_part`, `currency_maj`,
        `fractional_part`, `currency_min` and ` preserve_order: true`;
      * when not deterministic, reordered integer and default-decimal graphs.

    `maj_singular`, `min_singular`, `min_plural`, `SINGULAR_TO_PLURAL`, `insert_space` and
    `convert_space` are defined outside this method.

    Args:
        cardinal: tagger providing `graph_with_and` and
            `graph_hundred_component_at_least_one_none_zero_digit`
        decimal: tagger providing `final_graph_wo_negative_w_abbr`
        deterministic: when False, additional weighted / reordered alternatives are added
    """
    super().__init__(name="money",
                     kind="classify",
                     deterministic=deterministic)
    cardinal_graph = cardinal.graph_with_and
    graph_decimal_final = decimal.final_graph_wo_negative_w_abbr
    # (symbol, name) rows; only the symbol column is used in the loop below
    maj_singular_labels = load_labels(
        get_abs_path("data/money/currency_major.tsv"))
    maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
    maj_unit_singular = convert_space(maj_singular)
    graph_maj_singular = pynutil.insert(
        "currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
    graph_maj_plural = pynutil.insert(
        "currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")
    # optionally drop a "." followed by one or more zeros
    optional_delete_fractional_zeros = pynini.closure(
        pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1)
    graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross(
        "1", "one") + pynutil.insert("\"")
    # only for decimals where third decimal after comma is non-zero or with quantity
    decimal_delete_last_zeros = (
        pynini.closure(NEMO_DIGIT | pynutil.delete(",")) +
        pynini.accep(".") + pynini.closure(NEMO_DIGIT, 2) +
        (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0")))
    # any input ending in a letter
    decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA
    graph_decimal = (graph_maj_plural + insert_space +
                     (decimal_delete_last_zeros
                      | decimal_with_quantity) @ graph_decimal_final)
    # every integer except "1" (which is handled by graph_integer_one)
    graph_integer = (pynutil.insert("integer_part: \"") +
                     ((NEMO_SIGMA - "1") @ cardinal_graph) +
                     pynutil.insert("\""))
    graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
    graph_integer_only |= graph_maj_plural + insert_space + graph_integer
    final_graph = (graph_integer_only +
                   optional_delete_fractional_zeros) | graph_decimal
    # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
    # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
    # not accepted: 002, 00, 0,
    # NOTE(review): the first alternative of the second stage deletes a leading "0"
    # (pynutil.delete("0") + non-zero digit) - confirm the "0200->02" / "01 -> 01" examples above.
    two_digits_fractional_part = (
        pynini.closure(NEMO_DIGIT) +
        (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0"))) @ (
            (pynutil.delete("0") + (NEMO_DIGIT - "0"))
            | ((NEMO_DIGIT - "0") + pynutil.insert("0"))
            | ((NEMO_DIGIT - "0") + NEMO_DIGIT))
    graph_min_singular = pynutil.insert(
        " currency_min: \"") + min_singular + pynutil.insert("\"")
    graph_min_plural = pynutil.insert(
        " currency_min: \"") + min_plural + pynutil.insert("\"")
    # format ** dollars ** cent
    # accumulators: unions over all currency symbols, built up inside the loop
    decimal_graph_with_minor = None
    integer_graph_reordered = None
    decimal_default_reordered = None
    for curr_symbol, _ in maj_singular_labels:
        preserve_order = pynutil.insert(" preserve_order: true")
        # integer part followed by the major currency name looked up from the symbol;
        # note that `@` binds tighter than `+`, so only the inserted symbol is composed
        integer_plus_maj = graph_integer + insert_space + pynutil.insert(
            curr_symbol) @ graph_maj_plural
        integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(
            curr_symbol) @ graph_maj_singular
        # variant that starts with a non-zero digit and drops "," separators
        integer_plus_maj_with_comma = pynini.compose(
            NEMO_DIGIT - "0" +
            pynini.closure(NEMO_DIGIT | pynutil.delete(",")),
            integer_plus_maj)
        # plain digit strings, excluding the single string "0"
        integer_plus_maj = pynini.compose(
            pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
        integer_plus_maj |= integer_plus_maj_with_comma
        # fractional part that normalises to "1" takes the singular minor unit
        graph_fractional_one = two_digits_fractional_part @ pynini.cross(
            "1", "one")
        graph_fractional_one = pynutil.insert(
            "fractional_part: \"") + graph_fractional_one + pynutil.insert(
                "\"")
        # every other one- or two-digit fractional part takes the plural minor unit
        graph_fractional = (
            two_digits_fractional_part @ (
                pynini.closure(NEMO_DIGIT, 1, 2) - "1"
            ) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit)
        graph_fractional = pynutil.insert(
            "fractional_part: \"") + graph_fractional + pynutil.insert(
                "\"")
        fractional_plus_min = graph_fractional + insert_space + pynutil.insert(
            curr_symbol) @ graph_min_plural
        fractional_plus_min |= (
            graph_fractional_one + insert_space +
            pynutil.insert(curr_symbol) @ graph_min_singular)
        # "<integer>.<fraction>": the "." becomes the space between the two halves
        decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(
            ".", " ") + fractional_plus_min
        if not deterministic:
            # weighted alternative: fractional part without a minor currency name
            decimal_graph_with_minor_curr |= pynutil.add_weight(
                integer_plus_maj + pynini.cross(".", " ") +
                pynutil.insert("fractional_part: \"") +
                two_digits_fractional_part @ cardinal.
                graph_hundred_component_at_least_one_none_zero_digit +
                pynutil.insert("\""),
                weight=0.0001,
            )
            # only consumed by the non-deterministic reordered graph further down
            default_fraction_graph = (
                decimal_delete_last_zeros
                | decimal_with_quantity) @ graph_decimal_final
        # fraction only (".xx" or "0.xx"): the optional "0" and the "." are deleted
        decimal_graph_with_minor_curr |= (
            pynini.closure(pynutil.delete("0"), 0, 1) +
            pynutil.delete(".") + fractional_plus_min)
        # consume the leading currency symbol from the input
        decimal_graph_with_minor_curr = (pynutil.delete(curr_symbol) +
                                         decimal_graph_with_minor_curr +
                                         preserve_order)
        decimal_graph_with_minor = (
            decimal_graph_with_minor_curr
            if decimal_graph_with_minor is None else pynini.union(
                decimal_graph_with_minor,
                decimal_graph_with_minor_curr).optimize())
        if not deterministic:
            integer_graph_reordered_curr = (pynutil.delete(curr_symbol) +
                                            integer_plus_maj +
                                            preserve_order).optimize()
            integer_graph_reordered = (
                integer_graph_reordered_curr
                if integer_graph_reordered is None else pynini.union(
                    integer_graph_reordered,
                    integer_graph_reordered_curr).optimize())
            decimal_default_reordered_curr = (
                pynutil.delete(curr_symbol) + default_fraction_graph +
                insert_space +
                pynutil.insert(curr_symbol) @ graph_maj_plural)
            decimal_default_reordered = (
                decimal_default_reordered_curr
                if decimal_default_reordered is None else pynini.union(
                    decimal_default_reordered,
                    decimal_default_reordered_curr)).optimize()
    # weight for SH
    # NOTE(review): decimal_graph_with_minor is still None here if maj_singular_labels is empty
    final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.0001)
    if not deterministic:
        final_graph |= integer_graph_reordered | decimal_default_reordered
        # to handle "$2.00" cases
        final_graph |= pynini.compose(
            NEMO_SIGMA + pynutil.delete(".") +
            pynini.closure(pynutil.delete("0"), 1),
            integer_graph_reordered)
    final_graph = self.add_tokens(final_graph.optimize())
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """
    Build the time classifier for written times such as "2:30 pm", "10:30:05 pm",
    "2.30 pm" and "2 pm est".

    Emitted fields: hours, minutes, seconds, suffix, zone.

    Args:
        cardinal: tagger providing `graph`, used to verbalize the hour/minute/second numbers
        deterministic: forwarded to the base class
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    def _field(name, inner):
        # wrap a graph into `<name>: "..."`
        return pynutil.insert(f'{name}: "') + inner + pynutil.insert('"')

    suffixes = load_labels(get_abs_path("data/time/suffix.tsv"))
    suffixes.extend(augment_labels_with_punct_at_end(suffixes))
    suffix_graph = pynini.string_map(suffixes)
    zone_graph = pynini.string_file(get_abs_path("data/time/zone.tsv"))

    # only used for < 1000 thousand -> 0 weight
    number = cardinal.graph

    hour_labels = pynini.union(*map(str, range(24)))
    single_minute_labels = pynini.union(*map(str, range(1, 10)))
    double_minute_labels = pynini.union(*map(str, range(10, 60)))

    # two digits pass through; a single digit may be preceded by a "0" that is dropped
    normalize_hour = (NEMO_DIGIT + NEMO_DIGIT) | (pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT)

    hour = normalize_hour @ hour_labels @ number
    minute_1_9 = single_minute_labels @ number
    minute_10_59 = double_minute_labels @ number

    hours_field = _field("hours", hour)

    def _zero_padded_or_double():
        # "0d" is read as "o <d>", 10-59 as a regular number
        return (pynini.cross("0", "o") + insert_space + minute_1_9) | minute_10_59

    minutes_field = _field("minutes", _zero_padded_or_double())
    seconds_field = _field("seconds", _zero_padded_or_double())

    suffix_field = _field("suffix", convert_space(suffix_graph))
    optional_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
    optional_zone = pynini.closure(
        delete_space + insert_space + _field("zone", convert_space(zone_graph)),
        0,
        1,
    )

    # 2:30 pm, 02:30, 2:00
    hm_colon = (
        hours_field
        + pynutil.delete(":")
        + (pynutil.delete("00") | (insert_space + minutes_field))
        + optional_suffix
        + optional_zone
    )

    # 10:30:05 pm,
    hms_colon = (
        hours_field
        + pynutil.delete(":")
        + (pynini.cross("00", ' minutes: "zero"') | (insert_space + minutes_field))
        + pynutil.delete(":")
        + (pynini.cross("00", ' seconds: "zero"') | (insert_space + seconds_field))
        + optional_suffix
        + optional_zone
    )

    # 2.xx pm/am (the suffix is mandatory here)
    hm_dot = (
        hours_field
        + pynutil.delete(".")
        + (pynutil.delete("00") | (insert_space + minutes_field))
        + delete_space
        + insert_space
        + suffix_field
        + optional_zone
    )

    # 2 pm est
    hour_only = hours_field + delete_space + insert_space + suffix_field + optional_zone

    final_graph = (hm_colon | hour_only | hm_dot | hms_colon).optimize()
    self.fst = self.add_tokens(final_graph).optimize()
def __init__(self):
    """
    Build the time classifier for spoken times such as "two o eight", "two thirty five pm",
    "quarter past four" and "quarter to five".

    Emitted fields: hours, minutes, suffix, zone.
    """
    super().__init__(name="time", kind="classify")
    # hours, minutes, seconds, suffix, zone, style, speak_period

    def _field(name, inner):
        # wrap a graph into `<name>: "..."`
        return pynutil.insert(f'{name}: "') + inner + pynutil.insert('"')

    suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
    zone_graph = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
    hour_to_graph = pynini.string_file(get_abs_path("data/time/time_to.tsv"))

    # only used for < 1000 thousand -> 0 weight
    number = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

    hour_words = pynini.union(*(num_to_word(h) for h in range(0, 24)))
    minute_words_1_9 = pynini.union(*(num_to_word(m) for m in range(1, 10)))
    minute_words_10_59 = pynini.union(*(num_to_word(m) for m in range(10, 60)))

    hour = hour_words @ number
    minute_1_9 = minute_words_1_9 @ number
    minute_10_59 = minute_words_10_59 @ number
    minute_named = pynini.cross("half", "30") | pynini.cross("quarter", "15")

    oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

    hours_field = _field("hours", hour)

    # "o'clock" -> "00" | "o <1-9>" | <10-59>
    minute = (
        (oclock + pynutil.insert("00"))
        | (pynutil.delete("o") + delete_space + minute_1_9)
        | minute_10_59
    )

    suffix_field = _field("suffix", convert_space(suffix_graph))
    optional_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
    optional_zone = pynini.closure(
        delete_space + insert_space + _field("zone", convert_space(zone_graph)),
        0,
        1,
    )

    # five o' clock
    # two o eight, two thirty five (am/pm)
    # two pm/am
    hour_then_minute = hours_field + delete_extra_space + _field("minutes", minute)

    # 10 past four, quarter past four, half past four
    minute_past_hour = (
        _field("minutes", pynini.union(minute_1_9, minute_10_59, minute_named))
        + delete_space
        + pynutil.delete("past")
        + delete_extra_space
        + hours_field
    )

    # "quarter to/till <hour>": minutes become 45 and the hour is mapped through time_to.tsv
    quarter_to_hour = (
        _field("minutes", pynini.cross("quarter", "45"))
        + delete_space
        + pynutil.delete(pynini.union("to", "till"))
        + delete_extra_space
        + _field("hours", hour_to_graph)
    )

    # hour with optional minutes ("00" inserted when absent) and a mandatory suffix
    hour_with_suffix = (
        hours_field
        + delete_extra_space
        + _field("minutes", pynutil.insert("00") | minute)
        + delete_space
        + insert_space
        + suffix_field
        + optional_zone
    )

    final_graph = (hour_then_minute | minute_past_hour | quarter_to_hour) + optional_suffix + optional_zone
    final_graph |= hour_with_suffix

    self.fst = self.add_tokens(final_graph).optimize()
def __init__(self, cardinal_tagger: GraphFst, deterministic: bool = True):
    """
    Build the time verbalizer: turns `hours: "..."`, `minutes: "..."`, `seconds: "..."` and
    `zone: "..."` fields into words ("... uhr ...", "... nach ...", "halb ...", "... vor ...").

    Args:
        cardinal_tagger: tagger providing `two_digit_non_zero`, used to verbalize the numbers
        deterministic: forwarded to the base class
    """
    super().__init__(name="time", kind="verbalize", deterministic=deterministic)

    def _inverted_file(relative_path):
        # load a tsv mapping and swap its input and output sides
        return pynini.invert(pynini.string_file(get_abs_path(relative_path))).optimize()

    def _digits_field(name):
        # strip `<name>: "` and the closing quote, keeping the digits in between
        return pynutil.delete(f'{name}: "') + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete('"')

    # add weight so when using inverse text normalization this conversion is deprioritized
    night_to_early = pynutil.add_weight(_inverted_file("data/time/hour_to_night.tsv"), weight=0.0001)
    hour_to = _inverted_file("data/time/hour_to.tsv")
    minute_to = _inverted_file("data/time/minute_to.tsv")

    zone_names = pynini.union(*[row[1] for row in load_labels(get_abs_path("data/time/time_zone.tsv"))])
    time_zone_graph = pynini.invert(convert_space(zone_names))

    number_words = _inverted_file("data/numbers/zero.tsv") | cardinal_tagger.two_digit_non_zero

    hour = _digits_field("hours")
    minute = _digits_field("minutes")
    second = _digits_field("seconds")

    # "<hour> uhr", with a whole-string "eins" rewritten to "ein"
    eins_to_ein = pynini.cdrewrite(pynini.cross("eins", "ein"), "[BOS]", "[EOS]", NEMO_SIGMA)
    hour_uhr = ((hour @ number_words) @ eins_to_ein) + pynutil.insert(" uhr")

    zone = pynutil.delete('zone: "') + time_zone_graph + pynutil.delete('"')
    optional_zone = pynini.closure(pynini.accep(" ") + zone, 0, 1)

    space = pynini.accep(" ")

    # "<hour> uhr <m> minuten <s> sekunden", fixing the singular forms afterwards
    graph_hms = (
        hour_uhr
        + space
        + (minute @ number_words)
        + pynutil.insert(" minuten")
        + space
        + (second @ number_words)
        + pynutil.insert(" sekunden")
        + optional_zone
    )
    singular_fix = pynini.cdrewrite(
        pynini.cross("eins minuten", "eine minute") | pynini.cross("eins sekunden", "eine sekunde"),
        pynini.union(" ", "[BOS]"),
        "",
        NEMO_SIGMA,
    )
    graph_hms = graph_hms @ singular_fix

    minutes_1_30 = pynini.union(*[str(m) for m in range(1, 31)])
    minutes_1_29 = pynini.union(*[str(m) for m in range(1, 30)])

    graph_h = hour_uhr
    graph_hm = hour_uhr + space + (minute @ number_words)

    # hour after the whole-string night_to_early rewrite
    early_hour = hour @ pynini.cdrewrite(night_to_early, "[BOS]", "[EOS]", NEMO_SIGMA)
    number_or_viertel = number_words | pynini.cross("15", "viertel")

    # "<m> nach <hour>" for minutes 1-30
    graph_m_past_h = (
        ((minute @ minutes_1_30) @ number_or_viertel)
        + space
        + pynutil.insert("nach ")
        + (early_hour @ number_words)
    )
    # "halb <hour mapped through hour_to>" for minute 30
    graph_m30_h = (
        (minute @ pynini.cross("30", "halb"))
        + space
        + ((early_hour @ hour_to) @ number_words)
    )
    # "<minutes mapped through minute_to> vor <hour mapped through hour_to>"
    graph_m_to_h = (
        (((minute @ minute_to) @ minutes_1_29) @ number_or_viertel)
        + space
        + pynutil.insert("vor ")
        + ((early_hour @ hour_to) @ number_words)
    )

    # the relative forms carry a small weight; the plain readings above carry none
    alternatives = (
        graph_hms
        | graph_h
        | graph_hm
        | pynutil.add_weight(graph_m_past_h, weight=0.0001)
        | pynutil.add_weight(graph_m30_h, weight=0.0001)
        | pynutil.add_weight(graph_m_to_h, weight=0.0001)
    )
    self.graph = alternatives + optional_zone

    self.fst = self.delete_tokens(self.graph + delete_preserve_order).optimize()