def __init__(self, deterministic: bool = True):
    """Build the ordinal verbalizer.

    Consumes a serialized ``integer: "<words>"`` field, drops the field
    wrapper and rewrites the end of the quoted words with the suffix rule
    stored in ``self.suffix``.

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

    # Both tables are inverted after loading, i.e. they are applied in the
    # opposite direction from how they are stored in the TSV files.
    digit_endings = pynini.invert(pynini.string_file(get_abs_path("data/ordinals/digit.tsv")))
    teen_endings = pynini.invert(pynini.string_file(get_abs_path("data/ordinals/teen.tsv")))
    ty_ending = pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
    # Generic fallback: append "th" (weight 0.01, versus 0.001 for the "ty" rule).
    fallback_th = pynutil.insert("th", weight=0.01)

    # Left context "" and right context "[EOS]": the rewrite applies only at
    # the very end of the string.
    self.suffix = pynini.cdrewrite(
        digit_endings | teen_endings | ty_ending | fallback_th,
        "",
        "[EOS]",
        NEMO_SIGMA,
    ).optimize()

    drop_quote = pynutil.delete("\"")
    quoted_integer = (
        pynutil.delete("integer:")
        + delete_space
        + drop_quote
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + drop_quote
    )

    self.fst = self.delete_tokens(quoted_integer @ self.suffix).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """Build the time tagger.

    Three written shapes are accepted (examples taken from the inline
    comments of the original code): ``2:30 pm`` / ``02:30`` / ``2:00``,
    ``2.xx pm/am`` and ``2 pm est``. The output is a sequence of
    ``hours``, ``minutes``, ``suffix`` and ``zone`` fields wrapped by
    ``self.add_tokens``.

    Args:
        cardinal: tagger whose ``graph`` attribute verbalizes numbers.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    am_pm_table = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
    zone_table = pynini.string_file(get_abs_path("data/time_zone.tsv"))

    # NOTE(review): the original comment at this point read
    # "only used for < 1000 thousand -> 0 weight"; its intent is not clear
    # from this block alone.
    number_words = cardinal.graph

    close_quote = pynutil.insert("\"")

    hour_values = pynini.union(*(str(hour) for hour in range(24)))
    minutes_1_to_9 = pynini.union(*(str(minute) for minute in range(1, 10)))
    minutes_10_to_59 = pynini.union(*(str(minute) for minute in range(10, 60)))

    # Two digits pass through untouched; a single digit may be preceded by
    # one "0", which is deleted.
    strip_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (
        pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT
    )

    hour_words = strip_leading_zero @ hour_values @ number_words
    hours_field = pynutil.insert("hours: \"") + hour_words + close_quote

    single_minute_words = minutes_1_to_9 @ number_words
    double_minute_words = minutes_10_to_59 @ number_words
    # Minutes written "0X" are rendered as "o" followed by the word for X.
    minute_words = (
        pynini.cross("0", "o") + insert_space + single_minute_words
    ) | double_minute_words
    minutes_field = pynutil.insert("minutes: \"") + minute_words + close_quote

    suffix_field = pynutil.insert("suffix: \"") + convert_space(am_pm_table) + close_quote
    optional_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
    optional_zone = pynini.closure(
        delete_space
        + insert_space
        + pynutil.insert("zone: \"")
        + convert_space(zone_table)
        + close_quote,
        0,
        1,
    )

    # "00" minutes are deleted outright; any other value becomes a minutes field.
    minutes_or_nothing = pynutil.delete("00") | (insert_space + minutes_field)

    # 2:30 pm, 02:30, 2:00
    colon_form = (
        hours_field
        + pynutil.delete(":")
        + minutes_or_nothing
        + optional_suffix
        + optional_zone
    )
    # 2.xx pm/am  (the suffix is mandatory in this form)
    dot_form = (
        hours_field
        + pynutil.delete(".")
        + minutes_or_nothing
        + delete_space
        + insert_space
        + suffix_field
        + optional_zone
    )
    # 2 pm est  (the suffix is mandatory in this form)
    hour_only_form = hours_field + delete_space + insert_space + suffix_field + optional_zone

    all_forms = (colon_form | hour_only_form | dot_form).optimize()
    self.fst = self.add_tokens(all_forms).optimize()
def __init__(self, deterministic: bool = True):
    """Build the cardinal tagger.

    Publishes several graphs as attributes for other taggers to reuse
    (``graph``, ``graph_hundred_component_at_least_one_none_zero_digit``,
    ``single_digits_graph`` and, in non-deterministic mode, ``range_graph``)
    and stores the final token-wrapped transducer in ``self.fst``.

    Args:
        deterministic: forwarded to the base class; when False, extra
            alternative readings are added to ``self.graph`` and to the
            final graph.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
    # Number-name grammar loaded from a pre-built FAR archive.
    graph = pynini.Far(
        get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
    # Input restricted to 2-3 digits, or a single digit other than "0".
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3)
        | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
    # 1-3 leading digits followed by any number of 3-digit groups; a ","
    # in front of each group is optional and is deleted.
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
        pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT +
        NEMO_DIGIT)) @ graph
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    # Digit-by-digit reading: inverted table entries at weight 1.2, plus
    # "0" -> "oh" at the slightly lower weight 1.1.
    single_digits_graph = pynutil.add_weight(
        pynini.invert(graph_digit | graph_zero), 1.2) | pynutil.add_weight(
            pynini.cross("0", "oh"), 1.1)
    # One or more single digits, separated by inserted spaces.
    self.single_digits_graph = single_digits_graph + pynini.closure(
        pynutil.insert(" ") + single_digits_graph)
    # NOTE(review): the source for this block was collapsed onto one line,
    # so the extent of this `if` suite is reconstructed. `range_graph` is
    # only consumed under `not deterministic` guards, hence it is placed
    # inside - confirm against the original file.
    if not deterministic:
        # Digit-by-digit reading of comma-grouped numbers: 1-3 leading
        # digits, then one or more ",ddd" groups (the comma is deleted).
        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + pynutil.insert(" "), 1,
            3) + pynini.closure(
                pynutil.delete(",") + single_digits_graph +
                pynutil.insert(" ") + single_digits_graph +
                pynutil.insert(" ") + single_digits_graph,
                1,
            )
        self.graph |= self.single_digits_graph | get_hundreds_graph(
        ) | single_digits_graph_with_commas
        # "<n>-<m>" read with " to " or just a space, optionally prefixed
        # with an inserted "from ".
        self.range_graph = (
            pynini.closure(pynutil.insert("from "), 0, 1) + self.graph +
            (pynini.cross("-", " to ") | pynini.cross("-", " ")) + self.graph)
        # "<n>x<m>" / "<n> x <m>" read with " by ".
        self.range_graph |= self.graph + (pynini.cross(
            "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
        self.range_graph = self.range_graph.optimize()
    # Optional leading "-" becomes the field `negative: "true" `.
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
    # Serial-number readings are allowed as well, at extra weight 1.2.
    final_graph = self.graph | pynutil.add_weight(self.get_serial_graph(), 1.2)
    if not deterministic:
        final_graph |= self.range_graph
    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + final_graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def _get_whitelist_non_deterministic_graph(file="data/whitelist_alternatives.tsv"):
    """Return a string-map FST built from a two-column label file.

    Every row contributes two mappings: one with both columns lowercased
    and one with the row exactly as loaded. All lowercased pairs are
    listed before the as-loaded pairs.

    Args:
        file: path of the label file, resolved through ``get_abs_path``.
    """
    lowered_pairs = []
    original_pairs = []
    for written, spoken in load_labels(get_abs_path(file)):
        lowered_pairs.append((written.lower(), spoken.lower()))
        original_pairs.append((written, spoken))
    return pynini.string_map(lowered_pairs + original_pairs)
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Build the money tagger.

    Emits a ``currency`` field followed by either an ``integer_part``
    field or the decimal tagger's fields. The singular currency name is
    used only when the integer amount is exactly ``1``; every other
    amount (including all decimals) uses the pluralized name.

    Args:
        cardinal: tagger whose ``graph`` verbalizes the integer amount.
        decimal: tagger whose ``final_graph_wo_negative`` handles decimals.
    """
    super().__init__(name="money", kind="classify")

    number_words = cardinal.graph
    decimal_fields = decimal.final_graph_wo_negative

    # Pluralize first, then space-convert each variant.
    currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
    plural_name = convert_space(currency_table @ SINGULAR_TO_PLURAL)
    singular_name = convert_space(currency_table)

    open_currency = pynutil.insert("currency: \"")
    open_integer = pynutil.insert(" integer_part: \"")
    close_quote = pynutil.insert("\"")

    singular_currency = open_currency + singular_name + close_quote
    plural_currency = open_currency + plural_name + close_quote

    # Any amount except the literal string "1" takes the plural currency.
    amount_not_one = (
        plural_currency
        + open_integer
        + ((NEMO_SIGMA - "1") @ number_words)
        + close_quote
    )
    amount_one = (
        singular_currency
        + open_integer
        + pynini.cross("1", "one")
        + close_quote
    )
    amount_decimal = plural_currency + insert_space + decimal_fields

    tagged = self.add_tokens(amount_not_one | amount_one | amount_decimal)
    self.fst = tagged.optimize()
def get_serial_graph(self):
    """
    Return the transducer for serial-like strings: alternating runs of
    letters and numbers, optionally separated by "-" or "/", e.g.
        c325-b -> "c three two five b"

    In deterministic mode numbers are read through ``self.single_digits_graph``;
    otherwise ``self.graph`` is used and an optional trailing "s" is accepted
    (kept as "s" or rewritten to "es").
    The returned graph is not wrapped in tokens.
    """
    number = self.single_digits_graph if self.deterministic else self.graph

    # Plain letters, plus the entries of the letter-pronunciation table.
    letters = NEMO_ALPHA
    letters |= pynini.string_map(load_labels(get_abs_path("data/letter_pronunciation.tsv")))

    # Between two parts: insert a space, or turn "-" / "/" into a space.
    separator = insert_space | pynini.cross("-", " ") | pynini.cross("/", " ")

    # A serial must start with letters-then-number or numbers-then-letter ...
    letters_first = pynini.closure(letters + separator, 1) + number
    numbers_first = pynini.closure(number + separator, 1) + letters
    # ... and may continue with any further separated letters or numbers.
    continuation = pynini.closure(separator + (letters | number))

    serial = (letters_first | numbers_first) + continuation
    if not self.deterministic:
        serial += pynini.closure(pynini.accep("s") | pynini.cross("s", "es"), 0, 1)
    return serial
def __init__(self, deterministic: bool = True):
    """Build the electronic (e-mail style address) verbalizer.

    Consumes a serialized ``username: "..."`` field followed by a
    ``domain: "..."`` field, removes the field wrappers and emits the
    characters separated by spaces, with "at " inserted between the two
    parts and "." in the domain rendered as "dot ".

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    # digit.tsv is inverted after loading; symbols.tsv is used as stored.
    graph_digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    graph_symbols = pynini.string_file(
        get_abs_path("data/electronic/symbols.tsv")).optimize()

    # The field prefix was mangled into an invalid token run
    # (`"username:"******"\""`) in the previous revision; it is restored
    # here to mirror the `domain:` prefix handling further down.
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + (pynini.closure(
            # Table-driven readings (1.09) are preferred over copying the
            # raw character (1.1).
            pynutil.add_weight(graph_digit + insert_space, 1.09)
            | pynutil.add_weight(
                pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
            | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1)))
        + pynutil.delete("\""))

    # Generic domain: spaced characters, then "." -> "dot ", then at least
    # one more character.
    domain_default = (
        pynini.closure(NEMO_NOT_QUOTE + insert_space)
        + pynini.cross(".", "dot ")
        + NEMO_NOT_QUOTE
        + pynini.closure(insert_space + NEMO_NOT_QUOTE))

    # Generic server: spaced letters/digits, optional symbols, then more
    # spaced letters/digits.
    server_default = (
        pynini.closure((graph_digit | NEMO_ALPHA) + insert_space, 1)
        + pynini.closure(graph_symbols + insert_space)
        + pynini.closure((graph_digit | NEMO_ALPHA) + insert_space, 1))

    # Table-driven server and domain names.
    server_common = pynini.string_file(
        get_abs_path("data/electronic/server_name.tsv")) + insert_space
    domain_common = pynini.cross(".", "dot ") + pynini.string_file(
        get_abs_path("data/electronic/domain.tsv"))

    # Table entries (1.09) win over the generic spell-out (1.1).
    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + (pynutil.add_weight(server_common, 1.09)
           | pynutil.add_weight(server_default, 1.1))
        + (pynutil.add_weight(domain_common, 1.09)
           | pynutil.add_weight(domain_default, 1.1))
        + delete_space
        + pynutil.delete("\""))

    graph = (user_name + delete_space + pynutil.insert("at ") + delete_space
             + domain + delete_space)
    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
    """Return a string-map FST built from a two-column whitelist file.

    Args:
        input_case: when equal to ``"lower_cased"`` the first column of every
            row is lowercased; any other value keeps it as loaded. The second
            column is never modified.
        file: path of the whitelist file, resolved through ``get_abs_path``.
    """
    lowercase_keys = input_case == "lower_cased"
    pairs = [
        (written.lower() if lowercase_keys else written, spoken)
        for written, spoken in load_labels(get_abs_path(file))
    ]
    return pynini.string_map(pairs)
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Build the measure tagger.

    Emits a ``decimal { ... }`` or ``cardinal { ... }`` group followed by a
    ``units`` field. The singular unit name is used only when the cardinal
    is exactly ``1``; all other cardinals and all decimals take the
    pluralized unit.

    Args:
        cardinal: tagger whose ``graph`` verbalizes integers.
        decimal: tagger whose ``final_graph_wo_negative`` handles decimals.
    """
    super().__init__(name="measure", kind="classify")

    number_words = cardinal.graph

    # Pluralize first, then space-convert each variant.
    unit_table = pynini.string_file(get_abs_path("data/measurements.tsv"))
    plural_unit = convert_space(unit_table @ SINGULAR_TO_PLURAL)
    singular_unit = convert_space(unit_table)

    # Optional leading "-" becomes the field `negative: "true" `.
    optional_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    # "/<unit>" is read as "per <unit>", joined by a non-breaking space.
    per_unit = (
        pynini.cross("/", "per")
        + delete_space
        + pynutil.insert(NEMO_NON_BREAKING_SPACE)
        + singular_unit
    )
    optional_per_unit = pynini.closure(
        delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + per_unit,
        0,
        1,
    )

    def _units_field(first_unit):
        # Either "<unit>[ per <unit>]" or a bare "per <unit>".
        return (
            pynutil.insert("units: \"")
            + (first_unit + optional_per_unit | per_unit)
            + pynutil.insert("\"")
        )

    plural_units_field = _units_field(plural_unit)
    singular_units_field = _units_field(singular_unit)

    def _cardinal_group(integer_words, units_field):
        return (
            pynutil.insert("cardinal { ")
            + optional_negative
            + pynutil.insert("integer: \"")
            + integer_words
            + delete_space
            + pynutil.insert("\"")
            + pynutil.insert(" } ")
            + units_field
        )

    decimal_branch = (
        pynutil.insert("decimal { ")
        + optional_negative
        + decimal.final_graph_wo_negative
        + delete_space
        + pynutil.insert(" } ")
        + plural_units_field
    )
    # Any number except the literal string "1" takes the plural unit.
    cardinal_branch = _cardinal_group((NEMO_SIGMA - "1") @ number_words, plural_units_field)
    cardinal_branch |= _cardinal_group(pynini.cross("1", "one"), singular_units_field)

    tagged = self.add_tokens(decimal_branch | cardinal_branch)
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst):
    """Build the decimal tagger.

    Publishes ``self.graph`` (used for the fractional part) and
    ``self.final_graph_wo_negative`` (the unsigned decimal, with or without
    a quantity as produced by ``get_quantity``), and stores the signed,
    token-wrapped transducer in ``self.fst``.

    Args:
        cardinal: tagger providing ``graph`` for the integer part and
            ``graph_hundred_component_at_least_one_none_zero_digit`` for
            quantities.
    """
    super().__init__(name="decimal", kind="classify")

    integer_words = cardinal.graph
    nonzero_hundreds = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    # Union of the digit and zero tables, as stored on disk.
    digit_table = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    digit_table |= pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    # Same, additionally accepting "o" for "0".
    digit_or_o = digit_table | pynini.cross("o", "0")

    # A lone "zero", a lone table entry, or two or more entries whose
    # separating whitespace is deleted.
    digit_sequence = (
        pynini.cross("zero", "0")
        | digit_table
        | (digit_or_o + pynini.closure(delete_space + digit_or_o, 1))
    )
    # The published graph runs in the opposite direction of the tables.
    self.graph = pynini.invert(digit_sequence).optimize()

    close_quote = pynutil.insert("\"")
    fractional_field = pynutil.insert("fractional_part: \"") + self.graph + close_quote
    integer_field = pynutil.insert("integer_part: \"") + integer_words + close_quote

    # The integer part is optional; the "." is deleted and replaced by a
    # space between the two fields.
    unsigned_decimal = (
        pynini.closure(integer_field + pynutil.insert(" "), 0, 1)
        + pynutil.delete(".")
        + pynutil.insert(" ")
        + fractional_field
    )
    self.final_graph_wo_negative = unsigned_decimal | get_quantity(
        unsigned_decimal, nonzero_hundreds
    )

    # Optional leading "-" becomes the field `negative: "true" `.
    optional_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )
    tagged = self.add_tokens(optional_negative + self.final_graph_wo_negative)
    self.fst = tagged.optimize()
def __init__(self, input_case: str):
    """Build the whitelist tagger from ``data/whitelist.tsv``.

    Each row maps its first column to its second column; the result is
    emitted inside a ``name: "..."`` field.

    Args:
        input_case: when equal to ``"lower_cased"`` the first column of every
            row is lowercased before the map is built; otherwise rows are
            used as loaded.
    """
    super().__init__(name="whitelist", kind="classify")

    lowercase_keys = input_case == "lower_cased"
    pairs = [
        (written.lower() if lowercase_keys else written, spoken)
        for written, spoken in load_labels(get_abs_path("data/whitelist.tsv"))
    ]
    replacements = convert_space(pynini.string_map(pairs))

    name_field = pynutil.insert("name: \"") + replacements + pynutil.insert("\"")
    self.fst = name_field.optimize()
def __init__(self):
    """Build the cardinal tagger.

    Publishes ``self.graph`` and
    ``self.graph_hundred_component_at_least_one_none_zero_digit`` for reuse
    and stores the token-wrapped ``integer: "..."`` transducer (with an
    optional ``negative`` field) in ``self.fst``.
    """
    super().__init__(name="cardinal", kind="classify")

    # Number-name grammar loaded from a pre-built FAR archive.
    number_names = pynini.Far(get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

    # Input restricted to 2-3 digits, or a single digit other than "0".
    two_or_three_digits = pynini.closure(NEMO_DIGIT, 2, 3)
    single_nonzero_digit = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        two_or_three_digits | single_nonzero_digit
    ) @ number_names

    # 1-3 leading digits, then any number of 3-digit groups; a "," in front
    # of a group is optional and is deleted.
    optional_comma = pynini.closure(pynutil.delete(","), 0, 1)
    digit_groups = pynini.closure(optional_comma + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT)
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + digit_groups) @ number_names

    # Optional leading "-" becomes the field `negative: "true" `.
    optional_minus = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )
    integer_field = (
        optional_minus + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    )
    self.fst = self.add_tokens(integer_field).optimize()
def __init__(self, deterministic: bool = True):
    """Build the telephone tagger.

    Accepts an optional "+"-prefixed country code of 1-3 digits, a number
    of the form ``ddd-ddd-dddd`` or ``(ddd) ddd-dddd``, and an optional
    "-"-prefixed extension of 1-4 digits. Digits are read one by one and
    separated by spaces; ", " is inserted between the number components.

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    component_separator = pynutil.insert(", ")  # between components
    close_quote = pynutil.insert("\"")
    optional_dash = pynini.closure(pynutil.delete("-"), 0, 1)

    # Inverted digit table, plus the extra reading "0" -> "o".
    spoken_digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize() | pynini.cross("0", "o")

    def _spaced_digits(low, high):
        # Between `low` and `high` digits, each followed by an inserted space.
        return pynini.closure(spoken_digit + insert_space, low, high)

    country_code_field = (
        pynutil.insert("country_code: \"")
        + pynutil.delete("+")
        + _spaced_digits(0, 2)
        + spoken_digit
        + close_quote
    )
    optional_country_code = pynini.closure(
        country_code_field + optional_dash + delete_space + insert_space, 0, 1
    )

    # Area code: "ddd-" or "(ddd)" optionally followed by "-" and spaces.
    area_with_dash = _spaced_digits(2, 2) + spoken_digit + pynutil.delete("-")
    area_in_parens = (
        pynutil.delete("(")
        + _spaced_digits(2, 2)
        + spoken_digit
        + pynutil.delete(")")
        + optional_dash
        + delete_space
    )
    number_digits = (
        (area_with_dash | area_in_parens)
        + component_separator
        + _spaced_digits(2, 2)
        + spoken_digit
        + pynutil.delete("-")
        + component_separator
        + _spaced_digits(3, 3)
        + spoken_digit
    )
    number_field = pynutil.insert("number_part: \"") + number_digits + close_quote

    # NOTE(review): the key below is emitted as "extension : " with a space
    # before the colon, unlike the other field names in this block - confirm
    # that downstream consumers expect exactly this spelling.
    extension_field = (
        pynutil.insert("extension : \"")
        + _spaced_digits(0, 3)
        + spoken_digit
        + close_quote
    )
    optional_extension = pynini.closure(
        insert_space + pynutil.delete("-") + extension_field, 0, 1
    )

    tagged = self.add_tokens(optional_country_code + number_field + optional_extension)
    self.fst = tagged.optimize()
# Whitespace characters: space, tab, newline, carriage return and U+00A0.
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
# Any NEMO_CHAR that is not one of the whitespace characters above.
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
# Any NEMO_CHAR except the double quote.
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
# Every character of string.punctuation, escaped for pynini.
NEMO_PUNCT = pynini.union(
    *map(pynini.escape, string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
# Zero or more NEMO_CHAR, i.e. any string over that alphabet.
NEMO_SIGMA = pynini.closure(NEMO_CHAR)

# Deletes zero or more whitespace characters.
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
# Inserts a single space.
insert_space = pynutil.insert(" ")
# Collapses one or more whitespace characters into a single space.
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")

# Pluralization rules.
# Table of plural forms loaded from data/suppletive.tsv.
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
# Consonant letters (every ASCII lowercase letter except a, e, i, o, u).
_c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p",
                  "q", "r", "s", "t", "v", "w", "x", "y", "z")
# <anything> + consonant + "y" -> "...ies"
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
# <anything> ending in s / sh / ch / x / z -> append "es"
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x",
                                "z") + pynutil.insert("es")
# <anything> -> append "s"
_s = NEMO_SIGMA + pynutil.insert("s")

# The rules are nested as suppletive, then _ies, then _es, then _s.
# NOTE(review): presumably `_priority_union` prefers its first argument
# wherever that one applies - confirm against the `plurals` helper.
graph_plural = plurals._priority_union(
    suppletive,
    plurals._priority_union(_ies,
                            plurals._priority_union(_es, _s, NEMO_SIGMA),
                            NEMO_SIGMA), NEMO_SIGMA).optimize()

# Public alias used by the taggers to pluralize unit and currency names.
SINGULAR_TO_PLURAL = graph_plural
def __init__(self):
    """Build the cardinal tagger that turns number words into digits.

    The digit-producing grammar is assembled from the word tables first and
    then inverted for the graphs that are published:

    * ``self.graph_hundred_component_at_least_one_none_zero_digit``
    * ``self.graph``

    ``self.fst`` holds the token-wrapped ``integer: "..."`` transducer with
    an optional ``negative`` field.
    """
    super().__init__(name="cardinal", kind="classify")

    # Local helper that deletes exactly one space character.
    one_space = pynutil.delete(" ")

    zero_words = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    digit_words = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ties_words = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    teen_words = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

    # Trim spaces at both ends and collapse whitespace runs between
    # alphabetic words into a single space.
    trim_spaces = pynini.closure(one_space)
    word = pynini.closure(NEMO_ALPHA, 1)
    normalize_spacing = (
        trim_spaces + pynini.closure(word + delete_extra_space) + word + trim_spaces
    )

    # Three digit positions; a missing position is filled with "0".
    hundreds_digit = pynini.union(
        digit_words + one_space + pynutil.delete("hundred") + one_space,
        pynutil.insert("0"),
    )
    tens_digit = (ties_words + one_space) | pynutil.insert("0")
    units_digit = digit_words | pynutil.insert("0")
    tens_and_units = pynini.union(
        teen_words | pynutil.insert("00"),
        tens_digit + units_digit,
    )
    three_digits = hundreds_digit + tens_and_units

    # string -> all 3 digit numbers apart from 000
    nonzero_three_digits = three_digits @ (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
    )

    # all 3 digit numbers apart from 0 -> string
    strip_leading_zeros = (
        pynutil.delete(pynini.closure("0"))
        + pynini.difference(NEMO_DIGIT, "0")
        + pynini.closure(NEMO_DIGIT)
    )
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.invert(nonzero_three_digits @ strip_leading_zeros) @ normalize_spacing
    ).optimize()

    optional_comma = pynini.closure(pynutil.insert(","), 0, 1)

    def _magnitude_group(scale_word):
        # "<1-999> <scale_word>", or "000" (weight 0.1) when the group is
        # absent; either way optionally followed by an inserted ",".
        return pynini.union(
            nonzero_three_digits + one_space + pynutil.delete(scale_word),
            pynutil.insert("000", weight=0.1),
        ) + optional_comma

    # Largest magnitude first, each group separated by one deleted space.
    scale_words = (
        "sextillion",
        "quintillion",
        "quadrillion",
        "trillion",
        "billion",
        "million",
        "thousand",
    )
    full_number = _magnitude_group(scale_words[0])
    for scale_word in scale_words[1:]:
        full_number = full_number + one_space + _magnitude_group(scale_word)
    full_number = full_number + one_space + three_digits

    words_to_digits = pynini.union(full_number, zero_words)
    # Drop leading zeros and commas; a bare "0" is kept as is.
    words_to_digits = words_to_digits @ pynini.union(
        pynini.closure(pynutil.delete(pynini.union("0", ",")))
        + pynini.difference(NEMO_DIGIT, "0")
        + pynini.closure(pynini.union(NEMO_DIGIT, ",")),
        "0",
    )

    self.graph = (pynini.invert(words_to_digits) @ normalize_spacing).optimize()

    # Optional leading "-" becomes the field `negative: "true" `.
    optional_minus = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )
    integer_field = (
        optional_minus + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    )
    self.fst = self.add_tokens(integer_field).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
    """Build the measure tagger.

    Seven alternatives are unioned:

    * decimal + plural unit
    * cardinal + unit (singular only when the cardinal is exactly ``1``)
    * ``<cardinal>-<letters>`` and ``<letters>-<cardinal>``
    * ``<decimal>-<letters>`` and ``<letters>-<decimal>``
    * fraction + plural unit

    The two "letters first" forms append ``preserve_order: true``.

    Args:
        cardinal: tagger providing ``graph`` (and ``range_graph`` when not
            deterministic).
        decimal: tagger providing ``final_graph_wo_negative``.
        fraction: tagger providing ``graph``.
        deterministic: forwarded to the base class; when False, cardinal
            ranges are accepted wherever a cardinal is.
    """
    super().__init__(name="measure", kind="classify", deterministic=deterministic)

    number_words = cardinal.graph
    if not deterministic:
        number_words |= cardinal.range_graph
    decimal_fields = decimal.final_graph_wo_negative

    # Pluralize first, then space-convert each variant.
    unit_table = pynini.string_file(get_abs_path("data/measurements.tsv"))
    plural_unit = convert_space(unit_table @ SINGULAR_TO_PLURAL)
    singular_unit = convert_space(unit_table)

    # Optional leading "-" becomes the field `negative: "true" `.
    optional_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    # "/<unit>" is read as "per <unit>", joined by a non-breaking space.
    per_unit = (
        pynini.cross("/", "per")
        + delete_space
        + pynutil.insert(NEMO_NON_BREAKING_SPACE)
        + singular_unit
    )
    optional_per_unit = pynini.closure(
        delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + per_unit,
        0,
        1,
    )

    close_quote = pynutil.insert("\"")
    plural_units_field = (
        pynutil.insert("units: \"")
        + (plural_unit + optional_per_unit | per_unit)
        + close_quote
    )
    singular_units_field = (
        pynutil.insert("units: \"")
        + (singular_unit + optional_per_unit | per_unit)
        + close_quote
    )

    decimal_branch = (
        pynutil.insert("decimal { ")
        + optional_negative
        + decimal_fields
        + delete_space
        + pynutil.insert(" } ")
        + plural_units_field
    )

    # Any number except the literal string "1" takes the plural unit.
    cardinal_branch = (
        pynutil.insert("cardinal { ")
        + optional_negative
        + pynutil.insert("integer: \"")
        + ((NEMO_SIGMA - "1") @ number_words)
        + delete_space
        + close_quote
        + pynutil.insert(" } ")
        + plural_units_field
    )
    cardinal_branch |= (
        pynutil.insert("cardinal { ")
        + optional_negative
        + pynutil.insert("integer: \"")
        + pynini.cross("1", "one")
        + delete_space
        + close_quote
        + pynutil.insert(" } ")
        + singular_units_field
    )

    # Dash-joined forms: the "-" is removed and the alphabetic part is
    # copied verbatim into the units field.
    letters = pynini.closure(NEMO_ALPHA, 1)
    drop_dash = pynini.cross("-", "")

    cardinal_then_letters = (
        pynutil.insert("cardinal { integer: \"")
        + number_words
        + drop_dash
        + pynutil.insert("\" } units: \"")
        + letters
        + close_quote
    )
    letters_then_cardinal = (
        pynutil.insert("units: \"")
        + letters
        + drop_dash
        + close_quote
        + pynutil.insert(" cardinal { integer: \"")
        + number_words
        + pynutil.insert("\" } preserve_order: true")
    )
    decimal_then_letters = (
        pynutil.insert("decimal { ")
        + decimal_fields
        + drop_dash
        + pynutil.insert(" } units: \"")
        + letters
        + close_quote
    )
    letters_then_decimal = (
        pynutil.insert("units: \"")
        + letters
        + drop_dash
        + close_quote
        + pynutil.insert(" decimal { ")
        + decimal_fields
        + pynutil.insert(" } preserve_order: true")
    )

    fraction_branch = (
        pynutil.insert("fraction { ")
        + fraction.graph
        + delete_space
        + pynutil.insert(" } ")
        + plural_units_field
    )

    all_branches = (
        decimal_branch
        | cardinal_branch
        | cardinal_then_letters
        | letters_then_cardinal
        | decimal_then_letters
        | letters_then_decimal
        | fraction_branch
    )
    self.fst = self.add_tokens(all_branches).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """Build the date tagger.

    Accepted layouts:

    * month name (or abbreviation, optional trailing "." deleted), optional
      day, optional "," and optional year
    * numeric month / day / year separated by "-", "/" or "."
    * day, month name, optional year
    * numeric year / month / day separated by "-", "/" or "."
    * a standalone year

    The month-first and day-first layouts append ``preserve_order: true``.

    Args:
        cardinal: tagger providing
            ``graph_hundred_component_at_least_one_none_zero_digit``, used to
            verbalize the day.
        deterministic: forwarded to the base class and to ``_get_year_graph``.
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    close_quote = pynutil.insert("\"")

    # NOTE(review): TO_LOWER is defined elsewhere; presumably this lets a
    # spelling with a different-case first character reach the table - confirm.
    recase_first_char = TO_LOWER + pynini.closure(NEMO_CHAR)

    def _with_recased_variant(table):
        return table | (recase_first_char @ table)

    month_names = _with_recased_variant(
        pynini.string_file(get_abs_path("data/months/names.tsv")).optimize()
    )
    # Abbreviations may carry a trailing ".", which is deleted.
    month_abbreviations = _with_recased_variant(
        pynini.string_file(get_abs_path("data/months/abbr.tsv")).optimize()
    ) + pynini.closure(pynutil.delete("."), 0, 1)
    month_words = month_names | month_abbreviations
    month_from_number = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()

    day_words = cardinal.graph_hundred_component_at_least_one_none_zero_digit
    year_words = _get_year_graph(deterministic)

    # A year on its own carries a small extra weight.
    year_weight = 0.001
    standalone_year_field = (
        pynutil.insert("year: \"") + pynutil.add_weight(year_words, year_weight) + close_quote
    )

    month_field = pynutil.insert("month: \"") + month_words + close_quote
    numeric_month_field = pynutil.insert("month: \"") + month_from_number + close_quote

    # Day: one digit, or two digits starting with 1, 2 or 3.
    day_digits = (pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT
    day_field = pynutil.insert("day: \"") + day_digits @ day_words + close_quote
    optional_day = pynini.closure(delete_extra_space + day_field, 0, 1)

    year_field = pynutil.insert("year: \"") + year_words + close_quote
    optional_year = pynini.closure(
        delete_extra_space + year_field,
        0,
        1,
    )

    separator = pynutil.delete(pynini.union("-", "/", "."))
    optional_leading_zero = pynini.closure(pynutil.delete("0"), 0, 1)

    month_day_year = (
        month_field
        + optional_day
        + delete_space
        + pynini.closure(pynutil.delete(","), 0, 1)
        + optional_year
    )
    month_day_year |= (
        numeric_month_field
        + separator
        + insert_space
        + optional_leading_zero
        + day_field
        + separator
        + insert_space
        + year_field
    )
    day_month_year = day_field + delete_extra_space + month_field + optional_year
    year_month_day = (
        year_field
        + separator
        + insert_space
        + numeric_month_field
        + separator
        + insert_space
        + optional_leading_zero
        + day_field
    )

    all_dates = (month_day_year | day_month_year) + pynutil.insert(" preserve_order: true")
    all_dates |= year_month_day | standalone_year_field

    self.fst = self.add_tokens(all_dates).optimize()
NEMO_CHAR, NEMO_DIGIT, NEMO_SIGMA, TO_LOWER, GraphFst, delete_extra_space, delete_space, insert_space, ) try: import pynini from pynini.lib import pynutil graph_teen = pynini.invert( pynini.string_file(get_abs_path("data/numbers/teen.tsv"))).optimize() graph_digit = pynini.invert( pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize() ties_graph = pynini.invert( pynini.string_file(get_abs_path("data/numbers/ties.tsv"))).optimize() PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Add placeholders for global variables graph_teen = None graph_digit = None ties_graph = None PYNINI_AVAILABLE = True
from nemo_text_processing.text_normalization.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, NEMO_SIGMA, TO_LOWER, GraphFst, delete_extra_space, delete_space, insert_space, ) try: import pynini from pynini.lib import pynutil graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/numbers/teen.tsv"))).optimize() graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize() ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/numbers/ties.tsv"))).optimize() PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Add placeholders for global variables graph_teen = None graph_digit = None ties_graph = None PYNINI_AVAILABLE = True def _get_ties_graph(): """