def __init__(self, deterministic: bool = True):
    """
    Build the ordinal verbalizer.

    The grammar strips the ``integer: "..."`` wrapper and rewrites the end
    of the quoted content into an ordinal ending.

    Args:
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

    # Both lookup tables are read from TSV and inverted in place.
    digit_endings = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")).invert()
    teen_endings = pynini.string_file(get_abs_path("data/ordinals/teen.tsv")).invert()

    # integer: "<content>"  ->  <content>  (content must be non-empty)
    field_prefix = pynutil.delete("integer:") + delete_space + pynutil.delete("\"")
    unwrapped = field_prefix + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    # Weighted fallbacks: "ty" -> "tieth" (0.001) and a plain "th" insertion (0.01).
    ty_to_tieth = pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
    append_th = pynutil.insert("th", weight=0.01)
    ending_rule = digit_endings | teen_endings | ty_to_tieth | append_th

    # The rewrite only fires immediately before end-of-string.
    suffix = pynini.cdrewrite(ending_rule, "", "[EOS]", NEMO_SIGMA).optimize()

    self.suffix = suffix
    self.graph = pynini.compose(unwrapped, suffix)
    self.fst = self.delete_tokens(self.graph).optimize()
def get_address_graph(self, cardinal):
    """
    Build the transducer that reads a street address, e.g.:
        2788 San Tomas Expy, Santa Clara, CA 95051 ->
            two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one

    Only the verbalized address text is produced here; no token wrapper is
    inserted by this method.

    Args:
        cardinal: cardinal tagger; its ``single_digits_graph`` reads the house
            number and the zip code, and it is passed on to ``OrdinalTagger``.

    Returns:
        The address graph: number, optional direction, street, optional ".",
        optional city, optional state, optional zip code.
    """
    # Ordinal street names: tag, wrap as an integer field, then verbalize.
    verbalized_ordinal = OrdinalVerbalizer().graph
    tagged_ordinal = OrdinalTagger(cardinal=cardinal).graph
    quoted_ordinal = pynutil.insert("integer: \"") + tagged_ordinal + pynutil.insert("\"")
    street_ordinal = pynini.compose(quoted_ordinal, verbalized_ordinal)

    # House number is read digit by digit.
    house_number = pynini.closure(NEMO_DIGIT, 1) @ cardinal.single_digits_graph

    compass = pynini.union(
        pynini.cross("E", "East"),
        pynini.cross("S", "South"),
        pynini.cross("W", "West"),
        pynini.cross("N", "North"),
    )
    # Negative weight favours expanding a direction letter when one is present.
    optional_compass = pynini.closure(pynutil.add_weight(pynini.accep(NEMO_SPACE) + compass, -1), 0, 1)

    street_type = pynini.string_file(get_abs_path("data/address/address_words.tsv"))
    street = (
        pynini.accep(NEMO_SPACE)
        + pynini.closure(street_ordinal, 0, 1)
        + pynini.closure(NEMO_ALPHA | NEMO_SPACE, 1)
        + street_type
    )

    # ", <city>" -- the comma is dropped, the space is kept.
    city_name = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
    optional_city = pynini.closure(pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + city_name, 0, 1)

    # ", <state>" -- looked up through the inverted states table.
    state_name = pynini.invert(pynini.string_file(get_abs_path("data/address/states.tsv")))
    optional_state = pynini.closure(pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + state_name, 0, 1)

    # Exactly five digits, read digit by digit; strongly preferred when it matches.
    five_digits = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
    zip_with_separator = pynini.closure(pynini.cross(",", ""), 0, 1) + pynini.accep(NEMO_SPACE) + five_digits
    optional_zip = pynini.closure(pynutil.add_weight(zip_with_separator, -100), 0, 1)

    optional_period = pynini.closure(pynini.cross(".", ""), 0, 1)
    return (
        house_number
        + optional_compass
        + street
        + optional_period
        + optional_city
        + optional_state
        + optional_zip
    )
def __init__(self, deterministic: bool = True):
    """
    Build the electronic (e-mail / URL) verbalizer.

    The grammar consumes, in order: an optional ``protocol: "..."`` field, an
    optional ``username: "..."`` field (followed by an inserted "at "), and a
    mandatory ``domain: "..."`` field.

    Args:
        deterministic: when False, "0" may also be read as "o" or "oh";
            the value is also forwarded to the parent constructor.
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    graph_digit_no_zero = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize()
    graph_zero = pynini.cross("0", "zero")
    if not deterministic:
        graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")

    graph_digit = graph_digit_no_zero | graph_zero
    graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize()

    # username: "<chars>"  ->  characters spelled out one by one, space separated.
    # The field prefix is removed the same way as for the domain field below.
    # Digits and symbols (1.09) are slightly cheaper than the raw character (1.1).
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + (
            pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1)
            )
        )
        + pynutil.delete("\"")
    )

    server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
    domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    # Known server/domain entries are preferred (negative weight) over
    # passing a single character through.
    convert_defaults = (
        NEMO_NOT_QUOTE
        | pynutil.add_weight(domain_common, -0.1)
        | pynutil.add_weight(server_common, -0.1)
    )
    domain = convert_defaults + pynini.closure(pynutil.insert(" ") + convert_defaults)
    # Second pass over the spaced-out domain: expand symbols and digits.
    domain = pynini.compose(
        domain,
        pynini.closure(
            pynutil.add_weight(graph_symbols, -0.1)
            | pynutil.add_weight(graph_digit, -0.1)
            | NEMO_NOT_QUOTE
        ),
    )
    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + domain
        + delete_space
        + pynutil.delete("\"")
    )

    # protocol: "<text>"  ->  <text> (content is passed through unchanged).
    protocol = pynutil.delete("protocol: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    graph = (
        pynini.closure(protocol + delete_space, 0, 1)
        + pynini.closure(user_name + delete_space + pynutil.insert("at ") + delete_space, 0, 1)
        + domain
        + delete_space
    )
    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def _get_whitelist_non_deterministic_graph(file="data/whitelist_alternatives.tsv"):
    """
    Build a string-map graph from a two-column label file.

    Every row contributes two mappings: a fully lower-cased copy of the pair
    and the pair exactly as written in the file.

    Args:
        file: path of the label file, resolved through ``get_abs_path``.

    Returns:
        ``pynini.string_map`` over the lower-cased pairs followed by the
        original-case pairs.
    """
    lowered_pairs = []
    cased_pairs = []
    for source, target in load_labels(get_abs_path(file)):
        lowered_pairs.append((source.lower(), target.lower()))
        cased_pairs.append((source, target))
    return pynini.string_map(lowered_pairs + cased_pairs)
def __init__(self, deterministic: bool = True):
    """
    Build the telephone tagger.

    The grammar emits an optional ``country_code`` field, a mandatory
    ``number_part`` field (area code, ", ", then seven digit/letter symbols)
    and an optional ``extension`` field.

    Args:
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    # Inserted between the area code and the rest of the number.
    component_separator = pynutil.insert(", ")

    # Digit table plus the extra reading "0" -> "o".
    digit_words = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    spoken_digit = digit_words | pynini.cross("0", "o")

    # Optional "+", then one to three digits read one by one.
    country_code = (
        pynutil.insert("country_code: \"")
        + pynini.closure(pynutil.delete("+"), 0, 1)
        + pynini.closure(spoken_digit + insert_space, 0, 2)
        + spoken_digit
        + pynutil.insert("\"")
    )
    optional_country_code = pynini.closure(
        country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space,
        0,
        1,
    )

    # Area code: three spoken digits, or the preferred special case "800".
    toll_free = pynutil.add_weight(pynini.cross("800", "eight hundred"), -1.1)
    three_digits = pynini.closure(spoken_digit + insert_space, 2, 2) + spoken_digit
    area_code = three_digits | toll_free
    dashed_area = area_code + pynutil.delete("-")
    parenthesized_area = pynutil.delete("(") + area_code + (pynutil.delete(") ") | pynutil.delete(")-"))
    area_part = (dashed_area | parenthesized_area) + component_separator

    # Length filter: exactly seven digits/letters, each optionally followed by "-" or " ".
    optional_separator = pynini.closure(pynini.union("-", " "), 0, 1)
    seven_symbols = ((NEMO_DIGIT + optional_separator) | (NEMO_ALPHA + optional_separator)) ** 7

    # A digit is spoken and followed by a space (or "-" becomes ", ");
    # a letter is kept, with a following "-" turned into a space.
    digit_step = (NEMO_DIGIT @ spoken_digit) + (insert_space | pynini.cross("-", ", "))
    letter_step = NEMO_ALPHA | (NEMO_ALPHA + pynini.cross("-", " "))
    subscriber = pynini.compose(seven_symbols, pynini.closure(digit_step | letter_step))

    number_part = pynutil.insert("number_part: \"") + (area_part + subscriber) + pynutil.insert("\"")

    # One to four digits read one by one.
    extension = (
        pynutil.insert("extension: \"")
        + pynini.closure(spoken_digit + insert_space, 0, 3)
        + spoken_digit
        + pynutil.insert("\"")
    )
    optional_extension = pynini.closure(insert_space + extension, 0, 1)

    tagged = self.add_tokens(optional_country_code + number_part + optional_extension)
    self.fst = tagged.optimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """
    Build the whitelist tagger.

    Args:
        input_case: when equal to "lower_cased", the first column of each
            whitelist row is lower-cased before the graph is built.
        deterministic: when False, the alternatives file is added and a
            user-provided whitelist extends the default one; when True, a
            user-provided whitelist replaces the default one.
        input_file: optional path of a user-provided whitelist file.
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _graph_from_file(case, path):
        # Lower-case only the left-hand side, and only when requested.
        lower_keys = case == "lower_cased"
        pairs = [(key.lower() if lower_keys else key, value) for key, value in load_labels(path)]
        return pynini.string_map(pairs)

    def _alternatives_graph(file="data/whitelist_alternatives.tsv"):
        # Each row is mapped both fully lower-cased and as written.
        lowered_pairs = []
        cased_pairs = []
        for key, value in load_labels(get_abs_path(file)):
            lowered_pairs.append((key.lower(), value.lower()))
            cased_pairs.append((key, value))
        return pynini.string_map(lowered_pairs + cased_pairs)

    graph = _graph_from_file(input_case, get_abs_path("data/whitelist.tsv"))
    if not deterministic:
        graph |= _alternatives_graph()

    if input_file:
        provided = _graph_from_file(input_case, input_file)
        if deterministic:
            # The user file overrides the built-in whitelist entirely.
            graph = provided
        else:
            graph |= provided

    self.graph = (convert_space(graph)).optimize()
    self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
def _load_roman(file: str):
    """
    Load a two-column label file and map each key, as written and
    upper-cased, to its value.

    Args:
        file: path of the label file, resolved through ``get_abs_path``.

    Returns:
        ``pynini.string_map`` over the as-written pairs followed by the
        upper-cased-key pairs.
    """
    rows = load_labels(get_abs_path(file))
    as_written = []
    upper_cased = []
    for numeral, value in rows:
        as_written.append((numeral, value))
        upper_cased.append((numeral.upper(), value))
    return pynini.string_map(as_written + upper_cased)
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """
    Build the time tagger.

    Four input shapes are accepted (see the inline examples below):
    ``H:MM``, ``H:MM:SS``, ``H.MM <suffix>`` and ``H <suffix>``, each with an
    optional time zone; the suffix is optional only for the colon forms.

    Args:
        cardinal: cardinal tagger; only its ``graph`` attribute is used, to
            read hour/minute/second numbers.
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)
    suffix_graph = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
    time_zone_graph = pynini.string_file(
        get_abs_path("data/time_zone.tsv"))

    # only used for < 1000 thousand -> 0 weight
    # NOTE: from here on the name `cardinal` refers to the graph, not the tagger object.
    cardinal = cardinal.graph

    # Accepted numeric labels: hours 0-23, minutes/seconds 1-9 and 10-59
    # ("00" is handled separately in the graphs below).
    labels_hour = [str(x) for x in range(0, 24)]
    labels_minute_single = [str(x) for x in range(1, 10)]
    labels_minute_double = [str(x) for x in range(10, 60)]

    # Two digits are kept as-is; a single digit may carry one leading "0", which is dropped.
    delete_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
        pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT)

    graph_hour = delete_leading_zero_to_double_digit @ pynini.union(
        *labels_hour) @ cardinal

    graph_minute_single = pynini.union(*labels_minute_single) @ cardinal
    graph_minute_double = pynini.union(*labels_minute_double) @ cardinal

    final_graph_hour = pynutil.insert(
        "hours: \"") + graph_hour + pynutil.insert("\"")
    # "+" binds tighter than "|": either ("0" -> "o", space, single digit) or a
    # two-digit value 10-59.
    final_graph_minute = (
        pynutil.insert("minutes: \"") +
        (pynini.cross("0", "o") + insert_space + graph_minute_single
         | graph_minute_double) + pynutil.insert("\""))
    # Seconds reuse the minute graphs (same 01-59 value range).
    final_graph_second = (
        pynutil.insert("seconds: \"") +
        (pynini.cross("0", "o") + insert_space + graph_minute_single
         | graph_minute_double) + pynutil.insert("\""))
    final_suffix = pynutil.insert("suffix: \"") + convert_space(
        suffix_graph) + pynutil.insert("\"")
    final_suffix_optional = pynini.closure(
        delete_space + insert_space + final_suffix, 0, 1)
    final_time_zone_optional = pynini.closure(
        delete_space + insert_space + pynutil.insert("zone: \"") +
        convert_space(time_zone_graph) + pynutil.insert("\""),
        0,
        1,
    )

    # 2:30 pm, 02:30, 2:00
    # "00" minutes are deleted, so no minutes field is emitted for them.
    graph_hm = (
        final_graph_hour + pynutil.delete(":") +
        (pynutil.delete("00") | insert_space + final_graph_minute) +
        final_suffix_optional + final_time_zone_optional)

    # 10:30:05 pm,
    # Unlike graph_hm, "00" is kept here as an explicit "zero" field.
    graph_hms = (final_graph_hour + pynutil.delete(":") +
                 (pynini.cross("00", " minutes: \"zero\"")
                  | insert_space + final_graph_minute) +
                 pynutil.delete(":") +
                 (pynini.cross("00", " seconds: \"zero\"")
                  | insert_space + final_graph_second) +
                 final_suffix_optional + final_time_zone_optional)

    # 2.xx pm/am
    # The suffix is mandatory for the dotted form.
    graph_hm2 = (
        final_graph_hour + pynutil.delete(".") +
        (pynutil.delete("00") | insert_space + final_graph_minute) +
        delete_space + insert_space + final_suffix +
        final_time_zone_optional)
    # 2 pm est
    # The suffix is mandatory for the hour-only form.
    graph_h = final_graph_hour + delete_space + insert_space + final_suffix + final_time_zone_optional
    final_graph = (graph_hm | graph_h | graph_hm2 | graph_hms).optimize()

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """
    Build the date tagger.

    Accepted layouts: month-name + day/year, day + month-name (+ year),
    numeric month/day(/year) and year/month/day with "-", "/" or "."
    separators, and a standalone year. In non-deterministic mode additional
    graphs re-order the emitted fields (month/day <-> day/month).

    Args:
        cardinal: cardinal tagger; its
            ``graph_hundred_component_at_least_one_none_zero_digit`` and
            ``single_digits_graph`` attributes are used.
        deterministic: selects whether " preserve_order: true" is mandatory
            (True) or optional (False), and whether the re-ordering graphs are
            added; also forwarded to the parent constructor and to
            ``_get_year_graph``.
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    # Month names, plus variants whose leading character(s) pass through
    # TO_LOWER before lookup (first character only, or two or more characters).
    month_graph = pynini.string_file(get_abs_path("data/months/names.tsv")).optimize()
    month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
        TO_LOWER ** (2, ...), month_graph
    )

    # Month abbreviations with the same case variants and an optional trailing "." that is deleted.
    month_abbr_graph = pynini.string_file(get_abs_path("data/months/abbr.tsv")).optimize()
    month_abbr_graph = (
        month_abbr_graph
        | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
        | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
    ) + pynini.closure(pynutil.delete("."), 0, 1)
    month_graph |= month_abbr_graph.optimize()

    month_numbers_graph = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()
    cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    year_graph = _get_year_graph(deterministic)

    # A bare year carries a small positive weight.
    YEAR_WEIGHT = 0.001
    year_graph_standalone = (
        pynutil.insert("year: \"") + pynutil.add_weight(year_graph, YEAR_WEIGHT) + pynutil.insert("\"")
    )

    month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
    month_numbers_graph = pynutil.insert("month: \"") + month_numbers_graph + pynutil.insert("\"")

    # Ordinal endings in lower and upper case; they are deleted after the day number.
    endings = ["rd", "th", "st", "nd"]
    endings += [x.upper() for x in endings]
    endings = pynini.union(*endings)

    # Day: optional "the ", then 1x/2x, a single digit, or 30/31, with an optional ending.
    # "@" binds tighter than "+", so only the digit/ending group is composed with cardinal_graph.
    day_graph = (
        pynutil.insert("day: \"")
        + pynini.closure(pynutil.delete("the "), 0, 1)
        + (
            ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
            + pynini.closure(pynutil.delete(endings), 0, 1)
        )
        @ cardinal_graph
        + pynutil.insert("\"")
    )

    # Two-digit year: read either digit by digit or as a number.
    two_digit_year = NEMO_DIGIT ** (2) @ (cardinal.single_digits_graph | cardinal_graph)
    two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")

    # Year that follows a month/day: the separating space is consumed and a field separator inserted.
    graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
    optional_graph_year = pynini.closure(graph_year, 0, 1)

    # NOTE: year_graph is re-bound to its tagged form only after graph_year and
    # year_graph_standalone were built from the untagged graph; order matters.
    year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

    # <month name> followed by a day, a year, or a day and a year.
    graph_mdy = month_graph + (
        (delete_extra_space + day_graph)
        | (pynini.accep(" ") + day_graph)
        | graph_year
        | (delete_extra_space + day_graph + graph_year)
    )

    delete_sep = pynutil.delete(pynini.union("-", "/", "."))
    # Numeric MM<sep>DD<sep>YYYY / YY; one leading "0" of the day is dropped.
    graph_mdy |= (
        month_numbers_graph
        + delete_sep
        + insert_space
        + pynini.closure(pynutil.delete("0"), 0, 1)
        + day_graph
        + delete_sep
        + insert_space
        + (year_graph | two_digit_year)
    )

    graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
    # Numeric YYYY / YY<sep>MM<sep>DD.
    graph_ymd = (
        (year_graph | two_digit_year)
        + delete_sep
        + insert_space
        + month_numbers_graph
        + delete_sep
        + insert_space
        + pynini.closure(pynutil.delete("0"), 0, 1)
        + day_graph
    )

    final_graph = graph_mdy | graph_dmy

    # Only the mdy/dmy graphs get the preserve_order marker (optional when non-deterministic).
    if deterministic:
        final_graph += pynutil.insert(" preserve_order: true")
    else:
        final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)

    # Numeric month<sep>day without a year.
    m_sep_d = (
        month_numbers_graph
        + delete_sep
        + insert_space
        + pynini.closure(pynutil.delete("0"), 0, 1)
        + day_graph
    )
    final_graph |= m_sep_d

    final_graph |= graph_ymd | year_graph_standalone

    if not deterministic:
        # Re-ordering graphs, accumulated as a union over every (month, day) pair.
        ymd_to_mdy_graph = None
        ymd_to_dmy_graph = None
        mdy_to_dmy_graph = None
        md_to_dm_graph = None

        # Only the first column of each label row is used.
        for month in [x[0] for x in load_labels(get_abs_path("data/months/names.tsv"))]:
            for day in [x[0] for x in load_labels(get_abs_path("data/months/days.tsv"))]:
                # Move the trailing month/day fields in front of the year field.
                ymd_to_mdy_curr = (
                    pynutil.insert("month: \"" + month + "\" day: \"" + day + "\" ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                    + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                )

                # YY-MM-DD -> MM-DD-YY
                ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                ymd_to_mdy_graph = (
                    ymd_to_mdy_curr
                    if ymd_to_mdy_graph is None
                    else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                )

                # Same move, but the inserted fields are day first, then month.
                ymd_to_dmy_curr = (
                    pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                    + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                )

                # YY-MM-DD -> DD-MM-YY
                ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                ymd_to_dmy_graph = (
                    ymd_to_dmy_curr
                    if ymd_to_dmy_graph is None
                    else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                )

                # Swap leading month/day fields into day/month order, keeping the year.
                mdy_to_dmy_curr = (
                    pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                    + pynutil.delete("month: \"" + month + "\" day: \"" + day + "\" ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                ).optimize()
                # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                mdy_to_dmy_graph = (
                    mdy_to_dmy_curr
                    if mdy_to_dmy_graph is None
                    else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                ).optimize()

                # Month/day without a year -> day/month.
                md_to_dm_curr = pynutil.insert("day: \"" + day + "\" month: \"" + month + "\"") + pynutil.delete(
                    "month: \"" + month + "\" day: \"" + day + "\""
                )
                md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()
                md_to_dm_graph = (
                    md_to_dm_curr
                    if md_to_dm_graph is None
                    else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                ).optimize()

        final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
NEMO_CHAR, NEMO_DIGIT, NEMO_LOWER, NEMO_SIGMA, TO_LOWER, GraphFst, delete_extra_space, insert_space, ) from nemo_text_processing.text_normalization.ar.utils import get_abs_path, load_labels try: import pynini from pynini.lib import pynutil graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/numbers/teen.tsv"))).optimize() graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize() ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/numbers/ties.tsv"))).optimize() PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Add placeholders for global variables graph_teen = None graph_digit = None ties_graph = None PYNINI_AVAILABLE = True def get_ties_graph(deterministic: bool = True): """
def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
    """
    Build the measure tagger.

    The final graph is a union of: decimal/cardinal/fraction quantities
    followed by a unit, "number-word" and "word-number" dash compounds,
    "<decimal>x", street addresses, and simple "a <op> b = c" math
    expressions.

    Args:
        cardinal: cardinal tagger; ``graph`` (and, when non-deterministic,
            ``range_graph``) is used, and the object itself is passed to
            ``get_address_graph``.
        decimal: decimal tagger; ``final_graph_wo_negative`` is used.
        fraction: fraction tagger; ``graph`` is used.
        deterministic: when False, cardinal ranges are also accepted as
            quantities; also forwarded to the parent constructor.
    """
    super().__init__(name="measure", kind="classify", deterministic=deterministic)
    cardinal_graph = cardinal.graph

    if not deterministic:
        cardinal_graph |= cardinal.range_graph

    # Units from the table, plus variants whose leading character(s) pass through TO_LOWER first.
    graph_unit = pynini.string_file(get_abs_path("data/measurements.tsv"))
    graph_unit |= pynini.compose(
        pynini.closure(TO_LOWER, 1) + pynini.closure(NEMO_ALPHA), graph_unit)

    # The plural form must be derived before graph_unit is re-bound by convert_space.
    graph_unit_plural = convert_space(graph_unit @ SINGULAR_TO_PLURAL)
    graph_unit = convert_space(graph_unit)
    # Optional leading "-" becomes the field: negative: "true"
    optional_graph_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

    # "/<unit>" -> "per<NBSP><unit>" (the second unit is always singular).
    graph_unit2 = pynini.cross("/", "per") + delete_space + pynutil.insert(
        NEMO_NON_BREAKING_SPACE) + graph_unit

    optional_graph_unit2 = pynini.closure(
        delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2,
        0,
        1,
    )

    # "+" binds tighter than "|": (unit + optional "per unit") or a bare "per unit".
    unit_plural = (
        pynutil.insert("units: \"") +
        (graph_unit_plural + optional_graph_unit2 | graph_unit2) +
        pynutil.insert("\""))

    unit_singular = (pynutil.insert("units: \"") +
                     (graph_unit + optional_graph_unit2 | graph_unit2) +
                     pynutil.insert("\""))

    # Decimal quantities always take the plural unit.
    subgraph_decimal = (pynutil.insert("decimal { ") +
                        optional_graph_negative +
                        decimal.final_graph_wo_negative + delete_space +
                        pynutil.insert(" } ") + unit_plural)

    # Any cardinal except the literal "1" takes the plural unit ...
    subgraph_cardinal = (pynutil.insert("cardinal { ") +
                         optional_graph_negative +
                         pynutil.insert("integer: \"") +
                         ((NEMO_SIGMA - "1") @ cardinal_graph) + delete_space +
                         pynutil.insert("\"") + pynutil.insert(" } ") +
                         unit_plural)

    # ... while "1" takes the singular unit.
    subgraph_cardinal |= (pynutil.insert("cardinal { ") +
                          optional_graph_negative +
                          pynutil.insert("integer: \"") +
                          pynini.cross("1", "one") + delete_space +
                          pynutil.insert("\"") + pynutil.insert(" } ") +
                          unit_singular)

    # "<number>-<letters>": the dash is accepted (kept) inside the integer field.
    cardinal_dash_alpha = (pynutil.insert("cardinal { integer: \"") +
                           cardinal_graph + pynini.accep('-') +
                           pynutil.insert("\" } units: \"") +
                           pynini.closure(NEMO_ALPHA, 1) +
                           pynutil.insert("\""))

    # "<letters>-<number>": the dash is kept inside the units field; field order is preserved.
    alpha_dash_cardinal = (pynutil.insert("units: \"") +
                           pynini.closure(NEMO_ALPHA, 1) + pynini.accep('-') +
                           pynutil.insert("\"") +
                           pynutil.insert(" cardinal { integer: \"") +
                           cardinal_graph +
                           pynutil.insert("\" } preserve_order: true"))

    # "<decimal>-<letters>": here the dash is removed, unlike cardinal_dash_alpha.
    decimal_dash_alpha = (pynutil.insert("decimal { ") +
                          decimal.final_graph_wo_negative +
                          pynini.cross('-', '') +
                          pynutil.insert(" } units: \"") +
                          pynini.closure(NEMO_ALPHA, 1) +
                          pynutil.insert("\""))

    # "<decimal>x" / "<decimal>X": the unit is normalized to lower-case "x".
    decimal_times = (pynutil.insert("decimal { ") +
                     decimal.final_graph_wo_negative +
                     pynutil.insert(" } units: \"") +
                     pynini.cross(pynini.union('x', "X"), 'x') +
                     pynutil.insert("\""))

    alpha_dash_decimal = (pynutil.insert("units: \"") +
                          pynini.closure(NEMO_ALPHA, 1) + pynini.accep('-') +
                          pynutil.insert("\"") +
                          pynutil.insert(" decimal { ") +
                          decimal.final_graph_wo_negative +
                          pynutil.insert(" } preserve_order: true"))

    subgraph_fraction = (pynutil.insert("fraction { ") + fraction.graph +
                         delete_space + pynutil.insert(" } ") + unit_plural)

    # Addresses are emitted as a cardinal with the fixed unit "address".
    address = self.get_address_graph(cardinal)
    address = (
        pynutil.insert("units: \"address\" cardinal { integer: \"") + address +
        pynutil.insert("\" } preserve_order: true"))

    math_operations = pynini.string_file(
        get_abs_path("data/math_operations.tsv"))
    # A space is either already present or inserted between the parts.
    delimiter = pynini.accep(" ") | pynutil.insert(" ")

    # "<number> <op> <number> = <number>", emitted with the fixed unit "math".
    # NOTE(review): the local name `math` would shadow the stdlib module if it
    # were imported in this file -- confirm it is not.
    math = (cardinal_graph + delimiter + math_operations + delimiter +
            cardinal_graph + delimiter + pynini.cross("=", "equals") +
            delimiter + cardinal_graph)
    math = (pynutil.insert("units: \"math\" cardinal { integer: \"") + math +
            pynutil.insert("\" } preserve_order: true"))
    final_graph = (subgraph_decimal | subgraph_cardinal | cardinal_dash_alpha
                   | alpha_dash_cardinal | decimal_dash_alpha | decimal_times
                   | alpha_dash_decimal | subgraph_fraction | address | math)
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the cardinal tagger.

    Sets the following attributes used by other grammars:
        graph: reads a digit string (commas between 3-digit groups allowed)
            through the number-name FST loaded from the FAR file.
        graph_hundred_component_at_least_one_none_zero_digit: same FST,
            restricted to 2-3 digits or one non-zero digit.
        single_digits_graph: reads a digit string digit by digit.
        range_graph: "a-b" / "a x b" ranges (set only when non-deterministic).
        fst: the final tagger with an optional ``negative`` field.

    Args:
        deterministic: selects the deterministic or the extended
            (non-deterministic) set of alternatives; also forwarded to the
            parent constructor.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    # TODO replace to have "oh" as a default for "0"
    graph = pynini.Far(
        get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3)
        | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
    # 1-3 digits followed by any number of 3-digit groups; a comma before a group is deleted.
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
        pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT +
        NEMO_DIGIT)) @ graph

    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

    # One digit at a time, with a space inserted between consecutive digits.
    single_digits_graph = pynini.invert(graph_digit | graph_zero)
    self.single_digits_graph = single_digits_graph + pynini.closure(
        insert_space + single_digits_graph)

    if not deterministic:
        # for a single token allow only the same normalization
        # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
        single_digits_graph_zero = pynini.invert(graph_digit | graph_zero)
        single_digits_graph_oh = pynini.invert(graph_digit) | pynini.cross(
            "0", "oh")

        self.single_digits_graph = single_digits_graph_zero + pynini.closure(
            insert_space + single_digits_graph_zero)
        self.single_digits_graph |= single_digits_graph_oh + pynini.closure(
            insert_space + single_digits_graph_oh)

        # Digit-by-digit reading of comma-grouped numbers; the commas are deleted.
        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + insert_space, 1, 3) + pynini.closure(
                pynutil.delete(",") + single_digits_graph + insert_space +
                single_digits_graph + insert_space + single_digits_graph,
                1,
            )

        # "a-b" -> "from a to b"; "axb" / "a x b" -> "a by b".
        self.range_graph = pynutil.insert(
            "from ") + self.graph + pynini.cross("-", " to ") + self.graph
        self.range_graph |= self.graph + (pynini.cross(
            "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
        self.range_graph |= (pynutil.insert("from ") + get_hundreds_graph() +
                             pynini.cross("-", " to ") + get_hundreds_graph())
        self.range_graph = self.range_graph.optimize()

    serial_graph = self.get_serial_graph()
    # Optional leading "-" becomes the field: negative: "true"
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

    if deterministic:
        # Five or more digits: the digit-by-digit reading gets a small negative weight.
        long_numbers = pynini.compose(NEMO_DIGIT**(5, ...),
                                      self.single_digits_graph).optimize()
        final_graph = self.graph | serial_graph | pynutil.add_weight(
            long_numbers, -0.001)
        # Anything starting with "0" is read digit by digit.
        cardinal_with_leading_zeros = pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT),
            self.single_digits_graph)
        final_graph |= cardinal_with_leading_zeros
    else:
        # Leading zeros are read digit by digit, the remainder as a regular number.
        leading_zeros = pynini.compose(
            pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
        cardinal_with_leading_zeros = (
            leading_zeros + pynutil.insert(" ") +
            pynini.compose(pynini.closure(NEMO_DIGIT), self.graph))

        final_graph = (self.graph
                       | serial_graph
                       | self.range_graph
                       | self.single_digits_graph
                       | get_hundreds_graph()
                       | pynutil.add_weight(
                           single_digits_graph_with_commas, 0.001)
                       | cardinal_with_leading_zeros)

    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + final_graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """
    Build the money tagger.

    Two families of output are produced: the default layout (currency field
    first, then the amount) and, per currency symbol, a re-ordered layout
    (amount, major currency, fractional part, minor currency) that ends with
    " preserve_order: true".

    Args:
        cardinal: cardinal tagger; ``graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit`` are used.
        decimal: decimal tagger; ``final_graph_wo_negative`` is used.
        deterministic: when False, extra re-ordered alternatives are added;
            also forwarded to the parent constructor.
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)
    cardinal_graph = cardinal.graph
    graph_decimal_final = decimal.final_graph_wo_negative

    # The label rows provide the currency symbols iterated over below;
    # maj_singular / min_singular / min_plural are module-level graphs.
    maj_singular_labels = load_labels(get_abs_path("data/currency/currency.tsv"))
    maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
    maj_unit_singular = convert_space(maj_singular)

    graph_maj_singular = pynutil.insert("currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
    graph_maj_plural = pynutil.insert("currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")

    # Optional ".0", ".00", ... after an integer amount is dropped.
    optional_delete_fractional_zeros = pynini.closure(
        pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1
    )

    graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross("1", "one") + pynutil.insert("\"")
    # only for decimals where third decimal after comma is non-zero or with quantity
    decimal_delete_last_zeros = (
        pynini.closure(NEMO_DIGIT | pynutil.delete(","))
        + pynini.accep(".")
        + pynini.closure(NEMO_DIGIT, 2)
        + (NEMO_DIGIT - "0")
        + pynini.closure(pynutil.delete("0"))
    )
    # Any string that ends in a letter.
    decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA
    # "@" binds tighter than "+": the filter is composed with the decimal graph first.
    graph_decimal = (
        graph_maj_plural + insert_space + (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final
    )

    # Any integer except the literal "1" (which takes the singular currency below).
    graph_integer = (
        pynutil.insert("integer_part: \"") + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert("\"")
    )

    graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
    graph_integer_only |= graph_maj_plural + insert_space + graph_integer

    final_graph = (graph_integer_only + optional_delete_fractional_zeros) | graph_decimal

    # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
    # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
    # not accepted: 002, 00, 0,
    two_digits_fractional_part = (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0"))
    ) @ (
        (pynutil.delete("0") + (NEMO_DIGIT - "0"))
        | ((NEMO_DIGIT - "0") + pynutil.insert("0"))
        | ((NEMO_DIGIT - "0") + NEMO_DIGIT)
    )

    graph_min_singular = pynutil.insert(" currency_min: \"") + min_singular + pynutil.insert("\"")
    graph_min_plural = pynutil.insert(" currency_min: \"") + min_plural + pynutil.insert("\"")
    # format ** dollars ** cent
    # Accumulators for the per-currency unions built in the loop below.
    decimal_graph_with_minor = None
    integer_graph_reordered = None
    decimal_default_reordered = None
    for curr_symbol, _ in maj_singular_labels:
        preserve_order = pynutil.insert(" preserve_order: true")
        # The symbol is re-inserted and mapped through the currency graph, after the integer part.
        integer_plus_maj = graph_integer + insert_space + pynutil.insert(curr_symbol) @ graph_maj_plural
        integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular

        # "-" and "+" have equal precedence: this is (NEMO_DIGIT - "0") + closure(...),
        # i.e. a non-zero first digit followed by digits with commas deleted.
        integer_plus_maj_with_comma = pynini.compose(
            NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), integer_plus_maj
        )
        # Plain digit strings, excluding the single string "0".
        integer_plus_maj = pynini.compose(pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
        integer_plus_maj |= integer_plus_maj_with_comma

        # Fractional part equal to "1" takes the singular minor unit; everything else the plural.
        graph_fractional_one = two_digits_fractional_part @ pynini.cross("1", "one")
        graph_fractional_one = pynutil.insert("fractional_part: \"") + graph_fractional_one + pynutil.insert("\"")
        graph_fractional = (
            two_digits_fractional_part
            @ (pynini.closure(NEMO_DIGIT, 1, 2) - "1")
            @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
        )
        graph_fractional = pynutil.insert("fractional_part: \"") + graph_fractional + pynutil.insert("\"")

        fractional_plus_min = graph_fractional + insert_space + pynutil.insert(curr_symbol) @ graph_min_plural
        fractional_plus_min |= (
            graph_fractional_one + insert_space + pynutil.insert(curr_symbol) @ graph_min_singular
        )

        decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(".", " ") + fractional_plus_min

        if not deterministic:
            # Low-priority alternative without the minor currency unit.
            decimal_graph_with_minor_curr |= pynutil.add_weight(
                integer_plus_maj
                + pynini.cross(".", " ")
                + pynutil.insert("fractional_part: \"")
                + two_digits_fractional_part @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
                + pynutil.insert("\""),
                weight=0.0001,
            )
            default_fraction_graph = (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final
        # Amounts with no integer part: optional "0", then "." and the fractional part.
        decimal_graph_with_minor_curr |= (
            pynini.closure(pynutil.delete("0"), 0, 1) + pynutil.delete(".") + fractional_plus_min
        )
        # The leading currency symbol of the input is consumed here.
        decimal_graph_with_minor_curr = (
            pynutil.delete(curr_symbol) + decimal_graph_with_minor_curr + preserve_order
        )

        decimal_graph_with_minor = (
            decimal_graph_with_minor_curr
            if decimal_graph_with_minor is None
            else pynini.union(decimal_graph_with_minor, decimal_graph_with_minor_curr).optimize()
        )

        if not deterministic:
            integer_graph_reordered_curr = (
                pynutil.delete(curr_symbol) + integer_plus_maj + preserve_order
            ).optimize()

            integer_graph_reordered = (
                integer_graph_reordered_curr
                if integer_graph_reordered is None
                else pynini.union(integer_graph_reordered, integer_graph_reordered_curr).optimize()
            )
            decimal_default_reordered_curr = (
                pynutil.delete(curr_symbol)
                + default_fraction_graph
                + insert_space
                + pynutil.insert(curr_symbol) @ graph_maj_plural
            )

            decimal_default_reordered = (
                decimal_default_reordered_curr
                if decimal_default_reordered is None
                else pynini.union(decimal_default_reordered, decimal_default_reordered_curr)
            ).optimize()

    # weight for SH
    final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.001)

    if not deterministic:
        final_graph |= integer_graph_reordered | decimal_default_reordered
        # to handle "$2.00" cases
        final_graph |= pynini.compose(
            NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), integer_graph_reordered
        )
    final_graph = self.add_tokens(final_graph.optimize())
    self.fst = final_graph.optimize()
from nemo_text_processing.text_normalization.ar.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, NEMO_SIGMA, SINGULAR_TO_PLURAL, GraphFst, convert_space, insert_space, ) from nemo_text_processing.text_normalization.ar.utils import get_abs_path, load_labels try: import pynini from pynini.lib import pynutil min_singular = pynini.string_file(get_abs_path("data/currency/currency_minor_singular.tsv")) min_plural = pynini.string_file(get_abs_path("data/currency/currency_minor_plural.tsv")) maj_singular = pynini.string_file((get_abs_path("data/currency/currency.tsv"))) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): PYNINI_AVAILABLE = False class MoneyFst(GraphFst): """ Finite state transducer for classifying money, suppletive aware, e.g. $12.05 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true } $12.0500 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true } $1 -> money { currency_maj: "dollar" integer_part: "one" } $1.00 -> money { currency_maj: "dollar" integer_part: "one" }
# Character classes built on top of the ones defined earlier in this module.
NEMO_PUNCT = pynini.union(*[pynini.escape(mark) for mark in string.punctuation]).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
NEMO_SIGMA = pynini.closure(NEMO_CHAR)

# Whitespace helpers: drop any run of whitespace, insert one space,
# or collapse a non-empty run of whitespace into one space.
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
insert_space = pynutil.insert(" ")
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")

# Removes any number of ` preserve_order: true` / ` field_order: "<c>"` markers.
delete_preserve_order = pynini.closure(
    pynutil.delete(" preserve_order: true")
    | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
)

# Pluralization rules: irregular forms from file, then consonant + "y" -> "ies",
# then sibilant endings + "es", then the default "s".
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
_c = pynini.union(*"bcdfghjklmnpqrstvwxyz")
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
_s = NEMO_SIGMA + pynutil.insert("s")

# Nested priority unions, innermost first; presumably an earlier argument wins
# over a later one for the same input -- confirm against pynini.lib.plurals.
_es_or_s = plurals._priority_union(_es, _s, NEMO_SIGMA)
_regular_plural = plurals._priority_union(_ies, _es_or_s, NEMO_SIGMA)
graph_plural = plurals._priority_union(suppletive, _regular_plural, NEMO_SIGMA).optimize()

SINGULAR_TO_PLURAL = graph_plural