def __init__(self, deterministic: bool = True):
    """
    Build the ordinal verbalizer grammar.

    The grammar strips the ``integer: "..."`` wrapper and rewrites the end of
    the quoted content into its ordinal form using the ordinal data tables,
    a "ty" -> "tieth" rule, or a fallback appended "th".

    Args:
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

    # Both tables are inverted right after loading.
    digit_table = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")).invert()
    teen_table = pynini.string_file(get_abs_path("data/ordinals/teen.tsv")).invert()

    # integer: "<content>"  ->  <content>
    opening = pynutil.delete("integer:") + delete_space + pynutil.delete("\"")
    unwrapped = opening + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    # Small positive weights make the table-based rewrites win over these fallbacks.
    fallback_th = pynutil.insert("th", weight=0.01)
    ty_to_tieth = pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
    ending_rule = digit_table | teen_table | ty_to_tieth | fallback_th

    # Apply the rewrite only immediately before the end of the string.
    suffix = pynini.cdrewrite(ending_rule, "", "[EOS]", NEMO_SIGMA).optimize()
    self.suffix = suffix

    self.fst = self.delete_tokens(unwrapped @ suffix).optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the cardinal tagger grammar.

    Exposes ``graph``, ``graph_hundred_component_at_least_one_none_zero_digit``,
    ``single_digits_graph`` and ``range_graph`` and stores the final tagger in
    ``fst``.

    Args:
        deterministic: when False, additional readings (digit by digit,
            hundreds, ranges) are unioned into the grammar
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    number_names = pynini.Far(get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

    # Two or three digits, or exactly one non-zero digit.
    non_zero_digit = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3) | non_zero_digit
    ) @ number_names

    # 1-3 leading digits followed by groups of three, each optionally preceded by a comma.
    comma_group = pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(comma_group)) @ number_names

    digit_table = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    zero_table = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    one_digit = pynutil.add_weight(pynini.invert(digit_table | zero_table), 1.2) | pynutil.add_weight(
        pynini.cross("0", "oh"), 1.1
    )
    self.single_digits_graph = one_digit + pynini.closure(pynutil.insert(" ") + one_digit)

    if not deterministic:
        three_spaced_digits = (
            pynutil.delete(",") + one_digit + pynutil.insert(" ") + one_digit + pynutil.insert(" ") + one_digit
        )
        digits_with_commas = pynini.closure(
            self.single_digits_graph + pynutil.insert(" "), 1, 3
        ) + pynini.closure(three_spaced_digits, 1)
        self.graph |= self.single_digits_graph | get_hundreds_graph() | digits_with_commas

    # NOTE(review): the flattened source does not show whether the range graph
    # was built only in non-deterministic mode; it is built unconditionally here
    # so the attribute always exists - confirm against the upstream file.
    dash_reading = pynini.cross("-", " to ") | pynini.cross("-", " ")
    by_reading = pynini.cross("x", " by ") | pynini.cross(" x ", " by ")
    self.range_graph = pynini.closure(pynutil.insert("from "), 0, 1) + self.graph + dash_reading + self.graph
    self.range_graph |= self.graph + by_reading + self.graph
    self.range_graph = self.range_graph.optimize()

    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    tagged = self.graph | pynutil.add_weight(self.get_serial_graph(), 1.2)
    if not deterministic:
        tagged |= self.range_graph

    tagged = optional_minus_graph + pynutil.insert("integer: \"") + tagged + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the electronic (e-mail / URL) verbalizer grammar.

    Consumes optional ``protocol: "..."`` and ``username: "..."`` fields and a
    mandatory ``domain: "..."`` field, spelling out digits and known symbols
    and inserting "at " after the user name.

    Args:
        deterministic: when False, "0" may also be verbalized as "o" or "oh"
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    graph_digit_no_zero = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize()
    graph_zero = pynini.cross("0", "zero")
    if not deterministic:
        graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")
    graph_digit = graph_digit_no_zero | graph_zero

    graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize()

    # The field prefix was corrupted in the source ("username:"******"\"");
    # restored to mirror the "domain:" prefix handling below.
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + (
            pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                # Any other character is passed through at a slightly higher cost.
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1)
            )
        )
        + pynutil.delete("\"")
    )

    server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
    domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    # Negative weights prefer the known server/domain entries over single characters.
    convert_defaults = (
        NEMO_NOT_QUOTE
        | pynutil.add_weight(domain_common, -0.1)
        | pynutil.add_weight(server_common, -0.1)
    )
    domain = convert_defaults + pynini.closure(pynutil.insert(" ") + convert_defaults)
    # Second pass: verbalize symbols and digits left in the spaced-out domain.
    domain = pynini.compose(
        domain,
        pynini.closure(
            pynutil.add_weight(graph_symbols, -0.1)
            | pynutil.add_weight(graph_digit, -0.1)
            | NEMO_NOT_QUOTE
        ),
    )
    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + domain
        + delete_space
        + pynutil.delete("\"")
    )

    protocol = pynutil.delete("protocol: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    graph = (
        pynini.closure(protocol + delete_space, 0, 1)
        + pynini.closure(user_name + delete_space + pynutil.insert("at ") + delete_space, 0, 1)
        + domain
        + delete_space
    )
    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def get_names():
    """
    Returns an acceptor matching the names listed in the male and female name
    tables, each both as written in the table and fully upper-cased.
    """

    def _with_upper_variants(relative_path):
        # Every row contributes itself plus a one-column row holding its upper-cased first column.
        rows = load_labels(get_abs_path(relative_path))
        return rows + [[row[0].upper()] for row in rows]

    male_rows = _with_upper_variants("data/roman/male.tsv")
    female_rows = _with_upper_variants("data/roman/female.tsv")

    male_graph = pynini.string_map(male_rows).optimize()
    female_graph = pynini.string_map(female_rows).optimize()
    return male_graph | female_graph
def get_address_graph(self, cardinal):
    """
    Finite state transducer for a street address: a house number read digit
    by digit, an optional compass direction, street words ending in a known
    address word, then optional city, state and five-digit zip code, e.g.:
        2788 San Tomas Expy, Santa Clara, CA 95051 ->
        two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one

    Only the bare graph is returned; any surrounding tags are left to the caller.

    Args:
        cardinal: cardinal tagger; it is passed to the ordinal tagger and its
            ``single_digits_graph`` reads the house number and zip code
    """

    def _optional_after_comma(part):
        # Optional ", <part>" with the comma removed and the space kept.
        return pynini.closure(pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + part, 0, 1)

    # Ordinal street numbers: tag with the ordinal tagger, then verbalize right away.
    verbalize_ordinal = OrdinalVerbalizer().graph
    tag_ordinal = OrdinalTagger(cardinal=cardinal).graph
    spoken_ordinal = pynini.compose(
        pynutil.insert("integer: \"") + tag_ordinal + pynutil.insert("\""), verbalize_ordinal
    )

    house_number = pynini.closure(NEMO_DIGIT, 1) @ cardinal.single_digits_graph

    compass = (
        pynini.cross("E", "East")
        | pynini.cross("S", "South")
        | pynini.cross("W", "West")
        | pynini.cross("N", "North")
    )
    # The negative weight favours expanding a direction letter when one is present.
    optional_direction = pynini.closure(
        pynutil.add_weight(pynini.accep(NEMO_SPACE) + compass, -1), 0, 1
    )

    street_type = pynini.string_file(get_abs_path("data/address/address_words.tsv"))
    street = (
        pynini.accep(NEMO_SPACE)
        + pynini.closure(spoken_ordinal, 0, 1)
        + pynini.closure(NEMO_ALPHA | NEMO_SPACE, 1)
        + street_type
    )

    optional_city = _optional_after_comma(pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1))
    optional_state = _optional_after_comma(
        pynini.invert(pynini.string_file(get_abs_path("data/address/states.tsv")))
    )

    five_digits = pynini.compose(NEMO_DIGIT ** 5, cardinal.single_digits_graph)
    # The strongly negative weight makes a trailing zip code win over other parses.
    optional_zip = pynini.closure(
        pynutil.add_weight(
            pynini.closure(pynini.cross(",", ""), 0, 1) + pynini.accep(NEMO_SPACE) + five_digits, -100
        ),
        0,
        1,
    )

    optional_period = pynini.closure(pynini.cross(".", ""), 0, 1)
    return (
        house_number
        + optional_direction
        + street
        + optional_period
        + optional_city
        + optional_state
        + optional_zip
    )
def __init__(self, deterministic: bool = True):
    """
    Build the electronic (e-mail / URL) verbalizer grammar.

    Consumes optional ``protocol: "..."`` and ``username: "..."`` fields and a
    mandatory ``domain: "..."`` field. Digits and known symbols are spelled
    out with surrounding spaces, " at " is inserted after the user name, and
    a final rewrite applies ``delete_extra_space`` to the result.

    Args:
        deterministic: when False, "0" may also be verbalized as "o" or "oh"
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    graph_digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
    graph_zero = pynini.cross("0", "zero")
    if not deterministic:
        graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")
    graph_digit = graph_digit_no_zero | graph_zero

    graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()

    # Rewrite every symbol or digit, anywhere in the string, as its spoken form padded with spaces.
    default_chars_symbols = pynini.cdrewrite(
        pynutil.insert(" ") + (graph_symbols | graph_digit) + pynutil.insert(" "), "", "", NEMO_SIGMA
    )

    # The field prefix was corrupted in the source ("username:"******"\"");
    # restored to mirror the "domain:" prefix handling below.
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + default_chars_symbols
        + pynutil.delete("\"")
    )

    domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    domain = (
        default_chars_symbols
        + insert_space
        # Prefer a known domain entry; otherwise read "." as "dot".
        + plurals._priority_union(
            domain_common, pynutil.add_weight(pynini.cross(".", "dot"), weight=0.0001), NEMO_SIGMA
        )
        # Optional remainder: passed through TO_UPPER before symbols/digits are spelled out.
        + pynini.closure(
            insert_space + (pynini.cdrewrite(TO_UPPER, "", "", NEMO_SIGMA) @ default_chars_symbols), 0, 1
        )
    )
    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + domain
        + delete_space
        + pynutil.delete("\"")
    ).optimize()

    protocol = pynutil.delete("protocol: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    graph = (
        pynini.closure(protocol + delete_space, 0, 1)
        + pynini.closure(user_name + delete_space + pynutil.insert(" at ") + delete_space, 0, 1)
        + domain
        + delete_space
    ).optimize() @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA)

    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def _get_whitelist_non_deterministic_graph(file="data/whitelist_alternatives.tsv"):
    """
    Build a string map from the alternatives whitelist.

    Every (written, spoken) row is included twice: once with both sides
    lower-cased and once exactly as it appears in the file.

    Args:
        file: path of the TSV file, resolved through ``get_abs_path``
    """
    rows = load_labels(get_abs_path(file))
    pairs = [(written.lower(), spoken.lower()) for written, spoken in rows]
    pairs.extend((written, spoken) for written, spoken in rows)
    return pynini.string_map(pairs)
def __init__(self, deterministic: bool = True):
    """
    Build the punctuation tagger grammar.

    Accepts either a run of one or more punctuation marks or an angle-bracket
    tag such as ``<tag>``, ``<tag/>`` or ``</tag>``, and wraps the match in
    ``name: "..."``. The marks in use are kept in ``self.punct_marks``.

    Args:
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="punctuation", kind="classify", deterministic=deterministic)

    ascii_marks = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""
    excluded_marks = ["[", "]"]

    # Every code point whose Unicode category starts with "P", minus the excluded brackets.
    unicode_marks = [
        ch
        for ch in map(chr, range(sys.maxunicode))
        if category(ch).startswith("P") and ch not in excluded_marks
    ]

    # Symbols handled by the whitelist are not treated as punctuation.
    whitelisted = [row[0] for row in load_labels(get_abs_path("data/whitelist/symbol.tsv"))]
    self.punct_marks = [mark for mark in unicode_marks + list(ascii_marks) if mark not in whitelisted]

    punct_run = pynini.closure(pynini.union(*self.punct_marks), 1)

    tag_name = pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
    slash = pynini.accep("/")
    tag_inner = (tag_name + pynini.closure(slash, 0, 1)) | (slash + tag_name)
    emphasis = pynini.accep("<") + tag_inner + pynini.accep(">")

    # The tag reading takes priority over the plain punctuation run.
    self.graph = plurals._priority_union(emphasis, punct_run, NEMO_SIGMA)
    self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
def get_serial_graph(self):
    """
    Finite state transducer for classifying serial numbers: mixed sequences
    of letters and numbers, optionally separated by "-" or "/", e.g.:
        c325-b -> tokens { cardinal { integer: "c three two five b" } }

    Returns:
        the serial graph with a weight of 10 added
    """
    # Deterministic mode reads numbers digit by digit; otherwise the full cardinal graph is used.
    numbers = self.single_digits_graph if self.deterministic else self.graph

    pronounced_letters = pynini.string_map(load_labels(get_abs_path("data/letter_pronunciation.tsv")))
    letters = NEMO_ALPHA | pronounced_letters

    # Parts are separated by an inserted space, or by "-" or "/" turned into a space.
    separator = insert_space | pynini.cross("-", " ") | pynini.cross("/", " ")

    numbers_then_separator = pynini.closure(numbers + separator, 1)
    letters_first = pynini.closure(letters + separator, 1) + numbers
    numbers_then_letter = numbers_then_separator + letters
    numbers_then_number = numbers_then_separator + numbers
    tail = pynini.closure(separator + (letters | numbers))

    serial = (letters_first | numbers_then_letter | numbers_then_number) + tail

    if not self.deterministic:
        # Optional trailing "s", kept as is or read as "es".
        serial += pynini.closure(pynini.accep("s") | pynini.cross("s", "es"), 0, 1)

    serial.optimize()
    return pynutil.add_weight(serial, 10)
def __init__(self, deterministic: bool = True):
    """
    Build the electronic (e-mail) verbalizer grammar.

    Consumes an optional ``username: "..."`` field followed by a
    ``domain: "..."`` field made of a server part and a dotted domain part,
    inserting "at " after the user name.

    Args:
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    graph_digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize()
    graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize()

    # The field prefix was corrupted in the source ("username:"******"\"");
    # restored to mirror the "domain:" prefix handling below.
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + (
            pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                # Any other character is passed through at a slightly higher cost.
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1)
            )
        )
        + pynutil.delete("\"")
    )

    # Fallback domain: characters spaced out around a "." read as "dot ".
    domain_default = (
        pynini.closure(NEMO_NOT_QUOTE + insert_space)
        + pynini.cross(".", "dot ")
        + NEMO_NOT_QUOTE
        + pynini.closure(insert_space + NEMO_NOT_QUOTE)
    )

    # Fallback server: spaced letters/digits, optional symbols, then more letters/digits.
    server_default = (
        pynini.closure((graph_digit | NEMO_ALPHA) + insert_space, 1)
        + pynini.closure(graph_symbols + insert_space)
        + pynini.closure((graph_digit | NEMO_ALPHA) + insert_space, 1)
    )
    server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")) + insert_space
    domain_common = pynini.cross(".", "dot ") + pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    # Known server/domain entries carry the lower weight and are preferred over the fallbacks.
    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + (pynutil.add_weight(server_common, 1.09) | pynutil.add_weight(server_default, 1.1))
        + (pynutil.add_weight(domain_common, 1.09) | pynutil.add_weight(domain_default, 1.1))
        + delete_space
        + pynutil.delete("\"")
    )

    graph = (
        pynini.closure(user_name + delete_space + pynutil.insert("at ") + delete_space, 0, 1)
        + domain
        + delete_space
    )
    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
    """
    Build a string map from the whitelist file.

    Args:
        input_case: when equal to "lower_cased" the written side of every
            pair is lower-cased; any other value keeps it unchanged
        file: path of the TSV file, resolved through ``get_abs_path``
    """
    lower_written = input_case == "lower_cased"
    pairs = [
        (written.lower() if lower_written else written, spoken)
        for written, spoken in load_labels(get_abs_path(file))
    ]
    return pynini.string_map(pairs)
def __init__(self, deterministic: bool = True):
    """
    Build the telephone tagger grammar.

    Tags an optional country code, an area code plus a seven-character
    number (digits or letters) and an optional extension. Dotted groups of
    one to three digits (IP-address shape) are also tagged as ``number_part``.

    Args:
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    component_separator = pynutil.insert(", ")  # between components
    digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize() | pynini.cross("0", "o")

    # Country code: optional "+", then one to three spoken digits.
    country_code = (
        pynutil.insert("country_code: \"")
        + pynini.closure(pynutil.delete("+"), 0, 1)
        + pynini.closure(digit + insert_space, 0, 2)
        + digit
        + pynutil.insert("\"")
    )
    optional_country_code = pynini.closure(
        country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space, 0, 1
    )

    # Area code: three spoken digits, with "800" preferably read as "eight hundred".
    toll_free = pynutil.add_weight(pynini.cross("800", "eight hundred"), -1.1)
    three_digits = pynini.closure(digit + insert_space, 2, 2) + digit
    area_code = three_digits | toll_free
    area_code = (
        (area_code + pynutil.delete("-"))
        | (pynutil.delete("(") + area_code + (pynutil.delete(") ") | pynutil.delete(")-")))
    ) + component_separator

    # Exactly seven digits/letters, each optionally followed by "-" or " ".
    optional_separator = pynini.closure(pynini.union("-", " "), 0, 1)
    seven_symbols = ((NEMO_DIGIT + optional_separator) | (NEMO_ALPHA + optional_separator)) ** 7
    spoken_digit = (NEMO_DIGIT @ digit) + (insert_space | pynini.cross("-", ", "))
    letter_with_dash = NEMO_ALPHA + pynini.cross("-", " ")
    spoken_symbols = pynini.closure(spoken_digit | NEMO_ALPHA | letter_with_dash)
    local_number = pynini.compose(seven_symbols, spoken_symbols)

    number_part = pynutil.insert("number_part: \"") + (area_code + local_number) + pynutil.insert("\"")

    extension = (
        pynutil.insert("extension: \"")
        + pynini.closure(digit + insert_space, 0, 3)
        + digit
        + pynutil.insert("\"")
    )
    optional_extension = pynini.closure(insert_space + extension, 0, 1)

    graph = optional_country_code + number_part + optional_extension

    # ip
    octet = pynini.compose(
        NEMO_DIGIT ** (1, 3), digit + pynini.closure(pynutil.insert(" ") + digit)
    ).optimize()
    ip_graph = octet + (pynini.cross(".", " dot ") + octet) ** 3
    graph |= pynutil.insert("number_part: \"") + ip_graph.optimize() + pynutil.insert("\"")

    self.fst = self.add_tokens(graph).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """
    Build the decimal tagger grammar.

    Tags an optional sign, an optional integer part, the point and a
    fractional part read digit by digit; quantities are added through
    ``get_quantity``. ``self.graph`` holds the fractional-part reader and
    ``self.final_graph_wo_negative`` the unsigned decimal.

    Args:
        cardinal: cardinal tagger providing ``graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit``
        deterministic: when False, the fractional part may also be read as a
            cardinal number
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)

    whole_number = cardinal.graph
    hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    # Built in the direction of the data files, then inverted below.
    digit_or_zero = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    digit_or_zero |= pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    digit_or_oh = digit_or_zero | pynini.cross("o", "0")
    digit_sequence = (
        pynini.cross("zero", "0")
        | digit_or_zero
        | digit_or_oh + pynini.closure(delete_space + digit_or_oh, 1)
    )
    self.graph = pynini.invert(digit_sequence).optimize()
    if not deterministic:
        self.graph = self.graph | whole_number

    optional_sign = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
    fractional_field = pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"")
    integer_field = pynutil.insert("integer_part: \"") + whole_number + pynutil.insert("\"")

    unsigned_decimal = (
        pynini.closure(integer_field + pynutil.insert(" "), 0, 1)
        + pynutil.delete(".")
        + pynutil.insert(" ")
        + fractional_field
    )
    self.final_graph_wo_negative = unsigned_decimal | get_quantity(unsigned_decimal, hundred_component)

    tagged = self.add_tokens(optional_sign + self.final_graph_wo_negative)
    self.fst = tagged.optimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """
    Build the whitelist tagger grammar.

    Args:
        input_case: "lower_cased" lower-cases the written side of every
            whitelist entry; any other value keeps it unchanged
        deterministic: when False, the lower-cased default whitelist and the
            alternatives whitelist are added, and a user-provided whitelist
            extends the grammar instead of replacing it
        input_file: optional path to a user whitelist, used as given
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _graph_from_file(case, path):
        # ``path`` is used as is; callers resolve it when needed.
        lower_written = case == "lower_cased"
        return pynini.string_map(
            [(written.lower() if lower_written else written, spoken) for written, spoken in load_labels(path)]
        )

    def _alternatives_graph(file="data/whitelist_alternatives.tsv"):
        # Each row is included lower-cased on both sides and in its original casing.
        rows = load_labels(get_abs_path(file))
        lowered = [(written.lower(), spoken.lower()) for written, spoken in rows]
        as_written = [(written, spoken) for written, spoken in rows]
        return pynini.string_map(lowered + as_written)

    graph = _graph_from_file(input_case, get_abs_path("data/whitelist.tsv"))
    if not deterministic:
        graph |= _graph_from_file("lower_cased", get_abs_path("data/whitelist.tsv")) | _alternatives_graph()

    if input_file:
        provided = _graph_from_file(input_case, input_file)
        # Deterministic mode uses only the provided whitelist.
        graph = provided if deterministic else graph | provided

    self.graph = convert_space(graph).optimize()
    self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """
    Build the money tagger grammar.

    Emits a ``currency`` field followed by either an ``integer_part`` or the
    fields of the decimal tagger. The amount "1" takes the singular currency
    name; every other integer takes the plural.

    Args:
        cardinal: cardinal tagger providing ``graph``
        decimal: decimal tagger providing ``final_graph_wo_negative``
        deterministic: when False, hundreds readings are also allowed for the
            integer part and decimals may take the singular currency name
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)

    whole_number = cardinal.graph
    decimal_fields = decimal.final_graph_wo_negative

    currency_names = pynini.string_file(get_abs_path("data/currency/currency.tsv"))
    plural_names = convert_space(currency_names @ SINGULAR_TO_PLURAL)
    singular_names = convert_space(currency_names)

    singular_currency = pynutil.insert("currency: \"") + singular_names + pynutil.insert("\"")
    plural_currency = pynutil.insert("currency: \"") + plural_names + pynutil.insert("\"")

    exactly_one = (
        singular_currency
        + pynutil.insert(" integer_part: \"")
        + pynini.cross("1", "one")
        + pynutil.insert("\"")
    )

    graph_decimal = plural_currency + insert_space + decimal_fields

    if deterministic:
        integer_reader = whole_number
    else:
        integer_reader = get_hundreds_graph(deterministic) | whole_number
        # NOTE(review): the flattened source does not show whether this
        # alternative was limited to non-deterministic mode; it is assumed to
        # be, since it gives decimals a second reading - confirm upstream.
        graph_decimal |= exactly_one + insert_space + decimal_fields

    # Every integer except the literal "1" takes the plural currency name.
    graph_integer = (
        plural_currency
        + pynutil.insert(" integer_part: \"")
        + ((NEMO_SIGMA - "1") @ integer_reader)
        + pynutil.insert("\"")
    )
    graph_integer |= exactly_one

    tagged = self.add_tokens(graph_integer | graph_decimal)
    self.fst = tagged.optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the telephone tagger grammar.

    Tags an optional "+"-prefixed country code, a 3-3-4 digit number whose
    area code is either followed by "-" or wrapped in parentheses, and an
    optional "-"-prefixed extension.

    Args:
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    component_separator = pynutil.insert(", ")  # between components
    digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize() | pynini.cross("0", "o")

    three_digits = pynini.closure(digit + insert_space, 2, 2) + digit
    four_digits = pynini.closure(digit + insert_space, 3, 3) + digit

    country_code = (
        pynutil.insert("country_code: \"")
        + pynutil.delete("+")
        + pynini.closure(digit + insert_space, 0, 2)
        + digit
        + pynutil.insert("\"")
    )
    optional_country_code = pynini.closure(
        country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space, 0, 1
    )

    area_with_dash = three_digits + pynutil.delete("-")
    area_in_parentheses = (
        pynutil.delete("(")
        + three_digits
        + pynutil.delete(")")
        + pynini.closure(pynutil.delete("-"), 0, 1)
        + delete_space
    )
    number_part = (
        (area_with_dash | area_in_parentheses)
        + component_separator
        + three_digits
        + pynutil.delete("-")
        + component_separator
        + four_digits
    )
    number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"")

    # NOTE(review): this tag has a space before the colon ("extension : "),
    # unlike the other field tags in this grammar; kept as is - confirm that
    # the matching verbalizer expects it.
    extension = (
        pynutil.insert("extension : \"")
        + pynini.closure(digit + insert_space, 0, 3)
        + digit
        + pynutil.insert("\"")
    )
    optional_extension = pynini.closure(insert_space + pynutil.delete("-") + extension, 0, 1)

    graph = optional_country_code + number_part + optional_extension
    self.fst = self.add_tokens(graph).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """
    Build the money tagger grammar with major and minor currency units.

    For every currency symbol in the major-currency table a per-symbol graph
    is built that deletes the symbol from the input and re-inserts it after
    the amount so it can be mapped to the currency name; these per-symbol
    graphs are unioned across the loop and added to the default graph.

    Args:
        cardinal: cardinal tagger providing ``graph_with_and`` and
            ``graph_hundred_component_at_least_one_none_zero_digit``
        decimal: decimal tagger providing ``final_graph_wo_negative_w_abbr``
        deterministic: when False, extra reordered readings are added

    NOTE(review): ``maj_singular``, ``min_singular`` and ``min_plural`` are
    not defined in this block - presumably module-level graphs; confirm.
    NOTE(review): indentation was reconstructed from a flattened source; the
    extent of the ``if not deterministic`` branches should be confirmed.
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)
    cardinal_graph = cardinal.graph_with_and
    graph_decimal_final = decimal.final_graph_wo_negative_w_abbr

    # Rows of the major-currency table; the loop below uses the first column as the symbol.
    maj_singular_labels = load_labels(
        get_abs_path("data/money/currency_major.tsv"))
    maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
    maj_unit_singular = convert_space(maj_singular)

    graph_maj_singular = pynutil.insert(
        "currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
    graph_maj_plural = pynutil.insert(
        "currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")

    # Optionally drops a fractional part made only of zeros (".0", ".00", ...).
    optional_delete_fractional_zeros = pynini.closure(
        pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1)

    graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross(
        "1", "one") + pynutil.insert("\"")
    # only for decimals where third decimal after comma is non-zero or with quantity
    decimal_delete_last_zeros = (
        pynini.closure(NEMO_DIGIT | pynutil.delete(",")) +
        pynini.accep(".") + pynini.closure(NEMO_DIGIT, 2) +
        (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0")))
    # Any input that ends in a letter.
    decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA

    graph_decimal = (graph_maj_plural + insert_space +
                     (decimal_delete_last_zeros | decimal_with_quantity)
                     @ graph_decimal_final)

    # Every integer except the literal "1", which is handled by graph_integer_one.
    graph_integer = (pynutil.insert("integer_part: \"") +
                     ((NEMO_SIGMA - "1") @ cardinal_graph) +
                     pynutil.insert("\""))

    graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
    graph_integer_only |= graph_maj_plural + insert_space + graph_integer

    final_graph = (graph_integer_only + optional_delete_fractional_zeros) | graph_decimal

    # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
    # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
    # not accepted: 002, 00, 0,
    two_digits_fractional_part = (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") +
        pynini.closure(pynutil.delete("0"))) @ (
            (pynutil.delete("0") + (NEMO_DIGIT - "0")) |
            ((NEMO_DIGIT - "0") + pynutil.insert("0")) |
            ((NEMO_DIGIT - "0") + NEMO_DIGIT))

    graph_min_singular = pynutil.insert(
        " currency_min: \"") + min_singular + pynutil.insert("\"")
    graph_min_plural = pynutil.insert(
        " currency_min: \"") + min_plural + pynutil.insert("\"")
    # format ** dollars ** cent
    # Accumulators: None until the first currency symbol has been processed.
    decimal_graph_with_minor = None
    integer_graph_reordered = None
    decimal_default_reordered = None
    for curr_symbol, _ in maj_singular_labels:
        preserve_order = pynutil.insert(" preserve_order: true")
        # "@" binds tighter than "+": the inserted symbol is mapped through the
        # currency graph before being concatenated after the amount.
        integer_plus_maj = graph_integer + insert_space + pynutil.insert(
            curr_symbol) @ graph_maj_plural
        integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(
            curr_symbol) @ graph_maj_singular

        # Input filters: a non-zero first digit followed by digits with commas
        # removed, or any run of digits other than the single string "0".
        integer_plus_maj_with_comma = pynini.compose(
            NEMO_DIGIT - "0" +
            pynini.closure(NEMO_DIGIT | pynutil.delete(",")),
            integer_plus_maj)
        integer_plus_maj = pynini.compose(
            pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
        integer_plus_maj |= integer_plus_maj_with_comma

        # Minor amount of exactly one unit takes the singular minor currency.
        graph_fractional_one = two_digits_fractional_part @ pynini.cross(
            "1", "one")
        graph_fractional_one = pynutil.insert(
            "fractional_part: \"") + graph_fractional_one + pynutil.insert(
                "\"")
        graph_fractional = (two_digits_fractional_part @ (
            pynini.closure(NEMO_DIGIT, 1, 2) - "1"
        ) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit)
        graph_fractional = pynutil.insert(
            "fractional_part: \"") + graph_fractional + pynutil.insert(
                "\"")

        fractional_plus_min = graph_fractional + insert_space + pynutil.insert(
            curr_symbol) @ graph_min_plural
        fractional_plus_min |= (
            graph_fractional_one + insert_space +
            pynutil.insert(curr_symbol) @ graph_min_singular)

        decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(
            ".", " ") + fractional_plus_min

        if not deterministic:
            # Extra reading without a minor currency name, at a small cost.
            decimal_graph_with_minor_curr |= pynutil.add_weight(
                integer_plus_maj + pynini.cross(".", " ") +
                pynutil.insert("fractional_part: \"") +
                two_digits_fractional_part @ cardinal.
                graph_hundred_component_at_least_one_none_zero_digit +
                pynutil.insert("\""),
                weight=0.0001,
            )
            default_fraction_graph = (
                decimal_delete_last_zeros
                | decimal_with_quantity) @ graph_decimal_final
        # Amounts with only a minor part: an optional leading "0" and the point are removed.
        decimal_graph_with_minor_curr |= (
            pynini.closure(pynutil.delete("0"), 0, 1) + pynutil.delete(".") +
            fractional_plus_min)
        decimal_graph_with_minor_curr = (pynutil.delete(curr_symbol) +
                                         decimal_graph_with_minor_curr +
                                         preserve_order)

        decimal_graph_with_minor = (
            decimal_graph_with_minor_curr
            if decimal_graph_with_minor is None else pynini.union(
                decimal_graph_with_minor,
                decimal_graph_with_minor_curr).optimize())

        if not deterministic:
            integer_graph_reordered_curr = (pynutil.delete(curr_symbol) +
                                            integer_plus_maj +
                                            preserve_order).optimize()

            integer_graph_reordered = (
                integer_graph_reordered_curr
                if integer_graph_reordered is None else pynini.union(
                    integer_graph_reordered,
                    integer_graph_reordered_curr).optimize())
            decimal_default_reordered_curr = (
                pynutil.delete(curr_symbol) + default_fraction_graph +
                insert_space +
                pynutil.insert(curr_symbol) @ graph_maj_plural)

            decimal_default_reordered = (
                decimal_default_reordered_curr
                if decimal_default_reordered is None else pynini.union(
                    decimal_default_reordered,
                    decimal_default_reordered_curr)).optimize()

    # weight for SH
    final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.0001)

    if not deterministic:
        final_graph |= integer_graph_reordered | decimal_default_reordered
        # to handle "$2.00" cases
        final_graph |= pynini.compose(
            NEMO_SIGMA + pynutil.delete(".") +
            pynini.closure(pynutil.delete("0"), 1),
            integer_graph_reordered)
    final_graph = self.add_tokens(final_graph.optimize())
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the cardinal tagger grammar.

    Exposes ``graph``, ``graph_hundred_component_at_least_one_none_zero_digit``,
    ``single_digits_graph`` and ``range_graph`` and stores the final tagger in
    ``fst``.

    Args:
        deterministic: when True, numbers of five or more digits and numbers
            with a leading zero are read digit by digit; when False,
            additional readings are unioned into the grammar
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    def _spaced_sequence(unit):
        # One or more units with a space inserted between neighbours.
        return unit + pynini.closure(insert_space + unit)

    # TODO replace to have "oh" as a default for "0"
    number_names = pynini.Far(get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

    # Two or three digits, or exactly one non-zero digit.
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3) | pynini.difference(NEMO_DIGIT, pynini.accep("0"))
    ) @ number_names

    # 1-3 leading digits followed by groups of three, each optionally preceded by a comma.
    comma_group = pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(comma_group)) @ number_names

    digit_table = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    zero_table = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

    one_digit = pynini.invert(digit_table | zero_table)
    self.single_digits_graph = _spaced_sequence(one_digit)

    if not deterministic:
        # for a single token allow only the same normalization
        # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
        one_digit_zero = pynini.invert(digit_table | zero_table)
        one_digit_oh = pynini.invert(digit_table) | pynini.cross("0", "oh")
        self.single_digits_graph = _spaced_sequence(one_digit_zero) | _spaced_sequence(one_digit_oh)

        three_spaced_digits = (
            pynutil.delete(",") + one_digit + insert_space + one_digit + insert_space + one_digit
        )
        digits_with_commas = pynini.closure(
            self.single_digits_graph + insert_space, 1, 3
        ) + pynini.closure(three_spaced_digits, 1)

    # Ranges: "A-B" as "from A to B" and "AxB" / "A x B" as "A by B".
    self.range_graph = pynutil.insert("from ") + self.graph + pynini.cross("-", " to ") + self.graph
    self.range_graph |= self.graph + (pynini.cross("x", " by ") | pynini.cross(" x ", " by ")) + self.graph
    self.range_graph |= (
        pynutil.insert("from ") + get_hundreds_graph() + pynini.cross("-", " to ") + get_hundreds_graph()
    )
    self.range_graph = self.range_graph.optimize()

    # Built after graph and single_digits_graph are final.
    serial_graph = self.get_serial_graph()
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    if deterministic:
        # Five or more digits: slightly prefer the digit-by-digit reading.
        long_numbers = pynini.compose(NEMO_DIGIT ** (5, ...), self.single_digits_graph).optimize()
        tagged = self.graph | serial_graph | pynutil.add_weight(long_numbers, -0.001)

        zero_prefixed = pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT), self.single_digits_graph
        )
        tagged |= zero_prefixed
    else:
        # Leading zeros are read one by one, the remainder as a cardinal.
        leading_zeros = pynini.compose(pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
        zero_prefixed = (
            leading_zeros + pynutil.insert(" ") + pynini.compose(pynini.closure(NEMO_DIGIT), self.graph)
        )
        tagged = (
            self.graph
            | serial_graph
            | self.range_graph
            | self.single_digits_graph
            | get_hundreds_graph()
            | pynutil.add_weight(digits_with_commas, 0.001)
            | zero_prefixed
        )

    tagged = optional_minus_graph + pynutil.insert("integer: \"") + tagged + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
    """
    Build the measure tagger grammar.

    Combines a number (cardinal, decimal or fraction) with a unit, plus
    dash-joined number/word pairs, "<decimal>x", street addresses and simple
    "a <op> b = c" math expressions.

    Args:
        cardinal: cardinal tagger providing ``graph`` and ``range_graph``
        decimal: decimal tagger providing ``final_graph_wo_negative``
        fraction: fraction tagger providing ``graph``
        deterministic: when False, cardinal ranges are accepted as numbers
    """
    super().__init__(name="measure", kind="classify", deterministic=deterministic)

    number = cardinal.graph
    if not deterministic:
        number |= cardinal.range_graph
    unsigned_decimal = decimal.final_graph_wo_negative

    # Units from the table; a unit is also looked up after its leading characters pass through TO_LOWER.
    units = pynini.string_file(get_abs_path("data/measurements.tsv"))
    units |= pynini.compose(pynini.closure(TO_LOWER, 1) + pynini.closure(NEMO_ALPHA), units)
    plural_units = convert_space(units @ SINGULAR_TO_PLURAL)
    units = convert_space(units)

    optional_sign = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

    # "/unit" is read as "per unit", optionally following a first unit.
    per_unit = pynini.cross("/", "per") + delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + units
    optional_per_unit = pynini.closure(
        delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + per_unit, 0, 1,
    )

    unit_plural = (
        pynutil.insert("units: \"") + (plural_units + optional_per_unit | per_unit) + pynutil.insert("\"")
    )
    unit_singular = (
        pynutil.insert("units: \"") + (units + optional_per_unit | per_unit) + pynutil.insert("\"")
    )

    def _cardinal_with_unit(amount, unit_field):
        # cardinal { [negative] integer: "<amount>" } <unit_field>
        return (
            pynutil.insert("cardinal { ")
            + optional_sign
            + pynutil.insert("integer: \"")
            + amount
            + delete_space
            + pynutil.insert("\"")
            + pynutil.insert(" } ")
            + unit_field
        )

    subgraph_decimal = (
        pynutil.insert("decimal { ")
        + optional_sign
        + unsigned_decimal
        + delete_space
        + pynutil.insert(" } ")
        + unit_plural
    )

    # Only the literal "1" takes the singular unit.
    subgraph_cardinal = _cardinal_with_unit((NEMO_SIGMA - "1") @ number, unit_plural)
    subgraph_cardinal |= _cardinal_with_unit(pynini.cross("1", "one"), unit_singular)

    cardinal_dash_alpha = (
        pynutil.insert("cardinal { integer: \"")
        + number
        + pynini.accep("-")
        + pynutil.insert("\" } units: \"")
        + pynini.closure(NEMO_ALPHA, 1)
        + pynutil.insert("\"")
    )
    alpha_dash_cardinal = (
        pynutil.insert("units: \"")
        + pynini.closure(NEMO_ALPHA, 1)
        + pynini.accep("-")
        + pynutil.insert("\"")
        + pynutil.insert(" cardinal { integer: \"")
        + number
        + pynutil.insert("\" } preserve_order: true")
    )
    decimal_dash_alpha = (
        pynutil.insert("decimal { ")
        + unsigned_decimal
        + pynini.cross("-", "")
        + pynutil.insert(" } units: \"")
        + pynini.closure(NEMO_ALPHA, 1)
        + pynutil.insert("\"")
    )
    decimal_times = (
        pynutil.insert("decimal { ")
        + unsigned_decimal
        + pynutil.insert(" } units: \"")
        + pynini.cross(pynini.union("x", "X"), "x")
        + pynutil.insert("\"")
    )
    alpha_dash_decimal = (
        pynutil.insert("units: \"")
        + pynini.closure(NEMO_ALPHA, 1)
        + pynini.accep("-")
        + pynutil.insert("\"")
        + pynutil.insert(" decimal { ")
        + unsigned_decimal
        + pynutil.insert(" } preserve_order: true")
    )

    subgraph_fraction = (
        pynutil.insert("fraction { ") + fraction.graph + delete_space + pynutil.insert(" } ") + unit_plural
    )

    address = (
        pynutil.insert("units: \"address\" cardinal { integer: \"")
        + self.get_address_graph(cardinal)
        + pynutil.insert("\" } preserve_order: true")
    )

    # Math: "<number> <operation> <number> = <number>", spaces kept or inserted.
    operations = pynini.string_file(get_abs_path("data/math_operations.tsv"))
    gap = pynini.accep(" ") | pynutil.insert(" ")
    equation = (
        number + gap + operations + gap + number + gap + pynini.cross("=", "equals") + gap + number
    )
    equation = (
        pynutil.insert("units: \"math\" cardinal { integer: \"")
        + equation
        + pynutil.insert("\" } preserve_order: true")
    )

    tagged = (
        subgraph_decimal
        | subgraph_cardinal
        | cardinal_dash_alpha
        | alpha_dash_cardinal
        | decimal_dash_alpha
        | decimal_times
        | alpha_dash_decimal
        | subgraph_fraction
        | address
        | equation
    )
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, deterministic: bool = True, lm: bool = False):
    """
    Build the "roman" classification grammar and store it in ``self.fst``.

    Args:
        deterministic: forwarded to the base-class constructor
        lm: kept for interface compatibility; the grammar built here does not depend on it

    The original code built ``roman_to_cardinal`` in an ``if deterministic`` / ``elif not lm``
    chain whose two branches were identical, and left the name unbound for
    ``deterministic=False, lm=True`` (UnboundLocalError at the final union). The graph is
    now built once, unconditionally.
    """
    super().__init__(name="roman", kind="classify", deterministic=deterministic)

    roman_dict = load_labels(get_abs_path("data/roman/roman_to_spoken.tsv"))
    default_graph = pynini.string_map(roman_dict).optimize()
    default_graph = pynutil.insert("integer: \"") + default_graph + pynutil.insert("\"")

    # acceptor over the written form (first column) of the first 19 table rows
    graph_teens = pynini.string_map([x[0] for x in roman_dict[:19]]).optimize()

    # a numeral from graph_teens preceded by a name is tagged for ordinal reading
    names = get_names()
    graph = (
        pynutil.insert("key_the_ordinal: \"")
        + names
        + pynutil.insert("\"")
        + pynini.accep(" ")
        + graph_teens @ default_graph
    ).optimize()

    # any numeral preceded by a key word is tagged for cardinal reading
    key_words = pynini.string_map(load_labels(get_abs_path("data/roman/key_word.tsv"))).optimize()
    graph |= (
        pynutil.insert("key_cardinal: \"")
        + key_words
        + pynutil.insert("\"")
        + pynini.accep(" ")
        + default_graph
    ).optimize()

    # stand-alone numerals of at least two letters, limited to the first 50 table rows
    roman_to_cardinal = pynini.compose(
        pynini.closure(NEMO_ALPHA, 2),
        (
            pynutil.insert("default_cardinal: \"default\" ")
            + (pynini.string_map([x[0] for x in roman_dict[:50]]).optimize()) @ default_graph
        ),
    )

    # stand-alone input of at least three letters: a graph_teens numeral followed by "th"
    roman_to_ordinal = pynini.compose(
        pynini.closure(NEMO_ALPHA, 3),
        (pynutil.insert("default_ordinal: \"default\" ") + graph_teens @ default_graph + pynutil.delete("th")),
    )

    graph |= roman_to_cardinal | roman_to_ordinal

    graph = self.add_tokens(graph)
    self.fst = graph.optimize()
def get_address_graph(self, cardinal):
    """
    Build the grammar that rewrites a street address into its spoken form, e.g.:

    2788 San Tomas Expy, Santa Clara, CA 95051 ->
        units: "address" cardinal { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" } preserve_order: true

    Only the text of the integer field is produced here; the caller adds the surrounding tags.

    Args:
        cardinal: cardinal tagger; ``graph``, ``single_digits_graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit`` are read

    Returns:
        the address transducer
    """
    space = pynini.accep(NEMO_SPACE)
    comma = pynini.accep(",")
    hundreds = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    # ordinal street names: tag the ordinal, then verbalize it straight away
    tagged_ordinal = pynutil.insert("integer: \"") + OrdinalTagger(cardinal=cardinal).graph + pynutil.insert("\"")
    spoken_ordinal = pynini.compose(tagged_ordinal, OrdinalVerbalizer().graph)

    # 3-4 digit house numbers: leading 1-2 digits, then the last two digits
    # (a leading "0" in the last pair is read as "zero")
    house_number = NEMO_DIGIT**(1, 2) @ hundreds
    house_number += insert_space + NEMO_DIGIT**2 @ (
        pynini.closure(pynini.cross("0", "zero "), 0, 1) + hundreds
    )
    house_number = pynini.compose(NEMO_DIGIT**(3, 4), house_number)
    # every other number falls back to the regular cardinal graph
    house_number = plurals._priority_union(house_number, cardinal.graph, NEMO_SIGMA)

    # optional " E" / " S" / " W" / " N", each with an optional trailing period
    compass = (
        pynini.cross("E", "East")
        | pynini.cross("S", "South")
        | pynini.cross("W", "West")
        | pynini.cross("N", "North")
    ) + pynini.closure(pynutil.delete("."), 0, 1)
    compass = pynini.closure(space + compass, 0, 1)

    # street name: optional ordinal or a capitalised word, further capitalised words,
    # then an entry from the address-word table
    street_type = get_formats(get_abs_path("data/address/address_word.tsv"))
    street = (
        space
        + (pynini.closure(spoken_ordinal, 0, 1) | NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1))
        + NEMO_SPACE
        + pynini.closure(NEMO_UPPER + pynini.closure(NEMO_ALPHA) + NEMO_SPACE)
        + street_type
    )

    # optional ", <letters and spaces>"
    city_name = pynini.closure(NEMO_ALPHA | space, 1)
    city_part = pynini.closure(comma + space + city_name, 0, 1)

    # state table, extended with a variant that has a period after the first character
    # of the second column; the map is inverted before use
    state_rows = load_labels(get_abs_path("data/address/state.tsv"))
    state_rows.extend([(first, f"{second[0]}.{second[1:]}") for first, second in state_rows])
    state_part = pynini.invert(pynini.string_map(state_rows))
    state_part = pynini.closure(comma + space + state_part, 0, 1)

    # five-digit zip code, read digit by digit
    zip_digits = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
    zip_part = pynini.closure(
        pynini.closure(comma, 0, 1) + space + zip_digits, 0, 1,
    )

    result = house_number + compass + street + pynini.closure(city_part + state_part + zip_part, 0, 1)
    # the short form may end with a period, which is dropped
    result |= house_number + compass + street + pynini.closure(pynini.cross(".", ""), 0, 1)
    return result
NEMO_DIGIT, NEMO_SIGMA, TO_LOWER, GraphFst, delete_extra_space, delete_space, insert_space, ) from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels try: import pynini from pynini.lib import pynutil graph_teen = pynini.invert( pynini.string_file(get_abs_path("data/numbers/teen.tsv"))).optimize() graph_digit = pynini.invert( pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize() ties_graph = pynini.invert( pynini.string_file(get_abs_path("data/numbers/ties.tsv"))).optimize() PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Add placeholders for global variables graph_teen = None graph_digit = None ties_graph = None PYNINI_AVAILABLE = True
def __init__(self, deterministic: bool = True, lm: bool = False):
    """
    Build the "roman" classification grammar and store it in ``self.fst``.

    Args:
        deterministic: forwarded to the base-class constructor; together with ``lm`` it
            selects which stand-alone cardinal grammar is used
        lm: when True, the restricted stand-alone cardinal grammar is used, as in
            deterministic mode

    The original ``if deterministic`` / ``elif not lm`` chain left ``roman_to_cardinal``
    unbound for ``deterministic=False, lm=True`` and failed with UnboundLocalError at the
    final union. That combination now takes the restricted branch; all other
    combinations behave as before.
    """
    super().__init__(name="roman", kind="classify", deterministic=deterministic)

    roman_dict = load_labels(get_abs_path("data/roman/roman_to_spoken.tsv"))
    default_graph = pynini.string_map(roman_dict).optimize()
    default_graph = pynutil.insert("integer: \"") + default_graph + pynutil.insert("\"")

    # number of leading table rows that may be read as ordinals
    ordinal_limit = 19
    graph_teens = pynini.string_map([x[0] for x in roman_dict[:ordinal_limit]]).optimize()

    # roman numerals up to ordinal_limit with a preceding name are converted to ordinal form
    names = get_names()
    graph = (
        pynutil.insert("key_the_ordinal: \"")
        + names
        + pynutil.insert("\"")
        + pynini.accep(" ")
        + graph_teens @ default_graph
    ).optimize()

    # roman numerals with a preceding key word are converted to cardinal form;
    # each key word is accepted as listed, with its first letter upper-cased, and fully upper-cased
    key_words = []
    for k_word in load_labels(get_abs_path("data/roman/key_word.tsv")):
        key_words.append(k_word)
        key_words.append([k_word[0][0].upper() + k_word[0][1:]])
        key_words.append([k_word[0].upper()])
    key_words = pynini.string_map(key_words).optimize()
    graph |= (
        pynutil.insert("key_cardinal: \"")
        + key_words
        + pynutil.insert("\"")
        + pynini.accep(" ")
        + default_graph
    ).optimize()

    if deterministic or lm:
        # stand-alone numerals of at least two letters, limited to the first 50 table rows
        roman_to_cardinal = pynini.compose(
            pynini.closure(NEMO_ALPHA, 2),
            (
                pynutil.insert("default_cardinal: \"default\" ")
                + (pynini.string_map([x[0] for x in roman_dict[:50]]).optimize()) @ default_graph
            ),
        )
    else:
        # any numeral from the table except the single letter "I"
        roman_to_cardinal = pynini.compose(
            pynini.difference(NEMO_SIGMA, "I"),
            (
                pynutil.insert("default_cardinal: \"default\" integer: \"")
                + pynini.string_map(roman_dict).optimize()
                + pynutil.insert("\"")
            ),
        ).optimize()

    # stand-alone input of at least three letters: a graph_teens numeral followed by "th"
    roman_to_ordinal = pynini.compose(
        pynini.closure(NEMO_ALPHA, 3),
        (pynutil.insert("default_ordinal: \"default\" ") + graph_teens @ default_graph + pynutil.delete("th")),
    )

    graph |= roman_to_cardinal | roman_to_ordinal
    graph = self.add_tokens(graph)
    self.fst = graph.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """
    Build the "date" classification grammar and store it in ``self.fst``.

    Args:
        cardinal: cardinal tagger; ``graph_hundred_component_at_least_one_none_zero_digit``
            and ``single_digits_graph`` are read
        deterministic: when False, ``preserve_order`` becomes optional and re-ordered
            (month/day swapped, year moved last) variants are added
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    # Label files are read once here; the original re-read days.tsv on every month iteration.
    month_labels = load_labels(get_abs_path("data/months/names.tsv"))
    month_names = [x[0] for x in month_labels]

    month_graph = pynini.string_file(get_abs_path("data/months/names.tsv")).optimize()
    # also accept inputs whose first character maps onto a table entry through TO_LOWER
    month_graph |= (TO_LOWER + pynini.closure(NEMO_CHAR)) @ month_graph
    month_abbr_graph = pynini.string_file(get_abs_path("data/months/abbr.tsv")).optimize()
    month_abbr_graph = (
        month_abbr_graph | (TO_LOWER + pynini.closure(NEMO_CHAR)) @ month_abbr_graph
    ) + pynini.closure(pynutil.delete("."), 0, 1)
    month_graph |= month_abbr_graph

    # to support all caps names
    names_all_caps = [[x[0].upper()] for x in month_labels]
    abbr_all_caps = [(x.upper(), y) for x, y in load_labels(get_abs_path("data/months/abbr.tsv"))]
    month_graph |= pynini.string_map(names_all_caps) | (
        pynini.string_map(abbr_all_caps) + pynini.closure(pynutil.delete("."), 0, 1)
    )

    month_numbers_graph = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()

    cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    year_graph = _get_year_graph(deterministic)

    # weight added to a year that appears on its own
    YEAR_WEIGHT = 0.001
    year_graph_standalone = (
        pynutil.insert("year: \"") + pynutil.add_weight(year_graph, YEAR_WEIGHT) + pynutil.insert("\"")
    )

    month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
    month_numbers_graph = pynutil.insert("month: \"") + month_numbers_graph + pynutil.insert("\"")

    # one digit, or two digits starting with 1, 2 or 3
    day_graph = (
        pynutil.insert("day: \"")
        + ((pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT) @ cardinal_graph
        + pynutil.insert("\"")
    )
    optional_day_graph = pynini.closure(delete_extra_space + day_graph, 0, 1)

    two_digit_year = NEMO_DIGIT**(2) @ (cardinal.single_digits_graph | cardinal_graph)
    two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")

    year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")
    optional_graph_year = pynini.closure(
        delete_extra_space + year_graph, 0, 1,
    )

    # <month name> [<day>][,] [<year>]
    graph_mdy = (
        month_graph
        + optional_day_graph
        + delete_space
        + pynini.closure(pynutil.delete(","), 0, 1)
        + optional_graph_year
    )

    # <month number><sep><day><sep><year>
    delete_sep = pynutil.delete(pynini.union("-", "/", "."))
    graph_mdy |= (
        month_numbers_graph
        + delete_sep
        + insert_space
        + pynini.closure(pynutil.delete("0"), 0, 1)
        + day_graph
        + delete_sep
        + insert_space
        + (year_graph | two_digit_year)
    )

    # <day> <month name>[,] [<year>]
    graph_dmy = (
        day_graph
        + delete_extra_space
        + month_graph
        + pynini.closure(pynutil.delete(","), 0, 1)
        + optional_graph_year
    )

    # <year><sep><month number><sep><day>
    graph_ymd = (
        (year_graph | two_digit_year)
        + delete_sep
        + insert_space
        + month_numbers_graph
        + delete_sep
        + insert_space
        + pynini.closure(pynutil.delete("0"), 0, 1)
        + day_graph
    )

    final_graph = graph_mdy | graph_dmy

    if deterministic:
        final_graph += pynutil.insert(" preserve_order: true")
    else:
        final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)

    final_graph |= graph_ymd | year_graph_standalone

    if not deterministic:
        day_names = [x[0] for x in load_labels(get_abs_path("data/months/days.tsv"))]
        ymd_to_mdy_graph = None
        mdy_to_dmy_graph = None

        for month in month_names:
            for day in day_names:
                # the two field orders that are moved around below
                month_day = "month: \"" + month + "\" day: \"" + day + "\""
                day_month = "day: \"" + day + "\" month: \"" + month + "\""

                ymd_to_mdy_curr = (
                    pynutil.insert(month_day + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                    + pynutil.delete(" " + month_day)
                )

                # YY-MM-DD -> MM-DD-YY
                ymd_to_mdy_curr = pynini.compose(final_graph, ymd_to_mdy_curr)
                ymd_to_mdy_graph = (
                    ymd_to_mdy_curr
                    if ymd_to_mdy_graph is None
                    else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                )

                mdy_to_dmy_curr = (
                    pynutil.insert(day_month + " ")
                    + pynutil.delete(month_day + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                )

                # pynini.compose(ymd_to_mdy_curr, mdy_to_dmy_curr) to handle:
                # YY-MM-DD (input format) -> MM-DD-YY (intermediate ymd_to_mdy_curr representation) -> DD-MM-YY
                # '2000-01-05' -> 'day: "five" month: "january" year: "two thousand"'
                # pynini.compose(final_graph, mdy_to_dmy_curr) to handle:
                # MM-DD-YY (input format) -> DD-MM-YY
                mdy_to_dmy_curr = pynini.compose(ymd_to_mdy_curr, mdy_to_dmy_curr) | pynini.compose(
                    final_graph, mdy_to_dmy_curr
                )
                mdy_to_dmy_graph = (
                    mdy_to_dmy_curr
                    if mdy_to_dmy_graph is None
                    else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph)
                )

        final_graph |= ymd_to_mdy_graph | mdy_to_dmy_graph

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
NEMO_ALPHA, NEMO_DIGIT, NEMO_SIGMA, SINGULAR_TO_PLURAL, GraphFst, convert_space, insert_space, ) from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels try: import pynini from pynini.lib import pynutil min_singular = pynini.string_file( get_abs_path("data/money/currency_minor_singular.tsv")) min_plural = pynini.string_file( get_abs_path("data/money/currency_minor_plural.tsv")) maj_singular = pynini.string_file( (get_abs_path("data/money/currency_major.tsv"))) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): PYNINI_AVAILABLE = False class MoneyFst(GraphFst): """ Finite state transducer for classifying money, suppletive aware, e.g. $12.05 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true } $12.0500 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true }
# Character classes derived from the primitives defined earlier in the module.
NEMO_PUNCT = pynini.union(*(pynini.escape(symbol) for symbol in string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
# any sequence of characters, including the empty one
NEMO_SIGMA = pynini.closure(NEMO_CHAR)

# Whitespace helpers: drop any run of whitespace, insert one space, squeeze a run into one space.
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
insert_space = pynutil.insert(" ")
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")

# Removes any number of ` preserve_order: true` / ` field_order: "<one non-quote char>"` markers.
delete_preserve_order = pynini.closure(
    pynutil.delete(" preserve_order: true")
    | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
)

# Singular -> plural rules, from most to least specific.
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
_c = pynini.union(*"bcdfghjklmnpqrstvwxyz")
# consonant + "y" -> consonant + "ies"
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
# sibilant endings take "es"
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
# everything else takes "s"
_s = NEMO_SIGMA + pynutil.insert("s")

# Priority order: suppletive table, then "-ies", then "-es", then plain "-s".
graph_plural = plurals._priority_union(
    suppletive,
    plurals._priority_union(
        _ies,
        plurals._priority_union(_es, _s, NEMO_SIGMA),
        NEMO_SIGMA,
    ),
    NEMO_SIGMA,
).optimize()
SINGULAR_TO_PLURAL = graph_plural
delete_extra_space, delete_space, insert_space, ) from nemo_text_processing.text_normalization.en.utils import ( augment_labels_with_punct_at_end, get_abs_path, load_labels, ) try: import pynini from pynini.lib import pynutil from pynini.examples import plurals graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/number/teen.tsv"))).optimize() graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize() ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/number/ty.tsv"))).optimize() year_suffix = load_labels(get_abs_path("data/date/year_suffix.tsv")) year_suffix.extend(augment_labels_with_punct_at_end(year_suffix)) year_suffix = pynini.string_map(year_suffix).optimize() PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Add placeholders for global variables graph_teen = None graph_digit = None ties_graph = None PYNINI_AVAILABLE = True
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """
    Build the "whitelist" classification grammar; stores ``self.graph`` and ``self.fst``.

    Args:
        input_case: when equal to "lower_cased", whitelist keys are lower-cased before use
        deterministic: selects between the deterministic-only and the alternative-form grammars
        input_file: optional path to an extra whitelist; it replaces the built-in graph in
            deterministic mode and is added to it otherwise
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)
    lower_cased = input_case == "lower_cased"

    def _load_whitelist(case, path, keep_punct_add_end: bool = False):
        """Load a two-column label file as a string map, lower-casing keys when requested."""
        if case == "lower_cased":
            rows = [[written.lower(), spoken] for written, spoken in load_labels(path)]
        else:
            rows = [[written, spoken] for written, spoken in load_labels(path)]
        if keep_punct_add_end:
            rows.extend(augment_labels_with_punct_at_end(rows))
        return pynini.string_map(rows)

    whitelist = _load_whitelist(input_case, get_abs_path("data/whitelist/tts.tsv"))
    whitelist |= _load_whitelist(input_case, get_abs_path("data/whitelist/symbol.tsv"))

    if not deterministic:
        whitelist |= _load_whitelist(
            input_case, get_abs_path("data/whitelist/alternatives.tsv"), keep_punct_add_end=True
        )
    else:
        # "St"/"st"/"ST" (periods dropped) followed by a known name -> "Saint <name>"
        saint = pynini.cross(pynini.union("st", "St", "ST"), "Saint")
        whitelist |= saint + pynini.closure(pynutil.delete(".")) + pynini.accep(" ") + get_names()

    # upper-case letters joined by "." or ". " (at least three letters): separators are removed
    for separator in (".", ". "):
        whitelist |= (
            NEMO_UPPER
            + pynini.closure(pynutil.delete(separator) + NEMO_UPPER, 2)
            + pynini.closure(pynutil.delete("."), 0, 1)
        )

    if not deterministic:
        whitelist |= get_formats(get_abs_path("data/whitelist/alternatives_all_format.tsv"))

        # measurement units (singular and plural) whose written form is at least three characters long
        unit = pynini.string_file(get_abs_path("data/measure/unit.tsv")) | pynini.string_file(
            get_abs_path("data/measure/unit_alternatives.tsv")
        )
        unit_plural = unit @ SINGULAR_TO_PLURAL
        whitelist |= pynini.compose(NEMO_CHAR ** (3, ...), convert_space(unit | unit_plural))

    # convert to states only if comma is present before the abbreviation to avoid converting all caps words,
    # e.g. "IN", "OH", "OK"
    # TODO or only exclude above?
    state_rows = load_labels(get_abs_path("data/address/state.tsv"))
    dotted_rows = []
    for first, second in state_rows:
        key = first.lower() if lower_cased else first
        with_period = f"{second[0]}.{second[1:]}"
        dotted_rows.append((key, with_period))
        if not deterministic:
            dotted_rows.append((key, f"{second[0]}.{second[1:]}."))
    state_rows.extend(dotted_rows)
    state_map = pynini.string_map(state_rows)
    whitelist |= pynini.closure(NEMO_NOT_SPACE, 1) + pynini.union(", ", ",") + pynini.invert(state_map).optimize()

    if input_file:
        user_whitelist = _load_whitelist(input_case, input_file)
        if deterministic:
            whitelist = user_whitelist
        else:
            whitelist |= user_whitelist

    self.graph = (convert_space(whitelist)).optimize()
    self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False):
    """
    Build the "date" classification grammar and store it in ``self.fst``.

    Args:
        cardinal: cardinal tagger; ``graph_hundred_component_at_least_one_none_zero_digit``
            and ``single_digits_graph`` are read
        deterministic: when False, ``preserve_order`` becomes optional and re-ordered
            variants of each date are added
        lm: when True, the same extra variants are added even in deterministic mode
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    # january
    month_graph = pynini.string_file(get_abs_path("data/date/month_name.tsv")).optimize()
    # January, JANUARY
    month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
        TO_LOWER ** (2, ...), month_graph
    )

    # jan
    month_abbr_graph = pynini.string_file(get_abs_path("data/date/month_abbr.tsv")).optimize()
    # jan, Jan, JAN
    month_abbr_graph = (
        month_abbr_graph
        | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
        | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
    ) + pynini.closure(pynutil.delete("."), 0, 1)
    month_graph |= month_abbr_graph.optimize()

    month_numbers_labels = pynini.string_file(get_abs_path("data/date/month_number.tsv")).optimize()
    cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    year_graph = _get_year_graph(cardinal_graph=cardinal_graph, deterministic=deterministic)

    month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
    month_numbers_graph = pynutil.insert("month: \"") + month_numbers_labels + pynutil.insert("\"")

    # ordinal endings (lower and upper case) that may follow the day number and are dropped
    endings = ["rd", "th", "st", "nd"]
    endings += [x.upper() for x in endings]
    endings = pynini.union(*endings)

    # day: optional "the ", then a single digit, 10-29, 30 or 31, then an optional ending
    day_graph = (
        pynutil.insert("day: \"")
        + pynini.closure(pynutil.delete("the "), 0, 1)
        + (
            ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
            + pynini.closure(pynutil.delete(endings), 0, 1)
        )
        @ cardinal_graph
        + pynutil.insert("\"")
    )

    two_digit_year = _get_two_digit_year(
        cardinal_graph=cardinal_graph, single_digits_graph=cardinal.single_digits_graph
    )
    two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")

    # year that follows other fields: introduced by a space, or by a comma (kept) and an optional space
    graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
    graph_year |= (
        pynutil.insert(" year: \"")
        + pynini.accep(",")
        + pynini.closure(pynini.accep(" "), 0, 1)
        + year_graph
        + pynutil.insert("\"")
    )
    optional_graph_year = pynini.closure(graph_year, 0, 1)

    year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

    # <month name> followed by a day, a year, or both
    graph_mdy = month_graph + (
        (delete_extra_space + day_graph)
        | (pynini.accep(" ") + day_graph)
        | graph_year
        | (delete_extra_space + day_graph + graph_year)
    )

    # <month name>-<day>[-<year>]
    graph_mdy |= (
        month_graph
        + pynini.cross("-", " ")
        + day_graph
        + pynini.closure(((pynini.cross("-", " ") + NEMO_SIGMA) @ graph_year), 0, 1)
    )

    # <month number><sep><day><sep><year>, with the same separator in both positions
    for x in ["-", "/", "."]:
        delete_sep = pynutil.delete(x)
        graph_mdy |= (
            month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
            + delete_sep
            + insert_space
            + (year_graph | two_digit_year)
        )

    graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
    # two-digit days that are not also accepted as a month number
    day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph
    for x in ["-", "/", "."]:
        delete_sep = pynutil.delete(x)
        graph_dmy |= (
            day_ex_month
            + delete_sep
            + insert_space
            + month_numbers_graph
            + delete_sep
            + insert_space
            + (year_graph | two_digit_year)
        )

    # NOTE(review): the union is seeded with the empty-string acceptor, so graph_ymd (and
    # therefore final_graph) also accepts empty input; confirm this is intended.
    graph_ymd = pynini.accep("")
    for x in ["-", "/", "."]:
        delete_sep = pynutil.delete(x)
        graph_ymd |= (
            (year_graph | two_digit_year)
            + delete_sep
            + insert_space
            + month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
        )

    final_graph = graph_mdy | graph_dmy

    if not deterministic or lm:
        final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)
        # <month number><sep><day> without a year
        m_sep_d = (
            month_numbers_graph
            + pynutil.delete(pynini.union("-", "/"))
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
        )
        final_graph |= m_sep_d
    else:
        final_graph += pynutil.insert(" preserve_order: true")

    final_graph |= graph_ymd | year_graph

    if not deterministic or lm:
        # Label files are read once here; the original re-read them inside the loops.
        month_names = [x[0] for x in load_labels(get_abs_path("data/date/month_name.tsv"))]
        day_names = [x[0] for x in load_labels(get_abs_path("data/date/day.tsv"))]

        ymd_to_mdy_graph = None
        ymd_to_dmy_graph = None
        mdy_to_dmy_graph = None
        md_to_dm_graph = None

        for month in month_names:
            for day in day_names:
                # the two field orders that are moved around below
                month_day = "month: \"" + month + "\" day: \"" + day + "\""
                day_month = "day: \"" + day + "\" month: \"" + month + "\""

                ymd_to_mdy_curr = (
                    pynutil.insert(month_day + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                    + pynutil.delete(" " + month_day)
                )

                # YY-MM-DD -> MM-DD-YY
                ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                ymd_to_mdy_graph = (
                    ymd_to_mdy_curr
                    if ymd_to_mdy_graph is None
                    else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                )

                ymd_to_dmy_curr = (
                    pynutil.insert(day_month + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                    + pynutil.delete(" " + month_day)
                )

                # YY-MM-DD -> DD-MM-YY
                ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                ymd_to_dmy_graph = (
                    ymd_to_dmy_curr
                    if ymd_to_dmy_graph is None
                    else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                )

                mdy_to_dmy_curr = (
                    pynutil.insert(day_month + " ")
                    + pynutil.delete(month_day + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                ).optimize()
                # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                mdy_to_dmy_graph = (
                    mdy_to_dmy_curr
                    if mdy_to_dmy_graph is None
                    else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                ).optimize()

                # MM-DD (no year) -> DD-MM
                md_to_dm_curr = pynutil.insert(day_month) + pynutil.delete(month_day)
                md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()
                md_to_dm_graph = (
                    md_to_dm_curr
                    if md_to_dm_graph is None
                    else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                ).optimize()

        final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = True, lm: bool = False):
    """
    Finite state transducer for classifying serial (handles only cases without delimiters,
    values with delimiters are handled by default).
    The serial is a combination of digits, letters and dashes, e.g.:
        c325b -> "c three two five b"
    NOTE(review): the original example showed the output wrapped in
    ``tokens { cardinal { integer: ... } }``, but this constructor emits ``name: "..."``.

    Args:
        cardinal: cardinal tagger; ``graph``, ``single_digits_graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit`` are read
        ordinal: ordinal tagger; inputs accepted by ``ordinal.graph`` are excluded
        deterministic: forwarded to the base-class constructor
        lm: when True, the extra digit-by-digit / two-digit readings are not added
    """
    # This text used to sit after the super().__init__() call, where it was a no-op
    # string expression rather than the method's docstring.
    super().__init__(name="integer", kind="classify", deterministic=deterministic)

    # six or more digits are read digit by digit, one to five digits as a number
    num_graph = pynini.compose(NEMO_DIGIT**(6, ...), cardinal.single_digits_graph).optimize()
    num_graph |= pynini.compose(NEMO_DIGIT**(1, 5), cardinal.graph).optimize()
    # to handle numbers starting with zero
    num_graph |= pynini.compose(
        pynini.accep("0") + pynini.closure(NEMO_DIGIT), cardinal.single_digits_graph
    ).optimize()
    # TODO: "#" doesn't work from the file
    symbols_graph = pynini.string_file(get_abs_path("data/whitelist/symbol.tsv")).optimize() | pynini.cross(
        "#", "hash"
    )
    num_graph |= symbols_graph

    if not self.deterministic and not lm:
        num_graph |= cardinal.single_digits_graph
        # also allow double digits to be pronounced as integer in serial number
        num_graph |= pynutil.add_weight(
            NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001
        )

    # add space between letter and digit/symbol
    symbols = [x[0] for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))]
    symbols = pynini.union(*symbols)
    digit_symbol = NEMO_DIGIT | symbols

    graph_with_space = pynini.compose(
        pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA | symbols, digit_symbol, NEMO_SIGMA),
        pynini.cdrewrite(pynutil.insert(" "), digit_symbol, NEMO_ALPHA | symbols, NEMO_SIGMA),
    )

    # serial graph with delimiter
    delimiter = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
    alphas = pynini.closure(NEMO_ALPHA, 1)
    letter_num = alphas + delimiter + num_graph
    num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
    next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
    # number followed by letters: an existing space is preferred, otherwise one is inserted
    next_alpha_or_num |= pynini.closure(
        delimiter
        + num_graph
        + plurals._priority_union(pynini.accep(" "), pynutil.insert(" "), NEMO_SIGMA).optimize()
        + alphas
    )

    serial_graph = letter_num + next_alpha_or_num
    serial_graph |= num_letter + next_alpha_or_num
    # numbers only with 2+ delimiters
    serial_graph |= (
        num_graph + delimiter + num_graph + delimiter + num_graph + pynini.closure(delimiter + num_graph)
    )
    # 2+ symbols
    serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA, num_graph + delimiter + num_graph)

    # exclude ordinal numbers from serial options
    serial_graph = pynini.compose(
        pynini.difference(NEMO_SIGMA, pynini.project(ordinal.graph, "input")), serial_graph
    ).optimize()

    serial_graph = pynutil.add_weight(serial_graph, 0.0001)
    # "<token>^2" -> "<token> squared", "<token>^3" -> "<token> cubed"
    serial_graph |= (
        pynini.closure(NEMO_NOT_SPACE, 1)
        + (pynini.cross("^2", " squared") | pynini.cross("^3", " cubed")).optimize()
    )

    # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
    serial_graph = (
        pynini.closure((serial_graph | num_graph | alphas) + delimiter)
        + serial_graph
        + pynini.closure(delimiter + (serial_graph | num_graph | alphas))
    )

    # also accept input without delimiters by first inserting spaces between letters and digits/symbols
    serial_graph |= pynini.compose(graph_with_space, serial_graph.optimize()).optimize()
    # the input must be at least two non-space characters
    serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2), serial_graph).optimize()

    self.graph = serial_graph.optimize()
    graph = pynutil.insert("name: \"") + convert_space(self.graph).optimize() + pynutil.insert("\"")
    self.fst = graph.optimize()