def __init__(self, deterministic: bool = True):
    """
    Build the telephone classifier and store it in ``self.fst``.

    Besides phone numbers (optional country code, area code, seven
    digit/letter symbols, optional extension), the graph also accepts
    dotted IP-like quadruples and ``ddd-dd-dddd`` numbers, each with an
    optional prompt that is emitted in the ``country_code`` field.

    Args:
        deterministic: when False, "0" may also be read as "o" or "oh".
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    component_separator = pynutil.insert(", ")  # between components

    # Spoken form of a single digit.
    spoken_zero = pynini.cross("0", "zero")
    if not deterministic:
        spoken_zero |= pynini.cross("0", pynini.union("o", "oh"))
    digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize() | spoken_zero

    # Country code: optional prompt, optional "+", one to three digits; or a bare prompt.
    telephone_prompts = pynini.string_file(get_abs_path("data/telephone/telephone_prompt.tsv"))
    optional_prompt = pynini.closure(telephone_prompts + delete_extra_space, 0, 1)
    optional_plus = pynini.closure(pynini.cross("+", "plus "), 0, 1)
    leading_digits = pynini.closure(digit + insert_space, 0, 2)
    country_code = optional_prompt + optional_plus + leading_digits + digit + pynutil.insert(",")
    country_code |= telephone_prompts
    country_code = pynutil.insert("country_code: \"") + country_code + pynutil.insert("\"")
    country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space

    # Area code: "800" is read as a number, anything else digit by digit.
    three_spoken_digits = pynini.closure(digit + insert_space, 2, 2) + digit
    anything_but_800 = pynini.difference(NEMO_SIGMA, "800")
    area_code = pynini.cross("800", "eight hundred") | pynini.compose(anything_but_800, three_spoken_digits)
    dash_or_dot = pynutil.delete("-") | pynutil.delete(".")
    closing_paren = (pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-")
    plain_area = area_code + dash_or_dot
    parenthesized_area = pynutil.delete("(") + area_code + closing_paren
    area_part = (plain_area | parenthesized_area) + component_separator

    # Remaining part: exactly seven digits/letters, each optionally followed by a separator.
    optional_separator = pynini.closure(pynini.union("-", " ", "."), 0, 1)
    seven_symbols = ((NEMO_DIGIT + optional_separator) | (NEMO_ALPHA + optional_separator)) ** 7

    def words_split_on(separator):
        # One consistent separator ("-" or ".") per alternative.
        return pynini.closure(
            (NEMO_DIGIT @ digit) + (insert_space | (pynini.cross(separator, ', ')))
            | NEMO_ALPHA
            | (NEMO_ALPHA + pynini.cross(separator, ' '))
        )

    number_words = words_split_on("-") | words_split_on(".")
    number_words = pynini.compose(seven_symbols, number_words)

    number_part = area_part + number_words
    number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"")

    extension = (
        pynutil.insert("extension: \"") + pynini.closure(digit + insert_space, 0, 3) + digit + pynutil.insert("\"")
    )
    extension = pynini.closure(insert_space + extension, 0, 1)

    # Each later priority union takes precedence over the graph built so far.
    graph = plurals._priority_union(country_code + number_part, number_part, NEMO_SIGMA).optimize()
    for preferred in (country_code + number_part + extension, number_part + extension):
        graph = plurals._priority_union(preferred, graph, NEMO_SIGMA).optimize()

    def with_optional_prompt(prompt_file, body):
        # Optional prompt goes into `country_code`, the number itself into `number_part`.
        prompts = pynini.string_file(get_abs_path(prompt_file))
        tagged_prompt = (
            pynutil.insert("country_code: \"") + prompts + pynutil.insert("\"") + delete_extra_space
        )
        return (
            pynini.closure(tagged_prompt, 0, 1)
            + pynutil.insert("number_part: \"")
            + body.optimize()
            + pynutil.insert("\"")
        )

    # ip
    up_to_three_digits = digit + pynini.closure(pynutil.insert(" ") + digit, 0, 2)
    ip_graph = up_to_three_digits + (pynini.cross(".", " dot ") + up_to_three_digits) ** 3
    graph |= with_optional_prompt("data/telephone/ip_prompt.tsv", ip_graph)

    # ssn
    three_digit_part = digit + (pynutil.insert(" ") + digit) ** 2
    two_digit_part = digit + pynutil.insert(" ") + digit
    four_digit_part = digit + (pynutil.insert(" ") + digit) ** 3
    ssn_separator = pynini.cross("-", ", ")
    ssn_graph = three_digit_part + ssn_separator + two_digit_part + ssn_separator + four_digit_part
    graph |= with_optional_prompt("data/telephone/ssn_prompt.tsv", ssn_graph)

    self.fst = self.add_tokens(graph).optimize()
def get_address_graph(self, cardinal):
    """
    Build a transducer for street addresses such as
        2788 San Tomas Expy, Santa Clara, CA 95051

    The accepted shape is: house number, optional one-letter direction,
    street name ending in a known street-type word, then optionally
    ", city", ", state" and a five digit zip code.

    Args:
        cardinal: cardinal tagger; its ``graph``, ``single_digits_graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit`` are used.

    Returns:
        the address FST (without any surrounding token fields)
    """
    # Ordinal street names: tag, then verbalize.
    ordinal_verbalizer = OrdinalVerbalizer().graph
    ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
    tagged_ordinal = pynutil.insert("integer: \"") + ordinal_tagger + pynutil.insert("\"")
    ordinal_num = pynini.compose(tagged_ordinal, ordinal_verbalizer)

    # House numbers of 3-4 digits are read as two groups; the last two digits
    # form the second group, with an optional leading "zero ".
    hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit
    leading_group = NEMO_DIGIT ** (1, 2) @ hundred_component
    trailing_group = NEMO_DIGIT ** 2 @ (pynini.closure(pynini.cross("0", "zero "), 0, 1) + hundred_component)
    grouped_number = leading_group + (insert_space + trailing_group)
    grouped_number = pynini.compose(NEMO_DIGIT ** (3, 4), grouped_number)
    # to handle the rest of the numbers
    address_num = plurals._priority_union(grouped_number, cardinal.graph, NEMO_SIGMA)

    # Optional " E" / " S." style direction.
    compass = (("E", "East"), ("S", "South"), ("W", "West"), ("N", "North"))
    direction = pynini.union(*[pynini.cross(short, full) for short, full in compass])
    direction = direction + pynini.closure(pynutil.delete("."), 0, 1)
    direction = pynini.closure(pynini.accep(NEMO_SPACE) + direction, 0, 1)

    # Street name: optional ordinal or a capitalized word, further capitalized
    # words, and finally a street-type word from the data file.
    street_types = get_formats(get_abs_path("data/address/address_word.tsv"))
    capitalized_word = NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1)
    first_word = pynini.closure(ordinal_num, 0, 1) | capitalized_word
    more_words = pynini.closure(NEMO_UPPER + pynini.closure(NEMO_ALPHA) + NEMO_SPACE)
    address_words = pynini.accep(NEMO_SPACE) + first_word + NEMO_SPACE + more_words + street_types

    city = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
    city = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + city, 0, 1)

    # Also accept the second column with a period after its first character.
    states = load_labels(get_abs_path("data/address/state.tsv"))
    states.extend([(x, f"{y[0]}.{y[1:]}") for x, y in states])
    state = pynini.invert(pynini.string_map(states))
    state = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + state, 0, 1)

    zip_code = pynini.compose(NEMO_DIGIT ** 5, cardinal.single_digits_graph)
    zip_code = pynini.closure(
        pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code,
        0,
        1,
    )

    street = address_num + direction + address_words
    address = street + pynini.closure(city + state + zip_code, 0, 1)
    address |= street + pynini.closure(pynini.cross(".", ""), 0, 1)
    return address
# Test, for your convinience # If you have completed the above FSTs, the following asserts should not fail # Feel free to comment them out while developing the program assert (sorted_outputs("1" * numbers_to_words) == ["one"]) assert (sorted_outputs("0" * numbers_to_words) == ["zero"]) assert (sorted_outputs("10" * numbers_to_words) == ["ten"]) assert (sorted_outputs("11" * numbers_to_words) == ["eleven"]) assert (sorted_outputs("21" * numbers_to_words) == ["twenty one"]) assert (sorted_outputs("121" * numbers_to_words) == [ "hundred twenty one", "one hundred twenty one" ]) assert (sorted_outputs("12.23" * numbers_to_words) == ["twelve point two three"]) invert_ultimate = pn.invert(ultimate) invert_ultimate = pn.invert(ultimate) * pn.invert(f) # Now, the interactive program while True: try: number = raw_input( "Please enter a number or '-r' for inverted behaviour (Ctrl-C to exit): " ) if number.startswith("-r"): number = raw_input("Please write out a number (Ctrl-C to exit): ") print("Result in numbers") print(sorted_outputs(number * invert_ultimate)) else: print("Result in factorized form")
from nemo_text_processing.text_normalization.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, NEMO_SIGMA, TO_LOWER, GraphFst, delete_extra_space, delete_space, insert_space, ) try: import pynini from pynini.lib import pynutil graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/numbers/teen.tsv"))).optimize() graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize() ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/numbers/ties.tsv"))).optimize() PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Add placeholders for global variables graph_teen = None graph_digit = None ties_graph = None PYNINI_AVAILABLE = True def get_ties_graph(deterministic: bool = True): """
def __init__(self, deterministic: bool = True):
    """
    Build the cardinal classifier and store it in ``self.fst``.

    Args:
        deterministic: when False, additional readings are added: "oh"/"o"
            for "0", the hundreds graph, comma-grouped digit strings and
            number ranges.
    """
    # NOTE(review): the nesting of the `if not deterministic` blocks was
    # reconstructed from data flow (the source had lost its indentation) —
    # confirm against the upstream file.
    super().__init__(name="cardinal",
                     kind="classify",
                     deterministic=deterministic)
    # Number-name grammar read from a FAR archive.
    graph = pynini.Far(
        get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
    # Two or three digits, or a single non-zero digit, read through `graph`.
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3)
        | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
    # 1-3 digits followed by any number of 3-digit groups; a comma in front
    # of a group is optional and deleted.
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
        pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT +
        NEMO_DIGIT)) @ graph
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    single_digits_graph = pynini.invert(graph_digit | graph_zero)
    # Digit-by-digit reading with a space inserted between digits.
    self.single_digits_graph = single_digits_graph + pynini.closure(
        pynutil.insert(" ") + single_digits_graph)
    if not deterministic:
        # Additionally allow "oh" and "o" for "0".
        single_digits_graph = (pynini.invert(graph_digit | graph_zero)
                               | pynini.cross("0", "oh")
                               | pynini.cross("0", "o"))
        self.single_digits_graph = single_digits_graph + pynini.closure(
            pynutil.insert(" ") + single_digits_graph)
        # Digit-by-digit reading of comma-grouped numbers; commas are deleted.
        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + pynutil.insert(" "), 1,
            3) + pynini.closure(
                pynutil.delete(",") + single_digits_graph +
                pynutil.insert(" ") + single_digits_graph +
                pynutil.insert(" ") + single_digits_graph,
                1,
            )
        self.graph = (self.graph
                      | self.single_digits_graph
                      | get_hundreds_graph()
                      | pynutil.add_weight(single_digits_graph_with_commas,
                                           0.001))
        # "a-b" -> "[from ]a to b" or "a b"
        self.range_graph = (
            pynini.closure(pynutil.insert("from "), 0, 1) + self.graph +
            (pynini.cross("-", " to ") | pynini.cross("-", " ")) +
            self.graph)
        # "axb" / "a x b" -> "a by b"
        self.range_graph |= self.graph + (pynini.cross(
            "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
        self.range_graph = self.range_graph.optimize()
    # A leading "-" becomes the field `negative: "true" `.
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
    # Five or more digits read digit by digit; given a small negative weight.
    long_numbers = pynini.compose(NEMO_DIGIT**(5, ...),
                                  self.single_digits_graph).optimize()
    final_graph = self.graph | self.get_serial_graph(
    ) | pynutil.add_weight(long_numbers, -0.001)
    if not deterministic:
        final_graph |= self.range_graph
    # Delete leading zeros and read the remaining digits as a number.
    remove_leading_zeros = pynini.closure(
        pynutil.delete("0"), 1) + pynini.compose(
            pynini.closure(NEMO_DIGIT, 1), self.graph)
    final_graph |= remove_leading_zeros
    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + final_graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def _get_month_graph():
    """Return the optimized inverse of the mapping stored in data/months.tsv."""
    months = pynini.string_file(get_abs_path("data/months.tsv"))
    return pynini.invert(months).optimize()
def __init__(self):
    """
    Build the time classifier (spoken form -> tagged fields) and store it
    in ``self.fst``.

    Emitted fields: hours, minutes, and optionally suffix and zone.
    """
    super().__init__(name="time", kind="classify")
    # hours, minutes, seconds, suffix, zone, style, speak_period

    def quoted(field, body):
        # field: "<body>"
        return pynutil.insert(field + ": \"") + body + pynutil.insert("\"")

    suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
    time_zone_graph = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
    time_to_graph = pynini.string_file(get_abs_path("data/time/time_to.tsv"))

    # only used for < 1000 thousand -> 0 weight
    cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

    def spoken_range(start, stop):
        # Number words for range(start, stop), mapped through the cardinal graph.
        return pynini.union(*[num_to_word(x) for x in range(start, stop)]) @ cardinal

    graph_hour = spoken_range(0, 24)
    graph_minute_single = spoken_range(1, 10)
    graph_minute_double = spoken_range(10, 60)
    graph_minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15")
    oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

    final_graph_hour = quoted("hours", graph_hour)
    graph_minute = pynini.union(
        oclock + pynutil.insert("00"),
        pynutil.delete("o") + delete_space + graph_minute_single,
        graph_minute_double,
    )

    final_suffix = quoted("suffix", convert_space(suffix_graph))
    final_suffix_optional = pynini.closure(delete_space + insert_space + final_suffix, 0, 1)
    final_time_zone_optional = pynini.closure(
        delete_space + insert_space + quoted("zone", convert_space(time_zone_graph)),
        0,
        1,
    )

    # five o' clock
    # two o eight, two thirty five (am/pm)
    # two pm/am
    graph_hm = final_graph_hour + delete_extra_space + quoted("minutes", graph_minute)

    # 10 past four, quarter past four, half past four
    minutes_before_past = pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose)
    graph_mh = (
        quoted("minutes", minutes_before_past)
        + delete_space
        + pynutil.delete("past")
        + delete_extra_space
        + final_graph_hour
    )

    # quarter to/till <hour>: minutes become 45, the hour goes through time_to.tsv
    graph_quarter_time = (
        quoted("minutes", pynini.cross("quarter", "45"))
        + delete_space
        + pynutil.delete(pynini.union("to", "till"))
        + delete_extra_space
        + quoted("hours", time_to_graph)
    )

    # hour with mandatory suffix; minutes default to "00" when absent
    graph_h = (
        final_graph_hour
        + delete_extra_space
        + quoted("minutes", pynutil.insert("00") | graph_minute)
        + delete_space
        + insert_space
        + final_suffix
        + final_time_zone_optional
    )

    final_graph = (graph_hm | graph_mh | graph_quarter_time) + final_suffix_optional + final_time_zone_optional
    final_graph |= graph_h

    self.fst = self.add_tokens(final_graph).optimize()
## Vocabulary: language models read from disk.
lm_char = pynini.Fst.read("t9.char.lm")
lm_word = pynini.Fst.read("t9.word.lm")

## Key "0" produces byte 32.
t9 = pynini.transducer("0", "[32]")
t9_relations = [
    "0", "1", "2abc", "3def", "4ghi", "5jkl", "6mno", "7pqrs", "8tuv", "9wxyz"
]

## Reading vocabulary into alphabet: every key maps to each symbol listed for it.
for key, symbols in enumerate(t9_relations):
    for symbol in symbols:
        t9 = pynini.union(pynini.transducer(str(key), str(symbol)), t9)

## Adding punctuation to vocabulary: all of it is typed with key "1".
for mark in string.punctuation:
    t9 = t9 | pynini.transducer("1", "[{}]".format(ord(mark)))

## Closure and optimization
t9.closure().optimize()

## Inversion for decoding: symbols -> keys.
encoder = pynini.invert(t9).optimize()


def encode(message):
    """Return the lower-cased *message* composed with `encoder`, as a string."""
    keyed = message.lower() * encoder
    return keyed.stringify()


def decode(message):
    """
    Compose *message* with `t9`, project the result, compose that with the
    character language model and return the shortest path as a string.
    """
    candidates = (message * t9).project(True)
    lattice = candidates * lm_char
    best = pynini.shortestpath(lattice)
    return best.stringify()
# _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z") _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") _s = NEMO_SIGMA + pynutil.insert("s") graph_plural = plurals._priority_union( suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA).optimize() SINGULAR_TO_PLURAL = graph_plural PLURAL_TO_SINGULAR = pynini.invert(graph_plural) TO_LOWER = pynini.union(*[ pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase) ]) TO_UPPER = pynini.invert(TO_LOWER) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Create placeholders NEMO_CHAR = None NEMO_DIGIT = None NEMO_LOWER = None NEMO_UPPER = None NEMO_ALPHA = None
def __init__(self, deterministic: bool = True, lm: bool = False):
    """
    Build the cardinal classifier and store it in ``self.fst``.

    Args:
        deterministic: when True, numbers of five or more digits are read
            digit by digit with priority over the regular reading; when
            False, several alternative readings are offered side by side.
        lm: stored on the instance as ``self.lm``.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
    self.lm = lm
    self.deterministic = deterministic

    # TODO replace to have "oh" as a default for "0"
    number_names = pynini.Far(get_abs_path("data/number/cardinal_number_name.far")).get_fst()

    # Two or three digits, or a single non-zero digit.
    hundred_input = pynini.closure(NEMO_DIGIT, 2, 3) | pynini.difference(NEMO_DIGIT, pynini.accep("0"))
    self.graph_hundred_component_at_least_one_none_zero_digit = hundred_input @ number_names

    graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
    single_digits_graph = pynini.invert(graph_digit | graph_zero)

    def spaced_run(one_digit):
        # One or more digits, read one at a time with spaces inserted between.
        return one_digit + pynini.closure(insert_space + one_digit)

    self.single_digits_graph = spaced_run(single_digits_graph)

    if not deterministic:
        # for a single token allow only the same normalization
        # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
        digits_with_zero = pynini.invert(graph_digit | graph_zero)
        digits_with_oh = pynini.invert(graph_digit) | pynini.cross("0", "oh")
        self.single_digits_graph = spaced_run(digits_with_zero)
        self.single_digits_graph |= spaced_run(digits_with_oh)

        # Digit-by-digit reading of comma-grouped numbers; commas are deleted.
        comma_group = (
            pynutil.delete(",")
            + single_digits_graph
            + insert_space
            + single_digits_graph
            + insert_space
            + single_digits_graph
        )
        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + insert_space, 1, 3
        ) + pynini.closure(comma_group, 1)

    # A leading "-" becomes the field `negative: "true" `.
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    # 1-3 digits followed by 3-digit groups that are either all
    # comma-separated (commas deleted) or all unseparated.
    with_commas = pynini.closure(pynutil.delete(",") + NEMO_DIGIT ** 3)
    without_commas = pynini.closure(NEMO_DIGIT ** 3)
    graph = (pynini.closure(NEMO_DIGIT, 1, 3) + (with_commas | without_commas)) @ number_names

    self.graph = graph
    self.graph_with_and = self.add_optional_and(graph)

    if not deterministic:
        leading_zeros = pynini.compose(pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
        cardinal_with_leading_zeros = (
            leading_zeros
            + pynutil.insert(" ")
            + pynini.compose(pynini.closure(NEMO_DIGIT), self.graph_with_and)
        )
        # add small weight to non-default graphs to make sure the deterministic option is listed first
        final_graph = (
            self.graph_with_and
            | pynutil.add_weight(self.single_digits_graph, 0.0001)
            | get_four_digit_year_graph()  # allows e.g. 4567 be pronounced as forty five sixty seven
            | pynutil.add_weight(single_digits_graph_with_commas, 0.0001)
            | cardinal_with_leading_zeros
        )
    else:
        long_numbers = pynini.compose(NEMO_DIGIT ** (5, ...), self.single_digits_graph).optimize()
        final_graph = plurals._priority_union(long_numbers, self.graph_with_and, NEMO_SIGMA).optimize()
        # Anything starting with "0" is read digit by digit.
        cardinal_with_leading_zeros = pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT), self.single_digits_graph
        )
        final_graph |= cardinal_with_leading_zeros

    final_graph = optional_minus_graph + pynutil.insert("integer: \"") + final_graph + pynutil.insert("\"")
    self.fst = self.add_tokens(final_graph).optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the cardinal classifier and store it in ``self.fst``.

    Args:
        deterministic: when True, the regular reading, the serial graph and
            digit-by-digit readings (for 5+ digits and for numbers starting
            with "0") are combined; when False, ranges, the hundreds graph
            and further digit-by-digit variants are added as alternatives.
    """
    # NOTE(review): the extent of the first `if not deterministic` block was
    # reconstructed from data flow (the source had lost its indentation) —
    # confirm against the upstream file.
    super().__init__(name="cardinal",
                     kind="classify",
                     deterministic=deterministic)
    # TODO replace to have "oh" as a default for "0"
    graph = pynini.Far(
        get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
    # Two or three digits, or a single non-zero digit, read through `graph`.
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3)
        | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
    # 1-3 digits followed by any number of 3-digit groups; a comma in front
    # of a group is optional and deleted.
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
        pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT +
        NEMO_DIGIT)) @ graph
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    single_digits_graph = pynini.invert(graph_digit | graph_zero)
    # Digit-by-digit reading with a space inserted between digits.
    self.single_digits_graph = single_digits_graph + pynini.closure(
        insert_space + single_digits_graph)
    if not deterministic:
        # for a single token allow only the same normalization
        # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
        single_digits_graph_zero = pynini.invert(graph_digit | graph_zero)
        single_digits_graph_oh = pynini.invert(graph_digit) | pynini.cross(
            "0", "oh")
        self.single_digits_graph = single_digits_graph_zero + pynini.closure(
            insert_space + single_digits_graph_zero)
        self.single_digits_graph |= single_digits_graph_oh + pynini.closure(
            insert_space + single_digits_graph_oh)
        # Digit-by-digit reading of comma-grouped numbers; commas are deleted.
        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + insert_space, 1, 3) + pynini.closure(
                pynutil.delete(",") + single_digits_graph + insert_space +
                single_digits_graph + insert_space + single_digits_graph,
                1,
            )
        # "a-b" -> "from a to b"
        self.range_graph = pynutil.insert(
            "from ") + self.graph + pynini.cross("-", " to ") + self.graph
        # "axb" / "a x b" -> "a by b"
        self.range_graph |= self.graph + (pynini.cross(
            "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
        # the same "from ... to ..." pattern over the hundreds graph
        self.range_graph |= (pynutil.insert("from ") + get_hundreds_graph() +
                             pynini.cross("-", " to ") + get_hundreds_graph())
        self.range_graph = self.range_graph.optimize()
    serial_graph = self.get_serial_graph()
    # A leading "-" becomes the field `negative: "true" `.
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
    if deterministic:
        # Five or more digits read digit by digit; given a small negative weight.
        long_numbers = pynini.compose(NEMO_DIGIT**(5, ...),
                                      self.single_digits_graph).optimize()
        final_graph = self.graph | serial_graph | pynutil.add_weight(
            long_numbers, -0.001)
        # Anything starting with "0" is read digit by digit.
        cardinal_with_leading_zeros = pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT),
            self.single_digits_graph)
        final_graph |= cardinal_with_leading_zeros
    else:
        # Leading zeros digit by digit, then the remainder as a regular number.
        leading_zeros = pynini.compose(
            pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
        cardinal_with_leading_zeros = (
            leading_zeros + pynutil.insert(" ") +
            pynini.compose(pynini.closure(NEMO_DIGIT), self.graph))
        final_graph = (self.graph
                       | serial_graph
                       | self.range_graph
                       | self.single_digits_graph
                       | get_hundreds_graph()
                       | pynutil.add_weight(
                           single_digits_graph_with_commas, 0.001)
                       | cardinal_with_leading_zeros)
    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + final_graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """
    Build the time classifier (written form -> tagged fields) and store it
    in ``self.fst``.

    Args:
        cardinal: cardinal tagger; ``cardinal.graph`` verbalizes the numbers.
        deterministic: when True, " preserve_order: true" is appended to
            every output; when False, shifted hour/minute readings with a
            `style` field are added as alternatives.
    """
    super().__init__(name="time",
                     kind="classify",
                     deterministic=deterministic)
    # "." or ":" between hours, minutes and seconds
    delete_time_delimiter = pynutil.delete(pynini.union(".", ":"))
    # Rewrite "un"/"ún" to "una" in the cardinal output.
    one = pynini.string_map([("un", "una"), ("ún", "una")])
    change_one = pynini.cdrewrite(one, "", "", NEMO_SIGMA)
    cardinal_graph = cardinal.graph @ change_one
    # NOTE(review): `suffix` and `time_zone_graph` are not defined in this
    # method — presumably module-level graphs; confirm.
    day_suffix = pynutil.insert("suffix: \"") + suffix + pynutil.insert(
        "\"")
    day_suffix = delete_space + insert_space + day_suffix
    # Unit markers "h", "min", "s": deleted, with the preceding space normalized.
    delete_hora_suffix = delete_space + insert_space + pynutil.delete("h")
    delete_minute_suffix = delete_space + insert_space + pynutil.delete(
        "min")
    delete_second_suffix = delete_space + insert_space + pynutil.delete(
        "s")
    labels_hour_24 = [
        str(x) for x in range(0, 25)
    ]  # Can see both systems. Twelve hour requires am/pm for ambiguity resolution
    labels_hour_12 = [str(x) for x in range(1, 13)]
    labels_minute_single = [str(x) for x in range(1, 10)]
    labels_minute_double = [str(x) for x in range(10, 60)]
    # One or two digits; a leading "0" is deleted ("07" -> "7").
    delete_leading_zero_to_double_digit = (
        pynini.closure(pynutil.delete("0") | (NEMO_DIGIT - "0"), 0, 1) +
        NEMO_DIGIT)
    graph_24 = (pynini.closure(NEMO_DIGIT, 1, 2)
                @ delete_leading_zero_to_double_digit
                @ pynini.union(*labels_hour_24))
    graph_12 = (pynini.closure(NEMO_DIGIT, 1, 2)
                @ delete_leading_zero_to_double_digit
                @ pynini.union(*labels_hour_12))
    graph_hour_24 = graph_24 @ cardinal_graph
    graph_hour_12 = graph_12 @ cardinal_graph
    graph_minute_single = delete_leading_zero_to_double_digit @ pynini.union(
        *labels_minute_single)
    graph_minute_double = pynini.union(*labels_minute_double)
    graph_minute = pynini.union(graph_minute_single,
                                graph_minute_double) @ cardinal_graph
    # Hour-only forms need an explicit marker: "h" (24h) or a day suffix (12h).
    final_graph_hour_only_24 = (pynutil.insert("hours: \"") + graph_hour_24 +
                                pynutil.insert("\"") + delete_hora_suffix)
    final_graph_hour_only_12 = pynutil.insert(
        "hours: \"") + graph_hour_12 + pynutil.insert("\"") + day_suffix
    final_graph_hour_24 = pynutil.insert(
        "hours: \"") + graph_hour_24 + pynutil.insert("\"")
    final_graph_hour_12 = pynutil.insert(
        "hours: \"") + graph_hour_12 + pynutil.insert("\"")
    final_graph_minute = pynutil.insert(
        "minutes: \"") + graph_minute + pynutil.insert("\"")
    # Seconds reuse the minute graph (same 1-59 range).
    final_graph_second = pynutil.insert(
        "seconds: \"") + graph_minute + pynutil.insert("\"")
    final_time_zone_optional = pynini.closure(
        delete_space + insert_space + pynutil.insert("zone: \"") +
        time_zone_graph + pynutil.insert("\""),
        0,
        1,
    )
    # 02.30 h
    graph_hm = (
        final_graph_hour_24 + delete_time_delimiter +
        (pynutil.delete("00") | (insert_space + final_graph_minute)) +
        pynini.closure(
            delete_time_delimiter +
            (pynini.cross("00", " seconds: \"0\"") |
             (insert_space + final_graph_second)),
            0,
            1,
        )  # For seconds 2.30.35 h
        + pynini.closure(delete_hora_suffix, 0,
                         1)  # 2.30 is valid if unambiguous
        + final_time_zone_optional)
    # 2 h 30 min
    graph_hm |= (
        final_graph_hour_24 + delete_hora_suffix + delete_space +
        (pynutil.delete("00") | (insert_space + final_graph_minute)) +
        delete_minute_suffix + pynini.closure(
            delete_space + (pynini.cross("00", " seconds: \"0\"") |
                            (insert_space + final_graph_second)) +
            delete_second_suffix,
            0,
            1,
        )  # For seconds
        + final_time_zone_optional)
    # 2.30 a. m. (Only for 12 hour clock)
    graph_hm |= (
        final_graph_hour_12 + delete_time_delimiter +
        (pynutil.delete("00") | (insert_space + final_graph_minute)) +
        pynini.closure(
            delete_time_delimiter +
            (pynini.cross("00", " seconds: \"0\"") |
             (insert_space + final_graph_second)),
            0,
            1,
        )  # For seconds 2.30.35 a. m.
        + day_suffix + final_time_zone_optional)
    graph_h = (
        pynini.union(final_graph_hour_only_24, final_graph_hour_only_12) +
        final_time_zone_optional
    )  # Should always have a time indicator, else we'll pass to cardinals
    if not deterministic:
        # This includes alternate vocalization (hour menos min, min para hour), here we shift the times and indicate a `style` tag
        hour_shift_24 = pynini.invert(
            pynini.string_file(get_abs_path("data/time/hour_to_24.tsv")))
        hour_shift_12 = pynini.invert(
            pynini.string_file(get_abs_path("data/time/hour_to_12.tsv")))
        minute_shift = pynini.string_file(
            get_abs_path("data/time/minute_to.tsv"))
        graph_hour_to_24 = graph_24 @ hour_shift_24 @ cardinal_graph
        graph_hour_to_12 = graph_12 @ hour_shift_12 @ cardinal_graph
        graph_minute_to = pynini.union(
            graph_minute_single,
            graph_minute_double) @ minute_shift @ cardinal_graph
        final_graph_hour_to_24 = pynutil.insert(
            "hours: \"") + graph_hour_to_24 + pynutil.insert("\"")
        final_graph_hour_to_12 = pynutil.insert(
            "hours: \"") + graph_hour_to_12 + pynutil.insert("\"")
        final_graph_minute_to = pynutil.insert(
            "minutes: \"") + graph_minute_to + pynutil.insert("\"")
        # Either style tag may be appended to a shifted reading.
        graph_menos = pynutil.insert(" style: \"1\"")
        graph_para = pynutil.insert(" style: \"2\"")
        final_graph_style = graph_menos | graph_para
        # 02.30 h (omitting seconds since a bit awkward)
        graph_hm |= (
            final_graph_hour_to_24 + delete_time_delimiter + insert_space +
            final_graph_minute_to + pynini.closure(
                delete_hora_suffix, 0, 1)  # 2.30 is valid if unambiguous
            + final_time_zone_optional + final_graph_style)
        # 2 h 30 min
        graph_hm |= (final_graph_hour_to_24 + delete_hora_suffix +
                     delete_space + insert_space + final_graph_minute_to +
                     delete_minute_suffix + final_time_zone_optional +
                     final_graph_style)
        # 2.30 a. m. (Only for 12 hour clock)
        graph_hm |= (final_graph_hour_to_12 + delete_time_delimiter +
                     insert_space + final_graph_minute_to + day_suffix +
                     final_time_zone_optional + final_graph_style)
    final_graph = graph_hm | graph_h
    if deterministic:
        final_graph = final_graph + pynutil.insert(" preserve_order: true")
    final_graph = final_graph.optimize()
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
NEMO_DIGIT, NEMO_SIGMA, NEMO_SPACE, NEMO_WHITE_SPACE, GraphFst, delete_space, insert_space, ) from nemo_text_processing.text_normalization.es.graph_utils import cardinal_separator from nemo_text_processing.text_normalization.es.utils import get_abs_path try: import pynini from pynini.lib import pynutil zero = pynini.invert( pynini.string_file(get_abs_path("data/numbers/zero.tsv"))) digit = pynini.invert( pynini.string_file(get_abs_path("data/numbers/digit.tsv"))) teen = pynini.invert( pynini.string_file(get_abs_path("data/numbers/teen.tsv"))) ties = pynini.invert( pynini.string_file(get_abs_path("data/numbers/ties.tsv"))) twenties = pynini.invert( pynini.string_file(get_abs_path("data/numbers/twenties.tsv"))) hundreds = pynini.invert( pynini.string_file(get_abs_path("data/numbers/hundreds.tsv"))) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): zero = None
def __init__(self, cardinal_tagger: GraphFst, deterministic: bool = True):
    """
    Build the time verbalizer (tagged fields -> spoken form) and store it
    in ``self.fst``; the untokenized graph is kept in ``self.graph``.

    Args:
        cardinal_tagger: cardinal tagger; its ``two_digit_non_zero`` graph
            verbalizes hours, minutes and seconds.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="time",
                     kind="verbalize",
                     deterministic=deterministic)
    # add weight so when using inverse text normalization this conversion is deprioritized
    night_to_early = pynutil.add_weight(pynini.invert(
        pynini.string_file(
            get_abs_path("data/time/hour_to_night.tsv"))).optimize(),
                                        weight=0.0001)
    hour_to = pynini.invert(
        pynini.string_file(
            get_abs_path("data/time/hour_to.tsv"))).optimize()
    minute_to = pynini.invert(
        pynini.string_file(
            get_abs_path("data/time/minute_to.tsv"))).optimize()
    # Built from the second column of time_zone.tsv only.
    time_zone_graph = pynini.invert(
        convert_space(
            pynini.union(*[
                x[1] for x in load_labels(
                    get_abs_path("data/time/time_zone.tsv"))
            ])))
    graph_zero = pynini.invert(
        pynini.string_file(
            get_abs_path("data/numbers/zero.tsv"))).optimize()
    number_verbalization = graph_zero | cardinal_tagger.two_digit_non_zero
    # Strip the field wrapper and keep the digits.
    hour = pynutil.delete("hours: \"") + pynini.closure(
        NEMO_DIGIT, 1) + pynutil.delete("\"")
    # A verbalized hour that is exactly "eins" becomes "ein"; " uhr" is appended.
    hour_verbalized = hour @ number_verbalization @ pynini.cdrewrite(
        pynini.cross("eins", "ein"), "[BOS]", "[EOS]",
        NEMO_SIGMA) + pynutil.insert(" uhr")
    minute = pynutil.delete("minutes: \"") + pynini.closure(
        NEMO_DIGIT, 1) + pynutil.delete("\"")
    zone = pynutil.delete("zone: \"") + time_zone_graph + pynutil.delete(
        "\"")
    optional_zone = pynini.closure(pynini.accep(" ") + zone, 0, 1)
    second = pynutil.delete("seconds: \"") + pynini.closure(
        NEMO_DIGIT, 1) + pynutil.delete("\"")
    # <hour> uhr <min> minuten <sec> sekunden [zone]
    graph_hms = (hour_verbalized + pynini.accep(" ") +
                 minute @ number_verbalization + pynutil.insert(" minuten") +
                 pynini.accep(" ") + second @ number_verbalization +
                 pynutil.insert(" sekunden") + optional_zone)
    # Singular forms after "eins", at the start or after a space.
    graph_hms @= pynini.cdrewrite(
        pynini.cross("eins minuten", "eine minute")
        | pynini.cross("eins sekunden", "eine sekunde"),
        pynini.union(" ", "[BOS]"),
        "",
        NEMO_SIGMA,
    )
    # Acceptors restricting minute values to 1-30 and 1-29.
    min_30 = [str(x) for x in range(1, 31)]
    min_30 = pynini.union(*min_30)
    min_29 = [str(x) for x in range(1, 30)]
    min_29 = pynini.union(*min_29)
    graph_h = hour_verbalized
    graph_hm = hour_verbalized + pynini.accep(
        " ") + minute @ number_verbalization
    # "<min> nach <hour>", with "viertel" allowed for 15
    graph_m_past_h = (
        minute @ min_30 @ (number_verbalization
                           | pynini.cross("15", "viertel")) +
        pynini.accep(" ") + pynutil.insert("nach ")
        # + hour @ number_verbalization
        + hour @ pynini.cdrewrite(night_to_early, "[BOS]", "[EOS]",
                                  NEMO_SIGMA) @ number_verbalization)
    # "halb <hour>": the hour is additionally mapped through hour_to.tsv
    graph_m30_h = (minute @ pynini.cross("30", "halb") + pynini.accep(" ") +
                   hour @ pynini.cdrewrite(
                       night_to_early, "[BOS]", "[EOS]",
                       NEMO_SIGMA) @ hour_to @ number_verbalization)
    # "<min> vor <hour>": minutes go through minute_to.tsv, hours through hour_to.tsv
    graph_m_to_h = (
        minute @ minute_to @ min_29 @ (number_verbalization
                                       | pynini.cross("15", "viertel")) +
        pynini.accep(" ") + pynutil.insert("vor ") +
        hour @ pynini.cdrewrite(night_to_early, "[BOS]", "[EOS]",
                                NEMO_SIGMA) @ hour_to @ number_verbalization)
    # The alternative readings carry a small extra weight.
    # NOTE(review): graph_hms already ends in optional_zone, which is appended
    # again here — confirm this is intended.
    self.graph = (graph_hms
                  | graph_h
                  | graph_hm
                  | pynutil.add_weight(graph_m_past_h, weight=0.0001)
                  | pynutil.add_weight(graph_m30_h, weight=0.0001)
                  | pynutil.add_weight(graph_m_to_h,
                                       weight=0.0001)) + optional_zone
    delete_tokens = self.delete_tokens(self.graph + delete_preserve_order)
    self.fst = delete_tokens.optimize()
# Operator cheat sheet:
#   compose - *
#   concat  - +
#   union   - |
_vowel_branch = (pn.a("a") | pn.a("e")) + pn.t("a", pn.a("0").closure(0, 5))
_star_branch = pn.t(pn.a("a").star, "0") + pn.a("xxx")
fst = _vowel_branch | _star_branch
fst = fst.optimize()

# Sample random paths (fresh seed each time) and collect the distinct strings.
output_strings = {
    pn.randgen(fst, 1, random.randint(0, 100000)).stringify()
    for _ in range(10000)
}
print(len(output_strings))
for sampled in output_strings:
    print(sampled)


def top_paths(fst, count=100):
    """Return the sorted, de-duplicated second elements of up to *count* shortest paths."""
    shortest = pn.shortestpath(fst, nshortest=count)
    return sorted({path[1] for path in shortest.paths()})


print("INPUTS")
print("\t")
print(*top_paths(pn.invert(fst), 20), sep="\n\t")
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """
    Build the money classifier (spoken form -> tagged fields) and store it
    in ``self.fst``.

    Args:
        cardinal: cardinal classifier; ``graph_no_exception`` is used.
        decimal: decimal classifier; ``final_graph_wo_negative`` is used.
    """
    super().__init__(name="money", kind="classify")
    # quantity, integer_part, fractional_part, currency, style(depr)

    def quoted(field, body):
        # field: "<body>"
        return pynutil.insert(field + ": \"") + body + pynutil.insert("\"")

    cardinal_graph = cardinal.graph_no_exception
    graph_decimal_final = decimal.final_graph_wo_negative

    unit = pynini.string_file(get_abs_path("data/currency.tsv"))
    unit_singular = pynini.invert(unit)
    unit_plural = get_singulars(unit_singular)

    graph_unit_singular = quoted("currency", convert_space(unit_singular))
    graph_unit_plural = quoted("currency", convert_space(unit_plural))

    # Two digits pass through; a single digit gets a "0" inserted in front.
    add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

    # twelve dollars (and) fifty cents, zero cents
    plural_cents = (
        pynutil.add_weight(((NEMO_SIGMA - "one") @ cardinal_graph), -0.7) @ add_leading_zero_to_double_digit
        + delete_space
        + pynutil.delete("cents")
    )
    single_cent = pynini.cross("one", "01") + delete_space + pynutil.delete("cent")
    cents_standalone = quoted("fractional_part", pynini.union(plural_cents, single_cent))

    optional_and = pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
    optional_cents_standalone = pynini.closure(
        delete_space + optional_and + insert_space + cents_standalone,
        0,
        1,
    )

    # twelve dollars fifty, only after integer
    bare_cents = pynutil.add_weight(cardinal_graph @ add_leading_zero_to_double_digit, -0.7)
    optional_cents_suffix = pynini.closure(
        delete_extra_space + quoted("fractional_part", bare_cents),
        0,
        1,
    )

    optional_cents = optional_cents_standalone | optional_cents_suffix

    # Anything but "one" takes the plural unit; "one" takes the singular unit.
    graph_integer = (
        quoted("integer_part", (NEMO_SIGMA - "one") @ cardinal_graph)
        + delete_extra_space
        + graph_unit_plural
        + optional_cents
    )
    graph_integer |= (
        quoted("integer_part", pynini.cross("one", "1"))
        + delete_extra_space
        + graph_unit_singular
        + optional_cents
    )

    graph_decimal = graph_decimal_final + delete_extra_space + graph_unit_plural
    # Cents on their own: currency "$" and integer part "0" are inserted.
    graph_decimal |= pynutil.insert("currency: \"$\" integer_part: \"0\" ") + cents_standalone

    final_graph = graph_integer | graph_decimal
    self.fst = self.add_tokens(final_graph).optimize()