def __init__(self):
    """
    Build the "electronic" classifier for spelled-out e-mail addresses.

    The grammar reads `<username> at <server> dot <domain>` and emits
    `username: "..." domain: "..."`: the spaces between spelled symbols and
    the word "at" are deleted and the word "dot" is rewritten to ".".
    The tokenised, optimized transducer is stored in `self.fst`.
    """
    super().__init__(name="electronic", kind="classify")

    drop_space = pynutil.delete(" ")
    close_quote = pynutil.insert("\"")
    spoken_dot = pynini.cross("dot", ".")

    # One symbol: NEMO_ALPHA or an entry of the digit / zero tables.
    digit_words = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    zero_words = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    alpha_num = NEMO_ALPHA | digit_words | zero_words

    # Username: symbols separated by (deleted) spaces; each symbol but the
    # last may be followed by an optional "dot".
    optional_dot = pynini.closure(spoken_dot + drop_space, 0, 1)
    leading_symbols = pynini.closure(alpha_num + drop_space + optional_dot)
    username = pynutil.insert("username: \"") + leading_symbols + alpha_num + close_quote

    # Server / domain: either spelled symbol by symbol or taken from a table.
    spelled_out = pynini.closure(alpha_num + drop_space) + alpha_num
    server = spelled_out | pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
    domain = spelled_out | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
    domain_graph = (
        pynutil.insert("domain: \"") + server + drop_space + spoken_dot + drop_space + domain + close_quote
    )

    email = username + drop_space + pynutil.delete("at") + insert_space + drop_space + domain_graph

    self.fst = self.add_tokens(email).optimize()
def __init__(self, cardinal: GraphFst):
    """
    Build the "ordinal" classifier on top of a cardinal grammar.

    The ending of the input is rewritten (ordinal digit / teen tables,
    "tieth" -> "ty", "th" -> "") and the result is composed with
    `cardinal.graph_no_exception`.

    Args:
        cardinal: grammar providing the `graph_no_exception` transducer.

    Sets `self.graph` (untagged composition) and `self.fst` (the graph wrapped
    in `integer: "..."`, passed through `self.add_tokens` and optimized).
    """
    super().__init__(name="ordinal", kind="classify")

    ordinal_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
    ordinal_teen = pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))

    # Arbitrary prefix followed by exactly one ending rewrite.
    ending_rewrite = pynini.union(
        ordinal_digit,
        ordinal_teen,
        pynini.cross("tieth", "ty"),
        pynini.cross("th", ""),
    )
    ordinal_to_cardinal_words = pynini.closure(NEMO_CHAR) + ending_rewrite

    self.graph = ordinal_to_cardinal_words @ cardinal.graph_no_exception

    tagged = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self):
    """
    Build the "electronic" classifier for spelled-out e-mail addresses and urls.

    Two alternatives are unioned:
      * e-mail: `<username> at <server> dot <domain>` ->
        `username: "..." domain: "..."`
      * url: optional "h t t p" / "h t t p s" + " colon slash slash ",
        then "w w w" / "www", "dot", and further symbols ->
        `protocol: "..."`

    The tokenised, optimized transducer is stored in `self.fst`.
    """
    super().__init__(name="electronic", kind="classify")

    drop_space = pynutil.delete(" ")
    close_quote = pynutil.insert("\"")
    spoken_dot = pynini.cross("dot", ".")

    digit_words = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    zero_words = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    alpha_num = NEMO_ALPHA | digit_words | zero_words

    # symbols.tsv is inverted before use.
    symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert()
    username_symbol = alpha_num | symbols

    # --- e-mail -----------------------------------------------------------
    username = (
        pynutil.insert("username: \"")
        + alpha_num
        + pynini.closure(drop_space + username_symbol)
        + close_quote
    )

    spelled_out = pynini.closure(alpha_num + drop_space) + alpha_num
    server = spelled_out | pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
    domain = spelled_out | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
    domain_graph = (
        pynutil.insert("domain: \"") + server + drop_space + spoken_dot + drop_space + domain + close_quote
    )

    graph = username + drop_space + pynutil.delete("at") + insert_space + drop_space + domain_graph

    # --- url --------------------------------------------------------------
    url_www = pynini.cross(pynini.union("w w w", "www"), "www")
    url_scheme = (
        pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")
    ) + pynini.cross(" colon slash slash ", "://")

    # A symbol followed by either a known/spelled domain or a run of
    # username symbols (e.g. the ".com" part).
    ending_tail = domain | pynini.closure(username_symbol + drop_space) + username_symbol
    ending = drop_space + symbols + drop_space + ending_tail

    url = (
        pynini.closure(url_scheme, 0, 1)
        + url_www
        + drop_space
        + spoken_dot
        + pynini.closure(drop_space + username_symbol, 1)
        + pynini.closure(ending, 1)
    )
    url = pynutil.insert("protocol: \"") + url + close_quote
    graph |= url

    self.fst = self.add_tokens(graph).optimize()
def __init__(self):
    """
    Build the "telephone" classifier for ten-digit numbers.

    The pattern is first written in the digits-to-words direction: an area
    code that is either followed by "-" or wrapped in parentheses, then
    "ddd-dddd", with every "-" deleted. It is then inverted, so the final
    graph reads the worded form and emits `number_part: "..."`.
    The tokenised, optimized transducer is stored in `self.fst`.
    """
    super().__init__(name="telephone", kind="classify")

    drop_space = pynutil.delete(' ')
    component_gap = pynutil.insert(" ")  # placed between the number components
    dash = pynutil.delete("-")

    # Digit -> word(s); "0" may be read as "o", "oh" or "zero".
    table_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    digit = table_digit | pynini.cross("0", pynini.union("o", "oh", "zero"))

    def _digits(count):
        # `count` digits, with `insert_space` applied after all but the last.
        return pynini.closure(digit + insert_space, count - 1, count - 1) + digit

    area_with_dash = _digits(3) + dash
    area_in_parentheses = (
        pynutil.delete("(")
        + _digits(3)
        + pynutil.delete(")")
        + pynini.closure(dash, 0, 1)
        + drop_space
    )

    digits_to_words = (
        (area_with_dash | area_in_parentheses)
        + component_gap
        + _digits(3)
        + dash
        + component_gap
        + _digits(4)
    )

    tagged = pynutil.insert("number_part: \"") + pynini.invert(digits_to_words) + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self):
    """
    Build the "whitelist" classifier.

    The inverted contents of `data/whitelist.tsv` are passed through
    `convert_space` and wrapped in `name: "..."`. Unlike the other
    classifiers in this file, the graph is stored in `self.fst` without
    going through `self.add_tokens`.
    """
    super().__init__(name="whitelist", kind="classify")

    table_path = get_abs_path("data/whitelist.tsv")
    replacements = pynini.string_file(table_path).invert()

    tagged = pynutil.insert("name: \"") + convert_space(replacements) + pynutil.insert("\"")
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst):
    """
    Build the "decimal" classifier.

    Reads an optional "minus", an optional integer part, the word "point" and
    a sequence of spoken digits, and emits
    `negative: "true" integer_part: "..." fractional_part: "..."`. A second
    alternative produced by `get_quantity` is unioned in.

    Args:
        cardinal: cardinal grammar. Its `graph_no_exception` supplies the
            integer part, and its
            `graph_hundred_component_at_least_one_none_zero_digit` is passed
            on to `get_quantity`.

    Attributes set:
        graph: the fractional digit sequence (spoken digits -> digit string).
        final_graph_wo_negative: unsigned decimal, with or without quantity.
        fst: tokenised, optimized classifier.
    """
    super().__init__(name="decimal", kind="classify")

    cardinal_graph = cardinal.graph_no_exception

    # One fractional digit: digit table, zero table, or "o" read as "0".
    graph_decimal = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_decimal |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")) | pynini.cross("o", "0")
    # One or more such digits, separated by `delete_space`.
    graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal
    self.graph = graph_decimal

    point = pynutil.delete("point")

    optional_graph_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"true\"") + delete_extra_space, 0, 1
    )

    graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"")
    graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")

    # The integer part is optional, so "point five" is accepted as well.
    final_graph_wo_sign = (
        pynini.closure(graph_integer + delete_extra_space, 0, 1)
        + point
        + delete_extra_space
        + graph_fractional
    )

    # Previously this graph was constructed twice with identical arguments;
    # build it once and reuse it for both the signed and unsigned variants.
    # NOTE(review): assumes get_quantity is a pure builder - confirm.
    quantity_graph = get_quantity(
        final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit
    )

    self.final_graph_wo_negative = final_graph_wo_sign | quantity_graph

    final_graph = optional_graph_negative + final_graph_wo_sign
    final_graph |= optional_graph_negative + quantity_graph

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """
    Build the "measure" classifier: a number followed by a unit.

    Three alternatives are unioned:
      * decimal + plural unit   -> `decimal { ... } units: "..."`
      * cardinal other than "one" + plural unit -> `cardinal { integer: "..." } units: "..."`
      * "one" + singular unit   -> `cardinal { integer: "1" } units: "..."`
    A unit may be a plain unit, "per <unit>" (emitted with a leading "/"),
    or a unit followed by "per <unit>" (the latter with weight 0.01).

    Args:
        cardinal: grammar providing `graph_no_exception`.
        decimal: grammar providing `final_graph_wo_negative`.
    """
    super().__init__(name="measure", kind="classify")

    cardinal_graph = cardinal.graph_no_exception
    close_quote = pynutil.insert("\"")

    # measurements.tsv is inverted; per the original notes the result maps
    # singular -> abbreviation, and get_singulars yields plural -> abbreviation.
    singular_to_abbr = pynini.invert(pynini.string_file(get_abs_path("data/measurements.tsv")))
    plural_to_abbr = get_singulars(singular_to_abbr)

    optional_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"true\"") + delete_extra_space, 0, 1
    )

    singular_unit = convert_space(singular_to_abbr)
    plural_unit = convert_space(plural_to_abbr)
    per_unit = pynutil.insert("/") + pynutil.delete("per") + delete_space + convert_space(singular_to_abbr)

    def _units_field(unit):
        # `units: "..."` holding a unit, a "per" unit, or both in sequence.
        return (
            pynutil.insert("units: \"")
            + (unit | per_unit | pynutil.add_weight(unit + delete_space + per_unit, 0.01))
            + close_quote
        )

    def _cardinal_entry(integer, units):
        # `cardinal { [negative] integer: "..." }` followed by the units field.
        return (
            pynutil.insert("cardinal { ")
            + optional_negative
            + pynutil.insert("integer: \"")
            + integer
            + close_quote
            + pynutil.insert(" }")
            + delete_extra_space
            + units
        )

    units_singular = _units_field(singular_unit)
    units_plural = _units_field(plural_unit)

    decimal_entry = (
        pynutil.insert("decimal { ")
        + optional_negative
        + decimal.final_graph_wo_negative
        + pynutil.insert(" }")
        + delete_extra_space
        + units_plural
    )

    cardinal_entry = _cardinal_entry((NEMO_SIGMA - "one") @ cardinal_graph, units_plural)
    cardinal_entry |= _cardinal_entry(pynini.cross("one", "1"), units_singular)

    combined = decimal_entry | cardinal_entry
    self.fst = self.add_tokens(combined).optimize()
def __init__(self):
    """
    Build the "telephone" classifier for "ddd-ddd-dddd" numbers, with support
    for the "double <digit>" wording.

    The number pattern is written in the digits-to-words direction (every "-"
    deleted) and inverted; a context-free rewrite built from `double_digit`
    is composed in front of it. The result is wrapped in
    `number_part: "..."`, tokenised, optimized and stored in `self.fst`.
    """
    super().__init__(name="telephone", kind="classify")

    # Digit -> word(s); "0" may be read as "o", "oh" or "zero".
    table_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    digit_to_str = table_digit | pynini.cross("0", pynini.union("o", "oh", "zero"))

    def _spoken(value):
        # Acceptor over the spoken form(s) of a single digit.
        return pynini.project(str(value) @ digit_to_str, "output")

    double_rules = []
    for i in range(10):
        said_twice = _spoken(i) + pynini.accep(" ") + _spoken(i)
        double_rules.append(pynini.cross(said_twice, pynutil.insert("double ") + _spoken(i)))
    double_digit = pynini.union(*double_rules)
    double_digit.invert()  # in-place inversion

    def _digits(count):
        # `count` digits, with `insert_space` applied after all but the last.
        return pynini.closure(digit_to_str + insert_space, count - 1, count - 1) + digit_to_str

    dash = pynutil.delete("-")
    digits_to_words = _digits(3) + dash + insert_space + _digits(3) + dash + insert_space + _digits(4)

    words_to_digits = pynini.cdrewrite(double_digit, "", "", NEMO_SIGMA) @ pynini.invert(digits_to_words)
    tagged = pynutil.insert("number_part: \"") + words_to_digits + pynutil.insert("\"")

    self.fst = self.add_tokens(tagged).optimize()
def __init__(self):
    """
    Build the "cardinal" classifier (spoken number -> digit string).

    A number is read as a fixed sequence of three-digit groups (sextillion
    down to thousand, then the final hundreds group); a missing group
    contributes "000". Leading zeros are stripped afterwards, and "and"
    between spaces is deleted from the input.

    Attributes set:
        graph_hundred_component_at_least_one_none_zero_digit: three-digit
            group restricted to outputs containing a non-zero digit.
        graph_no_exception: full cardinal graph.
        graph: same graph with the inputs `num_to_word(0..12)` removed.
        fst: optional "minus" + `integer: "..."`, tokenised and optimized.
    """
    super().__init__(name="cardinal", kind="classify")

    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

    # Three-digit group: hundreds digit (or "0"), then teen / ties + digit,
    # each missing position padded with zeros.
    graph_hundred_component = pynini.union(
        graph_digit + delete_space + pynini.cross("hundred", ""), pynutil.insert("0")
    )
    graph_hundred_component += delete_space
    graph_hundred_component += pynini.union(
        graph_teen | pynutil.insert("00"),
        (graph_ties | pynutil.insert("0")) + delete_space + (graph_digit | pynutil.insert("0")),
    )

    has_nonzero_digit = pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
    nonzero_hundreds = graph_hundred_component @ has_nonzero_digit
    self.graph_hundred_component_at_least_one_none_zero_digit = nonzero_hundreds

    def _magnitude(word):
        # "<group> <word>", or "000" (weight 0.1) when the magnitude is absent.
        return pynini.union(
            nonzero_hundreds + delete_space + pynutil.delete(word),
            pynutil.insert("000", weight=0.1),
        )

    magnitude_words = (
        "sextillion",
        "quintillion",
        "quadrillion",
        "trillion",
        "billion",
        "million",
        "thousand",
    )
    all_groups = _magnitude(magnitude_words[0])
    for word in magnitude_words[1:]:
        all_groups = all_groups + delete_space + _magnitude(word)
    all_groups = all_groups + delete_space + graph_hundred_component

    graph = pynini.union(all_groups, graph_zero)

    # Strip leading zeros, keeping a lone "0".
    strip_leading_zeros = pynini.union(
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
        "0",
    )
    graph = graph @ strip_leading_zeros

    excluded_words = pynini.union(*[num_to_word(value) for value in range(0, 13)])

    graph = (
        pynini.cdrewrite(pynutil.delete("and"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA)
        @ (NEMO_ALPHA + NEMO_SIGMA)
        @ graph
    )
    self.graph_no_exception = graph

    self.graph = (pynini.project(graph, "input") - excluded_words.arcsort()) @ graph

    optional_minus = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"-\"") + NEMO_SPACE, 0, 1
    )

    tagged = optional_minus + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self):
    """
    Build the "time" classifier.

    Accepted wordings, as assembled below:
      * hour + minutes, where minutes are an "o'clock" variant (-> "00"),
        "o" + single-digit minute, or a two-digit minute;
      * minutes (or "half" -> "30" / "quarter" -> "15") + "past" + hour;
      * "quarter" + "to"/"till" + hour, emitting minutes "45" and an hour
        looked up in time_to.tsv;
      * hour with optional minutes followed by a mandatory suffix.
    The first three take an optional suffix and an optional time zone; the
    last takes an optional time zone. Output fields are `hours`, `minutes`,
    `suffix` and `zone`.
    """
    super().__init__(name="time", kind="classify")

    suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
    time_zone_graph = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
    time_to_graph = pynini.string_file(get_abs_path("data/time/time_to.tsv"))

    # Cardinal graph with a weight of -0.7 applied.
    cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

    def _number_words(start, stop):
        # Restrict the cardinal graph to the words for range(start, stop).
        return pynini.union(*[num_to_word(value) for value in range(start, stop)]) @ cardinal

    graph_hour = _number_words(0, 24)
    minute_single = _number_words(1, 10)
    minute_double = _number_words(10, 60)
    minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15")

    oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

    close_quote = pynutil.insert("\"")
    open_minutes = pynutil.insert("minutes: \"")
    hour_field = pynutil.insert("hours: \"") + graph_hour + close_quote

    # Minutes as spoken after the hour: "o'clock", "o eight", "thirty five".
    minute_after_hour = (
        oclock + pynutil.insert("00")
        | pynutil.delete("o") + delete_space + minute_single
        | minute_double
    )

    suffix_field = pynutil.insert("suffix: \"") + convert_space(suffix_graph) + close_quote
    optional_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
    optional_zone = pynini.closure(
        delete_space
        + insert_space
        + pynutil.insert("zone: \"")
        + convert_space(time_zone_graph)
        + close_quote,
        0,
        1,
    )

    # "five o' clock", "two o eight", "two thirty five"
    hour_then_minute = hour_field + delete_extra_space + open_minutes + minute_after_hour + close_quote

    # "ten past four", "quarter past four", "half past four"
    minute_past_hour = (
        open_minutes
        + pynini.union(minute_single, minute_double, minute_verbose)
        + close_quote
        + delete_space
        + pynutil.delete("past")
        + delete_extra_space
        + hour_field
    )

    # "quarter to ..." / "quarter till ..."
    quarter_to_hour = (
        open_minutes
        + pynini.cross("quarter", "45")
        + close_quote
        + delete_space
        + pynutil.delete(pynini.union("to", "till"))
        + delete_extra_space
        + pynutil.insert("hours: \"")
        + time_to_graph
        + close_quote
    )

    # "two pm": minutes default to "00", the suffix is required.
    hour_with_suffix = (
        hour_field
        + delete_extra_space
        + open_minutes
        + (pynutil.insert("00") | minute_after_hour)
        + close_quote
        + delete_space
        + insert_space
        + suffix_field
        + optional_zone
    )

    final_graph = (hour_then_minute | minute_past_hour | quarter_to_hour) + optional_suffix + optional_zone
    final_graph |= hour_with_suffix

    self.fst = self.add_tokens(final_graph).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """
    Build the "money" classifier.

    Alternatives, as assembled below:
      * integer amount + currency, optionally followed by cents
        ("... and fifty cents" or a bare "... fifty");
      * "one" + singular currency, with the same optional cents;
      * decimal amount + plural currency;
      * cents alone, emitted with `currency: "$" integer_part: "0"`.
    Output fields are `integer_part`, `fractional_part` and `currency`.

    Args:
        cardinal: grammar providing `graph_no_exception`.
        decimal: grammar providing `final_graph_wo_negative`.
    """
    super().__init__(name="money", kind="classify")

    cardinal_graph = cardinal.graph_no_exception
    decimal_amount = decimal.final_graph_wo_negative

    close_quote = pynutil.insert("\"")
    open_fraction = pynutil.insert("fractional_part: \"")

    # currency.tsv is inverted to read the currency name; get_singulars
    # derives the graph used for the plural case.
    currency_singular = pynini.invert(pynini.string_file(get_abs_path("data/currency.tsv")))
    currency_plural = get_singulars(currency_singular)

    singular_field = pynutil.insert("currency: \"") + convert_space(currency_singular) + close_quote
    plural_field = pynutil.insert("currency: \"") + convert_space(currency_plural) + close_quote

    # Two digits pass through; a single digit gets a leading "0".
    two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

    # "twelve dollars (and) fifty cents", "one cent"
    cents_standalone = (
        open_fraction
        + pynini.union(
            pynutil.add_weight(((NEMO_SIGMA - "one") @ cardinal_graph), -0.7) @ two_digits
            + delete_space
            + pynutil.delete("cents"),
            pynini.cross("one", "01") + delete_space + pynutil.delete("cent"),
        )
        + close_quote
    )
    optional_cents_standalone = pynini.closure(
        delete_space
        + pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
        + insert_space
        + cents_standalone,
        0,
        1,
    )

    # "twelve dollars fifty" - only valid after an integer amount
    optional_cents_suffix = pynini.closure(
        delete_extra_space
        + open_fraction
        + pynutil.add_weight(cardinal_graph @ two_digits, -0.7)
        + close_quote,
        0,
        1,
    )

    open_integer = pynutil.insert("integer_part: \"")

    graph_integer = (
        open_integer
        + ((NEMO_SIGMA - "one") @ cardinal_graph)
        + close_quote
        + delete_extra_space
        + plural_field
        + (optional_cents_standalone | optional_cents_suffix)
    )
    graph_integer |= (
        open_integer
        + pynini.cross("one", "1")
        + close_quote
        + delete_extra_space
        + singular_field
        + (optional_cents_standalone | optional_cents_suffix)
    )

    graph_decimal = decimal_amount + delete_extra_space + plural_field
    graph_decimal |= pynutil.insert("currency: \"$\" integer_part: \"0\" ") + cents_standalone

    combined = graph_integer | graph_decimal
    self.fst = self.add_tokens(combined).optimize()
def __init__(self, cardinal: GraphFst):
    """
    Build the "telephone" classifier.

    Every alternative reads spoken digits and emits `number_part: "..."`:
      * phone number: 3 + 3 + 4 digits with "-" inserted between the groups,
        optionally preceded by `country_code: "..."`;
      * credit card number: four groups of 4 digits joined by `insert_space`;
      * SSN: 3 + 2 + 4 digits with "-" inserted between the groups;
      * IP address: four digit groups joined by " dot " -> ".";
      * serial number: `get_serial_number(cardinal=cardinal)`, weight 0.0001.

    Args:
        cardinal: cardinal grammar; `graph_no_exception` is used for two-digit
            readings and the grammar is forwarded to `get_serial_number`.
    """
    super().__init__(name="telephone", kind="classify")
    # country code, number_part, extension
    # Digit -> word(s); "0" may be read as "o", "oh" or "zero".
    digit_to_str = (
        pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize())
        | pynini.cross("0", pynini.union("o", "oh", "zero")).optimize()
    )
    # Word(s) -> digit.
    str_to_digit = pynini.invert(digit_to_str)

    # One rule per digit pairing "<word> <word>" with "double <word>".
    double_digit = pynini.union(
        *[
            pynini.cross(
                pynini.project(str(i) @ digit_to_str, "output")
                + pynini.accep(" ")
                + pynini.project(str(i) @ digit_to_str, "output"),
                pynutil.insert("double ") + pynini.project(str(i) @ digit_to_str, "output"),
            )
            for i in range(10)
        ]
    )
    # In-place inversion; the return value is intentionally unused.
    double_digit.invert()

    # to handle cases like "one twenty three"
    two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT ** 2)
    # Two digits from either a "double" wording or a two-digit cardinal.
    double_digit_to_digit = (
        pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal
    )
    # The two-digit reading gets a slightly better weight than a single digit.
    single_or_double_digit = (pynutil.add_weight(double_digit_to_digit, -0.0001) | str_to_digit).optimize()
    # Extend to space-separated sequences; the right-hand side is built from
    # the one-element graph before the name is rebound.
    single_or_double_digit |= (
        single_or_double_digit
        + pynini.closure(pynutil.add_weight(pynutil.delete(" ") + single_or_double_digit, 0.0001))
    ).optimize()

    # Phone number: restrict the digit sequence to ddd-ddd-dddd.
    number_part = pynini.compose(
        single_or_double_digit,
        NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 4,
    ).optimize()
    number_part = pynutil.insert("number_part: \"") + number_part.optimize() + pynutil.insert("\"")

    # Digit sequence restricted to two or three digits.
    cardinal_option = pynini.compose(single_or_double_digit, NEMO_DIGIT ** (2, 3))

    # Optional "plus " -> "+", then one to three single digits or `cardinal_option`.
    country_code = (
        pynutil.insert("country_code: \"")
        + pynini.closure(pynini.cross("plus ", "+"), 0, 1)
        + ((pynini.closure(str_to_digit + pynutil.delete(" "), 0, 2) + str_to_digit) | cardinal_option)
        + pynutil.insert("\"")
    )
    optional_country_code = pynini.closure(country_code + pynutil.delete(" ") + insert_space, 0, 1).optimize()

    graph = optional_country_code + number_part

    # credit card number
    space_four_digits = insert_space + NEMO_DIGIT ** 4
    credit_card_graph = pynini.compose(single_or_double_digit, NEMO_DIGIT ** 4 + space_four_digits ** 3).optimize()
    graph |= pynutil.insert("number_part: \"") + credit_card_graph.optimize() + pynutil.insert("\"")

    # SSN
    ssn_graph = pynini.compose(
        single_or_double_digit,
        NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 2 + pynutil.insert("-") + NEMO_DIGIT ** 4,
    ).optimize()
    graph |= pynutil.insert("number_part: \"") + ssn_graph.optimize() + pynutil.insert("\"")

    # ip
    # One IP group: combinations of single digits and two-digit readings.
    digit_or_double = pynini.closure(str_to_digit + pynutil.delete(" "), 0, 1) + double_digit_to_digit
    digit_or_double |= double_digit_to_digit + pynini.closure(pynutil.delete(" ") + str_to_digit, 0, 1)
    digit_or_double |= str_to_digit + (pynutil.delete(" ") + str_to_digit) ** (0, 2)
    digit_or_double |= cardinal_option
    digit_or_double = digit_or_double.optimize()

    # Four groups separated by " dot ", which is rewritten to ".".
    ip_graph = digit_or_double + (pynini.cross(" dot ", ".") + digit_or_double) ** 3

    graph |= pynutil.insert("number_part: \"") + ip_graph.optimize() + pynutil.insert("\"")
    # Serial numbers are added with weight 0.0001.
    graph |= (
        pynutil.insert("number_part: \"")
        + pynutil.add_weight(get_serial_number(cardinal=cardinal), weight=0.0001)
        + pynutil.insert("\"")
    )

    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
def _get_month_graph():
    """
    Load the month transducer from `data/months.tsv`, e.g. march -> march
    """
    months_path = get_abs_path("data/months.tsv")
    return pynini.string_file(months_path)
# limitations under the License.

from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_ALPHA,
    NEMO_DIGIT,
    GraphFst,
    delete_extra_space,
    delete_space,
)

# pynini is treated as an optional dependency: when it imports, the shared
# number graphs are loaded once at module level; otherwise placeholders are
# defined so the module can still be imported.
try:
    import pynini
    from pynini.lib import pynutil

    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")).optimize()
    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
    ties_graph = pynini.string_file(get_abs_path("data/numbers/ties.tsv")).optimize()

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Placeholders for the module-level graphs.
    graph_teen = None
    graph_digit = None
    ties_graph = None

    # The import failed, so the flag must report pynini as unavailable.
    # (It was previously set to True here, which made the flag always True.)
    PYNINI_AVAILABLE = False


# NOTE(review): the definition that follows is cut off in the reviewed text; it
# is left un-edited and kept verbatim below.
# def _get_month_graph(): """ Transducer for month, e.g. march -> march