def __init__(self):
    """Build the telephone classifier.

    A template is written for numbers shaped like ``(0ddd) dddd-dddd``:
    brackets and the hyphen are deleted, every digit goes through the
    inverted digit/zero tables and a space is inserted between digits.
    The template is then inverted as a whole and wrapped in a
    ``number_part: "..."`` field.
    """
    super().__init__(name="telephone", kind="classify")

    zero_graph = pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv")))
    nonzero_graph = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")))
    any_digit = (nonzero_graph | zero_graph).optimize()

    # The single space that is kept between the bracketed prefix and the rest.
    component_gap = pynini.accep(" ")

    # "(" 0 d d d ")"
    bracketed_prefix = (
        pynutil.delete("(")
        + zero_graph
        + insert_space
        + pynini.closure(any_digit + insert_space, 2, 2)
        + any_digit
        + pynutil.delete(")")
    )
    # Two groups of exactly four digits each, separated by "-".
    first_quad = pynini.closure(any_digit + insert_space, 3, 3) + any_digit
    second_quad = pynini.closure(any_digit + insert_space, 3, 3) + any_digit

    template = (
        bracketed_prefix
        + component_gap
        + first_quad
        + pynutil.delete("-")
        + insert_space
        + second_quad
    )

    tagged = pynutil.insert("number_part: \"") + pynini.invert(template) + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self):
    """Build the e-mail classifier.

    Accepts space-separated characters for the user name (with optional
    "punkt" -> "." between them), the word "at", then a server, "punkt"
    and a domain. Emits ``username: "..." domain: "..."``.
    """
    super().__init__(name="electronic", kind="classify")

    drop_space = pynutil.delete(" ")
    char = (
        NEMO_ALPHA
        | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    )

    # One character, a deleted space and at most one "punkt" per repetition.
    optional_dot = pynini.closure(pynini.cross("punkt", '.') + drop_space, 0, 1)
    username_body = pynini.closure(char + drop_space + optional_dot) + char
    username = pynutil.insert("username: \"") + username_body + pynutil.insert("\"")

    spelled_out = pynini.closure(char + drop_space) + char
    server = spelled_out | pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
    domain = spelled_out | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    domain_field = (
        pynutil.insert("domain: \"")
        + server
        + drop_space
        + pynini.cross("punkt", ".")
        + drop_space
        + domain
        + pynutil.insert("\"")
    )

    email = (
        username
        + drop_space
        + pynutil.delete("at")
        + insert_space
        + drop_space
        + domain_field
    )

    self.fst = self.add_tokens(email).optimize()
def __init__(self, deterministic: bool = True):
    """Build the ordinal verbalizer.

    Strips the ``integer: "..."`` wrapper and rewrites the end of the
    value: an entry from the inverted ordinal digit/teen tables, "ty" ->
    "tieth", or (with the highest weight) a plain appended "th".

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

    digit_endings = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")).invert()
    teen_endings = pynini.string_file(get_abs_path("data/ordinals/teen.tsv")).invert()

    unwrap = (
        pynutil.delete("integer:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )

    # Fallback: append "th"; it carries the largest weight of the alternatives.
    append_th = pynutil.insert("th", weight=0.01)
    ending_rule = (
        digit_endings
        | teen_endings
        | pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
        | append_th
    )
    # Apply the ending rule only at end of string.
    suffix = pynini.cdrewrite(ending_rule, "", "[EOS]", NEMO_SIGMA).optimize()

    self.graph = pynini.compose(unwrap, suffix)
    self.suffix = suffix
    self.fst = self.delete_tokens(self.graph).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """Build the decimal classifier (comma as decimal separator).

    Emits ``integer_part: "..." fractional_part: "..."``, optionally
    preceded by ``negative: "true"`` when the input starts with "-", and
    always followed by ``preserve_order: true``. A quantity variant built
    by ``get_quantity`` is accepted as well.

    Args:
        cardinal: provides ``graph`` for the integer part and
            ``graph_hundred_component_at_least_one_none_zero_digit`` for
            quantities.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)

    one_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
    one_digit |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()
    one_digit |= pynini.cross("1", "eins")

    # Fractional digits are handled one by one, separated by inserted spaces.
    more_digits = pynini.closure(insert_space + one_digit).optimize()
    self.graph = one_digit + more_digits

    comma = pynutil.delete(",")
    maybe_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    self.graph_fractional = (
        pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"")
    )
    self.graph_integer = (
        pynutil.insert("integer_part: \"") + cardinal.graph + pynutil.insert("\"")
    )

    unsigned = self.graph_integer + comma + insert_space + self.graph_fractional
    self.final_graph_wo_negative = unsigned | get_quantity(
        unsigned, cardinal.graph_hundred_component_at_least_one_none_zero_digit
    )

    signed = maybe_negative + self.final_graph_wo_negative
    signed += pynutil.insert(" preserve_order: true")
    self.fst = self.add_tokens(signed).optimize()
def __init__(self, cardinal: GraphFst):
    """Build the decimal classifier for input that uses the word "point".

    Accepts an optional "minus", an optional integer part, "point" and a
    run of single digits (with "o" also read as "0"). Emits
    ``negative``, ``integer_part`` and ``fractional_part`` fields; a
    quantity variant built by ``get_quantity`` is accepted as well.

    Args:
        cardinal: provides ``graph_no_exception`` for the integer part and
            ``graph_hundred_component_at_least_one_none_zero_digit`` for
            quantities.
    """
    super().__init__(name="decimal", kind="classify")

    integer_graph = cardinal.graph_no_exception

    one_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    one_digit |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")) | pynini.cross("o", "0")
    # One or more digits; the spaces between them are removed.
    digit_run = pynini.closure(one_digit + delete_space) + one_digit
    self.graph = digit_run

    point_word = pynutil.delete("point")
    maybe_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"true\"") + delete_extra_space,
        0,
        1,
    )

    fractional_field = pynutil.insert("fractional_part: \"") + digit_run + pynutil.insert("\"")
    integer_field = pynutil.insert("integer_part: \"") + integer_graph + pynutil.insert("\"")

    unsigned = (
        pynini.closure(integer_field + delete_extra_space, 0, 1)
        + point_word
        + delete_extra_space
        + fractional_field
    )

    signed = maybe_negative + unsigned
    self.final_graph_wo_negative = unsigned | get_quantity(
        unsigned, cardinal.graph_hundred_component_at_least_one_none_zero_digit
    )
    signed |= maybe_negative + get_quantity(
        unsigned, cardinal.graph_hundred_component_at_least_one_none_zero_digit
    )

    self.fst = self.add_tokens(signed).optimize()
def __init__(self, deterministic: bool = True):
    """Build the ordinal verbalizer.

    Strips the ``integer: "..."`` wrapper, then at end of string
    optionally rewrites an ordinal stem (digit, ties or thousands table)
    and inserts one of the endings "ten", "tem", "ter", "tes", "te".

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

    digit_stem = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")).invert()
    ties_stem = pynini.string_file(get_abs_path("data/ordinals/ties.tsv")).invert()
    thousands_stem = pynini.string_file(get_abs_path("data/ordinals/thousands.tsv")).invert()

    unwrap = (
        pynutil.delete("integer: \"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )

    endings = pynini.union("ten", "tem", "ter", "tes", "te")
    add_ending = pynutil.insert(endings, weight=0.01)

    self.ordinal_stem = digit_stem | ties_stem | thousands_stem

    ending_rule = pynini.closure(self.ordinal_stem, 0, 1) + add_ending
    suffix = pynini.cdrewrite(ending_rule, "", "[EOS]", NEMO_SIGMA).optimize()

    self.graph = pynini.compose(unwrap, suffix)
    self.suffix = suffix
    self.fst = self.delete_tokens(self.graph).optimize()
def _get_digit_or_teen():
    """
    Return the optimized union of the digit table and the teen table.
    """
    digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    teens = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
    digit_or_teen = digits | teens
    return digit_or_teen.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """Build the time classifier for written times.

    Three shapes are accepted:
      * ``h:mm`` with optional suffix and zone (2:30 pm, 02:30, 2:00),
      * ``h.mm`` followed by a mandatory suffix (2.30 pm),
      * ``h`` followed by a mandatory suffix (2 pm est).
    A minutes value of "00" is dropped. Emits ``hours``, ``minutes``,
    ``suffix`` and ``zone`` fields.

    Args:
        cardinal: its ``graph`` is composed with hour and minute digits.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    suffix_table = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
    zone_table = pynini.string_file(get_abs_path("data/time_zone.tsv"))

    number_graph = cardinal.graph

    hour_labels = list(map(str, range(0, 24)))
    minute_labels_1_9 = list(map(str, range(1, 10)))
    minute_labels_10_59 = list(map(str, range(10, 60)))

    # Two digits pass through; a single digit may carry a leading "0" that is dropped.
    strip_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (
        pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT
    )

    hour_graph = strip_leading_zero @ pynini.union(*hour_labels) @ number_graph
    minute_1_9 = pynini.union(*minute_labels_1_9) @ number_graph
    minute_10_59 = pynini.union(*minute_labels_10_59) @ number_graph

    hour_field = pynutil.insert("hours: \"") + hour_graph + pynutil.insert("\"")

    # "0d" becomes "o <d>"; two-digit minutes go straight through the number graph.
    zero_led_minute = pynini.cross("0", "o") + insert_space + minute_1_9
    minute_field = (
        pynutil.insert("minutes: \"")
        + (zero_led_minute | minute_10_59)
        + pynutil.insert("\"")
    )

    suffix_field = (
        pynutil.insert("suffix: \"") + convert_space(suffix_table) + pynutil.insert("\"")
    )
    maybe_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
    maybe_zone = pynini.closure(
        delete_space
        + insert_space
        + pynutil.insert("zone: \"")
        + convert_space(zone_table)
        + pynutil.insert("\""),
        0,
        1,
    )

    # 2:30 pm, 02:30, 2:00
    colon_time = (
        hour_field
        + pynutil.delete(":")
        + (pynutil.delete("00") | insert_space + minute_field)
        + maybe_suffix
        + maybe_zone
    )

    # 2.xx pm/am
    dot_time = (
        hour_field
        + pynutil.delete(".")
        + (pynutil.delete("00") | insert_space + minute_field)
        + delete_space
        + insert_space
        + suffix_field
        + maybe_zone
    )

    # 2 pm est
    hour_only = hour_field + delete_space + insert_space + suffix_field + maybe_zone

    all_times = (colon_time | hour_only | dot_time).optimize()
    self.fst = self.add_tokens(all_times).optimize()
def __init__(self):
    """Build the time classifier for spoken times.

    Two orders are accepted, each with optional suffix and time zone:
      * hour then minutes ("five o'clock", "two o eight", "two thirty five"),
      * minutes "past" hour ("ten past four", "quarter past four",
        "half past four").
    Emits ``hours``, ``minutes``, ``suffix`` and ``zone`` fields.
    """
    super().__init__(name="time", kind="classify")

    suffix_table = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
    zone_table = pynini.invert(pynini.string_file(get_abs_path("data/time_zone.tsv")))

    # Negative weight on the cardinal graph for the hour/minute compositions.
    number_graph = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

    hour_words = [num_to_word(value) for value in range(0, 24)]
    minute_words_1_9 = [num_to_word(value) for value in range(1, 10)]
    minute_words_10_59 = [num_to_word(value) for value in range(10, 60)]

    hour_graph = pynini.union(*hour_words) @ number_graph
    minute_1_9 = pynini.union(*minute_words_1_9) @ number_graph
    minute_10_59 = pynini.union(*minute_words_10_59) @ number_graph
    minute_named = pynini.cross("half", "30") | pynini.cross("quarter", "15")

    oclock = pynini.cross(
        pynini.union("o' clock", "o clock", "o'clock", "oclock"), ""
    )

    hour_field = pynutil.insert("hours: \"") + hour_graph + pynutil.insert("\"")

    # Minutes after the hour: nothing or "o'clock" (both give "00"),
    # "o <digit>", or a two-digit minute.
    minute_value = (
        pynutil.insert("00")
        | oclock + pynutil.insert("00")
        | pynutil.delete("o") + delete_space + minute_1_9
        | minute_10_59
    )
    minute_field = pynutil.insert("minutes: \"") + minute_value + pynutil.insert("\"")

    suffix_field = (
        pynutil.insert("suffix: \"") + convert_space(suffix_table) + pynutil.insert("\"")
    )
    maybe_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
    maybe_zone = pynini.closure(
        delete_space
        + insert_space
        + pynutil.insert("zone: \"")
        + convert_space(zone_table)
        + pynutil.insert("\""),
        0,
        1,
    )

    # five o' clock / two o eight, two thirty five (am/pm) / two pm/am
    hour_then_minute = hour_field + delete_extra_space + minute_field

    # 10 past four, quarter past four, half past four
    minute_past_hour = (
        pynutil.insert("minutes: \"")
        + pynini.union(minute_1_9, minute_10_59, minute_named)
        + pynutil.insert("\"")
        + delete_space
        + pynutil.delete("past")
        + delete_extra_space
        + hour_field
    )

    all_times = (
        (hour_then_minute | minute_past_hour) + maybe_suffix + maybe_zone
    ).optimize()
    self.fst = self.add_tokens(all_times).optimize()
def __init__(self, deterministic: bool = True):
    """Build the cardinal classifier.

    Loads the number-name graph from a FAR archive and exposes:
      * ``graph_hundred_component_at_least_one_none_zero_digit``,
      * ``graph`` (1-3 digits followed by 3-digit groups, "," optional),
      * ``single_digits_graph`` (digit-by-digit reading),
      * ``range_graph`` ("a-b" and "a x b" patterns).
    The final FST emits ``integer: "..."`` with an optional
    ``negative: "true"`` prefix for a leading "-".

    Args:
        deterministic: when False, digit-by-digit readings, the result of
            ``get_hundreds_graph()`` and ranges are also accepted.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    number_names = pynini.Far(
        get_abs_path("data/numbers/cardinal_number_name.far")
    ).get_fst()

    # Two or three digits, or a single digit other than "0".
    up_to_three_digits = pynini.closure(NEMO_DIGIT, 2, 3) | pynini.difference(
        NEMO_DIGIT, pynini.accep("0")
    )
    self.graph_hundred_component_at_least_one_none_zero_digit = up_to_three_digits @ number_names

    thousands_group = (
        pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
    )
    self.graph = (
        pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(thousands_group)
    ) @ number_names

    digit_table = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    zero_table = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    one_digit = pynutil.add_weight(
        pynini.invert(digit_table | zero_table), 1.2
    ) | pynutil.add_weight(pynini.cross("0", "oh"), 1.1)
    self.single_digits_graph = one_digit + pynini.closure(
        pynutil.insert(" ") + one_digit
    )

    if not deterministic:
        digits_with_commas = pynini.closure(
            self.single_digits_graph + pynutil.insert(" "), 1, 3
        ) + pynini.closure(
            pynutil.delete(",")
            + one_digit
            + pynutil.insert(" ")
            + one_digit
            + pynutil.insert(" ")
            + one_digit,
            1,
        )
        self.graph |= (
            self.single_digits_graph | get_hundreds_graph() | digits_with_commas
        )

    # NOTE(review): the collapsed source does not show whether the range
    # graph was built only in non-deterministic mode; it is built
    # unconditionally here so the attribute always exists - confirm.
    dash_range = (
        pynini.closure(pynutil.insert("from "), 0, 1)
        + self.graph
        + (pynini.cross("-", " to ") | pynini.cross("-", " "))
        + self.graph
    )
    self.range_graph = dash_range
    self.range_graph |= (
        self.graph
        + (pynini.cross("x", " by ") | pynini.cross(" x ", " by "))
        + self.graph
    )
    self.range_graph = self.range_graph.optimize()

    maybe_minus = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    value_graph = self.graph | pynutil.add_weight(self.get_serial_graph(), 1.2)
    if not deterministic:
        value_graph |= self.range_graph

    tagged = (
        maybe_minus
        + pynutil.insert("integer: \"")
        + value_graph
        + pynutil.insert("\"")
    )
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self):
    """Build the electronic classifier for e-mail addresses and URLs.

    E-mail: spelled-out user name, the word "arroba", then server,
    "punto" and domain; emits ``username: "..." domain: "..."``.
    URL: optional protocol ("http"/"https" followed by
    " dos puntos barra barra "), "www", "punto", a host part and one or
    more endings; emits ``protocol: "..."``.
    """
    super().__init__(name="electronic", kind="classify")

    drop_space = pynutil.delete(" ")
    char = (
        NEMO_ALPHA
        | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    )
    symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert()
    username_char = char | symbols
    dot = pynini.cross("punto", ".")

    # The user name starts and ends with an alphanumeric; symbols may
    # only appear in between.
    username = (
        pynutil.insert("username: \"")
        + char
        + drop_space
        + pynini.closure(username_char + drop_space)
        + char
        + pynutil.insert("\"")
    )

    spelled_out = pynini.closure(char + drop_space) + char
    server = spelled_out | pynini.string_file(
        get_abs_path("data/electronic/server_name.tsv")
    ).invert()
    domain = spelled_out | pynini.string_file(
        get_abs_path("data/electronic/domain.tsv")
    ).invert()

    domain_field = (
        pynutil.insert("domain: \"")
        + server
        + drop_space
        + dot
        + drop_space
        + domain
        + pynutil.insert("\"")
    )

    graph = (
        username
        + drop_space
        + pynutil.delete("arroba")
        + insert_space
        + drop_space
        + domain_field
    )

    # ---- URL ----
    www = pynini.cross(
        pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www"
    )
    scheme = pynini.cross(
        pynini.union("http", "h t t p", "hache te te pe"), "http"
    )
    scheme |= pynini.cross(
        pynini.union("https", "h t t p s", "hache te te pe ese"), "https"
    )
    scheme += pynini.cross(" dos puntos barra barra ", "://")

    # e.g. .com, .es
    ending = (
        drop_space
        + symbols
        + drop_space
        + (domain | pynini.closure(username_char + drop_space) + username_char)
    )

    url = (
        pynini.closure(scheme, 0, 1)
        + www
        + drop_space
        + dot
        + drop_space
        + (pynini.closure(drop_space + username_char, 1) | server)
        + pynini.closure(ending, 1)
    )
    url = pynutil.insert("protocol: \"") + url + pynutil.insert("\"")
    graph |= url

    self.fst = self.add_tokens(graph).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Build the measure classifier.

    Accepts an optionally negative ("âm"/"trừ") decimal or cardinal
    followed by a unit, where a unit may also be "trên <unit>"
    (rendered with a leading "/") or "<unit> trên <unit>". A cardinal
    measure may additionally be followed by one trailing digit word
    ("rưỡi" -> 5, "mốt" -> 1, "tư" -> 4, or a digit-table entry), emitted
    as ``fractional_part``.

    Args:
        cardinal: provides ``graph_no_exception``.
        decimal: provides ``final_graph_wo_negative``.
    """
    super().__init__(name="measure", kind="classify")

    integer_graph = cardinal.graph_no_exception

    digit_word = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    four_word = pynini.cross("tư", "4")
    one_word = pynini.cross("mốt", "1")
    half_word = pynini.cross("rưỡi", "5")

    unit_table = pynini.string_file(get_abs_path("data/measurements.tsv"))
    unit_to_abbr = pynini.invert(unit_table)  # singular -> abbr

    maybe_negative = pynini.closure(
        pynutil.insert("negative: ")
        + pynini.cross(pynini.union("âm", "trừ"), '"true"')
        + delete_extra_space,
        0,
        1,
    )

    plain_unit = convert_space(unit_to_abbr)
    # "trên <unit>" becomes "/<unit>"
    per_unit = (
        pynutil.insert("/")
        + pynutil.delete("trên")
        + delete_space
        + convert_space(unit_to_abbr)
    )
    compound_unit = pynutil.add_weight(plain_unit + delete_space + per_unit, 0.01)
    unit_field = (
        pynutil.insert('units: "')
        + (plain_unit | per_unit | compound_unit)
        + pynutil.insert('"')
    )

    decimal_measure = (
        pynutil.insert("decimal { ")
        + maybe_negative
        + decimal.final_graph_wo_negative
        + pynutil.insert(" }")
        + delete_extra_space
        + unit_field
    )

    cardinal_measure = (
        pynutil.insert("cardinal { ")
        + maybe_negative
        + pynutil.insert('integer: "')
        + integer_graph
        + pynutil.insert('"')
        + pynutil.insert(" }")
        + delete_extra_space
        + unit_field
    )

    trailing_fraction = (
        delete_extra_space
        + pynutil.insert('fractional_part: "')
        + (digit_word | half_word | one_word | four_word)
        + pynutil.insert('"')
    )

    cardinal_measure |= (
        pynutil.insert("cardinal { ")
        + maybe_negative
        + pynutil.insert('integer: "')
        + integer_graph
        + pynutil.insert('" }')
        + delete_extra_space
        + unit_field
        + trailing_fraction
    )

    measures = decimal_measure | cardinal_measure
    self.fst = self.add_tokens(measures).optimize()
def __init__(self, deterministic: bool = True):
    """Build the electronic verbalizer.

    Consumes an optional ``protocol: "..."`` field, an optional
    ``username: "..."`` field (followed by an inserted "at ") and a
    ``domain: "..."`` field. Characters of the user name are separated
    by spaces, with digits and symbols rewritten through their tables;
    domain text is rewritten through the server/domain tables and then
    through the symbol and digit tables.

    Args:
        deterministic: when False, "0" may also be read as "o" or "oh".
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    graph_digit_no_zero = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize()
    graph_zero = pynini.cross("0", "zero")
    if not deterministic:
        graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")
    graph_digit = graph_digit_no_zero | graph_zero

    graph_symbols = pynini.string_file(
        get_abs_path("data/electronic/symbols.tsv")
    ).optimize()

    # The field prefix is stripped the same way as for `domain:` below.
    # The source had this expression masked out, which made the block
    # a syntax error.
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + (
            pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(
                    pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09
                )
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1)
            )
        )
        + pynutil.delete("\"")
    )

    server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
    domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    # Table entries carry a negative weight relative to the
    # character-by-character fallback.
    convert_defaults = (
        NEMO_NOT_QUOTE
        | pynutil.add_weight(domain_common, -0.1)
        | pynutil.add_weight(server_common, -0.1)
    )
    domain = convert_defaults + pynini.closure(pynutil.insert(" ") + convert_defaults)
    domain = pynini.compose(
        domain,
        pynini.closure(
            pynutil.add_weight(graph_symbols, -0.1)
            | pynutil.add_weight(graph_digit, -0.1)
            | NEMO_NOT_QUOTE
        ),
    )
    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + domain
        + delete_space
        + pynutil.delete("\"")
    )

    protocol = (
        pynutil.delete("protocol: \"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )

    graph = (
        pynini.closure(protocol + delete_space, 0, 1)
        + pynini.closure(
            user_name + delete_space + pynutil.insert("at ") + delete_space, 0, 1
        )
        + domain
        + delete_space
    )

    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def __init__(self, deterministic: bool = True):
    """Build the electronic verbalizer (context-rewrite variant).

    Consumes an optional ``protocol: "..."`` field, an optional
    ``username: "..."`` field (followed by an inserted " at ") and a
    ``domain: "..."`` field. Symbols and digits are rewritten through
    their tables and padded with spaces; the result is composed with a
    rewrite of ``delete_extra_space``.

    Args:
        deterministic: when False, "0" may also be read as "o" or "oh".
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    graph_digit_no_zero = pynini.invert(
        pynini.string_file(get_abs_path("data/number/digit.tsv"))
    ).optimize()
    graph_zero = pynini.cross("0", "zero")
    if not deterministic:
        graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")
    graph_digit = graph_digit_no_zero | graph_zero

    graph_symbols = pynini.string_file(
        get_abs_path("data/electronic/symbol.tsv")
    ).optimize()

    # Rewrite every symbol/digit anywhere in the string, padding with spaces.
    default_chars_symbols = pynini.cdrewrite(
        pynutil.insert(" ") + (graph_symbols | graph_digit) + pynutil.insert(" "),
        "",
        "",
        NEMO_SIGMA,
    )

    # The field prefix is stripped the same way as for `domain:` below.
    # The source had this expression masked out, which made the block
    # a syntax error.
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + default_chars_symbols
        + pynutil.delete("\"")
    )

    domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    domain = (
        default_chars_symbols
        + insert_space
        + plurals._priority_union(
            domain_common,
            pynutil.add_weight(pynini.cross(".", "dot"), weight=0.0001),
            NEMO_SIGMA,
        )
        + pynini.closure(
            insert_space
            + (pynini.cdrewrite(TO_UPPER, "", "", NEMO_SIGMA) @ default_chars_symbols),
            0,
            1,
        )
    )
    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + domain
        + delete_space
        + pynutil.delete("\"")
    ).optimize()

    protocol = (
        pynutil.delete("protocol: \"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )

    graph = (
        pynini.closure(protocol + delete_space, 0, 1)
        + pynini.closure(
            user_name + delete_space + pynutil.insert(" at ") + delete_space, 0, 1
        )
        + domain
        + delete_space
    ).optimize() @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA)

    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def get_address_graph(self, cardinal):
    """
    Build the graph that reads out a street address, e.g.:

    2788 San Tomas Expy, Santa Clara, CA 95051 ->
    units: "address" cardinal { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" } preserve_order: true

    The accepted shape is: house number read digit by digit, an optional
    one-letter direction (E/S/W/N), street words ending in an entry from
    the address-word table, an optional ".", then optional ", <city>",
    ", <state>" and "[,] <5-digit zip>" parts.

    Args:
        cardinal: cardinal tagger; its ``single_digits_graph`` is used and
            it is passed on to ``OrdinalTagger``.

    Returns:
        The address graph (not wrapped in tokens).
    """
    verbalize_ordinal = OrdinalVerbalizer().graph
    tag_ordinal = OrdinalTagger(cardinal=cardinal).graph
    # Tag an ordinal and verbalize it again right away.
    spoken_ordinal = pynini.compose(
        pynutil.insert("integer: \"") + tag_ordinal + pynutil.insert("\""),
        verbalize_ordinal,
    )

    house_number = pynini.closure(NEMO_DIGIT, 1) @ cardinal.single_digits_graph

    compass = (
        pynini.cross("E", "East")
        | pynini.cross("S", "South")
        | pynini.cross("W", "West")
        | pynini.cross("N", "North")
    )
    maybe_direction = pynini.closure(
        pynutil.add_weight(pynini.accep(NEMO_SPACE) + compass, -1), 0, 1
    )

    street_type = pynini.string_file(get_abs_path("data/address/address_words.tsv"))
    street = (
        pynini.accep(NEMO_SPACE)
        + pynini.closure(spoken_ordinal, 0, 1)
        + pynini.closure(NEMO_ALPHA | NEMO_SPACE, 1)
        + street_type
    )

    city_name = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
    maybe_city = pynini.closure(
        pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + city_name, 0, 1
    )

    state_name = pynini.invert(
        pynini.string_file(get_abs_path("data/address/states.tsv"))
    )
    maybe_state = pynini.closure(
        pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + state_name, 0, 1
    )

    five_digits = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
    maybe_zip = pynini.closure(
        pynutil.add_weight(
            pynini.closure(pynini.cross(",", ""), 0, 1)
            + pynini.accep(NEMO_SPACE)
            + five_digits,
            -100,
        ),
        0,
        1,
    )

    maybe_period = pynini.closure(pynini.cross(".", ""), 0, 1)

    return (
        house_number
        + maybe_direction
        + street
        + maybe_period
        + maybe_city
        + maybe_state
        + maybe_zip
    )
def __init__(self):
    """Build the electronic classifier for e-mail addresses and URLs.

    E-mail: spelled-out user name, one of "a còng" / "a móc" / "a vòng",
    then a server followed by one or more "chấm <domain>" parts; emits
    ``username: "..." domain: "..."``.
    URL: optional protocol ("h t t p"/"h t t p s" followed by
    " hai chấm sẹc sẹc "), "www", "chấm", a host part and one or two
    endings; emits ``protocol: "..."``.
    """
    super().__init__(name="electronic", kind="classify")

    delete_extra_space = pynutil.delete(" ")
    alpha_num = (
        NEMO_ALPHA
        | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    )

    symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert()

    accepted_username = alpha_num | symbols
    process_dot = pynini.cross("chấm", ".")

    # NOTE(review): in the source under review this expression was masked
    # to the literal 'username: "******"', so no user-name input was
    # consumed and a placeholder was emitted. It is rebuilt here as one
    # alphanumeric followed by any number of space-separated
    # alphanumerics/symbols - confirm against the upstream grammar.
    username = (
        pynutil.insert('username: "')
        + alpha_num
        + pynini.closure(delete_extra_space + accepted_username)
        + pynutil.insert('"')
    )

    single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
    server = single_alphanum | pynini.string_file(
        get_abs_path("data/electronic/server_name.tsv")
    )
    domain = single_alphanum | pynini.string_file(
        get_abs_path("data/electronic/domain.tsv")
    )

    # One or more "chấm <domain>" parts.
    multi_domain = (
        pynini.closure(process_dot + delete_extra_space + domain + delete_extra_space)
        + process_dot
        + delete_extra_space
        + domain
    )
    domain_graph = (
        pynutil.insert('domain: "')
        + server
        + delete_extra_space
        + multi_domain
        + pynutil.insert('"')
    )

    graph = (
        username
        + delete_extra_space
        + pynutil.delete(pynini.union("a còng", "a móc", "a vòng"))
        + insert_space
        + delete_extra_space
        + domain_graph
    )

    # ---- URL ----
    protocol_end = pynini.cross(pynini.union("w w w", "www"), "www")
    protocol_start = (
        pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")
    ) + pynini.cross(" hai chấm sẹc sẹc ", "://")

    # e.g. .com
    ending = (
        delete_extra_space
        + symbols
        + delete_extra_space
        + (
            domain
            | pynini.closure(accepted_username + delete_extra_space) + accepted_username
        )
    )

    protocol = (
        pynini.closure(protocol_start, 0, 1)
        + protocol_end
        + delete_extra_space
        + process_dot
        + pynini.closure(delete_extra_space + accepted_username, 1)
        + pynini.closure(ending, 1, 2)
    )
    protocol = pynutil.insert('protocol: "') + protocol + pynutil.insert('"')
    graph |= protocol

    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
def __init__(self):
    """Build the telephone classifier from digit pairs.

    A number is five space-separated pairs. The first pair is zero plus
    a digit (the zero is inserted when it is missing from the input).
    The remaining four pairs are either all read digit by digit, or each
    is a two-digit number word (teens, ties, ties-hyphen-digit) or a
    digit pair. Emits ``number_part: "..."``.
    """
    super().__init__(name="telephone", kind="classify")

    # Building blocks: tables for digits, tens and teens.
    digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    ties_unique = pynini.string_file((get_abs_path("data/numbers/ties_unique.tsv")))
    teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
    zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

    two_digit_word = pynini.union(
        teen,
        ties_unique,
        (ties + pynutil.insert("0")),
        (ties + delete_hyphen + digit),
    )

    # Leading pair: "zero <digit>", or just "<digit>" with the zero supplied.
    leading_pair = zero + delete_space + digit
    leading_pair |= pynutil.insert("0") + digit
    # Remove whatever gap follows and put back exactly one space.
    leading_pair += delete_space + insert_space

    # All digits
    any_digit = digit | zero
    digit_pair = any_digit + delete_space
    digit_pair += any_digit
    three_digit_pairs = pynini.closure(digit_pair + delete_space + insert_space, 3, 3)
    all_digits_number = leading_pair + three_digit_pairs + digit_pair

    # Paired digits
    any_pair = two_digit_word | digit_pair
    three_any_pairs = pynini.closure(any_pair + delete_space + insert_space, 3, 3)
    paired_number = leading_pair + three_any_pairs + any_pair

    number = pynini.union(all_digits_number, paired_number)
    tagged = pynutil.insert("number_part: \"") + number + pynutil.insert("\"")

    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Build the measure classifier with singular/plural unit agreement.

    Decimals and cardinals other than "un"/"una"/"uno" take a plural
    unit; "un"/"una"/"uno" (all mapped to "1") take a singular unit.
    A unit may also be "por <unit>" (rendered with a leading "/") or
    "<unit> por <unit>". "menos" sets ``negative: "true"``.

    Args:
        cardinal: provides ``graph_no_exception``.
        decimal: provides ``final_graph_wo_negative``.
    """
    super().__init__(name="measure", kind="classify")

    integer_graph = cardinal.graph_no_exception

    singular_table = pynini.string_file(get_abs_path("data/measurements_singular.tsv"))
    singular_to_abbr = pynini.invert(singular_table)  # singular -> abbr
    plural_table = pynini.string_file(get_abs_path("data/measurements_plural.tsv"))
    plural_to_abbr = pynini.invert(plural_table)  # plural -> abbr

    maybe_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("menos", "\"true\"") + delete_extra_space,
        0,
        1,
    )

    singular = convert_space(singular_to_abbr)
    plural = convert_space(plural_to_abbr)
    # "por <unit>" becomes "/<unit>"
    per_unit = (
        pynutil.insert("/")
        + pynutil.delete("por")
        + delete_space
        + convert_space(singular_to_abbr)
    )

    singular_field = (
        pynutil.insert("units: \"")
        + (
            singular
            | per_unit
            | pynutil.add_weight(singular + delete_space + per_unit, 0.01)
        )
        + pynutil.insert("\"")
    )
    plural_field = (
        pynutil.insert("units: \"")
        + (
            plural
            | per_unit
            | pynutil.add_weight(plural + delete_space + per_unit, 0.01)
        )
        + pynutil.insert("\"")
    )

    decimal_measure = (
        pynutil.insert("decimal { ")
        + maybe_negative
        + decimal.final_graph_wo_negative
        + pynutil.insert(" }")
        + delete_extra_space
        + plural_field
    )

    # Every cardinal except the three forms of "one" goes with a plural unit.
    not_one = (NEMO_SIGMA - "un" - "una" - "uno") @ integer_graph
    cardinal_measure = (
        pynutil.insert("cardinal { ")
        + maybe_negative
        + pynutil.insert("integer: \"")
        + not_one
        + pynutil.insert("\"")
        + pynutil.insert(" }")
        + delete_extra_space
        + plural_field
    )

    one = pynini.cross("un", "1") | pynini.cross("una", "1") | pynini.cross("uno", "1")
    cardinal_measure |= (
        pynutil.insert("cardinal { ")
        + maybe_negative
        + pynutil.insert("integer: \"")
        + one
        + pynutil.insert("\"")
        + pynutil.insert(" }")
        + delete_extra_space
        + singular_field
    )

    measures = decimal_measure | cardinal_measure
    self.fst = self.add_tokens(measures).optimize()
def __init__(self, cardinal: GraphFst):
    """Build the ordinal classifier.

    The end of the input is rewritten (ordinal digit/teen tables,
    "tieth" -> "ty", or a dropped "th") and the result is composed with
    the cardinal graph. Emits ``integer: "..."``.

    Args:
        cardinal: provides ``graph_no_exception``.
    """
    super().__init__(name="ordinal", kind="classify")

    integer_graph = cardinal.graph_no_exception

    digit_ordinals = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
    teen_ordinals = pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))

    ending = pynini.union(
        digit_ordinals,
        teen_ordinals,
        pynini.cross("tieth", "ty"),
        pynini.cross("th", ""),
    )
    # Any prefix passes through unchanged; only the ending is rewritten.
    to_cardinal_text = pynini.closure(NEMO_CHAR) + ending

    self.graph = to_cardinal_text @ integer_graph

    tagged = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """Build the telephone classifier with a country/area code.

    The code part is one of:
      * optional "+" (read as "plus ") followed by a two-digit number
        or zero,
      * "(0...)" read digit by digit,
      * "0..." read digit by digit.
    It is followed by a space and a number part of at least seven
    characters (digits, "-", " ", "(" or ")") read digit by digit, with
    at most one "-"/" " separator. Emits ``country_code``,
    ``number_part`` and ``preserve_order: true``.

    Args:
        cardinal: provides ``two_digit_non_zero``.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    zero = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    ).optimize()
    nonzero = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize() | pynini.cross("1", "eins")
    any_digit = nonzero | zero

    digit_by_digit = pynini.closure(any_digit + insert_space) + any_digit

    two_digits_or_zero = (NEMO_DIGIT**2 @ cardinal.two_digit_non_zero) | zero

    code = pynini.closure(pynini.cross("+", "plus "), 0, 1) + two_digits_or_zero
    code |= (
        pynutil.delete("(")
        + zero
        + insert_space
        + digit_by_digit
        + pynutil.delete(")")
    )
    code |= zero + insert_space + digit_by_digit
    code_field = pynutil.insert("country_code: \"") + code + pynutil.insert("\"")

    # "-" or " " between the two halves of the number both become one space.
    separator = pynini.cross(pynini.union("-", " "), " ")
    digits_with_separator = digit_by_digit + pynini.closure(
        separator + digit_by_digit, 0, 1
    )

    # At least seven characters from the allowed set.
    long_enough = pynini.closure(
        (NEMO_DIGIT | pynini.union("-", " ", ")", "(")), 7
    )
    number_value = pynini.compose(long_enough, digits_with_separator)
    number_field = (
        pynutil.insert("number_part: \"") + number_value + pynutil.insert("\"")
    )

    self.graph = code_field + pynini.accep(" ") + number_field

    tagged = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
    self.fst = tagged.optimize()
def _get_month_graph():
    """
    Return the optimized transducer loaded from the months table,
    e.g. march -> march
    """
    return pynini.string_file(get_abs_path("data/months.tsv")).optimize()
def __init__(self):
    """Build the whitelist classifier.

    Each entry of the whitelist table, passed through ``convert_space``,
    is wrapped in a ``name: "..."`` field. The result is stored directly
    as ``self.fst``, without ``add_tokens``.
    """
    super().__init__(name="whitelist", kind="classify")

    entries = pynini.string_file(get_abs_path("data/whitelist.tsv"))
    tagged = (
        pynutil.insert("name: \"")
        + convert_space(entries)
        + pynutil.insert("\"")
    )
    self.fst = tagged.optimize()
def __init__(self, number_names: dict, alternative_formats: dict, deterministic=False):
    """Build the ordinal classifier.

    Plain ordinals go through the ``separators`` graph and the ordinal
    number names. Marked ordinals such as "2-ая" must additionally end
    in an entry of the ordinal-endings table; the "-<ending>" mark
    itself is then deleted. Emits ``integer: "..."``.

    Args:
        number_names: must contain ``'ordinal_number_names'``.
        alternative_formats: must contain ``'one_thousand_alternative'``
            and ``'separators'``.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

    thousand_alternative = alternative_formats['one_thousand_alternative']
    separators = alternative_formats['separators']

    ordinal = number_names['ordinal_number_names']
    ordinal |= ordinal @ thousand_alternative
    plain_ordinals = separators @ ordinal

    # to handle cases like 2-ая
    endings = pynini.string_file(get_abs_path("data/numbers/ordinal_endings.tsv"))
    non_dash_run = pynini.closure(pynini.difference(NEMO_SIGMA, "-"))
    drop_mark = pynini.cdrewrite(
        pynini.cross("-" + non_dash_run, ""), "", "[EOS]", NEMO_SIGMA
    )

    number_dash_tail = (
        (separators @ ordinal).optimize() + pynini.accep("-") + non_dash_run
    ).optimize()
    ends_with_known_ending = (NEMO_SIGMA + endings).optimize()
    marked_ordinals = (
        number_dash_tail @ ends_with_known_ending @ drop_mark
    ).optimize()

    self.ordinal_numbers = plain_ordinals

    # "03" -> remove leading zeros and verbalize
    strip_zeros = pynini.closure(pynini.cross("0", ""))
    self.ordinal_numbers_with_leading_zeros = (strip_zeros + plain_ordinals).optimize()

    value = (plain_ordinals | marked_ordinals).optimize()
    tagged = pynutil.insert("integer: \"") + value + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Build the money classifier.

    Accepts a cardinal or decimal amount followed by a currency name,
    optionally followed by a trailing fractional amount (a cardinal
    padded to two digits, or "rưỡi" -> 5). Emits ``integer_part``,
    ``currency`` and ``fractional_part`` fields.

    Args:
        cardinal: provides ``graph_no_exception``.
        decimal: provides ``final_graph_wo_negative``.
    """
    super().__init__(name="money", kind="classify")

    integer_graph = cardinal.graph_no_exception
    decimal_graph = decimal.final_graph_wo_negative
    half_word = pynini.cross("rưỡi", "5")

    currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
    name_to_symbol = pynini.invert(currency_table)
    currency_field = (
        pynutil.insert("currency: \"")
        + convert_space(name_to_symbol)
        + pynutil.insert("\"")
    )

    # Two digits pass through; a single digit gets a "0" in front.
    pad_to_two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (
        pynutil.insert("0") + NEMO_DIGIT
    )

    # twelve dollars fifty, only after integer
    trailing_amount = pynutil.add_weight(integer_graph @ pad_to_two_digits, -0.7) | half_word
    maybe_cents = pynini.closure(
        delete_extra_space
        + pynutil.insert("fractional_part: \"")
        + trailing_amount
        + pynutil.insert("\""),
        0,
        1,
    )

    integer_money = (
        pynutil.insert("integer_part: \"")
        + integer_graph
        + pynutil.insert("\"")
        + delete_extra_space
        + currency_field
        + maybe_cents
    )
    decimal_money = decimal_graph + delete_extra_space + currency_field + maybe_cents

    money = integer_money | decimal_money
    self.fst = self.add_tokens(money).optimize()
def __init__(self):
    """Build the time verbalization FST.

    Strips the ``hours:``, ``minutes:`` and ``suffix:`` field markup and
    emits the hour digits, an inserted ``h`` and the optional minute digits.
    When the suffix is ``"pm"`` the hour is first mapped through
    ``data/time/hour_to_night.tsv``.
    """
    super().__init__(name="time", kind="verbalize")

    hour_to_night = pynini.string_file(get_abs_path("data/time/hour_to_night.tsv"))

    am_marker = pynutil.delete("suffix: \"am\"")
    pm_marker = pynutil.delete("suffix: \"pm\"")

    hour = (
        pynutil.delete("hours:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_DIGIT, 1, 2)
        + pynutil.delete("\"")
    )
    minute = (
        pynutil.delete("minutes:")
        + delete_extra_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_DIGIT, 1, 2)
        + pynutil.delete("\"")
    )

    def clock(hour_graph, suffix_graph):
        # Shared layout of both alternatives: hour, "h", optional minutes, suffix.
        return hour_graph + delete_extra_space + pynutil.insert("h") + minute.ques + delete_space + suffix_graph

    # The "am" suffix may be absent; the "pm" suffix is mandatory on its branch.
    day_time = clock(hour, am_marker.ques)
    night_time = clock(hour @ hour_to_night, pm_marker)

    stripped = self.delete_tokens(day_time | night_time)
    self.fst = stripped.optimize()
def __init__(self):
    """Build the telephone classification FST.

    The number layout is first described from the punctuated side
    (``ddd-ddd-dddd`` or ``(ddd)`` with an optional dash and a space,
    then ``ddd-dddd``) to a space-separated per-digit form, and is then
    inverted and wrapped in a ``number_part`` field.
    """
    super().__init__(name="telephone", kind="classify")

    drop_space = pynutil.delete(' ')
    # country code, number_part, extension
    group_gap = pynutil.insert(" ")  # between components

    digit_table = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    zero_words = pynini.cross("0", pynini.union("o", "oh", "zero"))
    digit = digit_table | zero_words

    def digit_run(count):
        # ``count`` digits, with a space inserted after every digit but the last.
        return pynini.closure(digit + insert_space, count - 1, count - 1) + digit

    dashed_prefix = digit_run(3) + pynutil.delete("-")
    bracketed_prefix = (
        pynutil.delete("(")
        + digit_run(3)
        + pynutil.delete(")")
        + pynini.closure(pynutil.delete("-"), 0, 1)
        + drop_space
    )

    layout = (
        (dashed_prefix | bracketed_prefix)
        + group_gap
        + digit_run(3)
        + pynutil.delete("-")
        + group_gap
        + digit_run(4)
    )

    # Invert so the per-digit form becomes the input side of the classifier.
    number_part = pynutil.insert("number_part: \"") + pynini.invert(layout) + pynutil.insert("\"")

    tagged = self.add_tokens(number_part)
    self.fst = tagged.optimize()
def __init__(self, cardinal, deterministic: bool = True):
    """Build the fraction classification FST.

    Accepts an optional integer part followed by ``numerator/denominator``
    (the slash may be surrounded by single spaces) and emits the fields
    ``integer_part``, ``numerator`` and ``denominator``. Input that is first
    rewritten by ``data/number/fraction.tsv`` is accepted as well.

    Args:
        cardinal: grammar whose ``graph`` verbalizes each number.
        deterministic: passed through to the base-class constructor.
    """
    super().__init__(name="fraction", kind="classify", deterministic=deterministic)
    number = cardinal.graph

    integer_field = pynutil.insert("integer_part: \"") + number + pynutil.insert("\"")

    # The slash closes the numerator's quote and supplies the separating space.
    slash = pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")
    numerator_field = pynutil.insert("numerator: \"") + number + slash

    # An ordinal-style ending on the denominator is dropped, in either case.
    lower_endings = ["rd", "th", "st", "nd"]
    all_endings = lower_endings + [ending.upper() for ending in lower_endings]
    drop_ending = pynini.closure(pynini.cross(pynini.union(*all_endings), ""), 0, 1)

    denominator_field = pynutil.insert("denominator: \"") + number + drop_ending + pynutil.insert("\"")

    fraction = numerator_field + denominator_field

    # Plain form: the integer part must be followed by a literal space.
    plain = pynini.closure(integer_field + pynini.accep(" "), 0, 1) + fraction

    # Table form: the space after the integer part may be missing and is then inserted.
    table = pynini.string_file(get_abs_path("data/number/fraction.tsv"))
    gap = pynini.accep(" ") | pynutil.insert(" ")
    via_table = pynini.closure(integer_field + gap, 0, 1) + pynini.compose(table, fraction)

    self.graph = plain | via_table
    tagged = self.add_tokens(self.graph)
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst):
    """Build the fraction classification FST.

    Emits the fields ``integer_part``, ``numerator`` and ``denominator``,
    optionally preceded by ``negative: "true"`` when the input starts with
    ``minus``. Also sets ``self.fractional`` and
    ``self.final_graph_wo_negative``.

    Args:
        cardinal: grammar whose ``graph_no_exception`` is used for numbers.
    """
    super().__init__(name="fraction", kind="classify")

    number = cardinal.graph_no_exception

    # Denominator: any characters followed by an entry of fractions.tsv,
    # the result of which is then read by the cardinal graph.
    fraction_table = pynini.string_file(get_abs_path("data/fractions.tsv"))
    self.fractional = ((pynini.closure(NEMO_CHAR) + fraction_table) @ number).optimize()

    integer_field = pynutil.insert("integer_part: \"") + number + pynutil.insert("\"")
    numerator_field = pynutil.insert("numerator: \"") + number + pynutil.insert("\"")
    denominator_field = pynutil.insert("denominator: \"") + self.fractional + pynutil.insert("\"")

    # NOTE(review): a space is inserted between numerator and denominator,
    # but none after the optional integer part -- confirm this is intended.
    optional_integer = pynini.closure(integer_field + delete_space, 0, 1)
    unsigned = optional_integer + numerator_field + delete_space + insert_space + denominator_field
    unsigned = unsigned.optimize()
    self.final_graph_wo_negative = unsigned

    optional_sign = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"true\"") + delete_extra_space,
        0,
        1,
    )

    tagged = self.add_tokens(optional_sign + unsigned)
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Build the money classification FST.

    The currency comes first in the input and is emitted as a ``currency``
    field, followed by either an ``integer_part`` field or the decimal
    graph. The amount ``1`` takes the singular currency name; every other
    integer and every decimal amount takes the plural one.

    Args:
        cardinal: grammar whose ``graph`` is used for integer amounts.
        decimal: grammar whose ``final_graph_wo_negative`` is used for
            decimal amounts.
    """
    super().__init__(name="money", kind="classify")
    number = cardinal.graph
    decimal_amount = decimal.final_graph_wo_negative

    currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
    plural_name = convert_space(currency_table @ SINGULAR_TO_PLURAL)
    singular_name = convert_space(currency_table)

    singular_field = pynutil.insert("currency: \"") + singular_name + pynutil.insert("\"")
    plural_field = pynutil.insert("currency: \"") + plural_name + pynutil.insert("\"")

    # Anything except the exact string "1" goes through the cardinal graph.
    not_one = (NEMO_SIGMA - "1") @ number
    many = plural_field + pynutil.insert(" integer_part: \"") + not_one + pynutil.insert("\"")
    exactly_one = (
        singular_field
        + pynutil.insert(" integer_part: \"")
        + pynini.cross("1", "one")
        + pynutil.insert("\"")
    )
    integer_money = many | exactly_one

    decimal_money = plural_field + insert_space + decimal_amount

    tagged = self.add_tokens(integer_money | decimal_money)
    self.fst = tagged.optimize()
def __init__(self, deterministic: bool = True):
    """Build the electronic (username / domain) verbalization FST.

    Strips the ``username:`` and ``domain:`` field markup, spells the
    content out with spaces inserted between symbols, inserts ``at ``
    after the username and rewrites ``.`` to ``dot ``. Paths that go
    through the lookup tables (digits, symbols, server names, domains)
    carry weight 1.09, the generic per-character paths carry 1.1.

    Args:
        deterministic: passed through to the base-class constructor.
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize()

    # Field prefix is removed the same way as for ``domain:`` further down:
    # label, optional whitespace, opening quote.
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + (
            pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1)
            )
        )
        + pynutil.delete("\"")
    )

    # Generic domain: spaced characters, "." read as "dot ", then at least
    # one more character, with spaces between (not after) the rest.
    domain_default = (
        pynini.closure(NEMO_NOT_QUOTE + insert_space)
        + pynini.cross(".", "dot ")
        + NEMO_NOT_QUOTE
        + pynini.closure(insert_space + NEMO_NOT_QUOTE)
    )

    # Generic server: letters/digits, optional symbols, letters/digits again.
    server_default = (
        pynini.closure((graph_digit | NEMO_ALPHA) + insert_space, 1)
        + pynini.closure(graph_symbols + insert_space)
        + pynini.closure((graph_digit | NEMO_ALPHA) + insert_space, 1)
    )
    server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")) + insert_space
    domain_common = pynini.cross(".", "dot ") + pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + (pynutil.add_weight(server_common, 1.09) | pynutil.add_weight(server_default, 1.1))
        + (pynutil.add_weight(domain_common, 1.09) | pynutil.add_weight(domain_default, 1.1))
        + delete_space
        + pynutil.delete("\"")
    )

    # The username part, with its trailing "at ", is optional.
    graph = (
        pynini.closure(user_name + delete_space + pynutil.insert("at ") + delete_space, 0, 1)
        + domain
        + delete_space
    )

    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()