def __init__(self, whitelist: 'pynini.FstLike', deterministic: bool = True):
    """Build the "abbreviation" classify grammar and store it in ``self.fst``.

    Args:
        whitelist: object whose ``graph`` attribute is an FST; any input
            string accepted by that FST is removed from this grammar.
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="abbreviation", kind="classify", deterministic=deterministic)

    # One NEMO_UPPER symbol followed by one or more NEMO_UPPER symbols,
    # with a space inserted in front of each following symbol.
    spaced_upper = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)

    # Alternative spellings; the weighted ones carry an extra weight of 110.
    lowered_run = pynutil.add_weight(
        TO_LOWER + pynini.closure(insert_space + pynini.union(TO_LOWER | NEMO_LOWER)),
        110,
    )
    upper_then_lower = pynutil.add_weight(
        pynini.closure(NEMO_UPPER, 2) + pynini.closure(insert_space + NEMO_LOWER, 1),
        110,
    )
    dotted_upper = (
        NEMO_UPPER
        + pynutil.delete(".")
        + pynini.closure(insert_space + NEMO_UPPER + pynutil.delete("."))
    )
    dotted_lowered = pynutil.add_weight(
        TO_LOWER
        + pynutil.delete(".")
        + pynini.closure(insert_space + TO_LOWER + pynutil.delete(".")),
        110,
    )
    misc_variants = lowered_run | upper_then_lower | dotted_upper | dotted_lowered

    # Give the misc graph a weight higher than that of the word grammar.
    weighted = pynutil.add_weight(spaced_upper.optimize(), 10) | pynutil.add_weight(
        misc_variants.optimize(), 101
    )

    # Exclude words that are included in the whitelist.
    allowed_inputs = pynini.difference(
        pynini.project(weighted, "input"),
        pynini.project(whitelist.graph, "input"),
    )
    filtered = pynini.compose(allowed_inputs, weighted)

    tagged = pynutil.insert("value: \"") + filtered.optimize() + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
    """Build (or restore from a FAR cache) the final verbalizer grammar.

    Args:
        deterministic: forwarded to the parent constructor and to the
            ``VerbalizeFst`` / ``WordFst`` sub-grammars; it also selects
            whether the ``tokens { ... }`` wrapper is stripped.
        cache_dir: directory for the cached ``.far`` file. ``None`` or the
            literal string ``"None"`` disables caching.
        overwrite_cache: when true, an existing cache file is ignored and
            the grammar is rebuilt and written again.
    """
    super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)

    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        far_file = os.path.join(cache_dir, f"en_tn_{deterministic}_deterministic_verbalizer.far")

    # Fast path: reuse a previously exported grammar.
    if far_file and not overwrite_cache and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["verbalize"]
        logging.info(f'VerbalizeFinalFst graph was restored from {far_file}.')
        return

    verbalizer = VerbalizeFst(deterministic=deterministic).fst
    word_fst = WordFst(deterministic=deterministic).fst
    token_types = verbalizer | word_fst

    if deterministic:
        # Strip the surrounding `tokens { ... }` wrapper.
        single_token = (
            pynutil.delete("tokens")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + token_types
            + delete_space
            + pynutil.delete("}")
        )
    else:
        single_token = delete_space + token_types + delete_space

    # Zero or more tokens each followed by delete_extra_space, then a last token.
    sentence = delete_space + pynini.closure(single_token + delete_extra_space) + single_token + delete_space
    self.fst = sentence.optimize()

    if far_file:
        generator_main(far_file, {"verbalize": self.fst})
        logging.info(f"VerbalizeFinalFst grammars are saved to {far_file}.")
def __init__(self, ordinal: GraphFst):
    """Build the "date" classify grammar and store it in ``self.fst``.

    Args:
        ordinal: grammar whose ``graph`` attribute supplies the day part.
    """
    super().__init__(name="date", kind="classify")

    year_weight = 0.001
    weighted_year = pynutil.add_weight(_get_year_graph(), year_weight)

    month_field = pynutil.insert("month: \"") + _get_month_graph() + pynutil.insert("\"")
    day_field = (
        pynutil.insert("day: \"")
        + pynutil.add_weight(ordinal.graph, -0.7)
        + pynutil.insert("\"")
    )

    maybe_day = pynini.closure(delete_extra_space + day_field, 0, 1)
    # The negative weight here cancels the weight added to the year graph above.
    maybe_year = pynini.closure(
        delete_extra_space
        + pynutil.insert("year: \"")
        + pynutil.add_weight(weighted_year, -year_weight)
        + pynutil.insert("\""),
        0,
        1,
    )

    # month [day] [year]
    month_first = month_field + maybe_day + maybe_year
    # "the" day "of" month [year]
    day_first = (
        pynutil.delete("the")
        + delete_space
        + day_field
        + delete_space
        + pynutil.delete("of")
        + delete_extra_space
        + month_field
        + maybe_year
    )
    # A year (or the range graph) on its own.
    year_only = (
        pynutil.insert("year: \"")
        + (weighted_year | _get_range_graph())
        + pynutil.insert("\"")
    )

    combined = (month_first | day_first | year_only) + pynutil.insert(" preserve_order: true")
    self.fst = self.add_tokens(combined).optimize()
def __init__(self):
    """Build the "ordinal" verbalize grammar.

    Sets ``self.suffix`` (the end-of-string rewrite rule) and ``self.fst``.
    """
    super().__init__(name="ordinal", kind="verbalize")

    digit_map = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")).invert()
    teen_map = pynini.string_file(get_abs_path("data/ordinals/teen.tsv")).invert()

    # Contents of the `integer: "..."` field with the wrapper removed.
    integer_value = (
        pynutil.delete("integer:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )

    # Candidate endings, applied only at the end of the string ("[EOS]").
    ty_to_tieth = pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
    append_th = pynutil.insert("th", weight=0.01)
    endings = digit_map | teen_map | ty_to_tieth | append_th
    suffix = pynini.cdrewrite(endings, "", "[EOS]", NEMO_SIGMA).optimize()

    self.suffix = suffix
    self.fst = self.delete_tokens(integer_value @ suffix).optimize()
def __init__(self, deterministic: bool = True):
    """Build the "word" verbalize grammar and store it in ``self.fst``.

    Args:
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="word", kind="verbalize", deterministic=deterministic)

    # One or more NEMO_CHAR symbols other than the plain space.
    non_space_run = pynini.closure(NEMO_CHAR - " ", 1)
    name_value = (
        pynutil.delete("name:")
        + delete_space
        + pynutil.delete("\"")
        + non_space_run
        + pynutil.delete("\"")
    )

    # Turn every U+00A0 in the extracted value into a regular space.
    nbsp_to_space = pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA)
    self.fst = (name_value @ nbsp_to_space).optimize()
def __init__(self):
    """Build the "ordinal" verbalize grammar that appends st/nd/rd/th."""
    super().__init__(name="ordinal", kind="verbalize")

    integer_value = (
        pynutil.delete("integer:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )

    # Explicit rewrites for 11/12/13 and 1/2/3; anything else gets a
    # weighted "th" inserted.
    endings = (
        pynini.cross("11", "11th")
        | pynini.cross("12", "12th")
        | pynini.cross("13", "13th")
        | pynini.cross("1", "1st")
        | pynini.cross("2", "2nd")
        | pynini.cross("3", "3rd")
        | pynutil.insert("th", weight=0.01)
    )
    # The rule fires only at the end of the string ("[EOS]").
    suffix_rule = pynini.cdrewrite(endings, "", "[EOS]", NEMO_SIGMA)

    self.fst = self.delete_tokens(integer_value @ suffix_rule).optimize()
def __init__(self):
    """Build the "time" classify grammar for spelled-out times."""
    super().__init__(name="time", kind="classify")
    # hours, minutes, seconds, suffix, zone, style, speak_period
    suffix_graph = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
    time_zone_graph = pynini.invert(pynini.string_file(get_abs_path("data/time_zone.tsv")))

    # only used for < 1000 thousand -> 0 weight
    cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

    def _words_in_range(start, stop):
        """Union of num_to_word(x) for x in [start, stop), composed with cardinal."""
        return pynini.union(*[num_to_word(x) for x in range(start, stop)]) @ cardinal

    graph_hour = _words_in_range(0, 24)
    graph_minute_single = _words_in_range(1, 10)
    graph_minute_double = _words_in_range(10, 60)
    graph_minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15")

    oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

    hour_field = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"")

    # Minute alternatives: nothing spoken, an "o'clock" variant, "o" + single
    # digit, or a two-digit minute.
    minute_value = (
        pynutil.insert("00")
        | (oclock + pynutil.insert("00"))
        | (pynutil.delete("o") + delete_space + graph_minute_single)
        | graph_minute_double
    )
    minute_field = pynutil.insert("minutes: \"") + minute_value + pynutil.insert("\"")

    suffix_field = pynutil.insert("suffix: \"") + convert_space(suffix_graph) + pynutil.insert("\"")
    maybe_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
    maybe_zone = pynini.closure(
        delete_space
        + insert_space
        + pynutil.insert("zone: \"")
        + convert_space(time_zone_graph)
        + pynutil.insert("\""),
        0,
        1,
    )

    # five o' clock
    # two o eight, two thirty five (am/pm)
    # two pm/am
    hour_then_minute = hour_field + delete_extra_space + minute_field

    # 10 past four, quarter past four, half past four
    minute_past_hour = (
        pynutil.insert("minutes: \"")
        + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose)
        + pynutil.insert("\"")
        + delete_space
        + pynutil.delete("past")
        + delete_extra_space
        + hour_field
    )

    combined = ((hour_then_minute | minute_past_hour) + maybe_suffix + maybe_zone).optimize()
    self.fst = self.add_tokens(combined).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """Build the "time" classify grammar for written (digit) times.

    Args:
        cardinal: grammar whose ``graph`` attribute verbalizes the numbers.
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)
    suffix_graph = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
    time_zone_graph = pynini.string_file(get_abs_path("data/time_zone.tsv"))

    # only used for < 1000 thousand -> 0 weight
    cardinal_graph = cardinal.graph

    def _digits_in_range(start, stop):
        """Union of the decimal strings for the integers in [start, stop)."""
        return pynini.union(*[str(x) for x in range(start, stop)])

    # Two digits pass through unchanged; a single digit may lose one leading "0".
    delete_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
        pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT
    )

    graph_hour = delete_leading_zero_to_double_digit @ _digits_in_range(0, 24) @ cardinal_graph
    graph_minute_single = _digits_in_range(1, 10) @ cardinal_graph
    graph_minute_double = _digits_in_range(10, 60) @ cardinal_graph

    hour_field = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"")
    # A leading "0" in the minutes becomes "o" followed by the single digit.
    minute_value = (
        pynini.cross("0", "o") + insert_space + graph_minute_single
    ) | graph_minute_double
    minute_field = pynutil.insert("minutes: \"") + minute_value + pynutil.insert("\"")

    suffix_field = pynutil.insert("suffix: \"") + convert_space(suffix_graph) + pynutil.insert("\"")
    maybe_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
    maybe_zone = pynini.closure(
        delete_space
        + insert_space
        + pynutil.insert("zone: \"")
        + convert_space(time_zone_graph)
        + pynutil.insert("\""),
        0,
        1,
    )

    # "00" minutes are dropped; any other value becomes a minutes field.
    minutes_or_nothing = pynutil.delete("00") | (insert_space + minute_field)

    # 2:30 pm, 02:30, 2:00
    colon_time = hour_field + pynutil.delete(":") + minutes_or_nothing + maybe_suffix + maybe_zone

    # 2.xx pm/am
    dot_time = (
        hour_field
        + pynutil.delete(".")
        + minutes_or_nothing
        + delete_space
        + insert_space
        + suffix_field
        + maybe_zone
    )

    # 2 pm est
    hour_only = hour_field + delete_space + insert_space + suffix_field + maybe_zone

    combined = (colon_time | hour_only | dot_time).optimize()
    self.fst = self.add_tokens(combined).optimize()
def __init__(self):
    """Build the "measure" verbalize grammar that keeps the cardinal integer."""
    super().__init__(name="measure", kind="verbalize")

    # Remove the ` cardinal { integer: "` prefix, keep the quoted value,
    # then remove the closing quote and brace.
    opening = pynutil.delete(" cardinal { integer: \"")
    value = pynini.closure(NEMO_NOT_QUOTE, 1)
    closing = pynutil.delete("\"") + delete_space + pynutil.delete("}")

    self.fst = self.delete_tokens(opening + value + closing).optimize()
def __init__(self, decimal: GraphFst):
    """Build the "money" verbalize grammar: currency value, then the number.

    Args:
        decimal: grammar whose ``numbers`` attribute verbalizes the amount.
    """
    super().__init__(name="money", kind="verbalize")

    # Contents of `currency: "..."`: one or more non-space NEMO_CHAR symbols.
    currency_value = (
        pynutil.delete("currency:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_CHAR - " ", 1)
        + pynutil.delete("\"")
    )
    amount = currency_value + delete_space + decimal.numbers

    self.fst = self.delete_tokens(amount).optimize()
def __init__(self):
    """Build the "whitelist" verbalize grammar and store it in ``self.fst``."""
    super().__init__(name="whitelist", kind="verbalize")

    # Contents of `name: "..."`: one or more non-space NEMO_CHAR symbols.
    name_value = (
        pynutil.delete("name:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_CHAR - " ", 1)
        + pynutil.delete("\"")
    )

    # Turn every U+00A0 in the extracted value into a regular space.
    nbsp_to_space = pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA)
    self.fst = (name_value @ nbsp_to_space).optimize()
def __init__(self):
    """Build the "money" classify grammar for spelled-out amounts."""
    super().__init__(name="money", kind="classify")
    # quantity, integer_part, fractional_part, currency, style(depr)

    cardinal_graph = CardinalFst().graph_no_exception
    graph_decimal_final = DecimalFst().final_graph_wo_negative

    unit = pynini.string_file(get_abs_path("data/currency.tsv"))
    unit_singular = pynini.invert(unit)
    unit_plural = get_singulars(unit_singular)

    singular_currency = pynutil.insert("currency: \"") + convert_space(unit_singular) + pynutil.insert("\"")
    plural_currency = pynutil.insert("currency: \"") + convert_space(unit_plural) + pynutil.insert("\"")

    # Two digits pass through; a single digit gets a "0" inserted in front.
    add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

    # twelve dollars (and) fifty cents, zero cents
    plural_cents = (
        pynutil.add_weight((NEMO_SIGMA - "one") @ cardinal_graph, -0.7) @ add_leading_zero_to_double_digit
        + delete_space
        + pynutil.delete("cents")
    )
    single_cent = pynini.cross("one", "01") + delete_space + pynutil.delete("cent")
    cents_standalone = (
        pynutil.insert("fractional_part: \"")
        + pynini.union(plural_cents, single_cent)
        + pynutil.insert("\"")
    )

    optional_cents_standalone = pynini.closure(
        delete_space
        + pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
        + insert_space
        + cents_standalone,
        0,
        1,
    )
    # twelve dollars fifty, only after integer
    optional_cents_suffix = pynini.closure(
        delete_extra_space
        + pynutil.insert("fractional_part: \"")
        + pynutil.add_weight(cardinal_graph, -0.7)
        + pynutil.insert("\""),
        0,
        1,
    )
    optional_cents = optional_cents_standalone | optional_cents_suffix

    # Any amount except "one" pairs with the plural currency name ...
    plural_amount = (
        pynutil.insert("integer_part: \"")
        + ((NEMO_SIGMA - "one") @ cardinal_graph)
        + pynutil.insert("\"")
        + delete_extra_space
        + plural_currency
        + optional_cents
    )
    # ... while "one" pairs with the singular currency name.
    singular_amount = (
        pynutil.insert("integer_part: \"")
        + pynini.cross("one", "1")
        + pynutil.insert("\"")
        + delete_extra_space
        + singular_currency
        + optional_cents
    )
    graph_integer = plural_amount | singular_amount

    graph_decimal = graph_decimal_final + delete_extra_space + plural_currency
    # Cents on their own are tagged as "$" with a zero integer part.
    graph_decimal |= pynutil.insert("currency: \"$\" integer_part: \"0\" ") + cents_standalone

    tagged = self.add_tokens(graph_integer | graph_decimal)
    self.fst = tagged.optimize()
def __init__(self, deterministic: bool = True):
    """Build the "money" verbalize grammar that unwraps ``integer_part``.

    Args:
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="money", kind="verbalize", deterministic=deterministic)

    # Zero or more RU_ALPHA symbols or spaces between the quotes.
    spoken_amount = pynini.closure(RU_ALPHA | " ")
    unwrapped = pynutil.delete("integer_part: \"") + spoken_amount + pynutil.delete("\"")

    self.fst = self.delete_tokens(unwrapped).optimize()
def __init__(self):
    """Build the "ordinal" verbalize grammar that prefixes the value with "thứ "."""
    super().__init__(name="ordinal", kind="verbalize")

    integer_value = (
        pynutil.delete("integer:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    prefixed = pynutil.insert("thứ ") + integer_value

    self.fst = self.delete_tokens(prefixed).optimize()
def _get_thousands_graph(graph_ties, graph_digits):
    """Return the graph for a thousands expression.

    The result is: a digit, the deleted word "nghìn" or "ngàn", a hundreds
    component, and a final teen / ties / digits component.

    Args:
        graph_ties: graph accepted as one option for the final component.
        graph_digits: graph accepted as another option for the final component.
    """
    # Either "<digit or zero> trăm" with the word removed, or an inserted "0".
    hundreds_part = (
        (graph_digit | graph_zero) + delete_space + pynutil.delete("trăm")
    ) | pynutil.insert("0")

    thousand_word = pynutil.delete(pynini.union("nghìn", "ngàn"))
    final_part = graph_teen | graph_ties | graph_digits

    return (
        graph_digit
        + delete_space
        + thousand_word
        + delete_space
        + hundreds_part
        + delete_space
        + final_part
    )
def __init__(self, deterministic: bool = True):
    """Build the "roman" verbalize grammar and store it in ``self.fst``.

    Args:
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="roman", kind="verbalize", deterministic=deterministic)

    suffix = OrdinalFst().suffix

    # Either the raw value, or the value passed through the ordinal suffix
    # rule with an optional inserted "the ".
    raw_value = pynini.closure(NEMO_NOT_QUOTE)
    optional_the = pynini.closure(pynutil.insert("the "), 0, 1)
    integer = raw_value | (optional_the + (raw_value @ suffix))

    unwrapped = pynutil.delete("integer: \"") + integer + pynutil.delete("\"")
    self.fst = self.delete_tokens(unwrapped).optimize()
def _get_thousands_graph():
    """Return the graph for "<digit> thousand [<digit> hundred] <teen | ties>"."""
    ties = _get_ties_graph()

    # Either "<digit> hundred" with the word removed, or an inserted "0".
    hundreds_part = (
        graph_digit + delete_space + pynutil.delete("hundred")
    ) | pynutil.insert("0")

    return (
        graph_digit
        + delete_space
        + pynutil.delete("thousand")
        + delete_space
        + hundreds_part
        + delete_space
        + (graph_teen | ties)
    )
def _get_two_digit_year_with_s_graph():
    """Return the optimized graph for decades written with a trailing "0s"."""
    # to handle '70s -> seventies
    optional_apostrophe = pynini.closure(pynutil.delete("'"), 0, 1)
    # Rewrite a final "y" to "ies"; the rule fires only at end of string.
    pluralize = pynini.cdrewrite(pynini.cross("y", "ies"), "", "[EOS]", NEMO_SIGMA)
    decade = pynini.compose(ties_graph + pynutil.delete("0s"), pluralize)

    return (optional_apostrophe + decade).optimize()
def __init__(self, deterministic: bool = True):
    """Build the "abbreviation" verbalize grammar that unwraps ``value``.

    Args:
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="abbreviation", kind="verbalize", deterministic=deterministic)

    # Keep the non-empty run between the quotes; delete the wrapper.
    payload = pynini.closure(NEMO_NOT_QUOTE, 1)
    unwrapped = pynutil.delete("value: \"") + payload + pynutil.delete("\"")

    self.fst = self.delete_tokens(unwrapped).optimize()
def __init__(self, deterministic: bool = True):
    """Build the "telephone" verbalize grammar that unwraps ``number_part``.

    Args:
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="telephone", kind="verbalize", deterministic=deterministic)

    # One or more RU_ALPHA symbols or spaces between the quotes.
    spoken_number = pynini.closure(RU_ALPHA | " ", 1)
    unwrapped = pynutil.delete("number_part: \"") + spoken_number + pynutil.delete("\"")

    self.fst = self.delete_tokens(unwrapped).optimize()
def __init__(self, deterministic: bool = True):
    """Build the "electronic" verbalize grammar that unwraps ``username``.

    Args:
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    # Zero or more RU_ALPHA symbols or spaces between the quotes.
    spoken_value = pynini.closure(RU_ALPHA | " ")
    unwrapped = pynutil.delete("username: \"") + spoken_value + pynutil.delete("\"")

    self.fst = self.delete_tokens(unwrapped).optimize()
def __init__(self, deterministic: bool = True):
    """Build the "ordinal" verbalize grammar that unwraps ``integer``.

    Args:
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

    # The (possibly empty) run of non-quote symbols is kept as-is.
    payload = pynini.closure(NEMO_NOT_QUOTE)
    unwrapped = pynutil.delete("integer: \"") + payload + pynutil.delete("\"")

    self.fst = self.delete_tokens(unwrapped).optimize()
def __init__(self, deterministic: bool = True):
    """Build the "cardinal" classify grammar.

    Besides ``self.fst`` this sets ``self.graph``, ``self.single_digits_graph``
    and ``self.graph_hundred_component_at_least_one_none_zero_digit``; in
    non-deterministic mode it also sets ``self.range_graph``.

    Args:
        deterministic: forwarded to the parent constructor; when false the
            digit-by-digit, hundreds and range readings are added.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    number_names = pynini.Far(get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

    # Two or three digits, or a single digit other than "0".
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3) | pynini.difference(NEMO_DIGIT, pynini.accep("0"))
    ) @ number_names

    # One to three digits, then any number of three-digit groups, each
    # optionally preceded by a "," that is deleted.
    three_digit_group = pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(three_digit_group)) @ number_names

    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

    # One digit read on its own; "0" may additionally be read as "oh".
    one_digit = pynutil.add_weight(pynini.invert(graph_digit | graph_zero), 1.2) | pynutil.add_weight(
        pynini.cross("0", "oh"), 1.1
    )
    self.single_digits_graph = one_digit + pynini.closure(pynutil.insert(" ") + one_digit)

    if not deterministic:
        leading_groups = pynini.closure(self.single_digits_graph + pynutil.insert(" "), 1, 3)
        comma_triplet = (
            pynutil.delete(",")
            + one_digit
            + pynutil.insert(" ")
            + one_digit
            + pynutil.insert(" ")
            + one_digit
        )
        single_digits_graph_with_commas = leading_groups + pynini.closure(comma_triplet, 1)

        self.graph |= self.single_digits_graph | get_hundreds_graph() | single_digits_graph_with_commas

        # NOTE(review): the range graph is assumed to be built only in
        # non-deterministic mode (it is only consumed under that flag
        # below) -- confirm against the original layout.
        dash_reading = pynini.cross("-", " to ") | pynini.cross("-", " ")
        by_reading = pynini.cross("x", " by ") | pynini.cross(" x ", " by ")
        self.range_graph = (
            pynini.closure(pynutil.insert("from "), 0, 1) + self.graph + dash_reading + self.graph
        )
        self.range_graph |= self.graph + by_reading + self.graph
        self.range_graph = self.range_graph.optimize()

    # A leading "-" becomes the `negative: "true" ` field.
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    readings = self.graph | pynutil.add_weight(self.get_serial_graph(), 1.2)
    if not deterministic:
        readings |= self.range_graph

    tagged = optional_minus_graph + pynutil.insert("integer: \"") + readings + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self):
    """Build the "electronic" classify grammar for spoken e-mails and URLs."""
    super().__init__(name="electronic", kind="classify")

    # Local single-space deleter (shadows any module-level name).
    delete_extra_space = pynutil.delete(" ")

    alpha_num = (
        NEMO_ALPHA
        | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    )
    symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert()
    accepted_username = alpha_num | symbols
    process_dot = pynini.cross("punto", ".")

    # Starts and ends with alpha_num; symbols are allowed in between; the
    # separating spaces are deleted.
    username = (
        pynutil.insert("username: \"")
        + alpha_num
        + delete_extra_space
        + pynini.closure(accepted_username + delete_extra_space)
        + alpha_num
        + pynutil.insert("\"")
    )

    single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
    server = single_alphanum | pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).invert()
    domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv")).invert()

    domain_graph = (
        pynutil.insert("domain: \"")
        + server
        + delete_extra_space
        + process_dot
        + delete_extra_space
        + domain
        + pynutil.insert("\"")
    )

    # <username> arroba <server> punto <domain>
    email = (
        username
        + delete_extra_space
        + pynutil.delete("arroba")
        + insert_space
        + delete_extra_space
        + domain_graph
    )

    ############# url ###
    protocol_end = pynini.cross(pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www")
    http_spoken = pynini.cross(pynini.union("http", "h t t p", "hache te te pe"), "http")
    https_spoken = pynini.cross(pynini.union("https", "h t t p s", "hache te te pe ese"), "https")
    protocol_start = (http_spoken | https_spoken) + pynini.cross(" dos puntos barra barra ", "://")

    # e.g. .com, .es
    ending = (
        delete_extra_space
        + symbols
        + delete_extra_space
        + (domain | (pynini.closure(accepted_username + delete_extra_space) + accepted_username))
    )

    url_body = (
        pynini.closure(protocol_start, 0, 1)
        + protocol_end
        + delete_extra_space
        + process_dot
        + delete_extra_space
        + (pynini.closure(delete_extra_space + accepted_username, 1) | server)
        + pynini.closure(ending, 1)
    )
    protocol = pynutil.insert("protocol: \"") + url_body + pynutil.insert("\"")
    ########

    self.fst = self.add_tokens(email | protocol).optimize()
def __init__(self, measure: GraphFst, deterministic: bool = False):
    """Build the "serial" verbalize grammar and store it in ``self.fst``.

    Args:
        measure: grammar whose ``graph_cardinal`` attribute verbalizes the
            number part.
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="serial", kind="verbalize", deterministic=deterministic)

    # The `units: "serial"` marker is consumed and produces no output.
    serial_marker = (
        pynutil.delete("units: \"")
        + pynini.cross("serial", "")
        + pynutil.delete("\"")
        + delete_space
    )
    verbalized = measure.graph_cardinal + delete_space + serial_marker

    self.fst = self.delete_tokens(verbalized).optimize()
def __init__(self):
    """Build the "decimal" verbalize grammar.

    Sets ``self.numbers`` (integer / fractional / quantity, without the
    sign) and ``self.fst`` (the same graph with an optional sign, passed
    through ``delete_tokens``).
    """
    super().__init__(name="decimal", kind="verbalize")

    def _quoted_value(label):
        """Delete `label`, optional space and the quotes; keep the value."""
        return (
            pynutil.delete(label)
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )

    # `negative: "true"` is rewritten to a leading "-".
    optional_sign = pynini.closure(pynini.cross("negative: \"true\"", "-") + delete_space, 0, 1)

    optional_integer = pynini.closure(_quoted_value("integer_part:") + delete_space, 0, 1)
    # The fractional value is preceded by an inserted ".".
    fractional = pynutil.insert(".") + _quoted_value("fractional_part:")
    optional_fractional = pynini.closure(fractional + delete_space, 0, 1)
    # The quantity value is preceded by an inserted space.
    optional_quantity = pynini.closure(
        pynutil.insert(" ") + _quoted_value("quantity:") + delete_space, 0, 1
    )

    unsigned = optional_integer + optional_fractional + optional_quantity
    self.numbers = unsigned

    self.fst = self.delete_tokens(optional_sign + unsigned).optimize()
def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool = True):
    """Build the "measure" verbalize grammar and store it in ``self.fst``.

    Args:
        decimal: grammar whose ``numbers`` attribute verbalizes decimals.
        cardinal: grammar providing ``optional_sign`` and ``numbers``.
        fraction: grammar whose ``graph`` attribute verbalizes fractions.
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="measure", kind="verbalize", deterministic=deterministic)

    sign = cardinal.optional_sign

    # Contents of `units: "..."`: one or more non-space NEMO_CHAR symbols.
    unit = (
        pynutil.delete("units: \"")
        + pynini.closure(NEMO_CHAR - " ", 1)
        + pynutil.delete("\"")
        + delete_space
    )

    def _signed_block(opening, numbers):
        """`opening` ... `}` wrapper around an optional sign and `numbers`."""
        return (
            pynutil.delete(opening)
            + delete_space
            + sign
            + delete_space
            + numbers
            + delete_space
            + pynutil.delete("}")
        )

    graph_decimal = _signed_block("decimal {", decimal.numbers)
    graph_cardinal = _signed_block("cardinal {", cardinal.numbers)
    graph_fraction = (
        pynutil.delete("fraction {")
        + delete_space
        + fraction.graph
        + delete_space
        + pynutil.delete("}")
    )

    # Number first, then the unit.
    number_then_unit = (
        (graph_cardinal | graph_decimal | graph_fraction) + delete_space + insert_space + unit
    )

    # SH adds "preserve_order: true" by default
    preserve_order = (
        pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
    )
    # Unit first, then the number, followed by any preserve_order markers.
    unit_then_number = (
        unit
        + insert_space
        + (graph_cardinal | graph_decimal)
        + delete_space
        + pynini.closure(preserve_order)
    )

    self.fst = self.delete_tokens(number_then_unit | unit_then_number).optimize()
def __init__(self):
    """Build the "electronic" verbalize grammar and store it in ``self.fst``.

    Accepts either ``username: "..." domain: "..."`` (emitted as
    ``<username>@<domain>``) or ``protocol: "..."`` (emitted unchanged).
    """
    super().__init__(name="electronic", kind="verbalize")

    # NOTE(review): the username field previously contained a masking
    # artifact that did not parse; it has been rebuilt to follow the same
    # label / optional space / quoted value layout as the two fields below.
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    protocol = (
        pynutil.delete("protocol:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )

    # E-mail form: the two values are joined by an inserted "@".
    graph = user_name + delete_space + pynutil.insert("@") + domain
    # URL form: the protocol value on its own.
    graph |= protocol

    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def __init__(self):
    """Build the "electronic" classify grammar for spoken e-mails and URLs.

    The e-mail form is ``<username> (a còng | a móc | a vòng) <server>
    chấm <domain>...``; the URL form starts with an optional spoken
    protocol followed by "www".
    """
    super().__init__(name="electronic", kind="classify")

    # Local single-space deleter (shadows any module-level name).
    delete_extra_space = pynutil.delete(" ")

    alpha_num = (
        NEMO_ALPHA
        | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    )
    symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert()
    accepted_username = alpha_num | symbols
    process_dot = pynini.cross("chấm", ".")

    # NOTE(review): the username sub-graph previously held only a masked
    # literal (`username: "******"`) that consumed no input, so the e-mail
    # branch could never match a spoken username. It is rebuilt here from
    # the pieces defined above: it starts and ends with alpha_num, allows
    # symbols in between, and deletes the separating spaces. Confirm the
    # exact shape against the upstream grammar.
    username = (
        pynutil.insert('username: "')
        + alpha_num
        + delete_extra_space
        + pynini.closure(accepted_username + delete_extra_space)
        + alpha_num
        + pynutil.insert('"')
    )

    single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
    server = single_alphanum | pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
    domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    # One or more "chấm <domain>" segments; spaces between them are deleted.
    multi_domain = (
        pynini.closure(process_dot + delete_extra_space + domain + delete_extra_space)
        + process_dot
        + delete_extra_space
        + domain
    )
    domain_graph = (
        pynutil.insert('domain: "')
        + server
        + delete_extra_space
        + multi_domain
        + pynutil.insert('"')
    )

    graph = (
        username
        + delete_extra_space
        + pynutil.delete(pynini.union("a còng", "a móc", "a vòng"))
        + insert_space
        + delete_extra_space
        + domain_graph
    )

    ############# url ###
    protocol_end = pynini.cross(pynini.union("w w w", "www"), "www")
    protocol_start = (
        pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")
    ) + pynini.cross(" hai chấm sẹc sẹc ", "://")

    # .com,
    ending = (
        delete_extra_space
        + symbols
        + delete_extra_space
        + (domain | (pynini.closure(accepted_username + delete_extra_space) + accepted_username))
    )

    protocol = (
        pynini.closure(protocol_start, 0, 1)
        + protocol_end
        + delete_extra_space
        + process_dot
        + pynini.closure(delete_extra_space + accepted_username, 1)
        + pynini.closure(ending, 1, 2)
    )
    protocol = pynutil.insert('protocol: "') + protocol + pynutil.insert('"')
    graph |= protocol
    ########

    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
def __init__(self):
    """Build the final verbalizer over a sequence of ``tokens { ... }`` blocks.

    The resulting graph is stored in ``self.fst`` without being optimized.
    """
    super().__init__(name="verbalize_final", kind="verbalize")

    verbalizer = VerbalizeFst().fst
    word_fst = WordFst().fst
    token_types = verbalizer | word_fst

    # Strip the `tokens { ... }` wrapper around one verbalized token.
    single_token = (
        pynutil.delete("tokens")
        + delete_space
        + pynutil.delete("{")
        + delete_space
        + token_types
        + delete_space
        + pynutil.delete("}")
    )

    # Zero or more tokens each followed by delete_extra_space, then a last token.
    self.fst = (
        delete_space
        + pynini.closure(single_token + delete_extra_space)
        + single_token
        + delete_space
    )