def get_alternative_formats():
    """
    Utils to get alternative formats for numbers.

    Returns:
        dict with two optimized FSTs:
            'one_thousand_alternative': rewrite rule, anchored at the beginning of the
                string, built from the rows of cardinals_alternatives.tsv
            'separators': weighted union of the 'dot_thousands', 'no_delimiter' and
                'space_thousands' grammars stored in universal_thousands_punct.far
    """
    alternatives_path = get_abs_path('data/numbers/cardinals_alternatives.tsv')
    # Every row unpacks to (default, alternative); the second whitespace-separated
    # token of the alternative is mapped onto the whole alternative phrase.
    rewrite_pairs = [
        (alternative.split()[1], alternative)
        for _default, alternative in load_labels(alternatives_path)
    ]
    rewrite_rule = pynini.cdrewrite(
        pynini.string_map(rewrite_pairs), "[BOS]", "", NEMO_SIGMA)

    # Adapted from
    # https://github.com/google/TextNormalizationCoveringGrammars/blob/master/src/universal/thousands_punct.grm
    # Specifies common ways of delimiting thousands in digit strings.
    far = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    # The undelimited form carries the lowest weight of the three options.
    with_dots = pynutil.add_weight(far['dot_thousands'], 0.1)
    undelimited = pynutil.add_weight(far['no_delimiter'], -0.1)
    with_spaces = pynutil.add_weight(far['space_thousands'], 0.1)
    separators = with_dots | undelimited | with_spaces

    return {
        'one_thousand_alternative': rewrite_rule.optimize(),
        'separators': separators.optimize(),
    }
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """
    Build the whitelist classifier.

    Args:
        input_case: forwarded to the whitelist loader (see the note inside the helper)
        deterministic: forwarded unchanged to the parent constructor
        input_file: optional path to a whitelist file; when truthy, the graph built
            from it replaces the one built from data/whitelist.tsv
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file):
        rows = load_labels(file)
        # NOTE(review): the first column is lowercased for every value of
        # `input_case`, so the argument currently has no effect on the result.
        lowered_rows = [[row[0].lower()] + row[1:] for row in rows]
        return pynini.string_map(lowered_rows)

    # The default whitelist is always loaded first, then overridden if requested.
    graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist.tsv"))
    if input_file:
        graph = _get_whitelist_graph(input_case, input_file)

    units_graph = _get_whitelist_graph(input_case, file=get_abs_path("data/measurements.tsv"))
    # do not replace single letter units, like `м`; `°` and `%` will be replaced:
    # only inputs of two or more characters, or a single non-Russian-alphabet
    # character, are passed on to the units map.
    replaceable_input = NEMO_CHAR ** (2, ...) | pynini.difference(NEMO_CHAR, RU_ALPHA)
    units_graph = pynini.compose(replaceable_input, units_graph)
    graph |= units_graph.optimize()

    # One or more TO_CYRILLIC matches, with a space inserted between consecutive ones.
    graph |= TO_CYRILLIC + pynini.closure(pynutil.insert(" ") + TO_CYRILLIC)

    self.final_graph = convert_space(graph)
    tagged = pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = False):
    """
    Build the decimal classifier for numbers written with a "," delimiter.

    Args:
        cardinal: cardinal grammar; its `cardinal_numbers_default`,
            `cardinal_numbers_with_leading_zeros` and `optional_graph_negative`
            graphs are reused here
        deterministic: forwarded unchanged to the parent constructor

    Sets `integer_part`, `graph_fractional`, `optional_quantity`, `final_graph`, `fst`.
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)

    whole_number = cardinal.cardinal_numbers_default
    zero_padded_number = cardinal.cardinal_numbers_with_leading_zeros

    # "," is removed and replaced by the delimiter phrase, optionally followed by " и".
    delimiter_map = prepare_labels_for_insertion(get_abs_path("data/numbers/decimal_delimiter.tsv"))
    optional_and = pynini.closure(pynutil.add_weight(pynutil.insert(" и"), 0.5), 0, 1)
    delimiter = (pynini.cross(",", "") + delimiter_map['@@decimal_delimiter@@'] + optional_and).optimize()

    decimal_endings_map = prepare_labels_for_insertion(get_abs_path("data/numbers/decimal_endings.tsv"))

    self.integer_part = whole_number + delimiter
    graph_integer = pynutil.insert("integer_part: \"") + self.integer_part + pynutil.insert("\"")

    # One branch per fractional length (1 to 4 digits); each picks the ending
    # keyed by the matching power of ten.
    exact_digits = NEMO_DIGIT
    graph_fractional = exact_digits @ zero_padded_number + decimal_endings_map['10']
    for ending_key in ('100', '1000', '10000'):
        exact_digits = exact_digits + NEMO_DIGIT
        graph_fractional |= exact_digits @ zero_padded_number + decimal_endings_map[ending_key]

    self.optional_quantity = pynini.string_file(get_abs_path("data/numbers/quantity.tsv")).optimize()
    self.graph_fractional = graph_fractional

    fractional_field = pynutil.insert("fractional_part: \"") + graph_fractional + pynutil.insert("\"")

    # An existing space before the quantity is weighted below an inserted one.
    space_before_quantity = pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space
    quantity_field = pynini.closure(
        space_before_quantity
        + pynutil.insert("quantity: \"")
        + self.optional_quantity
        + pynutil.insert("\""),
        0,
        1,
    )

    self.final_graph = (
        cardinal.optional_graph_negative + graph_integer + insert_space + fractional_field + quantity_field
    )
    self.final_graph = self.add_tokens(self.final_graph)
    self.fst = self.final_graph.optimize()
def __init__(self, number_names: dict, alternative_formats: dict, deterministic=False):
    """
    Build the ordinal classifier.

    Args:
        number_names: dict providing the 'ordinal_number_names' graph
        alternative_formats: dict providing the 'one_thousand_alternative' and
            'separators' graphs
        deterministic: forwarded unchanged to the parent constructor

    Sets `ordinal_numbers`, `ordinal_numbers_with_leading_zeros` and `fst`.
    """
    super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

    one_thousand_alternative = alternative_formats['one_thousand_alternative']
    separators = alternative_formats['separators']

    ordinal = number_names['ordinal_number_names']
    ordinal |= ordinal @ one_thousand_alternative
    ordinal_numbers = separators @ ordinal

    # to handle cases like 2-ая
    endings = pynini.string_file(get_abs_path("data/numbers/ordinal_endings.tsv"))
    not_dash = pynini.closure(pynini.difference(NEMO_SIGMA, "-"))
    # Drops the "-<suffix>" tail at the end of the string.
    del_ending = pynini.cdrewrite(pynini.cross("-" + not_dash, ""), "", "[EOS]", NEMO_SIGMA)

    number_dash_suffix = ((separators @ ordinal).optimize() + pynini.accep("-") + not_dash).optimize()
    ends_with_listed_ending = (NEMO_SIGMA + endings).optimize()
    ordinal_numbers_marked = (number_dash_suffix @ ends_with_listed_ending @ del_ending).optimize()

    self.ordinal_numbers = ordinal_numbers

    # "03" -> remove leading zeros and verbalize
    strip_zeros = pynini.closure(pynini.cross("0", ""))
    self.ordinal_numbers_with_leading_zeros = (strip_zeros + ordinal_numbers).optimize()

    either_form = (ordinal_numbers | ordinal_numbers_marked).optimize()
    tagged = pynutil.insert("integer: \"") + either_form + pynutil.insert("\"")
    tagged = self.add_tokens(tagged)
    self.fst = tagged.optimize()
def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
    """
    Load a whitelist file and compile it into a string-map FST.

    Args:
        input_case: accepted for interface compatibility (see the note below)
        file: path of the label file, resolved through get_abs_path

    Returns:
        FST built by pynini.string_map from the loaded rows
    """
    rows = load_labels(get_abs_path(file))
    # NOTE(review): the first column is lowercased for every value of
    # `input_case`, so the argument currently has no effect on the result.
    lowered_rows = [[row[0].lower()] + row[1:] for row in rows]
    return pynini.string_map(lowered_rows)
def __init__(self, number_names: dict, alternative_formats: dict, deterministic: bool = False):
    """
    Build the cardinal classifier.

    Args:
        number_names: forwarded to `self.get_cardinal_numbers`
        alternative_formats: forwarded to `self.get_cardinal_numbers`
        deterministic: forwarded unchanged to the parent constructor

    Sets `cardinal_numbers_default`, `cardinal_numbers_nominative`,
    `optional_graph_negative`, `cardinal_numbers_with_optional_negative`,
    `cardinal_numbers_with_leading_zeros`, `single_digits_graph`, `final_graph`, `fst`.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    self.cardinal_numbers_default = self.get_cardinal_numbers(number_names, alternative_formats, mode="all")
    self.cardinal_numbers_nominative = self.get_cardinal_numbers(
        number_names, alternative_formats, mode="nominative"
    )

    # A leading "-" becomes the field `negative: "true"` followed by a space.
    minus_field = pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space
    self.optional_graph_negative = pynini.closure(minus_field, 0, 1)

    self.cardinal_numbers_with_optional_negative = (
        self.optional_graph_negative
        + pynutil.insert("integer: \"")
        + self.cardinal_numbers_default
        + pynutil.insert("\"")
    )

    # "03" -> remove leading zeros and verbalize
    strip_zeros = pynini.closure(pynini.cross("0", ""))
    self.cardinal_numbers_with_leading_zeros = (strip_zeros + self.cardinal_numbers_default).optimize()

    # "123" -> "один два три"
    one_digit = pynini.compose(NEMO_DIGIT, self.cardinal_numbers_nominative)
    self.single_digits_graph = one_digit + pynini.closure(insert_space + one_digit)

    quantity_words = pynini.string_file(get_abs_path("data/numbers/quantity.tsv")).optimize()
    quantity_field = pynutil.insert("quantity: \"") + quantity_words + pynutil.insert("\"")
    # An existing space before the quantity is weighted below an inserted one.
    optional_quantity = pynini.closure(
        (pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space) + quantity_field, 0, 1
    )

    serial_graph = self.get_serial_graph()

    regular_number = (
        self.optional_graph_negative
        + pynutil.insert("integer: \"")
        + self.cardinal_numbers_with_leading_zeros
        + pynutil.insert("\"")
        + optional_quantity
    ).optimize()
    final_graph = pynutil.add_weight(regular_number, -0.1)

    # Digit-by-digit and serial readings carry a much higher weight (10).
    final_graph |= (
        pynutil.insert("integer: \"")
        + pynutil.add_weight(self.single_digits_graph | serial_graph, 10)
        + pynutil.insert("\"")
    )
    self.final_graph = final_graph

    # to cover cases "2-х" -> "двух" (this is not covered by ordinal endings):
    # strip the "-х" suffix after the digits, run the result through the graph
    # built so far, and keep only outputs containing `х"`.
    digits_then_suffix = NEMO_DIGIT ** (1, ...) + pynini.cross('-х', '')
    contains_kh_quote = NEMO_SIGMA + pynini.accep("х\"") + NEMO_SIGMA
    final_graph |= pynini.compose(pynini.compose(digits_then_suffix, final_graph), contains_kh_quote)

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the whitelist classifier from the inverted data/whitelist.tsv mapping.

    Args:
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    whitelist_path = get_abs_path("data/whitelist.tsv")
    # The file's mapping is inverted, so its second column is the accepted input.
    inverted_whitelist = pynini.string_file(whitelist_path).invert()

    name_field = pynutil.insert("name: \"") + convert_space(inverted_whitelist) + pynutil.insert("\"")
    self.fst = name_field.optimize()
def get_alternative_formats():
    """
    Utils to get alternative formats for numbers.

    Returns:
        dict with two FSTs (returned without a final optimize call):
            'one_thousand_alternative': rewrite rule, anchored at the beginning of the
                string, built from the rows of cardinals_alternatives.tsv
            'separators': weighted union of the 'dot_thousands', 'no_delimiter' and
                'space_thousands' grammars stored in universal_thousands_punct.far
    """
    label_rows = load_labels(get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    # Every row unpacks to (default, alternative); the second whitespace-separated
    # token of the alternative is mapped onto the whole alternative phrase.
    token_to_phrase = [
        (alternative.split()[1], alternative) for _default, alternative in label_rows
    ]
    one_thousand_alternative = pynini.cdrewrite(
        pynini.string_map(token_to_phrase), "[BOS]", "", NEMO_SIGMA)

    punct_far = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    # The undelimited form carries the lowest weight of the three options.
    separators = (
        pynutil.add_weight(punct_far['dot_thousands'], 0.1)
        | pynutil.add_weight(punct_far['no_delimiter'], -0.1)
        | pynutil.add_weight(punct_far['space_thousands'], 0.1)
    )

    return {
        'one_thousand_alternative': one_thousand_alternative,
        'separators': separators,
    }
def get_number_names():
    """
    Creates numbers names.

    Based on:
        1) Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
           Transactions of the Association for Computational Linguistics 4: 507-519.
        and
        2) Ng, A. H., Gorman, K., and Sproat, R. 2017. Minimally supervised
           written-to-spoken text normalization. In ASRU, pages 665-670.

    Returns:
        dict of optimized FSTs under the keys 'ordinal_number_names',
        'cardinal_number_names' and 'nominative_up_to_thousand_names'
    """

    def _space_joined(leading_words, last_word):
        # Zero or more "<leading word> " groups followed by exactly one last word.
        return (pynini.closure(leading_words + pynini.accep(" ")) + last_word).optimize()

    arithmetic_far = pynini.Far(get_abs_path('data/utils/util_arithmetic.far'), mode='r')
    delta_star = arithmetic_far['DELTA_STAR']
    restricted = arithmetic_far['IARITHMETIC_RESTRICTED']
    g = pynini.Fst.read(get_abs_path('data/utils/g.fst'))
    # Factorization graph: the restricted arithmetic transducer is applied three times on top of g.
    fg = (delta_star @ (restricted @ (restricted @ (restricted @ g).optimize()).optimize()).optimize()).optimize()
    assert rewrite.top_rewrite("230", fg) == "(+ 200 30 +)"

    # Compiles lexicon transducers (L), one per grammatical case; nominative comes first.
    case_lexicon_files = (
        "data/numbers/1_cardinals_nominative_именительный.tsv",
        "data/numbers/2_cardinals_genitive_родительный.tsv",
        "data/numbers/3_cardinals_dative_датильный.tsv",
        "data/numbers/4_cardinals_accusative_винительный.tsv",
        "data/numbers/5_cardinals_instrumental_творительный.tsv",
        "data/numbers/6_cardinals_prepositional_предложный.tsv",
    )
    case_lexicons = [
        pynini.string_file(get_abs_path(lexicon_file)).optimize() for lexicon_file in case_lexicon_files
    ]
    cardinal_name_nominative = case_lexicons[0]

    # Within one number every word comes from the same case lexicon.
    cardinal_l = _space_joined(cardinal_name_nominative, cardinal_name_nominative)
    for case_lexicon in case_lexicons[1:]:
        cardinal_l |= _space_joined(case_lexicon, case_lexicon)

    # Numbers up to 1000 in nominative case (to use, for example, with telephone)
    nominative_up_to_thousand_name = pynini.string_file(
        get_abs_path("data/numbers/cardinals_nominative_case.tsv"))
    nominative_up_to_thousand_name_l = _space_joined(
        nominative_up_to_thousand_name, nominative_up_to_thousand_name)

    # Convert e.g. "(* 5 1000 *)" back to "5000" so complex ordinals will be formed correctly,
    # e.g. "пятитысячный" will eventually be formed. (If we didn't do this, the incorrect phrase
    # "пять тысячный" would be formed).
    # We do this for all thousands from "(* 2 1000 *)" -> "2000" to "(* 20 1000 *)" -> "20000".
    # We do not go higher, in order to prevent the WFST graph becoming even larger.
    complex_numbers = pynini.cross("(* 2 1000 *)", "2000")
    for multiplier in range(3, 21):
        complex_numbers |= pynini.cross(f"(* {multiplier} 1000 *)", f"{multiplier}000")
    trailing_markup = pynini.closure(pynini.union(" ", ")", "(", "+", "*"))
    complex_numbers = NEMO_SIGMA + pynutil.add_weight(complex_numbers, -1) + trailing_markup

    # The collapsed-thousands path is weighted below the plain factorization, which stays available.
    fg_ordinal = pynutil.add_weight(pynini.compose(fg, complex_numbers), -1) | fg

    ordinal_name = pynini.string_file(get_abs_path("data/numbers/ordinals.tsv"))
    # Only the last word is taken from the ordinal lexicon; preceding words are nominative cardinals.
    ordinal_l = _space_joined(cardinal_name_nominative, ordinal_name)

    # Composes L with the leaf transducer (P), then composes that with FG.
    leaves = arithmetic_far['LEAVES']

    number_names = {}
    number_names['ordinal_number_names'] = (fg_ordinal @ (leaves @ ordinal_l)).optimize()
    number_names['cardinal_number_names'] = (fg @ (leaves @ cardinal_l)).optimize()
    number_names['nominative_up_to_thousand_names'] = (fg @ (leaves @ nominative_up_to_thousand_name_l)).optimize()
    return number_names
("Ё́", "Е'"), ("И́", "И'"), ("О́", "О'"), ("У́", "У'"), ("Ы́", "Ы'"), ("Э́", "Э'"), ("Ю́", "Ю'"), ("Я́", "Я'"), ("а́", "а'"), ("е́", "е'"), ("ё́", "е'"), ("и́", "и'"), ("о́", "о'"), ("у́", "у'"), ("ы́", "ы'"), ("э́", "э'"), ("ю́", "ю'"), ("я́", "я'"), ("ё", "е"), ("Ё", "Е"), ] REWRITE_STRESSED = pynini.closure(pynini.string_map(RU_STRESSED_MAP).optimize() | RU_ALPHA).optimize() TO_LATIN = pynini.string_file(get_abs_path("data/cyrillic_to_latin.tsv")) RU_ALPHA_OR_SPACE = pynini.union(RU_ALPHA, NEMO_SPACE, NEMO_NON_BREAKING_SPACE).optimize() except (ModuleNotFoundError, ImportError): # Create placeholders RU_ALPHA = None LO_LATIN = None
def __init__(self, number_names: dict, deterministic: bool):
    """
    Build the date classifier: a tagger composed with a verbalizer.

    Args:
        number_names: dict providing the 'ordinal_number_names' graph
        deterministic: forwarded unchanged to the parent constructor

    Sets `final_graph` (tagger composed with verbalizer) and `fst`.
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    # Ru format: DD-MM-YYYY or DD-MM-YY
    month_abbr_to_names = pynini.string_file(get_abs_path("data/months/abbr_to_name.tsv")).optimize()

    # Any of ".", "/" or "-" becomes a space; "." carries a slightly lower weight.
    dot_sep = pynutil.add_weight(pynini.cross(".", " "), 1.09)
    slash_or_dash_sep = pynutil.add_weight(pynini.cross(pynini.union("/", "-"), " "), 1.1)
    delete_sep = dot_sep | slash_or_dash_sep

    numbers = number_names['ordinal_number_names']

    # A leading zero is either dropped (lower weight) or read out as "ноль ".
    zero = (pynutil.add_weight(pynini.cross("0", ""), -0.1)) | (
        pynutil.add_weight(pynini.cross("0", "ноль "), 0.1))
    zero_digit = zero + pynini.compose(NEMO_DIGIT, numbers)

    day_digits = (pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT
    digit_day = pynini.compose(day_digits, numbers)
    tag_day = (pynutil.insert("day: \"") + (zero_digit | digit_day) + pynutil.insert("\"")).optimize()

    digit_month = zero_digit | pynini.compose(pynini.accep("1") + NEMO_DIGIT, numbers)

    month_number_to_abbr = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()
    # Month digits: an optional leading "0" is removed, or "1" is kept, before the last digit.
    month_digits = (
        ((pynutil.add_weight(pynini.cross("0", ""), -0.1) | pynini.accep("1")) + NEMO_DIGIT) | NEMO_DIGIT
    ).optimize()
    month_number_to_abbr = (month_digits @ month_number_to_abbr).optimize()

    month_name = (
        (month_number_to_abbr @ month_abbr_to_names) | pynutil.add_weight(month_abbr_to_names, 0.1)
    ).optimize()
    tag_month = (pynutil.insert("month: \"") + (month_name | digit_month) + pynutil.insert("\"")).optimize()

    # Years are written with four or two digits, or as a zero-prefixed digit.
    tag_year_number = pynini.compose(((NEMO_DIGIT**4) | (NEMO_DIGIT**2)), numbers).optimize()
    tag_year_number |= zero_digit

    year_word_singular = ["год", "года", "году", "годом", "годе"]
    # NOTE(review): "годам" is listed twice here; kept as is, confirm the intended form.
    year_word_plural = [
        "годы", "годов", "годам", "годами", "годам", "годах"
    ]
    year_word = pynini.cross("г.", pynini.union(*year_word_singular))
    year_word |= pynini.cross("гг.", pynini.union(*year_word_plural))
    space_before_year_word = (
        pynutil.add_weight(insert_space, -0.1) | pynutil.add_weight(pynini.accep(" "), 0.1))
    year_word = space_before_year_word + year_word

    tag_year = (
        pynutil.insert("year: \"") + tag_year_number + pynini.closure(year_word, 0, 1) + pynutil.insert("\""))
    tag_year_optional = pynini.closure(delete_sep + tag_year, 0, 1).optimize()

    # A year on its own is only tagged when the year word follows it.
    year_only = pynutil.insert("year: \"") + tag_year_number + year_word + pynutil.insert("\"")

    tagger_graph = (tag_day + delete_sep + tag_month + tag_year_optional) | year_only

    # Verbalizer
    def _field_value(field_tag):
        # Removes `<field_tag> "` and the closing quote, keeping the quoted content.
        return (
            pynutil.delete(field_tag)
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )

    verb_day = _field_value("day:")
    verb_month = _field_value("month:")
    # Unlike day and month, the year field applies delete_space before its closing quote.
    verb_year = (
        pynutil.delete("year:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + delete_space
        + pynutil.delete("\"")
    )
    verb_year_optional = pynini.closure(delete_extra_space + verb_year, 0, 1)
    graph_dmy = verb_day + delete_extra_space + verb_month + verb_year_optional
    verbalizer_graph = (graph_dmy | verb_year) + delete_space

    self.final_graph = pynini.compose(tagger_graph, verbalizer_graph).optimize()
    # The verbalized date is emitted inside a single `day` field.
    self.fst = pynutil.insert("day: \"") + self.final_graph + pynutil.insert("\"")
    self.fst = self.add_tokens(self.fst).optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the electronic (e-mail address) classifier: a tagger composed with a verbalizer.

    Args:
        deterministic: forwarded unchanged to the parent constructor

    Sets `final_graph` (tagger composed with verbalizer) and `fst`.
    """
    super().__init__(name="electronic", kind="classify", deterministic=deterministic)

    # tagger
    # Only the first column of symbols.tsv is needed here; every row must have
    # exactly two tab-separated columns. The file is read as UTF-8 explicitly so
    # the result does not depend on the platform's default encoding.
    accepted_symbols = []
    with open(get_abs_path("data/electronic/symbols.tsv"), 'r', encoding="utf-8") as f:
        for line in f:
            symbol, _ = line.split('\t')
            accepted_symbols.append(pynini.accep(symbol))

    # Username: starts with a letter, then letters, digits or accepted symbols; "@" becomes a space.
    username = (pynutil.insert("username: \"") + NEMO_ALPHA +
                pynini.closure(NEMO_ALPHA | NEMO_DIGIT
                               | pynini.union(*accepted_symbols)) +
                pynutil.insert("\"") + pynini.cross('@', ' '))
    # Domain: starts with a letter and ends with a letter or digit; "-" and "." are allowed in between.
    domain_graph = (
        NEMO_ALPHA +
        (pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep('-')
                        | pynini.accep('.'))) + (NEMO_ALPHA | NEMO_DIGIT))
    domain_graph = pynutil.insert(
        "domain: \"") + domain_graph + pynutil.insert("\"")
    tagger_graph = (username + domain_graph).optimize()

    # verbalizer
    graph_digit = pynini.string_file(
        get_abs_path(
            "data/numbers/digits_nominative_case.tsv")).optimize()
    graph_symbols = pynini.string_file(
        get_abs_path("data/electronic/symbols.tsv")).optimize()

    # Strip the `username: "` wrapper (same shape as the `domain` parser below) and
    # spell the content out, separating the pieces with spaces.
    user_name = (
        pynutil.delete("username:") + delete_space + pynutil.delete("\"") +
        (pynini.closure(
            pynutil.add_weight(graph_digit + insert_space, 1.09)
            | pynutil.add_weight(
                pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
            | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
        pynutil.delete("\""))

    # Fallback domain reading: character by character, with "." read as "точка ".
    domain_default = (pynini.closure(NEMO_NOT_QUOTE + insert_space) +
                      pynini.cross(".", "точка ") + NEMO_NOT_QUOTE +
                      pynini.closure(insert_space + NEMO_NOT_QUOTE))

    server_default = (pynini.closure(
        (graph_digit | NEMO_ALPHA) + insert_space, 1) +
                      pynini.closure(graph_symbols + insert_space) +
                      pynini.closure(
                          (graph_digit | NEMO_ALPHA) + insert_space, 1))
    server_common = pynini.string_file(
        get_abs_path("data/electronic/server_name.tsv")) + insert_space
    domain_common = pynini.cross(".", "точка ") + pynini.string_file(
        get_abs_path("data/electronic/domain.tsv"))

    # Listed server and domain names carry a slightly lower weight than the spelled-out fallbacks.
    domain = (pynutil.delete("domain:") + delete_space +
              pynutil.delete("\"") +
              (pynutil.add_weight(server_common, 1.09)
               | pynutil.add_weight(server_default, 1.1)) +
              (pynutil.add_weight(domain_common, 1.09)
               | pynutil.add_weight(domain_default, 1.1)) + delete_space +
              pynutil.delete("\""))

    # "собака " is inserted between the username and the domain.
    graph = user_name + delete_space + pynutil.insert(
        "собака ") + delete_space + domain + delete_space

    # replace all latin letters with their Ru verbalization
    verbalizer_graph = (graph.optimize() @ (pynini.closure(
        TO_CYRILLIC | RU_ALPHA | pynini.accep(" ")))).optimize()
    verbalizer_graph = verbalizer_graph.optimize()

    self.final_graph = (tagger_graph @ verbalizer_graph).optimize()
    # The verbalized address is emitted inside a single `username` field.
    self.fst = self.add_tokens(
        pynutil.insert("username: \"") + self.final_graph +
        pynutil.insert("\"")).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """
    Build the money classifier: a tagger composed with a verbalizer.

    Args:
        cardinal: cardinal grammar; its `cardinal_numbers_default` graph is used
        decimal: decimal grammar; its `final_graph` is used
        deterministic: forwarded unchanged to the parent constructor

    Sets `final_graph` (tagger composed with verbalizer) and `fst`.
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)

    cardinal_graph = cardinal.cardinal_numbers_default
    decimal_graph = decimal.final_graph

    unit_singular = pynini.string_file(get_abs_path("data/currency/currency_singular.tsv"))
    unit_plural = pynini.string_file(get_abs_path("data/currency/currency_plural.tsv"))

    # adding weight to make sure the space is preserved for ITN
    optional_delimiter = pynini.closure(pynutil.add_weight(pynini.cross(NEMO_SPACE, ""), -100), 0, 1)

    currency_singular_field = (
        optional_delimiter + pynutil.insert(" currency: \"") + unit_singular + pynutil.insert("\""))
    currency_plural_field = (
        optional_delimiter + pynutil.insert(" currency: \"") + unit_plural + pynutil.insert("\""))

    # Exactly "1" takes the singular currency form; every other amount takes the plural one.
    one = pynini.compose(pynini.accep("1"), cardinal_graph).optimize()
    singular_graph = pynutil.insert("integer_part: \"") + one + pynutil.insert("\"") + currency_singular_field

    tag_decimal = decimal_graph + currency_plural_field

    tag_integer = (
        pynutil.insert("integer_part: \"")
        + ((NEMO_SIGMA - "1") @ cardinal_graph)
        + pynutil.insert("\"")
        + (currency_plural_field)
    )
    tag_integer |= singular_graph

    tagger_graph = (tag_integer.optimize() | tag_decimal.optimize()).optimize()

    # verbalizer
    quoted_value = pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
    integer_part = pynutil.delete("integer_part: ") + quoted_value

    unit = (
        pynutil.delete("currency: ")
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    # The space in front of the currency field is kept in the output.
    unit = pynini.accep(NEMO_SPACE) + unit

    verbalizer_graph_cardinal = (integer_part + unit).optimize()

    fractional_part = pynutil.delete("fractional_part: ") + quoted_value
    optional_quantity = pynini.closure(
        pynini.accep(NEMO_SPACE) + pynutil.delete("quantity: ") + quoted_value, 0, 1)

    # Decimal amounts arrive wrapped in `decimal { ... }`; the wrapper is removed.
    verbalizer_graph_decimal = (
        pynutil.delete('decimal { ')
        + integer_part
        + pynini.accep(" ")
        + fractional_part
        + optional_quantity
        + pynutil.delete(" }")
        + unit
    )

    verbalizer_graph = (verbalizer_graph_cardinal | verbalizer_graph_decimal).optimize()

    self.final_graph = (tagger_graph @ verbalizer_graph).optimize()
    # The verbalized amount is emitted inside a single `integer_part` field.
    self.fst = self.add_tokens(
        pynutil.insert("integer_part: \"") + self.final_graph + pynutil.insert("\"")).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """
    Build the measure classifier: a tagger composed with a verbalizer.

    Args:
        cardinal: cardinal grammar; its `cardinal_numbers_default`,
            `cardinal_numbers_nominative` and `optional_graph_negative` graphs are used
        decimal: decimal grammar; its `final_graph` is used
        deterministic: forwarded unchanged to the parent constructor

    Sets `tagger_graph_default`, `verbalizer_graph` and `fst`.
    """
    super().__init__(name="measure", kind="classify", deterministic=deterministic)

    # adding weight to make sure the space is preserved for ITN
    # (removes at most one regular or non-breaking space)
    drop_one_space = pynini.closure(
        pynutil.add_weight(pynutil.delete(pynini.union(NEMO_SPACE, NEMO_NON_BREAKING_SPACE)), -1), 0, 1)

    cardinal_graph = cardinal.cardinal_numbers_default
    cardinal_graph_nominative = cardinal.cardinal_numbers_nominative
    graph_unit = pynini.string_file(get_abs_path("data/measurements.tsv"))
    optional_graph_negative = cardinal.optional_graph_negative

    # Inside a unit phrase a non-breaking space is weighted below a regular one.
    space_for_units = (
        pynutil.add_weight(pynutil.insert(NEMO_NON_BREAKING_SPACE), -0.1)
        | pynutil.add_weight(pynutil.insert(NEMO_SPACE), 0.1)
    ).optimize()

    # "/" between units is read as "в" or "за".
    slash_unit = (pynini.cross("/", "в") | pynini.cross("/", "за")) + space_for_units + graph_unit
    unit_slash_unit = pynutil.add_weight(graph_unit + space_for_units + slash_unit, -0.1)

    default_units = pynutil.insert("units: \"") + (graph_unit | unit_slash_unit) + pynutil.insert("\"")
    slash_units = pynutil.insert("units: \"") + slash_unit + pynutil.insert("\"")

    subgraph_decimal = decimal.final_graph + ((drop_one_space + default_units) | slash_units)

    cardinal_space = (
        pynutil.insert("cardinal { ")
        + optional_graph_negative
        + pynutil.insert("integer: \"")
        + cardinal_graph
        + (
            (drop_one_space + pynutil.insert("\"") + pynutil.insert(" } ") + default_units)
            | (pynutil.insert("\"") + pynutil.insert(" } ") + slash_units)
        )
    )

    # Number, optional "-", then a run of Russian letters used as the unit.
    cardinal_optional_dash_alpha = (
        pynutil.insert("cardinal { integer: \"")
        + cardinal_graph
        + pynini.closure(pynini.cross('-', ''), 0, 1)
        + pynutil.insert("\" } units: \"")
        + pynini.closure(RU_ALPHA, 1)
        + pynutil.insert("\"")
    )

    # Russian letters first, optional "-", then a nominative number; the order is preserved.
    alpha_optional_dash_cardinal = (
        pynutil.insert("units: \"")
        + pynini.closure(RU_ALPHA, 1)
        + pynini.closure(pynini.cross('-', ''), 0, 1)
        + pynutil.insert("\"")
        + pynutil.insert(" cardinal { integer: \"")
        + cardinal_graph_nominative
        + pynutil.insert("\" } preserve_order: true")
    )

    # The decimal variants require the "-" between the number and the letters.
    decimal_dash_alpha = (
        decimal.final_graph
        + pynini.cross('-', '')
        + pynutil.insert(" units: \"")
        + pynini.closure(RU_ALPHA, 1)
        + pynutil.insert("\"")
    )

    alpha_dash_decimal = (
        pynutil.insert("units: \"")
        + pynini.closure(RU_ALPHA, 1)
        + pynini.cross('-', '')
        + pynutil.insert("\" ")
        + decimal.final_graph
        + pynutil.insert(" preserve_order: true")
    )

    self.tagger_graph_default = (subgraph_decimal | cardinal_space).optimize()

    tagger_graph = (
        self.tagger_graph_default
        | cardinal_optional_dash_alpha
        | alpha_optional_dash_cardinal
        | decimal_dash_alpha
        | alpha_dash_decimal
    ).optimize()

    # verbalizer
    unit = (
        pynutil.delete("units: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + drop_one_space)

    optional_sign = pynini.closure(pynini.cross("negative: \"true\" ", "минус "), 0, 1)
    quoted_value = pynutil.delete(" \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
    integer_part = pynutil.delete("integer_part:") + quoted_value
    fractional_part = pynutil.delete("fractional_part:") + quoted_value
    optional_quantity_part = pynini.closure(
        pynini.accep(" ")
        + pynutil.delete("quantity: \"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\""),
        0,
        1,
    )

    decimal_fields = optional_sign + integer_part + pynini.accep(" ") + fractional_part + optional_quantity_part
    graph_decimal = (
        pynutil.delete("decimal {") + drop_one_space + decimal_fields + drop_one_space + pynutil.delete("}"))

    graph_cardinal = (
        pynutil.delete("cardinal {")
        + drop_one_space
        + optional_sign
        + pynutil.delete("integer: \"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
        + drop_one_space
        + pynutil.delete("}")
    )

    # Number first, then unit.
    verbalizer_graph = (graph_cardinal | graph_decimal) + drop_one_space + insert_space + unit

    # SH adds "preserve_order: true" by default
    preserve_order = (
        pynutil.delete("preserve_order:") + drop_one_space + pynutil.delete("true") + drop_one_space)
    # Unit first, then number, with the optional preserve_order marker removed.
    verbalizer_graph |= (
        unit
        + insert_space
        + (graph_cardinal | graph_decimal)
        + drop_one_space
        + pynini.closure(preserve_order, 0, 1)
    )
    self.verbalizer_graph = verbalizer_graph.optimize()

    final_graph = (tagger_graph @ verbalizer_graph).optimize()
    # The verbalized measure is emitted inside a `cardinal { integer: "..." }` wrapper.
    self.fst = self.add_tokens(
        pynutil.insert("cardinal { integer: \"") + final_graph + pynutil.insert("\" }")).optimize()
("Ы́", "Ы'"), ("Э́", "Э'"), ("Ю́", "Ю'"), ("Я́", "Я'"), ("а́", "а'"), ("е́", "е'"), ("ё́", "е'"), ("и́", "и'"), ("о́", "о'"), ("у́", "у'"), ("ы́", "ы'"), ("э́", "э'"), ("ю́", "ю'"), ("я́", "я'"), ("ё", "е"), ("Ё", "Е"), ] REWRITE_STRESSED = pynini.closure( pynini.string_map(RU_STRESSED_MAP).optimize() | RU_ALPHA).optimize() TO_CYRILLIC = pynini.string_file( get_abs_path("data/latin_to_cyrillic.tsv")).optimize() TO_LATIN = pynini.invert(TO_CYRILLIC).optimize() RU_ALPHA_OR_SPACE = pynini.union(RU_ALPHA, NEMO_SPACE, NEMO_NON_BREAKING_SPACE).optimize() except (ModuleNotFoundError, ImportError): # Create placeholders RU_ALPHA = None LO_LATIN = None
def __init__(self, number_names: dict, deterministic: bool = True):
    """
    Build the time classifier for "HH:MM" input.

    Three readings are produced:
        17:15 -> "семнадцать часов и пятнадцать минут" (order preserved)
        17:15 -> "пятнадцать минут шестого" (minutes + next hour, ordinal)
        17:45 -> "без пятнадцати минут шесть" (minutes to the next hour)

    Args:
        number_names: dict providing the 'cardinal_names_nominative' graph
        deterministic: forwarded unchanged to the parent constructor

    Sets `minutes`, `graph_preserve_order`, `increment_hour_ordinal`, `mins_to_h`,
    `increment_hour_cardinal`, `final_graph` and `fst`.
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    increment_hour_ordinal = pynini.string_file(
        get_abs_path("data/time/increment_hour_ordinal.tsv"))
    increment_hour_cardinal = pynini.string_file(
        get_abs_path("data/time/increment_hour_cardinal.tsv"))
    convert_hour = pynini.string_file(
        get_abs_path("data/time/time_convert.tsv"))

    # NOTE(review): the caller must supply the 'cardinal_names_nominative' key - confirm
    # against the producer of `number_names`.
    # At most one leading zero is dropped before the number is verbalized.
    number = pynini.closure(pynini.cross("0", ""), 0, 1) + number_names['cardinal_names_nominative']

    hour_options = pynini.project(increment_hour_ordinal, "input")
    hour_options = hour_options | pynini.project(convert_hour, "output")

    # The hour word depends on the number: " час" for 01/21, " часа" for 02-04/22/23,
    # " часов" for every other accepted hour.
    hour_exeption_ends_with_one = pynini.union(*["01", "21"])
    hour_exeption_ends_rest = pynini.union(*["02", "03", "04", "22", "23"])
    hour_other = (pynini.difference(
        hour_options,
        pynini.union(hour_exeption_ends_with_one,
                     hour_exeption_ends_rest))).optimize()

    hour = hour_exeption_ends_with_one @ number + pynutil.insert(" час")
    hour |= hour_exeption_ends_rest @ number + pynutil.insert(" часа")
    hour |= hour_other @ number + pynutil.insert(" часов")

    optional_and = pynini.closure(pynutil.insert("и "), 0, 1)
    digits = pynini.union(*[str(x) for x in range(10)])
    mins_start = pynini.union(*"012345")
    mins_options = mins_start + digits

    # Minutes ending in 1 take " минута", except 11, which (like 12-14 below) takes
    # the " минут" form; excluding "11" here lets it fall through to `mins_other`.
    mins_exception_ends_with_one = pynini.difference(
        mins_start + pynini.accep("1"), pynini.accep("11"))
    mins_exception_ends_rest = pynini.difference(
        mins_start + pynini.union(*"234"),
        pynini.union(*["12", "13", "14"]))
    mins_other = pynini.difference(
        mins_options,
        pynini.union(mins_exception_ends_with_one,
                     mins_exception_ends_rest))

    minutes = mins_exception_ends_with_one @ number + pynutil.insert(
        " минута")
    minutes |= mins_exception_ends_rest @ number + pynutil.insert(
        " минуты")
    minutes |= mins_other @ number + pynutil.insert(" минут")
    self.minutes = minutes.optimize()

    # 17:15 -> "семнадцать часов и пятнадцать минут"
    hm = (pynutil.insert("hours: \"") + hour.optimize() +
          pynutil.insert("\"") +
          (pynini.cross(":", " ") + pynutil.insert("minutes: \"") +
           optional_and + minutes.optimize()) + pynutil.insert("\"") +
          pynutil.insert(" preserve_order: true"))
    # Full hours: the ":00" part is removed and no minutes field is produced.
    h = pynutil.insert("hours: \"") + hour + pynutil.insert(
        "\"") + pynutil.delete(":00")
    self.graph_preserve_order = (hm | h).optimize()

    # 17:15 -> "пятнадцать минут шестого"
    # Requires permutations for the correct verbalization
    self.increment_hour_ordinal = pynini.compose(
        hour_options, increment_hour_ordinal).optimize()
    m_next_h = (pynutil.insert("hours: \"") + self.increment_hour_ordinal +
                pynutil.insert("\"") + pynini.cross(":", " ") +
                pynutil.insert("minutes: \"") + minutes +
                pynutil.insert("\""))

    # 17:45 -> "без пятнадцати минут шесть"
    # Requires permutations for the correct verbalization
    self.mins_to_h = pynini.string_file(
        get_abs_path("data/time/minutes_to_hour.tsv")).optimize()
    self.increment_hour_cardinal = pynini.compose(
        hour_options, increment_hour_cardinal).optimize()
    m_to_h = (pynutil.insert("hours: \"") + self.increment_hour_cardinal +
              pynutil.insert("\"") + pynini.cross(":", " ") +
              pynutil.insert("minutes: \"без ") + self.mins_to_h +
              pynutil.insert("\""))

    self.final_graph = m_next_h | self.graph_preserve_order | m_to_h
    self.fst = self.add_tokens(self.final_graph)
    self.fst = self.fst.optimize()