def get_cardinal_numbers(self, number_names: dict, alternative_formats: dict, mode: str = "all"):
    """Returns cardinal numbers names graph.

    Args:
        number_names: number_names for cardinal and ordinal numbers
        alternative_formats: alternative number formats
        mode: "all" - to return graph that includes all Ru cases,
            "nominative" to return only the nominative form
    """
    name_key_by_mode = {
        "all": "cardinal_number_names",
        "nominative": "cardinal_names_nominative",
    }
    if mode not in name_key_by_mode:
        raise ValueError(f'{mode} is not supported.')
    cardinal_names = number_names[name_key_by_mode[mode]]

    one_thousand_alternative = alternative_formats['one_thousand_alternative']
    separators = alternative_formats['separators']

    # Accept either the default spelling or the "one thousand" alternative,
    # then apply the separator handling on the input side.
    with_alternative = pynini.compose(cardinal_names, one_thousand_alternative)
    return pynini.compose(separators, cardinal_names | with_alternative)
def rewrite_lattice(
        string: pynini.FstLike,
        rule: pynini.Fst,
        token_type: Optional[pynini.TokenType] = None) -> pynini.Fst:
    """Constructs a weighted lattice of output strings.

    Constructs a weighted, epsilon-free lattice of output strings given an
    input FST (or string) and a rule FST.

    Args:
      string: Input string or FST.
      rule: Input rule WFST.
      token_type: Optional input token type, or symbol table.

    Returns:
      An epsilon-free WFSA.

    Raises:
      Error: Composition failure.
    """
    import contextlib

    # TODO(kbg): Consider adding support for PDT and MPDT composition.
    # A no-op context stands in when no token type is given, so the
    # composition itself is written exactly once (resolves the old TODO
    # suggesting `contextlib.nullcontext`).
    manager = (contextlib.nullcontext() if token_type is None
               else pynini.default_token_type(token_type))
    with manager:
        lattice = pynini.compose(string, rule, compose_filter="alt_sequence")
    if lattice.start() == pynini.NO_STATE_ID:
        raise Error("Composition failure")
    return lattice.project("output").rmepsilon()
def get_token_sem_graph(classify_and_verbalize):
    """Builds the full utterance graph: tokens with optional surrounding
    punctuation, normalized inter-token spacing, punctuation-only inputs,
    and a final pass that squeezes any remaining repeated spaces."""
    leading_punct = pynini.closure(punct + pynutil.insert(" "))
    trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
    token = leading_punct + classify_and_verbalize + trailing_punct

    # Between two tokens: collapse whitespace runs, or bridge with punctuation.
    between_tokens = (
        pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
        | (pynutil.insert(" ") + punct + pynutil.insert(" "))
    )
    graph = token + pynini.closure(between_tokens + token)
    graph |= punct_only + pynini.closure(punct)
    graph = delete_space + graph + delete_space

    # Remove any duplicated spaces left over, with or without a leading space.
    word = pynini.closure(NEMO_NOT_SPACE, 1)
    squeeze_spaces = word + pynini.closure(delete_extra_space + word)
    squeeze_spaces |= (
        pynini.closure(pynutil.delete(" "), 1)
        + word
        + pynini.closure(delete_extra_space + word)
    )

    return pynini.compose(graph.optimize(), squeeze_spaces).optimize()
def add_optional_and(self, graph):
    """Adds weighted alternatives that insert "and" into verbalized numbers
    ("one hundred twenty" -> "one hundred and twenty").

    NOTE(review): the original indentation was lost; this layout assumes only
    the first composition is guarded by `self.deterministic` — confirm
    against upstream history.
    """
    if not self.deterministic:
        # Extra alternative: allow "hundred " to be dropped entirely.
        graph = pynini.compose(
            graph,
            NEMO_SIGMA + pynini.closure(pynini.cross("hundred ", " "), 0, 1) + NEMO_SIGMA)

    not_quote = pynini.closure(NEMO_NOT_QUOTE)
    # Remainders that contain no "thousand"/"million" (so "and" is only
    # inserted before the final group).
    no_thousand_million = pynini.difference(
        not_quote,
        not_quote + pynini.union("thousand", "million") + not_quote).optimize()
    # Slightly negative weight prefers the "hundred and ..." variant.
    integer = (not_quote + pynutil.add_weight(
        pynini.cross("hundred ", "hundred and ") + no_thousand_million, -0.0001)).optimize()

    no_hundred = pynini.difference(
        NEMO_SIGMA,
        not_quote + pynini.accep("hundred") + not_quote).optimize()
    # Same treatment after "thousand" when no "hundred" follows.
    integer |= (not_quote + pynutil.add_weight(
        pynini.cross("thousand ", "thousand and ") + no_hundred, -0.0001)).optimize()

    # Keep the original verbalization available with a tiny penalty.
    graph_with_and = pynini.compose(
        graph, integer).optimize() | pynutil.add_weight(graph, 0.00001)
    return graph_with_and
def __construct_r1(self):
    '''
    Umlaut rule: Apfel<UL>$ ==> Äpfel

    Built from three ordered rewrites composed together:
      r1a: umlaut the stem vowel when an <UL> marker follows,
      r1c: delete the "a" of "aa"/"au"-style stems after the umlauted vowel,
      r1d: rewrite the consumed <UL> marker into <FB>.

    NOTE(review): uses the legacy pynini API (acceptor/transducer/push with
    explicit token types) — presumably pynini < 2.1; verify before upgrading.
    '''
    # Working alphabet: plain characters plus the morphology marker symbols.
    # (The marker list repeats "<UL>"/"<FB>"; harmless for string_map but
    # presumably unintentional — confirm before cleaning up.)
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<CB>", "<FB>", "<UL>", "<DEL-S>", "<SS>", "<WB>", "<^UC>", "<^Ax>",
             "<e>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>", "<UL>", "<FB>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    # r1a: vowel -> umlauted vowel ...
    tau = pynini.push(
        pynini.string_map(
            [("a", "ä"), ("o", "ö"), ("u", "ü"), ("A", "Ä"), ("O", "Ö"), ("U", "Ü")],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet),
        push_labels=True)
    # ... when preceded by a consonant or a boundary marker ...
    lc = pynini.union(
        self.__syms.consonants,
        pynini.string_map(
            ["<CB>", "<WB>", "<NoHy>", "<NoDef>", "<^UC>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).optimize()
    # ... and followed (eventually) by the <UL> trigger.
    r1a = pynini.cdrewrite(
        tau,
        lc,
        pynini.concat(
            alphabet.closure(),
            pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
        alphabet.closure())

    # r1c: delete a second "a" directly after an umlauted "ä"/"Ä"
    # (e.g. "Saal" -> "Säle"), still conditioned on the <UL> trigger.
    tau = pynini.transducer("a", "", input_token_type=self.__syms.alphabet)
    r1c = pynini.cdrewrite(
        tau,
        pynini.string_map(
            ["ä", "Ä"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        pynini.concat(
            self.__syms.consonants_lower,
            alphabet.closure(),
            pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
        alphabet.closure()).optimize()

    # r1d: the trigger has done its job — turn <UL> into <FB>.
    r1d = pynini.cdrewrite(
        pynini.transducer(
            "<UL>", "<FB>",
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet),
        "", "", alphabet.closure())

    return pynini.compose(r1a, pynini.compose(r1c, r1d)).optimize()
def __init__(
    self,
    input_case: str,
    cache_dir: str = None,
    overwrite_cache: bool = False,
    deterministic: bool = True,
    whitelist: str = None,
):
    """Builds (or restores from a FAR cache) the tokenize-and-classify graph.

    Args:
        input_case: accepted input case identifier (used in the cache key).
        cache_dir: directory for the compiled .far cache; None/"None" disables caching.
        overwrite_cache: if True, rebuild even when a cached FAR exists.
        deterministic: whether to build the single-output grammar.
        whitelist: optional path to a whitelist file (only its basename goes into the cache key).
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        # Cache key encodes case, determinism and whitelist so different
        # configurations never collide.
        far_file = os.path.join(
            cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(f"Creating ClassifyFst grammars.")
        word_graph = WordFst(deterministic=deterministic).fst
        whitelist_graph = WhiteListFst(input_case=input_case, deterministic=deterministic).fst
        punct_graph = PunctuationFst(deterministic=deterministic).fst

        # Lower weight wins: whitelist entries (1) are preferred over plain words (100).
        classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(word_graph, 100)

        punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
        # One or more punctuation tokens, absorbing any whitespace runs between them.
        punct = pynini.closure(
            pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
            | (pynutil.insert(" ") + punct),
            1,
        )
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
        # A token may carry punctuation on either side.
        token_plus_punct = (
            pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
        )
        # Sequence of tokens separated by collapsed whitespace or punctuation.
        graph = (
            token_plus_punct
            + pynini.closure(
                (
                    pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                    | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                )
                + token_plus_punct
            ).optimize()
        )
        graph = delete_space + graph + delete_space
        # Punctuation-only input is also valid.
        graph |= punct

        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """Decimal classifier: "-3.5" -> negative/integer_part/fractional_part fields.

    Args:
        cardinal: cardinal GraphFst providing the digit/number sub-graphs.
        deterministic: if False, extra verbalization variants are allowed
            ("oh"/"zero" alternatives), then pruned so the two never mix.
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)

    cardinal_graph = cardinal.graph
    cardinal_graph_hundred_component_at_least_one_none_zero_digit = (
        cardinal.graph_hundred_component_at_least_one_none_zero_digit)
    # Fractional part is read digit-by-digit.
    self.graph = cardinal.single_digits_graph.optimize()
    if not deterministic:
        self.graph = self.graph | cardinal_graph

    point = pynutil.delete(".")
    optional_graph_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
    self.graph_fractional = pynutil.insert(
        "fractional_part: \"") + self.graph + pynutil.insert("\"")
    self.graph_integer = pynutil.insert(
        "integer_part: \"") + cardinal_graph + pynutil.insert("\"")
    # Integer part is optional: ".5" is a valid decimal.
    final_graph_wo_sign = (
        pynini.closure(self.graph_integer + pynutil.insert(" "), 0, 1)
        + point + pynutil.insert(" ") + self.graph_fractional)
    self.final_graph_wo_negative = final_graph_wo_sign | get_quantity(
        final_graph_wo_sign,
        cardinal_graph_hundred_component_at_least_one_none_zero_digit)

    # reduce options for non_deterministic and allow either "oh" or "zero", but not combination
    if not deterministic:
        no_oh_zero = pynini.difference(
            NEMO_SIGMA,
            (NEMO_SIGMA + "oh" + NEMO_SIGMA + "zero" + NEMO_SIGMA)
            | (NEMO_SIGMA + "zero" + NEMO_SIGMA + "oh" + NEMO_SIGMA),
        ).optimize()
        no_zero_oh = pynini.difference(
            NEMO_SIGMA,
            NEMO_SIGMA + pynini.accep("zero") + NEMO_SIGMA + pynini.accep("oh") + NEMO_SIGMA).optimize()

        # Add the "oh" variant of a zero integer part ...
        self.final_graph_wo_negative |= pynini.compose(
            self.final_graph_wo_negative,
            pynini.cdrewrite(
                pynini.cross("integer_part: \"zero\"", "integer_part: \"oh\""),
                NEMO_SIGMA, NEMO_SIGMA, NEMO_SIGMA),
        )
        # ... then forbid any output mixing "oh" and "zero" in either order.
        self.final_graph_wo_negative = pynini.compose(
            self.final_graph_wo_negative, no_oh_zero).optimize()
        self.final_graph_wo_negative = pynini.compose(
            self.final_graph_wo_negative, no_zero_oh).optimize()

    final_graph = optional_graph_negative + self.final_graph_wo_negative
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def get_serial_graph(self):
    """
    Finite state transducer for classifying serial (handles only cases
    without delimiters, values with delimiters are handled by default).
    The serial is a combination of digits, letters and dashes, e.g.:
    c325b -> tokens { cardinal { integer: "c three two five b" } }
    """
    num_graph = self.single_digits_graph
    if not self.deterministic:
        num_graph |= self.graph

    # add space between letter and digit (both directions)
    graph_with_space = pynini.compose(
        pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA, NEMO_DIGIT, NEMO_SIGMA),
        pynini.cdrewrite(pynutil.insert(" "), NEMO_DIGIT, NEMO_ALPHA, NEMO_SIGMA),
    )
    # make sure at least one digit and letter is present
    not_space = pynini.closure(NEMO_NOT_SPACE)
    graph_with_space = pynini.compose(
        (not_space + NEMO_ALPHA + not_space + NEMO_DIGIT + not_space)
        | (not_space + NEMO_DIGIT + not_space + NEMO_ALPHA + not_space),
        graph_with_space,
    )
    keep_space = pynini.accep(" ")

    # Letters first, then numbers (with optional trailing letter/number groups).
    serial_graph = pynini.compose(
        graph_with_space,
        pynini.closure(pynini.closure(NEMO_ALPHA, 1) + keep_space, 1)
        + num_graph
        + pynini.closure(
            keep_space + pynini.closure(NEMO_ALPHA) + pynini.closure(keep_space + num_graph, 0, 1)),
    )
    # Numbers first, then letters.
    serial_graph |= pynini.compose(
        graph_with_space,
        num_graph + keep_space + pynini.closure(NEMO_ALPHA, 1)
        + pynini.closure(
            keep_space + num_graph + pynini.closure(keep_space + pynini.closure(NEMO_ALPHA), 0, 1)),
    )

    # serial graph with delimiter
    delimiter = pynini.accep("-") | pynini.accep("/")
    alphas = pynini.closure(NEMO_ALPHA, 1)
    letter_num = alphas + delimiter + num_graph
    num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
    next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
    next_alpha_or_num |= pynini.closure(
        delimiter + num_graph + pynutil.insert(" ") + alphas)
    serial_graph |= letter_num + next_alpha_or_num
    serial_graph |= num_letter + next_alpha_or_num
    # numbers only with 2+ delimiters
    serial_graph |= (num_graph + delimiter + num_graph + delimiter + num_graph
                     + pynini.closure(delimiter + num_graph))
    # Penalize serial readings relative to regular number readings.
    return pynutil.add_weight(serial_graph, 2)
def __init__(self, number_names: dict, alternative_formats: dict, deterministic: bool = False):
    """Russian cardinal classifier.

    Args:
        number_names: number names graphs for cardinal/ordinal numbers.
        alternative_formats: alternative number formats (separators, "one thousand" form).
        deterministic: whether to build the single-output grammar.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    self.cardinal_numbers_default = self.get_cardinal_numbers(number_names, alternative_formats, mode="all")
    self.cardinal_numbers_nominative = self.get_cardinal_numbers(
        number_names, alternative_formats, mode="nominative"
    )
    self.optional_graph_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1
    )
    self.cardinal_numbers_with_optional_negative = (
        self.optional_graph_negative
        + pynutil.insert("integer: \"")
        + self.cardinal_numbers_default
        + pynutil.insert("\"")
    )

    # "03" -> remove leading zeros and verbalize
    leading_zeros = pynini.closure(pynini.cross("0", ""))
    self.cardinal_numbers_with_leading_zeros = (leading_zeros + self.cardinal_numbers_default).optimize()

    # "123" -> "один два три"
    single_digits_graph = pynini.compose(NEMO_DIGIT, self.cardinal_numbers_nominative)
    self.single_digits_graph = single_digits_graph + pynini.closure(insert_space + single_digits_graph)

    # Optional quantity word after the number (from quantity.tsv).
    optional_quantity = pynini.string_file(get_abs_path("data/numbers/quantity.tsv")).optimize()
    optional_quantity = pynutil.insert("quantity: \"") + optional_quantity + pynutil.insert("\"")
    optional_quantity = pynini.closure(
        (pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space) + optional_quantity, 0, 1
    )
    serial_graph = self.get_serial_graph()

    # Preferred path (weight -0.1): full cardinal with optional sign/quantity.
    final_graph = (
        self.optional_graph_negative
        + pynutil.insert("integer: \"")
        + self.cardinal_numbers_with_leading_zeros
        + pynutil.insert("\"")
        + optional_quantity
    ).optimize()
    final_graph = pynutil.add_weight(final_graph, -0.1)
    # Fallback (weight 10): digit-by-digit or serial reading.
    final_graph |= (
        pynutil.insert("integer: \"")
        + pynutil.add_weight(self.single_digits_graph | serial_graph, 10)
        + pynutil.insert("\"")
    )
    self.final_graph = final_graph

    # to cover cases "2-х" -> "двух" (this is not covered by ordinal endings)
    final_graph |= pynini.compose(
        pynini.compose(NEMO_DIGIT ** (1, ...) + pynini.cross('-х', ''), final_graph),
        NEMO_SIGMA + pynini.accep("х\"") + NEMO_SIGMA,
    )
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True):
    """Telephone-number classifier: country code, area part, number, extension,
    plus a branch for IPv4-style dotted numbers ("1.2.3.4" -> "... dot ...").
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    add_separator = pynutil.insert(", ")  # between components
    # Single spoken digit; "0" may also be read as "o".
    digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize() | pynini.cross("0", "o")

    # Up to three digits, optional leading "+" removed.
    country_code = (pynutil.insert("country_code: \"")
                    + pynini.closure(pynutil.delete("+"), 0, 1)
                    + pynini.closure(digit + insert_space, 0, 2) + digit
                    + pynutil.insert("\""))
    optional_country_code = pynini.closure(
        country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space,
        0, 1)

    # "800" is preferred (negative weight) as "eight hundred".
    area_part_common = pynutil.add_weight(
        pynini.cross("800", "eight hundred"), -1.1)
    area_part_default = pynini.closure(digit + insert_space, 2, 2) + digit
    area_part = area_part_default | area_part_common
    # Accept "123-" or "(123) " / "(123)-" formats.
    area_part = (
        (area_part + pynutil.delete("-"))
        | (pynutil.delete("(") + area_part
           + (pynutil.delete(") ") | pynutil.delete(")-")))) + add_separator

    # Remaining 7 characters (digits or letters) with optional separators.
    del_separator = pynini.closure(pynini.union("-", " "), 0, 1)
    number_length = ((NEMO_DIGIT + del_separator) | (NEMO_ALPHA + del_separator))**7
    number_words = pynini.closure(
        (NEMO_DIGIT @ digit) + (insert_space | pynini.cross("-", ', '))
        | NEMO_ALPHA
        | (NEMO_ALPHA + pynini.cross("-", ' ')))
    number_words = pynini.compose(number_length, number_words)
    number_part = area_part + number_words
    number_part = pynutil.insert(
        "number_part: \"") + number_part + pynutil.insert("\"")

    # Optional extension of up to four digits.
    extension = (pynutil.insert("extension: \"")
                 + pynini.closure(digit + insert_space, 0, 3) + digit
                 + pynutil.insert("\""))
    optional_extension = pynini.closure(insert_space + extension, 0, 1)

    graph = optional_country_code + number_part + optional_extension

    # ip
    digit_to_str_graph = pynini.compose(
        NEMO_DIGIT**(1, 3),
        digit + pynini.closure(pynutil.insert(" ") + digit)).optimize()
    ip_graph = digit_to_str_graph + (pynini.cross(".", " dot ") + digit_to_str_graph)**3
    graph |= pynutil.insert(
        "number_part: \"") + ip_graph.optimize() + pynutil.insert("\"")

    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
def combine_formulas(formulas: list):
    """Composes a list of formula FSTs left-to-right, then deletes the
    auxiliary symbols "#" and "0" from the result.

    NOTE(review): `self` is referenced but is not a parameter — this function
    presumably is nested inside a method and closes over `self`; confirm that
    `self.sigma_star` is in scope at the definition site.
    """
    combined_formula = formulas[0]
    for formula in formulas[1:]:
        combined_formula = pynini.compose(combined_formula, formula)
    # pynini.union(*"#0") == "#" | "0": erase these non-phoneme symbols everywhere.
    remove_non_phonemes = pynini.cdrewrite(
        pynini.cross(pynini.union(*"#0").optimize(), ""), "", "", self.sigma_star)
    combined_formula = pynini.compose(combined_formula, remove_non_phonemes)
    return combined_formula
def get_address_graph(self, cardinal):
    """
    Finite state transducer for classifying serial.
        The serial is a combination of digits, letters and dashes, e.g.:
        2788 San Tomas Expy, Santa Clara, CA 95051 ->
            units: "address" cardinal
            { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
            preserve_order: true
    """
    ordinal_verbalizer = OrdinalVerbalizer().graph
    ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
    # Tag then verbalize ordinals appearing in street names ("5th" -> "fifth").
    ordinal_num = pynini.compose(
        pynutil.insert("integer: \"") + ordinal_tagger + pynutil.insert("\""),
        ordinal_verbalizer)

    # House number is read digit-by-digit.
    address_num = pynini.closure(NEMO_DIGIT, 1) @ cardinal.single_digits_graph
    direction = (pynini.cross("E", "East")
                 | pynini.cross("S", "South")
                 | pynini.cross("W", "West")
                 | pynini.cross("N", "North"))
    # Optional compass direction; negative weight prefers reading it when present.
    direction = pynini.closure(
        pynutil.add_weight(pynini.accep(NEMO_SPACE) + direction, -1), 0, 1)
    address_words = pynini.string_file(
        get_abs_path("data/address/address_words.tsv"))
    address_words = (pynini.accep(NEMO_SPACE)
                     + pynini.closure(ordinal_num, 0, 1)
                     + pynini.closure(NEMO_ALPHA | NEMO_SPACE, 1)
                     + address_words)

    # Optional ", City" part.
    city = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
    city = pynini.closure(
        pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + city, 0, 1)
    # Optional ", ST" abbreviation expanded via states.tsv.
    state = pynini.invert(
        pynini.string_file(get_abs_path("data/address/states.tsv")))
    state = pynini.closure(
        pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + state, 0, 1)

    # Optional 5-digit ZIP, spoken digit-by-digit; strongly preferred
    # (weight -100) when present.
    zip_code = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
    zip_code = pynini.closure(
        pynutil.add_weight(
            pynini.closure(pynini.cross(",", ""), 0, 1)
            + pynini.accep(NEMO_SPACE) + zip_code, -100),
        0, 1,
    )

    address = (address_num + direction + address_words
               + pynini.closure(pynini.cross(".", ""), 0, 1)
               + city + state + zip_code)
    return address
def _create_levenshtein_automaton_lattice(self, query):
    """Constructs a lattice for a query string.

    Args:
      query: input string or acceptor.

    Returns:
      A lattice FST.
    """
    # query -> input-side edits -> output-side expander, in one chain.
    lattice = compose(compose(query, self._e_i), self._l_o)
    EditTransducer.check_wellformed_lattice(lattice)
    return lattice
def _create_lattice(self, iset, oset):
    """Creates edit lattice for a pair of input/output strings or acceptors.

    Args:
      iset: input string or acceptor.
      oset: output string or acceptor.

    Returns:
      A lattice FST.
    """
    input_side = compose(iset, self._e_i)
    output_side = compose(self._e_o, oset)
    lattice = compose(input_side, output_side)
    EditTransducer.check_wellformed_lattice(lattice)
    return lattice
def __construct_compound_stems_nn(self, tmp):
    '''
    Default noun compounding stems.

    Wraps the noun stems of `tmp` in compounding markers: the stem is
    extended (via epsilon insertion) with <+NN> + gender + <Nom> <Sg>/<Pl>
    before composition, then bracketed with <Kompos_Stems> ... <NN>
    <kompos> <nativ>.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Character sequence followed by inserted NN analysis tags
        # (singular or plural nominative), composed with the stems in tmp.
        kompos_stems = pynini.compose(
            pynini.concat(
                self.__syms.characters.closure(1),
                pynini.union(
                    pynini.cross(
                        "",
                        pynini.concat(
                            pynini.accep("<+NN>"),
                            pynini.concat(self.__syms.gender,
                                          pynini.accep("<Nom> <Sg>")))),
                    pynini.cross(
                        "",
                        pynini.concat(
                            pynini.accep("<+NN>"),
                            pynini.concat(self.__syms.gender,
                                          pynini.accep("<Nom> <Pl>")))))),
            tmp)
        # Surround with the compounding bracket symbols.
        return (pynini.cross("", "<Kompos_Stems>")
                + kompos_stems
                + pynini.accep("<NN>")
                + pynini.cross("", "<kompos> <nativ>")).optimize()
def __init__(self, whitelist: 'pynini.FstLike', deterministic: bool = True):
    """Abbreviation classifier.

    Accepts capital-letter abbreviations, with or without dots, and emits
    them as a quoted `value` token; inputs already covered by the whitelist
    are excluded.
    """
    super().__init__(name="abbreviation", kind="classify", deterministic=deterministic)

    dot = pynini.accep(".")
    # A.B.C. -> A. B. C.
    dotted_spaced = NEMO_UPPER + dot + pynini.closure(insert_space + NEMO_UPPER + dot, 1)
    # A.B.C. -> A.B.C.
    dotted = NEMO_UPPER + dot + pynini.closure(NEMO_UPPER + dot, 1)
    # ABC -> ABC
    plain = NEMO_UPPER + pynini.closure(NEMO_UPPER, 1)
    # ABC -> A B C
    spaced = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)
    abbr = dotted_spaced | dotted | plain | spaced

    # Exclude inputs the whitelist already handles.
    unlisted_inputs = pynini.difference(
        pynini.project(abbr, "input"), pynini.project(whitelist.graph, "input"))
    abbr = pynini.compose(unlisted_inputs, abbr)

    tagged = pynutil.insert("value: \"") + abbr.optimize() + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, deterministic: bool = True):
    """Roman-numeral verbalizer: strips the token fields and emits either the
    cardinal spelling or the ordinal spelling (via the ordinal suffix rules).
    """
    super().__init__(name="roman", kind="verbalize", deterministic=deterministic)

    suffix = OrdinalFst().suffix
    cardinal = pynini.closure(NEMO_NOT_QUOTE)
    # Cardinal spelling rewritten with ordinal suffixes ("four" -> "fourth").
    ordinal = pynini.compose(cardinal, suffix)

    # key_cardinal: "<word>" integer: "<num>" -> "<word> <num>"
    graph = (pynutil.delete("key_cardinal: \"")
             + pynini.closure(NEMO_NOT_QUOTE, 1)
             + pynutil.delete("\"")
             + pynini.accep(" ")
             + pynutil.delete("integer: \"")
             + cardinal
             + pynutil.delete("\"")).optimize()
    # default_cardinal -> bare cardinal.
    graph |= (pynutil.delete("default_cardinal: \"default\" integer: \"")
              + cardinal + pynutil.delete("\"")).optimize()
    # default_ordinal -> bare ordinal.
    graph |= (pynutil.delete("default_ordinal: \"default\" integer: \"")
              + ordinal + pynutil.delete("\"")).optimize()
    # key_the_ordinal -> "<word> [the] <ordinal>".
    graph |= (pynutil.delete("key_the_ordinal: \"")
              + pynini.closure(NEMO_NOT_QUOTE, 1)
              + pynutil.delete("\"")
              + pynini.accep(" ")
              + pynutil.delete("integer: \"")
              + pynini.closure(pynutil.insert("the "), 0, 1)
              + ordinal
              + pynutil.delete("\"")).optimize()

    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def __init__(self, tn_time: GraphFst, deterministic: bool = True):
    """Russian ITN time classifier.

    Inverts the TN time grammar for the order-preserving case and adds the
    two colloquial patterns that need field permutation at verbalization.
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    # Invert TN tagger composed with its verbalizer to get the
    # order-preserving ITN path, tagged as hours.
    tn_graph = pynini.compose(
        tn_time.graph_preserve_order, TNTimeVerbalizer().graph).optimize()
    preserve_order = (pynutil.insert("hours: \"")
                      + pynini.invert(tn_graph).optimize()
                      + pynutil.insert("\""))

    # "пятнадцать минут шестого" -> 17:15
    # Requires permutations for the correct verbalization.
    minutes_of_next_hour = (
        pynutil.insert("minutes: \"")
        + pynini.invert(tn_time.minutes).optimize()
        + pynutil.insert("\"")
        + pynini.accep(NEMO_SPACE)
        + pynutil.insert("hours: \"")
        + pynini.invert(tn_time.increment_hour_ordinal).optimize()
        + pynutil.insert("\"")
    ).optimize()

    # "без пятнадцати минут шесть" -> 17:45
    # Requires permutation for the correct verbalization.
    minutes_to_hour = (
        pynini.cross("без ", "minutes: \"")
        + pynini.invert(tn_time.mins_to_h)
        + pynutil.insert("\"")
        + pynini.accep(NEMO_SPACE)
        + pynutil.insert("hours: \"")
        + pynini.invert(tn_time.increment_hour_cardinal).optimize()
        + pynutil.insert("\"")
    )

    graph = preserve_order | minutes_of_next_hour | minutes_to_hour
    self.fst = self.add_tokens(graph).optimize()
def __init__(self, cardinal, deterministic: bool = True):
    """Fraction classifier: "2 1/4" -> integer_part / numerator / denominator fields.

    Args:
        cardinal: cardinal GraphFst whose `graph` verbalizes the numbers.
        deterministic: whether to build the single-output grammar.
    """
    super().__init__(name="fraction", kind="classify", deterministic=deterministic)

    cardinal_graph = cardinal.graph
    integer = pynutil.insert(
        "integer_part: \"") + cardinal_graph + pynutil.insert("\"")
    # Numerator ends at "/" (with or without surrounding spaces).
    numerator = (pynutil.insert("numerator: \"") + cardinal_graph
                 + (pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")))

    # Optional ordinal endings on the denominator ("3rd", "4TH", ...), deleted.
    endings = ["rd", "th", "st", "nd"]
    endings += [x.upper() for x in endings]
    optional_end = pynini.closure(pynini.cross(pynini.union(*endings), ""), 0, 1)
    denominator = pynutil.insert(
        "denominator: \""
    ) + cardinal_graph + optional_end + pynutil.insert("\"")

    # Optional integer part followed by numerator/denominator.
    graph = pynini.closure(integer + pynini.accep(" "), 0, 1) + (numerator + denominator)
    # Also accept the alternate written forms listed in
    # data/number/fraction.tsv, mapped onto numerator "/" denominator.
    graph |= pynini.closure(
        integer + (pynini.accep(" ") | pynutil.insert(" ")), 0, 1) + pynini.compose(
            pynini.string_file(get_abs_path("data/number/fraction.tsv")),
            (numerator + denominator))

    self.graph = graph
    final_graph = self.add_tokens(self.graph)
    self.fst = final_graph.optimize()
def __init__(self, input_case: str, deterministic: bool = True):
    """Whitelist classifier (default whitelist + measurement units).

    Args:
        input_case: accepted input case; kept for interface compatibility.
        deterministic: whether to build the single-output grammar.
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
        # Load a whitelist TSV and build a string_map graph.
        # NOTE: the original code branched on `input_case` but both branches
        # lower-cased the key identically — the dead branch was collapsed
        # here; behavior is unchanged.
        whitelist = load_labels(get_abs_path(file))
        whitelist = [[x[0].lower()] + x[1:] for x in whitelist]
        return pynini.string_map(whitelist)

    graph = _get_whitelist_graph(input_case)
    units_graph = _get_whitelist_graph(input_case, file="data/measurements.tsv")
    # do not replace single letter units, like `м` or `°`
    units_graph = pynini.compose(
        pynini.difference(pynini.project(units_graph, "input"), NEMO_ALPHA),
        units_graph)
    graph |= units_graph.optimize()
    # Latin-letter sequences transliterated token by token.
    graph |= TO_LATIN + pynini.closure(pynutil.insert(" ") + TO_LATIN)

    self.final_graph = convert_space(graph)
    self.fst = (pynutil.insert("name: \"") + self.final_graph
                + pynutil.insert("\"")).optimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """Whitelist classifier with optional user-supplied whitelist file.

    Args:
        input_case: accepted input case; kept for interface compatibility.
        deterministic: whether to build the single-output grammar.
        input_file: optional path to a whitelist TSV that replaces the default one.
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file):
        # Load a whitelist TSV and build a string_map graph.
        # NOTE: the original code branched on `input_case` but both branches
        # lower-cased the key identically — the dead branch was collapsed
        # here; behavior is unchanged.
        whitelist = load_labels(file)
        whitelist = [[x[0].lower()] + x[1:] for x in whitelist]
        return pynini.string_map(whitelist)

    graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist.tsv"))
    if input_file:
        # A user-provided whitelist replaces the default one entirely.
        graph = _get_whitelist_graph(input_case, input_file)

    units_graph = _get_whitelist_graph(input_case, file=get_abs_path("data/measurements.tsv"))
    # do not replace single letter units, like `м`, `°` and `%` will be replaced
    units_graph = pynini.compose(
        (NEMO_CHAR ** (2, ...) | pynini.difference(NEMO_CHAR, RU_ALPHA)), units_graph)
    graph |= units_graph.optimize()
    # Latin-letter sequences transliterated to Cyrillic token by token.
    graph |= TO_CYRILLIC + pynini.closure(pynutil.insert(" ") + TO_CYRILLIC)

    self.final_graph = convert_space(graph)
    self.fst = (pynutil.insert("name: \"") + self.final_graph
                + pynutil.insert("\"")).optimize()
def __init__(self, deterministic: bool = True):
    """German ordinal verbalizer: converts a spelled-out integer token to its
    ordinal form by rewriting the stem (digit/ties/thousands tables) and/or
    appending one of the German ordinal suffixes at end of string.
    """
    super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

    # Inverted TSVs map cardinal spellings to ordinal stems.
    graph_digit = pynini.string_file(
        get_abs_path("data/ordinals/digit.tsv")).invert()
    graph_ties = pynini.string_file(
        get_abs_path("data/ordinals/ties.tsv")).invert()
    graph_thousands = pynini.string_file(
        get_abs_path("data/ordinals/thousands.tsv")).invert()

    # Strip the integer: "..." wrapper.
    graph = pynutil.delete("integer: \"") + pynini.closure(
        NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    # All inflected ordinal endings; small weight keeps the insertion cheap.
    suffixes = pynini.union("ten", "tem", "ter", "tes", "te")
    convert_rest = pynutil.insert(suffixes, weight=0.01)

    self.ordinal_stem = graph_digit | graph_ties | graph_thousands
    # Rewrite an optional stem and append a suffix, only at end of string.
    suffix = pynini.cdrewrite(
        pynini.closure(self.ordinal_stem, 0, 1) + convert_rest,
        "",
        "[EOS]",
        NEMO_SIGMA,
    ).optimize()
    self.graph = pynini.compose(graph, suffix)
    self.suffix = suffix
    delete_tokens = self.delete_tokens(self.graph)
    self.fst = delete_tokens.optimize()
def __init__(self, deterministic: bool = True):
    """English ordinal verbalizer: converts a spelled-out integer token to its
    ordinal form ("four" -> "fourth", "twenty" -> "twentieth") by applying
    suffix rewrites at end of string.
    """
    super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

    # Inverted TSVs map cardinal spellings to irregular ordinal forms.
    graph_digit = pynini.string_file(
        get_abs_path("data/ordinals/digit.tsv")).invert()
    graph_teens = pynini.string_file(
        get_abs_path("data/ordinals/teen.tsv")).invert()

    # Strip the integer: "..." wrapper.
    graph = (pynutil.delete("integer:")
             + delete_space
             + pynutil.delete("\"")
             + pynini.closure(NEMO_NOT_QUOTE, 1)
             + pynutil.delete("\""))

    # Default: just append "th" (cheapest fallback).
    convert_rest = pynutil.insert("th", weight=0.01)

    # Irregulars and "ty" -> "tieth" outrank the default via lower weight.
    suffix = pynini.cdrewrite(
        graph_digit
        | graph_teens
        | pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
        | convert_rest,
        "",
        "[EOS]",
        NEMO_SIGMA,
    ).optimize()
    self.graph = pynini.compose(graph, suffix)
    self.suffix = suffix
    delete_tokens = self.delete_tokens(self.graph)
    self.fst = delete_tokens.optimize()
def parallelInversion(transducersAndOutputs, alphabet=None):
    """Recovers a common input string for several (output, transducer) pairs.

    Each transducer is inverted and composed with its observed output; the
    input-side projections are intersected, optionally restricted to
    `alphabet`*, arc weights are flattened to 1 (epsilons stay free), and the
    shortest path is returned as a string.

    Args:
        transducersAndOutputs: iterable of (output FST/string, transducer FST).
        alphabet: optional iterable of symbols to restrict candidate inputs.

    Returns:
        The recovered input string, or None when no consistent input exists
        (best-effort contract: FST failures are swallowed).
    """
    try:
        candidates = [
            sandwich.compose(y, sandwich.invert(t)).project(True)
            for y, t in transducersAndOutputs
        ]
        a = reduce(sandwich.intersect, candidates)
        if alphabet is not None:
            lm = sandwich.union(*alphabet).closure()
            a = a * lm
        a.topsort()
        # Flatten weights: every non-epsilon arc costs 1 so shortestpath
        # prefers the shortest string rather than the original weights.
        for s in a.states():
            iterator = a.mutable_arcs(s)
            while not iterator.done():
                value = iterator.value()
                assert value.olabel == value.ilabel
                if value.olabel != 0:
                    value.weight = 1
                iterator.set_value(value)
                iterator.next()
        return sandwich.shortestpath(a).stringify()
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed while keeping the best-effort
        # "return None on failure" contract.
        return None
def __init__(self, whitelist: 'pynini.FstLike', deterministic: bool = True):
    """Abbreviation classifier with a preferred all-caps path and heavily
    penalized mixed-case / dotted variants; whitelist entries are excluded.
    """
    super().__init__(name="abbreviation", kind="classify", deterministic=deterministic)

    # Preferred shape: all capitals, spaces inserted between letters.
    main_graph = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)
    # Less likely shapes carry weight 110 inside the misc graph.
    misc_graph = pynutil.add_weight(
        TO_LOWER + pynini.closure(insert_space + pynini.union(TO_LOWER | NEMO_LOWER)), 110)
    misc_graph |= pynutil.add_weight(
        pynini.closure(NEMO_UPPER, 2) + pynini.closure(insert_space + NEMO_LOWER, 1), 110)
    # Dotted abbreviations; the dots are deleted on output.
    misc_graph |= (
        NEMO_UPPER + pynutil.delete(".")
        + pynini.closure(insert_space + NEMO_UPPER + pynutil.delete(".")))
    misc_graph |= pynutil.add_weight(
        TO_LOWER + pynutil.delete(".")
        + pynini.closure(insert_space + TO_LOWER + pynutil.delete(".")), 110)

    # set weight of the misc graph to a value higher than the word graph
    graph = pynutil.add_weight(main_graph.optimize(), 10) | pynutil.add_weight(
        misc_graph.optimize(), 101)

    # exclude words that are included in the whitelist
    graph = pynini.compose(
        pynini.difference(pynini.project(graph, "input"),
                          pynini.project(whitelist.graph, "input")),
        graph)
    graph = pynutil.insert(
        "value: \"") + graph.optimize() + pynutil.insert("\"")
    graph = self.add_tokens(graph)
    self.fst = graph.optimize()
def process_window(input_str, window_fst, model, pruning_weight=5, rejection_weight=1.5):
    '''
    Compose a window input automaton with the model.

    Args:
        input_str: the raw window text (used only to decide on the OOV fallback).
        window_fst: input automaton for the window; mutated in place.
        model: sequence of FSTs applied left-to-right; the first one supplies
            the symbol tables used for relabeling.
        pruning_weight: beam width for pruning after each composition.
        rejection_weight: per-character cost of keeping the input unchanged
            (single-word windows only).

    Returns:
        The (mutated) lattice FST for the window.

    NOTE(review): uses the legacy `project(project_output=...)` /
    `pynini.acceptor` API — presumably an older pynini; verify before upgrading.
    '''
    t1 = time.time()
    window_fst.relabel_tables(
        new_isymbols=model[0].output_symbols(),
        new_osymbols=model[0].output_symbols())
    # Compose stage by stage, projecting/pruning after each step to keep the
    # lattice small; the in-place call order matters.
    for fst in model:
        window_fst = pynini.compose(window_fst, fst)
        window_fst.project(project_output=True)
        window_fst.prune(weight=pruning_weight)
        window_fst.optimize()
    t3 = time.time()
    logging.debug('- composition: {}s'.format(t3 - t1))
    # allow also identity for windows of length 1
    # (with weight `rejection_weight`)
    if ' ' not in input_str:
        # The formula:
        #   rejection_weight*(len(input_str)+2)
        # means that rejection_weight*2 is the initial cost of having an OOV
        # word (which then becomes more expensive with increasing length).
        # While discovered by accident, this turned out to work well as
        # a very naive OOV word model.
        window_fst.union(
            pynini.acceptor(
                escape_for_pynini(input_str),
                weight=rejection_weight * (len(input_str) + 2)))
    t2 = time.time()
    logging.debug('Total processing time: {}s'.format(t2 - t1))
    return window_fst
def decode_lattice(lattice: pynini.Fst, lm: pynini.Fst, sym: pynini.SymbolTable) -> str:
    """Decodes the lattice by rescoring with the LM and taking the best path."""
    rescored = pynini.compose(lattice, lm)
    assert rescored.start() != pynini.NO_STATE_ID, "composition failure"
    # Pynini can join the string for us.
    best_path = pynini.shortestpath(rescored).rmepsilon()
    return best_path.string(sym)
def get_serial_graph(self):
    """
    Finite state transducer for classifying serial.
        The serial is a combination of digits, letters and dashes, e.g.:
        c325-b -> tokens { cardinal { integer: "си три два пять би" } }
    """
    num_graph = self.single_digits_graph
    # Letters may be Latin (transliterated to Cyrillic) or already Cyrillic.
    alpha = TO_CYRILLIC | RU_ALPHA

    # "-" and "/" become spaces; a space is also inserted between groups.
    delimiter = insert_space | pynini.cross("-", " ") | pynini.cross(
        "/", " ")
    letter_num = pynini.closure(alpha + delimiter, 1) + num_graph
    num_letter = pynini.closure(num_graph + delimiter, 1) + alpha
    num_delimiter_num = pynini.closure(num_graph + delimiter, 1) + num_graph
    next_alpha_or_num = pynini.closure(delimiter + (alpha | num_graph))
    serial_graph = (letter_num | num_letter | num_delimiter_num) + next_alpha_or_num

    # at least 1 alpha and 1 digit is present
    at_least_one_alpha_num = (
        NEMO_SIGMA + (RU_ALPHA | pynini.project(TO_CYRILLIC, "input"))
        + NEMO_SIGMA + NEMO_DIGIT + NEMO_SIGMA) | (
            NEMO_SIGMA + NEMO_DIGIT + NEMO_SIGMA
            + (RU_ALPHA | pynini.project(TO_CYRILLIC, "input")) + NEMO_SIGMA)
    serial_graph = pynini.compose(at_least_one_alpha_num,
                                  serial_graph.optimize()).optimize()
    # numbers only with 2+ delimiters
    serial_graph |= (num_graph + delimiter + num_graph + delimiter + num_graph
                     + pynini.closure(delimiter + num_graph)).optimize()
    return serial_graph.optimize()
def __construct_compound_stems_nn(self, tmp):
    '''
    Default noun compounding stems.

    Same construction as the token-type variant: noun stems from `tmp`
    extended (via epsilon insertion) with <+NN> + gender + <Nom> <Sg>/<Pl>,
    bracketed with <Kompos_Stems> ... <NN> <kompos> <nativ>.

    NOTE(review): uses the legacy pynini API (acceptor/transducer with
    explicit token types, multi-argument concat) — presumably pynini < 2.1;
    verify before upgrading.
    '''
    return pynini.concat(
        # Opening compounding bracket.
        pynini.transducer("", "<Kompos_Stems>",
                          output_token_type=self.__syms.alphabet),
        pynini.compose(
            pynini.concat(
                self.__syms.characters.closure(1),
                # Insert NN analysis tags (singular or plural nominative).
                pynini.union(
                    pynini.transducer(
                        "",
                        pynini.concat(
                            pynini.acceptor(
                                "<+NN>", token_type=self.__syms.alphabet),
                            self.__syms.gender,
                            pynini.acceptor(
                                "<Nom> <Sg>", token_type=self.__syms.alphabet))),
                    pynini.transducer(
                        "",
                        pynini.concat(
                            pynini.acceptor(
                                "<+NN>", token_type=self.__syms.alphabet),
                            self.__syms.gender,
                            pynini.acceptor(
                                "<Nom> <Pl>", token_type=self.__syms.alphabet))))),
            tmp),
        pynini.acceptor("<NN>", token_type=self.__syms.alphabet),
        # Closing compounding markers.
        pynini.transducer(
            "", "<kompos> <nativ>",
            output_token_type=self.__syms.alphabet)).optimize()
def parse(self):
    """Parses `self.input` against the rule FST: intersects the input
    acceptor with the rules, keeps the best path, and extracts its rules
    in reverse-polish order."""
    input_fsa = fsa_from_list_of_symbols(self.input, self.fst.mutable_input_symbols())
    best = shortestpath(compose(input_fsa, self.fst))
    best.topsort()
    self._best = best
    self._reverse_polish_rules = retrieve_rules(best)
def get_paths(decode_graph, isymbs, osymbs, phoneme_list):
    """Composes the phoneme sequence with the decode graph and returns all
    resulting paths as a list."""
    phoneme_fst = pynini.acceptor(" ".join(phoneme_list), token_type=isymbs)
    lattice = pynini.compose(phoneme_fst, decode_graph)
    return list(lattice.paths(input_token_type=isymbs, output_token_type=osymbs))