Example #1
    def get_cardinal_numbers(self,
                             number_names: dict,
                             alternative_formats: dict,
                             mode: str = "all"):
        """Returns cardinal numbers names graph.

        Args:
            number_names: number_names for cardinal and ordinal numbers
            alternative_formats: alternative number formats
            mode: "all" - to return graph that includes all Ru cases, "nominative" to return only the nominative form
        """
        if mode == "all":
            cardinal_names = number_names['cardinal_number_names']
        elif mode == "nominative":
            cardinal_names = number_names['cardinal_names_nominative']
        else:
            raise ValueError(f'{mode} is not supported.')
        one_thousand_alternative = alternative_formats[
            'one_thousand_alternative']
        separators = alternative_formats['separators']

        cardinal_numbers = cardinal_names | pynini.compose(
            cardinal_names, one_thousand_alternative)
        cardinal_numbers = pynini.compose(separators, cardinal_numbers)
        return cardinal_numbers
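Below is a self-contained toy sketch of the same composition pattern; cardinal_names, one_thousand_alternative and separators here are invented stand-ins for the NeMo number_names / alternative_formats data, not the real graphs.

import pynini
from pynini.lib import byte, pynutil

# Toy stand-ins: digit strings -> words, an alternative verbalization attached
# via composition, and a rule that strips "," separators from the input.
cardinal_names = pynini.string_map([("1000", "one thousand"), ("2", "two")])
one_thousand_alternative = pynini.cross("one thousand", "a thousand")
separators = pynini.cdrewrite(pynutil.delete(","), "", "", pynini.closure(byte.BYTE))

cardinal_numbers = cardinal_names | pynini.compose(cardinal_names, one_thousand_alternative)
cardinal_numbers = pynini.compose(separators, cardinal_numbers)
print(sorted(pynini.compose("1,000", cardinal_numbers).paths().ostrings()))
# ['a thousand', 'one thousand']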
Example #2
def rewrite_lattice(
    string: pynini.FstLike,
    rule: pynini.Fst,
    token_type: Optional[pynini.TokenType] = None) -> pynini.Fst:
  """Constructs a weighted lattice of output strings.

  Constructs a weighted, epsilon-free lattice of output strings given an
  input FST (or string) and a rule FST.

  Args:
    string: Input string or FST.
    rule: Input rule WFST.
    token_type: Optional input token type, or symbol table.

  Returns:
    An epsilon-free WFSA.

  Raises:
    Error: Composition failure.
  """
  # TODO(kbg): Consider adding support for PDT and MPDT composition.
  # TODO(kbg): Consider using `contextlib.nullcontext` here instead.
  if token_type is None:
    lattice = pynini.compose(string, rule, compose_filter="alt_sequence")
  else:
    with pynini.default_token_type(token_type):
      lattice = pynini.compose(string, rule, compose_filter="alt_sequence")
  if lattice.start() == pynini.NO_STATE_ID:
    raise Error("Composition failure")
  return lattice.project("output").rmepsilon()
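A possible usage sketch for rewrite_lattice as defined above, with a toy rewrite rule:

import pynini

# Toy rule: rewrite "A" -> "a" everywhere over a three-symbol alphabet.
sigma_star = pynini.closure(pynini.union("a", "b", "A")).optimize()
rule = pynini.cdrewrite(pynini.cross("A", "a"), "", "", sigma_star)
lattice = rewrite_lattice("AbA", rule)
# The lattice is an epsilon-free WFSA over output strings; take the best one.
print(pynini.shortestpath(lattice).string())  # "aba"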
Example #3
            def get_token_sem_graph(classify_and_verbalize):
                token_plus_punct = (
                    pynini.closure(punct + pynutil.insert(" ")) +
                    classify_and_verbalize +
                    pynini.closure(pynutil.insert(" ") + punct))

                graph = token_plus_punct + pynini.closure(
                    (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                                    delete_extra_space)
                     | (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
                    token_plus_punct)

                graph |= punct_only + pynini.closure(punct)
                graph = delete_space + graph + delete_space

                remove_extra_spaces = pynini.closure(
                    NEMO_NOT_SPACE,
                    1) + pynini.closure(delete_extra_space +
                                        pynini.closure(NEMO_NOT_SPACE, 1))
                remove_extra_spaces |= (
                    pynini.closure(pynutil.delete(" "), 1) +
                    pynini.closure(NEMO_NOT_SPACE, 1) +
                    pynini.closure(delete_extra_space +
                                   pynini.closure(NEMO_NOT_SPACE, 1)))

                graph = pynini.compose(graph.optimize(),
                                       remove_extra_spaces).optimize()
                return graph
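A small toy of the whitespace-collapsing building block composed into the graph above; WHITE_SPACE and delete_extra_space are invented stand-ins for the NEMO_* helpers, not the real constants.

import pynini

WHITE_SPACE = pynini.union(" ", "\t", "\n")
delete_extra_space = pynini.cross(pynini.closure(WHITE_SPACE, 1), " ")
collapse = pynini.compose(pynini.closure(WHITE_SPACE, 1), delete_extra_space)
# A run of whitespace rewrites to a single space.
best = pynini.shortestpath(pynini.compose("  \t ", collapse))
print(best.project("output").rmepsilon().string())  # " "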
Example #4
    def add_optional_and(self, graph):
        if not self.deterministic:
            graph = pynini.compose(
                graph, NEMO_SIGMA +
                pynini.closure(pynini.cross("hundred ", " "), 0, 1) +
                NEMO_SIGMA)

        not_quote = pynini.closure(NEMO_NOT_QUOTE)
        no_thousand_million = pynini.difference(
            not_quote, not_quote + pynini.union("thousand", "million") +
            not_quote).optimize()
        integer = (not_quote + pynutil.add_weight(
            pynini.cross("hundred ", "hundred and ") + no_thousand_million,
            -0.0001)).optimize()

        no_hundred = pynini.difference(
            NEMO_SIGMA,
            not_quote + pynini.accep("hundred") + not_quote).optimize()
        integer |= (not_quote + pynutil.add_weight(
            pynini.cross("thousand ", "thousand and ") + no_hundred,
            -0.0001)).optimize()

        graph_with_and = pynini.compose(
            graph, integer).optimize() | pynutil.add_weight(graph, 0.00001)

        return graph_with_and
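A short self-contained sketch of the pynini.difference idiom used above to rule out strings containing a given substring (toy alphabet):

import pynini

sigma = pynini.closure(pynini.union("a", "b", "x")).optimize()
# Strings over {a, b, x} that do not contain "x" anywhere.
no_x = pynini.difference(sigma, sigma + "x" + sigma).optimize()
print(pynini.compose("ab", no_x).start() != pynini.NO_STATE_ID)   # True
print(pynini.compose("axb", no_x).start() != pynini.NO_STATE_ID)  # False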
Example #5
    def __construct_r1(self):
        '''
    Umlaut

    Apfel$ ==> Äpfel
    '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<CB>", "<FB>", "<UL>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<e>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>",
                    "<NoDef>", "<UL>", "<FB>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        # r1a
        tau = pynini.push(pynini.string_map(
            [("a", "ä"), ("o", "ö"), ("u", "ü"), ("A", "Ä"), ("O", "Ö"),
             ("U", "Ü")],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet),
                          push_labels=True)
        lc = pynini.union(
            self.__syms.consonants,
            pynini.string_map(
                ["<CB>", "<WB>", "<NoHy>", "<NoDef>", "<^UC>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).optimize()
        r1a = pynini.cdrewrite(
            tau, lc,
            pynini.concat(
                alphabet.closure(),
                pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
            alphabet.closure())

        # r1c
        tau = pynini.transducer("a", "", input_token_type=self.__syms.alphabet)
        r1c = pynini.cdrewrite(
            tau,
            pynini.string_map(
                ["ä", "Ä"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            pynini.concat(
                self.__syms.consonants_lower, alphabet.closure(),
                pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
            alphabet.closure()).optimize()

        # r1d
        r1d = pynini.cdrewrite(
            pynini.transducer("<UL>",
                              "<FB>",
                              input_token_type=self.__syms.alphabet,
                              output_token_type=self.__syms.alphabet), "", "",
            alphabet.closure())

        return pynini.compose(r1a, pynini.compose(r1c, r1d)).optimize()
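A byte-level toy of the context-dependent umlaut rewrite; the real rules above run over a custom symbol table (self.__syms.alphabet), so this only illustrates the cdrewrite pattern.

import pynini
from pynini.lib import byte

sigma_star = pynini.closure(byte.BYTE)
# Rewrite a/A -> ä/Ä when an "<UL>" trigger appears later in the string.
tau = pynini.cross("a", "ä") | pynini.cross("A", "Ä")
r1a_toy = pynini.cdrewrite(tau, "", pynini.closure(byte.BYTE) + "<UL>", sigma_star)
best = pynini.shortestpath(pynini.compose("Apfel<UL>", r1a_toy))
print(best.project("output").rmepsilon().string())  # "Äpfel<UL>"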
Example #6
    def __init__(
        self,
        input_case: str,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        deterministic: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f"Creating ClassifyFst grammars.")

            word_graph = WordFst(deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(input_case=input_case, deterministic=deterministic).fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

            classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(word_graph, 100)

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                | (pynutil.insert(" ") + punct),
                1,
            )
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )

            graph = (
                token_plus_punct
                + pynini.closure(
                    (
                        pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                        | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                    )
                    + token_plus_punct
                ).optimize()
            )

            graph = delete_space + graph + delete_space
            graph |= punct

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Example #7
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        super().__init__(name="decimal",
                         kind="classify",
                         deterministic=deterministic)

        cardinal_graph = cardinal.graph
        cardinal_graph_hundred_component_at_least_one_none_zero_digit = (
            cardinal.graph_hundred_component_at_least_one_none_zero_digit)

        self.graph = cardinal.single_digits_graph.optimize()

        if not deterministic:
            self.graph = self.graph | cardinal_graph

        point = pynutil.delete(".")
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        self.graph_fractional = pynutil.insert(
            "fractional_part: \"") + self.graph + pynutil.insert("\"")
        self.graph_integer = pynutil.insert(
            "integer_part: \"") + cardinal_graph + pynutil.insert("\"")
        final_graph_wo_sign = (
            pynini.closure(self.graph_integer + pynutil.insert(" "), 0, 1) +
            point + pynutil.insert(" ") + self.graph_fractional)

        self.final_graph_wo_negative = final_graph_wo_sign | get_quantity(
            final_graph_wo_sign,
            cardinal_graph_hundred_component_at_least_one_none_zero_digit)

        # reduce options for the non-deterministic case: allow either "oh" or "zero", but not a combination of both
        if not deterministic:
            no_oh_zero = pynini.difference(
                NEMO_SIGMA,
                (NEMO_SIGMA + "oh" + NEMO_SIGMA + "zero" + NEMO_SIGMA)
                | (NEMO_SIGMA + "zero" + NEMO_SIGMA + "oh" + NEMO_SIGMA),
            ).optimize()
            no_zero_oh = pynini.difference(
                NEMO_SIGMA, NEMO_SIGMA + pynini.accep("zero") + NEMO_SIGMA +
                pynini.accep("oh") + NEMO_SIGMA).optimize()

            self.final_graph_wo_negative |= pynini.compose(
                self.final_graph_wo_negative,
                pynini.cdrewrite(
                    pynini.cross("integer_part: \"zero\"",
                                 "integer_part: \"oh\""), NEMO_SIGMA,
                    NEMO_SIGMA, NEMO_SIGMA),
            )
            self.final_graph_wo_negative = pynini.compose(
                self.final_graph_wo_negative, no_oh_zero).optimize()
            self.final_graph_wo_negative = pynini.compose(
                self.final_graph_wo_negative, no_zero_oh).optimize()

        final_graph = optional_graph_negative + self.final_graph_wo_negative

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Example #8
    def get_serial_graph(self):
        """
        Finite state transducer for classifying serials (handles only cases without delimiters;
        values with delimiters are handled by default).
            The serial is a combination of digits, letters and dashes, e.g.:
            c325b -> tokens { cardinal { integer: "c three two five b" } }
        """
        num_graph = self.single_digits_graph

        if not self.deterministic:
            num_graph |= self.graph

        # add space between letter and digit
        graph_with_space = pynini.compose(
            pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA, NEMO_DIGIT,
                             NEMO_SIGMA),
            pynini.cdrewrite(pynutil.insert(" "), NEMO_DIGIT, NEMO_ALPHA,
                             NEMO_SIGMA),
        )

        # make sure at least one digit and one letter are present
        not_space = pynini.closure(NEMO_NOT_SPACE)
        graph_with_space = pynini.compose(
            (not_space + NEMO_ALPHA + not_space + NEMO_DIGIT + not_space)
            | (not_space + NEMO_DIGIT + not_space + NEMO_ALPHA + not_space),
            graph_with_space,
        )

        keep_space = pynini.accep(" ")
        serial_graph = pynini.compose(
            graph_with_space,
            pynini.closure(pynini.closure(NEMO_ALPHA, 1) + keep_space, 1) +
            num_graph +
            pynini.closure(keep_space + pynini.closure(NEMO_ALPHA) +
                           pynini.closure(keep_space + num_graph, 0, 1)),
        )
        serial_graph |= pynini.compose(
            graph_with_space,
            num_graph + keep_space + pynini.closure(NEMO_ALPHA, 1) +
            pynini.closure(keep_space + num_graph + pynini.closure(
                keep_space + pynini.closure(NEMO_ALPHA), 0, 1)),
        )

        # serial graph with delimiter
        delimiter = pynini.accep("-") | pynini.accep("/")
        alphas = pynini.closure(NEMO_ALPHA, 1)
        letter_num = alphas + delimiter + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
        next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
        next_alpha_or_num |= pynini.closure(delimiter + num_graph +
                                            pynutil.insert(" ") + alphas)

        serial_graph |= letter_num + next_alpha_or_num
        serial_graph |= num_letter + next_alpha_or_num
        # numbers only with 2+ delimiters
        serial_graph |= (num_graph + delimiter + num_graph + delimiter +
                         num_graph + pynini.closure(delimiter + num_graph))
        return pynutil.add_weight(serial_graph, 2)
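A compact sketch of the letter/digit spacing trick above, with plain ASCII classes standing in for NEMO_ALPHA, NEMO_DIGIT and NEMO_SIGMA:

import pynini
from pynini.lib import byte, pynutil

ALPHA = pynini.union(*"abcdefghijklmnopqrstuvwxyz")
DIGIT = pynini.union(*"0123456789")
sigma_star = pynini.closure(byte.BYTE)
add_space = pynini.compose(
    pynini.cdrewrite(pynutil.insert(" "), ALPHA, DIGIT, sigma_star),
    pynini.cdrewrite(pynutil.insert(" "), DIGIT, ALPHA, sigma_star),
)
best = pynini.shortestpath(pynini.compose("c325b", add_space))
print(best.project("output").rmepsilon().string())  # "c 325 b"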
Example #9
    def __init__(self, number_names: dict, alternative_formats: dict, deterministic: bool = False):
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        self.cardinal_numbers_default = self.get_cardinal_numbers(number_names, alternative_formats, mode="all")
        self.cardinal_numbers_nominative = self.get_cardinal_numbers(
            number_names, alternative_formats, mode="nominative"
        )
        self.optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1
        )

        self.cardinal_numbers_with_optional_negative = (
            self.optional_graph_negative
            + pynutil.insert("integer: \"")
            + self.cardinal_numbers_default
            + pynutil.insert("\"")
        )

        # "03" -> remove leading zeros and verbalize
        leading_zeros = pynini.closure(pynini.cross("0", ""))
        self.cardinal_numbers_with_leading_zeros = (leading_zeros + self.cardinal_numbers_default).optimize()

        # "123" -> "один два три"
        single_digits_graph = pynini.compose(NEMO_DIGIT, self.cardinal_numbers_nominative)
        self.single_digits_graph = single_digits_graph + pynini.closure(insert_space + single_digits_graph)

        optional_quantity = pynini.string_file(get_abs_path("data/numbers/quantity.tsv")).optimize()
        optional_quantity = pynutil.insert("quantity: \"") + optional_quantity + pynutil.insert("\"")
        optional_quantity = pynini.closure(
            (pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space) + optional_quantity, 0, 1
        )

        serial_graph = self.get_serial_graph()

        final_graph = (
            self.optional_graph_negative
            + pynutil.insert("integer: \"")
            + self.cardinal_numbers_with_leading_zeros
            + pynutil.insert("\"")
            + optional_quantity
        ).optimize()

        final_graph = pynutil.add_weight(final_graph, -0.1)
        final_graph |= (
            pynutil.insert("integer: \"")
            + pynutil.add_weight(self.single_digits_graph | serial_graph, 10)
            + pynutil.insert("\"")
        )
        self.final_graph = final_graph

        # to cover cases "2-х" -> "двух" (this is not covered by ordinal endings)
        final_graph |= pynini.compose(
            pynini.compose(NEMO_DIGIT ** (1, ...) + pynini.cross('-х', ''), final_graph),
            NEMO_SIGMA + pynini.accep("х\"") + NEMO_SIGMA,
        )
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
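A stripped-down sketch of the leading-zero handling above, keeping digits instead of mapping them to Russian number names:

import pynini

NONZERO = pynini.union(*"123456789")
DIGIT = pynini.union(*"0123456789")
strip_leading_zeros = pynini.closure(pynini.cross("0", "")) + NONZERO + pynini.closure(DIGIT)
best = pynini.shortestpath(pynini.compose("007", strip_leading_zeros))
print(best.project("output").rmepsilon().string())  # "7"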
Example #10
    def __init__(self, deterministic: bool = True):
        super().__init__(name="telephone",
                         kind="classify",
                         deterministic=deterministic)

        add_separator = pynutil.insert(", ")  # between components
        digit = pynini.invert(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ).optimize() | pynini.cross("0", "o")

        country_code = (pynutil.insert("country_code: \"") +
                        pynini.closure(pynutil.delete("+"), 0, 1) +
                        pynini.closure(digit + insert_space, 0, 2) + digit +
                        pynutil.insert("\""))
        optional_country_code = pynini.closure(
            country_code + pynini.closure(pynutil.delete("-"), 0, 1) +
            delete_space + insert_space, 0, 1)

        area_part_common = pynutil.add_weight(
            pynini.cross("800", "eight hundred"), -1.1)
        area_part_default = pynini.closure(digit + insert_space, 2, 2) + digit
        area_part = area_part_default | area_part_common

        area_part = (
            (area_part + pynutil.delete("-"))
            | (pynutil.delete("(") + area_part +
               (pynutil.delete(") ") | pynutil.delete(")-")))) + add_separator

        del_separator = pynini.closure(pynini.union("-", " "), 0, 1)
        number_length = ((NEMO_DIGIT + del_separator) |
                         (NEMO_ALPHA + del_separator))**7
        number_words = pynini.closure((NEMO_DIGIT @ digit) +
                                      (insert_space | pynini.cross("-", ', '))
                                      | NEMO_ALPHA
                                      | (NEMO_ALPHA + pynini.cross("-", ' ')))
        number_words = pynini.compose(number_length, number_words)
        number_part = area_part + number_words
        number_part = pynutil.insert(
            "number_part: \"") + number_part + pynutil.insert("\"")
        extension = (pynutil.insert("extension: \"") +
                     pynini.closure(digit + insert_space, 0, 3) + digit +
                     pynutil.insert("\""))
        optional_extension = pynini.closure(insert_space + extension, 0, 1)

        graph = optional_country_code + number_part + optional_extension

        # ip
        digit_to_str_graph = pynini.compose(
            NEMO_DIGIT**(1, 3),
            digit + pynini.closure(pynutil.insert(" ") + digit)).optimize()
        ip_graph = digit_to_str_graph + (pynini.cross(".", " dot ") +
                                         digit_to_str_graph)**3
        graph |= pynutil.insert(
            "number_part: \"") + ip_graph.optimize() + pynutil.insert("\"")

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
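A toy version of the IP-address branch above, using a three-symbol digit map instead of the full digit.tsv:

import pynini
from pynini.lib import pynutil

digit = pynini.string_map([("0", "o"), ("1", "one"), ("2", "two")])
DIGIT = pynini.union(*"012")
octet = pynini.compose(DIGIT ** (1, 3), digit + pynini.closure(pynutil.insert(" ") + digit))
ip = octet + (pynini.cross(".", " dot ") + octet) ** 3
best = pynini.shortestpath(pynini.compose("10.0.2.1", ip))
print(best.project("output").rmepsilon().string())  # "one o dot o dot two dot one"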
Example #11
 def combine_formulas(self, formulas: list):
     combined_formula = formulas[0]
     for formula in formulas[1:]:
         combined_formula = pynini.compose(combined_formula, formula)
     remove_non_phonemes = pynini.cdrewrite(
         pynini.cross(pynini.union(*"#0").optimize(), ""), "", "",
         self.sigma_star)
     combined_formula = pynini.compose(combined_formula,
                                       remove_non_phonemes)
     return combined_formula
Example #12
    def get_address_graph(self, cardinal):
        """
        Finite state transducer for classifying addresses, e.g.:
            2788 San Tomas Expy, Santa Clara, CA 95051 ->
                units: "address" cardinal
                { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
                 preserve_order: true
        """
        ordinal_verbalizer = OrdinalVerbalizer().graph
        ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
        ordinal_num = pynini.compose(
            pynutil.insert("integer: \"") + ordinal_tagger +
            pynutil.insert("\""), ordinal_verbalizer)

        address_num = pynini.closure(NEMO_DIGIT,
                                     1) @ cardinal.single_digits_graph

        direction = (pynini.cross("E", "East")
                     | pynini.cross("S", "South")
                     | pynini.cross("W", "West")
                     | pynini.cross("N", "North"))
        direction = pynini.closure(
            pynutil.add_weight(pynini.accep(NEMO_SPACE) + direction, -1), 0, 1)

        address_words = pynini.string_file(
            get_abs_path("data/address/address_words.tsv"))
        address_words = (pynini.accep(NEMO_SPACE) +
                         pynini.closure(ordinal_num, 0, 1) +
                         pynini.closure(NEMO_ALPHA | NEMO_SPACE, 1) +
                         address_words)

        city = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
        city = pynini.closure(
            pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + city, 0, 1)

        state = pynini.invert(
            pynini.string_file(get_abs_path("data/address/states.tsv")))
        state = pynini.closure(
            pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + state, 0, 1)

        zip_code = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
        zip_code = pynini.closure(
            pynutil.add_weight(
                pynini.closure(pynini.cross(",", ""), 0, 1) +
                pynini.accep(NEMO_SPACE) + zip_code, -100),
            0,
            1,
        )

        address = (address_num + direction + address_words +
                   pynini.closure(pynini.cross(".", ""), 0, 1) + city + state +
                   zip_code)
        return address
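Example #13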
  def _create_levenshtein_automaton_lattice(self, query):
    """Constructs a lattice for a query string.

    Args:
      query: input string or acceptor.

    Returns:
      A lattice FST.
    """
    l_i = compose(query, self._e_i)
    lattice = compose(l_i, self._l_o)
    EditTransducer.check_wellformed_lattice(lattice)
    return lattice
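Example #14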
  def _create_lattice(self, iset, oset):
    """Creates edit lattice for a pair of input/output strings or acceptors.

    Args:
      iset: input string or acceptor
      oset: output string or acceptor.

    Returns:
      A lattice FST.
    """
    l_i = compose(iset, self._e_i)
    l_o = compose(self._e_o, oset)
    lattice = compose(l_i, l_o)
    EditTransducer.check_wellformed_lattice(lattice)
    return lattice
Example #15
 def __construct_compound_stems_nn(self, tmp):
     '''
 Default noun compounding stems
 '''
     with pynini.default_token_type(self.__syms.alphabet):
         kompos_stems = pynini.compose(
             pynini.concat(
                 self.__syms.characters.closure(1),
                 pynini.union(
                     pynini.cross(
                         "",
                         pynini.concat(
                             pynini.accep("<+NN>"),
                             pynini.concat(self.__syms.gender,
                                           pynini.accep("<Nom> <Sg>")))),
                     pynini.cross(
                         "",
                         pynini.concat(
                             pynini.accep("<+NN>"),
                             pynini.concat(self.__syms.gender,
                                           pynini.accep("<Nom> <Pl>")))))),
             tmp)
         return (pynini.cross("", "<Kompos_Stems>") + kompos_stems +
                 pynini.accep("<NN>") +
                 pynini.cross("", "<kompos> <nativ>")).optimize()
Example #16
    def __init__(self,
                 whitelist: 'pynini.FstLike',
                 deterministic: bool = True):
        super().__init__(name="abbreviation",
                         kind="classify",
                         deterministic=deterministic)

        dot = pynini.accep(".")
        # A.B.C. -> A. B. C.
        graph = NEMO_UPPER + dot + pynini.closure(
            insert_space + NEMO_UPPER + dot, 1)
        # A.B.C. -> A.B.C.
        graph |= NEMO_UPPER + dot + pynini.closure(NEMO_UPPER + dot, 1)
        # ABC -> ABC
        graph |= NEMO_UPPER + pynini.closure(NEMO_UPPER, 1)
        # ABC -> A B C
        graph |= NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)

        # exclude words that are included in the whitelist
        graph = pynini.compose(
            pynini.difference(pynini.project(graph, "input"),
                              pynini.project(whitelist.graph, "input")), graph)

        graph = pynutil.insert(
            "value: \"") + graph.optimize() + pynutil.insert("\"")
        graph = self.add_tokens(graph)
        self.fst = graph.optimize()
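A minimal illustration of the whitelist-exclusion composition used above, with an invented two-entry graph and a one-word whitelist:

import pynini

graph = pynini.string_map([("ab", "a b"), ("cd", "c d")])
whitelist = pynini.accep("ab")
# Keep only inputs of `graph` that are not in the whitelist, then re-attach the outputs.
filtered = pynini.compose(
    pynini.difference(pynini.project(graph, "input"), whitelist), graph)
print(pynini.compose("cd", filtered).start() != pynini.NO_STATE_ID)  # True
print(pynini.compose("ab", filtered).start() != pynini.NO_STATE_ID)  # False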
Example #17
    def __init__(self, deterministic: bool = True):
        super().__init__(name="roman",
                         kind="verbalize",
                         deterministic=deterministic)
        suffix = OrdinalFst().suffix

        cardinal = pynini.closure(NEMO_NOT_QUOTE)
        ordinal = pynini.compose(cardinal, suffix)

        graph = (pynutil.delete("key_cardinal: \"") +
                 pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") +
                 pynini.accep(" ") + pynutil.delete("integer: \"") + cardinal +
                 pynutil.delete("\"")).optimize()

        graph |= (pynutil.delete("default_cardinal: \"default\" integer: \"") +
                  cardinal + pynutil.delete("\"")).optimize()

        graph |= (pynutil.delete("default_ordinal: \"default\" integer: \"") +
                  ordinal + pynutil.delete("\"")).optimize()

        graph |= (pynutil.delete("key_the_ordinal: \"") +
                  pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") +
                  pynini.accep(" ") + pynutil.delete("integer: \"") +
                  pynini.closure(pynutil.insert("the "), 0, 1) + ordinal +
                  pynutil.delete("\"")).optimize()

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Example #18
    def __init__(self, tn_time: GraphFst, deterministic: bool = True):
        super().__init__(name="time",
                         kind="classify",
                         deterministic=deterministic)

        tn_time_tagger = tn_time.graph_preserve_order
        tn_time_verbalizer = TNTimeVerbalizer().graph
        tn_time_graph_preserve_order = pynini.compose(
            tn_time_tagger, tn_time_verbalizer).optimize()
        graph_preserve_order = pynini.invert(
            tn_time_graph_preserve_order).optimize()
        graph_preserve_order = pynutil.insert(
            "hours: \"") + graph_preserve_order + pynutil.insert("\"")

        # "пятнадцать минут шестого" -> 17:15
        # Requires permutations for the correct verbalization
        m_next_h = (pynutil.insert("minutes: \"") +
                    pynini.invert(tn_time.minutes).optimize() +
                    pynutil.insert("\"") + pynini.accep(NEMO_SPACE) +
                    pynutil.insert("hours: \"") +
                    pynini.invert(tn_time.increment_hour_ordinal).optimize() +
                    pynutil.insert("\"")).optimize()

        # "без пятнадцати минут шесть" -> 17:45
        # Requires permutation for the correct verbalization
        m_to_h = (pynini.cross("без ", "minutes: \"") +
                  pynini.invert(tn_time.mins_to_h) + pynutil.insert("\"") +
                  pynini.accep(NEMO_SPACE) + pynutil.insert("hours: \"") +
                  pynini.invert(tn_time.increment_hour_cardinal).optimize() +
                  pynutil.insert("\""))

        graph_reverse_order = m_next_h | m_to_h
        graph = graph_preserve_order | graph_reverse_order
        graph = self.add_tokens(graph)
        self.fst = graph.optimize()
Example #19
    def __init__(self, cardinal, deterministic: bool = True):
        super().__init__(name="fraction",
                         kind="classify",
                         deterministic=deterministic)
        cardinal_graph = cardinal.graph

        integer = pynutil.insert(
            "integer_part: \"") + cardinal_graph + pynutil.insert("\"")
        numerator = (pynutil.insert("numerator: \"") + cardinal_graph +
                     (pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")))

        endings = ["rd", "th", "st", "nd"]
        endings += [x.upper() for x in endings]
        optional_end = pynini.closure(pynini.cross(pynini.union(*endings), ""),
                                      0, 1)

        denominator = pynutil.insert(
            "denominator: \""
        ) + cardinal_graph + optional_end + pynutil.insert("\"")

        graph = pynini.closure(integer + pynini.accep(" "), 0,
                               1) + (numerator + denominator)
        graph |= pynini.closure(
            integer +
            (pynini.accep(" ") | pynutil.insert(" ")), 0, 1) + pynini.compose(
                pynini.string_file(get_abs_path("data/number/fraction.tsv")),
                (numerator + denominator))

        self.graph = graph
        final_graph = self.add_tokens(self.graph)
        self.fst = final_graph.optimize()
Example #20
    def __init__(self, input_case: str, deterministic: bool = True):
        super().__init__(name="whitelist",
                         kind="classify",
                         deterministic=deterministic)

        def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
            whitelist = load_labels(get_abs_path(file))
            if input_case == "lower_cased":
                whitelist = [[x[0].lower()] + x[1:] for x in whitelist]
            else:
                whitelist = [[x[0].lower()] + x[1:] for x in whitelist]
            graph = pynini.string_map(whitelist)
            return graph

        graph = _get_whitelist_graph(input_case)

        units_graph = _get_whitelist_graph(input_case,
                                           file="data/measurements.tsv")
        # do not replace single letter units, like `м` or `°`
        units_graph = pynini.compose(
            pynini.difference(pynini.project(units_graph, "input"),
                              NEMO_ALPHA), units_graph)
        graph |= units_graph.optimize()
        graph |= TO_LATIN + pynini.closure(pynutil.insert(" ") + TO_LATIN)

        self.final_graph = convert_space(graph)
        self.fst = (pynutil.insert("name: \"") + self.final_graph +
                    pynutil.insert("\"")).optimize()
Example #21
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _get_whitelist_graph(input_case, file):
            whitelist = load_labels(file)
            if input_case == "lower_cased":
                whitelist = [[x[0].lower()] + x[1:] for x in whitelist]
            else:
                whitelist = [[x[0].lower()] + x[1:] for x in whitelist]
            graph = pynini.string_map(whitelist)
            return graph

        graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist.tsv"))

        if input_file:
            graph = _get_whitelist_graph(input_case, input_file)

        units_graph = _get_whitelist_graph(input_case, file=get_abs_path("data/measurements.tsv"))
        # do not replace single-letter Cyrillic units like `м`; single non-Cyrillic characters such as `°` or `%` are still replaced
        units_graph = pynini.compose((NEMO_CHAR ** (2, ...) | pynini.difference(NEMO_CHAR, RU_ALPHA)), units_graph)
        graph |= units_graph.optimize()
        graph |= TO_CYRILLIC + pynini.closure(pynutil.insert(" ") + TO_CYRILLIC)

        self.final_graph = convert_space(graph)
        self.fst = (pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")).optimize()
Example #22
    def __init__(self, deterministic: bool = True):
        super().__init__(name="ordinal",
                         kind="verbalize",
                         deterministic=deterministic)
        graph_digit = pynini.string_file(
            get_abs_path("data/ordinals/digit.tsv")).invert()
        graph_ties = pynini.string_file(
            get_abs_path("data/ordinals/ties.tsv")).invert()
        graph_thousands = pynini.string_file(
            get_abs_path("data/ordinals/thousands.tsv")).invert()

        graph = pynutil.delete("integer: \"") + pynini.closure(
            NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

        suffixes = pynini.union("ten", "tem", "ter", "tes", "te")
        convert_rest = pynutil.insert(suffixes, weight=0.01)
        self.ordinal_stem = graph_digit | graph_ties | graph_thousands

        suffix = pynini.cdrewrite(
            pynini.closure(self.ordinal_stem, 0, 1) + convert_rest,
            "",
            "[EOS]",
            NEMO_SIGMA,
        ).optimize()
        self.graph = pynini.compose(graph, suffix)
        self.suffix = suffix
        delete_tokens = self.delete_tokens(self.graph)
        self.fst = delete_tokens.optimize()
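Example #23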
    def __init__(self, deterministic: bool = True):
        super().__init__(name="ordinal",
                         kind="verbalize",
                         deterministic=deterministic)

        graph_digit = pynini.string_file(
            get_abs_path("data/ordinals/digit.tsv")).invert()
        graph_teens = pynini.string_file(
            get_abs_path("data/ordinals/teen.tsv")).invert()

        graph = (pynutil.delete("integer:") + delete_space +
                 pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) +
                 pynutil.delete("\""))
        convert_rest = pynutil.insert("th", weight=0.01)

        suffix = pynini.cdrewrite(
            graph_digit | graph_teens
            | pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
            | convert_rest,
            "",
            "[EOS]",
            NEMO_SIGMA,
        ).optimize()
        self.graph = pynini.compose(graph, suffix)
        self.suffix = suffix
        delete_tokens = self.delete_tokens(self.graph)
        self.fst = delete_tokens.optimize()
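A tiny sketch of the "[EOS]"-anchored suffix insertion that convert_rest relies on in both verbalizers above:

import pynini
from pynini.lib import byte, pynutil

sigma_star = pynini.closure(byte.BYTE)
# Append "th" only at the end of the string.
add_th = pynini.cdrewrite(pynutil.insert("th"), "", "[EOS]", sigma_star)
best = pynini.shortestpath(pynini.compose("seven", add_th))
print(best.project("output").rmepsilon().string())  # "seventh"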
Example #24
def parallelInversion(transducersAndOutputs, alphabet=None):
    try:
        a = [
            sandwich.compose(y, sandwich.invert(t)).project(True)
            for y, t in transducersAndOutputs
        ]
        a = reduce(sandwich.intersect, a)
        if alphabet is not None:
            lm = sandwich.union(*alphabet).closure()
            a = a * lm
        a.topsort()
        for s in a.states():
            iterator = a.mutable_arcs(s)
            while not iterator.done():
                value = iterator.value()
                #print value.olabel,value.ilabel,value.weight
                assert value.olabel == value.ilabel
                if value.olabel != 0:
                    value.weight = 1
                    iterator.set_value(value)
                iterator.next()
        return sandwich.shortestpath(a).stringify()
    except:
        # print "Got an exception in parallel inversion..."
        # for y,t in transducersAndOutputs:
        #     print "inverting:"
        #     t = invert(t)
        #     print t
        #     print "composing:"
        #     t = compose(y,t)
        #     print t
        #     print "projecting:"
        #     t = project(True)
        #     print t
        return None
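A plain-pynini toy of the invert/compose/project/intersect recipe above (the sandwich module is assumed to behave like pynini): two transducers share the input "x", and intersecting the preimages of their observed outputs recovers it.

import pynini

t1 = pynini.cross("x", "ab")
t2 = pynini.cross("x", "b")
pre1 = pynini.compose("ab", pynini.invert(t1)).project("output").optimize()
pre2 = pynini.compose("b", pynini.invert(t2)).project("output").optimize()
candidates = pynini.intersect(pre1, pre2)
print(pynini.shortestpath(candidates).string())  # "x"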
Example #25
    def __init__(self,
                 whitelist: 'pynini.FstLike',
                 deterministic: bool = True):
        super().__init__(name="abbreviation",
                         kind="classify",
                         deterministic=deterministic)

        main_graph = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)
        misc_graph = pynutil.add_weight(
            TO_LOWER +
            pynini.closure(insert_space + pynini.union(TO_LOWER | NEMO_LOWER)),
            110)
        misc_graph |= pynutil.add_weight(
            pynini.closure(NEMO_UPPER, 2) +
            pynini.closure(insert_space + NEMO_LOWER, 1), 110)
        misc_graph |= (
            NEMO_UPPER + pynutil.delete(".") +
            pynini.closure(insert_space + NEMO_UPPER + pynutil.delete(".")))
        misc_graph |= pynutil.add_weight(
            TO_LOWER + pynutil.delete(".") +
            pynini.closure(insert_space + TO_LOWER + pynutil.delete(".")), 110)

        # set the weight of the misc graph to a value higher than the word graph's weight
        graph = pynutil.add_weight(main_graph.optimize(),
                                   10) | pynutil.add_weight(
                                       misc_graph.optimize(), 101)

        # exclude words that are included in the whitelist
        graph = pynini.compose(
            pynini.difference(pynini.project(graph, "input"),
                              pynini.project(whitelist.graph, "input")), graph)
        graph = pynutil.insert(
            "value: \"") + graph.optimize() + pynutil.insert("\"")
        graph = self.add_tokens(graph)
        self.fst = graph.optimize()
Example #26
def process_window(input_str,
                   window_fst,
                   model,
                   pruning_weight=5,
                   rejection_weight=1.5):
    '''
    Compose a window input automaton with the model.
    '''
    t1 = time.time()
    window_fst.relabel_tables(new_isymbols=model[0].output_symbols(),
                              new_osymbols=model[0].output_symbols())
    for fst in model:
        window_fst = pynini.compose(window_fst, fst)
        window_fst.project(project_output=True)
        window_fst.prune(weight=pruning_weight)
        window_fst.optimize()
    t3 = time.time()
    logging.debug('- composition: {}s'.format(t3 - t1))
    # also allow identity for windows of length 1
    # (with weight `rejection_weight`)
    if ' ' not in input_str:
        # The formula:
        #    rejection_weight*(len(input_str)+2)
        # means that rejection_weight*2 is the initial cost of having an OOV
        # word (which then becomes more expensive with increasing length).
        # While discovered by accident, this turned out to work well as
        # a very naive OOV word model.
        window_fst.union(
            pynini.acceptor(escape_for_pynini(input_str),
                            weight=rejection_weight * (len(input_str) + 2)))
    t2 = time.time()
    logging.debug('Total processing time: {}s'.format(t2 - t1))
    return window_fst
Example #27
def decode_lattice(lattice: pynini.Fst, lm: pynini.Fst,
                   sym: pynini.SymbolTable) -> str:
    """Decodes the lattice."""
    lattice = pynini.compose(lattice, lm)
    assert lattice.start() != pynini.NO_STATE_ID, "composition failure"
    # Pynini can join the string for us.
    return pynini.shortestpath(lattice).rmepsilon().string(sym)
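A byte-level toy of the decoding step; decode_lattice above expects FSTs over a shared SymbolTable, so this sketch only shows the compose/shortestpath pattern with string acceptors.

import pynini
from pynini.lib import pynutil

# Toy lattice with two weighted hypotheses and a "language model" accepting only "cat".
lattice = pynutil.add_weight(pynini.accep("cat"), 1.0) | pynutil.add_weight(pynini.accep("cab"), 0.5)
lm = pynini.accep("cat")
decoded = pynini.compose(lattice, lm)
assert decoded.start() != pynini.NO_STATE_ID, "composition failure"
print(pynini.shortestpath(decoded).rmepsilon().string())  # "cat"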
Example #28
    def get_serial_graph(self):
        """
        Finite state transducer for classifying serial.
            The serial is a combination of digits, letters and dashes, e.g.:
            c325-b -> tokens { cardinal { integer: "си три два пять би" } }
        """
        num_graph = self.single_digits_graph

        alpha = TO_CYRILLIC | RU_ALPHA

        delimiter = insert_space | pynini.cross("-", " ") | pynini.cross(
            "/", " ")
        letter_num = pynini.closure(alpha + delimiter, 1) + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alpha
        num_delimiter_num = pynini.closure(num_graph + delimiter,
                                           1) + num_graph
        next_alpha_or_num = pynini.closure(delimiter + (alpha | num_graph))
        serial_graph = (letter_num | num_letter
                        | num_delimiter_num) + next_alpha_or_num

        # at least 1 letter and 1 digit are present
        at_least_one_alpha_num = (
            NEMO_SIGMA + (RU_ALPHA | pynini.project(TO_CYRILLIC, "input")) +
            NEMO_SIGMA + NEMO_DIGIT + NEMO_SIGMA) | (
                NEMO_SIGMA + NEMO_DIGIT + NEMO_SIGMA +
                (RU_ALPHA | pynini.project(TO_CYRILLIC, "input")) + NEMO_SIGMA)
        serial_graph = pynini.compose(at_least_one_alpha_num,
                                      serial_graph.optimize()).optimize()
        # numbers only with 2+ delimiters
        serial_graph |= (num_graph + delimiter + num_graph + delimiter +
                         num_graph +
                         pynini.closure(delimiter + num_graph)).optimize()
        return serial_graph.optimize()
Example #29
 def __construct_compound_stems_nn(self, tmp):
     '''
 Default noun compounding stems
 '''
     return pynini.concat(
         pynini.transducer("",
                           "<Kompos_Stems>",
                           output_token_type=self.__syms.alphabet),
         pynini.compose(
             pynini.concat(
                 self.__syms.characters.closure(1),
                 pynini.union(
                     pynini.transducer(
                         "",
                         pynini.concat(
                             pynini.acceptor(
                                 "<+NN>", token_type=self.__syms.alphabet),
                             self.__syms.gender,
                             pynini.acceptor(
                                 "<Nom> <Sg>",
                                 token_type=self.__syms.alphabet))),
                     pynini.transducer(
                         "",
                         pynini.concat(
                             pynini.acceptor(
                                 "<+NN>", token_type=self.__syms.alphabet),
                             self.__syms.gender,
                             pynini.acceptor(
                                 "<Nom> <Pl>",
                                 token_type=self.__syms.alphabet))))), tmp),
         pynini.acceptor("<NN>", token_type=self.__syms.alphabet),
         pynini.transducer(
             "", "<kompos> <nativ>",
             output_token_type=self.__syms.alphabet)).optimize()
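Example #30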
    def parse(self):
        fsa = fsa_from_list_of_symbols(self.input, self.fst.mutable_input_symbols())
        intersection = compose(fsa, self.fst)
        self._best = shortestpath(intersection)

        self._best.topsort()

        self._reverse_polish_rules = retrieve_rules(self._best)
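Example #31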
def get_paths(decode_graph, isymbs, osymbs, phoneme_list):
    phoneme_fst = pynini.acceptor(" ".join(phoneme_list), token_type = isymbs)
    return [path for path in pynini.compose(phoneme_fst, decode_graph).paths(input_token_type=isymbs, output_token_type=osymbs)]