Beispiel #1
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the ordinal verbalizer.

        The resulting FST removes the `integer: "..."` field wrapper, keeps
        the quoted content, and rewrites the end of that content using the
        ordinal ending rule. The rule itself is also stored on `self.suffix`.

        Args:
            deterministic: forwarded unchanged to the base-class constructor
        """
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        # Ending tables are loaded from the data files and inverted.
        digit_endings = pynini.invert(pynini.string_file(get_abs_path("data/ordinals/digit.tsv")))
        teen_endings = pynini.invert(pynini.string_file(get_abs_path("data/ordinals/teen.tsv")))

        # Alternatives applied at the end of the string ("[EOS]" right context);
        # the bare "th" insertion carries the larger of the two explicit weights.
        tens_ending = pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
        default_ending = pynutil.insert("th", weight=0.01)
        ending_rule = digit_endings | teen_endings | tens_ending | default_ending
        suffix = pynini.cdrewrite(ending_rule, "", "[EOS]", NEMO_SIGMA).optimize()

        # integer: "<content>"  ->  <content>
        drop_quote = pynutil.delete("\"")
        quoted_integer = (
            pynutil.delete("integer:") + delete_space + drop_quote + pynini.closure(NEMO_NOT_QUOTE, 1) + drop_quote
        )

        self.suffix = suffix
        self.fst = self.delete_tokens(quoted_integer @ suffix).optimize()
Beispiel #2
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the cardinal classifier.

        Populates `self.graph`, `self.single_digits_graph`,
        `self.graph_hundred_component_at_least_one_none_zero_digit`, and
        (only when `deterministic` is False) `self.range_graph`, then builds
        `self.fst`, which emits an optional `negative: "true"` field followed
        by an `integer: "..."` field wrapped by `add_tokens`.

        Args:
            deterministic: when False, additional alternatives (digit-by-digit
                readings, the hundreds graph, and ranges) are added
        """
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        number_names = pynini.Far(get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

        # Two or three digits, or a single digit other than "0".
        single_non_zero = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3) | single_non_zero
        ) @ number_names

        # One to three digits followed by any number of three-digit groups,
        # each optionally preceded by a comma that is deleted.
        three_digit_group = pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
        self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(three_digit_group)) @ number_names

        digit_table = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        zero_table = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        table_reading = pynutil.add_weight(pynini.invert(digit_table | zero_table), 1.2)
        oh_reading = pynutil.add_weight(pynini.cross("0", "oh"), 1.1)
        single_digits_graph = table_reading | oh_reading

        # A sequence of single-digit readings joined by inserted spaces.
        self.single_digits_graph = single_digits_graph + pynini.closure(
            pynutil.insert(" ") + single_digits_graph
        )

        if not deterministic:
            # Digit-by-digit reading of comma-grouped input: a leading part
            # followed by one or more ",ddd" groups.
            leading_part = pynini.closure(self.single_digits_graph + pynutil.insert(" "), 1, 3)
            comma_group = (
                pynutil.delete(",")
                + single_digits_graph
                + pynutil.insert(" ")
                + single_digits_graph
                + pynutil.insert(" ")
                + single_digits_graph
            )
            single_digits_graph_with_commas = leading_part + pynini.closure(comma_group, 1)

            self.graph |= self.single_digits_graph | get_hundreds_graph() | single_digits_graph_with_commas

            # "a-b" ranges and "a x b" dimensions.
            dash = pynini.cross("-", " to ") | pynini.cross("-", " ")
            times = pynini.cross("x", " by ") | pynini.cross(" x ", " by ")
            self.range_graph = pynini.closure(pynutil.insert("from "), 0, 1) + self.graph + dash + self.graph
            self.range_graph |= self.graph + times + self.graph
            self.range_graph = self.range_graph.optimize()

        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
        )

        # The serial graph is added with an extra weight of 1.2.
        final_graph = self.graph | pynutil.add_weight(self.get_serial_graph(), 1.2)
        if not deterministic:
            final_graph |= self.range_graph

        final_graph = optional_minus_graph + pynutil.insert("integer: \"") + final_graph + pynutil.insert("\"")
        self.fst = self.add_tokens(final_graph).optimize()
Beispiel #3
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the electronic verbalizer.

        The FST consumes an optional `protocol: "..."` field, an optional
        `username: "..."` field (followed by an inserted "at "), and a
        `domain: "..."` field, and strips the surrounding token wrapper via
        `delete_tokens`.

        Args:
            deterministic: when False, "0" may additionally be read as "o"
                or "oh"
        """
        super().__init__(name="electronic",
                         kind="verbalize",
                         deterministic=deterministic)
        graph_digit_no_zero = pynini.invert(
            pynini.string_file(
                get_abs_path("data/numbers/digit.tsv"))).optimize()
        graph_zero = pynini.cross("0", "zero")

        if not deterministic:
            graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")

        graph_digit = graph_digit_no_zero | graph_zero
        graph_symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).optimize()

        # username: "<chars>"  ->  each char followed by a space; digit and
        # symbol readings are weighted slightly below the pass-through char.
        user_name = (
            pynutil.delete("username:") + delete_space + pynutil.delete("\"") +
            (pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(
                    pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
            pynutil.delete("\""))

        server_common = pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv"))
        domain_common = pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))

        # First pass: split the domain into space-separated pieces, with a
        # negative weight on entries from the server/domain tables.
        convert_defaults = (NEMO_NOT_QUOTE
                            | pynutil.add_weight(domain_common, -0.1)
                            | pynutil.add_weight(server_common, -0.1))
        domain = convert_defaults + pynini.closure(
            pynutil.insert(" ") + convert_defaults)
        # Second pass: apply symbol and digit readings to the first-pass
        # output, again with a negative weight over the pass-through char.
        domain = pynini.compose(
            domain,
            pynini.closure(
                pynutil.add_weight(graph_symbols, -0.1)
                | pynutil.add_weight(graph_digit, -0.1) | NEMO_NOT_QUOTE),
        )

        domain = (pynutil.delete("domain:") + delete_space +
                  pynutil.delete("\"") + domain + delete_space +
                  pynutil.delete("\""))

        protocol = pynutil.delete("protocol: \"") + pynini.closure(
            NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        graph = (pynini.closure(protocol + delete_space, 0, 1) +
                 pynini.closure(
                     user_name + delete_space + pynutil.insert("at ") +
                     delete_space, 0, 1) + domain + delete_space)

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Beispiel #4
0
def get_names():
    """
    Returns the graph that matches the names listed in the male and female
    name files, each in its listed form and in an upper-cased form.
    """

    def _name_graph(relative_path):
        # Load the labels, append an upper-cased copy of each first column,
        # and compile the result into an optimized string map.
        labels = load_labels(get_abs_path(relative_path))
        upper_cased = [[row[0].upper()] for row in labels]
        labels.extend(upper_cased)
        return pynini.string_map(labels).optimize()

    names = _name_graph("data/roman/male.tsv")
    names |= _name_graph("data/roman/female.tsv")
    return names
Beispiel #5
0
    def get_address_graph(self, cardinal):
        """
        Finite state transducer for classifying addresses, e.g.:
            2788 San Tomas Expy, Santa Clara, CA 95051 ->
                units: "address" cardinal
                { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
                 preserve_order: true

        The graph is the concatenation of: a house number read digit by digit,
        an optional direction letter, the street part ending in an entry from
        the address-words table, an optional period, and optional city, state
        and five-digit zip code parts.

        Args:
            cardinal: cardinal tagger; its `single_digits_graph` is used here
                and it is passed on to `OrdinalTagger`

        Returns:
            the address FST (not wrapped in any field or token markers)
        """
        space = pynini.accep(NEMO_SPACE)
        drop_comma = pynini.cross(",", "")

        # Ordinal inside a street name: tagger output wrapped as an integer
        # field, then passed through the ordinal verbalizer.
        tagged_ordinal = (
            pynutil.insert("integer: \"") + OrdinalTagger(cardinal=cardinal).graph + pynutil.insert("\"")
        )
        ordinal_num = pynini.compose(tagged_ordinal, OrdinalVerbalizer().graph)

        # House number, read one digit at a time.
        address_num = pynini.closure(NEMO_DIGIT, 1) @ cardinal.single_digits_graph

        # Optional single-letter direction; the weight of -1 favours expanding it.
        compass = pynini.union(
            pynini.cross("E", "East"),
            pynini.cross("S", "South"),
            pynini.cross("W", "West"),
            pynini.cross("N", "North"),
        )
        direction = pynini.closure(pynutil.add_weight(space + compass, -1), 0, 1)

        # Street part: optional ordinal, free text, then a table entry.
        street_suffix = pynini.string_file(get_abs_path("data/address/address_words.tsv"))
        address_words = (
            space
            + pynini.closure(ordinal_num, 0, 1)
            + pynini.closure(NEMO_ALPHA | NEMO_SPACE, 1)
            + street_suffix
        )

        # Optional ", <city>".
        city_name = pynini.closure(NEMO_ALPHA | space, 1)
        city = pynini.closure(drop_comma + space + city_name, 0, 1)

        # Optional ", <state>" using the inverted states table.
        state_name = pynini.invert(pynini.string_file(get_abs_path("data/address/states.tsv")))
        state = pynini.closure(drop_comma + space + state_name, 0, 1)

        # Optional five-digit zip code, strongly preferred via the -100 weight.
        five_digits = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
        zip_code = pynini.closure(
            pynutil.add_weight(pynini.closure(drop_comma, 0, 1) + space + five_digits, -100),
            0,
            1,
        )

        optional_period = pynini.closure(pynini.cross(".", ""), 0, 1)
        return address_num + direction + address_words + optional_period + city + state + zip_code
Beispiel #6
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the electronic verbalizer.

        The FST consumes an optional `protocol: "..."` field, an optional
        `username: "..."` field (followed by an inserted " at "), and a
        `domain: "..."` field; the result is passed through a rewrite that
        applies `delete_extra_space` and then through `delete_tokens`.

        Args:
            deterministic: when False, "0" may additionally be read as "o"
                or "oh"
        """
        super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)
        graph_digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
        graph_zero = pynini.cross("0", "zero")

        if not deterministic:
            graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")

        graph_digit = graph_digit_no_zero | graph_zero
        graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()

        # Rewrite every symbol or digit to its reading, padded with a space on
        # both sides; all other characters pass through unchanged.
        default_chars_symbols = pynini.cdrewrite(
            pynutil.insert(" ") + (graph_symbols | graph_digit) + pynutil.insert(" "), "", "", NEMO_SIGMA
        )

        # username: "<chars>"  ->  <chars with symbols/digits expanded>
        user_name = (
            pynutil.delete("username:")
            + delete_space
            + pynutil.delete("\"")
            + default_chars_symbols
            + pynutil.delete("\"")
        )

        domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

        # Domain body, then either a table entry or a weighted "." -> "dot"
        # (combined via priority union), then an optional tail that is passed
        # through TO_UPPER before symbol/digit expansion.
        domain = (
            default_chars_symbols
            + insert_space
            + plurals._priority_union(
                domain_common, pynutil.add_weight(pynini.cross(".", "dot"), weight=0.0001), NEMO_SIGMA
            )
            + pynini.closure(
                insert_space + (pynini.cdrewrite(TO_UPPER, "", "", NEMO_SIGMA) @ default_chars_symbols), 0, 1
            )
        )
        domain = (
            pynutil.delete("domain:")
            + delete_space
            + pynutil.delete("\"")
            + domain
            + delete_space
            + pynutil.delete("\"")
        ).optimize()

        protocol = pynutil.delete("protocol: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        # The padding inserted above can produce runs of spaces; the final
        # rewrite applies delete_extra_space everywhere in the output.
        graph = (
            pynini.closure(protocol + delete_space, 0, 1)
            + pynini.closure(user_name + delete_space + pynutil.insert(" at ") + delete_space, 0, 1)
            + domain
            + delete_space
        ).optimize() @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA)

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Beispiel #7
0
 def _get_whitelist_non_deterministic_graph(
         file="data/whitelist_alternatives.tsv"):
     """
     Build a string map from the two-column label file at `file` (resolved
     with `get_abs_path`), containing every pair both fully lower-cased and
     in its original casing.
     """
     entries = load_labels(get_abs_path(file))
     pairs = [(written.lower(), spoken.lower()) for written, spoken in entries]
     pairs += [(written, spoken) for written, spoken in entries]
     return pynini.string_map(pairs)
Beispiel #8
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the punctuation classifier.

        `self.punct_marks` lists every character whose Unicode category starts
        with "P" (except "[" and "]") plus the characters of a fixed extra
        string, minus anything found in the whitelist symbol file.
        `self.graph` accepts either a run of those marks or an angle-bracket
        tag, combined with a priority union; `self.fst` wraps it in a
        `name: "..."` field.

        Args:
            deterministic: forwarded unchanged to the base-class constructor
        """
        super().__init__(name="punctuation", kind="classify", deterministic=deterministic)

        extra_marks = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""
        excluded_marks = ["[", "]"]

        # Scan the code point range for punctuation-category characters.
        unicode_marks = []
        for code_point in range(sys.maxunicode):
            symbol = chr(code_point)
            if category(symbol).startswith("P") and symbol not in excluded_marks:
                unicode_marks.append(symbol)

        # Symbols handled by the whitelist are not treated as punctuation.
        whitelisted = [row[0] for row in load_labels(get_abs_path("data/whitelist/symbol.tsv"))]
        candidates = unicode_marks + list(extra_marks)
        self.punct_marks = [mark for mark in candidates if mark not in whitelisted]

        punct_run = pynini.closure(pynini.union(*self.punct_marks), 1)

        # "<name>", "<name/>" or "</name>": a non-empty run of non-space
        # characters other than "<" and ">" between angle brackets.
        tag_name = pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
        opening_or_self_closing = tag_name + pynini.closure(pynini.accep("/"), 0, 1)
        closing = pynini.accep("/") + tag_name
        emphasis = pynini.accep("<") + (opening_or_self_closing | closing) + pynini.accep(">")

        self.graph = plurals._priority_union(emphasis, punct_run, NEMO_SIGMA)
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
Beispiel #9
0
    def get_serial_graph(self):
        """
        Finite state transducer for classifying serial numbers.
            A serial is a mix of letters and numbers joined by a delimiter
            (nothing, a dash or a slash, each rendered as a space), e.g.:
            c325-b -> tokens { cardinal { integer: "c three two five b" } }

        Returns:
            the serial FST with an added weight of 10
        """
        letters = NEMO_ALPHA

        if self.deterministic:
            numbers = self.single_digits_graph
        else:
            numbers = self.graph
            # Non-deterministic mode also allows the readings from the
            # letter pronunciation file.
            letters |= pynini.string_map(load_labels(get_abs_path("data/letter_pronunciation.tsv")))

        delimiter = insert_space | pynini.cross("-", " ") | pynini.cross("/", " ")

        # Three admissible openings, each containing at least one delimiter.
        letters_then_number = pynini.closure(letters + delimiter, 1) + numbers
        numbers_then_letter = pynini.closure(numbers + delimiter, 1) + letters
        numbers_then_number = pynini.closure(numbers + delimiter, 1) + numbers
        opening = letters_then_number | numbers_then_letter | numbers_then_number

        # Any number of further delimiter-separated letters or numbers.
        continuation = pynini.closure(delimiter + (letters | numbers))
        serial_graph = opening + continuation

        if not self.deterministic:
            # Optional trailing "s", kept as-is or rendered as "es".
            serial_graph += pynini.closure(pynini.accep("s") | pynini.cross("s", "es"), 0, 1)

        serial_graph.optimize()
        return pynutil.add_weight(serial_graph, 10)
Beispiel #10
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the electronic verbalizer.

        The FST consumes an optional `username: "..."` field (followed by an
        inserted "at ") and a `domain: "..."` field made of a server part and
        a domain part, and strips the surrounding token wrapper via
        `delete_tokens`.

        Args:
            deterministic: forwarded unchanged to the base-class constructor
        """
        super().__init__(name="electronic",
                         kind="verbalize",
                         deterministic=deterministic)
        graph_digit = pynini.invert(
            pynini.string_file(
                get_abs_path("data/numbers/digit.tsv"))).optimize()
        graph_symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).optimize()

        # username: "<chars>"  ->  each char followed by a space; digit and
        # symbol readings are weighted slightly below the pass-through char.
        user_name = (
            pynutil.delete("username:") + delete_space + pynutil.delete("\"") +
            (pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(
                    pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
            pynutil.delete("\""))

        # Fallback domain part: spaced characters, a "." read as "dot ", then
        # at least one more character.
        domain_default = (pynini.closure(NEMO_NOT_QUOTE + insert_space) +
                          pynini.cross(".", "dot ") + NEMO_NOT_QUOTE +
                          pynini.closure(insert_space + NEMO_NOT_QUOTE))

        # Fallback server part: spaced digits/letters, optional symbols, then
        # spaced digits/letters again.
        server_default = (pynini.closure(
            (graph_digit | NEMO_ALPHA) + insert_space, 1) +
                          pynini.closure(graph_symbols + insert_space) +
                          pynini.closure(
                              (graph_digit | NEMO_ALPHA) + insert_space, 1))
        server_common = pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")) + insert_space

        domain_common = pynini.cross(".", "dot ") + pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))

        # Table entries (weight 1.09) are preferred over the fallbacks (1.1).
        domain = (pynutil.delete("domain:") + delete_space +
                  pynutil.delete("\"") +
                  (pynutil.add_weight(server_common, 1.09)
                   | pynutil.add_weight(server_default, 1.1)) +
                  (pynutil.add_weight(domain_common, 1.09)
                   | pynutil.add_weight(domain_default, 1.1)) + delete_space +
                  pynutil.delete("\""))

        graph = (pynini.closure(
            user_name + delete_space + pynutil.insert("at ") + delete_space, 0,
            1) + domain + delete_space)

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Beispiel #11
0
 def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
     """
     Build a string map from the two-column label file at `file` (resolved
     with `get_abs_path`). When `input_case` is "lower_cased" the first
     column is lower-cased; otherwise pairs are used as loaded.
     """
     entries = load_labels(get_abs_path(file))
     lower_input = input_case == "lower_cased"
     pairs = [
         (written.lower() if lower_input else written, spoken)
         for written, spoken in entries
     ]
     return pynini.string_map(pairs)
Beispiel #12
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the telephone classifier.

        The FST accepts an optional country code, an area part, a seven
        character number part (digits or letters, optionally separated by
        "-" or " "), and an optional extension. A second alternative accepts
        four dot-separated groups of one to three digits and emits them as a
        `number_part` field.

        Args:
            deterministic: forwarded unchanged to the base-class constructor
        """
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        add_separator = pynutil.insert(", ")  # between components
        digit_names = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
        digit = digit_names | pynini.cross("0", "o")
        spaced_digit = digit + insert_space

        # Optional "+", then one to three digits.
        country_code = (
            pynutil.insert("country_code: \"")
            + pynini.closure(pynutil.delete("+"), 0, 1)
            + pynini.closure(spaced_digit, 0, 2)
            + digit
            + pynutil.insert("\"")
        )
        optional_country_code = pynini.closure(
            country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space, 0, 1
        )

        # Three digits read one by one, or "800" read as a whole (preferred
        # through its negative weight).
        area_part_default = pynini.closure(spaced_digit, 2, 2) + digit
        area_part_common = pynutil.add_weight(pynini.cross("800", "eight hundred"), -1.1)
        area_code = area_part_default | area_part_common

        # "ddd-", "(ddd) " or "(ddd)-", followed by the component separator.
        dashed_area = area_code + pynutil.delete("-")
        closing_paren = pynutil.delete(") ") | pynutil.delete(")-")
        parenthesized_area = pynutil.delete("(") + area_code + closing_paren
        area_part = (dashed_area | parenthesized_area) + add_separator

        # Exactly seven digits/letters, each optionally followed by a separator.
        del_separator = pynini.closure(pynini.union("-", " "), 0, 1)
        number_length = ((NEMO_DIGIT + del_separator) | (NEMO_ALPHA + del_separator)) ** 7

        digit_word = (NEMO_DIGIT @ digit) + (insert_space | pynini.cross("-", ', '))
        letter_with_dash = NEMO_ALPHA + pynini.cross("-", ' ')
        number_words = pynini.closure(digit_word | NEMO_ALPHA | letter_with_dash)
        number_words = pynini.compose(number_length, number_words)

        number_part = pynutil.insert("number_part: \"") + (area_part + number_words) + pynutil.insert("\"")

        # One to four digits.
        extension = (
            pynutil.insert("extension: \"")
            + pynini.closure(spaced_digit, 0, 3)
            + digit
            + pynutil.insert("\"")
        )
        optional_extension = pynini.closure(insert_space + extension, 0, 1)

        graph = optional_country_code + number_part + optional_extension

        # ip
        digit_to_str_graph = pynini.compose(
            NEMO_DIGIT ** (1, 3), digit + pynini.closure(pynutil.insert(" ") + digit)
        ).optimize()
        ip_graph = digit_to_str_graph + (pynini.cross(".", " dot ") + digit_to_str_graph) ** 3
        graph |= pynutil.insert("number_part: \"") + ip_graph.optimize() + pynutil.insert("\"")

        self.fst = self.add_tokens(graph).optimize()
Beispiel #13
0
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        """
        Assemble the decimal classifier.

        Sets `self.graph` (the reading of the fractional digits),
        `self.final_graph_wo_negative` (the unsigned decimal, optionally with
        a quantity via `get_quantity`), and `self.fst` (optional
        `negative: "true"` field plus the unsigned decimal, wrapped by
        `add_tokens`).

        Args:
            cardinal: cardinal tagger providing `graph` and
                `graph_hundred_component_at_least_one_none_zero_digit`
            deterministic: when False, the fractional part may also be read
                with the cardinal graph
        """
        super().__init__(name="decimal", kind="classify", deterministic=deterministic)

        cardinal_graph = cardinal.graph
        hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        # Digit and zero tables combined; "o" is an extra alternative for "0"
        # that is only allowed inside sequences of two or more digits.
        spoken_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        spoken_digit |= pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        digit_or_o = spoken_digit | pynini.cross("o", "0")
        digit_sequence = digit_or_o + pynini.closure(delete_space + digit_or_o, 1)
        graph_decimal = pynini.cross("zero", "0") | spoken_digit | digit_sequence

        self.graph = pynini.invert(graph_decimal).optimize()
        if not deterministic:
            self.graph = self.graph | cardinal_graph

        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
        )

        graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
        graph_fractional = pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"")

        # [integer_part] "." fractional_part -- the "." itself is deleted.
        optional_integer = pynini.closure(graph_integer + pynutil.insert(" "), 0, 1)
        final_graph_wo_sign = optional_integer + pynutil.delete(".") + pynutil.insert(" ") + graph_fractional

        self.final_graph_wo_negative = final_graph_wo_sign | get_quantity(final_graph_wo_sign, hundred_component)

        signed_graph = optional_graph_negative + self.final_graph_wo_negative
        self.fst = self.add_tokens(signed_graph).optimize()
Beispiel #14
0
    def __init__(self,
                 input_case: str,
                 deterministic: bool = True,
                 input_file: str = None):
        """
        Assemble the whitelist classifier.

        `self.graph` maps whitelist entries to their replacements (after
        `convert_space`); `self.fst` wraps it in a `name: "..."` field.

        Args:
            input_case: when "lower_cased", the left-hand side of each
                whitelist pair is lower-cased
            deterministic: when False, the lower-cased default whitelist and
                the alternatives file are added, and a provided `input_file`
                extends the graph instead of replacing it
            input_file: optional path to a user whitelist, loaded as given
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _load_pairs_graph(case, path):
            # String map over the pairs in `path`, lower-casing the left-hand
            # side when requested.
            pairs = load_labels(path)
            if case == "lower_cased":
                pairs = [(written.lower(), spoken) for written, spoken in pairs]
            else:
                pairs = [(written, spoken) for written, spoken in pairs]
            return pynini.string_map(pairs)

        def _load_alternatives_graph(file="data/whitelist_alternatives.tsv"):
            # String map over the alternatives, both fully lower-cased and in
            # their original casing.
            pairs = load_labels(get_abs_path(file))
            lowered = [(written.lower(), spoken.lower()) for written, spoken in pairs]
            as_loaded = [(written, spoken) for written, spoken in pairs]
            return pynini.string_map(lowered + as_loaded)

        graph = _load_pairs_graph(input_case, get_abs_path("data/whitelist.tsv"))
        if not deterministic:
            lower_cased_default = _load_pairs_graph("lower_cased", get_abs_path("data/whitelist.tsv"))
            graph |= lower_cased_default | _load_alternatives_graph()

        if input_file:
            whitelist_provided = _load_pairs_graph(input_case, input_file)
            if deterministic:
                # A user-supplied whitelist replaces the default one.
                graph = whitelist_provided
            else:
                graph |= whitelist_provided

        self.graph = convert_space(graph).optimize()
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
Beispiel #15
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """
        Assemble the money classifier.

        The FST emits a `currency: "..."` field followed by either an
        `integer_part: "..."` field or the decimal graph. The singular
        currency form is used when the amount is exactly "1"; the plural form
        (via `SINGULAR_TO_PLURAL`) otherwise.

        Args:
            cardinal: cardinal tagger providing `graph`
            decimal: decimal tagger providing `final_graph_wo_negative`
            deterministic: when False, integers may also be read with
                `get_hundreds_graph`, and decimals may also take the singular
                currency form
        """
        super().__init__(name="money", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph
        graph_decimal_final = decimal.final_graph_wo_negative

        currency_table = pynini.string_file(get_abs_path("data/currency/currency.tsv"))
        unit_plural = convert_space(currency_table @ SINGULAR_TO_PLURAL)
        unit_singular = convert_space(currency_table)

        open_currency = pynutil.insert("currency: \"")
        close_quote = pynutil.insert("\"")
        graph_unit_singular = open_currency + unit_singular + close_quote
        graph_unit_plural = open_currency + unit_plural + close_quote

        # Exactly "1" -> singular currency with integer part "one".
        singular_graph = (
            graph_unit_singular + pynutil.insert(" integer_part: \"") + pynini.cross("1", "one") + close_quote
        )

        graph_decimal = graph_unit_plural + insert_space + graph_decimal_final

        # Readings available for integer amounts other than "1".
        if deterministic:
            integer_readings = cardinal_graph
        else:
            integer_readings = get_hundreds_graph(deterministic) | cardinal_graph
            graph_decimal |= singular_graph + insert_space + graph_decimal_final

        graph_integer = (
            graph_unit_plural
            + pynutil.insert(" integer_part: \"")
            + ((NEMO_SIGMA - "1") @ integer_readings)
            + close_quote
        )
        graph_integer |= singular_graph

        self.fst = self.add_tokens(graph_integer | graph_decimal).optimize()
Beispiel #16
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the telephone classifier.

        The FST accepts an optional "+"-prefixed country code, a number of
        the form ddd-ddd-dddd or (ddd)[-][ ]ddd-dddd read digit by digit,
        and an optional "-"-prefixed extension of one to four digits.

        Args:
            deterministic: forwarded unchanged to the base-class constructor
        """
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        add_separator = pynutil.insert(", ")  # between components
        digit_names = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
        digit = digit_names | pynini.cross("0", "o")
        spaced_digit = digit + insert_space

        # "+" followed by one to three digits.
        country_code = (
            pynutil.insert("country_code: \"")
            + pynutil.delete("+")
            + pynini.closure(spaced_digit, 0, 2)
            + digit
            + pynutil.insert("\"")
        )
        optional_country_code = pynini.closure(
            country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space, 0, 1
        )

        three_digits = pynini.closure(spaced_digit, 2, 2) + digit
        four_digits = pynini.closure(spaced_digit, 3, 3) + digit

        # Area code written either as "ddd-" or as "(ddd)" with an optional
        # dash and optional whitespace after it.
        dashed_area = three_digits + pynutil.delete("-")
        parenthesized_area = (
            pynutil.delete("(")
            + three_digits
            + pynutil.delete(")")
            + pynini.closure(pynutil.delete("-"), 0, 1)
            + delete_space
        )
        number_part = (
            (dashed_area | parenthesized_area)
            + add_separator
            + three_digits
            + pynutil.delete("-")
            + add_separator
            + four_digits
        )
        number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"")

        # NOTE(review): the field name is emitted with a space before the
        # colon ("extension :") -- confirm the consumer expects this spelling.
        extension = (
            pynutil.insert("extension : \"")
            + pynini.closure(spaced_digit, 0, 3)
            + digit
            + pynutil.insert("\"")
        )
        optional_extension = pynini.closure(insert_space + pynutil.delete("-") + extension, 0, 1)

        graph = optional_country_code + number_part + optional_extension
        self.fst = self.add_tokens(graph).optimize()
Beispiel #17
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        super().__init__(name="money",
                         kind="classify",
                         deterministic=deterministic)
        cardinal_graph = cardinal.graph_with_and
        graph_decimal_final = decimal.final_graph_wo_negative_w_abbr

        maj_singular_labels = load_labels(
            get_abs_path("data/money/currency_major.tsv"))
        maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
        maj_unit_singular = convert_space(maj_singular)

        graph_maj_singular = pynutil.insert(
            "currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
        graph_maj_plural = pynutil.insert(
            "currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")

        optional_delete_fractional_zeros = pynini.closure(
            pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1)

        graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross(
            "1", "one") + pynutil.insert("\"")
        # only for decimals where third decimal after comma is non-zero or with quantity
        decimal_delete_last_zeros = (
            pynini.closure(NEMO_DIGIT | pynutil.delete(",")) +
            pynini.accep(".") + pynini.closure(NEMO_DIGIT, 2) +
            (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0")))
        decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA

        graph_decimal = (graph_maj_plural + insert_space +
                         (decimal_delete_last_zeros | decimal_with_quantity)
                         @ graph_decimal_final)

        graph_integer = (pynutil.insert("integer_part: \"") +
                         ((NEMO_SIGMA - "1") @ cardinal_graph) +
                         pynutil.insert("\""))

        graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
        graph_integer_only |= graph_maj_plural + insert_space + graph_integer

        final_graph = (graph_integer_only +
                       optional_delete_fractional_zeros) | graph_decimal

        # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
        # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
        # not accepted: 002, 00, 0,
        two_digits_fractional_part = (
            pynini.closure(NEMO_DIGIT) +
            (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0"))) @ (
                (pynutil.delete("0") + (NEMO_DIGIT - "0"))
                | ((NEMO_DIGIT - "0") + pynutil.insert("0"))
                | ((NEMO_DIGIT - "0") + NEMO_DIGIT))

        graph_min_singular = pynutil.insert(
            " currency_min: \"") + min_singular + pynutil.insert("\"")
        graph_min_plural = pynutil.insert(
            " currency_min: \"") + min_plural + pynutil.insert("\"")
        # format ** dollars ** cent
        decimal_graph_with_minor = None
        integer_graph_reordered = None
        decimal_default_reordered = None
        for curr_symbol, _ in maj_singular_labels:
            preserve_order = pynutil.insert(" preserve_order: true")
            integer_plus_maj = graph_integer + insert_space + pynutil.insert(
                curr_symbol) @ graph_maj_plural
            integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(
                curr_symbol) @ graph_maj_singular

            integer_plus_maj_with_comma = pynini.compose(
                NEMO_DIGIT - "0" +
                pynini.closure(NEMO_DIGIT | pynutil.delete(",")),
                integer_plus_maj)
            integer_plus_maj = pynini.compose(
                pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
            integer_plus_maj |= integer_plus_maj_with_comma

            graph_fractional_one = two_digits_fractional_part @ pynini.cross(
                "1", "one")
            graph_fractional_one = pynutil.insert(
                "fractional_part: \"") + graph_fractional_one + pynutil.insert(
                    "\"")
            graph_fractional = (two_digits_fractional_part @ (
                pynini.closure(NEMO_DIGIT, 1, 2) - "1"
            ) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit)
            graph_fractional = pynutil.insert(
                "fractional_part: \"") + graph_fractional + pynutil.insert(
                    "\"")

            fractional_plus_min = graph_fractional + insert_space + pynutil.insert(
                curr_symbol) @ graph_min_plural
            fractional_plus_min |= (
                graph_fractional_one + insert_space +
                pynutil.insert(curr_symbol) @ graph_min_singular)

            decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(
                ".", " ") + fractional_plus_min

            if not deterministic:
                decimal_graph_with_minor_curr |= pynutil.add_weight(
                    integer_plus_maj + pynini.cross(".", " ") +
                    pynutil.insert("fractional_part: \"") +
                    two_digits_fractional_part @ cardinal.
                    graph_hundred_component_at_least_one_none_zero_digit +
                    pynutil.insert("\""),
                    weight=0.0001,
                )
                default_fraction_graph = (
                    decimal_delete_last_zeros
                    | decimal_with_quantity) @ graph_decimal_final
            decimal_graph_with_minor_curr |= (
                pynini.closure(pynutil.delete("0"), 0, 1) +
                pynutil.delete(".") + fractional_plus_min)
            decimal_graph_with_minor_curr = (pynutil.delete(curr_symbol) +
                                             decimal_graph_with_minor_curr +
                                             preserve_order)

            decimal_graph_with_minor = (
                decimal_graph_with_minor_curr
                if decimal_graph_with_minor is None else pynini.union(
                    decimal_graph_with_minor,
                    decimal_graph_with_minor_curr).optimize())

            if not deterministic:
                integer_graph_reordered_curr = (pynutil.delete(curr_symbol) +
                                                integer_plus_maj +
                                                preserve_order).optimize()

                integer_graph_reordered = (
                    integer_graph_reordered_curr
                    if integer_graph_reordered is None else pynini.union(
                        integer_graph_reordered,
                        integer_graph_reordered_curr).optimize())
                decimal_default_reordered_curr = (
                    pynutil.delete(curr_symbol) + default_fraction_graph +
                    insert_space +
                    pynutil.insert(curr_symbol) @ graph_maj_plural)

                decimal_default_reordered = (
                    decimal_default_reordered_curr
                    if decimal_default_reordered is None else pynini.union(
                        decimal_default_reordered,
                        decimal_default_reordered_curr)).optimize()

        # weight for SH
        final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.0001)

        if not deterministic:
            final_graph |= integer_graph_reordered | decimal_default_reordered
            # to handle "$2.00" cases
            final_graph |= pynini.compose(
                NEMO_SIGMA + pynutil.delete(".") +
                pynini.closure(pynutil.delete("0"), 1),
                integer_graph_reordered)
        final_graph = self.add_tokens(final_graph.optimize())
        self.fst = final_graph.optimize()
Beispiel #18
0
    def __init__(self, deterministic: bool = True):
        """
        Build the cardinal classification FST and store it in ``self.fst``.

        Besides ``self.fst`` this sets ``self.graph``,
        ``self.graph_hundred_component_at_least_one_none_zero_digit`` and
        ``self.single_digits_graph``; ``self.range_graph`` is only set when
        ``deterministic`` is False.

        Args:
            deterministic: forwarded to the base class. When True the final
                graph is the cardinal, serial, long-number (5+ digits, read
                digit by digit) and leading-zero readings. When False, range,
                digit-by-digit, hundreds and comma-grouped digit readings are
                unioned in as well.
        """
        super().__init__(name="cardinal",
                         kind="classify",
                         deterministic=deterministic)
        # TODO replace to have "oh" as a default for "0"
        number_names = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

        # Two or three digits, or a single non-zero digit.
        two_or_three_digits = pynini.closure(NEMO_DIGIT, 2, 3)
        non_zero_digit = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            two_or_three_digits | non_zero_digit) @ number_names

        # One to three digits followed by any number of three-digit groups,
        # each optionally preceded by a comma that is dropped.
        optional_comma = pynini.closure(pynutil.delete(","), 0, 1)
        thousands_group = optional_comma + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
        self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) +
                      pynini.closure(thousands_group)) @ number_names

        digit_names = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        zero_names = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

        single_digit = pynini.invert(digit_names | zero_names)
        self.single_digits_graph = single_digit + pynini.closure(
            insert_space + single_digit)

        if not deterministic:
            # for a single token allow only the same normalization
            # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
            digit_zero = pynini.invert(digit_names | zero_names)
            digit_oh = pynini.invert(digit_names) | pynini.cross("0", "oh")

            zero_sequence = digit_zero + pynini.closure(
                insert_space + digit_zero)
            oh_sequence = digit_oh + pynini.closure(insert_space + digit_oh)
            self.single_digits_graph = zero_sequence | oh_sequence

            # A comma followed by exactly three digits read one by one.
            comma_group = (pynutil.delete(",") + single_digit + insert_space +
                           single_digit + insert_space + single_digit)
            single_digits_graph_with_commas = (
                pynini.closure(self.single_digits_graph + insert_space, 1, 3) +
                pynini.closure(comma_group, 1))

            from_prefix = pynutil.insert("from ")
            to_infix = pynini.cross("-", " to ")
            by_infix = pynini.cross("x", " by ") | pynini.cross(" x ", " by ")
            self.range_graph = (
                from_prefix + self.graph + to_infix + self.graph
                | self.graph + by_infix + self.graph
                | from_prefix + get_hundreds_graph() + to_infix +
                get_hundreds_graph()).optimize()

        # Called only after the graphs above are stored on self.
        serial_graph = self.get_serial_graph()
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        if deterministic:
            long_numbers = pynini.compose(NEMO_DIGIT**(5, ...),
                                          self.single_digits_graph).optimize()
            cardinal_with_leading_zeros = pynini.compose(
                pynini.accep("0") + pynini.closure(NEMO_DIGIT),
                self.single_digits_graph)
            final_graph = (self.graph
                           | serial_graph
                           | pynutil.add_weight(long_numbers, -0.001)
                           | cardinal_with_leading_zeros)
        else:
            # Leading zeros are read digit by digit, the remainder as a number.
            zero_prefix = pynini.compose(
                pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
            cardinal_with_leading_zeros = (
                zero_prefix + pynutil.insert(" ") +
                pynini.compose(pynini.closure(NEMO_DIGIT), self.graph))

            final_graph = (self.graph
                           | serial_graph
                           | self.range_graph
                           | self.single_digits_graph
                           | get_hundreds_graph()
                           | pynutil.add_weight(
                               single_digits_graph_with_commas, 0.001)
                           | cardinal_with_leading_zeros)

        tagged_graph = (optional_minus_graph + pynutil.insert("integer: \"") +
                        final_graph + pynutil.insert("\""))
        self.fst = self.add_tokens(tagged_graph).optimize()
Beispiel #19
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 fraction: GraphFst,
                 deterministic: bool = True):
        """
        Build the measure classification FST and store it in ``self.fst``.

        The final graph is the union of: decimal + unit, cardinal + unit
        (singular unit after "1", plural otherwise), number-dash-word and
        word-dash-number forms, decimal followed by "x"/"X", fraction + unit,
        addresses (see ``get_address_graph``) and simple
        "a <op> b = c" math expressions.

        Args:
            cardinal: cardinal tagger; its ``graph`` (and ``range_graph`` when
                not deterministic) is used for integers.
            decimal: decimal tagger; its ``final_graph_wo_negative`` is used.
            fraction: fraction tagger; its ``graph`` is used.
            deterministic: forwarded to the base class; when False, cardinal
                ranges are accepted wherever a cardinal is.
        """
        super().__init__(name="measure",
                         kind="classify",
                         deterministic=deterministic)
        cardinal_graph = cardinal.graph
        if not deterministic:
            cardinal_graph |= cardinal.range_graph

        decimal_graph = decimal.final_graph_wo_negative

        # Units from the TSV, plus the same units spelled with upper-case
        # letters in the leading positions (lower-cased before lookup).
        unit_graph = pynini.string_file(get_abs_path("data/measurements.tsv"))
        unit_graph |= pynini.compose(
            pynini.closure(TO_LOWER, 1) + pynini.closure(NEMO_ALPHA),
            unit_graph)

        unit_graph_plural = convert_space(unit_graph @ SINGULAR_TO_PLURAL)
        unit_graph = convert_space(unit_graph)

        optional_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        # "/unit" -> "per unit"
        per_unit = (pynini.cross("/", "per") + delete_space +
                    pynutil.insert(NEMO_NON_BREAKING_SPACE) + unit_graph)
        optional_per_unit = pynini.closure(
            delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + per_unit,
            0,
            1,
        )

        units_open = pynutil.insert("units: \"")
        quote_close = pynutil.insert("\"")
        unit_plural = (units_open +
                       (unit_graph_plural + optional_per_unit | per_unit) +
                       quote_close)
        unit_singular = (units_open +
                         (unit_graph + optional_per_unit | per_unit) +
                         quote_close)

        subgraph_decimal = (pynutil.insert("decimal { ") + optional_negative +
                            decimal_graph + delete_space +
                            pynutil.insert(" } ") + unit_plural)

        # Shared opening/closing of the cardinal field for both number cases.
        cardinal_open = (pynutil.insert("cardinal { ") + optional_negative +
                         pynutil.insert("integer: \""))
        cardinal_close = (delete_space + pynutil.insert("\"") +
                          pynutil.insert(" } "))
        any_but_one = (NEMO_SIGMA - "1") @ cardinal_graph
        exactly_one = pynini.cross("1", "one")
        subgraph_cardinal = (
            (cardinal_open + any_but_one + cardinal_close + unit_plural)
            | (cardinal_open + exactly_one + cardinal_close + unit_singular))

        alpha_word = pynini.closure(NEMO_ALPHA, 1)
        dash = pynini.accep('-')
        keep_order = "\" } preserve_order: true"

        cardinal_dash_alpha = (pynutil.insert("cardinal { integer: \"") +
                               cardinal_graph + dash +
                               pynutil.insert("\" } units: \"") + alpha_word +
                               quote_close)

        alpha_dash_cardinal = (units_open + alpha_word + dash + quote_close +
                               pynutil.insert(" cardinal { integer: \"") +
                               cardinal_graph + pynutil.insert(keep_order))

        decimal_dash_alpha = (pynutil.insert("decimal { ") + decimal_graph +
                              pynini.cross('-', '') +
                              pynutil.insert(" } units: \"") + alpha_word +
                              quote_close)

        decimal_times = (pynutil.insert("decimal { ") + decimal_graph +
                         pynutil.insert(" } units: \"") +
                         pynini.cross(pynini.union('x', "X"), 'x') +
                         quote_close)

        alpha_dash_decimal = (units_open + alpha_word + dash + quote_close +
                              pynutil.insert(" decimal { ") + decimal_graph +
                              pynutil.insert(" } preserve_order: true"))

        subgraph_fraction = (pynutil.insert("fraction { ") + fraction.graph +
                             delete_space + pynutil.insert(" } ") +
                             unit_plural)

        address = (
            pynutil.insert("units: \"address\" cardinal { integer: \"") +
            self.get_address_graph(cardinal) + pynutil.insert(keep_order))

        math_operations = pynini.string_file(
            get_abs_path("data/math_operations.tsv"))
        # A space between operands is kept if present, inserted otherwise.
        delimiter = pynini.accep(" ") | pynutil.insert(" ")
        math_expression = (cardinal_graph + delimiter + math_operations +
                           delimiter + cardinal_graph + delimiter +
                           pynini.cross("=", "equals") + delimiter +
                           cardinal_graph)
        math = (pynutil.insert("units: \"math\" cardinal { integer: \"") +
                math_expression + pynutil.insert(keep_order))

        final_graph = pynini.union(
            subgraph_decimal,
            subgraph_cardinal,
            cardinal_dash_alpha,
            alpha_dash_cardinal,
            decimal_dash_alpha,
            decimal_times,
            alpha_dash_decimal,
            subgraph_fraction,
            address,
            math,
        )
        self.fst = self.add_tokens(final_graph).optimize()
Beispiel #20
0
    def __init__(self, deterministic: bool = True, lm: bool = False):
        """
        Build the roman-numeral classification FST and store it in ``self.fst``.

        Args:
            deterministic: forwarded to the base class.
            lm: when True together with ``deterministic=False``, the default
                (keyword-less) cardinal reading is left out. Previously this
                combination raised ``UnboundLocalError`` because
                ``roman_to_cardinal`` was never assigned.
        """
        super().__init__(name="roman",
                         kind="classify",
                         deterministic=deterministic)

        roman_dict = load_labels(
            get_abs_path("data/roman/roman_to_spoken.tsv"))
        default_graph = pynini.string_map(roman_dict).optimize()
        default_graph = pynutil.insert(
            "integer: \"") + default_graph + pynutil.insert("\"")
        # Acceptor over the first 19 numerals listed in roman_dict.
        graph_teens = pynini.string_map([x[0]
                                         for x in roman_dict[:19]]).optimize()

        # roman numerals (restricted to graph_teens) with a preceding name are converted to ordinal form
        names = get_names()
        graph = (pynutil.insert("key_the_ordinal: \"") + names +
                 pynutil.insert("\"") + pynini.accep(" ") +
                 graph_teens @ default_graph).optimize()

        # roman numerals with preceding key words are converted to cardinal form
        key_words = pynini.string_map(
            load_labels(get_abs_path("data/roman/key_word.tsv"))).optimize()
        graph |= (pynutil.insert("key_cardinal: \"") + key_words +
                  pynutil.insert("\"") + pynini.accep(" ") +
                  default_graph).optimize()

        # The deterministic and the non-deterministic/non-lm branches built
        # the very same graph, so they are merged. For the remaining case
        # (not deterministic and lm) no default cardinal graph is built.
        roman_to_cardinal = None
        if deterministic or not lm:
            # numerals of two or more letters, limited to the first 50
            # entries of roman_dict
            roman_to_cardinal = pynini.compose(
                pynini.closure(NEMO_ALPHA, 2),
                (pynutil.insert("default_cardinal: \"default\" ") +
                 (pynini.string_map([x[0] for x in roman_dict[:50]
                                     ]).optimize()) @ default_graph),
            )

        # convert roman numerals of three or more letters with a "th" suffix to ordinal
        roman_to_ordinal = pynini.compose(
            pynini.closure(NEMO_ALPHA, 3),
            (pynutil.insert("default_ordinal: \"default\" ") +
             graph_teens @ default_graph + pynutil.delete("th")),
        )

        if roman_to_cardinal is None:
            graph |= roman_to_ordinal
        else:
            graph |= roman_to_cardinal | roman_to_ordinal

        graph = self.add_tokens(graph)

        self.fst = graph.optimize()
Beispiel #21
0
    def get_address_graph(self, cardinal):
        """
        Finite state transducer for classifying street addresses, e.g.:
            2788 San Tomas Expy, Santa Clara, CA 95051 ->
                units: "address" cardinal
                { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
                 preserve_order: true

        Args:
            cardinal: cardinal tagger; its ``graph``, ``single_digits_graph``
                and ``graph_hundred_component_at_least_one_none_zero_digit``
                are used, and it is passed on to ``OrdinalTagger``.

        Returns:
            The address graph only; the surrounding ``units``/``cardinal``
            tags are not added here.
        """
        ordinal_verbalizer = OrdinalVerbalizer().graph
        ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
        # Tag an ordinal and verbalize it straight away (e.g. street numbers).
        tagged_ordinal = (pynutil.insert("integer: \"") + ordinal_tagger +
                          pynutil.insert("\""))
        ordinal_num = pynini.compose(tagged_ordinal, ordinal_verbalizer)

        hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        # Three/four digit house numbers are split into a 1-2 digit head and
        # a two digit tail; a leading "0" in the tail is read as "zero".
        head_digits = NEMO_DIGIT**(1, 2) @ hundred_component
        tail_digits = NEMO_DIGIT**2 @ (
            pynini.closure(pynini.cross("0", "zero "), 0, 1) +
            hundred_component)
        address_num = pynini.compose(NEMO_DIGIT**(3, 4),
                                     head_digits + insert_space + tail_digits)
        # to handle the rest of the numbers
        address_num = plurals._priority_union(address_num, cardinal.graph,
                                              NEMO_SIGMA)

        compass_point = pynini.union(
            pynini.cross("E", "East"),
            pynini.cross("S", "South"),
            pynini.cross("W", "West"),
            pynini.cross("N", "North"),
        )
        optional_dot = pynini.closure(pynutil.delete("."), 0, 1)
        direction = pynini.closure(
            pynini.accep(NEMO_SPACE) + compass_point + optional_dot, 0, 1)

        street_suffix = get_formats(
            get_abs_path("data/address/address_word.tsv"))
        capitalized_word = NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1)
        more_words = pynini.closure(NEMO_UPPER + pynini.closure(NEMO_ALPHA) +
                                    NEMO_SPACE)
        address_words = (pynini.accep(NEMO_SPACE) +
                         (pynini.closure(ordinal_num, 0, 1)
                          | capitalized_word) + NEMO_SPACE + more_words +
                         street_suffix)

        comma_space = pynini.accep(",") + pynini.accep(NEMO_SPACE)

        city_name = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
        city = pynini.closure(comma_space + city_name, 0, 1)

        states = load_labels(get_abs_path("data/address/state.tsv"))
        # Also accept the second column with a dot after its first character.
        states.extend([(x, f"{y[0]}.{y[1:]}") for x, y in states])
        state_name = pynini.invert(pynini.string_map(states))
        state = pynini.closure(comma_space + state_name, 0, 1)

        five_digit_zip = pynini.compose(NEMO_DIGIT**5,
                                        cardinal.single_digits_graph)
        zip_code = pynini.closure(
            pynini.closure(pynini.accep(","), 0, 1) +
            pynini.accep(NEMO_SPACE) + five_digit_zip,
            0,
            1,
        )

        street_part = address_num + direction + address_words
        address = street_part + pynini.closure(city + state + zip_code, 0, 1)
        # Street-only form with an optional trailing period that is dropped.
        address |= (address_num + direction + address_words +
                    pynini.closure(pynini.cross(".", ""), 0, 1))

        return address
Beispiel #22
0
    NEMO_DIGIT,
    NEMO_SIGMA,
    TO_LOWER,
    GraphFst,
    delete_extra_space,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels

# Optional pynini import: build the module-level number graphs when pynini is
# installed, otherwise fall back to placeholders and flag it as unavailable.
try:
    import pynini
    from pynini.lib import pynutil

    graph_teen = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/teen.tsv"))).optimize()
    graph_digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    ties_graph = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/ties.tsv"))).optimize()

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Add placeholders for global variables
    graph_teen = None
    graph_digit = None
    ties_graph = None

    # The import failed, so pynini must be reported as unavailable
    # (this branch used to set the flag to True by mistake).
    PYNINI_AVAILABLE = False

Beispiel #23
0
    def __init__(self, deterministic: bool = True, lm: bool = False):
        """
        Build the roman-numeral classification FST and store it in ``self.fst``.

        Args:
            deterministic: forwarded to the base class; selects the restricted
                default cardinal graph (first 50 entries of roman_dict).
            lm: when True together with ``deterministic=False``, the default
                (keyword-less) cardinal reading is left out. Previously this
                combination raised ``UnboundLocalError`` because
                ``roman_to_cardinal`` was never assigned.
        """
        super().__init__(name="roman",
                         kind="classify",
                         deterministic=deterministic)

        roman_dict = load_labels(
            get_abs_path("data/roman/roman_to_spoken.tsv"))
        default_graph = pynini.string_map(roman_dict).optimize()
        default_graph = pynutil.insert(
            "integer: \"") + default_graph + pynutil.insert("\"")
        ordinal_limit = 19

        graph_teens = pynini.string_map(
            [x[0] for x in roman_dict[:ordinal_limit]]).optimize()

        # roman numerals up to ordinal_limit with a preceding name are converted to ordinal form
        names = get_names()
        graph = (pynutil.insert("key_the_ordinal: \"") + names +
                 pynutil.insert("\"") + pynini.accep(" ") +
                 graph_teens @ default_graph).optimize()

        # roman numerals with preceding key words (multiple formats) are converted to cardinal form
        key_word_labels = []
        for k_word in load_labels(get_abs_path("data/roman/key_word.tsv")):
            word = k_word[0]
            # as listed, with the first letter upper-cased, and all caps
            key_word_labels.append(k_word)
            key_word_labels.append([word[0].upper() + word[1:]])
            key_word_labels.append([word.upper()])

        key_words = pynini.string_map(key_word_labels).optimize()
        graph |= (pynutil.insert("key_cardinal: \"") + key_words +
                  pynutil.insert("\"") + pynini.accep(" ") +
                  default_graph).optimize()

        # Stays None for (not deterministic and lm): no default cardinal
        # reading is added in that configuration.
        roman_to_cardinal = None
        if deterministic:
            # numerals of two or more letters, limited to the first 50
            # entries of roman_dict
            roman_to_cardinal = pynini.compose(
                pynini.closure(NEMO_ALPHA, 2),
                (pynutil.insert("default_cardinal: \"default\" ") +
                 (pynini.string_map([x[0] for x in roman_dict[:50]
                                     ]).optimize()) @ default_graph),
            )
        elif not lm:
            # every numeral in roman_dict except the bare "I"
            roman_to_cardinal = pynini.compose(
                pynini.difference(NEMO_SIGMA, "I"),
                (pynutil.insert("default_cardinal: \"default\" integer: \"") +
                 pynini.string_map(roman_dict).optimize() +
                 pynutil.insert("\"")),
            ).optimize()

        # convert roman numerals of three or more letters with a "th" suffix to ordinal
        roman_to_ordinal = pynini.compose(
            pynini.closure(NEMO_ALPHA, 3),
            (pynutil.insert("default_ordinal: \"default\" ") +
             graph_teens @ default_graph + pynutil.delete("th")),
        )

        if roman_to_cardinal is None:
            graph |= roman_to_ordinal
        else:
            graph |= roman_to_cardinal | roman_to_ordinal
        graph = self.add_tokens(graph)

        self.fst = graph.optimize()
Beispiel #24
0
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        """
        Build the date classification FST and store it in ``self.fst``.

        The graph tags ``month``, ``day`` and ``year`` fields for
        month-day-year, day-month-year and year-month-day inputs, plus
        standalone years.

        Args:
            cardinal: cardinal tagger; its
                ``graph_hundred_component_at_least_one_none_zero_digit`` and
                ``single_digits_graph`` are used for days and two-digit years.
            deterministic: forwarded to the base class and to
                ``_get_year_graph``. When True, MDY/DMY outputs always carry
                ``preserve_order: true``; when False that tag is optional and
                reordered readings (YMD -> MDY, MDY -> DMY) are added.
        """
        super().__init__(name="date",
                         kind="classify",
                         deterministic=deterministic)

        # Month names and abbreviations; each also accepted when the first
        # character is rewritten by TO_LOWER before lookup.
        month_graph = pynini.string_file(
            get_abs_path("data/months/names.tsv")).optimize()
        month_graph |= (TO_LOWER + pynini.closure(NEMO_CHAR)) @ month_graph
        month_abbr_graph = pynini.string_file(
            get_abs_path("data/months/abbr.tsv")).optimize()
        # Abbreviations may be followed by a period, which is dropped.
        month_abbr_graph = (month_abbr_graph |
                            (TO_LOWER + pynini.closure(NEMO_CHAR))
                            @ month_abbr_graph) + pynini.closure(
                                pynutil.delete("."), 0, 1)
        month_graph |= month_abbr_graph

        # to support all caps names
        names_all_caps = [[
            x[0].upper()
        ] for x in load_labels(get_abs_path("data/months/names.tsv"))]
        abbr_all_caps = [
            (x.upper(), y)
            for x, y in load_labels(get_abs_path("data/months/abbr.tsv"))
        ]
        month_graph |= pynini.string_map(names_all_caps) | (
            pynini.string_map(abbr_all_caps) +
            pynini.closure(pynutil.delete("."), 0, 1))

        month_numbers_graph = pynini.string_file(
            get_abs_path("data/months/numbers.tsv")).optimize()
        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        year_graph = _get_year_graph(deterministic)

        # A standalone year gets a small extra weight.
        YEAR_WEIGHT = 0.001
        year_graph_standalone = (pynutil.insert("year: \"") +
                                 pynutil.add_weight(year_graph, YEAR_WEIGHT) +
                                 pynutil.insert("\""))

        month_graph = pynutil.insert(
            "month: \"") + month_graph + pynutil.insert("\"")
        month_numbers_graph = pynutil.insert(
            "month: \"") + month_numbers_graph + pynutil.insert("\"")

        # Day: a single digit, or a digit preceded by 1, 2 or 3.
        day_graph = (pynutil.insert("day: \"") +
                     ((pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT)
                     @ cardinal_graph + pynutil.insert("\""))
        optional_day_graph = pynini.closure(delete_extra_space + day_graph, 0,
                                            1)

        # Two-digit year, read either digit by digit or as a number.
        two_digit_year = NEMO_DIGIT**(2) @ (cardinal.single_digits_graph
                                            | cardinal_graph)
        two_digit_year = pynutil.insert(
            "year: \"") + two_digit_year + pynutil.insert("\"")

        year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert(
            "\"")
        optional_graph_year = pynini.closure(
            delete_extra_space + year_graph,
            0,
            1,
        )
        # Month name, optional day, optional comma, optional year.
        graph_mdy = (month_graph + optional_day_graph + delete_space +
                     pynini.closure(pynutil.delete(","), 0, 1) +
                     optional_graph_year)

        # Numeric month/day/year separated by "-", "/" or "."; one leading
        # "0" before the day is dropped.
        delete_sep = pynutil.delete(pynini.union("-", "/", "."))
        graph_mdy |= (month_numbers_graph + delete_sep + insert_space +
                      pynini.closure(pynutil.delete("0"), 0, 1) + day_graph +
                      delete_sep + insert_space +
                      (year_graph | two_digit_year))

        graph_dmy = (day_graph + delete_extra_space + month_graph +
                     pynini.closure(pynutil.delete(","), 0, 1) +
                     optional_graph_year)
        graph_ymd = ((year_graph | two_digit_year) + delete_sep +
                     insert_space + month_numbers_graph + delete_sep +
                     insert_space + pynini.closure(pynutil.delete("0"), 0, 1) +
                     day_graph)

        # preserve_order is appended to MDY/DMY only (mandatory when
        # deterministic, optional otherwise); YMD and standalone years are
        # unioned in afterwards without it.
        final_graph = graph_mdy | graph_dmy
        if deterministic:
            final_graph += pynutil.insert(" preserve_order: true")
        else:
            final_graph += pynini.closure(
                pynutil.insert(" preserve_order: true"), 0, 1)
        final_graph |= graph_ymd | year_graph_standalone

        if not deterministic:
            # Accumulators for the per-(month, day) reordering graphs.
            ymd_to_mdy_graph = None
            mdy_to_dmy_graph = None

            for month in [
                    x[0]
                    for x in load_labels(get_abs_path("data/months/names.tsv"))
            ]:
                for day in [
                        x[0] for x in load_labels(
                            get_abs_path("data/months/days.tsv"))
                ]:
                    # Rewrites tagged output: moves this month/day pair from
                    # after the year field to before it.
                    ymd_to_mdy_curr = (
                        pynutil.insert("month: \"" + month + "\" day: \"" +
                                       day + "\" ") + pynini.accep('year:') +
                        NEMO_SIGMA + pynutil.delete(" month: \"" + month +
                                                    "\" day: \"" + day + "\""))

                    # YY-MM-DD -> MM-DD-YY
                    ymd_to_mdy_curr = pynini.compose(final_graph,
                                                     ymd_to_mdy_curr)
                    ymd_to_mdy_graph = (
                        ymd_to_mdy_curr if ymd_to_mdy_graph is None else
                        pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph))

                    # Rewrites tagged output: swaps "month day" to "day month"
                    # in front of the year field.
                    mdy_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" +
                                       month + "\" ") +
                        pynutil.delete("month: \"" + month + "\" day: \"" +
                                       day + "\" ") + pynini.accep('year:') +
                        NEMO_SIGMA)

                    # pynini.compose(ymd_to_mdy_curr, mdy_to_dmy_curr) to handle:
                    # YY-MM-DD (input format) -> MM-DD-YY (intermediate ymd_to_mdy_curr representation) -> DD-MM-YY
                    # '2000-01-05' -> 'day: "five" month: "january" year: "two thousand"'
                    # pynini.compose(final_graph, mdy_to_dmy_curr) to handle:
                    # MM-DD-YY (input format) -> DD-MM-YY
                    mdy_to_dmy_curr = pynini.compose(
                        ymd_to_mdy_curr, mdy_to_dmy_curr) | pynini.compose(
                            final_graph, mdy_to_dmy_curr)
                    mdy_to_dmy_graph = (
                        mdy_to_dmy_curr if mdy_to_dmy_graph is None else
                        pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph))

            # NOTE(review): both accumulators stay None if either label file
            # is empty, which would make this union fail — confirm the data
            # files are guaranteed non-empty.
            final_graph |= ymd_to_mdy_graph | mdy_to_dmy_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Beispiel #25
0
    NEMO_ALPHA,
    NEMO_DIGIT,
    NEMO_SIGMA,
    SINGULAR_TO_PLURAL,
    GraphFst,
    convert_space,
    insert_space,
)
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels

# pynini is optional: load the currency-name tables when it is importable and
# record the outcome in PYNINI_AVAILABLE.
try:
    import pynini
    from pynini.lib import pynutil

    # Minor-unit names (singular/plural) and major-unit names.
    min_singular = pynini.string_file(get_abs_path("data/money/currency_minor_singular.tsv"))
    min_plural = pynini.string_file(get_abs_path("data/money/currency_minor_plural.tsv"))
    maj_singular = pynini.string_file(get_abs_path("data/money/currency_major.tsv"))
except ImportError:
    # ModuleNotFoundError is a subclass of ImportError, so it lands here too.
    PYNINI_AVAILABLE = False
else:
    PYNINI_AVAILABLE = True


class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money, suppletive aware, e.g. 
        $12.05 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true }
        $12.0500 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true }
Beispiel #26
0
    # Any single character of string.punctuation, each passed through
    # pynini.escape before being unioned.
    NEMO_PUNCT = pynini.union(
        *map(pynini.escape, string.punctuation)).optimize()
    # Alphanumeric or punctuation character.
    NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

    # Zero or more NEMO_CHAR symbols.
    NEMO_SIGMA = pynini.closure(NEMO_CHAR)

    # Deletes a (possibly empty) run of whitespace.
    delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
    # Inserts a single space.
    insert_space = pynutil.insert(" ")
    # Collapses one or more whitespace characters into a single space.
    delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
    # Deletes any number of ` preserve_order: true` / ` field_order: "..."` tags.
    # NOTE(review): NEMO_NOT_QUOTE appears here without a closure — confirm
    # that the field_order value is meant to match as written.
    delete_preserve_order = pynini.closure(
        pynutil.delete(" preserve_order: true")
        | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE +
           pynutil.delete("\"")))

    # Singular -> plural mappings loaded from the suppletive TSV file.
    suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
    # _v = pynini.union("a", "e", "i", "o", "u")
    # Lower-case consonants.
    _c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",
                      "p", "q", "r", "s", "t", "v", "w", "x", "y", "z")
    # consonant + "y" -> consonant + "ies"
    _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
    # words ending in s / sh / ch / x / z get "es" appended
    _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x",
                                    "z") + pynutil.insert("es")
    # anything else gets "s" appended
    _s = NEMO_SIGMA + pynutil.insert("s")

    # Rules are combined with nested priority unions in the order
    # suppletive, _ies, _es, _s (presumably earlier rules win where they
    # apply — verify against plurals._priority_union).
    graph_plural = plurals._priority_union(
        suppletive,
        plurals._priority_union(_ies,
                                plurals._priority_union(_es, _s, NEMO_SIGMA),
                                NEMO_SIGMA), NEMO_SIGMA).optimize()

    SINGULAR_TO_PLURAL = graph_plural
Beispiel #27
0
    delete_extra_space,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.en.utils import (
    augment_labels_with_punct_at_end,
    get_abs_path,
    load_labels,
)

# Optional-dependency guard: build the module-level number/date graphs when
# pynini can be imported, otherwise fall back to placeholders and record that
# pynini is unavailable.
try:
    import pynini
    from pynini.lib import pynutil
    from pynini.examples import plurals

    # The TSV files are loaded and then inverted, so input and output sides are swapped.
    graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/number/teen.tsv"))).optimize()
    graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
    ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/number/ty.tsv"))).optimize()
    # Year suffix labels, extended with the variants produced by
    # augment_labels_with_punct_at_end, compiled into a single string map.
    year_suffix = load_labels(get_abs_path("data/date/year_suffix.tsv"))
    year_suffix.extend(augment_labels_with_punct_at_end(year_suffix))
    year_suffix = pynini.string_map(year_suffix).optimize()

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Add placeholders for global variables so the module can still be imported.
    graph_teen = None
    graph_digit = None
    ties_graph = None
    year_suffix = None

    # The import failed, so the flag must report that pynini is NOT available.
    PYNINI_AVAILABLE = False
Beispiel #28
0
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        """
        Build the whitelist classification graph and store it in ``self.graph`` / ``self.fst``.

        Args:
            input_case: when equal to "lower_cased", the first column of the loaded label files is lower-cased
            deterministic: forwarded to the base class; also selects which optional sub-graphs are added
            input_file: optional path to an additional whitelist file; in deterministic mode it replaces
                the built-in graph, otherwise it is added to it
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _get_whitelist_graph(input_case, file, keep_punct_add_end: bool = False):
            # Load two-column labels, optionally lower-casing the first column.
            to_lower = input_case == "lower_cased"
            pairs = [[first.lower() if to_lower else first, second] for first, second in load_labels(file)]
            if keep_punct_add_end:
                pairs.extend(augment_labels_with_punct_at_end(pairs))
            return pynini.string_map(pairs)

        tts_graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist/tts.tsv"))
        symbol_graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist/symbol.tsv"))
        graph = tts_graph | symbol_graph

        if not deterministic:
            alternatives_graph = _get_whitelist_graph(
                input_case, get_abs_path("data/whitelist/alternatives.tsv"), keep_punct_add_end=True
            )
            graph |= alternatives_graph
        else:
            # "st" / "St" / "ST" (+ any number of dots) followed by a space and a name -> "Saint <name>"
            names = get_names()
            saint = pynini.cross(pynini.union("st", "St", "ST"), "Saint")
            drop_dots = pynini.closure(pynutil.delete("."))
            graph |= saint + drop_dots + pynini.accep(" ") + names

        # Upper-case letters separated by "." or ". " (at least three letters): drop the separators.
        for separator in [".", ". "]:
            more_letters = pynini.closure(pynutil.delete(separator) + NEMO_UPPER, 2)
            optional_last_dot = pynini.closure(pynutil.delete("."), 0, 1)
            graph |= NEMO_UPPER + more_letters + optional_last_dot

        if not deterministic:
            graph |= get_formats(get_abs_path("data/whitelist/alternatives_all_format.tsv"))

            unit_singular = pynini.string_file(get_abs_path("data/measure/unit.tsv")) | pynini.string_file(
                get_abs_path("data/measure/unit_alternatives.tsv")
            )
            unit_plural = unit_singular @ SINGULAR_TO_PLURAL
            # only inputs of three or more characters are passed to the unit graphs
            graph |= pynini.compose(NEMO_CHAR ** (3, ...), convert_space(unit_singular | unit_plural))

        # convert to states only if comma is present before the abbreviation to avoid converting all caps words,
        # e.g. "IN", "OH", "OK"
        # TODO or only exclude above?
        states = load_labels(get_abs_path("data/address/state.tsv"))
        dotted_variants = []
        for first, second in states:
            key = first.lower() if input_case == "lower_cased" else first
            dotted = f"{second[0]}.{second[1:]}"
            dotted_variants.append((key, dotted))
            if not deterministic:
                dotted_variants.append((key, f"{second[0]}.{second[1:]}."))
        states.extend(dotted_variants)

        state_graph = pynini.string_map(states)
        after_comma = pynini.invert(state_graph).optimize()
        graph |= pynini.closure(NEMO_NOT_SPACE, 1) + pynini.union(", ", ",") + after_comma

        if input_file:
            whitelist_provided = _get_whitelist_graph(input_case, input_file)
            if deterministic:
                graph = whitelist_provided
            else:
                graph |= whitelist_provided

        self.graph = (convert_space(graph)).optimize()

        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
Beispiel #29
0
    def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False):
        """
        Build the date classification graph and store it in ``self.fst``.

        The graph combines month-day-year, day-month-year and year-month-day layouts
        (textual months and numeric months separated by "-", "/" or "."), plus a bare year.

        Args:
            cardinal: cardinal grammar; its ``graph_hundred_component_at_least_one_none_zero_digit``
                and ``single_digits_graph`` attributes are used to verbalize days and years
            deterministic: forwarded to the base class and to ``_get_year_graph``; when False,
                extra month/day-only and re-ordered alternatives are added
            lm: when True, the same extra alternatives are added as in non-deterministic mode
        """
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        # january
        month_graph = pynini.string_file(get_abs_path("data/date/month_name.tsv")).optimize()
        # January, JANUARY
        month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
            TO_LOWER ** (2, ...), month_graph
        )

        # jan
        month_abbr_graph = pynini.string_file(get_abs_path("data/date/month_abbr.tsv")).optimize()
        # jan, Jan, JAN (each optionally followed by a "." that is deleted)
        month_abbr_graph = (
            month_abbr_graph
            | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
            | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
        ) + pynini.closure(pynutil.delete("."), 0, 1)
        month_graph |= month_abbr_graph.optimize()

        month_numbers_labels = pynini.string_file(get_abs_path("data/date/month_number.tsv")).optimize()
        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        year_graph = _get_year_graph(cardinal_graph=cardinal_graph, deterministic=deterministic)

        # three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph
        # year_graph |= three_digit_year

        # Wrap the month graphs in the `month: "..."` field.
        month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
        month_numbers_graph = pynutil.insert("month: \"") + month_numbers_labels + pynutil.insert("\"")

        # Ordinal endings in lower and upper case; they are deleted after the day digits.
        endings = ["rd", "th", "st", "nd"]
        endings += [x.upper() for x in endings]
        endings = pynini.union(*endings)

        # Day: optional leading "the ", then 1-31 style digits with an optional ordinal ending.
        # `@` binds tighter than `+`, so only the parenthesised digit pattern is composed
        # with cardinal_graph.
        day_graph = (
            pynutil.insert("day: \"")
            + pynini.closure(pynutil.delete("the "), 0, 1)
            + (
                ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
                + pynini.closure(pynutil.delete(endings), 0, 1)
            )
            @ cardinal_graph
            + pynutil.insert("\"")
        )

        two_digit_year = _get_two_digit_year(
            cardinal_graph=cardinal_graph, single_digits_graph=cardinal.single_digits_graph
        )
        two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")

        # if lm:
        #     two_digit_year = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (3), two_digit_year)
        #     year_graph = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (2), year_graph)
        #     year_graph |= pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (4, ...), year_graph)

        # Year that follows other date fields: either " <year>" (space deleted) or
        # ",<year>" / ", <year>" (comma and optional space kept).
        graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
        graph_year |= (
            pynutil.insert(" year: \"")
            + pynini.accep(",")
            + pynini.closure(pynini.accep(" "), 0, 1)
            + year_graph
            + pynutil.insert("\"")
        )
        optional_graph_year = pynini.closure(graph_year, 0, 1)

        # From here on year_graph is the stand-alone `year: "..."` field.
        year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

        # Textual month followed by a day, a year, or a day and a year.
        graph_mdy = month_graph + (
            (delete_extra_space + day_graph)
            | (pynini.accep(" ") + day_graph)
            | graph_year
            | (delete_extra_space + day_graph + graph_year)
        )

        # Textual month with "-" separators: each "-" is rewritten to " "; the optional year part
        # is rewritten first and then passed through graph_year.
        graph_mdy |= (
            month_graph
            + pynini.cross("-", " ")
            + day_graph
            + pynini.closure(((pynini.cross("-", " ") + NEMO_SIGMA) @ graph_year), 0, 1)
        )

        # Numeric month<sep>day<sep>year; the same separator is required in both positions and
        # one leading "0" of the day is dropped.
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_mdy |= (
                month_numbers_graph
                + delete_sep
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
                + delete_sep
                + insert_space
                + (year_graph | two_digit_year)
            )

        # Day followed by a textual month and an optional year.
        graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
        # Two-digit strings that are NOT accepted as month numbers, verbalized as a day.
        day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph
        # Numeric day<sep>month<sep>year, restricted to the day values defined above.
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_dmy |= (
                day_ex_month
                + delete_sep
                + insert_space
                + month_numbers_graph
                + delete_sep
                + insert_space
                + (year_graph | two_digit_year)
            )

        # Numeric year<sep>month<sep>day.
        # NOTE(review): the seed acceptor matches the empty string, which stays a valid path of
        # graph_ymd after the unions below - confirm this is intended.
        graph_ymd = pynini.accep("")
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_ymd |= (
                (year_graph | two_digit_year)
                + delete_sep
                + insert_space
                + month_numbers_graph
                + delete_sep
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
            )

        final_graph = graph_mdy | graph_dmy

        if not deterministic or lm:
            # preserve_order becomes optional, and numeric month<sep>day without a year is accepted.
            final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)
            m_sep_d = (
                month_numbers_graph
                + pynutil.delete(pynini.union("-", "/"))
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
            )
            final_graph |= m_sep_d
        else:
            # Deterministic mode: MDY / DMY outputs always carry preserve_order.
            final_graph += pynutil.insert(" preserve_order: true")

        # Added after the preserve_order step, so these two paths do not get the tag.
        final_graph |= graph_ymd | year_graph

        if not deterministic or lm:
            # Accumulators for the re-ordered alternatives; each is built as a union over every
            # (month, day) label pair because the rewrite rules below match literal field values.
            ymd_to_mdy_graph = None
            ymd_to_dmy_graph = None
            mdy_to_dmy_graph = None
            md_to_dm_graph = None

            for month in [x[0] for x in load_labels(get_abs_path("data/date/month_name.tsv"))]:
                for day in [x[0] for x in load_labels(get_abs_path("data/date/day.tsv"))]:
                    # Rewrite on the tagged output: move this month/day pair in front of `year:`.
                    ymd_to_mdy_curr = (
                        pynutil.insert("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> MM-DD-YY
                    ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                    ymd_to_mdy_graph = (
                        ymd_to_mdy_curr
                        if ymd_to_mdy_graph is None
                        else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                    )

                    # Same as above, but the inserted prefix puts the day before the month.
                    ymd_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> DD-MM-YY
                    ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                    ymd_to_dmy_graph = (
                        ymd_to_dmy_curr
                        if ymd_to_dmy_graph is None
                        else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                    )

                    # Swap a leading `month: day:` pair into `day: month:`, keeping the year part.
                    mdy_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynutil.delete("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                    ).optimize()
                    # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                    mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                    mdy_to_dmy_graph = (
                        mdy_to_dmy_curr
                        if mdy_to_dmy_graph is None
                        else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                    ).optimize()

                    # MM-DD (no year) -> DD-MM
                    md_to_dm_curr = pynutil.insert("day: \"" + day + "\" month: \"" + month + "\"") + pynutil.delete(
                        "month: \"" + month + "\" day: \"" + day + "\""
                    )
                    md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()

                    md_to_dm_graph = (
                        md_to_dm_curr
                        if md_to_dm_graph is None
                        else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                    ).optimize()

            final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Beispiel #30
0
    def __init__(self,
                 cardinal: GraphFst,
                 ordinal: GraphFst,
                 deterministic: bool = True,
                 lm: bool = False):
        """
        Finite state transducer for classifying serial (handles only cases without delimiters,
        values with delimiters are handled by default).
            The serial is a combination of digits, letters and dashes, e.g.:
            c325b -> tokens { cardinal { integer: "c three two five b" } }

        NOTE(review): the example above is carried over unchanged from the original text;
        the graph built here wraps its output in a `name: "..."` field - confirm the example
        against the current grammar.

        Args:
            cardinal: cardinal grammar; its ``graph``, ``single_digits_graph`` and
                ``graph_hundred_component_at_least_one_none_zero_digit`` attributes are used
            ordinal: ordinal grammar; inputs accepted by ``ordinal.graph`` are excluded
            deterministic: forwarded to the base class
            lm: when False and the grammar is non-deterministic, extra digit readings are added
        """
        super().__init__(name="integer",
                         kind="classify",
                         deterministic=deterministic)
        # six or more digits are read digit by digit
        num_graph = pynini.compose(NEMO_DIGIT**(6, ...),
                                   cardinal.single_digits_graph).optimize()
        # one to five digits are read as a cardinal number
        num_graph |= pynini.compose(NEMO_DIGIT**(1, 5),
                                    cardinal.graph).optimize()
        # to handle numbers starting with zero
        num_graph |= pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT),
            cardinal.single_digits_graph).optimize()
        # TODO: "#" doesn't work from the file
        symbols_graph = pynini.string_file(
            get_abs_path("data/whitelist/symbol.tsv")).optimize(
            ) | pynini.cross("#", "hash")
        num_graph |= symbols_graph

        if not self.deterministic and not lm:
            num_graph |= cardinal.single_digits_graph
            # also allow double digits to be pronounced as integer in serial number
            num_graph |= pynutil.add_weight(
                NEMO_DIGIT**2 @ cardinal.
                graph_hundred_component_at_least_one_none_zero_digit,
                weight=0.0001)

        # add space between letter and digit/symbol
        symbols = [
            x[0]
            for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))
        ]
        symbols = pynini.union(*symbols)
        digit_symbol = NEMO_DIGIT | symbols

        graph_with_space = pynini.compose(
            pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA | symbols,
                             digit_symbol, NEMO_SIGMA),
            pynini.cdrewrite(pynutil.insert(" "), digit_symbol,
                             NEMO_ALPHA | symbols, NEMO_SIGMA),
        )

        # serial graph with delimiter
        delimiter = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
        alphas = pynini.closure(NEMO_ALPHA, 1)
        letter_num = alphas + delimiter + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
        next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
        next_alpha_or_num |= pynini.closure(
            delimiter + num_graph +
            plurals._priority_union(pynini.accep(" "), pynutil.insert(" "),
                                    NEMO_SIGMA).optimize() + alphas)

        serial_graph = letter_num + next_alpha_or_num
        serial_graph |= num_letter + next_alpha_or_num
        # numbers only with 2+ delimiters
        serial_graph |= (num_graph + delimiter + num_graph + delimiter +
                         num_graph + pynini.closure(delimiter + num_graph))
        # 2+ symbols
        serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA,
                                       num_graph + delimiter + num_graph)

        # exclude ordinal numbers from serial options
        serial_graph = pynini.compose(
            pynini.difference(NEMO_SIGMA,
                              pynini.project(ordinal.graph, "input")),
            serial_graph).optimize()

        serial_graph = pynutil.add_weight(serial_graph, 0.0001)
        # "<token>^2" -> "<token> squared", "<token>^3" -> "<token> cubed"
        serial_graph |= (pynini.closure(NEMO_NOT_SPACE, 1) +
                         (pynini.cross("^2", " squared")
                          | pynini.cross("^3", " cubed")).optimize())

        # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
        serial_graph = (
            pynini.closure((serial_graph | num_graph | alphas) + delimiter) +
            serial_graph + pynini.closure(delimiter +
                                          (serial_graph | num_graph | alphas)))

        # also accept inputs without delimiters by first inserting spaces at letter/digit boundaries
        serial_graph |= pynini.compose(graph_with_space,
                                       serial_graph.optimize()).optimize()
        # restrict the input to at least two non-space characters
        serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2),
                                      serial_graph).optimize()

        self.graph = serial_graph.optimize()
        graph = pynutil.insert("name: \"") + convert_space(
            self.graph).optimize() + pynutil.insert("\"")
        self.fst = graph.optimize()