Esempio n. 1
0
    def __init__(self, deterministic: bool = True):
        """
        Build the ordinal verbalizer FST.

        The serialized ``integer: "..."`` field is reduced to its quoted value,
        and the end of that value is then rewritten by the suffix rules below.

        Args:
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        # Both TSV tables are inverted right after loading, so they are applied
        # in the opposite direction to the one stored on disk.
        digit_endings = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")).invert()
        teen_endings = pynini.string_file(get_abs_path("data/ordinals/teen.tsv")).invert()

        # Remove the field name and both quotes; the value must be non-empty.
        open_field = pynutil.delete("integer:") + delete_space + pynutil.delete("\"")
        quoted_value = pynini.closure(NEMO_NOT_QUOTE, 1)
        close_field = pynutil.delete("\"")
        graph = open_field + quoted_value + close_field

        # Candidate rewrites at the end of the string. The weights make the
        # table entries win over "ty" -> "tieth", which in turn wins over the
        # plain "th" insertion.
        ty_to_tieth = pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
        append_th = pynutil.insert("th", weight=0.01)
        ending_rules = digit_endings | teen_endings | ty_to_tieth | append_th

        suffix = pynini.cdrewrite(ending_rules, "", "[EOS]", NEMO_SIGMA).optimize()
        self.suffix = suffix

        self.fst = self.delete_tokens(graph @ suffix).optimize()
Esempio n. 2
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """
        Build the time classifier FST.

        Three input shapes are assembled below: ``H:MM`` with optional suffix
        and optional zone, ``H.MM`` with a mandatory suffix, and a bare hour
        with a mandatory suffix. The last two also accept an optional zone.

        Args:
            cardinal: cardinal tagger; its ``graph`` attribute is used to read
                out the hour and minute numbers.
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="time", kind="classify", deterministic=deterministic)
        suffixes = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
        time_zones = pynini.string_file(get_abs_path("data/time_zone.tsv"))

        # only used for < 1000 thousand -> 0 weight
        number_graph = cardinal.graph

        hour_labels = list(map(str, range(24)))
        single_minute_labels = list(map(str, range(1, 10)))
        double_minute_labels = list(map(str, range(10, 60)))

        # Either two digits kept as written, or a single digit whose optional
        # leading "0" is removed.
        two_digits = NEMO_DIGIT + NEMO_DIGIT
        stripped_digit = pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT
        normalize_hour_digits = two_digits | stripped_digit

        hour = normalize_hour_digits @ pynini.union(*hour_labels) @ number_graph
        minute_single = pynini.union(*single_minute_labels) @ number_graph
        minute_double = pynini.union(*double_minute_labels) @ number_graph

        hour_field = pynutil.insert("hours: \"") + hour + pynutil.insert("\"")

        # Single-digit minutes are written with a leading "0", which is read as "o".
        minute_value = (pynini.cross("0", "o") + insert_space + minute_single) | minute_double
        minute_field = pynutil.insert("minutes: \"") + minute_value + pynutil.insert("\"")

        suffix_field = pynutil.insert("suffix: \"") + convert_space(suffixes) + pynutil.insert("\"")
        # Collapse any input whitespace into exactly one output space.
        separator = delete_space + insert_space
        optional_suffix = pynini.closure(separator + suffix_field, 0, 1)

        zone_field = pynutil.insert("zone: \"") + convert_space(time_zones) + pynutil.insert("\"")
        optional_zone = pynini.closure(separator + zone_field, 0, 1)

        # "00" minutes are dropped entirely; anything else becomes a minutes field.
        minutes_tail = pynutil.delete("00") | (insert_space + minute_field)

        # 2:30 pm, 02:30, 2:00
        graph_hm = hour_field + pynutil.delete(":") + minutes_tail + optional_suffix + optional_zone

        # 2.xx pm/am
        graph_hm2 = hour_field + pynutil.delete(".") + minutes_tail + separator + suffix_field + optional_zone

        # 2 pm est
        graph_h = hour_field + separator + suffix_field + optional_zone

        tagged = self.add_tokens((graph_hm | graph_h | graph_hm2).optimize())
        self.fst = tagged.optimize()
Esempio n. 3
0
    def __init__(self, deterministic: bool = True):
        """
        Build the cardinal classifier FST.

        Assigns ``self.graph``,
        ``self.graph_hundred_component_at_least_one_none_zero_digit``,
        ``self.single_digits_graph`` and ``self.fst``. ``self.range_graph`` is
        assigned only when ``deterministic`` is False.

        Args:
            deterministic: forwarded to the parent constructor; when False,
                extra readings are unioned into ``self.graph`` and a range
                graph is added to the final graph.
        """
        super().__init__(name="cardinal",
                         kind="classify",
                         deterministic=deterministic)

        # Number-name transducer loaded from a prebuilt FAR archive.
        graph = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
        # Input restricted to 2-3 digits, or to a single digit other than "0".
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3)
            | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
        # 1-3 leading digits followed by any number of 3-digit groups, each
        # optionally preceded by a "," that is deleted.
        self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
            pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT +
            NEMO_DIGIT + NEMO_DIGIT)) @ graph

        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        # Digit-by-digit reading: the inverted digit/zero tables (weight 1.2)
        # or "0" read as "oh" (weight 1.1).
        single_digits_graph = pynutil.add_weight(
            pynini.invert(graph_digit | graph_zero), 1.2) | pynutil.add_weight(
                pynini.cross("0", "oh"), 1.1)
        # One or more single digits, separated by an inserted space.
        self.single_digits_graph = single_digits_graph + pynini.closure(
            pynutil.insert(" ") + single_digits_graph)

        if not deterministic:
            # Digit-by-digit reading of comma-grouped numbers: the "," before
            # each 3-digit group is deleted.
            single_digits_graph_with_commas = pynini.closure(
                self.single_digits_graph + pynutil.insert(" "), 1,
                3) + pynini.closure(
                    pynutil.delete(",") + single_digits_graph +
                    pynutil.insert(" ") + single_digits_graph +
                    pynutil.insert(" ") + single_digits_graph,
                    1,
                )
            self.graph |= self.single_digits_graph | get_hundreds_graph(
            ) | single_digits_graph_with_commas
            # "<n>-<m>" read with " to " or a plain space, optionally
            # prefixed with an inserted "from ".
            self.range_graph = (
                pynini.closure(pynutil.insert("from "), 0, 1) + self.graph +
                (pynini.cross("-", " to ") | pynini.cross("-", " ")) +
                self.graph)

            # "<n>x<m>" or "<n> x <m>" read with " by ".
            self.range_graph |= self.graph + (pynini.cross(
                "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
            self.range_graph = self.range_graph.optimize()

        # A leading "-" becomes the tag `negative: "true" `.
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)
        # get_serial_graph() is called only after self.graph and
        # self.single_digits_graph are final.
        # NOTE(review): presumably it reads those attributes - confirm.
        final_graph = self.graph | pynutil.add_weight(self.get_serial_graph(),
                                                      1.2)

        if not deterministic:
            final_graph |= self.range_graph

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + final_graph + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Esempio n. 4
0
 def _get_whitelist_non_deterministic_graph(
         file="data/whitelist_alternatives.tsv"):
     """
     Build a string-map FST from the whitelist alternatives file.

     Each pair loaded from the file is included twice: first fully
     lower-cased (both sides), then exactly as written in the file.

     Args:
         file: path of the TSV file, resolved through ``get_abs_path``.

     Returns:
         The FST produced by ``pynini.string_map`` over all pairs.
     """
     entries = load_labels(get_abs_path(file))
     pairs = [(written.lower(), spoken.lower()) for written, spoken in entries]
     pairs.extend((written, spoken) for written, spoken in entries)
     return pynini.string_map(pairs)
Esempio n. 5
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """
        Build the money classifier FST.

        The currency unit comes first in the input and is followed by either
        an integer amount or a decimal amount. The literal amount "1" takes
        the singular currency name; every other amount takes the plural.

        Args:
            cardinal: cardinal tagger; its ``graph`` reads integer amounts.
            decimal: decimal tagger; its ``final_graph_wo_negative`` reads
                decimal amounts.
        """
        super().__init__(name="money", kind="classify")
        integer_graph = cardinal.graph
        decimal_graph = decimal.final_graph_wo_negative

        currency = pynini.string_file(get_abs_path("data/currency.tsv"))
        currency_plural = convert_space(currency @ SINGULAR_TO_PLURAL)
        currency_singular = convert_space(currency)

        singular_field = pynutil.insert("currency: \"") + currency_singular + pynutil.insert("\"")
        plural_field = pynutil.insert("currency: \"") + currency_plural + pynutil.insert("\"")

        # Everything except the exact string "1" goes through the cardinal graph.
        amount_not_one = (NEMO_SIGMA - "1") @ integer_graph
        plural_amount = (
            plural_field
            + pynutil.insert(" integer_part: \"")
            + amount_not_one
            + pynutil.insert("\"")
        )
        singular_amount = (
            singular_field
            + pynutil.insert(" integer_part: \"")
            + pynini.cross("1", "one")
            + pynutil.insert("\"")
        )
        decimal_amount = plural_field + insert_space + decimal_graph

        final_graph = self.add_tokens(plural_amount | singular_amount | decimal_amount)
        self.fst = final_graph.optimize()
Esempio n. 6
0
    def get_serial_graph(self):
        """
        Finite state transducer for classifying serial.
            A serial mixes letters and numbers, optionally separated by
            "-" or "/", and must contain at least one of each, e.g.:
            c325-b -> tokens { serial { value: "c three two five b" } }

        Returns:
            The serial FST. It is neither wrapped in tokens nor optimized here.
        """
        letters = NEMO_ALPHA
        numbers = self.single_digits_graph if self.deterministic else self.graph

        if not self.deterministic:
            # Also accept the entries of the letter pronunciation table.
            spelled_letters = pynini.string_map(
                load_labels(get_abs_path("data/letter_pronunciation.tsv")))
            letters |= spelled_letters

        # Between components: nothing, "-" or "/" in the input; always one
        # space in the output.
        separator = insert_space | pynini.cross("-", " ") | pynini.cross("/", " ")

        starts_with_letters = pynini.closure(letters + separator, 1) + numbers
        starts_with_numbers = pynini.closure(numbers + separator, 1) + letters
        remaining_parts = pynini.closure(separator + (letters | numbers))
        serial_graph = (starts_with_letters | starts_with_numbers) + remaining_parts

        if not self.deterministic:
            # Optional trailing "s", kept as-is or expanded to "es".
            serial_graph += pynini.closure(
                pynini.accep("s") | pynini.cross("s", "es"), 0, 1)
        return serial_graph
Esempio n. 7
0
    def __init__(self, deterministic: bool = True):
        """
        Build the electronic (e-mail style) verbalizer FST.

        Consumes a serialized ``username: "..."`` field followed by a
        ``domain: "..."`` field. It emits the user name, the inserted word
        "at", and the domain, with spaces inserted between the pieces.

        Args:
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="electronic",
                         kind="verbalize",
                         deterministic=deterministic)
        graph_digit = pynini.invert(
            pynini.string_file(
                get_abs_path("data/numbers/digit.tsv"))).optimize()
        graph_symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).optimize()

        # Strip the field name, optional whitespace and the opening quote,
        # the same way the domain field is unwrapped further down. Inside the
        # quotes, digit and symbol readings (weight 1.09) are preferred over
        # copying the raw character (weight 1.1).
        user_name = (
            pynutil.delete("username:") + delete_space + pynutil.delete("\"") +
            (pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(
                    pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
            pynutil.delete("\""))

        # Fallback domain: characters copied one by one with inserted spaces,
        # around a "." that is read as "dot ".
        domain_default = (pynini.closure(NEMO_NOT_QUOTE + insert_space) +
                          pynini.cross(".", "dot ") + NEMO_NOT_QUOTE +
                          pynini.closure(insert_space + NEMO_NOT_QUOTE))

        # Fallback server name: letters/digits, then optional symbols, then
        # letters/digits again, each followed by an inserted space.
        server_default = (pynini.closure(
            (graph_digit | NEMO_ALPHA) + insert_space, 1) +
                          pynini.closure(graph_symbols + insert_space) +
                          pynini.closure(
                              (graph_digit | NEMO_ALPHA) + insert_space, 1))
        server_common = pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")) + insert_space

        domain_common = pynini.cross(".", "dot ") + pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))

        # Table-driven readings (weight 1.09) win over the fallbacks (1.1).
        domain = (pynutil.delete("domain:") + delete_space +
                  pynutil.delete("\"") +
                  (pynutil.add_weight(server_common, 1.09)
                   | pynutil.add_weight(server_default, 1.1)) +
                  (pynutil.add_weight(domain_common, 1.09)
                   | pynutil.add_weight(domain_default, 1.1)) + delete_space +
                  pynutil.delete("\""))

        graph = user_name + delete_space + pynutil.insert(
            "at ") + delete_space + domain + delete_space

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Esempio n. 8
0
 def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
     """
     Build a string-map FST from a whitelist TSV file.

     Args:
         input_case: when equal to "lower_cased", the left-hand side of every
             pair is lower-cased; any other value keeps the pairs as loaded.
         file: path of the TSV file, resolved through ``get_abs_path``.

     Returns:
         The FST produced by ``pynini.string_map`` over the pairs.
     """
     lowercase_input = input_case == "lower_cased"
     pairs = [
         (written.lower() if lowercase_input else written, spoken)
         for written, spoken in load_labels(get_abs_path(file))
     ]
     return pynini.string_map(pairs)
Esempio n. 9
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """
        Build the measure classifier FST.

        A measure is a decimal or cardinal amount followed by a unit. The
        literal amount "1" takes the singular unit; every other amount takes
        the plural unit. A unit may be followed by, or consist solely of, a
        "/unit" part that is read as "per unit".

        Args:
            cardinal: cardinal tagger; its ``graph`` reads integer amounts.
            decimal: decimal tagger; its ``final_graph_wo_negative`` reads
                decimal amounts.
        """
        super().__init__(name="measure", kind="classify")
        number_graph = cardinal.graph

        units = pynini.string_file(get_abs_path("data/measurements.tsv"))
        units_plural = convert_space(units @ SINGULAR_TO_PLURAL)
        units_singular = convert_space(units)
        optional_negative = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

        # "/unit" -> "per" + non-breaking space + singular unit.
        per_unit = pynini.cross("/", "per") + delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + units_singular
        optional_per_unit = pynini.closure(
            delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + per_unit, 0, 1,
        )

        plural_body = (units_plural + optional_per_unit) | per_unit
        unit_plural = pynutil.insert("units: \"") + plural_body + pynutil.insert("\"")

        singular_body = (units_singular + optional_per_unit) | per_unit
        unit_singular = pynutil.insert("units: \"") + singular_body + pynutil.insert("\"")

        decimal_measure = (
            pynutil.insert("decimal { ")
            + optional_negative
            + decimal.final_graph_wo_negative
            + delete_space
            + pynutil.insert(" } ")
            + unit_plural
        )

        # Shared opening and closing of the cardinal sub-message.
        cardinal_open = pynutil.insert("cardinal { ") + optional_negative + pynutil.insert("integer: \"")
        cardinal_close = delete_space + pynutil.insert("\"") + pynutil.insert(" } ")

        amount_not_one = (NEMO_SIGMA - "1") @ number_graph
        cardinal_plural = cardinal_open + amount_not_one + cardinal_close + unit_plural
        cardinal_singular = cardinal_open + pynini.cross("1", "one") + cardinal_close + unit_singular

        final_graph = decimal_measure | (cardinal_plural | cardinal_singular)
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Esempio n. 10
0
    def __init__(self, cardinal: GraphFst):
        """
        Build the decimal classifier FST.

        Assigns ``self.graph`` (the fractional-part graph),
        ``self.final_graph_wo_negative`` (decimal without the sign tag, with
        or without a quantity) and ``self.fst``.

        Args:
            cardinal: cardinal tagger; its ``graph`` reads the integer part,
                and its ``graph_hundred_component_at_least_one_none_zero_digit``
                is passed on to ``get_quantity``.
        """
        super().__init__(name="decimal", kind="classify")

        integer_graph = cardinal.graph
        hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        # Union of the digit and zero tables, as stored on disk.
        table_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        table_digit |= pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

        # A table entry or "o" standing for "0".
        digit_or_o = table_digit | pynini.cross("o", "0")
        # Two or more such entries, with the spaces between them deleted.
        digit_sequence = digit_or_o + pynini.closure(delete_space + digit_or_o, 1)

        words_to_digits = pynini.cross("zero", "0") | table_digit | digit_sequence
        # Inverted, so the stored graph runs in the opposite direction.
        self.graph = pynini.invert(words_to_digits).optimize()

        point = pynutil.delete(".")

        optional_negative = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

        fractional_field = pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"")
        integer_field = pynutil.insert("integer_part: \"") + integer_graph + pynutil.insert("\"")

        # The integer part is optional; the "." and the fractional part are not.
        optional_integer = pynini.closure(integer_field + pynutil.insert(" "), 0, 1)
        unsigned_decimal = optional_integer + point + pynutil.insert(" ") + fractional_field

        self.final_graph_wo_negative = unsigned_decimal | get_quantity(unsigned_decimal, hundred_component)

        final_graph = self.add_tokens(optional_negative + self.final_graph_wo_negative)
        self.fst = final_graph.optimize()
Esempio n. 11
0
    def __init__(self, input_case: str):
        """
        Build the whitelist classifier FST from ``data/whitelist.tsv``.

        Args:
            input_case: when equal to "lower_cased", the left-hand side of
                every whitelist pair is lower-cased; any other value keeps
                the pairs as loaded.
        """
        super().__init__(name="whitelist", kind="classify")

        lowercase_input = input_case == "lower_cased"
        pairs = [
            (written.lower() if lowercase_input else written, spoken)
            for written, spoken in load_labels(get_abs_path("data/whitelist.tsv"))
        ]

        # Wrap the mapped value in a `name: "..."` field.
        name_field = (
            pynutil.insert("name: \"")
            + convert_space(pynini.string_map(pairs))
            + pynutil.insert("\"")
        )
        self.fst = name_field.optimize()
Esempio n. 12
0
    def __init__(self):
        """
        Build the cardinal classifier FST.

        Assigns ``self.graph``,
        ``self.graph_hundred_component_at_least_one_none_zero_digit`` and
        ``self.fst``.
        """
        super().__init__(name="cardinal", kind="classify")

        # Number-name transducer loaded from a prebuilt FAR archive.
        number_names = pynini.Far(get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

        # Input restricted to 2-3 digits, or to a single digit other than "0".
        two_or_three_digits = pynini.closure(NEMO_DIGIT, 2, 3)
        nonzero_digit = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            two_or_three_digits | nonzero_digit
        ) @ number_names

        # 1-3 leading digits, then any number of 3-digit groups whose optional
        # "," separator is deleted.
        leading_group = pynini.closure(NEMO_DIGIT, 1, 3)
        optional_comma = pynini.closure(pynutil.delete(","), 0, 1)
        grouped_digits = pynini.closure(optional_comma + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT)
        self.graph = (leading_group + grouped_digits) @ number_names

        # A leading "-" becomes the tag `negative: "true" `.
        optional_minus = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
        integer_field = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")

        final_graph = self.add_tokens(optional_minus + integer_field)
        self.fst = final_graph.optimize()
Esempio n. 13
0
    def __init__(self, deterministic: bool = True):
        """
        Build the telephone classifier FST.

        Accepted shape, as assembled below: an optional "+"-prefixed country
        code of 1-3 digits, a 3-3-4 digit number whose first group may be
        wrapped in parentheses, and an optional "-"-prefixed extension of
        1-4 digits. Every digit is read individually.

        Args:
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        add_separator = pynutil.insert(", ")  # between components

        # Inverted digit table, plus "0" read as "o".
        table_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
        digit = table_digit | pynini.cross("0", "o")

        three_digits = pynini.closure(digit + insert_space, 2, 2) + digit
        four_digits = pynini.closure(digit + insert_space, 3, 3) + digit
        optional_dash = pynini.closure(pynutil.delete("-"), 0, 1)

        country_code = (
            pynutil.insert("country_code: \"")
            + pynutil.delete("+")
            + pynini.closure(digit + insert_space, 0, 2)
            + digit
            + pynutil.insert("\"")
        )
        optional_country_code = pynini.closure(country_code + optional_dash + delete_space + insert_space, 0, 1)

        # First group: "ddd-" or "(ddd)" followed by an optional "-" and whitespace.
        area_plain = three_digits + pynutil.delete("-")
        area_in_parens = pynutil.delete("(") + three_digits + pynutil.delete(")") + optional_dash + delete_space

        number_body = (
            (area_plain | area_in_parens)
            + add_separator
            + three_digits
            + pynutil.delete("-")
            + add_separator
            + four_digits
        )
        number_part = pynutil.insert("number_part: \"") + number_body + pynutil.insert("\"")

        extension = (
            pynutil.insert("extension : \"")
            + pynini.closure(digit + insert_space, 0, 3)
            + digit
            + pynutil.insert("\"")
        )
        optional_extension = pynini.closure(insert_space + pynutil.delete("-") + extension, 0, 1)

        graph = optional_country_code + number_part + optional_extension
        self.fst = self.add_tokens(graph).optimize()
Esempio n. 14
0
    # Character classes built on top of NEMO_CHAR / NEMO_ALNUM, which are
    # defined outside this excerpt.
    # Whitespace: space, tab, newline, carriage return and U+00A0.
    NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r",
                                    u"\u00A0").optimize()
    NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
    NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

    # Every character of string.punctuation, escaped for pynini.
    NEMO_PUNCT = pynini.union(
        *map(pynini.escape, string.punctuation)).optimize()
    NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

    # Any string over NEMO_CHAR, including the empty string.
    NEMO_SIGMA = pynini.closure(NEMO_CHAR)

    # delete_space removes zero or more whitespace characters;
    # delete_extra_space replaces one or more of them with a single space.
    delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
    insert_space = pynutil.insert(" ")
    delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")

    # Pluralization rules. Irregular forms come from the suppletive table.
    suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
    # _v = pynini.union("a", "e", "i", "o", "u")
    _c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",
                      "p", "q", "r", "s", "t", "v", "w", "x", "y", "z")
    # <consonant> + "y" -> "ies"
    _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
    # ending in s / sh / ch / x / z -> append "es"
    _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x",
                                    "z") + pynutil.insert("es")
    # anything else -> append "s"
    _s = NEMO_SIGMA + pynutil.insert("s")

    # Rules are nested from most specific (suppletive) to most general (_s).
    # NOTE(review): presumably _priority_union prefers its first argument
    # wherever both arguments accept the input - confirm against pynini.lib.
    graph_plural = plurals._priority_union(
        suppletive,
        plurals._priority_union(_ies,
                                plurals._priority_union(_es, _s, NEMO_SIGMA),
                                NEMO_SIGMA), NEMO_SIGMA).optimize()

    SINGULAR_TO_PLURAL = graph_plural
Esempio n. 15
0
    def __init__(self):
        """
        Build the cardinal classifier FST from the number tables.

        A words -> digits graph is assembled first and then inverted, so that
        ``self.graph`` and
        ``self.graph_hundred_component_at_least_one_none_zero_digit`` run in
        the opposite direction. ``self.fst`` wraps ``self.graph`` in an
        ``integer: "..."`` field with an optional ``negative`` tag.
        """
        super().__init__(name="cardinal", kind="classify")

        # Local helper: deletes exactly one space.
        delete_space = pynutil.delete(" ")
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

        # Output clean-up applied after inversion: drop leading and trailing
        # spaces and pass each run of letters through, with the whitespace
        # between runs handled by delete_extra_space.
        delete_extra_spaces = (
            pynini.closure(pynutil.delete(" ")) +
            pynini.closure(pynini.closure(NEMO_ALPHA, 1) + delete_extra_space)
            + pynini.closure(NEMO_ALPHA, 1) +
            pynini.closure(pynutil.delete(" ")))

        graph_hundred = pynutil.delete("hundred")

        # Hundreds position: "<digit> hundred", or an inserted "0".
        graph_hundred_component = pynini.union(
            graph_digit + delete_space + graph_hundred + delete_space,
            pynutil.insert("0"))
        # Tens and units positions: a teen, an inserted "00", or ties + digit
        # with "0" inserted for whichever of the two is missing.
        graph_hundred_component += pynini.union(
            graph_teen | pynutil.insert("00"),
            (graph_ties + delete_space | pynutil.insert("0")) +
            (graph_digit | pynutil.insert("0")),
        )

        #  string -> all 3 digit numbers apart from 000
        graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
            pynini.closure(NEMO_DIGIT) +
            (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))

        # all 3 digit numbers apart from 0 -> string
        # (leading zeros are removed before the inversion)
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.invert(
                graph_hundred_component_at_least_one_none_zero_digit
                @ (pynutil.delete(pynini.closure("0")) + pynini.difference(
                    NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)))
            @ delete_extra_spaces).optimize()

        # Optional "," inserted after a 3-digit group.
        insert_comma = pynini.closure(pynutil.insert(","), 0, 1)

        # Each magnitude group below follows the same pattern: a non-zero
        # hundred component followed by the magnitude word, or an inserted
        # "000" (weight 0.1) when the group is absent; then the optional comma.
        graph_thousands = (pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("thousand"),
            pynutil.insert("000", weight=0.1),
        ) + insert_comma)

        graph_million = (pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("million"),
            pynutil.insert("000", weight=0.1),
        ) + insert_comma)
        graph_billion = (pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("billion"),
            pynutil.insert("000", weight=0.1),
        ) + insert_comma)
        graph_trillion = (pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("trillion"),
            pynutil.insert("000", weight=0.1),
        ) + insert_comma)
        graph_quadrillion = (pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("quadrillion"),
            pynutil.insert("000", weight=0.1),
        ) + insert_comma)
        graph_quintillion = (pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("quintillion"),
            pynutil.insert("000", weight=0.1),
        ) + insert_comma)
        graph_sextillion = (pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("sextillion"),
            pynutil.insert("000", weight=0.1),
        ) + insert_comma)

        # Full number: all magnitude groups from largest to smallest, or the
        # zero table on its own.
        graph = pynini.union(
            graph_sextillion + delete_space + graph_quintillion +
            delete_space + graph_quadrillion + delete_space + graph_trillion +
            delete_space + graph_billion + delete_space + graph_million +
            delete_space + graph_thousands + delete_space +
            graph_hundred_component,
            graph_zero,
        )

        # Strip leading "0"s and ","s so the digit string starts with a
        # non-zero digit; a lone "0" is passed through unchanged.
        graph = graph @ pynini.union(
            pynini.closure(pynutil.delete(pynini.union("0", ","))) +
            pynini.difference(NEMO_DIGIT, "0") +
            pynini.closure(pynini.union(NEMO_DIGIT, ",")),
            "0",
        )

        # Invert the words -> digits graph and clean up spacing in its output.
        self.graph = pynini.invert(graph) @ delete_extra_spaces
        self.graph = self.graph.optimize()

        # A leading "-" becomes the tag `negative: "true" `.
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + self.graph + pynutil.insert("\"")

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Esempio n. 16
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
        """
        Build the measure classifier FST.

        Covers an amount followed by a unit (decimal, cardinal or fraction
        amounts) as well as dash-joined forms: amount-word and word-amount.
        The literal cardinal amount "1" takes the singular unit; every other
        amount takes the plural unit.

        Args:
            cardinal: cardinal tagger; ``graph`` is used, plus ``range_graph``
                when ``deterministic`` is False.
            decimal: decimal tagger; ``final_graph_wo_negative`` is used.
            fraction: fraction tagger; ``graph`` is used.
            deterministic: forwarded to the parent constructor; when False the
                cardinal range graph is accepted as an amount too.
        """
        super().__init__(name="measure", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph

        if not deterministic:
            cardinal_graph |= cardinal.range_graph

        units = pynini.string_file(get_abs_path("data/measurements.tsv"))
        units_plural = convert_space(units @ SINGULAR_TO_PLURAL)
        units_singular = convert_space(units)
        optional_negative = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

        # "/unit" -> "per" + non-breaking space + singular unit.
        per_unit = pynini.cross("/", "per") + delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + units_singular
        optional_per_unit = pynini.closure(
            delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + per_unit, 0, 1,
        )

        plural_body = (units_plural + optional_per_unit) | per_unit
        unit_plural = pynutil.insert("units: \"") + plural_body + pynutil.insert("\"")

        singular_body = (units_singular + optional_per_unit) | per_unit
        unit_singular = pynutil.insert("units: \"") + singular_body + pynutil.insert("\"")

        decimal_measure = (
            pynutil.insert("decimal { ")
            + optional_negative
            + decimal.final_graph_wo_negative
            + delete_space
            + pynutil.insert(" } ")
            + unit_plural
        )

        # Shared opening and closing of the cardinal sub-message.
        cardinal_open = pynutil.insert("cardinal { ") + optional_negative + pynutil.insert("integer: \"")
        cardinal_close = delete_space + pynutil.insert("\"") + pynutil.insert(" } ")

        amount_not_one = (NEMO_SIGMA - "1") @ cardinal_graph
        cardinal_plural = cardinal_open + amount_not_one + cardinal_close + unit_plural
        cardinal_singular = cardinal_open + pynini.cross("1", "one") + cardinal_close + unit_singular
        cardinal_measure = cardinal_plural | cardinal_singular

        # Dash-joined forms: the "-" is removed and the alphabetic word is
        # stored as the unit.
        drop_dash = pynini.cross('-', '')
        alpha_word = pynini.closure(NEMO_ALPHA, 1)

        cardinal_dash_alpha = (
            pynutil.insert("cardinal { integer: \"")
            + cardinal_graph
            + drop_dash
            + pynutil.insert("\" } units: \"")
            + alpha_word
            + pynutil.insert("\"")
        )

        alpha_dash_cardinal = (
            pynutil.insert("units: \"")
            + alpha_word
            + drop_dash
            + pynutil.insert("\"")
            + pynutil.insert(" cardinal { integer: \"")
            + cardinal_graph
            + pynutil.insert("\" } preserve_order: true")
        )

        decimal_dash_alpha = (
            pynutil.insert("decimal { ")
            + decimal.final_graph_wo_negative
            + drop_dash
            + pynutil.insert(" } units: \"")
            + alpha_word
            + pynutil.insert("\"")
        )

        alpha_dash_decimal = (
            pynutil.insert("units: \"")
            + alpha_word
            + drop_dash
            + pynutil.insert("\"")
            + pynutil.insert(" decimal { ")
            + decimal.final_graph_wo_negative
            + pynutil.insert(" } preserve_order: true")
        )

        fraction_measure = (
            pynutil.insert("fraction { ") + fraction.graph + delete_space + pynutil.insert(" } ") + unit_plural
        )

        final_graph = (
            decimal_measure
            | cardinal_measure
            | cardinal_dash_alpha
            | alpha_dash_cardinal
            | decimal_dash_alpha
            | alpha_dash_decimal
            | fraction_measure
        )
        self.fst = self.add_tokens(final_graph).optimize()
Esempio n. 17
0
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        """
        Build the date classifier FST.

        Four layouts are assembled below: month-day-year (month name or
        numeric month), day-month-year, year-month-day (numeric), and a
        standalone year.

        Args:
            cardinal: cardinal tagger; its
                ``graph_hundred_component_at_least_one_none_zero_digit`` reads
                the day number.
            deterministic: forwarded to the parent constructor and to
                ``_get_year_graph``.
        """
        super().__init__(name="date",
                         kind="classify",
                         deterministic=deterministic)

        month_graph = pynini.string_file(
            get_abs_path("data/months/names.tsv")).optimize()
        # Also accept inputs whose first character is passed through TO_LOWER.
        # NOTE(review): TO_LOWER presumably maps upper case to lower case - confirm.
        month_graph |= (TO_LOWER + pynini.closure(NEMO_CHAR)) @ month_graph
        month_abbr_graph = pynini.string_file(
            get_abs_path("data/months/abbr.tsv")).optimize()
        # Abbreviations get the same TO_LOWER variant, plus an optional
        # trailing "." that is deleted.
        month_abbr_graph = (month_abbr_graph |
                            (TO_LOWER + pynini.closure(NEMO_CHAR))
                            @ month_abbr_graph) + pynini.closure(
                                pynutil.delete("."), 0, 1)
        month_graph |= month_abbr_graph

        month_numbers_graph = pynini.string_file(
            get_abs_path("data/months/numbers.tsv")).optimize()

        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        year_graph = _get_year_graph(deterministic)

        # The standalone year uses the raw year graph with a small extra
        # weight; year_graph itself is wrapped in its tag only further down.
        YEAR_WEIGHT = 0.001
        year_graph_standalone = (pynutil.insert("year: \"") +
                                 pynutil.add_weight(year_graph, YEAR_WEIGHT) +
                                 pynutil.insert("\""))

        # From here on month_graph / month_numbers_graph carry the
        # `month: "..."` tag.
        month_graph = pynutil.insert(
            "month: \"") + month_graph + pynutil.insert("\"")
        month_numbers_graph = pynutil.insert(
            "month: \"") + month_numbers_graph + pynutil.insert("\"")

        # Day: one digit, or "1"/"2"/"3" followed by a digit. `@` binds
        # tighter than `+`, so only the digit pattern is composed with
        # cardinal_graph.
        day_graph = (pynutil.insert("day: \"") +
                     ((pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT)
                     @ cardinal_graph + pynutil.insert("\""))
        optional_day_graph = pynini.closure(delete_extra_space + day_graph, 0,
                                            1)

        # From here on year_graph carries the `year: "..."` tag.
        year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert(
            "\"")
        optional_graph_year = pynini.closure(
            delete_extra_space + year_graph,
            0,
            1,
        )
        # Month name, optional day, optional ",", optional year.
        graph_mdy = (month_graph + optional_day_graph + delete_space +
                     pynini.closure(pynutil.delete(","), 0, 1) +
                     optional_graph_year)

        # Numeric month<sep>day<sep>year, where <sep> is "-", "/" or "." and
        # one leading "0" of the day is dropped.
        delete_sep = pynutil.delete(pynini.union("-", "/", "."))
        graph_mdy |= (month_numbers_graph + delete_sep + insert_space +
                      pynini.closure(pynutil.delete("0"), 0, 1) + day_graph +
                      delete_sep + insert_space + year_graph)

        graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
        # Numeric year<sep>month<sep>day.
        graph_ymd = (year_graph + delete_sep + insert_space +
                     month_numbers_graph + delete_sep + insert_space +
                     pynini.closure(pynutil.delete("0"), 0, 1) + day_graph)

        # Only the mdy and dmy layouts are tagged with preserve_order.
        final_graph = (graph_mdy
                       | graph_dmy) + pynutil.insert(" preserve_order: true")
        final_graph |= graph_ymd | year_graph_standalone
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Esempio n. 18
0
    NEMO_CHAR,
    NEMO_DIGIT,
    NEMO_SIGMA,
    TO_LOWER,
    GraphFst,
    delete_extra_space,
    delete_space,
    insert_space,
)

# Optional dependency guard: build the module-level number graphs when pynini
# can be imported, otherwise fall back to placeholders and record that pynini
# is unavailable.
try:
    import pynini
    from pynini.lib import pynutil

    graph_teen = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/teen.tsv"))).optimize()
    graph_digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    ties_graph = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/ties.tsv"))).optimize()

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Add placeholders for global variables
    graph_teen = None
    graph_digit = None
    ties_graph = None

    # The import failed, so the flag must report pynini as unavailable.
    PYNINI_AVAILABLE = False

Esempio n. 19
0
from nemo_text_processing.text_normalization.graph_utils import (
    NEMO_CHAR,
    NEMO_DIGIT,
    NEMO_SIGMA,
    TO_LOWER,
    GraphFst,
    delete_extra_space,
    delete_space,
    insert_space,
)

# Optional dependency guard: build the module-level number graphs when pynini
# can be imported, otherwise fall back to placeholders and record that pynini
# is unavailable.
try:
    import pynini
    from pynini.lib import pynutil

    graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/numbers/teen.tsv"))).optimize()
    graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/numbers/ties.tsv"))).optimize()

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Add placeholders for global variables
    graph_teen = None
    graph_digit = None
    ties_graph = None

    # The import failed, so the flag must report pynini as unavailable.
    PYNINI_AVAILABLE = False


def _get_ties_graph():
    """