Beispiel #1
0
    def __init__(self):
        """Build the "electronic" classifier for spoken e-mail addresses and URLs.

        Two alternatives are unioned into the final graph:

        * an e-mail address: a username, the word "arroba", then a server and
          a domain joined by the word "punto" (rewritten to ".");
        * a URL: an optional spoken protocol, a spoken "www", the word
          "punto", a server part and one or more endings.

        The result is wrapped with ``self.add_tokens`` and stored, optimized,
        in ``self.fst``.
        """
        super().__init__(name="electronic", kind="classify")

        # Local deleter for exactly one space between spoken tokens. It is
        # deliberately local: every use below refers to this single-space
        # version.
        drop_space = pynutil.delete(" ")

        digit_words = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        zero_words = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        alnum = NEMO_ALPHA | digit_words | zero_words

        # The symbols table is used in the inverted direction.
        symbol_graph = pynini.invert(
            pynini.string_file(get_abs_path("data/electronic/symbols.tsv")))

        username_char = alnum | symbol_graph
        spoken_dot = pynini.cross("punto", ".")

        # ---- e-mail -------------------------------------------------------
        # A username starts and ends with an alphanumeric; anything accepted
        # as a username character may appear in between.
        username_body = (alnum + drop_space +
                         pynini.closure(username_char + drop_space) + alnum)
        username = (pynutil.insert("username: \"") + username_body +
                    pynutil.insert("\""))

        # One or more alphanumerics, each separated by a single space.
        alnum_run = pynini.closure(alnum + drop_space) + alnum

        known_servers = pynini.invert(
            pynini.string_file(
                get_abs_path("data/electronic/server_name.tsv")))
        server = alnum_run | known_servers

        known_domains = pynini.invert(
            pynini.string_file(get_abs_path("data/electronic/domain.tsv")))
        domain = alnum_run | known_domains

        domain_body = (server + drop_space + spoken_dot + drop_space + domain)
        domain_graph = (pynutil.insert("domain: \"") + domain_body +
                        pynutil.insert("\""))

        email_graph = (username + drop_space + pynutil.delete("arroba") +
                       insert_space + drop_space + domain_graph)

        # ---- url ----------------------------------------------------------
        www = pynini.cross(
            pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www")

        http = pynini.cross(
            pynini.union("http", "h t t p", "hache te te pe"), "http")
        https = pynini.cross(
            pynini.union("https", "h t t p s", "hache te te pe ese"), "https")
        scheme = (http | https) + pynini.cross(" dos puntos barra barra ",
                                               "://")

        # e.g. .com, .es
        ending_tail = (domain
                       | pynini.closure(username_char + drop_space) +
                       username_char)
        ending = drop_space + symbol_graph + drop_space + ending_tail

        host = pynini.closure(drop_space + username_char, 1) | server
        url_body = (pynini.closure(scheme, 0, 1) + www + drop_space +
                    spoken_dot + drop_space + host + pynini.closure(ending, 1))
        url_graph = (pynutil.insert("protocol: \"") + url_body +
                     pynutil.insert("\""))

        tagged = self.add_tokens(email_graph | url_graph)
        self.fst = tagged.optimize()
Beispiel #2
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """Build the "measure" classifier: a number followed by a unit.

        Args:
            cardinal: cardinal tagger; its ``graph_no_exception`` is used for
                the integer part.
            decimal: decimal tagger; its ``final_graph_wo_negative`` is used
                for decimal quantities.

        Decimal quantities and cardinals other than "un"/"una"/"uno" take the
        plural unit table; "un"/"una"/"uno" (rewritten to "1") take the
        singular one. The tagged graph is stored, optimized, in ``self.fst``.
        """
        super().__init__(name="measure", kind="classify")

        cardinal_graph = cardinal.graph_no_exception

        # After inversion the tables map the spoken unit to its abbreviation.
        singular_to_abbr = pynini.invert(
            pynini.string_file(
                get_abs_path("data/measurements_singular.tsv")))
        plural_to_abbr = pynini.invert(
            pynini.string_file(get_abs_path("data/measurements_plural.tsv")))

        # Optional leading "menos" becomes: negative: "true"
        minus_word = pynini.cross("menos", "\"true\"")
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + minus_word + delete_extra_space,
            0, 1)

        singular_body = convert_space(singular_to_abbr)
        plural_body = convert_space(plural_to_abbr)
        # "por <singular unit>" is rendered as "/<abbr>".
        per_unit = (pynutil.insert("/") + pynutil.delete("por") +
                    delete_space + convert_space(singular_to_abbr))

        def units_field(body):
            # Plain unit, bare "per" unit, or unit followed by a "per" unit
            # (the compound carries a small extra weight).
            compound = pynutil.add_weight(body + delete_space + per_unit, 0.01)
            return (pynutil.insert("units: \"") +
                    (body | per_unit | compound) + pynutil.insert("\""))

        unit_singular = units_field(singular_body)
        unit_plural = units_field(plural_body)

        def cardinal_measure(integer_graph, unit_graph):
            # cardinal { [negative] integer: "<n>" } <units field>
            return (pynutil.insert("cardinal { ") + optional_graph_negative +
                    pynutil.insert("integer: \"") + integer_graph +
                    pynutil.insert("\"") + pynutil.insert(" }") +
                    delete_extra_space + unit_graph)

        subgraph_decimal = (pynutil.insert("decimal { ") +
                            optional_graph_negative +
                            decimal.final_graph_wo_negative +
                            pynutil.insert(" }") + delete_extra_space +
                            unit_plural)

        not_one = (NEMO_SIGMA - "un" - "una" - "uno") @ cardinal_graph
        spoken_one = pynini.union(
            pynini.cross("un", "1"),
            pynini.cross("una", "1"),
            pynini.cross("uno", "1"),
        )
        subgraph_cardinal = (cardinal_measure(not_one, unit_plural)
                             | cardinal_measure(spoken_one, unit_singular))

        tagged = self.add_tokens(subgraph_decimal | subgraph_cardinal)
        self.fst = tagged.optimize()
Beispiel #3
0
    def __init__(self):
        """Build the "whitelist" classifier from ``data/whitelist.tsv``.

        The table is inverted, passed through ``convert_space`` and wrapped
        in a ``name: "..."`` field. The optimized graph is stored in
        ``self.fst`` as is (``add_tokens`` is not applied here).
        """
        super().__init__(name="whitelist", kind="classify")

        entries = pynini.invert(
            pynini.string_file(get_abs_path("data/whitelist.tsv")))
        open_field = pynutil.insert("name: \"")
        close_field = pynutil.insert("\"")

        name_graph = open_field + convert_space(entries) + close_field
        self.fst = name_graph.optimize()
Beispiel #4
0
    def __init__(self):
        """Build the "date" classifier for "<day> de <month>".

        The day is a spoken number whose written form is 1..31 (or the word
        "primero", rewritten to "1"); the month comes from
        ``data/months.tsv``. The output carries ``preserve_order: true`` and
        is stored, tagged and optimized, in ``self.fst``.
        """
        super().__init__(name="date", kind="classify")

        digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        twenties = pynini.string_file(
            get_abs_path("data/numbers/twenties.tsv"))

        # Spoken numbers below one hundred: a bare tens word gets a trailing
        # "0"; "<tens> y <digit>" drops the " y " connector.
        round_tens = ties + pynutil.insert("0")
        tens_and_digit = ties + pynutil.delete(" y ") + digit
        below_hundred = digit | twenties | teen | round_tens | tens_and_digit

        # Keep only the results that are valid days of the month.
        valid_days = pynini.union(*map(str, range(1, 32)))
        # can use "primero" for 1st day of the month
        day_number = (below_hundred @ valid_days) | pynini.cross(
            "primero", "1")

        day_graph = (pynutil.insert("day: \"") + day_number +
                     pynutil.insert("\""))

        month_names = pynini.string_file(get_abs_path("data/months.tsv"))
        month_graph = (pynutil.insert("month: \"") + month_names +
                       pynutil.insert("\""))

        day_of_month = (day_graph + delete_space + pynutil.delete("de") +
                        delete_extra_space + month_graph)

        ordered = day_of_month + pynutil.insert(" preserve_order: true")
        self.fst = self.add_tokens(ordered).optimize()
Beispiel #5
0
    def __init__(self, cardinal: GraphFst):
        """Build the "decimal" classifier.

        Args:
            cardinal: cardinal tagger providing ``numbers_up_to_thousand``,
                ``numbers_up_to_million`` and ``graph_no_exception``.

        Attributes set:
            graph: the fractional-part graph (one or more cardinals below one
                thousand).
            final_graph_wo_negative: unsigned decimal, optionally extended
                by ``get_quantity``.
            fst: the tagged, optimized classifier.
        """
        super().__init__(name="decimal", kind="classify")

        # number after decimal point can be any series of cardinals <1000, including 'zero'
        small_number = cardinal.numbers_up_to_thousand
        fraction_digits = pynini.closure(small_number +
                                         delete_space) + small_number
        self.graph = fraction_digits

        # decimal point can be denoted by 'coma' or 'punto'
        decimal_point = pynini.union(
            pynini.cross("coma", "morphosyntactic_features: \",\""),
            pynini.cross("punto", "morphosyntactic_features: \".\""),
        )

        # Optional leading "menos" becomes: negative: "true"
        sign = (pynutil.insert("negative: ") +
                pynini.cross("menos", "\"true\"") + delete_extra_space)
        optional_graph_negative = pynini.closure(sign, 0, 1)

        graph_fractional = (pynutil.insert("fractional_part: \"") +
                            fraction_digits + pynutil.insert("\""))

        # NOTE(review): other taggers in this collection load
        # "data/numbers/zero.tsv"; confirm this "es/" path exists.
        zero_graph = pynini.string_file(
            get_abs_path("data/numbers/es/zero.tsv"))
        integer_words = cardinal.graph_no_exception | zero_graph
        graph_integer = (pynutil.insert("integer_part: \"") + integer_words +
                         pynutil.insert("\""))

        # The integer part is optional.
        optional_integer = pynini.closure(graph_integer + delete_extra_space,
                                          0, 1)
        unsigned = (optional_integer + decimal_point + delete_extra_space +
                    graph_fractional)

        self.final_graph_wo_negative = unsigned | get_quantity(
            unsigned, cardinal.numbers_up_to_million)

        signed = optional_graph_negative + unsigned
        signed_with_quantity = optional_graph_negative + get_quantity(
            unsigned, cardinal.numbers_up_to_million)

        tagged = self.add_tokens(signed | signed_with_quantity)
        self.fst = tagged.optimize()
Beispiel #6
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """Build the "money" classifier.

        Args:
            cardinal: cardinal tagger; ``graph_no_exception`` supplies the
                integer amounts and the cents.
            decimal: decimal tagger; ``final_graph_wo_negative`` supplies
                decimal amounts.

        Emitted fields: integer_part, fractional_part, currency and, where
        cents are present, a morphosyntactic_features marker holding a comma.
        The tagged, optimized graph is stored in ``self.fst``.
        """
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency

        cardinal_graph = cardinal.graph_no_exception
        graph_decimal_final = decimal.final_graph_wo_negative

        # Currency tables are used in the inverted direction.
        singular_currency = pynini.invert(
            pynini.string_file(get_abs_path("data/currency_singular.tsv")))
        plural_currency = pynini.invert(
            pynini.string_file(get_abs_path("data/currency_plural.tsv")))

        graph_unit_singular = (pynutil.insert("currency: \"") +
                               convert_space(singular_currency) +
                               pynutil.insert("\""))
        graph_unit_plural = (pynutil.insert("currency: \"") +
                             convert_space(plural_currency) +
                             pynutil.insert("\""))

        # Two digits pass through; a single digit gets a leading "0".
        two_digits = NEMO_DIGIT + NEMO_DIGIT
        padded_digit = pynutil.insert("0") + NEMO_DIGIT
        add_leading_zero_to_double_digit = two_digits | padded_digit

        # twelve dollars (and) fifty cents, zero cents
        plural_cents = (pynutil.add_weight(
            (NEMO_SIGMA - "un") @ cardinal_graph, -0.7)
                        @ add_leading_zero_to_double_digit)
        plural_cents = (plural_cents + delete_space +
                        pynutil.delete(pynini.union("centavos", "céntimos")))
        single_cent = (pynini.cross("un", "01") + delete_space +
                       pynutil.delete(pynini.union("centavo", "céntimo")))
        cents_standalone = (
            # always use a comma in the decimal
            pynutil.insert("morphosyntactic_features: \",\"") + insert_space +
            pynutil.insert("fractional_part: \"") +
            pynini.union(plural_cents, single_cent) + pynutil.insert("\""))

        # Optional connector word ("con" or "y") before the cents.
        connector_standalone = pynini.closure(
            (pynutil.delete("con") | pynutil.delete("y")) + delete_space, 0, 1)
        optional_cents_standalone = pynini.closure(
            delete_space + connector_standalone + insert_space +
            cents_standalone, 0, 1)

        # twelve dollars fifty, only after integer
        # setenta y cinco dólares con sesenta y tres~$75,63
        connector_suffix = pynini.closure(
            (pynutil.delete("con") | pynutil.delete("y")) + delete_space, 0, 1)
        bare_cents = pynutil.add_weight(
            cardinal_graph @ add_leading_zero_to_double_digit, -0.7)
        optional_cents_suffix = pynini.closure(
            delete_extra_space +
            # always use a comma in the decimal
            pynutil.insert("morphosyntactic_features: \",\"") + insert_space +
            pynutil.insert("fractional_part: \"") + connector_suffix +
            bare_cents + pynutil.insert("\""), 0, 1)

        # Amounts other than "un"/"una" pair with the plural currency name.
        plural_amount = (NEMO_SIGMA - "un" - "una") @ cardinal_graph
        integer_plural = (pynutil.insert("integer_part: \"") + plural_amount +
                          pynutil.insert("\"") + delete_extra_space +
                          graph_unit_plural +
                          (optional_cents_standalone | optional_cents_suffix))

        # "un"/"una" pair with the singular currency name.
        singular_amount = pynini.cross("un", "1") | pynini.cross("una", "1")
        integer_singular = (
            pynutil.insert("integer_part: \"") + singular_amount +
            pynutil.insert("\"") + delete_extra_space + graph_unit_singular +
            (optional_cents_standalone | optional_cents_suffix))

        graph_integer = integer_plural | integer_singular

        # Decimal amount with a plural currency, or cents on their own
        # (tagged with a "$" currency and an integer part of "0").
        decimal_amount = (graph_decimal_final + delete_extra_space +
                          graph_unit_plural)
        cents_only = pynutil.insert(
            "currency: \"$\" integer_part: \"0\" ") + cents_standalone
        graph_decimal = decimal_amount | cents_only

        tagged = self.add_tokens(graph_integer | graph_decimal)
        self.fst = tagged.optimize()
Beispiel #7
0
    def __init__(self):
        """Build the "cardinal" classifier for spoken Spanish numbers.

        The number is assembled as a fixed-width digit string, three digits
        per magnitude group (trillones, billones, millones, thousands,
        hundreds); leading zeros are stripped afterwards.

        Attributes set:
            graph_hundred_component_at_least_one_none_zero_digit: three-digit
                group that contains at least one non-zero digit.
            graph_no_exception: the full cardinal graph.
            numbers_up_to_thousand: ``graph_no_exception`` restricted to
                outputs of one to three digits (used by DecimalFst).
            numbers_up_to_million: same, restricted to one to six digits.
            graph: ``graph_no_exception`` minus the inputs found in the
                digit and zero tables.
            fst: the tagged, optimized classifier.
        """
        super().__init__(name="cardinal", kind="classify")
        # Lookup tables; presumably each maps a spoken word to its digit(s)
        # -- confirm against the TSV files.
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_twenties = pynini.string_file(
            get_abs_path("data/numbers/twenties.tsv"))
        graph_hundreds = pynini.string_file(
            get_abs_path("data/numbers/hundreds.tsv"))

        # One three-digit group: a hundreds word (or an inserted "0"),
        # followed by either a twenties/teen word (or an inserted "00"), or a
        # tens word and a digit word, each replaced by "0" when absent.
        graph_hundred_component = graph_hundreds | pynutil.insert("0")
        graph_hundred_component += delete_space
        graph_hundred_component += pynini.union(
            graph_twenties | graph_teen | pynutil.insert("00"),
            (graph_ties | pynutil.insert("0")) + delete_space +
            (graph_digit | pynutil.insert("0")),
        )

        # Same group, filtered so that the output has at least one digit
        # other than "0".
        graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
            pynini.closure(NEMO_DIGIT) +
            (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            graph_hundred_component_at_least_one_none_zero_digit)

        # Thousands group: "<group> mil", bare "mil" (-> "001"), or nothing
        # (-> "000", at a small weight penalty).
        graph_thousands = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("mil"),
            pynutil.insert("001") +
            pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
            pynutil.insert("000", weight=0.1),
        )

        # Millions group: "<group> millones|millón", or a bare "millones"
        # contributing "000".
        graph_millones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space +
            (pynutil.delete("millones") | pynutil.delete("millón")),
            pynutil.insert("000") +
            pynutil.delete("millones"),  # to allow for 'mil millones'
        )

        # Thousands-of-millions: a "mil" prefix followed by a millions group;
        # when absent, six zeros are inserted at a small weight penalty.
        graph_mil_millones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("mil"),
            pynutil.insert("001") +
            pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
        )
        graph_mil_millones += delete_space + (
            graph_millones | pynutil.insert("000") + pynutil.delete("millones")
        )  # allow for 'mil millones'
        graph_mil_millones |= pynutil.insert("000000", weight=0.1)

        # also allow 'millardo' instead of 'mil millones'
        graph_millardo = (
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space +
            (pynutil.delete("millardo") | pynutil.delete("millardos")))

        # Billones group; the pattern mirrors the millions group above.
        graph_billones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space +
            (pynutil.delete("billones") | pynutil.delete("billón")), )

        graph_mil_billones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("mil"),
            pynutil.insert("001") +
            pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
        )
        graph_mil_billones += delete_space + (
            graph_billones | pynutil.insert("000") + pynutil.delete("billones")
        )  # allow for 'mil billones'
        graph_mil_billones |= pynutil.insert("000000", weight=0.1)

        # Trillones group; same pattern again.
        graph_trillones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space +
            (pynutil.delete("trillones") | pynutil.delete("trillón")), )

        graph_mil_trillones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("mil"),
            pynutil.insert("001") +
            pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
        )
        graph_mil_trillones += delete_space + (
            graph_trillones | pynutil.insert("000") +
            pynutil.delete("trillones"))  # allow for 'mil trillones'
        graph_mil_trillones |= pynutil.insert("000000", weight=0.1)

        # Concatenate the groups from largest to smallest magnitude; the
        # zero table is a separate alternative.
        graph = pynini.union(
            (graph_mil_trillones
             | pynutil.insert("000", weight=0.1) + graph_trillones) +
            delete_space +
            (graph_mil_billones
             | pynutil.insert("000", weight=0.1) + graph_billones) +
            delete_space + pynini.union(
                graph_mil_millones,
                pynutil.insert("000", weight=0.1) + graph_millones,
                graph_millardo + graph_millones,
                graph_millardo + pynutil.insert("000", weight=0.1),
            ) + delete_space + graph_thousands + delete_space +
            graph_hundred_component,
            graph_zero,
        )

        # Strip leading zeros from the digit string; a lone "0" is kept.
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(
                NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0")

        # ignore "y" inside cardinal numbers
        # (the "y" is deleted only between two NEMO_SPACE contexts, and the
        # input must begin with a NEMO_ALPHA symbol)
        graph = (
            pynini.cdrewrite(pynutil.delete("y"), NEMO_SPACE, NEMO_SPACE,
                             NEMO_SIGMA) @ (NEMO_ALPHA + NEMO_SIGMA) @ graph)

        self.graph_no_exception = graph

        # save self.numbers_up_to_thousand for use in DecimalFst
        # (outputs of one, two or three digits)
        digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
        numbers_up_to_thousand = pynini.compose(
            graph, digits_up_to_thousand).optimize()
        self.numbers_up_to_thousand = numbers_up_to_thousand

        # save self.numbers_up_to_million for use in DecimalFst
        # (outputs of one to six digits)
        digits_up_to_million = (NEMO_DIGIT
                                | (NEMO_DIGIT**2)
                                | (NEMO_DIGIT**3)
                                | (NEMO_DIGIT**4)
                                | (NEMO_DIGIT**5)
                                | (NEMO_DIGIT**6))
        numbers_up_to_million = pynini.compose(
            graph, digits_up_to_million).optimize()
        self.numbers_up_to_million = numbers_up_to_million

        # don't convert cardinals from zero to nine inclusive
        graph_exception = pynini.project(pynini.union(graph_digit, graph_zero),
                                         'input')

        # Remove the excepted inputs from the graph's input language, then
        # compose the remainder back onto the graph.
        self.graph = (pynini.project(graph, "input") -
                      graph_exception.arcsort()) @ graph

        # Optional leading "menos" is emitted as: negative: "-" ; the
        # NEMO_SPACE that follows it is passed through unchanged.
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("menos", "\"-\"") +
            NEMO_SPACE, 0, 1)

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + self.graph + pynutil.insert("\"")

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Beispiel #8
0
    def __init__(self):
        """Build the "telephone" classifier for 8-, 9- and 10-digit numbers.

        The patterns are first written in the digits-to-words direction
        (for example "0" maps to "cero"), with "-" separating the digit
        groups, and the whole union is inverted at the end so that the final
        graph reads words and emits a ``number_part: "..."`` field. The
        tagged, optimized graph is stored in ``self.fst``.
        """
        super().__init__(name="telephone", kind="classify")

        # create `single_digits` and `double_digits` graphs as these will be
        # the building blocks of possible telephone numbers
        digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        twenties = pynini.string_file(
            get_abs_path("data/numbers/twenties.tsv"))

        single_digits = pynini.invert(digit).optimize() | pynini.cross(
            "0", "cero")

        round_tens = ties + pynutil.insert("0")
        tens_and_digit = (ties + delete_space + pynutil.delete("y") +
                          delete_space + digit)
        double_digits = pynini.invert(
            pynini.union(twenties, teen, round_tens, tens_and_digit))

        dash = pynutil.delete("-")

        def singles(count):
            # exactly `count` single digits, each followed by a space
            return pynini.closure(single_digits + insert_space, count, count)

        # Recurring spoken groupings: one single + one double (3 digits),
        # and two doubles (4 digits).
        one_plus_two = single_digits + insert_space + double_digits
        two_plus_two = double_digits + insert_space + double_digits

        # define `ten_digit_graph`, `nine_digit_graph`, `eight_digit_graph`
        # which cover telephone numbers spoken (1) only with single digits,
        # or (2) spoken with double digits (and sometimes single digits)

        # 10 digits, option (1): 3 + 3 + 4 single digits
        ten_all_single = (singles(3) + dash + singles(3) + dash + singles(3) +
                          single_digits)
        # 10 digits, option (2): (1+2) + (1+2) + (2+2) digits
        ten_grouped = (one_plus_two + insert_space + dash + one_plus_two +
                       insert_space + dash + two_plus_two)
        ten_digit_graph = ten_all_single | ten_grouped

        # 9 digits, option (1): 3 + 3 + 3 single digits
        nine_all_single = (singles(3) + dash + singles(3) + dash +
                           singles(2) + single_digits)
        # 9 digits, option (2): (1+2) + (1+2) + (1+2) digits
        nine_grouped = (one_plus_two + insert_space + dash + one_plus_two +
                        insert_space + dash + one_plus_two)
        nine_digit_graph = nine_all_single | nine_grouped

        # 8 digits, option (1): 4 + 4 single digits
        eight_all_single = singles(4) + dash + singles(3) + single_digits
        # 8 digits, option (2): (2+2) + (2+2) digits
        eight_grouped = two_plus_two + insert_space + dash + two_plus_two
        eight_digit_graph = eight_all_single | eight_grouped

        written_to_spoken = pynini.union(
            ten_digit_graph,
            nine_digit_graph,
            eight_digit_graph,
        )

        # Flip to the words-to-digits direction and wrap in the field.
        number_part = (pynutil.insert("number_part: \"") +
                       pynini.invert(written_to_spoken) +
                       pynutil.insert("\""))

        self.fst = self.add_tokens(number_part).optimize()
Beispiel #9
0
    def __init__(self):
        """Build the "time" classifier for spoken Spanish clock times.

        Accepted shapes:

        * hour + suffix ("las nueve a eme"): on-the-hour times are only
          converted when a suffix follows; minutes are set to "00";
        * hour + minutes ("las nueve y veinticinco"), optional suffix;
        * "(un) cuarto para <hour>": minutes "45", hour taken from
          ``data/time/time_to.tsv``, optional suffix;
        * "<hour> menos <n>" ("las diez menos diez"): minutes are 60 - n,
          hour taken from ``data/time/time_to.tsv``, optional suffix.

        The tagged, optimized graph is stored in ``self.fst``.
        """
        super().__init__(name="time", kind="classify")

        suffix_graph = pynini.string_file(
            get_abs_path("data/time/time_suffix.tsv"))
        time_to_graph = pynini.string_file(
            get_abs_path("data/time/time_to.tsv"))

        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_twenties = pynini.string_file(
            get_abs_path("data/numbers/twenties.tsv"))

        # Spoken numbers below one hundred: a bare tens word gets a trailing
        # "0"; "<tens> y <digit>" drops the " y " connector.
        graph_1_to_100 = pynini.union(
            graph_digit,
            graph_twenties,
            graph_teen,
            (graph_ties + pynutil.insert("0")),
            (graph_ties + pynutil.delete(" y ") + graph_digit),
        )

        # note that graph_hour will start from 2 hours
        # "1 o'clock" will be treated differently because it
        # is singular
        digits_2_to_23 = [str(digits) for digits in range(2, 24)]
        digits_1_to_59 = [str(digits) for digits in range(1, 60)]

        graph_1oclock = pynini.cross("la una", "la 1")
        # `@` binds tighter than `+`: the article "las " is concatenated with
        # the number graph already restricted to 2..23.
        graph_hour = pynini.cross(
            "las ", "las ") + graph_1_to_100 @ pynini.union(*digits_2_to_23)
        graph_minute = graph_1_to_100 @ pynini.union(*digits_1_to_59)
        graph_minute_verbose = pynini.cross("media", "30") | pynini.cross(
            "cuarto", "15")

        final_graph_hour = pynutil.insert("hours: \"") + (
            graph_1oclock | graph_hour) + pynutil.insert("\"")

        # An optional connector ("y" or "con") may precede the minutes.
        final_graph_minute = (pynutil.insert("minutes: \"") + pynini.closure(
            (pynutil.delete("y") | pynutil.delete("con")) + delete_space, 0,
            1) + (graph_minute | graph_minute_verbose) + pynutil.insert("\""))

        final_suffix = pynutil.insert("suffix: \"") + convert_space(
            suffix_graph) + pynutil.insert("\"")
        final_suffix_optional = pynini.closure(
            delete_space + insert_space + final_suffix, 0, 1)

        # las nueve a eme (only convert on-the-hour times if they are followed by a suffix)
        graph_hsuffix = (final_graph_hour + delete_extra_space +
                         pynutil.insert("minutes: \"00\"") + insert_space +
                         final_suffix)

        # las nueve y veinticinco
        graph_hm = final_graph_hour + delete_extra_space + final_graph_minute

        # un cuarto para las cinco
        graph_mh = (pynutil.insert("minutes: \"") + pynini.union(
            pynini.cross("un cuarto para", "45"),
            pynini.cross("cuarto para", "45"),
        ) + pynutil.insert("\"") + delete_extra_space +
                    pynutil.insert("hours: \"") + time_to_graph +
                    pynutil.insert("\""))

        # las diez menos diez
        # Minutes are counted back from the hour, i.e. 60 - n. The original
        # table mapped "veinticinco" to "30"; 60 - 25 is 35.
        graph_time_to = (pynutil.insert("hours: \"") + time_to_graph +
                         pynutil.insert("\"") + delete_extra_space +
                         pynutil.insert("minutes: \"") + delete_space +
                         pynutil.delete("menos") + delete_space + pynini.union(
                             pynini.cross("cinco", "55"),
                             pynini.cross("diez", "50"),
                             pynini.cross("cuarto", "45"),
                             pynini.cross("veinte", "40"),
                             pynini.cross("veinticinco", "35"),
                         ) + pynutil.insert("\""))
        final_graph = pynini.union(
            (graph_hm | graph_mh | graph_time_to) + final_suffix_optional,
            graph_hsuffix).optimize()

        final_graph = self.add_tokens(final_graph)

        self.fst = final_graph.optimize()
Beispiel #10
0
    def __init__(self, cardinal: GraphFst):
        """Build the "ordinal" classifier.

        Args:
            cardinal: cardinal tagger; the rewritten ordinal words are
                composed with its ``graph_no_exception``.

        Ordinal words are looked up in the ordinal tables and the result is
        composed with the cardinal graph. Three variants are built according
        to the ending of the last ordinal word ("o", "a" or "er"); the ending
        is recorded in ``morphosyntactic_features``. Inputs found in the
        ordinal digit table are excluded. The tagged, optimized graph is
        stored in ``self.fst``.
        """
        super().__init__(name="ordinal", kind="classify")

        cardinal_graph = cardinal.graph_no_exception
        graph_digit = pynini.string_file(
            get_abs_path("data/ordinals/digit.tsv"))
        graph_teens = pynini.string_file(
            get_abs_path("data/ordinals/teen.tsv"))
        graph_twenties = pynini.string_file(
            get_abs_path("data/ordinals/twenties.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/ordinals/ties.tsv"))
        graph_hundreds = pynini.string_file(
            get_abs_path("data/ordinals/hundreds.tsv"))

        ordinal_graph_union = pynini.union(
            graph_digit,
            graph_teens,
            graph_twenties,
            graph_ties,
            graph_hundreds,
        )

        # Filters on the ending of the ordinal word.
        # The functional pynini.closure() is used instead of the
        # Fst.closure() method: the method rewrites its receiver in place,
        # which would modify the shared module-level NEMO_SIGMA every time
        # this class is instantiated.
        accept_o_endings = NEMO_SIGMA + pynini.accep("o")
        accept_a_endings = NEMO_SIGMA + pynini.accep("a")
        accept_er_endings = pynini.closure(NEMO_SIGMA) + pynini.accep("er")

        ordinal_graph_o = accept_o_endings @ ordinal_graph_union
        ordinal_graph_a = accept_a_endings @ ordinal_graph_union
        ordinal_graph_er = accept_er_endings @ ordinal_graph_union

        # 'optional_numbers_in_front' have negative weight so we always
        # include them if they're there
        # (pynini.closure() again, so that the shared `delete_space` FST is
        # left untouched)
        optional_numbers_in_front = pynini.closure(
            pynutil.add_weight(ordinal_graph_union, -0.1) +
            pynini.closure(delete_space))

        # don't convert ordinals from one to nine inclusive
        graph_exception = pynini.project(pynini.union(graph_digit),
                                         "input").arcsort()

        def tagged_variant(last_word_graph, ending):
            # Any number of leading ordinal words, then a final word with the
            # required ending, composed with the cardinal graph.
            variant = (optional_numbers_in_front +
                       last_word_graph) @ cardinal_graph
            # Remove the excepted inputs, then compose the remaining inputs
            # back onto the variant.
            variant = (pynini.project(variant, "input") -
                       graph_exception) @ variant
            return (pynutil.insert("integer: \"") + variant +
                    pynutil.insert("\"") + pynutil.insert(
                        " morphosyntactic_features: \"" + ending + "\""))

        graph = tagged_variant(ordinal_graph_o, "o")
        graph |= tagged_variant(ordinal_graph_a, "a")
        graph |= tagged_variant(ordinal_graph_er, "er")

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()