Exemple #1
0
    def __init__(self):
        """
        Build the telephone classifier and store it in ``self.fst``.

        The pattern is first written in the ``(0ddd) dddd-dddd`` direction
        (brackets and hyphen deleted, a space inserted after every digit but
        the last of each group) and is then inverted as a whole, so the final
        graph runs the other way round. The result is wrapped in a
        ``number_part: "..."`` field before the token wrapper is added.
        """
        super().__init__(name="telephone", kind="classify")

        # Both tables are inverted on load; the complete pattern is inverted
        # again further down.
        zero_fst = pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv")))
        nonzero_fst = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")))
        any_digit = pynini.union(nonzero_fst, zero_fst).optimize()

        # A digit followed by an inserted space: the repeating unit of a group.
        spaced_digit = any_digit + insert_space
        four_digit_group = pynini.closure(spaced_digit, 3, 3) + any_digit

        # "(" + zero + three further digits + ")"
        bracketed_prefix = (
            pynutil.delete("(")
            + zero_fst
            + insert_space
            + pynini.closure(spaced_digit, 2, 2)
            + any_digit
            + pynutil.delete(")")
        )

        # Literal space kept between the bracketed prefix and the rest.
        component_separator = pynini.accep(" ")
        pattern = (
            bracketed_prefix
            + component_separator
            + four_digit_group
            + pynutil.delete("-")
            + insert_space
            + four_digit_group
        )

        tagged = pynutil.insert("number_part: \"") + pynini.invert(pattern) + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
Exemple #2
0
    def __init__(self):
        """
        Build the electronic (e-mail address) classifier into ``self.fst``.

        Accepted input is a space-separated spelling of the user name, the
        word "at", and a server/domain pair joined by the word "punkt". The
        output is ``username: "..." domain: "..."`` with the separating spaces
        removed and "punkt" rewritten to ".".
        """
        super().__init__(name="electronic", kind="classify")

        drop_space = pynutil.delete(" ")
        spoken_dot = pynini.cross("punkt", ".")

        # A single character: a letter or an entry of the digit/zero tables.
        char = pynini.union(
            NEMO_ALPHA,
            pynini.string_file(get_abs_path("data/numbers/digit.tsv")),
            pynini.string_file(get_abs_path("data/numbers/zero.tsv")),
        )

        # User name: characters separated by spaces, each optionally followed
        # by a spoken dot; the last character carries no trailing space.
        user_chunk = char + drop_space + pynini.closure(spoken_dot + drop_space, 0, 1)
        username = (
            pynutil.insert("username: \"")
            + pynini.closure(user_chunk)
            + char
            + pynutil.insert("\"")
        )

        # Server and domain are either spelled out or looked up in a table.
        spelled_out = pynini.closure(char + drop_space) + char
        server = spelled_out | pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
        domain = spelled_out | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

        domain_graph = (
            pynutil.insert("domain: \"")
            + server
            + drop_space
            + spoken_dot
            + drop_space
            + domain
            + pynutil.insert("\"")
        )

        graph = (
            username
            + drop_space
            + pynutil.delete("at")
            + insert_space
            + drop_space
            + domain_graph
        )

        self.fst = self.add_tokens(graph).optimize()
    def __init__(self, deterministic: bool = True):
        """
        Build the ordinal verbalizer.

        Sets ``self.suffix`` (the end-of-string rewrite rule), ``self.graph``
        (field extraction composed with that rule) and ``self.fst``.

        Args:
            deterministic: forwarded unchanged to the base class constructor.
        """
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        digit_endings = pynini.invert(pynini.string_file(get_abs_path("data/ordinals/digit.tsv")))
        teen_endings = pynini.invert(pynini.string_file(get_abs_path("data/ordinals/teen.tsv")))

        # Strip the `integer: "..."` wrapper and keep the quoted content.
        unwrap_integer = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )

        # Candidate rewrites at the end of the string. The plain "th"
        # insertion carries the largest weight, so it is the fallback.
        ending_rule = pynini.union(
            digit_endings,
            teen_endings,
            pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001),
            pynutil.insert("th", weight=0.01),
        )
        suffix = pynini.cdrewrite(ending_rule, "", "[EOS]", NEMO_SIGMA).optimize()

        self.graph = pynini.compose(unwrap_integer, suffix)
        self.suffix = suffix
        self.fst = self.delete_tokens(self.graph).optimize()
Exemple #4
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """
        Build the decimal classifier for numbers written with a "," separator.

        Sets ``self.graph`` (digit-by-digit reading of the fractional part),
        ``self.graph_fractional``, ``self.graph_integer``,
        ``self.final_graph_wo_negative`` and ``self.fst``.

        Args:
            cardinal: cardinal grammar; its ``graph`` reads the integer part
                and its ``graph_hundred_component_at_least_one_none_zero_digit``
                is passed to ``get_quantity``.
            deterministic: forwarded unchanged to the base class constructor.
        """
        super().__init__(name="decimal", kind="classify", deterministic=deterministic)

        # Single digit reading: both tables inverted, plus "1" -> "eins".
        single_digit = pynini.union(
            pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))),
            pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv"))),
            pynini.cross("1", "eins"),
        )
        further_digits = pynini.closure(insert_space + single_digit).optimize()
        self.graph = single_digit + further_digits

        # A leading "-" becomes the field `negative: "true" `.
        optional_sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
        )

        self.graph_fractional = (
            pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"")
        )
        self.graph_integer = (
            pynutil.insert("integer_part: \"") + cardinal.graph + pynutil.insert("\"")
        )

        decimal_separator = pynutil.delete(",")
        unsigned = self.graph_integer + decimal_separator + insert_space + self.graph_fractional

        with_quantity = get_quantity(
            unsigned, cardinal.graph_hundred_component_at_least_one_none_zero_digit
        )
        self.final_graph_wo_negative = unsigned | with_quantity

        tagged = (
            optional_sign
            + self.final_graph_wo_negative
            + pynutil.insert(" preserve_order: true")
        )
        self.fst = self.add_tokens(tagged).optimize()
    def __init__(self, cardinal: GraphFst):
        """
        Build the decimal classifier for spoken input containing "point".

        Sets ``self.graph`` (the space-separated digit sequence used for the
        fractional part), ``self.final_graph_wo_negative`` and ``self.fst``.

        Args:
            cardinal: cardinal grammar; ``graph_no_exception`` reads the
                integer part and
                ``graph_hundred_component_at_least_one_none_zero_digit`` is
                passed to ``get_quantity``.
        """
        super().__init__(name="decimal", kind="classify")

        integer_reader = cardinal.graph_no_exception

        # One fractional digit: a table entry, or the letter "o" read as "0".
        one_digit = pynini.union(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv")),
            pynini.string_file(get_abs_path("data/numbers/zero.tsv")),
            pynini.cross("o", "0"),
        )
        # Digits are separated by spaces on input; the spaces are removed.
        digit_run = pynini.closure(one_digit + delete_space) + one_digit
        self.graph = digit_run

        optional_sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("minus", "\"true\"") + delete_extra_space,
            0,
            1,
        )

        fractional_field = pynutil.insert("fractional_part: \"") + digit_run + pynutil.insert("\"")
        integer_field = pynutil.insert("integer_part: \"") + integer_reader + pynutil.insert("\"")

        # The integer part is optional: "point five" is accepted.
        unsigned = (
            pynini.closure(integer_field + delete_extra_space, 0, 1)
            + pynutil.delete("point")
            + delete_extra_space
            + fractional_field
        )
        with_quantity = get_quantity(
            unsigned, cardinal.graph_hundred_component_at_least_one_none_zero_digit
        )

        self.final_graph_wo_negative = unsigned | with_quantity

        signed = (optional_sign + unsigned) | (optional_sign + with_quantity)
        self.fst = self.add_tokens(signed).optimize()
Exemple #6
0
    def __init__(self, deterministic: bool = True):
        """
        Build the ordinal verbalizer.

        Sets ``self.ordinal_stem`` (union of the inverted digit, ties and
        thousands tables), ``self.suffix`` (end-of-string rewrite rule),
        ``self.graph`` and ``self.fst``.

        Args:
            deterministic: forwarded unchanged to the base class constructor.
        """
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        def load_inverted(relative_path):
            # Load a table and swap its input and output sides.
            return pynini.invert(pynini.string_file(get_abs_path(relative_path)))

        digit_stems = load_inverted("data/ordinals/digit.tsv")
        ties_stems = load_inverted("data/ordinals/ties.tsv")
        thousands_stems = load_inverted("data/ordinals/thousands.tsv")

        # Strip the `integer: "..."` wrapper and keep the quoted content.
        unwrap_integer = (
            pynutil.delete("integer: \"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )

        # Any of the listed endings may be appended (weighted insertion).
        endings = pynini.union("ten", "tem", "ter", "tes", "te")
        append_ending = pynutil.insert(endings, weight=0.01)
        self.ordinal_stem = digit_stems | ties_stems | thousands_stems

        # At end of string: optionally rewrite the stem, then append an ending.
        suffix = pynini.cdrewrite(
            pynini.closure(self.ordinal_stem, 0, 1) + append_ending,
            "",
            "[EOS]",
            NEMO_SIGMA,
        ).optimize()

        self.graph = pynini.compose(unwrap_integer, suffix)
        self.suffix = suffix
        self.fst = self.delete_tokens(self.graph).optimize()
Exemple #7
0
def _get_digit_or_teen():
    """
    Return the optimized union of the digit table and the teen table.
    """
    table_paths = ("data/numbers/digit.tsv", "data/numbers/teen.tsv")
    digit_fst, teen_fst = (
        pynini.string_file(get_abs_path(path)) for path in table_paths
    )
    return pynini.union(digit_fst, teen_fst).optimize()
Exemple #8
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """
        Build the time classifier for written times and store it in ``self.fst``.

        Three shapes are accepted:
            * ``H:MM`` with optional suffix and optional time zone
            * ``H.MM`` followed by a mandatory suffix and optional time zone
            * ``H`` followed by a mandatory suffix and optional time zone

        Output fields are ``hours``, ``minutes``, ``suffix`` and ``zone``.
        Minutes written as "00" are dropped from the output.

        Args:
            cardinal: cardinal grammar whose ``graph`` reads the numbers.
            deterministic: forwarded unchanged to the base class constructor.
        """
        super().__init__(name="time", kind="classify", deterministic=deterministic)

        suffix_words = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
        zone_words = pynini.string_file(get_abs_path("data/time_zone.tsv"))
        number_reader = cardinal.graph

        # Admissible numeric values, as acceptors over their decimal strings.
        hour_values = pynini.union(*map(str, range(24)))
        minute_values_1_to_9 = pynini.union(*map(str, range(1, 10)))
        minute_values_10_to_59 = pynini.union(*map(str, range(10, 60)))

        # Two digits pass through; a single digit may carry a leading "0"
        # that is removed ("02" -> "2").
        strip_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT
        )

        hour_reading = strip_leading_zero @ hour_values @ number_reader
        minute_reading_1_to_9 = minute_values_1_to_9 @ number_reader
        minute_reading_10_to_59 = minute_values_10_to_59 @ number_reader

        hours_field = pynutil.insert("hours: \"") + hour_reading + pynutil.insert("\"")

        # "0d" minutes are read as "o <digit>".
        minute_reading = (
            pynini.cross("0", "o") + insert_space + minute_reading_1_to_9
        ) | minute_reading_10_to_59
        minutes_field = pynutil.insert("minutes: \"") + minute_reading + pynutil.insert("\"")

        suffix_field = (
            pynutil.insert("suffix: \"") + convert_space(suffix_words) + pynutil.insert("\"")
        )
        zone_field = (
            pynutil.insert("zone: \"") + convert_space(zone_words) + pynutil.insert("\"")
        )

        # Collapse whatever spacing precedes a field into one inserted space.
        field_gap = delete_space + insert_space
        optional_suffix = pynini.closure(field_gap + suffix_field, 0, 1)
        optional_zone = pynini.closure(field_gap + zone_field, 0, 1)

        # Either "00" (dropped) or a real minutes field.
        minutes_or_full_hour = pynutil.delete("00") | (insert_space + minutes_field)

        # 2:30 pm, 02:30, 2:00
        colon_time = (
            hours_field
            + pynutil.delete(":")
            + minutes_or_full_hour
            + optional_suffix
            + optional_zone
        )
        # 2.xx pm/am
        dotted_time = (
            hours_field
            + pynutil.delete(".")
            + minutes_or_full_hour
            + field_gap
            + suffix_field
            + optional_zone
        )
        # 2 pm est
        hour_only = hours_field + field_gap + suffix_field + optional_zone

        all_times = (colon_time | hour_only | dotted_time).optimize()
        self.fst = self.add_tokens(all_times).optimize()
Exemple #9
0
    def __init__(self):
        """
        Build the time classifier for spoken times and store it in ``self.fst``.

        Two word orders are accepted, each followed by an optional suffix and
        an optional time zone:
            * hour then minutes: "five o' clock", "two o eight",
              "two thirty five", or just the hour (minutes become "00")
            * minutes "past" hour: "ten past four", "quarter past four",
              "half past four"

        Output fields are ``hours``, ``minutes``, ``suffix`` and ``zone``.
        """
        super().__init__(name="time", kind="classify")

        suffix_words = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
        zone_words = pynini.invert(pynini.string_file(get_abs_path("data/time_zone.tsv")))

        # The cardinal graph is given a negative weight (-0.7) here.
        number_reader = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

        # Admissible values, as acceptors over their word forms.
        hour_words = pynini.union(*(num_to_word(value) for value in range(0, 24)))
        minute_words_1_to_9 = pynini.union(*(num_to_word(value) for value in range(1, 10)))
        minute_words_10_to_59 = pynini.union(*(num_to_word(value) for value in range(10, 60)))

        hour_number = hour_words @ number_reader
        minute_number_1_to_9 = minute_words_1_to_9 @ number_reader
        minute_number_10_to_59 = minute_words_10_to_59 @ number_reader
        minute_number_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15")

        # Every spelling of "o'clock" is consumed and produces nothing.
        oclock = pynini.cross(
            pynini.union("o' clock", "o clock", "o'clock", "oclock"), ""
        )

        hours_field = pynutil.insert("hours: \"") + hour_number + pynutil.insert("\"")

        # Minutes after the hour: nothing, "o'clock", "o <digit>" or a
        # two-digit value.
        minutes_after_hour = pynini.union(
            pynutil.insert("00"),
            oclock + pynutil.insert("00"),
            pynutil.delete("o") + delete_space + minute_number_1_to_9,
            minute_number_10_to_59,
        )
        minutes_field = (
            pynutil.insert("minutes: \"") + minutes_after_hour + pynutil.insert("\"")
        )

        suffix_field = (
            pynutil.insert("suffix: \"") + convert_space(suffix_words) + pynutil.insert("\"")
        )
        zone_field = (
            pynutil.insert("zone: \"") + convert_space(zone_words) + pynutil.insert("\"")
        )
        field_gap = delete_space + insert_space
        optional_suffix = pynini.closure(field_gap + suffix_field, 0, 1)
        optional_zone = pynini.closure(field_gap + zone_field, 0, 1)

        # five o' clock
        # two o eight, two thirty five (am/pm)
        # two pm/am
        hour_then_minutes = hours_field + delete_extra_space + minutes_field

        # 10 past four, quarter past four, half past four
        minutes_past_hour = (
            pynutil.insert("minutes: \"")
            + pynini.union(minute_number_1_to_9, minute_number_10_to_59, minute_number_verbose)
            + pynutil.insert("\"")
            + delete_space
            + pynutil.delete("past")
            + delete_extra_space
            + hours_field
        )

        all_times = (
            (hour_then_minutes | minutes_past_hour) + optional_suffix + optional_zone
        ).optimize()
        self.fst = self.add_tokens(all_times).optimize()
Exemple #10
0
    def __init__(self, deterministic: bool = True):
        """
        Build the cardinal classifier.

        Sets ``self.graph``,
        ``self.graph_hundred_component_at_least_one_none_zero_digit``,
        ``self.single_digits_graph``, ``self.fst`` and, only when
        ``deterministic`` is False, ``self.range_graph``.

        Args:
            deterministic: when False, ``self.graph`` is extended with
                digit-by-digit readings and the output of
                ``get_hundreds_graph()``, and range expressions
                ("a-b", "a x b") are accepted as well.
        """
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        number_names = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")
        ).get_fst()

        # Two or three digits, or one non-zero digit.
        nonzero_digit = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
        short_number = pynini.closure(NEMO_DIGIT, 2, 3) | nonzero_digit
        self.graph_hundred_component_at_least_one_none_zero_digit = short_number @ number_names

        # One to three leading digits, then any number of three-digit groups,
        # each optionally preceded by a "," that is removed.
        optional_comma = pynini.closure(pynutil.delete(","), 0, 1)
        three_digit_group = optional_comma + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
        grouped_digits = pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(three_digit_group)
        self.graph = grouped_digits @ number_names

        # Digit-by-digit reading; "0" may additionally be read as "oh".
        digit_table = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        zero_table = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        one_digit = pynutil.add_weight(
            pynini.invert(digit_table | zero_table), 1.2
        ) | pynutil.add_weight(pynini.cross("0", "oh"), 1.1)
        self.single_digits_graph = one_digit + pynini.closure(
            pynutil.insert(" ") + one_digit
        )

        if not deterministic:
            # Digit-by-digit reading of comma-grouped numbers.
            comma_group = (
                pynutil.delete(",")
                + one_digit
                + pynutil.insert(" ")
                + one_digit
                + pynutil.insert(" ")
                + one_digit
            )
            digits_with_commas = pynini.closure(
                self.single_digits_graph + pynutil.insert(" "), 1, 3
            ) + pynini.closure(comma_group, 1)

            self.graph |= (
                self.single_digits_graph | get_hundreds_graph() | digits_with_commas
            )

            dash_reading = pynini.cross("-", " to ") | pynini.cross("-", " ")
            times_reading = pynini.cross("x", " by ") | pynini.cross(" x ", " by ")
            ranges = (
                pynini.closure(pynutil.insert("from "), 0, 1)
                + self.graph
                + dash_reading
                + self.graph
            )
            ranges |= self.graph + times_reading + self.graph
            self.range_graph = ranges.optimize()

        # A leading "-" becomes the field `negative: "true" `.
        optional_sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
        )

        readings = self.graph | pynutil.add_weight(self.get_serial_graph(), 1.2)
        if not deterministic:
            readings |= self.range_graph

        tagged = (
            optional_sign
            + pynutil.insert("integer: \"")
            + readings
            + pynutil.insert("\"")
        )
        self.fst = self.add_tokens(tagged).optimize()
Exemple #11
0
    def __init__(self):
        """
        Build the electronic classifier (e-mail addresses and URLs) into
        ``self.fst``.

        E-mail: space-separated user name, the word "arroba", then server
        and domain joined by the word "punto"; emitted as
        ``username: "..." domain: "..."``.

        URL: optional protocol ("http"/"https" followed by
        " dos puntos barra barra "), a spoken form of "www", "punto", a host
        and one or more endings; emitted as ``protocol: "..."``.
        """
        super().__init__(name="electronic", kind="classify")

        drop_space = pynutil.delete(" ")
        spoken_dot = pynini.cross("punto", ".")

        # A single character: a letter or an entry of the digit/zero tables.
        char = pynini.union(
            NEMO_ALPHA,
            pynini.string_file(get_abs_path("data/numbers/digit.tsv")),
            pynini.string_file(get_abs_path("data/numbers/zero.tsv")),
        )
        symbol = pynini.invert(
            pynini.string_file(get_abs_path("data/electronic/symbols.tsv"))
        )
        char_or_symbol = char | symbol

        # ---- e-mail ----
        # The user name starts and ends with a plain character; symbols are
        # only allowed in between.
        username = (
            pynutil.insert("username: \"")
            + char
            + drop_space
            + pynini.closure(char_or_symbol + drop_space)
            + char
            + pynutil.insert("\"")
        )

        spelled_out = pynini.closure(char + drop_space) + char
        server = spelled_out | pynini.invert(
            pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
        )
        domain = spelled_out | pynini.invert(
            pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
        )
        domain_graph = (
            pynutil.insert("domain: \"")
            + server
            + drop_space
            + spoken_dot
            + drop_space
            + domain
            + pynutil.insert("\"")
        )
        graph = (
            username
            + drop_space
            + pynutil.delete("arroba")
            + insert_space
            + drop_space
            + domain_graph
        )

        # ---- url ----
        www = pynini.cross(
            pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www"
        )
        scheme = pynini.cross(
            pynini.union("http", "h t t p", "hache te te pe"), "http"
        ) | pynini.cross(
            pynini.union("https", "h t t p s", "hache te te pe ese"), "https"
        )
        scheme_with_slashes = scheme + pynini.cross(" dos puntos barra barra ", "://")

        # e.g. .com, .es
        ending = (
            drop_space
            + symbol
            + drop_space
            + (domain | (pynini.closure(char_or_symbol + drop_space) + char_or_symbol))
        )

        host = pynini.closure(drop_space + char_or_symbol, 1) | server
        url = (
            pynini.closure(scheme_with_slashes, 0, 1)
            + www
            + drop_space
            + spoken_dot
            + drop_space
            + host
            + pynini.closure(ending, 1)
        )
        graph |= pynutil.insert("protocol: \"") + url + pynutil.insert("\"")

        self.fst = self.add_tokens(graph).optimize()
Exemple #12
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """
        Build the measure classifier and store it in ``self.fst``.

        A measure is a decimal or a cardinal, optionally negated by "âm" or
        "trừ", followed by a unit. The unit may be a plain unit, a "trên"
        unit (rendered with a leading "/"), or both in sequence. A cardinal
        measure may additionally be followed by one more digit word, emitted
        as ``fractional_part``.

        Args:
            cardinal: cardinal grammar; ``graph_no_exception`` reads integers.
            decimal: decimal grammar; ``final_graph_wo_negative`` is embedded.
        """
        super().__init__(name="measure", kind="classify")

        integer_reader = cardinal.graph_no_exception

        # Digit words accepted after the unit, including special forms.
        trailing_digit = pynini.union(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv")),
            pynini.cross("rưỡi", "5"),
            pynini.cross("mốt", "1"),
            pynini.cross("tư", "4"),
        )

        # Unit table inverted: singular -> abbr
        unit_to_abbr = pynini.invert(
            pynini.string_file(get_abs_path("data/measurements.tsv"))
        )

        optional_sign = pynini.closure(
            pynutil.insert("negative: ")
            + pynini.cross(pynini.union("âm", "trừ"), '"true"')
            + delete_extra_space,
            0,
            1,
        )

        plain_unit = convert_space(unit_to_abbr)
        per_unit = (
            pynutil.insert("/")
            + pynutil.delete("trên")
            + delete_space
            + convert_space(unit_to_abbr)
        )
        # The combined "unit trên unit" form is slightly penalised.
        compound_unit = pynutil.add_weight(plain_unit + delete_space + per_unit, 0.01)
        units_field = (
            pynutil.insert('units: "')
            + (plain_unit | per_unit | compound_unit)
            + pynutil.insert('"')
        )

        decimal_measure = (
            pynutil.insert("decimal { ")
            + optional_sign
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + units_field
        )

        cardinal_measure = (
            pynutil.insert("cardinal { ")
            + optional_sign
            + pynutil.insert('integer: "')
            + integer_reader
            + pynutil.insert('"')
            + pynutil.insert(" }")
            + delete_extra_space
            + units_field
        )
        trailing_fraction = (
            delete_extra_space
            + pynutil.insert('fractional_part: "')
            + trailing_digit
            + pynutil.insert('"')
        )
        cardinal_measure_with_fraction = (
            pynutil.insert("cardinal { ")
            + optional_sign
            + pynutil.insert('integer: "')
            + integer_reader
            + pynutil.insert('" }')
            + delete_extra_space
            + units_field
            + trailing_fraction
        )

        all_measures = pynini.union(
            decimal_measure, cardinal_measure, cardinal_measure_with_fraction
        )
        self.fst = self.add_tokens(all_measures).optimize()
Exemple #13
0
    def __init__(self, deterministic: bool = True):
        """
        Build the electronic verbalizer and store it in ``self.fst``.

        Consumes an optional ``protocol: "..."`` field, an optional
        ``username: "..."`` field (followed by an inserted "at ") and a
        ``domain: "..."`` field, and spells their contents out.

        Args:
            deterministic: when False, "0" may also be read as "o" or "oh"
                in addition to "zero".
        """
        super().__init__(name="electronic",
                         kind="verbalize",
                         deterministic=deterministic)
        graph_digit_no_zero = pynini.invert(
            pynini.string_file(
                get_abs_path("data/numbers/digit.tsv"))).optimize()
        graph_zero = pynini.cross("0", "zero")

        if not deterministic:
            graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")

        graph_digit = graph_digit_no_zero | graph_zero
        graph_symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).optimize()

        # Strip the `username: "` wrapper the same way the `domain:` wrapper
        # is stripped below, then read the content one item at a time with a
        # space inserted after each. Digit and symbol readings (weight 1.09)
        # are preferred over passing a character through (weight 1.1).
        user_name = (
            pynutil.delete("username:") + delete_space +
            pynutil.delete("\"") +
            (pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(
                    pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
            pynutil.delete("\""))

        server_common = pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv"))
        domain_common = pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))

        # Table entries for common servers/domains get a small bonus over
        # passing single characters through.
        convert_defaults = (NEMO_NOT_QUOTE
                            | pynutil.add_weight(domain_common, -0.1)
                            | pynutil.add_weight(server_common, -0.1))
        domain = convert_defaults + pynini.closure(
            pynutil.insert(" ") + convert_defaults)
        # Second pass: read remaining symbols and digits, again with a bonus.
        domain = pynini.compose(
            domain,
            pynini.closure(
                pynutil.add_weight(graph_symbols, -0.1)
                | pynutil.add_weight(graph_digit, -0.1) | NEMO_NOT_QUOTE),
        )

        domain = (pynutil.delete("domain:") + delete_space +
                  pynutil.delete("\"") + domain + delete_space +
                  pynutil.delete("\""))

        # The protocol content is passed through unchanged.
        protocol = pynutil.delete("protocol: \"") + pynini.closure(
            NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        graph = (pynini.closure(protocol + delete_space, 0, 1) +
                 pynini.closure(
                     user_name + delete_space + pynutil.insert("at ") +
                     delete_space, 0, 1) + domain + delete_space)

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Exemple #14
0
    def __init__(self, deterministic: bool = True):
        """
        Build the electronic verbalizer and store it in ``self.fst``.

        Consumes an optional ``protocol: "..."`` field, an optional
        ``username: "..."`` field (followed by an inserted " at ") and a
        ``domain: "..."`` field. Symbols and digits are read out and padded
        with spaces; the composed result is finally passed through a
        ``delete_extra_space`` rewrite.

        Args:
            deterministic: when False, "0" may also be read as "o" or "oh"
                in addition to "zero".
        """
        super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)
        graph_digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
        graph_zero = pynini.cross("0", "zero")

        if not deterministic:
            graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")

        graph_digit = graph_digit_no_zero | graph_zero
        graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()

        # Rewrite every symbol/digit to its reading, padded with a space on
        # each side; everything else passes through unchanged.
        default_chars_symbols = pynini.cdrewrite(
            pynutil.insert(" ") + (graph_symbols | graph_digit) + pynutil.insert(" "), "", "", NEMO_SIGMA
        )

        # Strip the `username: "` wrapper the same way the `domain:` wrapper
        # is stripped below.
        user_name = (
            pynutil.delete("username:")
            + delete_space
            + pynutil.delete("\"")
            + default_chars_symbols
            + pynutil.delete("\"")
        )

        domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

        # Name part, then a common-domain table entry or a plain "." read as
        # "dot", then at most one more segment that is run through TO_UPPER
        # before symbols/digits are read.
        domain = (
            default_chars_symbols
            + insert_space
            + plurals._priority_union(
                domain_common, pynutil.add_weight(pynini.cross(".", "dot"), weight=0.0001), NEMO_SIGMA
            )
            + pynini.closure(
                insert_space + (pynini.cdrewrite(TO_UPPER, "", "", NEMO_SIGMA) @ default_chars_symbols), 0, 1
            )
        )
        domain = (
            pynutil.delete("domain:")
            + delete_space
            + pynutil.delete("\"")
            + domain
            + delete_space
            + pynutil.delete("\"")
        ).optimize()

        # The protocol content is passed through unchanged.
        protocol = pynutil.delete("protocol: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        graph = (
            pynini.closure(protocol + delete_space, 0, 1)
            + pynini.closure(user_name + delete_space + pynutil.insert(" at ") + delete_space, 0, 1)
            + domain
            + delete_space
        ).optimize() @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA)

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Exemple #15
0
    def get_address_graph(self, cardinal):
        """
        Finite state transducer for reading out a postal address, e.g.:
            2788 San Tomas Expy, Santa Clara, CA 95051 ->
                two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one

        The address is a digit-by-digit street number, an optional
        single-letter direction, street words ending in an entry of the
        address-words table, an optional ".", and then optional city, state
        and five-digit zip code parts.

        Args:
            cardinal: cardinal grammar; its ``single_digits_graph`` reads the
                street number and the zip code, and it is also passed to the
                ordinal tagger.

        Returns:
            The address transducer, without any token wrapper.
        """
        # Ordinals inside street names: tag them, then verbalize them.
        verbalize_ordinal = OrdinalVerbalizer().graph
        tag_ordinal = OrdinalTagger(cardinal=cardinal).graph
        spoken_ordinal = pynini.compose(
            pynutil.insert("integer: \"") + tag_ordinal + pynutil.insert("\""),
            verbalize_ordinal,
        )

        space = pynini.accep(NEMO_SPACE)
        drop_comma = pynini.cross(",", "")

        street_number = pynini.closure(NEMO_DIGIT, 1) @ cardinal.single_digits_graph

        # Optional direction letter; the expanded form is favoured (weight -1).
        compass = pynini.union(
            pynini.cross("E", "East"),
            pynini.cross("S", "South"),
            pynini.cross("W", "West"),
            pynini.cross("N", "North"),
        )
        optional_direction = pynini.closure(
            pynutil.add_weight(space + compass, -1), 0, 1
        )

        street_suffix = pynini.string_file(
            get_abs_path("data/address/address_words.tsv")
        )
        street = (
            space
            + pynini.closure(spoken_ordinal, 0, 1)
            + pynini.closure(NEMO_ALPHA | NEMO_SPACE, 1)
            + street_suffix
        )

        city_name = pynini.closure(NEMO_ALPHA | space, 1)
        optional_city = pynini.closure(drop_comma + space + city_name, 0, 1)

        state_name = pynini.invert(
            pynini.string_file(get_abs_path("data/address/states.tsv"))
        )
        optional_state = pynini.closure(drop_comma + space + state_name, 0, 1)

        # Five digits read one by one; strongly favoured (weight -100).
        zip_digits = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
        optional_zip = pynini.closure(
            pynutil.add_weight(
                pynini.closure(drop_comma, 0, 1) + space + zip_digits, -100
            ),
            0,
            1,
        )

        optional_period = pynini.closure(pynini.cross(".", ""), 0, 1)

        return (
            street_number
            + optional_direction
            + street
            + optional_period
            + optional_city
            + optional_state
            + optional_zip
        )
Exemple #16
0
    def __init__(self):
        """
        Build the electronic classifier (e-mail addresses and URLs) into
        ``self.fst``.

        E-mail: space-separated user name, one of "a còng" / "a móc" /
        "a vòng", then a server followed by one or more "chấm"-separated
        domain parts; emitted as ``username: "..." domain: "..."``.

        URL: optional protocol ("h t t p" / "h t t p s" followed by
        " hai chấm sẹc sẹc "), a spoken form of "www", "chấm", a host and one
        or two endings; emitted as ``protocol: "..."``.
        """
        super().__init__(name="electronic", kind="classify")

        delete_extra_space = pynutil.delete(" ")
        alpha_num = (
            NEMO_ALPHA
            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv")))

        symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).invert()

        accepted_username = alpha_num | symbols
        process_dot = pynini.cross("chấm", ".")
        # The user name must actually consume the spoken characters: it
        # starts and ends with an alphanumeric character, with further
        # characters or symbols allowed in between. The separating spaces
        # are removed.
        username = (pynutil.insert('username: "') + alpha_num +
                    delete_extra_space +
                    pynini.closure(accepted_username + delete_extra_space) +
                    alpha_num + pynutil.insert('"'))
        single_alphanum = pynini.closure(alpha_num +
                                         delete_extra_space) + alpha_num
        server = single_alphanum | pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv"))
        domain = single_alphanum | pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))
        # One or more ".domain" parts, e.g. ".com" or ".com.vn".
        multi_domain = (pynini.closure(process_dot + delete_extra_space +
                                       domain + delete_extra_space) +
                        process_dot + delete_extra_space + domain)
        domain_graph = pynutil.insert(
            'domain: "'
        ) + server + delete_extra_space + multi_domain + pynutil.insert('"')
        graph = (username + delete_extra_space +
                 pynutil.delete(pynini.union("a còng", "a móc", "a vòng")) +
                 insert_space + delete_extra_space + domain_graph)

        ############# url ###
        protocol_end = pynini.cross(pynini.union("w w w", "www"), "www")
        protocol_start = (pynini.cross("h t t p", "http") | pynini.cross(
            "h t t p s", "https")) + pynini.cross(" hai chấm sẹc sẹc ", "://")
        # .com,
        ending = (
            delete_extra_space + symbols + delete_extra_space +
            (domain | pynini.closure(accepted_username + delete_extra_space) +
             accepted_username))

        protocol = (pynini.closure(protocol_start, 0, 1) + protocol_end +
                    delete_extra_space + process_dot +
                    pynini.closure(delete_extra_space + accepted_username, 1) +
                    pynini.closure(ending, 1, 2))
        protocol = pynutil.insert('protocol: "') + protocol + pynutil.insert(
            '"')
        graph |= protocol
        ########

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
Exemple #17
0
    def __init__(self):
        """
        Build the telephone classifier and store it in ``self.fst``.

        A number is read as five pairs. The first pair is a zero followed by
        a digit (the zero may be omitted on input and is then inserted). The
        remaining four pairs are either all read digit by digit, or each
        read as a two-digit number or a digit pair. Pairs are separated by a
        single space on output; the result is wrapped in
        ``number_part: "..."``.
        """
        super().__init__(name="telephone", kind="classify")

        def load(relative_path):
            # Load one table from the grammar's data directory.
            return pynini.string_file(get_abs_path(relative_path))

        digit = load("data/numbers/digit.tsv")
        ties = load("data/numbers/ties.tsv")
        ties_unique = load("data/numbers/ties_unique.tsv")
        teen = load("data/numbers/teen.tsv")
        zero = load("data/numbers/zero.tsv")

        # Input spacing is removed and exactly one space is inserted between
        # pairs.
        pair_gap = delete_space + insert_space

        # Two-digit numbers: teens, unique ties, round ties ("0" appended)
        # and hyphenated ties + digit.
        two_digit_number = pynini.union(
            teen,
            ties_unique,
            ties + pynutil.insert("0"),
            ties + delete_hyphen + digit,
        )

        # Leading pair: zero + digit, or just a digit with "0" supplied.
        leading_pair = (
            (zero + delete_space + digit) | (pynutil.insert("0") + digit)
        ) + pair_gap

        # A pair spoken digit by digit.
        any_digit = digit | zero
        digit_pair = any_digit + delete_space + any_digit

        # Variant 1: every pair digit by digit.
        all_digit_pairs = (
            leading_pair + pynini.closure(digit_pair + pair_gap, 3, 3) + digit_pair
        )

        # Variant 2: each pair may also be a two-digit number.
        flexible_pair = two_digit_number | digit_pair
        mixed_pairs = (
            leading_pair + pynini.closure(flexible_pair + pair_gap, 3, 3) + flexible_pair
        )

        number = pynini.union(all_digit_pairs, mixed_pairs)
        tagged = pynutil.insert("number_part: \"") + number + pynutil.insert("\"")

        self.fst = self.add_tokens(tagged).optimize()
Exemple #18
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """Build the measure classification graph.

        Accepts a decimal or cardinal quantity (optionally preceded by
        "menos") followed by a unit, and emits ``decimal { ... }`` or
        ``cardinal { ... }`` together with a ``units: "..."`` field.

        Args:
            cardinal: cardinal grammar; its ``graph_no_exception`` is used.
            decimal: decimal grammar; its ``final_graph_wo_negative`` is used.
        """
        super().__init__(name="measure", kind="classify")

        number_graph = cardinal.graph_no_exception

        # Both unit lexicons are inverted so they map the written-out unit
        # (singular / plural) to its abbreviation.
        singular_to_abbr = pynini.invert(
            pynini.string_file(get_abs_path("data/measurements_singular.tsv")))
        plural_to_abbr = pynini.invert(
            pynini.string_file(get_abs_path("data/measurements_plural.tsv")))

        optional_negative = pynini.closure(
            pynutil.insert("negative: ")
            + pynini.cross("menos", "\"true\"")
            + delete_extra_space,
            0,
            1,
        )

        singular_unit = convert_space(singular_to_abbr)
        plural_unit = convert_space(plural_to_abbr)
        # "por <unit>" is rewritten as "/<unit>".
        per_unit = (pynutil.insert("/") + pynutil.delete("por") + delete_space
                    + convert_space(singular_to_abbr))

        def units_field(base_unit):
            # A unit, a "per unit", or (slightly penalized) both in sequence.
            choices = (
                base_unit
                | per_unit
                | pynutil.add_weight(base_unit + delete_space + per_unit, 0.01)
            )
            return pynutil.insert("units: \"") + choices + pynutil.insert("\"")

        singular_units = units_field(singular_unit)
        plural_units = units_field(plural_unit)

        decimal_measure = (
            pynutil.insert("decimal { ")
            + optional_negative
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + plural_units
        )

        cardinal_open = (
            pynutil.insert("cardinal { ")
            + optional_negative
            + pynutil.insert("integer: \"")
        )

        # Every cardinal except the spellings of "one" takes a plural unit.
        plural_cardinal = (
            cardinal_open
            + ((NEMO_SIGMA - "un" - "una" - "uno") @ number_graph)
            + pynutil.insert("\"")
            + pynutil.insert(" }")
            + delete_extra_space
            + plural_units
        )

        # The spellings of "one" map to "1" and take a singular unit.
        one = (
            pynini.cross("un", "1")
            | pynini.cross("una", "1")
            | pynini.cross("uno", "1")
        )
        singular_cardinal = (
            cardinal_open
            + one
            + pynutil.insert("\"")
            + pynutil.insert(" }")
            + delete_extra_space
            + singular_units
        )

        cardinal_measure = plural_cardinal | singular_cardinal
        measure = decimal_measure | cardinal_measure
        self.fst = self.add_tokens(measure).optimize()
Exemple #19
0
    def __init__(self, cardinal: GraphFst):
        """Build the ordinal classification graph.

        The ordinal ending of the input is rewritten (via the ordinal digit
        and teen lexicons, "tieth" -> "ty", or by dropping "th") and the
        result is composed with the cardinal graph. The composed graph is
        exposed as ``self.graph``; the tokenized version is ``self.fst``.

        Args:
            cardinal: cardinal grammar; its ``graph_no_exception`` is used.
        """
        super().__init__(name="ordinal", kind="classify")

        to_number = cardinal.graph_no_exception
        ordinal_digits = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
        ordinal_teens = pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))

        # Arbitrary prefix followed by one of the ordinal-ending rewrites.
        ending_rewrite = pynini.union(
            ordinal_digits,
            ordinal_teens,
            pynini.cross("tieth", "ty"),
            pynini.cross("th", ""),
        )
        ordinal_to_cardinal_text = pynini.closure(NEMO_CHAR) + ending_rewrite

        self.graph = ordinal_to_cardinal_text @ to_number

        tagged = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
Exemple #20
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """Build the telephone classification graph (country code + number).

        The input is a country code, a single space, and a number part of at
        least seven characters drawn from digits, "-", " ", "(" and ")".
        Digits of the number part are spelled out one by one. The untokenized
        graph is exposed as ``self.graph``.

        Args:
            cardinal: cardinal grammar; its ``two_digit_non_zero`` is used
                for two-digit country codes.
            deterministic: forwarded unchanged to the base class.
        """
        super().__init__(name="telephone",
                         kind="classify",
                         deterministic=deterministic)

        zero = pynini.invert(
            pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        ).optimize()
        non_zero_digit = pynini.invert(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ).optimize() | pynini.cross("1", "eins")
        any_digit = non_zero_digit | zero

        # One or more digits, each spelled out and separated by a space.
        spelled_digits = pynini.closure(any_digit + insert_space) + any_digit

        two_digits_or_zero = (
            NEMO_DIGIT**2 @ cardinal.two_digit_non_zero) | zero

        # Country code: optional "+" and a two-digit code (or zero), a
        # parenthesized zero-led digit run, or a bare zero-led digit run.
        optional_plus = pynini.closure(pynini.cross("+", "plus "), 0, 1)
        code = (
            (optional_plus + two_digits_or_zero)
            | (pynutil.delete("(") + zero + insert_space + spelled_digits
               + pynutil.delete(")"))
            | (zero + insert_space + spelled_digits)
        )
        country_code = (pynutil.insert("country_code: \"") + code
                        + pynutil.insert("\""))

        # "-" or " " between the two digit groups becomes a single space.
        group_separator = pynini.cross(pynini.union("-", " "), " ")
        digit_groups = spelled_digits + pynini.closure(
            group_separator + spelled_digits, 0, 1)

        # Length filter: at least seven characters from the allowed set.
        min_length = pynini.closure(
            (NEMO_DIGIT | pynini.union("-", " ", ")", "(")), 7)
        number = (pynutil.insert("number_part: \"")
                  + (min_length @ digit_groups)
                  + pynutil.insert("\""))

        self.graph = country_code + pynini.accep(" ") + number

        tokens = self.add_tokens(
            self.graph + pynutil.insert(" preserve_order: true"))
        self.fst = tokens.optimize()
Exemple #21
0
def _get_month_graph():
    """
    Load ``data/months.tsv`` and return it as an optimized month transducer,
    e.g. march -> march.
    """
    months_path = get_abs_path("data/months.tsv")
    return pynini.string_file(months_path).optimize()
Exemple #22
0
    def __init__(self):
        """Build the whitelist classification graph.

        Entries of ``data/whitelist.tsv`` are passed through
        ``convert_space`` and wrapped in a ``name: "..."`` field. The graph
        is stored in ``self.fst`` without being passed through
        ``add_tokens``.
        """
        super().__init__(name="whitelist", kind="classify")

        entries = pynini.string_file(get_abs_path("data/whitelist.tsv"))
        tagged = (
            pynutil.insert("name: \"")
            + convert_space(entries)
            + pynutil.insert("\"")
        )
        self.fst = tagged.optimize()
Exemple #23
0
    def __init__(self, number_names: dict, alternative_formats: dict, deterministic=False):
        """Build the ordinal classification graph.

        Besides plain ordinal numbers, inputs of the form ``<number>-<ending>``
        (e.g. 2-ая) are accepted; the verbalization is constrained to end in
        one of the known ordinal endings and the dash suffix is then removed.

        Exposes ``self.ordinal_numbers`` and
        ``self.ordinal_numbers_with_leading_zeros`` in addition to ``self.fst``.

        Args:
            number_names: must contain ``'ordinal_number_names'``.
            alternative_formats: must contain ``'one_thousand_alternative'``
                and ``'separators'``.
            deterministic: forwarded unchanged to the base class.
        """
        super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

        thousand_variants = alternative_formats['one_thousand_alternative']
        separators = alternative_formats['separators']

        ordinal = number_names['ordinal_number_names']
        ordinal |= ordinal @ thousand_variants

        plain_ordinals = separators @ ordinal

        # Handling of cases like 2-ая.
        known_endings = pynini.string_file(get_abs_path("data/numbers/ordinal_endings.tsv"))
        no_dash = pynini.closure(pynini.difference(NEMO_SIGMA, "-"))
        # Drops "-<anything without a dash>" at the end of the string.
        strip_dash_suffix = pynini.cdrewrite(pynini.cross("-" + no_dash, ""), "", "[EOS]", NEMO_SIGMA)

        number_then_suffix = ((separators @ ordinal).optimize() + pynini.accep("-") + no_dash).optimize()
        must_end_with_known_ending = (NEMO_SIGMA + known_endings).optimize()
        dashed_ordinals = (
            number_then_suffix @ must_end_with_known_ending @ strip_dash_suffix
        ).optimize()

        self.ordinal_numbers = plain_ordinals

        # "03" -> strip the leading zeros, then verbalize.
        strip_zeros = pynini.closure(pynini.cross("0", ""))
        self.ordinal_numbers_with_leading_zeros = (strip_zeros + plain_ordinals).optimize()

        all_ordinals = (plain_ordinals | dashed_ordinals).optimize()
        tagged = pynutil.insert("integer: \"") + all_ordinals + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
Exemple #24
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """Build the money classification graph.

        Accepts an integer or decimal amount followed by a currency name and
        an optional trailing cents amount. Emits ``integer_part`` (or the
        decimal graph's own fields), ``currency`` and, when present,
        ``fractional_part``.

        Args:
            cardinal: cardinal grammar; its ``graph_no_exception`` is used.
            decimal: decimal grammar; its ``final_graph_wo_negative`` is used.
        """
        super().__init__(name="money", kind="classify")

        number_graph = cardinal.graph_no_exception
        decimal_graph = decimal.final_graph_wo_negative
        half = pynini.cross("rưỡi", "5")

        # The currency lexicon is inverted before being used for matching.
        currency_lexicon = pynini.string_file(get_abs_path("data/currency.tsv"))
        currency_name = pynini.invert(currency_lexicon)
        currency_field = (
            pynutil.insert("currency: \"")
            + convert_space(currency_name)
            + pynutil.insert("\"")
        )

        # Two digits pass through; a single digit gets a leading "0".
        pad_to_two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynutil.insert("0") + NEMO_DIGIT)

        # Optional amount after the currency, e.g. "twelve dollars fifty".
        cents_value = (
            pynutil.add_weight(number_graph @ pad_to_two_digits, -0.7)
            | half
        )
        optional_cents = pynini.closure(
            delete_extra_space
            + pynutil.insert("fractional_part: \"")
            + cents_value
            + pynutil.insert("\""),
            0,
            1,
        )

        integer_amount = (
            pynutil.insert("integer_part: \"")
            + number_graph
            + pynutil.insert("\"")
            + delete_extra_space
            + currency_field
            + optional_cents
        )
        decimal_amount = (
            decimal_graph
            + delete_extra_space
            + currency_field
            + optional_cents
        )

        money = integer_amount | decimal_amount
        self.fst = self.add_tokens(money).optimize()
Exemple #25
0
    def __init__(self):
        """Build the time verbalization graph.

        Turns ``hours: "H" [minutes: "M"] [suffix: "am"|"pm"]`` into
        ``H h[M]``. With the "pm" suffix (mandatory in that branch) the hour
        is first mapped through ``data/time/hour_to_night.tsv``.
        """
        super().__init__(name="time", kind="verbalize")

        to_night_hour = pynini.string_file(get_abs_path("data/time/hour_to_night.tsv"))

        am_suffix = pynutil.delete("suffix: \"am\"")
        pm_suffix = pynutil.delete("suffix: \"pm\"")

        def quoted_digits(label, gap):
            # Strip `label`, the gap and the quotes; keep the 1-2 digits.
            return (
                pynutil.delete(label)
                + gap
                + pynutil.delete("\"")
                + pynini.closure(NEMO_DIGIT, 1, 2)
                + pynutil.delete("\"")
            )

        hour = quoted_digits("hours:", delete_space)
        minute = quoted_digits("minutes:", delete_extra_space)

        # Morning / unsuffixed branch: the hour is emitted unchanged.
        day_time = (
            hour
            + delete_extra_space
            + pynutil.insert("h")
            + minute.ques
            + delete_space
            + am_suffix.ques
        )

        # Evening branch: the hour is converted and "pm" is required.
        night_time = (
            (hour @ to_night_hour)
            + delete_extra_space
            + pynutil.insert("h")
            + minute.ques
            + delete_space
            + pm_suffix
        )

        verbalized = day_time | night_time
        self.fst = self.delete_tokens(verbalized).optimize()
Exemple #26
0
    def __init__(self):
        """Build the telephone classification graph.

        The grammar is written in the digits-to-words direction for the
        layouts ``ddd-ddd-dddd`` and ``(ddd)[-] ddd-dddd`` and then inverted,
        so the final graph maps spelled-out digits to the formatted number
        inside a ``number_part: "..."`` field.
        """
        super().__init__(name="telephone", kind="classify")

        # Local single-space deleter (shadows any outer `delete_space`).
        delete_space = pynutil.delete(' ')
        # Space inserted between the three components of the number.
        component_gap = pynutil.insert(" ")

        # Digit -> word; "0" may be read as "o", "oh" or "zero".
        spoken_digit = pynini.invert(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ).optimize() | pynini.cross("0", pynini.union("o", "oh", "zero"))

        # Area code written as "ddd-".
        dashed_area = (
            pynini.closure(spoken_digit + insert_space, 2, 2)
            + spoken_digit
            + pynutil.delete("-")
        )
        # Area code written as "(ddd)" with an optional dash, then a space.
        bracketed_area = (
            pynutil.delete("(")
            + pynini.closure(spoken_digit + insert_space, 2, 2)
            + spoken_digit
            + pynutil.delete(")")
            + pynini.closure(pynutil.delete("-"), 0, 1)
            + delete_space
        )

        digits_to_words = (
            (dashed_area | bracketed_area)
            + component_gap
            + pynini.closure(spoken_digit + insert_space, 2, 2)
            + spoken_digit
            + pynutil.delete("-")
            + component_gap
            + pynini.closure(spoken_digit + insert_space, 3, 3)
            + spoken_digit
        )

        tagged = (
            pynutil.insert("number_part: \"")
            + pynini.invert(digits_to_words)
            + pynutil.insert("\"")
        )
        self.fst = self.add_tokens(tagged).optimize()
Exemple #27
0
    def __init__(self, cardinal, deterministic: bool = True):
        """Build the fraction classification graph.

        Accepts ``[integer ]numerator/denominator`` (the slash may be
        surrounded by spaces, and the denominator may carry an ordinal
        ending such as "th"), as well as entries of
        ``data/number/fraction.tsv`` that are rewritten into that form. The
        untokenized graph is exposed as ``self.graph``.

        Args:
            cardinal: cardinal grammar; its ``graph`` is used.
            deterministic: forwarded unchanged to the base class.
        """
        super().__init__(name="fraction",
                         kind="classify",
                         deterministic=deterministic)
        number_graph = cardinal.graph

        integer_field = (pynutil.insert("integer_part: \"") + number_graph
                         + pynutil.insert("\""))

        # The slash (with or without surrounding spaces) closes the
        # numerator's quote and supplies the separating space.
        slash = pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")
        numerator_field = (pynutil.insert("numerator: \"") + number_graph
                           + slash)

        # Optional ordinal ending on the denominator, lower or upper case.
        lower_endings = ["rd", "th", "st", "nd"]
        all_endings = lower_endings + [e.upper() for e in lower_endings]
        drop_ending = pynini.closure(
            pynini.cross(pynini.union(*all_endings), ""), 0, 1)

        denominator_field = (pynutil.insert("denominator: \"") + number_graph
                             + drop_ending + pynutil.insert("\""))

        fraction_body = numerator_field + denominator_field

        # Explicit "a/b", optionally preceded by an integer and a space.
        explicit = pynini.closure(integer_field + pynini.accep(" "), 0,
                                  1) + fraction_body

        # Lexicon fractions; the space after the integer may be missing.
        from_lexicon = pynini.closure(
            integer_field + (pynini.accep(" ") | pynutil.insert(" ")), 0,
            1) + pynini.compose(
                pynini.string_file(get_abs_path("data/number/fraction.tsv")),
                fraction_body)

        self.graph = explicit | from_lexicon
        self.fst = self.add_tokens(self.graph).optimize()
Exemple #28
0
    def __init__(self, cardinal: GraphFst):
        """Build the fraction classification graph.

        Emits ``integer_part`` (optional), ``numerator`` and ``denominator``
        fields, optionally preceded by ``negative: "true"`` when the input
        starts with "minus". Exposes ``self.fractional`` (denominator graph)
        and ``self.final_graph_wo_negative``.

        Args:
            cardinal: cardinal grammar; its ``graph_no_exception`` is used.
        """
        super().__init__(name="fraction", kind="classify")

        number_graph = cardinal.graph_no_exception
        fraction_endings = pynini.string_file(get_abs_path("data/fractions.tsv"))

        # Denominator: rewrite the fractional ending, then read as a cardinal.
        self.fractional = (
            (pynini.closure(NEMO_CHAR) + fraction_endings) @ number_graph
        ).optimize()

        def field(opening, body):
            # Wrap `body` as `<opening>...<closing quote>`.
            return pynutil.insert(opening) + body + pynutil.insert("\"")

        integer_field = field("integer_part: \"", number_graph)
        numerator_field = field("numerator: \"", number_graph)
        denominator_field = field("denominator: \"", self.fractional)

        unsigned = (
            pynini.closure(integer_field + delete_space, 0, 1)
            + numerator_field
            + delete_space
            + insert_space
            + denominator_field
        )
        unsigned = unsigned.optimize()
        self.final_graph_wo_negative = unsigned

        optional_minus = pynini.closure(
            pynutil.insert("negative: ")
            + pynini.cross("minus", "\"true\"")
            + delete_extra_space,
            0,
            1,
        )

        self.fst = self.add_tokens(optional_minus + unsigned).optimize()
Exemple #29
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """Build the money classification graph (currency first).

        The currency comes first in the input. The amount "1" selects the
        singular currency name and is emitted as "one"; every other integer
        and all decimal amounts use the plural currency name.

        Args:
            cardinal: cardinal grammar; its ``graph`` is used.
            decimal: decimal grammar; its ``final_graph_wo_negative`` is used.
        """
        super().__init__(name="money", kind="classify")
        number_graph = cardinal.graph
        decimal_graph = decimal.final_graph_wo_negative

        currency = pynini.string_file(get_abs_path("data/currency.tsv"))
        # The plural form is derived before the singular one is space-converted.
        plural_name = convert_space(currency @ SINGULAR_TO_PLURAL)
        singular_name = convert_space(currency)

        singular_currency = (pynutil.insert("currency: \"") + singular_name
                             + pynutil.insert("\""))
        plural_currency = (pynutil.insert("currency: \"") + plural_name
                           + pynutil.insert("\""))

        # Any amount other than the literal "1" takes the plural currency.
        plural_amount = (
            plural_currency
            + pynutil.insert(" integer_part: \"")
            + ((NEMO_SIGMA - "1") @ number_graph)
            + pynutil.insert("\"")
        )
        # Exactly "1" takes the singular currency.
        singular_amount = (
            singular_currency
            + pynutil.insert(" integer_part: \"")
            + pynini.cross("1", "one")
            + pynutil.insert("\"")
        )
        integer_amount = plural_amount | singular_amount

        decimal_amount = plural_currency + insert_space + decimal_graph

        money = integer_amount | decimal_amount
        self.fst = self.add_tokens(money).optimize()
Exemple #30
0
    def __init__(self, deterministic: bool = True):
        """Build the electronic verbalization graph.

        Consumes an optional ``username: "..."`` field followed by a
        ``domain: "..."`` field and spells the contents out: digits and
        symbols are replaced through their lexicons, other characters are
        emitted one by one separated by spaces, "at " is inserted between
        username and domain, and "." before the domain suffix becomes "dot ".

        Args:
            deterministic: forwarded unchanged to the base class.
        """
        super().__init__(name="electronic",
                         kind="verbalize",
                         deterministic=deterministic)
        graph_digit = pynini.invert(
            pynini.string_file(
                get_abs_path("data/numbers/digit.tsv"))).optimize()
        graph_symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).optimize()

        # The field prefix was corrupted in this copy (masked by a credential
        # scrubber); it is restored to mirror the `domain:` field below:
        # label, optional spaces, opening quote.
        user_name = (
            pynutil.delete("username:") + delete_space +
            pynutil.delete("\"") +
            (pynini.closure(
                # Lexicon rewrites are slightly preferred (lower weight) over
                # the generic character-by-character fallback.
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(
                    pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
            pynutil.delete("\""))

        # Generic domain suffix: spaced characters, "dot", spaced characters.
        domain_default = (pynini.closure(NEMO_NOT_QUOTE + insert_space) +
                          pynini.cross(".", "dot ") + NEMO_NOT_QUOTE +
                          pynini.closure(insert_space + NEMO_NOT_QUOTE))

        # Generic server name: alphanumerics, optional symbols, alphanumerics.
        server_default = (pynini.closure(
            (graph_digit | NEMO_ALPHA) + insert_space, 1) +
                          pynini.closure(graph_symbols + insert_space) +
                          pynini.closure(
                              (graph_digit | NEMO_ALPHA) + insert_space, 1))
        # Known server names and domain suffixes come from lexicon files.
        server_common = pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")) + insert_space

        domain_common = pynini.cross(".", "dot ") + pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))

        # Lexicon entries (weight 1.09) win over the generic forms (1.1).
        domain = (pynutil.delete("domain:") + delete_space +
                  pynutil.delete("\"") +
                  (pynutil.add_weight(server_common, 1.09)
                   | pynutil.add_weight(server_default, 1.1)) +
                  (pynutil.add_weight(domain_common, 1.09)
                   | pynutil.add_weight(domain_default, 1.1)) + delete_space +
                  pynutil.delete("\""))

        # The username is optional; when present it is followed by "at ".
        graph = (pynini.closure(
            user_name + delete_space + pynutil.insert("at ") + delete_space, 0,
            1) + domain + delete_space)

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()