Ejemplo n.º 1
0
    def __init__(self, deterministic: bool = True):
        """Assemble the cardinal classification FST and store it in ``self.fst``.

        Besides ``self.fst`` this sets ``self.graph``,
        ``self.graph_hundred_component_at_least_one_none_zero_digit`` and
        ``self.single_digits_graph``; ``self.range_graph`` is only created
        when ``deterministic`` is False.

        Args:
            deterministic: forwarded to the base class. When False,
                ``self.graph`` is widened with digit-by-digit readings and
                the output of ``get_hundreds_graph()``, and range
                expressions are added to the final graph.
        """
        super().__init__(name="cardinal",
                         kind="classify",
                         deterministic=deterministic)

        number_names = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

        # Two or three digits, or a single digit other than "0".
        two_or_three_digits = pynini.closure(NEMO_DIGIT, 2, 3)
        non_zero_digit = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            two_or_three_digits | non_zero_digit) @ number_names

        # One to three digits followed by any number of three-digit groups,
        # each optionally preceded by a "," that is dropped.
        optional_comma = pynini.closure(pynutil.delete(","), 0, 1)
        three_digit_group = (optional_comma + NEMO_DIGIT + NEMO_DIGIT +
                             NEMO_DIGIT)
        digit_string = (pynini.closure(NEMO_DIGIT, 1, 3) +
                        pynini.closure(three_digit_group))
        self.graph = digit_string @ number_names

        digit_names = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        zero_names = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

        # A single digit read by name (weight 1.2), or "0" read as "oh"
        # (weight 1.1).
        named_digit = pynutil.add_weight(
            pynini.invert(digit_names | zero_names), 1.2)
        oh_digit = pynutil.add_weight(pynini.cross("0", "oh"), 1.1)
        one_digit = named_digit | oh_digit
        self.single_digits_graph = one_digit + pynini.closure(
            pynutil.insert(" ") + one_digit)

        if not deterministic:
            # Digit-by-digit reading of comma-separated numbers: a leading
            # part followed by one or more ",ddd" groups.
            leading_part = pynini.closure(
                self.single_digits_graph + pynutil.insert(" "), 1, 3)
            comma_triplet = (pynutil.delete(",") + one_digit +
                             pynutil.insert(" ") + one_digit +
                             pynutil.insert(" ") + one_digit)
            digits_with_commas = leading_part + pynini.closure(
                comma_triplet, 1)

            self.graph |= (self.single_digits_graph | get_hundreds_graph()
                           | digits_with_commas)

            # "<a>-<b>" -> "[from ]<a> to <b>" or "<a> <b>".
            optional_from = pynini.closure(pynutil.insert("from "), 0, 1)
            dash_reading = (pynini.cross("-", " to ")
                            | pynini.cross("-", " "))
            self.range_graph = (optional_from + self.graph + dash_reading +
                                self.graph)

            # "<a>x<b>" / "<a> x <b>" -> "<a> by <b>".
            by_reading = (pynini.cross("x", " by ")
                          | pynini.cross(" x ", " by "))
            self.range_graph |= self.graph + by_reading + self.graph
            self.range_graph = self.range_graph.optimize()

        # Optional leading "-" becomes the field  negative: "true" .
        sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)
        weighted_serial = pynutil.add_weight(self.get_serial_graph(), 1.2)
        number_body = self.graph | weighted_serial

        if not deterministic:
            number_body |= self.range_graph

        tagged = (sign + pynutil.insert("integer: \"") + number_body +
                  pynutil.insert("\""))
        self.fst = self.add_tokens(tagged).optimize()
Ejemplo n.º 2
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """Assemble the money classification FST and store it in ``self.fst``.

        Args:
            cardinal: grammar object whose ``graph`` attribute verbalizes the
                integer amount.
            decimal: grammar object whose ``final_graph_wo_negative``
                attribute handles amounts with a fractional part.
            deterministic: forwarded to the base class. When False, the
                integer amount may also go through
                ``get_hundreds_graph(deterministic)`` and the singular
                "one" form is also allowed in front of a decimal amount.
        """
        super().__init__(name="money",
                         kind="classify",
                         deterministic=deterministic)
        number_names = cardinal.graph
        decimal_body = decimal.final_graph_wo_negative

        # Currency units from the TSV; the plural variant is obtained by
        # composing with SINGULAR_TO_PLURAL before spaces are converted.
        currency_units = pynini.string_file(
            get_abs_path("data/currency/currency.tsv"))
        plural_units = convert_space(currency_units @ SINGULAR_TO_PLURAL)
        singular_units = convert_space(currency_units)

        singular_tag = (pynutil.insert("currency: \"") + singular_units +
                        pynutil.insert("\""))
        plural_tag = (pynutil.insert("currency: \"") + plural_units +
                      pynutil.insert("\""))

        # Exactly "1" pairs with the singular unit.
        one_unit = (singular_tag + pynutil.insert(" integer_part: \"") +
                    pynini.cross("1", "one") + pynutil.insert("\""))

        amount_with_fraction = plural_tag + insert_space + decimal_body

        # Choose what verbalizes the integer amount; the surrounding tags
        # are the same in both modes.
        if deterministic:
            integer_names = number_names
        else:
            integer_names = get_hundreds_graph(deterministic) | number_names
            amount_with_fraction |= one_unit + insert_space + decimal_body

        # Any input other than the literal "1" takes the plural unit.
        plural_amount = (plural_tag + pynutil.insert(" integer_part: \"") +
                         ((NEMO_SIGMA - "1") @ integer_names) +
                         pynutil.insert("\""))
        plural_amount |= one_unit

        tagged = self.add_tokens(plural_amount | amount_with_fraction)
        self.fst = tagged.optimize()
Ejemplo n.º 3
0
    def __init__(self, deterministic: bool = True):
        """Assemble the cardinal classification FST and store it in ``self.fst``.

        Besides ``self.fst`` this sets ``self.graph``,
        ``self.graph_hundred_component_at_least_one_none_zero_digit`` and
        ``self.single_digits_graph``; ``self.range_graph`` is only created
        when ``deterministic`` is False.

        Args:
            deterministic: forwarded to the base class. Selects which set of
                alternatives is unioned into the final graph (see the two
                branches near the end of the method).
        """
        super().__init__(name="cardinal",
                         kind="classify",
                         deterministic=deterministic)
        # TODO: replace to have "oh" as a default for "0"
        number_names = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

        # Two or three digits, or a single digit other than "0".
        two_or_three_digits = pynini.closure(NEMO_DIGIT, 2, 3)
        non_zero_digit = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            two_or_three_digits | non_zero_digit) @ number_names

        # One to three digits followed by any number of three-digit groups,
        # each optionally preceded by a "," that is dropped.
        optional_comma = pynini.closure(pynutil.delete(","), 0, 1)
        three_digit_group = (optional_comma + NEMO_DIGIT + NEMO_DIGIT +
                             NEMO_DIGIT)
        digit_string = (pynini.closure(NEMO_DIGIT, 1, 3) +
                        pynini.closure(three_digit_group))
        self.graph = digit_string @ number_names

        digit_names = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        zero_names = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

        spoken_digit = pynini.invert(digit_names | zero_names)
        self.single_digits_graph = spoken_digit + pynini.closure(
            insert_space + spoken_digit)

        if not deterministic:
            # Within one token keep a single style for "0":
            # "007" -> {"oh oh seven", "zero zero seven"}, never
            # "oh zero seven".
            digit_zero_style = pynini.invert(digit_names | zero_names)
            digit_oh_style = pynini.invert(digit_names) | pynini.cross(
                "0", "oh")

            zero_style_sequence = digit_zero_style + pynini.closure(
                insert_space + digit_zero_style)
            oh_style_sequence = digit_oh_style + pynini.closure(
                insert_space + digit_oh_style)
            self.single_digits_graph = zero_style_sequence | oh_style_sequence

            # Digit-by-digit reading of comma-separated numbers: a leading
            # part followed by one or more ",ddd" groups.
            leading_part = pynini.closure(
                self.single_digits_graph + insert_space, 1, 3)
            comma_triplet = (pynutil.delete(",") + spoken_digit +
                             insert_space + spoken_digit + insert_space +
                             spoken_digit)
            single_digits_graph_with_commas = leading_part + pynini.closure(
                comma_triplet, 1)

            # "<a>-<b>" -> "from <a> to <b>"
            from_to = (pynutil.insert("from ") + self.graph +
                       pynini.cross("-", " to ") + self.graph)
            # "<a>x<b>" / "<a> x <b>" -> "<a> by <b>"
            by_reading = (pynini.cross("x", " by ")
                          | pynini.cross(" x ", " by "))
            dimensions = self.graph + by_reading + self.graph
            # Same "from ... to ..." pattern over get_hundreds_graph().
            hundreds_from_to = (pynutil.insert("from ") +
                                get_hundreds_graph() +
                                pynini.cross("-", " to ") +
                                get_hundreds_graph())
            self.range_graph = (from_to | dimensions
                                | hundreds_from_to).optimize()

        serial_graph = self.get_serial_graph()
        # Optional leading "-" becomes the field  negative: "true" .
        sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        if deterministic:
            # Five or more NEMO_DIGIT symbols are read digit by digit, with
            # a small negative weight attached to that reading.
            long_numbers = pynini.compose(NEMO_DIGIT**(5, ...),
                                          self.single_digits_graph).optimize()
            weighted_long = pynutil.add_weight(long_numbers, -0.001)
            # Anything starting with "0" is read digit by digit.
            leading_zero_reading = pynini.compose(
                pynini.accep("0") + pynini.closure(NEMO_DIGIT),
                self.single_digits_graph)
            number_body = self.graph | serial_graph | weighted_long
            number_body |= leading_zero_reading
        else:
            # Leading zeros read digit by digit, the remainder as a number.
            zeros_prefix = pynini.compose(
                pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
            remainder = pynini.compose(pynini.closure(NEMO_DIGIT), self.graph)
            leading_zero_reading = (zeros_prefix + pynutil.insert(" ") +
                                    remainder)
            hundreds = get_hundreds_graph()
            weighted_commas = pynutil.add_weight(
                single_digits_graph_with_commas, 0.001)

            number_body = (self.graph
                           | serial_graph
                           | self.range_graph
                           | self.single_digits_graph
                           | hundreds
                           | weighted_commas
                           | leading_zero_reading)

        tagged = (sign + pynutil.insert("integer: \"") + number_body +
                  pynutil.insert("\""))
        self.fst = self.add_tokens(tagged).optimize()
Ejemplo n.º 4
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """Build the "money" classification grammar and store it in ``self.fst``.

        Args:
            cardinal: grammar object whose ``graph`` attribute is used for
                the integer part of an amount.
            decimal: grammar object; this method reads its
                ``final_graph_wo_negative`` attribute and, in the
                non-deterministic branch, ``graph_integer`` and
                ``graph_fractional``.
            deterministic: forwarded to the base class. When False, the
                final graph is the union of the per-currency graphs built in
                the loop below instead of ``graph_integer | graph_decimal``.
        """
        super().__init__(name="money",
                         kind="classify",
                         deterministic=deterministic)
        cardinal_graph = cardinal.graph
        graph_decimal_final = decimal.final_graph_wo_negative

        # Currency units from the TSV; the plural variant is obtained by
        # composing with SINGULAR_TO_PLURAL before spaces are converted.
        unit_singular = pynini.string_file(
            get_abs_path("data/currency/currency.tsv"))
        unit_plural = convert_space(unit_singular @ SINGULAR_TO_PLURAL)
        unit_singular = convert_space(unit_singular)

        graph_unit_singular = pynutil.insert(
            "currency: \"") + unit_singular + pynutil.insert("\"")
        graph_unit_plural = pynutil.insert(
            "currency: \"") + unit_plural + pynutil.insert("\"")

        # Exactly "1" pairs with the singular unit.
        singular_graph = (graph_unit_singular +
                          pynutil.insert(" integer_part: \"") +
                          pynini.cross("1", "one") + pynutil.insert("\""))

        graph_decimal = graph_unit_plural + insert_space + graph_decimal_final

        # Any input other than the literal "1" takes the plural unit; the
        # non-deterministic variant also accepts get_hundreds_graph output.
        if deterministic:
            graph_integer = (graph_unit_plural +
                             pynutil.insert(" integer_part: \"") +
                             ((NEMO_SIGMA - "1") @ cardinal_graph) +
                             pynutil.insert("\""))
        else:
            graph_integer = (
                graph_unit_plural + pynutil.insert(" integer_part: \"") +
                ((NEMO_SIGMA - "1")
                 @ (get_hundreds_graph(deterministic) | cardinal_graph)) +
                pynutil.insert("\""))
            graph_decimal |= singular_graph + insert_space + graph_decimal_final

        graph_integer |= singular_graph

        final_graph = graph_integer | graph_decimal

        if not deterministic:
            # In this mode final_graph is rebuilt from scratch below;
            # graph_decimal is still used (composed per currency in the
            # loop), while the union assigned just above is overwritten.
            currencies = load_labels(
                get_abs_path("data/currency/currency.tsv"))
            # Either drops a "0" or keeps it unchanged.
            zero_graph = pynini.cross("0", "") | pynini.accep("0")
            # add minor currency part only when there are two digits after the point
            # .01 -> {zero one cent, one cent}, .05 -> {oh five, five cents}
            # Accepted tails: "." + two digits, "." + (kept or dropped "0")
            # + non-zero digit, or a deleted "." followed by deleted zeros.
            two_digits_fractional_part = (
                NEMO_SIGMA + pynini.closure(NEMO_DIGIT) +
                ((pynini.accep(".") + (NEMO_DIGIT**(2) | zero_graph +
                                       (NEMO_DIGIT - "0")))
                 | pynutil.delete(".") +
                 pynini.cross(pynini.closure("0", 1), "")))

            # Accumulators: each holds the union over all currencies seen so
            # far, starting from None.
            # NOTE(review): if `currencies` were empty these would still be
            # None at the final union after the loop — confirm the TSV is
            # never empty.
            integer_graph = None
            decimal_graph_with_minor = None
            decimal_graph_default = None

            for curr_symbol, curr_name in currencies:
                # The symbol is removed from the input and re-emitted as a
                # trailing currency field.
                curr_symbol_graph = pynutil.delete(curr_symbol)
                graph_end = pynutil.insert(" currency: \"" + curr_symbol +
                                           "\"")
                preserve_order = pynutil.insert(" preserve_order: True")
                integer_part = decimal.graph_integer + graph_end + preserve_order

                # "$4" -> 'integer_part: "four" currency: "$" preserve_order: True' -> four dollars
                integer_graph_curr = curr_symbol_graph + integer_part
                # remove fractional part if it contains only zeros
                # "$4.00" -> 'integer_part: "four" currency: "$" preserve_order: True' -> four dollars
                integer_graph_curr |= pynini.compose(
                    two_digits_fractional_part, integer_graph_curr)
                # Symbol, optional integer part, then "." replaced by a space
                # and the fractional part, ending with the currency field.
                decimal_graph_with_minor_curr = (
                    curr_symbol_graph + pynini.closure(integer_part, 0, 1) +
                    pynini.cross(".", " ") + decimal.graph_fractional +
                    graph_end)

                # The three alternatives below rewrite the tagged output of
                # graph_decimal: the leading currency field is deleted and a
                # currency field is inserted at the end instead.
                # "$.5" -> 'fractional_part: "five" currency: "dollars"' -> point five dollars
                decimal_graph_default_curr = (
                    pynutil.delete("currency: \"" +
                                   pynini.compose(curr_symbol, unit_plural) +
                                   "\"") + delete_space +
                    pynini.accep("fractional_part") + NEMO_SIGMA +
                    pynutil.insert(" currency: \"" +
                                   pynini.compose(curr_symbol, unit_plural) +
                                   "\""))

                # "$4.5" -> 'integer_part: "four" fractional_part: "five" currency: "dollars"' -> "four point five dollars"
                decimal_graph_default_curr |= (
                    pynutil.delete("currency: \"" + curr_name +
                                   pynini.closure(NEMO_NOT_QUOTE) + "\"") +
                    delete_space + pynini.accep("integer_part") + NEMO_SIGMA +
                    pynini.accep("fractional_part") + NEMO_SIGMA +
                    pynutil.insert(" currency: \"" +
                                   pynini.compose(curr_symbol, unit_plural) +
                                   "\""))

                # "£4 billion" -> 'integer_part: "four" quantity: "billion" currency: "pounds"' -> "four billion pounds"
                decimal_graph_default_curr |= (
                    pynutil.delete("currency: \"") + pynutil.delete(
                        rewrite.rewrite_lattice(
                            curr_symbol,
                            pynini.compose(curr_symbol, unit_plural)) + "\" ")
                    + pynini.difference(NEMO_SIGMA, "fractional_part") +
                    pynutil.insert(" currency: \"" +
                                   pynini.compose(curr_symbol, unit_plural) +
                                   "\""))

                # Restrict the minor-unit graph to inputs matching
                # two_digits_fractional_part, and feed graph_decimal's
                # output through the reordering rules above.
                decimal_graph_with_minor_curr = pynini.compose(
                    two_digits_fractional_part, decimal_graph_with_minor_curr)
                decimal_graph_default_curr = pynini.compose(
                    graph_decimal, decimal_graph_default_curr)

                # Fold this currency's graphs into the accumulators.
                integer_graph = (integer_graph_curr
                                 if integer_graph is None else pynini.union(
                                     integer_graph, integer_graph_curr))
                decimal_graph_with_minor = (decimal_graph_with_minor_curr
                                            if decimal_graph_with_minor is None
                                            else pynini.union(
                                                decimal_graph_with_minor,
                                                decimal_graph_with_minor_curr))
                decimal_graph_default = (
                    decimal_graph_default_curr
                    if decimal_graph_default is None else pynini.union(
                        decimal_graph_default, decimal_graph_default_curr))

            # Replaces the deterministic-style union computed before this
            # branch.
            final_graph = decimal_graph_with_minor | decimal_graph_default | integer_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()