Exemple #1
0
    def __init__(
        self,
        itn_cardinal_tagger: GraphFst,
        itn_decimal_tagger: GraphFst,
        itn_fraction_tagger: GraphFst,
        deterministic: bool = True,
    ):
        """Build the "measure" classification grammar.

        Accepts an optionally negated ("minus") cardinal, decimal or fraction
        followed by a unit, and emits `cardinal { ... } units: "..."` or
        `decimal { ... } units: "..."` wrapped by `add_tokens`.

        Args:
            itn_cardinal_tagger: provides `graph_no_exception` for integers.
            itn_decimal_tagger: provides `final_graph_wo_negative` for decimals.
            itn_fraction_tagger: provides `graph` for fractions.
            deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="measure", kind="classify", deterministic=deterministic)

        quote = pynutil.insert("\"")
        close_brace = pynutil.insert(" }")

        # "ein"/"eine" spanning the whole input ([BOS]..[EOS]) is rewritten to
        # "eins" before the cardinal grammar is applied.
        one_rewrite = pynini.cdrewrite(
            pynini.cross(pynini.union("ein", "eine"), "eins"), "[BOS]", "[EOS]", NEMO_SIGMA
        )
        number = one_rewrite @ itn_cardinal_tagger.graph_no_exception

        abbr_from_singular = pynini.invert(unit_singular)  # singular -> abbr
        abbr_from_plural = pynini.invert(singular_to_plural()) @ abbr_from_singular  # plural -> abbr
        any_unit = convert_space(abbr_from_plural | abbr_from_singular)
        singular_only = convert_space(abbr_from_singular)

        # Optional leading "minus" becomes the field `negative: "true"`.
        sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("minus", "\"true\"") + delete_extra_space,
            0,
            1,
        )

        # "pro <singular unit>" is rendered as "/<abbr>".
        per_unit = pynutil.insert("/") + pynutil.delete("pro") + delete_space + singular_only
        # "<unit> pro <unit>" is allowed too, at a small extra cost.
        compound_unit = pynutil.add_weight(any_unit + delete_space + per_unit, 0.01)
        units_field = pynutil.insert("units: \"") + (any_unit | per_unit | compound_unit) + quote

        decimal_branch = (
            pynutil.insert("decimal { ")
            + sign
            + itn_decimal_tagger.final_graph_wo_negative
            + close_brace
            + delete_extra_space
            + units_field
        )

        # Fractions are emitted inside a `decimal { integer_part: "..." }` block.
        fraction_branch = (
            pynutil.insert("decimal { ")
            + sign
            + pynutil.insert("integer_part: \"")
            + itn_fraction_tagger.graph
            + pynutil.insert("\" }")
            + delete_extra_space
            + units_field
        )

        cardinal_branch = (
            pynutil.insert("cardinal { ")
            + sign
            + pynutil.insert("integer: \"")
            + number
            + quote
            + close_brace
            + delete_extra_space
            + units_field
        )

        tagged = self.add_tokens(cardinal_branch | decimal_branch | fraction_branch)
        self.fst = tagged.optimize()
Exemple #2
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """Build the "measure" classification grammar.

        Accepts an optionally negated ("menos") cardinal or decimal followed by
        a unit. "un"/"una"/"uno" take a singular unit; every other cardinal and
        all decimals take a plural unit.

        Args:
            cardinal: provides `graph_no_exception` for integers.
            decimal: provides `final_graph_wo_negative` for decimals.
        """
        super().__init__(name="measure", kind="classify")

        # Both tables are inverted so that the spelled-out unit maps to its abbreviation.
        singular_abbr = pynini.invert(
            pynini.string_file(get_abs_path("data/measurements_singular.tsv"))
        )  # singular -> abbr
        plural_abbr = pynini.invert(
            pynini.string_file(get_abs_path("data/measurements_plural.tsv"))
        )  # plural -> abbr

        # Optional leading "menos" becomes the field `negative: "true"`.
        sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("menos", "\"true\"") + delete_extra_space,
            0,
            1,
        )

        # "por <singular unit>" is rendered as "/<abbr>".
        per_unit = (
            pynutil.insert("/") + pynutil.delete("por") + delete_space + convert_space(singular_abbr)
        )

        def _units_field(base):
            """Wrap `base`, the per-unit form, or their combination in `units: "..."`."""
            combined = pynutil.add_weight(base + delete_space + per_unit, 0.01)
            return pynutil.insert("units: \"") + (base | per_unit | combined) + pynutil.insert("\"")

        units_singular = _units_field(convert_space(singular_abbr))
        units_plural = _units_field(convert_space(plural_abbr))

        decimal_branch = (
            pynutil.insert("decimal { ")
            + sign
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + units_plural
        )

        cardinal_open = pynutil.insert("cardinal { ") + sign + pynutil.insert("integer: \"")
        cardinal_close = pynutil.insert("\"") + pynutil.insert(" }") + delete_extra_space

        # Anything except the words for "one" goes through the cardinal grammar.
        many = (NEMO_SIGMA - "un" - "una" - "uno") @ cardinal.graph_no_exception
        one = pynini.cross("un", "1") | pynini.cross("una", "1") | pynini.cross("uno", "1")

        cardinal_branch = cardinal_open + many + cardinal_close + units_plural
        cardinal_branch |= cardinal_open + one + cardinal_close + units_singular

        tagged = self.add_tokens(decimal_branch | cardinal_branch)
        self.fst = tagged.optimize()
Exemple #3
0
    def __init__(self, input_case: str, deterministic: bool = True):
        """Build the "whitelist" classification grammar.

        The grammar is the union of the whitelist table, the measurement-unit
        table (minus single-letter keys) and space-separated sequences of
        `TO_LATIN`. The result is stored in `self.final_graph`, and wrapped in
        a `name: "..."` field for `self.fst`.

        Args:
            input_case: accepted for interface compatibility; see the note in `_load`.
            deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _load(relative_path):
            """Load a label file and build a string map with lower-cased keys."""
            rows = load_labels(get_abs_path(relative_path))
            # NOTE(review): the first column is lower-cased for every value of
            # `input_case`, not only "lower_cased" -- confirm this is intended.
            lowered = [[row[0].lower()] + row[1:] for row in rows]
            return pynini.string_map(lowered)

        graph = _load("data/whitelist.tsv")

        units = _load("data/measurements.tsv")
        # do not replace single letter units, like `м` or `°`:
        # keep only unit keys that are not a single NEMO_ALPHA character.
        allowed_keys = pynini.difference(pynini.project(units, "input"), NEMO_ALPHA)
        units = pynini.compose(allowed_keys, units).optimize()

        graph |= units
        graph |= TO_LATIN + pynini.closure(pynutil.insert(" ") + TO_LATIN)

        self.final_graph = convert_space(graph)
        named = pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")
        self.fst = named.optimize()
Exemple #4
0
    def __init__(self, itn_cardinal_tagger: GraphFst, tn_fraction_verbalizer: GraphFst, deterministic: bool = True):
        """Build the "fraction" classification grammar.

        The grammar first applies the inverse of the fraction verbalizer graph,
        then strips the `integer_part` / `numerator` / `denominator` field
        markers, maps each field value through the cardinal grammar and joins
        numerator and denominator with "/". The composed graph is kept in
        `self.graph`; `self.fst` wraps it in a `name: "..."` field.

        Args:
            itn_cardinal_tagger: provides `graph_no_exception`, applied to every field value.
            tn_fraction_verbalizer: provides `graph`, whose inverse is the first stage.
                The object itself is left unmodified.
            deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="fraction", kind="classify", deterministic=deterministic)
        # pynini.invert() works on a copy. The Fst.invert() method would flip the
        # verbalizer's own graph in place and break it for any other user.
        tagger = pynini.invert(tn_fraction_verbalizer.graph).optimize()

        # Optional `negative: "true" ` becomes a leading "-".
        delete_optional_sign = pynini.closure(pynutil.delete("negative: ") + pynini.cross("\"true\" ", "-"), 0, 1)

        # Each marker graph removes `field: "` ... `"` and converts the quoted
        # value with the cardinal grammar.
        delete_integer_marker = (
            pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        ) @ itn_cardinal_tagger.graph_no_exception

        delete_numerator_marker = (
            pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        ) @ itn_cardinal_tagger.graph_no_exception

        # `@` binds tighter than `+`: the "/" is inserted in front of the
        # already-converted denominator.
        delete_denominator_marker = (
            pynutil.insert('/')
            + (pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
            @ itn_cardinal_tagger.graph_no_exception
        )

        # [integer " "] numerator "/" denominator
        graph = (
            pynini.closure(delete_integer_marker + pynini.accep(" "), 0, 1)
            + delete_numerator_marker
            + delete_space
            + delete_denominator_marker
        ).optimize()
        verbalizer = delete_optional_sign + graph

        self.graph = tagger @ verbalizer

        graph = pynutil.insert("name: \"") + convert_space(self.graph) + pynutil.insert("\"")
        self.fst = graph.optimize()
Exemple #5
0
    def __init__(self, tn_cardinal_tagger: GraphFst, deterministic: bool = True):
        """Build the "telephone" classification grammar.

        A written-to-spoken graph for numbers shaped like `(0ddd) dddd-dddd`
        (d = 1..9) is constructed and then inverted, so the resulting grammar
        maps the space-separated spoken digits back to the written number.

        Args:
            tn_cardinal_tagger: provides `two_digit_non_zero`, used to verbalize one digit.
            deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        nonzero_digits = [str(value) for value in range(1, 10)]
        spoken_digit = pynini.union(*nonzero_digits) @ tn_cardinal_tagger.two_digit_non_zero
        spoken_zero = pynini.cross("0", "null")

        def _digit_run(count):
            """`count` digits, with a space inserted between neighbours."""
            return pynini.closure(spoken_digit + insert_space, count - 1, count - 1) + spoken_digit

        area_code = (
            pynutil.delete("(") + spoken_zero + insert_space + _digit_run(3) + pynutil.delete(")")
        )
        written_to_spoken = (
            area_code
            + pynini.accep(" ")  # separator between components
            + _digit_run(4)
            + pynutil.delete("-")
            + insert_space
            + _digit_run(4)
        )

        spoken_to_written = convert_space(pynini.invert(written_to_spoken))
        named = pynutil.insert("name: \"") + spoken_to_written + pynutil.insert("\"")
        self.fst = named.optimize()
0
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        """Build the "whitelist" classification grammar.

        Args:
            input_case: when "lower_cased", the first column of every table is lower-cased.
            deterministic: when True, a user-supplied `input_file` replaces the
                default whitelist; when False, it is added to it, together with
                a lightly weighted lower-cased copy of the default whitelist and
                the measurement-unit table.
            input_file: optional path to an additional whitelist file.
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _load(case, path):
            """Load a label file as a string map, lower-casing keys if requested."""
            entries = load_labels(path)
            if case == "lower_cased":
                entries = [[entry[0].lower()] + entry[1:] for entry in entries]
            return pynini.string_map(entries)

        default_path = get_abs_path("data/whitelist.tsv")
        graph = _load(input_case, default_path)

        if not deterministic and input_case != "lower_cased":
            # Also accept lower-cased keys, at a small extra cost.
            lowered_default = _load("lower_cased", default_path)
            graph |= pynutil.add_weight(lowered_default, weight=0.0001)

        if input_file:
            provided = _load(input_case, input_file)
            if deterministic:
                graph = provided
            else:
                graph |= provided

        if not deterministic:
            graph |= _load(input_case, get_abs_path("data/measure/measurements.tsv"))

        self.graph = graph
        self.final_graph = convert_space(self.graph).optimize()
        named = pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")
        self.fst = named.optimize()
Exemple #7
0
    def __init__(self, input_case: str, deterministic: bool = True):
        """Build the "whitelist" classification grammar.

        Args:
            input_case: when "lower_cased", whitelist keys are lower-cased.
            deterministic: when False, the lower-cased whitelist and the
                alternatives table (cased and lower-cased) are added as well.
        """
        super().__init__(name="whitelist", kind="classify")

        def _pairs(relative_path):
            """Read a two-column label file as a list of (key, value) tuples."""
            return [(key, value) for key, value in load_labels(get_abs_path(relative_path))]

        def _whitelist(case, relative_path="data/whitelist.tsv"):
            """String map of the whitelist, with keys lower-cased if requested."""
            pairs = _pairs(relative_path)
            if case == "lower_cased":
                pairs = [(key.lower(), value) for key, value in pairs]
            return pynini.string_map(pairs)

        def _alternatives(relative_path="data/whitelist_alternatives.tsv"):
            """String map of the alternatives: fully lower-cased rows plus the rows as written."""
            pairs = _pairs(relative_path)
            lowered = [(key.lower(), value.lower()) for key, value in pairs]
            return pynini.string_map(lowered + pairs)

        graph = _whitelist(input_case)
        if not deterministic:
            graph |= _whitelist("lower_cased") | _alternatives()

        self.graph = convert_space(graph).optimize()
        named = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
        self.fst = named.optimize()
Exemple #8
0
    def __init__(self, deterministic: bool = True):
        """Build the "word" classification grammar.

        A word is a non-empty run of non-space characters that contains no
        digit and none of `$ € ₩ £ ¥ # %`. Bracketed phoneme sequences such as
        `[HH AH0 L OW1]` take priority and are passed through as one token.

        Args:
            deterministic: when False, one optional space is also accepted just
                inside each bracket of a phoneme sequence.
        """
        super().__init__(name="word", kind="classify", deterministic=deterministic)

        excluded = (pynini.union("$", "€", "₩", "£", "¥", "#", "%") | NEMO_DIGIT).optimize()
        plain_word = pynini.closure(pynini.difference(NEMO_NOT_SPACE, excluded), 1)

        # leave phones of format [HH AH0 L OW1] untouched
        open_bracket = pynini.accep(pynini.escape("["))
        close_bracket = pynini.accep(pynini.escape("]"))
        phone = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        phone_sequence = pynini.closure(phone + pynini.accep(" ")) + phone

        if deterministic:
            phoneme = open_bracket + phone_sequence + close_bracket
        else:
            padding = pynini.closure(pynini.accep(" "), 0, 1)
            phoneme = open_bracket + padding + phone_sequence + padding + close_bracket

        self.graph = plurals._priority_union(convert_space(phoneme), plain_word, NEMO_SIGMA)
        named = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
        self.fst = named.optimize()
Exemple #9
0
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        """Build the "whitelist" classification grammar.

        The grammar is the union of the whitelist table (the default one, or
        `input_file` when given), the filtered measurement-unit table and
        space-separated sequences of `TO_CYRILLIC`. The result is stored in
        `self.final_graph`, and wrapped in a `name: "..."` field for `self.fst`.

        Args:
            input_case: accepted for interface compatibility; see the note in `_load`.
            deterministic: forwarded to the base class constructor.
            input_file: optional path of a whitelist that replaces the default one.
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _load(path):
            """Load a label file and build a string map with lower-cased keys."""
            rows = load_labels(path)
            # NOTE(review): the first column is lower-cased for every value of
            # `input_case`, not only "lower_cased" -- confirm this is intended.
            lowered = [[row[0].lower()] + row[1:] for row in rows]
            return pynini.string_map(lowered)

        # The default table is always loaded, even when it is replaced right after.
        graph = _load(get_abs_path("data/whitelist.tsv"))
        if input_file:
            graph = _load(input_file)

        units = _load(get_abs_path("data/measurements.tsv"))
        # Single-letter units like `м` are not replaced, while single symbols
        # such as `°` and `%` are: a unit key must either have two or more
        # characters or be one character outside RU_ALPHA.
        key_filter = NEMO_CHAR ** (2, ...) | pynini.difference(NEMO_CHAR, RU_ALPHA)
        units = pynini.compose(key_filter, units).optimize()

        graph |= units
        graph |= TO_CYRILLIC + pynini.closure(pynutil.insert(" ") + TO_CYRILLIC)

        self.final_graph = convert_space(graph)
        named = pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")
        self.fst = named.optimize()
Exemple #10
0
    def __init__(self):
        """Build the "whitelist" classification grammar.

        Loads `data/whitelist.tsv`, inverts the mapping (second column to
        first column) and wraps the output in a `name: "..."` field.
        """
        super().__init__(name="whitelist", kind="classify")

        table = pynini.string_file(get_abs_path("data/whitelist.tsv"))
        inverted = convert_space(pynini.invert(table))
        named = pynutil.insert("name: \"") + inverted + pynutil.insert("\"")
        self.fst = named.optimize()
Exemple #11
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """Build the "money" classification grammar.

        The input is a currency symbol followed by an amount. The amount "1"
        takes the singular currency name and the integer "one"; every other
        integer and, by default, every decimal takes the plural name.

        Args:
            cardinal: provides `graph` for the integer part.
            decimal: provides `final_graph_wo_negative` for decimal amounts.
            deterministic: when False, `get_hundreds_graph` readings are also
                accepted for integers, and decimals may use the singular form.
        """
        super().__init__(name="money", kind="classify", deterministic=deterministic)

        decimal_part = decimal.final_graph_wo_negative
        integer_reading = cardinal.graph
        if not deterministic:
            integer_reading = get_hundreds_graph(deterministic) | integer_reading

        currency_table = pynini.string_file(get_abs_path("data/currency/currency.tsv"))
        plural_name = convert_space(currency_table @ SINGULAR_TO_PLURAL)
        singular_name = convert_space(currency_table)

        singular_currency = pynutil.insert("currency: \"") + singular_name + pynutil.insert("\"")
        plural_currency = pynutil.insert("currency: \"") + plural_name + pynutil.insert("\"")

        # Exactly "1": singular currency and the integer part "one".
        one_unit = (
            singular_currency
            + pynutil.insert(" integer_part: \"")
            + pynini.cross("1", "one")
            + pynutil.insert("\"")
        )

        # Any integer other than "1" takes the plural currency.
        integer_amount = (
            plural_currency
            + pynutil.insert(" integer_part: \"")
            + ((NEMO_SIGMA - "1") @ integer_reading)
            + pynutil.insert("\"")
        )

        decimal_amount = plural_currency + insert_space + decimal_part
        if not deterministic:
            decimal_amount |= one_unit + insert_space + decimal_part

        integer_amount |= one_unit

        tagged = self.add_tokens(integer_amount | decimal_amount)
        self.fst = tagged.optimize()
Exemple #12
0
    def __init__(self,
                 tn_whitelist_tagger: GraphFst,
                 deterministic: bool = True):
        """Build the "whitelist" classification grammar.

        The grammar is the inverse of `tn_whitelist_tagger.graph`, wrapped in a
        `name: "..."` field.

        Args:
            tn_whitelist_tagger: provides the `graph` that is inverted (a copy
                is inverted; the tagger's graph is not modified).
            deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        inverted = convert_space(pynini.invert(tn_whitelist_tagger.graph))
        named = pynutil.insert("name: \"") + inverted + pynutil.insert("\"")
        self.fst = named.optimize()
Exemple #13
0
    def __init__(self,
                 itn_cardinal_tagger: GraphFst,
                 itn_decimal_tagger: GraphFst,
                 deterministic: bool = True):
        """Build the "money" classification grammar.

        Accepted shapes:
          * integer + currency, optionally followed by cents, either
            "[und] <number> <minor unit>" or a bare trailing number;
          * decimal + currency;
          * "<number> <minor unit>" alone, emitted with currency "€" and
            integer part "0".

        Args:
            itn_cardinal_tagger: provides `graph_no_exception` for integers.
            itn_decimal_tagger: provides `final_graph_wo_negative` for decimals.
            deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="money", kind="classify", deterministic=deterministic)

        quote = pynutil.insert("\"")

        # "ein"/"eine" spanning the whole input ([BOS]..[EOS]) is rewritten to
        # "eins" before the cardinal grammar is applied.
        one_rewrite = pynini.cdrewrite(
            pynini.cross(pynini.union("ein", "eine"), "eins"), "[BOS]", "[EOS]", NEMO_SIGMA
        )
        number = one_rewrite @ itn_cardinal_tagger.graph_no_exception
        decimal_amount = itn_decimal_tagger.final_graph_wo_negative

        currency = (
            pynutil.insert("currency: \"") + convert_space(pynini.invert(maj_singular)) + quote
        )

        # One digit is padded to two ("4" -> "04"); two digits pass unchanged.
        pad_to_two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
        cents_value = number @ pad_to_two_digits
        minor_unit = pynini.project(min_singular | min_plural, "output")

        # elf euro (und) vier cent, vier cent
        cents_with_unit = (
            pynutil.insert("fractional_part: \"")
            + cents_value
            + delete_space
            + pynutil.delete(minor_unit)
            + quote
        )
        optional_und = pynini.closure(pynutil.delete("und") + delete_space, 0, 1)
        optional_cents_with_unit = pynini.closure(
            delete_space + optional_und + insert_space + cents_with_unit, 0, 1
        )

        # elf euro vierzig, only after integer
        optional_bare_cents = pynini.closure(
            delete_extra_space
            + pynutil.insert("fractional_part: \"")
            + pynutil.add_weight(cents_value, -0.7)
            + quote,
            0,
            1,
        )

        integer_branch = (
            pynutil.insert("integer_part: \"")
            + number
            + quote
            + delete_extra_space
            + currency
            + (optional_cents_with_unit | optional_bare_cents)
        )

        decimal_branch = decimal_amount + delete_extra_space + currency
        # Cents on their own.
        decimal_branch |= pynutil.insert("currency: \"€\" integer_part: \"0\" ") + cents_with_unit

        tagged = self.add_tokens(integer_branch | decimal_branch)
        self.fst = tagged.optimize()
Exemple #14
0
    def __init__(
        self,
        itn_cardinal_tagger: GraphFst,
        tn_date_tagger: GraphFst,
        tn_date_verbalizer: GraphFst,
        deterministic: bool = True,
    ):
        """Build the "date" classification grammar.

        The grammar first applies the inverse of the date verbalizer graph,
        then strips the `day` / `month` / `year` field markers and converts
        each value, producing one of:
          * `[d. ]<month name>[ <year>]`
          * `dd.mm.[<year>]` (day and month zero-padded to two digits)
          * `<year>`
        The result is wrapped in a `name: "..."` field.

        Args:
            itn_cardinal_tagger: provides `graph_no_exception` for day and numeric month.
            tn_date_tagger: provides `month_abbr` and `year`; both are used inverted.
            tn_date_verbalizer: provides `graph`, whose inverse is the first stage.
            deterministic: forwarded to the base class constructor.

        The graphs of the argument objects are left unmodified.
        """
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        # One digit is padded to two ("4" -> "04"); two digits pass unchanged.
        add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
        # Passes any string through, optionally deleting spaces at a small cost.
        optional_delete_space = pynini.closure(NEMO_SIGMA | pynutil.delete(" ", weight=0.0001))
        # pynini.invert() works on a copy. The Fst.invert() method would flip the
        # verbalizer's own graph in place and break it for any other user.
        tagger = pynini.invert(tn_date_verbalizer.graph).optimize()

        delete_day_marker = (
            pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        ) @ itn_cardinal_tagger.graph_no_exception

        month_as_number = pynutil.delete("month: \"") + itn_cardinal_tagger.graph_no_exception + pynutil.delete("\"")
        # Constructive inversion again, so tn_date_tagger.month_abbr stays intact.
        month_as_string = pynutil.delete("month: \"") + pynini.invert(tn_date_tagger.month_abbr) + pynutil.delete("\"")

        # The composition is a fresh FST, so inverting it in place is safe.
        convert_year = (tn_date_tagger.year @ optional_delete_space).invert().optimize()
        delete_year_marker = (
            pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        ) @ convert_year

        # day. month as string (year)
        verbalizer = (
            pynini.closure(delete_day_marker + pynutil.insert(".") + pynini.accep(" "), 0, 1)
            + month_as_string
            + pynini.closure(pynini.accep(" ") + delete_year_marker, 0, 1)
        )

        # day. month as number (year)
        # `@` binds tighter than `+`, so the zero-padding applies to the day and
        # to the month individually.
        verbalizer |= (
            delete_day_marker @ add_leading_zero_to_double_digit
            + pynutil.insert(".")
            + pynutil.delete(" ")
            + month_as_number @ add_leading_zero_to_double_digit
            + pynutil.insert(".")
            + pynini.closure(pynutil.delete(" ") + delete_year_marker, 0, 1)
        )

        # year
        verbalizer |= delete_year_marker

        final_graph = tagger @ verbalizer

        graph = pynutil.insert("name: \"") + convert_space(final_graph) + pynutil.insert("\"")
        self.fst = graph.optimize()
Exemple #15
0
    def __init__(self, deterministic: bool = True):
        """Build the "word" classification grammar.

        A word is a non-empty run of non-space characters that are not
        punctuation (per `PunctuationFst`). Bracketed phoneme sequences such as
        `[HH AH0 L OW1]` take priority and are passed through as one token.

        Args:
            deterministic: when False, strings made of a currency/`#`/`%`
                symbol followed by digits are additionally excluded.
        """
        super().__init__(name="word", kind="classify", deterministic=deterministic)

        punctuation = PunctuationFst().graph
        non_punct_char = pynini.difference(NEMO_NOT_SPACE, punctuation.project("input"))
        word = pynini.closure(non_punct_char, 1)

        if not deterministic:
            symbol_then_digits = pynini.union("$", "€", "₩", "£", "¥", "#", "$", "%") + pynini.closure(
                NEMO_DIGIT, 1
            )
            word = pynini.closure(pynini.difference(word, symbol_then_digits), 1)

        # leave phones of format [HH AH0 L OW1] untouched
        phone = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        phoneme = (
            pynini.accep(pynini.escape("["))
            + pynini.closure(phone + pynini.accep(" "))
            + phone
            + pynini.accep(pynini.escape("]"))
        )

        self.graph = plurals._priority_union(convert_space(phoneme), word, NEMO_SIGMA)
        named = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
        self.fst = named.optimize()
Exemple #16
0
    def __init__(
        self, time: GraphFst, date: GraphFst, cardinal: GraphFst, deterministic: bool = True, lm: bool = False
    ):
        """Build the "range" classification grammar.

        Covers time ranges ("-" -> " to "), "~" -> "approximately", year
        ranges, and chains of numbers joined by "+" or "...". When
        `deterministic` is False or `lm` is True it additionally covers
        "-" / ":" between numbers, "x", "*", "/" and the "No." prefix.
        The optimized graph is stored in `self.graph`; `self.fst` wraps it in a
        `name: "..."` field.

        Args:
            time: used directly as an FST operand (concatenated).
            date: used directly as an FST operand (composed with digit patterns).
            cardinal: provides `graph_with_and` for numbers.
            deterministic: forwarded to the base class; also gates the extra patterns.
            lm: enables the extra patterns even when `deterministic` is True.
        """
        super().__init__(name="range", kind="classify", deterministic=deterministic)

        optional_space = pynini.closure(pynutil.delete(" "), 0, 1)
        number = cardinal.graph_with_and
        approx = pynini.cross("~", "approximately")

        def _spaced(symbol, spoken):
            """`symbol` -> `spoken`, deleting at most one space on either side."""
            return optional_space + pynini.cross(symbol, spoken) + optional_space

        def _chain(symbol, spoken):
            """A number followed by one or more `symbol number` groups."""
            return number + pynini.closure(pynini.cross(symbol, spoken) + number, 1)

        # TIME
        time_span = time + _spaced("-", " to ") + time

        # YEAR
        optional_s = pynini.closure(pynini.accep("s"), 0, 1)
        four_digit_year = (NEMO_DIGIT ** 4 + optional_s) @ date
        two_digit_year = (NEMO_DIGIT ** 2 + optional_s) @ date
        year_span = (
            four_digit_year
            + _spaced("-", " to ")
            + (four_digit_year | two_digit_year | ((NEMO_DIGIT ** 2) @ number))
        )

        # ADDITION
        numeric = _chain("+", " plus ")
        numeric |= _chain(" + ", " plus ")
        numeric |= approx + number
        numeric |= number + (pynini.cross("...", " ... ") | pynini.accep(" ... ")) + number

        if lm or not deterministic:
            # cardinal ----
            dash_span = number + _spaced("-", pynini.union(" to ", " minus ")) + number
            colon_span = number + _spaced(":", " to ") + number
            numeric |= dash_span | colon_span

            # MULTIPLY
            for symbol in (" x ", "x"):
                numeric |= _chain(symbol, pynini.union(" by ", " times "))

            for symbol in ("*", " * "):
                numeric |= _chain(symbol, " times ")

            # supports "No. 12" -> "Number 12"
            number_word = pynini.cross(pynini.union("NO", "No"), "Number") | pynini.cross("no", "number")
            numeric |= number_word + pynini.closure(pynini.union(". ", " "), 0, 1) + number

            for symbol in ("/", " / "):
                numeric |= _chain(symbol, " divided by ")

        combined = time_span | (approx + time)
        combined |= year_span
        combined |= numeric

        self.graph = combined.optimize()
        named = pynutil.insert("name: \"") + convert_space(self.graph).optimize() + pynutil.insert("\"")
        self.fst = named.optimize()
Exemple #17
0
    def __init__(self):
        """Build the "time" classification grammar.

        Accepted shapes, each emitting `hours: "..."` and `minutes: "..."`
        fields (plus `suffix: "..."` where present), wrapped by `add_tokens`:
          * hour + suffix ("las nueve a eme"), with minutes "00";
          * hour + minutes ("las nueve y veinticinco");
          * "(un) cuarto para" + hour, with minutes "45";
          * hour + "menos" + minutes ("las diez menos diez").
        The last three take an optional suffix.
        """
        super().__init__(name="time", kind="classify")

        suffix_graph = pynini.string_file(
            get_abs_path("data/time/time_suffix.tsv"))
        # NOTE(review): presumably maps a spoken hour to the preceding hour for
        # the "minutes to" patterns below -- confirm against the data file.
        time_to_graph = pynini.string_file(
            get_abs_path("data/time/time_to.tsv"))

        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_twenties = pynini.string_file(
            get_abs_path("data/numbers/twenties.tsv"))

        # Numbers built from digits, teens, twenties, round tens ("0" appended)
        # and "<tens> y <digit>" compounds.
        graph_1_to_100 = pynini.union(
            graph_digit,
            graph_twenties,
            graph_teen,
            (graph_ties + pynutil.insert("0")),
            (graph_ties + pynutil.delete(" y ") + graph_digit),
        )

        # note that graph_hour will start from 2 hours
        # "1 o'clock" will be treated differently because it
        # is singular
        digits_2_to_23 = [str(digits) for digits in range(2, 24)]
        digits_1_to_59 = [str(digits) for digits in range(1, 60)]

        graph_1oclock = pynini.cross("la una", "la 1")
        # `@` binds tighter than `+`: the number is restricted to 2..23 first,
        # then prefixed with the literal "las ".
        graph_hour = pynini.cross(
            "las ", "las ") + graph_1_to_100 @ pynini.union(*digits_2_to_23)
        graph_minute = graph_1_to_100 @ pynini.union(*digits_1_to_59)
        graph_minute_verbose = pynini.cross("media", "30") | pynini.cross(
            "cuarto", "15")

        final_graph_hour = pynutil.insert("hours: \"") + (
            graph_1oclock | graph_hour) + pynutil.insert("\"")

        # Minutes may be introduced by an optional "y" or "con".
        final_graph_minute = (pynutil.insert("minutes: \"") + pynini.closure(
            (pynutil.delete("y") | pynutil.delete("con")) + delete_space, 0,
            1) + (graph_minute | graph_minute_verbose) + pynutil.insert("\""))

        final_suffix = pynutil.insert("suffix: \"") + convert_space(
            suffix_graph) + pynutil.insert("\"")
        final_suffix_optional = pynini.closure(
            delete_space + insert_space + final_suffix, 0, 1)

        # las nueve a eme (only convert on-the-hour times if they are followed by a suffix)
        graph_hsuffix = (final_graph_hour + delete_extra_space +
                         pynutil.insert("minutes: \"00\"") + insert_space +
                         final_suffix)

        # las nueve y veinticinco
        graph_hm = final_graph_hour + delete_extra_space + final_graph_minute

        # un cuarto para las cinco
        graph_mh = (pynutil.insert("minutes: \"") + pynini.union(
            pynini.cross("un cuarto para", "45"),
            pynini.cross("cuarto para", "45"),
        ) + pynutil.insert("\"") + delete_extra_space +
                    pynutil.insert("hours: \"") + time_to_graph +
                    pynutil.insert("\""))

        # las diez menos diez
        # "menos N" means N minutes before the hour, i.e. minute 60 - N.
        graph_time_to = (pynutil.insert("hours: \"") + time_to_graph +
                         pynutil.insert("\"") + delete_extra_space +
                         pynutil.insert("minutes: \"") + delete_space +
                         pynutil.delete("menos") + delete_space + pynini.union(
                             pynini.cross("cinco", "55"),
                             pynini.cross("diez", "50"),
                             pynini.cross("cuarto", "45"),
                             pynini.cross("veinte", "40"),
                             pynini.cross("veinticinco", "35"),
                         ) + pynutil.insert("\""))
        final_graph = pynini.union(
            (graph_hm | graph_mh | graph_time_to) + final_suffix_optional,
            graph_hsuffix).optimize()

        final_graph = self.add_tokens(final_graph)

        self.fst = final_graph.optimize()
Exemple #18
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """Build the "money" classification grammar.

        Accepted shapes, wrapped by `add_tokens`:
          * integer + currency, optionally followed by cents, either
            "[con|y] <number> centavos/céntimos" or a bare "[con|y] <number>";
            "un"/"una" take the singular currency table, other integers the
            plural one;
          * decimal + plural currency;
          * cents alone, emitted with currency "$" and integer part "0".

        Args:
            cardinal: provides `graph_no_exception` for integer and cent values.
            decimal: provides `final_graph_wo_negative` for decimal amounts.
        """
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency

        cardinal_graph = cardinal.graph_no_exception
        graph_decimal_final = decimal.final_graph_wo_negative

        # Both currency tables are inverted before use.
        unit_singular = pynini.string_file(
            get_abs_path("data/currency_singular.tsv"))
        unit_singular = pynini.invert(unit_singular)
        unit_plural = pynini.string_file(
            get_abs_path("data/currency_plural.tsv"))
        unit_plural = pynini.invert(unit_plural)

        graph_unit_singular = pynutil.insert("currency: \"") + convert_space(
            unit_singular) + pynutil.insert("\"")
        graph_unit_plural = pynutil.insert("currency: \"") + convert_space(
            unit_plural) + pynutil.insert("\"")

        # One digit is padded to two ("4" -> "04"); two digits pass unchanged.
        add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynutil.insert("0") + NEMO_DIGIT)
        # twelve dollars (and) fifty cents, zero cents
        # Two alternatives: any number except "un" followed by the plural cent
        # word, or "un" (-> "01") followed by the singular cent word.
        cents_standalone = (
            pynutil.insert("morphosyntactic_features: \",\""
                           )  # always use a comma in the decimal
            + insert_space + pynutil.insert("fractional_part: \"") +
            pynini.union(
                pynutil.add_weight(
                    ((NEMO_SIGMA - "un") @ cardinal_graph),
                    -0.7) @ add_leading_zero_to_double_digit + delete_space +
                pynutil.delete(pynini.union("centavos", "céntimos")),
                pynini.cross("un", "01") + delete_space +
                pynutil.delete(pynini.union("centavo", "céntimo")),
            ) + pynutil.insert("\""))

        # Optional cents with unit, introduced by an optional "con" or "y".
        optional_cents_standalone = pynini.closure(
            delete_space + pynini.closure(
                (pynutil.delete("con") | pynutil.delete('y')) + delete_space,
                0, 1) + insert_space + cents_standalone,
            0,
            1,
        )
        # twelve dollars fifty, only after integer
        # setenta y cinco dólares con sesenta y tres~$75,63
        optional_cents_suffix = pynini.closure(
            delete_extra_space +
            pynutil.insert("morphosyntactic_features: \",\""
                           )  # always use a comma in the decimal
            + insert_space + pynutil.insert("fractional_part: \"") +
            pynini.closure(
                (pynutil.delete("con") | pynutil.delete('y')) + delete_space,
                0, 1) + pynutil.add_weight(
                    cardinal_graph @ add_leading_zero_to_double_digit, -0.7) +
            pynutil.insert("\""),
            0,
            1,
        )

        # Any integer except "un"/"una" takes the plural currency.
        graph_integer = (pynutil.insert("integer_part: \"") +
                         ((NEMO_SIGMA - "un" - "una") @ cardinal_graph) +
                         pynutil.insert("\"") + delete_extra_space +
                         graph_unit_plural +
                         (optional_cents_standalone | optional_cents_suffix))
        # "un"/"una" become "1" and take the singular currency.
        graph_integer |= (
            pynutil.insert("integer_part: \"") +
            (pynini.cross("un", "1") | pynini.cross("una", "1")) +
            pynutil.insert("\"") + delete_extra_space + graph_unit_singular +
            (optional_cents_standalone | optional_cents_suffix))
        graph_decimal = graph_decimal_final + delete_extra_space + graph_unit_plural
        # Cents on their own: currency "$" and integer part "0" are inserted.
        graph_decimal |= pynutil.insert(
            "currency: \"$\" integer_part: \"0\" ") + cents_standalone
        final_graph = graph_integer | graph_decimal
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemple #19
0
    def __init__(self, deterministic: bool = True):
        """Build the classifier FST for written clock times ending in "Uhr".

        Three written shapes are accepted, each optionally followed by a space
        and a time zone read from ``data/time/time_zone.tsv``:

        * ``HH:MM Uhr``    -> ``hours`` and, unless the minutes are ``00``,
          ``minutes``
        * ``HH:MM:SS Uhr`` -> ``hours``, ``minutes``, ``seconds`` and
          ``preserve_order: true``
        * ``H Uhr``        -> ``hours`` only

        In the colon-separated shapes the hour must be written with exactly
        two digits; a leading zero is dropped. Hours range over 0-24, minutes
        and seconds over 0-59.

        Args:
            deterministic: forwarded unchanged to the base-class constructor.
        """
        super().__init__(name="time",
                         kind="classify",
                         deterministic=deterministic)

        # Suffix removed from the input. `+` binds tighter than `|`, so the
        # previous spelling `delete(" ") + delete("Uhr") | delete("uhr")`
        # parsed as `(" " + "Uhr") | "uhr"`: the lower-case form was accepted
        # only when glued to the digits ("2uhr") and "2 uhr" was rejected.
        # Both capitalisations are now accepted after the space; the glued
        # lower-case form is kept so previously accepted input still matches.
        uhr_after_space = pynutil.delete(" ") + pynutil.delete(
            pynini.union("Uhr", "uhr"))
        final_suffix = uhr_after_space | pynutil.delete("uhr")
        time_zone_graph = pynini.string_file(
            get_abs_path("data/time/time_zone.tsv"))

        labels_hour = [str(x) for x in range(0, 25)]
        labels_minute_single = [str(x) for x in range(1, 10)]
        labels_minute_double = [str(x) for x in range(10, 60)]

        # Accepts exactly two digits and drops a leading "0" ("02" -> "2").
        delete_leading_zero_to_double_digit = (pynutil.delete("0") |
                                               (NEMO_DIGIT - "0")) + NEMO_DIGIT

        graph_hour = pynini.union(*labels_hour)

        graph_minute_single = pynini.union(*labels_minute_single)
        graph_minute_double = pynini.union(*labels_minute_double)

        # Hour written without a leading zero (used by the "H Uhr" shape).
        final_graph_hour_only = pynutil.insert(
            "hours: \"") + graph_hour + pynutil.insert("\"")
        # Two-digit hour (used by the colon-separated shapes).
        final_graph_hour = (pynutil.insert("hours: \"") +
                            delete_leading_zero_to_double_digit @ graph_hour +
                            pynutil.insert("\""))
        # 01-09 lose their leading zero, 10-59 pass through; "00" is handled
        # separately by each shape below.
        final_graph_minute = (
            pynutil.insert("minutes: \"") +
            (pynutil.delete("0") + graph_minute_single | graph_minute_double) +
            pynutil.insert("\""))
        # Seconds share the minute value range.
        final_graph_second = (
            pynutil.insert("seconds: \"") +
            (pynutil.delete("0") + graph_minute_single | graph_minute_double) +
            pynutil.insert("\""))
        final_time_zone_optional = pynini.closure(
            pynini.accep(" ") + pynutil.insert("zone: \"") +
            convert_space(time_zone_graph) + pynutil.insert("\""),
            0,
            1,
        )

        # 02:30 Uhr -- ":00" minutes are dropped entirely.
        graph_hm = (final_graph_hour + pynutil.delete(":") +
                    (pynutil.delete("00") |
                     (insert_space + final_graph_minute)) + final_suffix +
                    final_time_zone_optional)

        # 10:30:05 Uhr -- "00" minutes/seconds are kept as an explicit "0".
        graph_hms = (final_graph_hour + pynutil.delete(":") +
                     (pynini.cross("00", " minutes: \"0\"") |
                      (insert_space + final_graph_minute)) +
                     pynutil.delete(":") +
                     (pynini.cross("00", " seconds: \"0\"") |
                      (insert_space + final_graph_second)) + final_suffix +
                     final_time_zone_optional +
                     pynutil.insert(" preserve_order: true"))

        # 2 Uhr est
        graph_h = final_graph_hour_only + final_suffix + final_time_zone_optional
        final_graph = (graph_hm | graph_h | graph_hms).optimize()
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemple #20
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """Build the classifier FST for money amounts written with a currency.

        The input starts with an entry of ``data/currency/currency.tsv``
        followed by an integer or decimal number. In deterministic mode the
        result is the union of an integer graph and a decimal graph; in
        non-deterministic mode that union is replaced by per-currency graphs
        built in the loop below.

        Args:
            cardinal: provides ``graph`` for reading the integer part.
            decimal: provides ``final_graph_wo_negative`` and, in
                non-deterministic mode, ``graph_integer`` / ``graph_fractional``.
            deterministic: selects between the two constructions described
                above; also forwarded to the base-class constructor.
        """
        super().__init__(name="money",
                         kind="classify",
                         deterministic=deterministic)
        number_graph = cardinal.graph
        decimal_number = decimal.final_graph_wo_negative

        names_singular = pynini.string_file(
            get_abs_path("data/currency/currency.tsv"))
        # The plural mapping is derived before spaces are converted.
        names_plural = convert_space(names_singular @ SINGULAR_TO_PLURAL)
        names_singular = convert_space(names_singular)

        currency_singular = (pynutil.insert("currency: \"") + names_singular +
                             pynutil.insert("\""))
        currency_plural = (pynutil.insert("currency: \"") + names_plural +
                           pynutil.insert("\""))

        # An amount of exactly "1" takes the singular currency name.
        exactly_one = (currency_singular +
                       pynutil.insert(" integer_part: \"") +
                       pynini.cross("1", "one") + pynutil.insert("\""))

        decimal_amount = currency_plural + insert_space + decimal_number

        if deterministic:
            integer_reader = number_graph
        else:
            integer_reader = get_hundreds_graph(deterministic) | number_graph
            decimal_amount |= exactly_one + insert_space + decimal_number

        # Every integer other than "1" takes the plural currency name.
        integer_amount = (currency_plural +
                          pynutil.insert(" integer_part: \"") +
                          ((NEMO_SIGMA - "1") @ integer_reader) +
                          pynutil.insert("\""))
        integer_amount |= exactly_one

        final_graph = integer_amount | decimal_amount

        if not deterministic:

            def plural_currency_field(symbol, prefix):
                # prefix + (symbol -> plural name) + closing quote
                return prefix + pynini.compose(symbol, names_plural) + "\""

            zero_graph = pynini.cross("0", "") | pynini.accep("0")
            # add minor currency part only when there are two digits after the point
            # .01 -> {zero one cent, one cent}, .05 -> {oh five, five cents}
            kept_fraction = pynini.accep(".") + (
                NEMO_DIGIT**(2) | zero_graph + (NEMO_DIGIT - "0"))
            zeros_only_fraction = pynutil.delete(".") + pynini.cross(
                pynini.closure("0", 1), "")
            two_digits_fractional_part = (
                NEMO_SIGMA + pynini.closure(NEMO_DIGIT) +
                (kept_fraction | zeros_only_fraction))

            integer_graph = None
            decimal_graph_with_minor = None
            decimal_graph_default = None

            for symbol, name in load_labels(
                    get_abs_path("data/currency/currency.tsv")):
                strip_symbol = pynutil.delete(symbol)
                currency_tail = pynutil.insert(" currency: \"" + symbol + "\"")
                keep_order = pynutil.insert(" preserve_order: True")
                whole_part = decimal.graph_integer + currency_tail + keep_order

                # "$4" -> 'integer_part: "four" currency: "$" preserve_order: True' -> four dollars
                symbol_then_integer = strip_symbol + whole_part
                # remove fractional part if it contains only zeros
                # "$4.00" -> 'integer_part: "four" currency: "$" preserve_order: True' -> four dollars
                integer_for_symbol = symbol_then_integer | pynini.compose(
                    two_digits_fractional_part, symbol_then_integer)

                minor_for_symbol = pynini.compose(
                    two_digits_fractional_part,
                    strip_symbol + pynini.closure(whole_part, 0, 1) +
                    pynini.cross(".", " ") + decimal.graph_fractional +
                    currency_tail)

                # "$.5" -> 'fractional_part: "five" currency: "dollars"' -> point five dollars
                fraction_only = (
                    pynutil.delete(
                        plural_currency_field(symbol, "currency: \"")) +
                    delete_space + pynini.accep("fractional_part") +
                    NEMO_SIGMA + pynutil.insert(
                        plural_currency_field(symbol, " currency: \"")))

                # "$4.5" -> 'integer_part: "four" fractional_part: "five" currency: "dollars"' -> "four point five dollars"
                integer_and_fraction = (
                    pynutil.delete("currency: \"" + name +
                                   pynini.closure(NEMO_NOT_QUOTE) + "\"") +
                    delete_space + pynini.accep("integer_part") + NEMO_SIGMA +
                    pynini.accep("fractional_part") + NEMO_SIGMA +
                    pynutil.insert(
                        plural_currency_field(symbol, " currency: \"")))

                # "£4 billion" -> 'integer_part: "four" quantity: "billion" currency: "pounds"' -> "four billion dollars"
                without_fraction = (
                    pynutil.delete("currency: \"") + pynutil.delete(
                        rewrite.rewrite_lattice(
                            symbol, pynini.compose(symbol, names_plural)) +
                        "\" ") +
                    pynini.difference(NEMO_SIGMA, "fractional_part") +
                    pynutil.insert(
                        plural_currency_field(symbol, " currency: \"")))

                # The three rewrites above operate on the output of the
                # decimal graph, hence the composition.
                default_for_symbol = pynini.compose(
                    decimal_amount,
                    fraction_only | integer_and_fraction | without_fraction)

                # Fold this currency into the running unions.
                if integer_graph is None:
                    integer_graph = integer_for_symbol
                else:
                    integer_graph = pynini.union(integer_graph,
                                                 integer_for_symbol)

                if decimal_graph_with_minor is None:
                    decimal_graph_with_minor = minor_for_symbol
                else:
                    decimal_graph_with_minor = pynini.union(
                        decimal_graph_with_minor, minor_for_symbol)

                if decimal_graph_default is None:
                    decimal_graph_default = default_for_symbol
                else:
                    decimal_graph_default = pynini.union(
                        decimal_graph_default, default_for_symbol)

            final_graph = decimal_graph_with_minor | decimal_graph_default | integer_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemple #21
0
    def __init__(self,
                 cardinal: GraphFst,
                 ordinal: GraphFst,
                 deterministic: bool = True,
                 lm: bool = False):
        """Build the classifier FST for serial-like tokens.

        A serial is a mix of letter runs, digit groups and whitelisted symbols,
        optionally separated by "-", "/" or a space. Inputs that the ordinal
        graph accepts are excluded. The result is stored in ``self.graph``;
        ``self.fst`` wraps it as ``name: "..."``.

        Example from the original author's note: ``c325b`` is read as
        ``c three two five b``.
        NOTE(review): that note described the output as a ``cardinal`` token,
        but this constructor emits a ``name`` field -- confirm against callers.

        Args:
            cardinal: provides ``graph``, ``single_digits_graph`` and, for the
                extra non-deterministic readings,
                ``graph_hundred_component_at_least_one_none_zero_digit``.
            ordinal: the input side of its ``graph`` is removed from the
                serial language.
            deterministic: forwarded to the base-class constructor; when False
                (and ``lm`` is False) additional digit readings are added.
            lm: when True the additional non-deterministic digit readings are
                skipped.
        """
        super().__init__(name="integer",
                         kind="classify",
                         deterministic=deterministic)
        # Six or more digits are read digit by digit, one to five digits as a
        # cardinal number.
        num_graph = pynini.compose(NEMO_DIGIT**(6, ...),
                                   cardinal.single_digits_graph).optimize()
        num_graph |= pynini.compose(NEMO_DIGIT**(1, 5),
                                    cardinal.graph).optimize()
        # to handle numbers starting with zero
        num_graph |= pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT),
            cardinal.single_digits_graph).optimize()
        # TODO: "#" doesn't work from the file
        symbols_graph = pynini.string_file(
            get_abs_path("data/whitelist/symbol.tsv")).optimize(
            ) | pynini.cross("#", "hash")
        num_graph |= symbols_graph

        if not self.deterministic and not lm:
            num_graph |= cardinal.single_digits_graph
            # also allow double digits to be pronounced as integer in serial number
            num_graph |= pynutil.add_weight(
                NEMO_DIGIT**2 @ cardinal.
                graph_hundred_component_at_least_one_none_zero_digit,
                weight=0.0001)

        # add space between letter and digit/symbol
        symbols = [
            x[0]
            for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))
        ]
        symbols = pynini.union(*symbols)
        digit_symbol = NEMO_DIGIT | symbols

        # First rewrite: space after a letter/symbol that precedes a
        # digit/symbol; second rewrite: space after a digit/symbol that
        # precedes a letter/symbol.
        graph_with_space = pynini.compose(
            pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA | symbols,
                             digit_symbol, NEMO_SIGMA),
            pynini.cdrewrite(pynutil.insert(" "), digit_symbol,
                             NEMO_ALPHA | symbols, NEMO_SIGMA),
        )

        # serial graph with delimiter
        delimiter = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
        alphas = pynini.closure(NEMO_ALPHA, 1)
        letter_num = alphas + delimiter + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
        next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
        next_alpha_or_num |= pynini.closure(
            delimiter + num_graph +
            plurals._priority_union(pynini.accep(" "), pynutil.insert(" "),
                                    NEMO_SIGMA).optimize() + alphas)

        serial_graph = letter_num + next_alpha_or_num
        serial_graph |= num_letter + next_alpha_or_num
        # numbers only with 2+ delimiters
        serial_graph |= (num_graph + delimiter + num_graph + delimiter +
                         num_graph + pynini.closure(delimiter + num_graph))
        # 2+ symbols
        serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA,
                                       num_graph + delimiter + num_graph)

        # exclude ordinal numbers from serial options
        serial_graph = pynini.compose(
            pynini.difference(NEMO_SIGMA,
                              pynini.project(ordinal.graph, "input")),
            serial_graph).optimize()

        serial_graph = pynutil.add_weight(serial_graph, 0.0001)
        # Any non-space run followed by "^2" / "^3" is read as squared / cubed.
        serial_graph |= (pynini.closure(NEMO_NOT_SPACE, 1) +
                         (pynini.cross("^2", " squared")
                          | pynini.cross("^3", " cubed")).optimize())

        # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
        serial_graph = (
            pynini.closure((serial_graph | num_graph | alphas) + delimiter) +
            serial_graph + pynini.closure(delimiter +
                                          (serial_graph | num_graph | alphas)))

        # Also accept input without delimiters by inserting the spaces first.
        serial_graph |= pynini.compose(graph_with_space,
                                       serial_graph.optimize()).optimize()
        # Restrict the input to at least two non-space characters.
        serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2),
                                      serial_graph).optimize()

        self.graph = serial_graph.optimize()
        graph = pynutil.insert("name: \"") + convert_space(
            self.graph).optimize() + pynutil.insert("\"")
        self.fst = graph.optimize()
Exemple #22
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 fraction: GraphFst,
                 deterministic: bool = True):
        """Build the classifier FST for numbers combined with a unit.

        Besides "<number> <unit>" (cardinal, decimal or fraction), the graph
        covers number-dash-word, word-dash-number, a decimal followed by
        "x"/"X", addresses (via ``self.get_address_graph``) and simple
        "a <op> b = c" equations.

        Args:
            cardinal: provides ``graph`` and, in non-deterministic mode,
                ``range_graph``; also handed to ``get_address_graph``.
            decimal: provides ``final_graph_wo_negative``.
            fraction: provides ``graph``.
            deterministic: forwarded to the base-class constructor; when False
                the cardinal graph is widened with ``cardinal.range_graph``.
        """
        super().__init__(name="measure",
                         kind="classify",
                         deterministic=deterministic)
        cardinal_graph = cardinal.graph

        if not deterministic:
            cardinal_graph |= cardinal.range_graph

        # Unit vocabulary; the second alternative lower-cases the input
        # before looking it up.
        unit_names = pynini.string_file(get_abs_path("data/measurements.tsv"))
        lowercased_input = pynini.closure(TO_LOWER, 1) + pynini.closure(
            NEMO_ALPHA)
        unit_names |= pynini.compose(lowercased_input, unit_names)

        unit_names_plural = convert_space(unit_names @ SINGULAR_TO_PLURAL)
        unit_names = convert_space(unit_names)

        sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        # "/unit" -> "per unit"
        per_unit = (pynini.cross("/", "per") + delete_space +
                    pynutil.insert(NEMO_NON_BREAKING_SPACE) + unit_names)
        maybe_per_unit = pynini.closure(
            delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + per_unit,
            0,
            1,
        )

        plural_units_field = (pynutil.insert("units: \"") +
                              ((unit_names_plural + maybe_per_unit)
                               | per_unit) + pynutil.insert("\""))
        singular_units_field = (pynutil.insert("units: \"") +
                                ((unit_names + maybe_per_unit) | per_unit) +
                                pynutil.insert("\""))

        def cardinal_with_units(number, units_field):
            # cardinal { [negative] integer: "<number>" } <units field>
            return (pynutil.insert("cardinal { ") + sign +
                    pynutil.insert("integer: \"") + number + delete_space +
                    pynutil.insert("\"") + pynutil.insert(" } ") +
                    units_field)

        decimal_with_units = (pynutil.insert("decimal { ") + sign +
                              decimal.final_graph_wo_negative + delete_space +
                              pynutil.insert(" } ") + plural_units_field)

        # "1" takes the singular unit, every other number the plural one.
        cardinal_units = cardinal_with_units(
            (NEMO_SIGMA - "1") @ cardinal_graph, plural_units_field)
        cardinal_units |= cardinal_with_units(pynini.cross("1", "one"),
                                              singular_units_field)

        word = pynini.closure(NEMO_ALPHA, 1)

        number_dash_word = (pynutil.insert("cardinal { integer: \"") +
                            cardinal_graph + pynini.accep('-') +
                            pynutil.insert("\" } units: \"") + word +
                            pynutil.insert("\""))

        word_dash_number = (pynutil.insert("units: \"") + word +
                            pynini.accep('-') + pynutil.insert("\"") +
                            pynutil.insert(" cardinal { integer: \"") +
                            cardinal_graph +
                            pynutil.insert("\" } preserve_order: true"))

        decimal_dash_word = (pynutil.insert("decimal { ") +
                             decimal.final_graph_wo_negative +
                             pynini.cross('-', '') +
                             pynutil.insert(" } units: \"") + word +
                             pynutil.insert("\""))

        decimal_then_x = (pynutil.insert("decimal { ") +
                          decimal.final_graph_wo_negative +
                          pynutil.insert(" } units: \"") +
                          pynini.cross(pynini.union('x', "X"), 'x') +
                          pynutil.insert("\""))

        word_dash_decimal = (pynutil.insert("units: \"") + word +
                             pynini.accep('-') + pynutil.insert("\"") +
                             pynutil.insert(" decimal { ") +
                             decimal.final_graph_wo_negative +
                             pynutil.insert(" } preserve_order: true"))

        fraction_with_units = (pynutil.insert("fraction { ") +
                               fraction.graph + delete_space +
                               pynutil.insert(" } ") + plural_units_field)

        address_field = (
            pynutil.insert("units: \"address\" cardinal { integer: \"") +
            self.get_address_graph(cardinal) +
            pynutil.insert("\" } preserve_order: true"))

        math_operations = pynini.string_file(
            get_abs_path("data/math_operations.tsv"))
        # A space between operands is kept if present and inserted otherwise.
        gap = pynini.accep(" ") | pynutil.insert(" ")

        equation = (cardinal_graph + gap + math_operations + gap +
                    cardinal_graph + gap + pynini.cross("=", "equals") + gap +
                    cardinal_graph)
        equation_field = (
            pynutil.insert("units: \"math\" cardinal { integer: \"") +
            equation + pynutil.insert("\" } preserve_order: true"))

        final_graph = (decimal_with_units
                       | cardinal_units
                       | number_dash_word
                       | word_dash_number
                       | decimal_dash_word
                       | decimal_then_x
                       | word_dash_decimal
                       | fraction_with_units
                       | address_field
                       | equation_field)
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemple #23
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
        """Build the classifier FST for numbers combined with a unit.

        Covers "<number> <unit>" for cardinals, decimals and fractions (with an
        optional "/unit" denominator read as "pro"), number-dash-word and
        word-dash-number compounds, and a number followed by "x"/"X". Every
        alternative gets ``preserve_order: true`` appended.

        Args:
            cardinal: provides ``graph`` and ``fst``.
            decimal: provides ``fst`` and ``final_graph_wo_negative``.
            fraction: provides ``fst``.
            deterministic: forwarded unchanged to the base-class constructor.
        """
        super().__init__(name="measure", kind="classify", deterministic=deterministic)
        number_words = cardinal.graph

        singular_names = convert_space(unit_singular)
        # Plural names: the singular output rewritten at end-of-string.
        plural_names = singular_names @ pynini.cdrewrite(convert_space(suppletive), "", "[EOS]", NEMO_SIGMA)
        minus_sign = pynini.closure("-", 0, 1)

        # "/unit" -> "pro unit"
        per_unit = pynini.cross("/", "pro") + pynutil.insert(NEMO_NON_BREAKING_SPACE) + singular_names
        maybe_per_unit = pynini.closure(pynutil.insert(NEMO_NON_BREAKING_SPACE) + per_unit, 0, 1,)

        unit_plural = (
            pynutil.insert("units: \"") + ((plural_names + maybe_per_unit) | per_unit) + pynutil.insert("\"")
        )
        unit_singular_graph = (
            pynutil.insert("units: \"") + ((singular_names + maybe_per_unit) | per_unit) + pynutil.insert("\"")
        )

        def followed_by(number, units_field):
            # <number> + inserted space + optional deleted input space + units
            return number + insert_space + pynini.closure(pynutil.delete(" "), 0, 1) + units_field

        with_decimal = followed_by(decimal.fst, unit_plural)

        # Every digit string except "1" takes the plural unit ...
        not_one = (minus_sign + (pynini.closure(NEMO_DIGIT) - "1")) @ cardinal.fst
        with_cardinal = followed_by(not_one, unit_plural)

        # ... while "1" takes the singular unit, with "eins" rewritten to "ein".
        exactly_one = (
            (minus_sign + pynini.accep("1"))
            @ cardinal.fst
            @ pynini.cdrewrite(pynini.cross("eins", "ein"), "", "", NEMO_SIGMA)
        )
        with_cardinal |= followed_by(exactly_one, unit_singular_graph)

        with_fraction = followed_by(fraction.fst, unit_plural)

        number_dash_word = (
            pynutil.insert("cardinal { integer: \"")
            + number_words
            + pynutil.delete('-')
            + pynutil.insert("\" } units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynutil.insert("\"")
        )

        word_dash_number = (
            pynutil.insert("units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynutil.delete('-')
            + pynutil.insert("\"")
            + pynutil.insert(" cardinal { integer: \"")
            + number_words
            + pynutil.insert("\" }")
        )

        decimal_dash_word = (
            pynutil.insert("decimal { ")
            + decimal.final_graph_wo_negative
            + pynutil.delete('-')
            + pynutil.insert(" } units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynutil.insert("\"")
        )

        decimal_then_x = (
            pynutil.insert("decimal { ")
            + decimal.final_graph_wo_negative
            + pynutil.insert(" } units: \"")
            + pynini.union('x', 'X')
            + pynutil.insert("\"")
        )

        number_then_x = (
            pynutil.insert("cardinal { integer: \"")
            + number_words
            + pynutil.insert("\" } units: \"")
            + pynini.union('x', 'X')
            + pynutil.insert("\"")
        )

        word_dash_decimal = (
            pynutil.insert("units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynutil.delete('-')
            + pynutil.insert("\"")
            + pynutil.insert(" decimal { ")
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
        )

        final_graph = (
            with_decimal
            | with_cardinal
            | number_dash_word
            | word_dash_number
            | decimal_dash_word
            | decimal_then_x
            | word_dash_decimal
            | with_fraction
            | number_then_x
        )
        # One shared suffix for every alternative.
        final_graph = final_graph + pynutil.insert(" preserve_order: true")

        self.fst = self.add_tokens(final_graph).optimize()
Exemple #24
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
        """Build the classifier FST for numbers combined with a unit.

        Covers "<number> <unit>" for cardinals, decimals and fractions (with an
        optional "/unit" denominator read as "por"), number-dash-word and
        word-dash-number compounds, and a number followed by "x"/"X". Every
        alternative ends with exactly one ``preserve_order: true``.

        Args:
            cardinal: provides ``graph`` and ``fst``.
            decimal: provides ``fst`` and ``final_graph_wo_negative``.
            fraction: provides ``fst``.
            deterministic: forwarded unchanged to the base-class constructor.
        """
        super().__init__(name="measure", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph

        unit_singular = unit
        # Plural unit names: singular output composed with either plural rule set.
        unit_plural = unit_singular @ (unit_plural_fem | unit_plural_masc)

        graph_unit_singular = convert_space(unit_singular)
        graph_unit_plural = convert_space(unit_plural)

        optional_graph_negative = pynini.closure("-", 0, 1)

        # "/unit" -> "por unit"
        graph_unit_denominator = (
            pynini.cross("/", "por") + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_singular
        )

        optional_unit_denominator = pynini.closure(
            pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1,
        )

        unit_plural = (
            pynutil.insert("units: \"")
            + ((graph_unit_plural + optional_unit_denominator) | graph_unit_denominator)
            + pynutil.insert("\"")
        )

        unit_singular_graph = (
            pynutil.insert("units: \"")
            + ((graph_unit_singular + optional_unit_denominator) | graph_unit_denominator)
            + pynutil.insert("\"")
        )

        # NOTE(review): this branch passes an input space through (NEMO_SPACE)
        # while the cardinal and fraction branches delete it -- confirm that
        # the difference is intended.
        subgraph_decimal = decimal.fst + insert_space + pynini.closure(NEMO_SPACE, 0, 1) + unit_plural

        # Everything except "1" takes the plural unit ...
        subgraph_cardinal = (
            (optional_graph_negative + (NEMO_SIGMA - "1")) @ cardinal.fst
            + insert_space
            + pynini.closure(delete_space, 0, 1)
            + unit_plural
        )

        # ... and "1" takes the singular unit.
        subgraph_cardinal |= (
            (optional_graph_negative + pynini.accep("1")) @ cardinal.fst
            + insert_space
            + pynini.closure(delete_space, 0, 1)
            + unit_singular_graph
        )

        subgraph_fraction = fraction.fst + insert_space + pynini.closure(delete_space, 0, 1) + unit_singular_graph

        decimal_times = (
            pynutil.insert("decimal { ")
            + decimal.final_graph_wo_negative
            + pynutil.insert(" } units: \"")
            + pynini.union('x', 'X')
            + pynutil.insert("\"")
        )

        cardinal_times = (
            pynutil.insert("cardinal { integer: \"")
            + strip_cardinal_apocope(cardinal_graph)
            + pynutil.insert("\" } units: \"")
            + pynini.union('x', 'X')
            + pynutil.insert("\"")
        )

        cardinal_dash_alpha = (
            pynutil.insert("cardinal { integer: \"")
            + strip_cardinal_apocope(cardinal_graph)
            + pynutil.delete('-')
            + pynutil.insert("\" } units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynutil.insert("\"")
        )

        decimal_dash_alpha = (
            pynutil.insert("decimal { ")
            + decimal.final_graph_wo_negative
            + pynutil.delete('-')
            + pynutil.insert(" } units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynutil.insert("\"")
        )

        # The two word-dash-number graphs used to insert their own
        # " preserve_order: true"; together with the shared suffix appended to
        # `final_graph` below, that emitted the field twice for these branches.
        # Only the shared suffix is kept.
        alpha_dash_cardinal = (
            pynutil.insert("units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynutil.delete('-')
            + pynutil.insert("\"")
            + pynutil.insert(" cardinal { integer: \"")
            + cardinal_graph
            + pynutil.insert("\" }")
        )

        alpha_dash_decimal = (
            pynutil.insert("units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynutil.delete('-')
            + pynutil.insert("\"")
            + pynutil.insert(" decimal { ")
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
        )

        final_graph = (
            subgraph_decimal
            | subgraph_cardinal
            | cardinal_dash_alpha
            | alpha_dash_cardinal
            | decimal_dash_alpha
            | subgraph_fraction
            | decimal_times
            | cardinal_times
            | alpha_dash_decimal
        )
        # Single shared suffix for every alternative.
        final_graph += pynutil.insert(" preserve_order: true")
        final_graph = self.add_tokens(final_graph)

        self.fst = final_graph.optimize()
Exemple #25
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """Build the classifier FST that turns spelled-out money into fields.

        Emitted fields: ``integer_part``, ``fractional_part`` and ``currency``
        (a quantity, if any, comes from the decimal graph). Handled shapes:

        * integer + currency, optionally followed by cents
          ("... (and) fifty cents" or a bare "... fifty")
        * decimal + currency
        * cents on their own, reported with currency "$" and integer part "0"

        Args:
            cardinal: provides ``graph_no_exception``.
            decimal: provides ``final_graph_wo_negative``.
        """
        super().__init__(name="money", kind="classify")

        number_graph = cardinal.graph_no_exception
        decimal_number = decimal.final_graph_wo_negative

        currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
        names_singular = pynini.invert(currency_table)
        names_plural = get_singulars(names_singular)

        currency_singular = (
            pynutil.insert("currency: \"") + convert_space(names_singular) + pynutil.insert("\"")
        )
        currency_plural = (
            pynutil.insert("currency: \"") + convert_space(names_plural) + pynutil.insert("\"")
        )

        # Two digits pass through; a single digit gets a "0" in front.
        two_digit_cents = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

        # twelve dollars (and) fifty cents, zero cents
        several_cents = (
            pynutil.add_weight(((NEMO_SIGMA - "one") @ number_graph), -0.7) @ two_digit_cents
            + delete_space
            + pynutil.delete("cents")
        )
        single_cent = pynini.cross("one", "01") + delete_space + pynutil.delete("cent")
        cents_standalone = (
            pynutil.insert("fractional_part: \"")
            + pynini.union(several_cents, single_cent)
            + pynutil.insert("\"")
        )

        optional_and = pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
        optional_cents_standalone = pynini.closure(
            delete_space + optional_and + insert_space + cents_standalone, 0, 1,
        )

        # twelve dollars fifty, only after integer
        bare_cents = pynutil.add_weight(number_graph @ two_digit_cents, -0.7)
        optional_cents_suffix = pynini.closure(
            delete_extra_space + pynutil.insert("fractional_part: \"") + bare_cents + pynutil.insert("\""), 0, 1,
        )

        def integer_amount(number, currency_field):
            # integer_part: "<number>" <currency> [cents in either form]
            return (
                pynutil.insert("integer_part: \"")
                + number
                + pynutil.insert("\"")
                + delete_extra_space
                + currency_field
                + (optional_cents_standalone | optional_cents_suffix)
            )

        # "one" pairs with the singular currency, anything else with the plural.
        graph_integer = integer_amount((NEMO_SIGMA - "one") @ number_graph, currency_plural)
        graph_integer |= integer_amount(pynini.cross("one", "1"), currency_singular)

        graph_decimal = decimal_number + delete_extra_space + currency_plural
        graph_decimal |= pynutil.insert("currency: \"$\" integer_part: \"0\" ") + cents_standalone

        final_graph = self.add_tokens(graph_integer | graph_decimal)
        self.fst = final_graph.optimize()
Exemple #26
0
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        """Assemble the whitelist classifier and publish it as ``self.graph`` / ``self.fst``.

        Args:
            input_case: when equal to "lower_cased", the first column of every
                whitelist entry (and of the extra dotted state entries) is lower-cased.
            deterministic: True adds the "St." + name rule; False instead adds the
                alternatives whitelist, the multi-format whitelist and measure units.
            input_file: optional path to an extra whitelist. In deterministic mode it
                replaces the graph built here; otherwise it is unioned into it.
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _labels_to_graph(input_case, file, keep_punct_add_end: bool = False):
            # Load two-column labels and turn them into a string-map FST.
            fold_case = input_case == "lower_cased"
            entries = [[(src.lower() if fold_case else src), dst] for src, dst in load_labels(file)]
            if keep_punct_add_end:
                entries.extend(augment_labels_with_punct_at_end(entries))
            return pynini.string_map(entries)

        graph = _labels_to_graph(input_case, get_abs_path("data/whitelist/tts.tsv"))
        graph |= _labels_to_graph(input_case, get_abs_path("data/whitelist/symbol.tsv"))

        if not deterministic:
            graph |= _labels_to_graph(
                input_case, get_abs_path("data/whitelist/alternatives.tsv"), keep_punct_add_end=True
            )
        else:
            # "st"/"St"/"ST", any number of dots removed, a space, then a known name -> "Saint <name>"
            known_names = get_names()
            saint = pynini.cross(pynini.union("st", "St", "ST"), "Saint")
            dots_removed = pynini.closure(pynutil.delete("."))
            graph |= saint + dots_removed + pynini.accep(" ") + known_names

        # Upper-case letters joined by "." or ". " (at least three letters in total):
        # the separators are deleted, and one trailing "." is dropped if present.
        for separator in (".", ". "):
            following_letters = pynini.closure(pynutil.delete(separator) + NEMO_UPPER, 2)
            optional_last_dot = pynini.closure(pynutil.delete("."), 0, 1)
            graph |= NEMO_UPPER + following_letters + optional_last_dot

        if not deterministic:
            graph |= get_formats(get_abs_path("data/whitelist/alternatives_all_format.tsv"))

            singular_units = pynini.string_file(get_abs_path("data/measure/unit.tsv")) | pynini.string_file(
                get_abs_path("data/measure/unit_alternatives.tsv")
            )
            plural_units = singular_units @ SINGULAR_TO_PLURAL
            # only inputs of three or more characters are allowed to match a unit
            three_or_more_chars = NEMO_CHAR ** (3, ...)
            graph |= pynini.compose(three_or_more_chars, convert_space(singular_units | plural_units))

        # convert to states only if comma is present before the abbreviation to avoid converting all caps words,
        # e.g. "IN", "OH", "OK"
        # TODO or only exclude above?
        state_rows = load_labels(get_abs_path("data/address/state.tsv"))
        fold_case = input_case == "lower_cased"
        dotted_rows = []
        for expansion, abbreviation in state_rows:
            key = expansion.lower() if fold_case else expansion
            # "XY" -> "X.Y", plus "X.Y." when non-deterministic
            dotted = f"{abbreviation[0]}.{abbreviation[1:]}"
            dotted_rows.append((key, dotted))
            if not deterministic:
                dotted_rows.append((key, f"{abbreviation[0]}.{abbreviation[1:]}."))
        state_rows.extend(dotted_rows)

        state_graph = pynini.string_map(state_rows)
        word_before_comma = pynini.closure(NEMO_NOT_SPACE, 1)
        comma = pynini.union(", ", ",")
        graph |= word_before_comma + comma + pynini.invert(state_graph).optimize()

        if input_file:
            user_graph = _labels_to_graph(input_case, input_file)
            if deterministic:
                # a user whitelist overrides everything built above
                graph = user_graph
            else:
                graph |= user_graph

        self.graph = (convert_space(graph)).optimize()

        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
Exemple #27
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """Build the "money" classifier FST and store it in ``self.fst``.

        The input is a currency symbol followed by a number. The output carries
        ``currency_maj``, ``integer_part``, ``fractional_part``, ``currency_min``
        and ``preserve_order`` fields, depending on which branch matches.

        Args:
            cardinal: cardinal grammar; this method reads ``graph_with_and`` and
                ``graph_hundred_component_at_least_one_none_zero_digit`` from it.
            decimal: decimal grammar; this method reads
                ``final_graph_wo_negative_w_abbr`` from it.
            deterministic: forwarded to the base class. When False, extra
                reordered integer/decimal alternatives are added to the graph.

        Note:
            ``maj_singular``, ``min_singular``, ``min_plural`` and
            ``SINGULAR_TO_PLURAL`` are not defined in this method; they must be
            provided by the enclosing module scope.
        """
        super().__init__(name="money",
                         kind="classify",
                         deterministic=deterministic)
        cardinal_graph = cardinal.graph_with_and
        graph_decimal_final = decimal.final_graph_wo_negative_w_abbr

        # Rows of the major-currency table; the first column is used below as the currency symbol.
        maj_singular_labels = load_labels(
            get_abs_path("data/money/currency_major.tsv"))
        maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
        maj_unit_singular = convert_space(maj_singular)

        # Wrap the major unit (singular / plural) in a currency_maj field.
        graph_maj_singular = pynutil.insert(
            "currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
        graph_maj_plural = pynutil.insert(
            "currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")

        # Optionally drop a "." followed by one or more "0" (an all-zero fraction).
        optional_delete_fractional_zeros = pynini.closure(
            pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1)

        # Exactly "1" -> integer_part: "one" (paired with the singular unit below).
        graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross(
            "1", "one") + pynutil.insert("\"")
        # only for decimals where third decimal after comma is non-zero or with quantity
        # Shape: digits (commas deleted), ".", at least two digits, a non-zero digit,
        # then any trailing zeros deleted.
        decimal_delete_last_zeros = (
            pynini.closure(NEMO_DIGIT | pynutil.delete(",")) +
            pynini.accep(".") + pynini.closure(NEMO_DIGIT, 2) +
            (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0")))
        # Any string whose last character is in NEMO_ALPHA.
        decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA

        # `@` binds tighter than `+`: only the parenthesised filter is composed
        # with graph_decimal_final; the unit and space are concatenated in front.
        graph_decimal = (graph_maj_plural + insert_space +
                         (decimal_delete_last_zeros | decimal_with_quantity)
                         @ graph_decimal_final)

        # Any input other than exactly "1", passed through the cardinal graph.
        graph_integer = (pynutil.insert("integer_part: \"") +
                         ((NEMO_SIGMA - "1") @ cardinal_graph) +
                         pynutil.insert("\""))

        # Unit first, then the integer: singular unit for "1", plural otherwise.
        graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
        graph_integer_only |= graph_maj_plural + insert_space + graph_integer

        final_graph = (graph_integer_only +
                       optional_delete_fractional_zeros) | graph_decimal

        # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
        # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
        # not accepted: 002, 00, 0,
        two_digits_fractional_part = (
            pynini.closure(NEMO_DIGIT) +
            (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0"))) @ (
                (pynutil.delete("0") + (NEMO_DIGIT - "0"))
                | ((NEMO_DIGIT - "0") + pynutil.insert("0"))
                | ((NEMO_DIGIT - "0") + NEMO_DIGIT))

        # Wrap the minor unit (singular / plural) in a currency_min field.
        graph_min_singular = pynutil.insert(
            " currency_min: \"") + min_singular + pynutil.insert("\"")
        graph_min_plural = pynutil.insert(
            " currency_min: \"") + min_plural + pynutil.insert("\"")
        # format ** dollars ** cent
        # Accumulators, unioned across all currency symbols in the loop below.
        # The two *_reordered accumulators are only filled when not deterministic.
        decimal_graph_with_minor = None
        integer_graph_reordered = None
        decimal_default_reordered = None
        for curr_symbol, _ in maj_singular_labels:
            preserve_order = pynutil.insert(" preserve_order: true")
            # Number first, then the major unit. `insert(curr_symbol) @ graph_maj_*`
            # feeds the symbol into the unit graph, so the unit field is emitted
            # without reading further input at that position.
            integer_plus_maj = graph_integer + insert_space + pynutil.insert(
                curr_symbol) @ graph_maj_plural
            integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(
                curr_symbol) @ graph_maj_singular

            # Variant with thousands separators: starts with a non-zero digit,
            # commas are deleted before the number reaches integer_plus_maj.
            integer_plus_maj_with_comma = pynini.compose(
                NEMO_DIGIT - "0" +
                pynini.closure(NEMO_DIGIT | pynutil.delete(",")),
                integer_plus_maj)
            # Plain digit strings, excluding the single string "0".
            integer_plus_maj = pynini.compose(
                pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
            integer_plus_maj |= integer_plus_maj_with_comma

            # Fraction that normalises to "1" -> "one" (paired with the singular minor unit).
            graph_fractional_one = two_digits_fractional_part @ pynini.cross(
                "1", "one")
            graph_fractional_one = pynutil.insert(
                "fractional_part: \"") + graph_fractional_one + pynutil.insert(
                    "\"")
            # Any other one- or two-digit fraction, verbalised via the cardinal grammar.
            graph_fractional = (two_digits_fractional_part @ (
                pynini.closure(NEMO_DIGIT, 1, 2) - "1"
            ) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit)
            graph_fractional = pynutil.insert(
                "fractional_part: \"") + graph_fractional + pynutil.insert(
                    "\"")

            # Fraction followed by the minor unit: plural in general, singular for "one".
            fractional_plus_min = graph_fractional + insert_space + pynutil.insert(
                curr_symbol) @ graph_min_plural
            fractional_plus_min |= (
                graph_fractional_one + insert_space +
                pynutil.insert(curr_symbol) @ graph_min_singular)

            # <integer + major unit> "." <fraction + minor unit>; the "." becomes a space.
            decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(
                ".", " ") + fractional_plus_min

            if not deterministic:
                # Extra alternative (weight 0.0001): fractional part with no minor-unit field.
                decimal_graph_with_minor_curr |= pynutil.add_weight(
                    integer_plus_maj + pynini.cross(".", " ") +
                    pynutil.insert("fractional_part: \"") +
                    two_digits_fractional_part @ cardinal.
                    graph_hundred_component_at_least_one_none_zero_digit +
                    pynutil.insert("\""),
                    weight=0.0001,
                )
                # Only bound when not deterministic; its only use below sits
                # under the same condition.
                default_fraction_graph = (
                    decimal_delete_last_zeros
                    | decimal_with_quantity) @ graph_decimal_final
            # Fraction-only amounts: an optional single "0" and the "." are deleted.
            decimal_graph_with_minor_curr |= (
                pynini.closure(pynutil.delete("0"), 0, 1) +
                pynutil.delete(".") + fractional_plus_min)
            # Consume the leading currency symbol and mark the token as order-preserving.
            decimal_graph_with_minor_curr = (pynutil.delete(curr_symbol) +
                                             decimal_graph_with_minor_curr +
                                             preserve_order)

            # First symbol initialises the accumulator; later ones are unioned in.
            decimal_graph_with_minor = (
                decimal_graph_with_minor_curr
                if decimal_graph_with_minor is None else pynini.union(
                    decimal_graph_with_minor,
                    decimal_graph_with_minor_curr).optimize())

            if not deterministic:
                # Symbol deleted, then number + major unit, order preserved.
                integer_graph_reordered_curr = (pynutil.delete(curr_symbol) +
                                                integer_plus_maj +
                                                preserve_order).optimize()

                integer_graph_reordered = (
                    integer_graph_reordered_curr
                    if integer_graph_reordered is None else pynini.union(
                        integer_graph_reordered,
                        integer_graph_reordered_curr).optimize())
                # Symbol deleted, then the default decimal reading followed by the plural major unit.
                decimal_default_reordered_curr = (
                    pynutil.delete(curr_symbol) + default_fraction_graph +
                    insert_space +
                    pynutil.insert(curr_symbol) @ graph_maj_plural)

                decimal_default_reordered = (
                    decimal_default_reordered_curr
                    if decimal_default_reordered is None else pynini.union(
                        decimal_default_reordered,
                        decimal_default_reordered_curr)).optimize()

        # weight for SH
        # NOTE(review): what "SH" stands for is not visible in this block. The small
        # negative weight makes this branch cheaper than the unweighted ones.
        # NOTE(review): if maj_singular_labels is empty, decimal_graph_with_minor is
        # still None here — confirm the label file is never empty.
        final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.0001)

        if not deterministic:
            final_graph |= integer_graph_reordered | decimal_default_reordered
            # to handle "$2.00" cases
            # The "." and the trailing zeros are deleted before the rest is
            # handed to integer_graph_reordered.
            final_graph |= pynini.compose(
                NEMO_SIGMA + pynutil.delete(".") +
                pynini.closure(pynutil.delete("0"), 1),
                integer_graph_reordered)
        final_graph = self.add_tokens(final_graph.optimize())
        self.fst = final_graph.optimize()
Exemple #28
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """Build the time classifier for written times and store it in ``self.fst``.

        Four input shapes are accepted: "h:mm", "h:mm:ss", "h.mm <suffix>" and
        "h <suffix>". The suffix is required for the last two and optional for
        the first two; a time zone is optional for all of them.

        Args:
            cardinal: cardinal grammar; its ``graph`` attribute converts the
                hour, minute and second digits.
            deterministic: forwarded unchanged to the base-class constructor.
        """
        super().__init__(name="time",
                         kind="classify",
                         deterministic=deterministic)

        suffix_rows = load_labels(get_abs_path("data/time/suffix.tsv"))
        suffix_rows.extend(augment_labels_with_punct_at_end(suffix_rows))
        suffixes = pynini.string_map(suffix_rows)
        zones = pynini.string_file(get_abs_path("data/time/zone.tsv"))

        # only used for < 1000 thousand -> 0 weight
        number_graph = cardinal.graph

        # Two digits pass through unchanged; a single digit may shed one leading "0".
        strip_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT)

        hour_values = pynini.union(*(str(h) for h in range(24)))
        hour_graph = strip_leading_zero @ hour_values @ number_graph
        one_digit_value = pynini.union(*(str(m)
                                         for m in range(1, 10))) @ number_graph
        two_digit_value = pynini.union(*(str(m)
                                         for m in range(10, 60))) @ number_graph

        def _sixty_field(opening):
            # Minutes and seconds share one shape: "0<d>" is rendered as "o <d>",
            # while 10-59 go straight through the cardinal graph.
            leading_o = pynini.cross("0", "o") + insert_space + one_digit_value
            return (pynutil.insert(opening) + (leading_o | two_digit_value) +
                    pynutil.insert("\""))

        hour_field = (pynutil.insert("hours: \"") + hour_graph +
                      pynutil.insert("\""))
        minute_field = _sixty_field("minutes: \"")
        second_field = _sixty_field("seconds: \"")
        suffix_field = (pynutil.insert("suffix: \"") + convert_space(suffixes) +
                        pynutil.insert("\""))

        spaced_suffix = delete_space + insert_space + suffix_field
        maybe_suffix = pynini.closure(spaced_suffix, 0, 1)
        maybe_zone = pynini.closure(
            delete_space + insert_space + pynutil.insert("zone: \"") +
            convert_space(zones) + pynutil.insert("\""),
            0,
            1,
        )

        # "00" minutes are dropped entirely; anything else becomes a minutes field.
        minutes_unless_zero = pynutil.delete("00") | (insert_space +
                                                      minute_field)

        # 2:30 pm, 02:30, 2:00
        colon_hm = (hour_field + pynutil.delete(":") + minutes_unless_zero +
                    maybe_suffix + maybe_zone)

        # 10:30:05 pm,
        # here "00" is kept and spelled out as "zero" for both minutes and seconds
        minutes_with_zero = pynini.cross(
            "00", " minutes: \"zero\"") | (insert_space + minute_field)
        seconds_with_zero = pynini.cross(
            "00", " seconds: \"zero\"") | (insert_space + second_field)
        colon_hms = (hour_field + pynutil.delete(":") + minutes_with_zero +
                     pynutil.delete(":") + seconds_with_zero + maybe_suffix +
                     maybe_zone)

        # 2.xx pm/am
        dotted_hm = (hour_field + pynutil.delete(".") + minutes_unless_zero +
                     spaced_suffix + maybe_zone)

        # 2 pm est
        hour_only = hour_field + spaced_suffix + maybe_zone

        tagged = pynini.union(colon_hm, hour_only, dotted_hm,
                              colon_hms).optimize()
        tagged = self.add_tokens(tagged)
        self.fst = tagged.optimize()
Exemple #29
0
    def __init__(self):
        """Build the time classifier for spoken-form times and store it in ``self.fst``.

        Accepted shapes: "<hour> <minute>" (including "o'clock" and "o <digit>"),
        "<minute> past <hour>", "quarter to/till <hour>", each with an optional
        suffix and time zone, plus "<hour> [<minute>] <suffix>" with a required
        suffix.
        """
        super().__init__(name="time", kind="classify")
        # hours, minutes, seconds, suffix, zone, style, speak_period

        suffixes = pynini.string_file(
            get_abs_path("data/time/time_suffix.tsv"))
        zones = pynini.invert(
            pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
        quarter_to_hour = pynini.string_file(
            get_abs_path("data/time/time_to.tsv"))

        # only used for < 1000 thousand -> 0 weight
        number_graph = pynutil.add_weight(CardinalFst().graph_no_exception,
                                          weight=-0.7)

        def _spoken_range(start, stop):
            # Number words for start..stop-1, mapped through the cardinal graph.
            words = [num_to_word(n) for n in range(start, stop)]
            return pynini.union(*words) @ number_graph

        hour_value = _spoken_range(0, 24)
        minute_one_digit = _spoken_range(1, 10)
        minute_two_digit = _spoken_range(10, 60)
        minute_named = pynini.cross("half", "30") | pynini.cross(
            "quarter", "15")
        oclock = pynini.cross(
            pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

        hour_field = (pynutil.insert("hours: \"") + hour_value +
                      pynutil.insert("\""))
        # "o'clock" -> "00"; "o <digit>" -> the digit; otherwise a 10-59 value
        minute_value = ((oclock + pynutil.insert("00"))
                        | (pynutil.delete("o") + delete_space +
                           minute_one_digit)
                        | minute_two_digit)
        suffix_field = (pynutil.insert("suffix: \"") + convert_space(suffixes) +
                        pynutil.insert("\""))
        maybe_suffix = pynini.closure(
            delete_space + insert_space + suffix_field, 0, 1)
        maybe_zone = pynini.closure(
            delete_space + insert_space + pynutil.insert("zone: \"") +
            convert_space(zones) + pynutil.insert("\""),
            0,
            1,
        )

        # five o' clock
        # two o eight, two thirty five (am/pm)
        # two pm/am
        hour_then_minute = (hour_field + delete_extra_space +
                            pynutil.insert("minutes: \"") + minute_value +
                            pynutil.insert("\""))

        # 10 past four, quarter past four, half past four
        minute_past_hour = (
            pynutil.insert("minutes: \"") +
            pynini.union(minute_one_digit, minute_two_digit, minute_named) +
            pynutil.insert("\"") + delete_space + pynutil.delete("past") +
            delete_extra_space + hour_field)

        # quarter to/till <hour>: minutes are fixed at "45" and the hour value
        # comes from the time_to table
        quarter_before_hour = (
            pynutil.insert("minutes: \"") + pynini.cross("quarter", "45") +
            pynutil.insert("\"") + delete_space +
            pynutil.delete(pynini.union("to", "till")) + delete_extra_space +
            pynutil.insert("hours: \"") + quarter_to_hour +
            pynutil.insert("\""))

        # hour with a mandatory suffix; minutes default to "00" when not spoken
        hour_with_suffix = (hour_field + delete_extra_space +
                            pynutil.insert("minutes: \"") +
                            (pynutil.insert("00") | minute_value) +
                            pynutil.insert("\"") + delete_space +
                            insert_space + suffix_field + maybe_zone)

        tagged = (hour_then_minute | minute_past_hour
                  | quarter_before_hour) + maybe_suffix + maybe_zone
        tagged |= hour_with_suffix

        tagged = self.add_tokens(tagged)

        self.fst = tagged.optimize()
Exemple #30
0
    def __init__(self, cardinal_tagger: GraphFst, deterministic: bool = True):
        """Build the German time verbalizer; results go to ``self.graph`` and ``self.fst``.

        Supported renderings: "<h> uhr", "<h> uhr <m>",
        "<h> uhr <m> minuten <s> sekunden", "<m> nach <h>", "halb <h>" and
        "<m> vor <h>", each optionally followed by a time zone.

        Args:
            cardinal_tagger: cardinal grammar; its ``two_digit_non_zero`` graph
                (together with the zero table) turns digits into words.
            deterministic: forwarded unchanged to the base-class constructor.
        """
        super().__init__(name="time",
                         kind="verbalize",
                         deterministic=deterministic)

        def _inverted_table(relative_path):
            # Load a TSV mapping, swap its input and output sides, and optimize it.
            return pynini.invert(
                pynini.string_file(get_abs_path(relative_path))).optimize()

        def _digits_of(opening):
            # Strip `<field>: "` and the closing quote, keeping the digits in between.
            return (pynutil.delete(opening) + pynini.closure(NEMO_DIGIT, 1) +
                    pynutil.delete("\""))

        # add weight so when using inverse text normalization this conversion is deprioritized
        night_to_early = pynutil.add_weight(
            _inverted_table("data/time/hour_to_night.tsv"), weight=0.0001)
        hour_to = _inverted_table("data/time/hour_to.tsv")
        minute_to = _inverted_table("data/time/minute_to.tsv")

        # Only the second column of the zone table is used here.
        zone_names = [
            row[1]
            for row in load_labels(get_abs_path("data/time/time_zone.tsv"))
        ]
        time_zone_graph = pynini.invert(
            convert_space(pynini.union(*zone_names)))

        graph_zero = _inverted_table("data/numbers/zero.tsv")
        number_verbalization = graph_zero | cardinal_tagger.two_digit_non_zero

        hour = _digits_of("hours: \"")
        minute = _digits_of("minutes: \"")
        second = _digits_of("seconds: \"")

        # A bare "eins" becomes "ein" before " uhr" is appended.
        eins_to_ein = pynini.cdrewrite(pynini.cross("eins", "ein"), "[BOS]",
                                       "[EOS]", NEMO_SIGMA)
        hour_verbalized = (hour @ number_verbalization
                           @ eins_to_ein) + pynutil.insert(" uhr")

        zone = (pynutil.delete("zone: \"") + time_zone_graph +
                pynutil.delete("\""))
        optional_zone = pynini.closure(pynini.accep(" ") + zone, 0, 1)

        minute_verbalized = minute @ number_verbalization
        second_verbalized = second @ number_verbalization

        graph_hms = (hour_verbalized + pynini.accep(" ") + minute_verbalized +
                     pynutil.insert(" minuten") + pynini.accep(" ") +
                     second_verbalized + pynutil.insert(" sekunden") +
                     optional_zone)
        # Singular forms when the count is "eins".
        singular_units = (pynini.cross("eins minuten", "eine minute")
                          | pynini.cross("eins sekunden", "eine sekunde"))
        graph_hms @= pynini.cdrewrite(
            singular_units,
            pynini.union(" ", "[BOS]"),
            "",
            NEMO_SIGMA,
        )

        min_30 = pynini.union(*(str(m) for m in range(1, 31)))
        min_29 = pynini.union(*(str(m) for m in range(1, 30)))

        graph_h = hour_verbalized
        graph_hm = hour_verbalized + pynini.accep(" ") + minute_verbalized

        # 15 may also be spoken as "viertel".
        number_or_quarter = number_verbalization | pynini.cross(
            "15", "viertel")
        # Hour digits after the optional night_to_early rewrite of the whole value.
        hour_rewritten = hour @ pynini.cdrewrite(night_to_early, "[BOS]",
                                                 "[EOS]", NEMO_SIGMA)
        hour_plain = hour_rewritten @ number_verbalization
        # presumably hour_to maps to the hour being approached — confirm against hour_to.tsv
        hour_shifted = hour_rewritten @ hour_to @ number_verbalization

        # "<m> nach <h>" for minutes 1-30
        graph_m_past_h = ((minute @ min_30 @ number_or_quarter) +
                          pynini.accep(" ") + pynutil.insert("nach ") +
                          hour_plain)
        # "halb <h>" for minute 30
        graph_m30_h = ((minute @ pynini.cross("30", "halb")) +
                       pynini.accep(" ") + hour_shifted)
        # "<m> vor <h>": minutes go through minute_to and must land in 1-29
        graph_m_to_h = ((minute @ minute_to @ min_29 @ number_or_quarter) +
                        pynini.accep(" ") + pynutil.insert("vor ") +
                        hour_shifted)

        # The relative-to-hour readings carry a small penalty so the plain
        # "<h> uhr ..." readings win when both apply.
        # NOTE(review): graph_hms already ends with optional_zone, so that branch
        # can accept two zone fields here — confirm this is intended.
        self.graph = (graph_hms
                      | graph_h
                      | graph_hm
                      | pynutil.add_weight(graph_m_past_h, weight=0.0001)
                      | pynutil.add_weight(graph_m30_h, weight=0.0001)
                      | pynutil.add_weight(graph_m_to_h,
                                           weight=0.0001)) + optional_zone
        token_graph = self.delete_tokens(self.graph + delete_preserve_order)
        self.fst = token_graph.optimize()