Ejemplo n.º 1
0
def get_hundreds_graph(deterministic: bool = True):
    """
    Builds a transducer for four digit numbers read as a combination of
    ties/teen or digits (using hundred instead of thousand format), e.g.
    1219 -> twelve nineteen
    3900 -> thirty nine hundred

    Args:
        deterministic: forwarded unchanged to get_ties_graph()

    Returns:
        the union of all readings assembled below
    """
    two_digit = get_ties_graph(deterministic)

    # two components from get_ties_graph(), separated by an inserted space
    pair_reading = two_digit + insert_space + two_digit

    # <teen> followed by "00" -> "<teen> hundred"
    teen_hundred = graph_teen + insert_space + pynini.cross("00", "hundred")

    # <teen> + tens digit + "0s": the "0s" is dropped and the end of the
    # output string is rewritten ("y" -> "ies", otherwise an "s" is appended)
    pluralize_end = pynini.cdrewrite(pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA)
    tens_word = ties_graph | pynini.cross("1", "ten")
    teen_decade = (graph_teen + insert_space + tens_word + pynutil.delete("0s")) @ pluralize_end

    # <digit> "00" then either a dropped "0" or one more digit -> "<digit> thousand [<digit>]"
    after_thousand = pynutil.delete("0") | insert_space + graph_digit
    digit_thousand = pynutil.add_weight(
        graph_digit + insert_space + pynini.cross("00", "thousand") + after_thousand, weight=-0.001,
    )

    # <digit> "000" + optional blank + "s" -> "<digit> thousands"
    optional_blank = pynini.closure(pynutil.delete(" "), 0, 1)
    digit_thousands = pynutil.add_weight(
        graph_digit + insert_space + pynini.cross("000", "thousand") + optional_blank + pynini.accep("s"),
        weight=-0.001,
    )

    return pair_reading | teen_hundred | teen_decade | digit_thousand | digit_thousands
Ejemplo n.º 2
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """
        Builds the classifier for spoken measures: a cardinal or decimal number
        followed by a unit name, tagged as `cardinal { integer: "..." } units: "..."`
        or `decimal { ... } units: "..."`.

        Args:
            cardinal: grammar exposing `graph_no_exception`
            decimal: grammar exposing `final_graph_wo_negative`
        """
        super().__init__(name="measure", kind="classify")
        # decimal, fraction, cardinal, units, style(depr)

        number_graph = cardinal.graph_no_exception

        unit_table = pynini.string_file(get_abs_path("data/measurements.tsv"))
        singular_to_abbr = pynini.invert(unit_table)  # singular -> abbr
        plural_to_abbr = get_singulars(singular_to_abbr)  # plural -> abbr

        # an optional leading "minus" becomes the field negative: "true"
        minus_field = pynutil.insert("negative: ") + pynini.cross("minus", "\"true\"") + delete_extra_space
        optional_negative = pynini.closure(minus_field, 0, 1)

        singular_name = convert_space(singular_to_abbr)
        plural_name = convert_space(plural_to_abbr)
        # "per <unit>" -> "/<abbr>"
        per_unit = pynutil.insert("/") + pynutil.delete("per") + delete_space + convert_space(singular_to_abbr)

        def units_field(name_fst):
            # the unit alone, "per <unit>" alone, or both in sequence (the sequence carries weight 0.01)
            name_then_per = pynutil.add_weight(name_fst + delete_space + per_unit, 0.01)
            return pynutil.insert("units: \"") + (name_fst | per_unit | name_then_per) + pynutil.insert("\"")

        def cardinal_measure(integer_fst, units_fst):
            # cardinal { [negative] integer: "<n>" } followed by the units field
            return (
                pynutil.insert("cardinal { ")
                + optional_negative
                + pynutil.insert("integer: \"")
                + integer_fst
                + pynutil.insert("\"")
                + pynutil.insert(" }")
                + delete_extra_space
                + units_fst
            )

        singular_units = units_field(singular_name)
        plural_units = units_field(plural_name)

        decimal_measure = (
            pynutil.insert("decimal { ")
            + optional_negative
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + plural_units
        )

        # anything but "one" pairs with the plural unit, "one" pairs with the singular unit
        cardinal_measures = cardinal_measure((NEMO_SIGMA - "one") @ number_graph, plural_units)
        cardinal_measures |= cardinal_measure(pynini.cross("one", "1"), singular_units)

        tagged = self.add_tokens(decimal_measure | cardinal_measures)
        self.fst = tagged.optimize()
Ejemplo n.º 3
0
    def __init__(self, deterministic: bool = True):
        """
        Builds the verbalizer for fraction tokens made of optional `integer`,
        `numerator` and `denominator` fields.

        Args:
            deterministic: when False the inserted "and " after the integer part
                becomes optional
        """
        super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)
        ordinal_suffix = OrdinalFst().suffix

        def quoted_field(field_name):
            # strips `<field_name>: "` and the closing `" `, keeping the quoted content
            return pynutil.delete(field_name + ": \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ")

        whole_part = quoted_field("integer")
        any_numerator = quoted_field("numerator")
        one_numerator = pynutil.delete("numerator: \"") + pynini.accep("one") + pynutil.delete("\" ")

        # denominator content goes through the ordinal suffix rewrite, or "four" -> "quarter";
        # the closing quote is removed by the callers below
        spoken_denominator = pynutil.delete("denominator: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) @ ordinal_suffix | pynini.cross('four', 'quarter')
        )

        and_word = pynutil.insert("and ")
        if not deterministic:
            and_word = pynini.closure(and_word, 0, 1)
        optional_whole = pynini.closure(whole_part + insert_space + and_word, 0, 1)

        a_half = pynini.cross("numerator: \"one\" denominator: \"two\"", "a half")
        over_one_or_halves = pynini.cross("denominator: \"one\"", "over one") | pynini.cross(
            "denominator: \"two\"", "halves"
        )

        # generic case: "<numerator> <denominator>s"
        plural_fraction = pynutil.add_weight(
            any_numerator + insert_space + spoken_denominator + pynutil.insert("s") + pynutil.delete("\""), 0.001
        )
        # numerator "one": no plural "s"
        singular_fraction = pynutil.add_weight(
            one_numerator + insert_space + spoken_denominator + pynutil.delete("\""), 0.0001
        )

        # NOTE(review): `+` binds tighter than `|`, so the optional integer prefix is
        # attached to the "a half" branch only — confirm this is intended.
        graph = optional_whole + a_half | (singular_fraction | plural_fraction)
        graph |= pynini.cross("numerator: \"one\" denominator: \"two\"", "one half")
        graph |= (any_numerator | one_numerator) + insert_space + over_one_or_halves

        self.graph = graph
        without_tokens = self.delete_tokens(self.graph)
        self.fst = without_tokens.optimize()
Ejemplo n.º 4
0
    def __init__(self, ordinal: GraphFst, cardinal: GraphFst):
        """
        Builds the date classifier: "<day> <month> [<year>]" or a standalone
        year, tagged with day / month / year fields and `preserve_order: true`.

        Args:
            ordinal: grammar exposing `graph`, used for the day
            cardinal: stored on `self.cardinal` before the year graph is requested
        """
        super().__init__(name="date", kind="classify")

        self.cardinal = cardinal
        day_numbers = ordinal.graph

        year_bias = 0.001
        biased_year = pynutil.add_weight(self._get_year_graph(), year_bias)

        month_field = pynutil.insert("month: \"") + _get_month_graph() + pynutil.insert("\"")
        day_field = pynutil.insert("day: \"") + pynutil.add_weight(day_numbers, -0.7) + pynutil.insert("\"")

        # inside a full date the year bias is cancelled by the opposite weight
        year_after_month = (
            delete_extra_space
            + pynutil.insert("year: \"")
            + pynutil.add_weight(biased_year, -year_bias)
            + pynutil.insert("\"")
        )
        day_month_year = day_field + delete_extra_space + month_field + pynini.closure(year_after_month, 0, 1)

        # a standalone year may be followed by "er" or "ern", which is kept inside the year field
        year_ending = pynini.closure(pynini.accep('er') + pynini.closure(pynini.accep('n'), 0, 1), 0, 1)
        year_only = pynutil.insert("year: \"") + biased_year + year_ending + pynutil.insert("\"")

        tagged = (day_month_year | year_only) + pynutil.insert(" preserve_order: true")
        self.fst = self.add_tokens(tagged).optimize()
Ejemplo n.º 5
0
    def add_optional_and(self, graph):
        """
        Returns `graph` extended with readings that contain an extra "and":
        "hundred " -> "hundred and " when no "thousand"/"million" follows, and
        "thousand " -> "thousand and " when no "hundred" appears in the remainder.
        The unchanged `graph` stays available with weight 0.00001.

        Args:
            graph: transducer whose output side is rewritten

        Returns:
            union of the rewritten and the original graph
        """
        if not self.deterministic:
            # optionally replace one "hundred " in the output with a single space
            drop_hundred = pynini.closure(pynini.cross("hundred ", " "), 0, 1)
            graph = pynini.compose(graph, NEMO_SIGMA + drop_hundred + NEMO_SIGMA)

        any_text = pynini.closure(NEMO_NOT_QUOTE)

        # remainder that mentions neither "thousand" nor "million"
        tail_without_large_units = pynini.difference(
            any_text, any_text + pynini.union("thousand", "million") + any_text
        ).optimize()
        after_hundred = pynutil.add_weight(
            pynini.cross("hundred ", "hundred and ") + tail_without_large_units, -0.0001
        )
        and_inserter = (any_text + after_hundred).optimize()

        # remainder that does not mention "hundred"
        tail_without_hundred = pynini.difference(
            NEMO_SIGMA, any_text + pynini.accep("hundred") + any_text
        ).optimize()
        after_thousand = pynutil.add_weight(
            pynini.cross("thousand ", "thousand and ") + tail_without_hundred, -0.0001
        )
        and_inserter |= (any_text + after_thousand).optimize()

        with_and = pynini.compose(graph, and_inserter).optimize()
        return with_and | pynutil.add_weight(graph, 0.00001)
Ejemplo n.º 6
0
def roman_to_int(fst: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Composes a Roman-numeral-to-Arabic-digits converter (lower and upper case
    numerals) in front of the given fst. Numerals are assembled from the digit,
    ties and hundreds tables.
    e.g.
        "V" -> "5"
        "i" -> "1"

    Args:
        fst: Any fst. It receives the converted digits as its input.

    Returns:
        the Roman conversion composed with `fst`
    """

    def _both_cases(tsv_path: str):
        # every table entry is accepted as written and in upper case
        pairs = load_labels(get_abs_path(tsv_path))
        as_written = [(numeral, value) for numeral, value in pairs]
        upper_cased = [(numeral.upper(), value) for numeral, value in pairs]
        return pynini.string_map(as_written + upper_cased)

    def _or_zero(position):
        # a missing position is filled with an inserted "0" at a small penalty
        return position | pynutil.add_weight(pynutil.insert("0"), 0.01)

    digit = _both_cases("data/roman/digit.tsv")
    ties = _both_cases("data/roman/ties.tsv")
    hundreds = _both_cases("data/roman/hundreds.tsv")

    one_position = digit
    two_positions = ties + _or_zero(digit)
    three_positions = hundreds + _or_zero(ties) + _or_zero(digit)

    roman = (one_position | two_positions | three_positions).optimize()
    return roman @ fst
Ejemplo n.º 7
0
    def __init__(self, ordinal: GraphFst):
        """
        Builds the date classifier for three shapes:
        "<month> [<day>] [<year>]", "the <day> of <month> [<year>]" and a
        standalone year or range. Output fields are day / month / year followed
        by `preserve_order: true`.

        Args:
            ordinal: grammar exposing `graph`, used for the day
        """
        super().__init__(name="date", kind="classify")

        day_numbers = ordinal.graph

        year_bias = 0.001
        biased_year = pynutil.add_weight(_get_year_graph(), year_bias)

        month_field = pynutil.insert("month: \"") + _get_month_graph() + pynutil.insert("\"")
        day_field = pynutil.insert("day: \"") + pynutil.add_weight(day_numbers, -0.7) + pynutil.insert("\"")

        day_after_month = pynini.closure(delete_extra_space + day_field, 0, 1)
        # inside a full date the year bias is cancelled by the opposite weight
        year_at_end = pynini.closure(
            delete_extra_space
            + pynutil.insert("year: \"")
            + pynutil.add_weight(biased_year, -year_bias)
            + pynutil.insert("\""),
            0,
            1,
        )

        month_day_year = month_field + day_after_month + year_at_end
        # "the <day> of <month> [<year>]"
        day_of_month_year = (
            pynutil.delete("the")
            + delete_space
            + day_field
            + delete_space
            + pynutil.delete("of")
            + delete_extra_space
            + month_field
            + year_at_end
        )
        year_only = pynutil.insert("year: \"") + (biased_year | _get_range_graph()) + pynutil.insert("\"")

        tagged = (month_day_year | day_of_month_year | year_only) + pynutil.insert(" preserve_order: true")
        self.fst = self.add_tokens(tagged).optimize()
Ejemplo n.º 8
0
def get_alternative_formats():
    """
    Utils to get alternative formats for numbers.

    Returns:
        dict with two optimized FSTs:
        'one_thousand_alternative' - rewrite applied at the beginning of the
            string, mapping the second word of each alternative in
            cardinals_alternatives.tsv to the full alternative
        'separators' - weighted union of the thousands-delimiter grammars
            loaded from universal_thousands_punct.far
    """
    labels = load_labels(get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    # only the alternative column is used; each row must unpack into exactly two values
    second_word_to_alternative = pynini.string_map(
        [(alternative.split()[1], alternative) for _default, alternative in labels]
    )
    rewrite_at_start = pynini.cdrewrite(second_word_to_alternative, "[BOS]", "", NEMO_SIGMA)

    # Adapted from
    # https://github.com/google/TextNormalizationCoveringGrammars/blob/master/src/universal/thousands_punct.grm
    # Specifies common ways of delimiting thousands in digit strings.
    far = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    dot_delimited = pynutil.add_weight(far['dot_thousands'], 0.1)
    undelimited = pynutil.add_weight(far['no_delimiter'], -0.1)
    space_delimited = pynutil.add_weight(far['space_thousands'], 0.1)
    separators = dot_delimited | undelimited | space_delimited

    return {
        'one_thousand_alternative': rewrite_at_start.optimize(),
        'separators': separators.optimize(),
    }
Ejemplo n.º 9
0
    def __init__(self,
                 whitelist: 'pynini.FstLike',
                 deterministic: bool = True):
        """
        Builds the abbreviation classifier: letter sequences are spelled out
        with inserted spaces and tagged as `value: "..."`. Inputs that the
        whitelist grammar accepts are excluded.

        Args:
            whitelist: object whose `.graph` supplies the inputs to exclude
                (NOTE(review): annotated as an FstLike but accessed via `.graph` — confirm)
            deterministic: forwarded to the base class
        """
        super().__init__(name="abbreviation",
                         kind="classify",
                         deterministic=deterministic)

        # two or more upper-case letters, separated by inserted spaces
        spelled_upper = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)

        # one TO_LOWER letter followed by any number of TO_LOWER / lower-case letters
        lowered_sequence = pynutil.add_weight(
            TO_LOWER + pynini.closure(insert_space + pynini.union(TO_LOWER | NEMO_LOWER)), 110)
        # at least two upper-case letters kept together, then spaced lower-case letters
        upper_then_lower = pynutil.add_weight(
            pynini.closure(NEMO_UPPER, 2) + pynini.closure(insert_space + NEMO_LOWER, 1), 110)
        # dotted upper-case letters, dots removed
        dotted_upper = (
            NEMO_UPPER + pynutil.delete(".")
            + pynini.closure(insert_space + NEMO_UPPER + pynutil.delete(".")))
        # dotted letters through TO_LOWER, dots removed
        dotted_lowered = pynutil.add_weight(
            TO_LOWER + pynutil.delete(".")
            + pynini.closure(insert_space + TO_LOWER + pynutil.delete(".")), 110)

        other_shapes = lowered_sequence | upper_then_lower | dotted_upper | dotted_lowered

        # set weight of the misc graph to a value higher than word
        graph = pynutil.add_weight(spelled_upper.optimize(), 10) | pynutil.add_weight(other_shapes.optimize(), 101)

        # exclude words that are included in the whitelist
        accepted_inputs = pynini.difference(
            pynini.project(graph, "input"), pynini.project(whitelist.graph, "input"))
        graph = pynini.compose(accepted_inputs, graph)

        graph = pynutil.insert("value: \"") + graph.optimize() + pynutil.insert("\"")
        self.fst = self.add_tokens(graph).optimize()
Ejemplo n.º 10
0
    def __init__(
        self,
        input_case: str,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        deterministic: bool = True,
        whitelist: str = None,
    ):
        """
        Builds (or restores from a .far cache) the tokenize-and-classify grammar
        made of the whitelist, word and punctuation classifiers.

        Args:
            input_case: forwarded to WhiteListFst and used in the cache file name
            cache_dir: directory for the .far cache; None or the string "None" disables caching
            overwrite_cache: when True an existing cache file is ignored and rewritten
            deterministic: forwarded to the sub-grammars and used in the cache file name
            whitelist: path whose base name becomes part of the cache file name
                (NOTE(review): it is not passed on to WhiteListFst here — confirm intended)
        """
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_name = f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            far_file = os.path.join(cache_dir, far_name)

        # cache hit: load the stored grammar and stop
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
            return

        logging.info("Creating ClassifyFst grammars.")

        word_graph = WordFst(deterministic=deterministic).fst
        whitelist_graph = WhiteListFst(input_case=input_case, deterministic=deterministic).fst
        punct_graph = PunctuationFst(deterministic=deterministic).fst

        def whitespace_gap():
            # one or more white-space characters, passed through delete_extra_space
            return pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)

        classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(word_graph, 100)

        punct_token = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
        # one or more items, each either a white-space gap or a space-prefixed punctuation token
        punct = pynini.closure(whitespace_gap() | (pynutil.insert(" ") + punct_token), 1)

        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
        leading_punct = pynini.closure(punct + pynutil.insert(" "))
        trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
        token_plus_punct = leading_punct + token + trailing_punct

        separator = whitespace_gap() | (pynutil.insert(" ") + punct + pynutil.insert(" "))
        more_tokens = pynini.closure(separator + token_plus_punct).optimize()

        graph = delete_space + (token_plus_punct + more_tokens) + delete_space
        # input consisting of punctuation / white space only
        graph |= punct

        self.fst = graph.optimize()

        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Ejemplo n.º 11
0
    def __init__(self):
        """
        Builds the classifier for spoken money amounts, tagged with
        `integer_part`, `fractional_part` and `currency` fields.

        Supported shapes (per the grammar below):
            <integer> <currency plural> [[and] <n> cents | <n>]
            one <currency singular> [[and] <n> cents | <n>]
            <decimal> <currency plural>
            <n> cents            (emitted with currency "$" and integer_part "0")

        Cent values are always emitted as two digits: a single digit gets a
        leading zero, both for "five cents" and for the bare suffix form
        "twelve dollars five" (fractional_part "05", not "5").
        """
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency, style(depr)
        cardinal_graph = CardinalFst().graph_no_exception
        graph_decimal_final = DecimalFst().final_graph_wo_negative

        unit = pynini.string_file(get_abs_path("data/currency.tsv"))
        unit_singular = pynini.invert(unit)
        unit_plural = get_singulars(unit_singular)

        graph_unit_singular = pynutil.insert("currency: \"") + convert_space(
            unit_singular) + pynutil.insert("\"")
        graph_unit_plural = pynutil.insert("currency: \"") + convert_space(
            unit_plural) + pynutil.insert("\"")

        # accepts exactly one or two digits; a single digit is prefixed with "0"
        add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynutil.insert("0") + NEMO_DIGIT)
        # twelve dollars (and) fifty cents, zero cents
        cents_standalone = (pynutil.insert("fractional_part: \"") +
                            pynini.union(
                                pynutil.add_weight(
                                    ((NEMO_SIGMA - "one") @ cardinal_graph),
                                    -0.7) @ add_leading_zero_to_double_digit +
                                delete_space + pynutil.delete("cents"),
                                pynini.cross("one", "01") + delete_space +
                                pynutil.delete("cent"),
                            ) + pynutil.insert("\""))

        optional_cents_standalone = pynini.closure(
            delete_space +
            pynini.closure(pynutil.delete("and") + delete_space, 0, 1) +
            insert_space + cents_standalone,
            0,
            1,
        )
        # twelve dollars fifty, only after integer
        # The suffix is zero-padded like the standalone cents above, so that
        # "twelve dollars five" yields "05" rather than "5".
        optional_cents_suffix = pynini.closure(
            delete_extra_space + pynutil.insert("fractional_part: \"") +
            pynutil.add_weight(
                cardinal_graph @ add_leading_zero_to_double_digit, -0.7) +
            pynutil.insert("\""),
            0,
            1,
        )

        # any amount except "one" takes the plural currency name
        graph_integer = (pynutil.insert("integer_part: \"") +
                         ((NEMO_SIGMA - "one") @ cardinal_graph) +
                         pynutil.insert("\"") + delete_extra_space +
                         graph_unit_plural +
                         (optional_cents_standalone | optional_cents_suffix))
        # "one" takes the singular currency name
        graph_integer |= (pynutil.insert("integer_part: \"") +
                          pynini.cross("one", "1") + pynutil.insert("\"") +
                          delete_extra_space + graph_unit_singular +
                          (optional_cents_standalone | optional_cents_suffix))
        graph_decimal = graph_decimal_final + delete_extra_space + graph_unit_plural
        # cents on their own
        graph_decimal |= pynutil.insert(
            "currency: \"$\" integer_part: \"0\" ") + cents_standalone
        final_graph = graph_integer | graph_decimal
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Ejemplo n.º 12
0
    def __init__(self, number_names: dict, alternative_formats: dict, deterministic: bool = False):
        """
        Builds the cardinal classifier and publishes several intermediate
        graphs as attributes (`cardinal_numbers_default`,
        `cardinal_numbers_nominative`, `optional_graph_negative`,
        `cardinal_numbers_with_optional_negative`,
        `cardinal_numbers_with_leading_zeros`, `single_digits_graph`,
        `final_graph`).

        Args:
            number_names: forwarded to get_cardinal_numbers()
            alternative_formats: forwarded to get_cardinal_numbers()
            deterministic: forwarded to the base class
        """
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        self.cardinal_numbers_default = self.get_cardinal_numbers(number_names, alternative_formats, mode="all")
        self.cardinal_numbers_nominative = self.get_cardinal_numbers(
            number_names, alternative_formats, mode="nominative"
        )

        # an optional leading "-" becomes the field negative: "true"
        minus_field = pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space
        self.optional_graph_negative = pynini.closure(minus_field, 0, 1)

        open_integer = pynutil.insert("integer: \"")
        close_quote = pynutil.insert("\"")

        self.cardinal_numbers_with_optional_negative = (
            self.optional_graph_negative + open_integer + self.cardinal_numbers_default + close_quote
        )

        # "03" -> remove leading zeros and verbalize
        strip_zeros = pynini.closure(pynini.cross("0", ""))
        self.cardinal_numbers_with_leading_zeros = (strip_zeros + self.cardinal_numbers_default).optimize()

        # "123" -> "один два три"
        one_digit = pynini.compose(NEMO_DIGIT, self.cardinal_numbers_nominative)
        self.single_digits_graph = one_digit + pynini.closure(insert_space + one_digit)

        # optional quantity word, separated by an existing space (weight -0.1) or an inserted one
        quantity_words = pynini.string_file(get_abs_path("data/numbers/quantity.tsv")).optimize()
        quantity_field = pynutil.insert("quantity: \"") + quantity_words + pynutil.insert("\"")
        quantity_gap = pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space
        optional_quantity = pynini.closure(quantity_gap + quantity_field, 0, 1)

        serial_graph = self.get_serial_graph()

        number_with_quantity = (
            self.optional_graph_negative
            + open_integer
            + self.cardinal_numbers_with_leading_zeros
            + close_quote
            + optional_quantity
        ).optimize()

        final_graph = pynutil.add_weight(number_with_quantity, -0.1)
        # digit-by-digit and serial readings are strongly penalised
        final_graph |= open_integer + pynutil.add_weight(self.single_digits_graph | serial_graph, 10) + close_quote
        # published before the "-х" extension below is added
        self.final_graph = final_graph

        # to cover cases "2-х" -> "двух" (this is not covered by ordinal endings)
        digits_with_ending = NEMO_DIGIT ** (1, ...) + pynini.cross('-х', '')
        final_graph |= pynini.compose(
            pynini.compose(digits_with_ending, final_graph),
            NEMO_SIGMA + pynini.accep("х\"") + NEMO_SIGMA,
        )
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Ejemplo n.º 13
0
    def __init__(self, deterministic: bool = True):
        """
        Builds the cardinal classifier, tagged as `[negative: "true"] integer: "..."`.

        Published attributes:
            graph_hundred_component_at_least_one_none_zero_digit: two or three
                digits, or one non-zero digit, through the number-name FST
            graph: digit strings (optionally with "," between groups of three)
                through the number-name FST; extended with more readings when
                not deterministic
            single_digits_graph: digit-by-digit reading
            range_graph: only set when `deterministic` is False

        Args:
            deterministic: when False, digit-by-digit readings, the
                get_hundreds_graph() readings and range readings
                ("-" -> " to " / " ", "x" -> " by ") are added
        """
        super().__init__(name="cardinal",
                         kind="classify",
                         deterministic=deterministic)

        graph = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3)
            | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
        # one to three digits, then any number of three-digit groups with an optional "," removed
        self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
            pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT +
            NEMO_DIGIT + NEMO_DIGIT)) @ graph

        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        # single digit: table lookup (weight 1.2) or "0" -> "oh" (weight 1.1)
        single_digits_graph = pynutil.add_weight(
            pynini.invert(graph_digit | graph_zero), 1.2) | pynutil.add_weight(
                pynini.cross("0", "oh"), 1.1)
        self.single_digits_graph = single_digits_graph + pynini.closure(
            pynutil.insert(" ") + single_digits_graph)

        if not deterministic:
            # one ",ddd" group read digit by digit
            comma_group = (pynutil.delete(",") + single_digits_graph +
                           pynutil.insert(" ") + single_digits_graph +
                           pynutil.insert(" ") + single_digits_graph)
            # Every group after the first is preceded by an inserted space;
            # without it consecutive groups were glued together
            # ("1,234,567" -> "... fourfive ...").
            single_digits_graph_with_commas = (
                pynini.closure(self.single_digits_graph + pynutil.insert(" "), 1, 3)
                + comma_group
                + pynini.closure(pynutil.insert(" ") + comma_group))
            self.graph |= self.single_digits_graph | get_hundreds_graph(
            ) | single_digits_graph_with_commas
            self.range_graph = (
                pynini.closure(pynutil.insert("from "), 0, 1) + self.graph +
                (pynini.cross("-", " to ") | pynini.cross("-", " ")) +
                self.graph)

            self.range_graph |= self.graph + (pynini.cross(
                "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
            self.range_graph = self.range_graph.optimize()

        # an optional leading "-" becomes the field negative: "true"
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)
        final_graph = self.graph | pynutil.add_weight(self.get_serial_graph(),
                                                      1.2)

        if not deterministic:
            final_graph |= self.range_graph

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + final_graph + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Ejemplo n.º 14
0
    def __init__(self):
        """
        Instantiates every semiotic-class tagger and combines their FSTs into
        one weighted union stored in `self.fst`.
        """
        super().__init__(name="tokenize_and_classify", kind="classify")

        # cardinal, ordinal and decimal taggers are shared with the taggers built on top of them
        cardinal_tagger = CardinalFst()
        ordinal_tagger = OrdinalFst(cardinal_tagger)
        decimal_tagger = DecimalFst(cardinal_tagger)

        cardinal = cardinal_tagger.fst
        ordinal = ordinal_tagger.fst
        decimal = decimal_tagger.fst
        measure = MeasureFst(cardinal_tagger, decimal_tagger).fst
        date = DateFst(ordinal_tagger).fst
        word = WordFst().fst
        time = TimeFst().fst
        money = MoneyFst(cardinal_tagger, decimal_tagger).fst
        whitelist = WhiteListFst().fst

        weighted_classes = (
            (whitelist, 1.01),
            (time, 1.1),
            (date, 1.09),
            (decimal, 1.1),
            (measure, 1.1),
            (cardinal, 1.1),
            (ordinal, 1.1),
            (money, 1.1),
            (word, 100),
        )

        graph = None
        for class_fst, weight in weighted_classes:
            branch = pynutil.add_weight(class_fst, weight)
            graph = branch if graph is None else graph | branch

        self.fst = graph.optimize()
Ejemplo n.º 15
0
    def get_address_graph(self, cardinal):
        """
        Finite state transducer for classifying a street address.
            The address is a house number read digit by digit, an optional
            direction, street words, and optional city, state and zip code, e.g.:
            2788 San Tomas Expy, Santa Clara, CA 95051 ->
                units: "address" cardinal
                { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
                 preserve_order: true

        Args:
            cardinal: grammar exposing `single_digits_graph`; also passed to the ordinal tagger

        Returns:
            the address transducer (without the surrounding token fields)
        """
        # ordinal inside a street name: tagged, then verbalized again
        to_spoken_ordinal = OrdinalVerbalizer().graph
        ordinal_numbers = OrdinalTagger(cardinal=cardinal).graph
        street_ordinal = pynini.compose(
            pynutil.insert("integer: \"") + ordinal_numbers + pynutil.insert("\""), to_spoken_ordinal)

        house_number = pynini.closure(NEMO_DIGIT, 1) @ cardinal.single_digits_graph

        compass = (pynini.cross("E", "East")
                   | pynini.cross("S", "South")
                   | pynini.cross("W", "West")
                   | pynini.cross("N", "North"))
        optional_direction = pynini.closure(
            pynutil.add_weight(pynini.accep(NEMO_SPACE) + compass, -1), 0, 1)

        # space, optional ordinal, free text, then an entry from address_words.tsv
        street_suffix = pynini.string_file(get_abs_path("data/address/address_words.tsv"))
        street = (pynini.accep(NEMO_SPACE)
                  + pynini.closure(street_ordinal, 0, 1)
                  + pynini.closure(NEMO_ALPHA | NEMO_SPACE, 1)
                  + street_suffix)

        optional_dot = pynini.closure(pynini.cross(".", ""), 0, 1)

        # ", <letters and spaces>" with the comma removed
        city_name = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
        optional_city = pynini.closure(
            pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + city_name, 0, 1)

        # ", <state>" through the inverted states.tsv table, comma removed
        state_name = pynini.invert(pynini.string_file(get_abs_path("data/address/states.tsv")))
        optional_state = pynini.closure(
            pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + state_name, 0, 1)

        # five digits read one by one, after a space and an optional comma
        five_digits = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
        zip_with_gap = pynini.closure(pynini.cross(",", ""), 0, 1) + pynini.accep(NEMO_SPACE) + five_digits
        optional_zip = pynini.closure(pynutil.add_weight(zip_with_gap, -100), 0, 1)

        return (house_number + optional_direction + street + optional_dot
                + optional_city + optional_state + optional_zip)
Ejemplo n.º 16
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = False):
        """
        Builds the decimal classifier, tagged as
        `[negative] integer_part: "..." fractional_part: "..." [quantity: "..."]`.
        Fractional parts of one to four digits are supported.

        Published attributes: `integer_part`, `optional_quantity`,
        `graph_fractional`, `final_graph`.

        Args:
            cardinal: grammar exposing `cardinal_numbers_default`,
                `cardinal_numbers_with_leading_zeros` and `optional_graph_negative`
            deterministic: forwarded to the base class
        """
        super().__init__(name="decimal",
                         kind="classify",
                         deterministic=deterministic)

        whole_number = cardinal.cardinal_numbers_default
        zero_padded_number = cardinal.cardinal_numbers_with_leading_zeros

        delimiter_labels = prepare_labels_for_insertion(
            get_abs_path("data/numbers/decimal_delimiter.tsv"))
        # the "," is removed, the delimiter label is added, optionally followed by " и" (weight 0.5)
        optional_conjunction = pynini.closure(pynutil.add_weight(pynutil.insert(" и"), 0.5), 0, 1)
        delimiter = (pynini.cross(",", "")
                     + delimiter_labels['@@decimal_delimiter@@']
                     + optional_conjunction).optimize()

        ending_labels = prepare_labels_for_insertion(
            get_abs_path("data/numbers/decimal_endings.tsv"))

        self.integer_part = whole_number + delimiter
        integer_field = pynutil.insert("integer_part: \"") + self.integer_part + pynutil.insert("\"")

        # n fractional digits are read as a number followed by the ending keyed by 10**n
        digits = NEMO_DIGIT
        graph_fractional = digits @ zero_padded_number + ending_labels['10']
        for ending_key in ('100', '1000', '10000'):
            digits = digits + NEMO_DIGIT
            graph_fractional |= digits @ zero_padded_number + ending_labels[ending_key]

        self.optional_quantity = pynini.string_file(
            get_abs_path("data/numbers/quantity.tsv")).optimize()

        self.graph_fractional = graph_fractional
        fractional_field = pynutil.insert("fractional_part: \"") + graph_fractional + pynutil.insert("\"")

        # optional quantity word, separated by an existing space (weight -0.1) or an inserted one
        quantity_gap = pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space
        quantity_field = pynini.closure(
            quantity_gap + pynutil.insert("quantity: \"") + self.optional_quantity + pynutil.insert("\""),
            0,
            1,
        )

        self.final_graph = (cardinal.optional_graph_negative + integer_field
                            + insert_space + fractional_field + quantity_field)

        self.final_graph = self.add_tokens(self.final_graph)
        self.fst = self.final_graph.optimize()
Ejemplo n.º 17
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """
        Builds the classifier for spoken money amounts, tagged with
        `integer_part`, `fractional_part` and `currency` fields.

        Args:
            cardinal: grammar exposing `graph_no_exception`
            decimal: grammar exposing `final_graph_wo_negative`
        """
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency

        number_graph = cardinal.graph_no_exception
        decimal_amount = decimal.final_graph_wo_negative

        currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
        singular_names = pynini.invert(currency_table)
        # the get_singulars() result and the singular names are both accepted
        currency_names = get_singulars(singular_names) | singular_names

        currency_field = pynutil.insert("currency: \"") + convert_space(currency_names) + pynutil.insert("\"")

        # exactly one or two digits; a single digit is prefixed with "0"
        two_digit_cents = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

        # e.g. "elf euro (und) vier cent", "vier cent"
        cents_with_word = (
            pynutil.insert("fractional_part: \"")
            + (pynutil.add_weight(number_graph, -0.7) @ two_digit_cents)
            + delete_space
            + pynutil.delete("cent")
            + pynutil.insert("\"")
        )
        optional_und = pynini.closure(pynutil.delete("und") + delete_space, 0, 1)
        optional_cents_with_word = pynini.closure(
            delete_space + optional_und + insert_space + cents_with_word, 0, 1)

        # e.g. "elf euro vierzig", only after integer
        optional_bare_cents = pynini.closure(
            delete_extra_space
            + pynutil.insert("fractional_part: \"")
            + pynutil.add_weight(number_graph @ two_digit_cents, -0.7)
            + pynutil.insert("\""),
            0,
            1,
        )

        integer_amount = (
            pynutil.insert("integer_part: \"")
            + number_graph
            + pynutil.insert("\"")
            + delete_extra_space
            + currency_field
            + (optional_cents_with_word | optional_bare_cents)
        )

        decimal_with_currency = decimal_amount + delete_extra_space + currency_field
        # cents on their own are emitted with currency "€" and integer_part "0"
        cents_only = pynutil.insert("currency: \"€\" integer_part: \"0\" ") + cents_with_word

        tagged = self.add_tokens(integer_amount | (decimal_with_currency | cents_only))
        self.fst = tagged.optimize()
Ejemplo n.º 18
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """
        Build the "money" classify transducer.

        An amount is either a cardinal or a decimal followed by a currency
        name, optionally followed by a fractional part. The fractional part
        is a cardinal of at most two digits (single digits are zero-padded)
        or the word "rưỡi", which is tagged as "5".

        Args:
            cardinal: GraphFst whose `graph_no_exception` supplies numbers
            decimal: GraphFst whose `final_graph_wo_negative` supplies decimals
        """
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency

        number = cardinal.graph_no_exception
        decimal_amount = decimal.final_graph_wo_negative
        half = pynini.cross("rưỡi", "5")

        currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
        currency_names = pynini.invert(currency_table)
        currency_field = (
            pynutil.insert("currency: \"")
            + convert_space(currency_names)
            + pynutil.insert("\"")
        )

        # Two digits pass through unchanged; a single digit gets a "0" prefix.
        two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

        # twelve dollars fifty, only after integer
        fraction_value = pynutil.add_weight(number @ two_digits, -0.7) | half
        maybe_fraction = pynini.closure(
            delete_extra_space
            + pynutil.insert("fractional_part: \"")
            + fraction_value
            + pynutil.insert("\""),
            0,
            1,
        )

        integer_amount = (
            pynutil.insert("integer_part: \"")
            + number
            + pynutil.insert("\"")
            + delete_extra_space
            + currency_field
            + maybe_fraction
        )
        fractional_amount = (
            decimal_amount + delete_extra_space + currency_field + maybe_fraction
        )

        tagged = self.add_tokens(integer_amount | fractional_amount)
        self.fst = tagged.optimize()
Ejemplo n.º 19
0
    def __init__(self, deterministic: bool = True):
        """
        Build the "ordinal" verbalize transducer.

        The quoted value of the `integer:` field is extracted and its ending
        is rewritten at end of string: by a mapping from the inverted
        digit/teen ordinal data files, by "ty" -> "tieth", or otherwise by
        appending "th" (the highest-weighted alternative).

        Args:
            deterministic: forwarded unchanged to the base class
        """
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        digit_endings = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")).invert()
        teen_endings = pynini.string_file(get_abs_path("data/ordinals/teen.tsv")).invert()

        # Strip `integer: "` ... `"` and keep what is between the quotes.
        integer_value = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )

        append_th = pynutil.insert("th", weight=0.01)
        ty_to_tieth = pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
        ending_rule = digit_endings | teen_endings | ty_to_tieth | append_th

        # Apply the ending rule only at the end of the string.
        suffix = pynini.cdrewrite(ending_rule, "", "[EOS]", NEMO_SIGMA).optimize()

        self.graph = pynini.compose(integer_value, suffix)
        self.suffix = suffix
        self.fst = self.delete_tokens(self.graph).optimize()
Ejemplo n.º 20
0
    def __init__(self, deterministic: bool = True):
        """
        Build the "telephone" verbalize transducer.

        Strips the `country_code`, `number_part` and `extension` field
        wrappers and keeps their quoted values. A present country code is
        followed by "," and a space; a present extension is preceded by a
        single space.

        Args:
            deterministic: forwarded unchanged to the base class
        """
        super().__init__(name="telephone", kind="verbalize", deterministic=deterministic)

        comma = pynutil.insert(",")  # between components
        closing_quote = pynutil.delete("\"")
        quoted_text = pynini.closure(NEMO_NOT_QUOTE, 1)

        country_code = (
            pynutil.delete("country_code: \"")
            + quoted_text
            + closing_quote
            + delete_space
            + comma
            + insert_space
        )
        maybe_country_code = pynini.closure(country_code, 0, 1)

        # Dropping one space right before the closing quote is favoured by
        # the negative weight.
        drop_last_space = pynini.closure(pynutil.add_weight(pynutil.delete(" "), -0.1), 0, 1)
        number = (
            pynutil.delete("number_part: \"")
            + quoted_text
            + drop_last_space
            + closing_quote
        )

        extension = (
            delete_space
            + insert_space
            + pynutil.delete("extension: \"")
            + quoted_text
            + closing_quote
        )
        maybe_extension = pynini.closure(extension, 0, 1)

        verbalized = maybe_country_code + number + maybe_extension
        self.fst = self.delete_tokens(verbalized).optimize()
Ejemplo n.º 21
0
    def get_serial_graph(self):
        """
        Finite state transducer for classifying serial.
            The serial is a combination of digits, letters and dashes, e.g.:
            c325-b -> tokens { cardinal { integer: "c three two five b" } }

        In deterministic mode numbers come from `self.single_digits_graph`;
        otherwise from `self.graph`, letters may also use the entries of
        data/letter_pronunciation.tsv, and a trailing "s" (kept, or turned
        into "es") is allowed. The result carries a weight of 10.
        """
        letters = NEMO_ALPHA

        if self.deterministic:
            numbers = self.single_digits_graph
        else:
            numbers = self.graph
            spelled_letters = pynini.string_map(
                load_labels(get_abs_path("data/letter_pronunciation.tsv"))
            )
            # NOTE(review): `letters` still aliases the shared NEMO_ALPHA here;
            # confirm that `|=` on an Fst builds a new graph instead of
            # updating the operand in place.
            letters |= spelled_letters

        # Components are joined by a space: either inserted, or replacing "-" or "/".
        separator = insert_space | pynini.cross("-", " ") | pynini.cross("/", " ")

        letters_then_number = pynini.closure(letters + separator, 1) + numbers
        numbers_then_letter = pynini.closure(numbers + separator, 1) + letters
        numbers_then_number = pynini.closure(numbers + separator, 1) + numbers
        trailing_parts = pynini.closure(separator + (letters | numbers))

        serial = (
            letters_then_number | numbers_then_letter | numbers_then_number
        ) + trailing_parts

        if not self.deterministic:
            serial += pynini.closure(pynini.accep("s") | pynini.cross("s", "es"), 0, 1)

        serial.optimize()
        return pynutil.add_weight(serial, 10)
Ejemplo n.º 22
0
    def __init__(self):
        """
        Build the "date" verbalize transducer.

        Accepts `day` + `month`, optionally followed by `year`, strips the
        field wrappers and keeps the quoted values. A day of "1" may be
        rendered as "1ᵉʳ" (this alternative carries weight -1). Trailing
        `preserve_order` / `field_order` fields are removed.
        """
        super().__init__(name="date", kind="verbalize")

        def strip_field(label, value):
            # `label "value"` -> value
            return (
                pynutil.delete(label)
                + delete_space
                + pynutil.delete("\"")
                + value
                + pynutil.delete("\"")
            )

        first_of_month = pynini.cross('1', '1ᵉʳ')
        # first of the month is ordinal
        day_value = pynini.closure(NEMO_NOT_QUOTE, 1) | pynutil.add_weight(first_of_month, -1)

        day = strip_field("day:", day_value)
        month = strip_field("month:", pynini.closure(NEMO_NOT_QUOTE, 1))
        year = strip_field("year:", pynini.closure(NEMO_NOT_QUOTE, 1))

        # day month (year)
        day_month = day + delete_extra_space + month
        day_month_year = day_month + delete_extra_space + year

        preserve_order = (
            pynutil.delete("preserve_order:")
            + delete_space
            + pynutil.delete("true")
            + delete_space
        )
        field_order = (
            pynutil.delete("field_order:")
            + delete_space
            + pynutil.delete("\"")
            + NEMO_NOT_QUOTE
            + pynutil.delete("\"")
            + delete_space
        )
        trailing_fields = pynini.closure(preserve_order | field_order)

        verbalized = (day_month | day_month_year) + delete_space + trailing_fields
        self.fst = self.delete_tokens(verbalized).optimize()
Ejemplo n.º 23
0
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        """
        Build the "whitelist" classify transducer from label files.

        Args:
            input_case: when equal to "lower_cased", the first column of every
                loaded row is lower-cased
            deterministic: if True, a provided `input_file` replaces the
                default whitelist; if False it is added to it, together with a
                lower-cased copy of the default whitelist (unless input_case
                is already "lower_cased") and data/measure/measurements.tsv
            input_file: optional path to an additional whitelist file
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _load_graph(case, path):
            # Rows of the label file become the pairs of a string map.
            rows = load_labels(path)
            if case == "lower_cased":
                rows = [[row[0].lower()] + row[1:] for row in rows]
            return pynini.string_map(rows)

        graph = _load_graph(input_case, get_abs_path("data/whitelist.tsv"))

        if not deterministic and input_case != "lower_cased":
            lower_cased_default = _load_graph("lower_cased", get_abs_path("data/whitelist.tsv"))
            graph |= pynutil.add_weight(lower_cased_default, weight=0.0001)

        if input_file:
            provided = _load_graph(input_case, input_file)
            if deterministic:
                graph = provided
            else:
                graph |= provided

        if not deterministic:
            units = _load_graph(input_case, get_abs_path("data/measure/measurements.tsv"))
            graph |= units

        self.graph = graph
        self.final_graph = convert_space(self.graph).optimize()
        self.fst = (
            pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")
        ).optimize()
Ejemplo n.º 24
0
    def __init__(self, tn_cardinal_tagger: GraphFst, deterministic: bool = True):
        """
        Build the "cardinal" classify transducer by inverting the graphs of a
        text-normalization cardinal tagger.

        Each TN graph is first composed with a filter that may delete spaces
        from its output, then inverted and optimized.

        Args:
            tn_cardinal_tagger: source of `graph`, `digit`, `two_digit_non_zero`
                and `graph_hundred_component_at_least_one_none_zero_digit`
            deterministic: forwarded unchanged to the base class
        """
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        strip_spaces = pynini.closure(NEMO_SIGMA | pynutil.delete(" "))

        def _invert_tn(tn_graph):
            # TN graph -> space-filtered, inverted, optimized graph
            return (tn_graph @ strip_spaces).invert().optimize()

        inverted = _invert_tn(tn_cardinal_tagger.graph)
        self.graph_hundred_component_at_least_one_none_zero_digit = _invert_tn(
            tn_cardinal_tagger.graph_hundred_component_at_least_one_none_zero_digit
        )
        self.graph_ties = _invert_tn(tn_cardinal_tagger.two_digit_non_zero)

        # this is to make sure if there is an ambiguity with decimal, decimal is chosen, e.g. 1000000 vs. 1 million
        weighted = pynutil.add_weight(inverted, weight=0.001)
        self.graph_no_exception = weighted

        self.digit = pynini.arcmap(tn_cardinal_tagger.digit, map_type="rmweight").invert().optimize()

        # self.graph is the weighted graph minus every input accepted by self.digit.
        digit_inputs = pynini.project(self.digit, 'input')
        allowed_inputs = pynini.project(weighted, "input") - digit_inputs.arcsort()
        self.graph = allowed_inputs @ weighted

        self.optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("minus ", "\"-\" "), 0, 1
        )

        tagged = (
            self.optional_minus_graph
            + pynutil.insert("integer: \"")
            + self.graph
            + pynutil.insert("\"")
        )
        self.fst = self.add_tokens(tagged).optimize()
Ejemplo n.º 25
0
    def __init__(self, cardinal: GraphFst):
        """
        Build the "date" classify transducer.

        Three shapes are accepted:
            - (ngày|mùng)? day "tháng" month ("năm" year)?
            - "năm" year
            - "tháng" month ("ngày" day)? ("năm" year)?
        The first two get " preserve_order: true" appended.

        Args:
            cardinal: GraphFst whose `graph_no_exception` is used for the day
        """
        super().__init__(name="date", kind="classify")

        number = cardinal.graph_no_exception
        year_weight = 0.001
        weighted_year = pynutil.add_weight(_get_year_graph(), year_weight)

        month_field = pynutil.insert("month: \"") + _get_month_graph() + pynutil.insert("\"")
        day_field = pynutil.insert("day: \"") + number + pynutil.insert("\"")
        day_prefix = pynini.closure(pynutil.delete(pynini.union("ngày", "mùng") + delete_space), 0, 1)

        # Year that follows a day/month; the year weight is cancelled out here.
        following_year = (
            delete_extra_space
            + pynutil.delete("năm")
            + delete_extra_space
            + pynutil.insert("year: \"")
            + pynutil.add_weight(weighted_year, -year_weight)
            + pynutil.insert("\"")
        )
        maybe_following_year = pynini.closure(following_year, 0, 1)

        day_after_month = (
            delete_space
            + pynutil.delete("ngày")
            + delete_extra_space
            + day_field
            + maybe_following_year
        )
        month_day_year = (
            pynutil.delete("tháng")
            + delete_space
            + month_field
            + (day_after_month | maybe_following_year)
        )

        day_month_year = (
            day_prefix
            + day_field
            + delete_space
            + pynutil.delete("tháng")
            + delete_extra_space
            + month_field
            + maybe_following_year
        )

        # Year on its own keeps the year weight.
        year_only = (
            pynutil.delete("năm")
            + delete_extra_space
            + pynutil.insert("year: \"")
            + weighted_year
            + pynutil.insert("\"")
        )

        order_preserving = (day_month_year | year_only) + pynutil.insert(" preserve_order: true")
        tagged = self.add_tokens(pynini.union(order_preserving, month_day_year))
        self.fst = tagged.optimize()
Ejemplo n.º 26
0
    def __init__(self):
        """
        Build the "time" classify transducer (spoken words -> tagged fields).

        Two shapes are accepted, each optionally followed by a suffix and a
        time zone:
            - hour then minute: five o' clock, two o eight, two thirty five
            - minute "past" hour: 10 past four, quarter past four, half past four
        """
        super().__init__(name="time", kind="classify")
        # hours, minutes, seconds, suffix, zone, style, speak_period

        suffixes = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
        time_zones = pynini.invert(pynini.string_file(get_abs_path("data/time_zone.tsv")))

        # Weighted copy of the cardinal graph used for hours and minutes.
        number = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

        def spoken_range(start, stop):
            # Words for start..stop-1, restricted to what `number` accepts.
            words = [num_to_word(value) for value in range(start, stop)]
            return pynini.union(*words) @ number

        hour = spoken_range(0, 24)
        minute_single = spoken_range(1, 10)
        minute_double = spoken_range(10, 60)
        minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15")
        oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

        hour_field = pynutil.insert("hours: \"") + hour + pynutil.insert("\"")

        # Minutes after the hour: nothing or "o'clock" -> "00",
        # "o" + single digit, or a two-digit minute.
        minute_value = (
            pynutil.insert("00")
            | oclock + pynutil.insert("00")
            | pynutil.delete("o") + delete_space + minute_single
            | minute_double
        )
        minute_field = pynutil.insert("minutes: \"") + minute_value + pynutil.insert("\"")

        suffix_field = (
            pynutil.insert("suffix: \"") + convert_space(suffixes) + pynutil.insert("\"")
        )
        maybe_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
        maybe_time_zone = pynini.closure(
            delete_space
            + insert_space
            + pynutil.insert("zone: \"")
            + convert_space(time_zones)
            + pynutil.insert("\""),
            0,
            1,
        )

        # five o' clock
        # two o eight, two thirty five (am/pm)
        # two pm/am
        hour_minute = hour_field + delete_extra_space + minute_field

        # 10 past four, quarter past four, half past four
        minute_past_hour = (
            pynutil.insert("minutes: \"")
            + pynini.union(minute_single, minute_double, minute_verbose)
            + pynutil.insert("\"")
            + delete_space
            + pynutil.delete("past")
            + delete_extra_space
            + hour_field
        )

        time_graph = (
            (hour_minute | minute_past_hour) + maybe_suffix + maybe_time_zone
        ).optimize()

        tagged = self.add_tokens(time_graph)
        self.fst = tagged.optimize()
Ejemplo n.º 27
0
Archivo: date.py Proyecto: manneh/NeMo
    def __init__(self, ordinal: GraphFst, deterministic: bool = True):
        """
        Build the "date" verbalize transducer.

        Accepted shapes, each optionally followed by `preserve_order` /
        `field_order` fields that are removed:
            - month (day)? (year)?
            - year
            - "the " day "of " month (year)?   (weight 0.001)
        The day value is passed through `ordinal.suffix`; when not
        deterministic the unchanged day value is accepted as well.

        Args:
            ordinal: GraphFst providing the `suffix` rewrite applied to the day
            deterministic: if False, also allow the day without the suffix rewrite
        """
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)

        open_quote = pynutil.delete("\"")
        close_quote = pynutil.delete("\"")
        text = pynini.closure(NEMO_NOT_QUOTE, 1)

        plain_day = pynutil.delete("day:") + delete_space + open_quote + text + close_quote
        day = plain_day @ ordinal.suffix
        if not deterministic:
            day |= plain_day

        month = pynutil.delete("month:") + delete_space + open_quote + text + close_quote

        # Unlike day/month, spaces directly before the closing quote are dropped.
        year = (
            pynutil.delete("year:")
            + delete_space
            + open_quote
            + text
            + delete_space
            + close_quote
        )

        maybe_day = pynini.closure(delete_extra_space + day, 0, 1)

        # month (day) year
        month_day_year = month + maybe_day + pynini.closure(delete_extra_space + year, 0, 1)

        # day month year
        day_month_year = (
            pynutil.insert("the ")
            + day
            + delete_extra_space
            + pynutil.insert("of ")
            + month
            + pynini.closure(delete_extra_space + year, 0, 1)
        )

        preserve_order = (
            pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
        )
        field_order = (
            pynutil.delete("field_order:")
            + delete_space
            + pynutil.delete("\"")
            + NEMO_NOT_QUOTE
            + pynutil.delete("\"")
            + delete_space
        )
        trailing_fields = pynini.closure(preserve_order | field_order)

        date_forms = month_day_year | year | pynutil.add_weight(day_month_year, 0.001)
        verbalized = date_forms + delete_space + trailing_fields

        self.fst = self.delete_tokens(verbalized).optimize()
Ejemplo n.º 28
0
    def get_serial_graph(self):
        """
        Finite state transducer for classifying serial.
            A serial mixes letters and digits. Two families are built here:
            letters and digits written together, e.g.
            c325b -> tokens { cardinal { integer: "c three two five b" } }
            and components joined by "-" or "/".

        Numbers come from `self.single_digits_graph`, extended with
        `self.graph` when not deterministic. The result carries a weight of 2.
        """
        numbers = self.single_digits_graph

        if not self.deterministic:
            # NOTE(review): `numbers` still aliases self.single_digits_graph
            # here; confirm that `|=` on an Fst builds a new graph instead of
            # updating the operand in place.
            numbers |= self.graph

        # add space between letter and digit
        space_after_letter = pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA, NEMO_DIGIT, NEMO_SIGMA)
        space_after_digit = pynini.cdrewrite(pynutil.insert(" "), NEMO_DIGIT, NEMO_ALPHA, NEMO_SIGMA)
        spaced = pynini.compose(space_after_letter, space_after_digit)

        # make sure at least one digit and letter is present
        non_spaces = pynini.closure(NEMO_NOT_SPACE)
        letter_before_digit = non_spaces + NEMO_ALPHA + non_spaces + NEMO_DIGIT + non_spaces
        digit_before_letter = non_spaces + NEMO_DIGIT + non_spaces + NEMO_ALPHA + non_spaces
        spaced = pynini.compose(letter_before_digit | digit_before_letter, spaced)

        space = pynini.accep(" ")

        # Spaced form that starts with one or more letter groups.
        starts_with_letters = (
            pynini.closure(pynini.closure(NEMO_ALPHA, 1) + space, 1)
            + numbers
            + pynini.closure(
                space + pynini.closure(NEMO_ALPHA) + pynini.closure(space + numbers, 0, 1)
            )
        )
        serial = pynini.compose(spaced, starts_with_letters)

        # Spaced form that starts with a number.
        starts_with_number = (
            numbers
            + space
            + pynini.closure(NEMO_ALPHA, 1)
            + pynini.closure(
                space + numbers + pynini.closure(space + pynini.closure(NEMO_ALPHA), 0, 1)
            )
        )
        serial |= pynini.compose(spaced, starts_with_number)

        # serial graph with delimiter
        delimiter = pynini.accep("-") | pynini.accep("/")
        letter_run = pynini.closure(NEMO_ALPHA, 1)
        letters_then_number = letter_run + delimiter + numbers
        numbers_then_letters = pynini.closure(numbers + delimiter, 1) + letter_run
        continuation = pynini.closure(delimiter + (letter_run | numbers))
        continuation |= pynini.closure(
            delimiter + numbers + pynutil.insert(" ") + letter_run
        )

        serial |= letters_then_number + continuation
        serial |= numbers_then_letters + continuation
        # numbers only with 2+ delimiters
        serial |= (
            numbers
            + delimiter
            + numbers
            + delimiter
            + numbers
            + pynini.closure(delimiter + numbers)
        )
        return pynutil.add_weight(serial, 2)
Ejemplo n.º 29
0
    def __init__(self, cardinal: GraphFst):
        """
        Build the "time" classify transducer (written digits -> tagged fields).

        Accepted shapes:
            - H:MM with optional suffix and time zone: 2:30 pm, 02:30, 2:00
            - H.MM with a mandatory suffix: 2.xx pm/am
            - H with a mandatory suffix: 2 pm est
        A minute value of "00" is dropped; a minute with a leading "0" is
        tagged as "o" followed by the single-digit minute.

        Args:
            cardinal: GraphFst whose `graph` converts the hour/minute numbers
        """
        super().__init__(name="time", kind="classify")
        suffixes = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
        time_zones = pynini.string_file(get_abs_path("data/time_zone.tsv"))

        # Weighted copy of the cardinal graph used for hours and minutes.
        number = pynutil.add_weight(cardinal.graph, weight=-0.7)

        hour_labels = [str(value) for value in range(0, 24)]
        minute_single_labels = [str(value) for value in range(1, 10)]
        minute_double_labels = [str(value) for value in range(10, 60)]

        # Two digits pass through; a single digit may lose one leading "0".
        strip_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT
        )

        hour = strip_leading_zero @ pynini.union(*hour_labels) @ number
        minute_single = pynini.union(*minute_single_labels) @ number
        minute_double = pynini.union(*minute_double_labels) @ number

        hour_field = pynutil.insert("hours: \"") + hour + pynutil.insert("\"")
        minute_value = pynini.cross("0", "o") + insert_space + minute_single | minute_double
        minute_field = pynutil.insert("minutes: \"") + minute_value + pynutil.insert("\"")

        suffix_field = (
            pynutil.insert("suffix: \"") + convert_space(suffixes) + pynutil.insert("\"")
        )
        maybe_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
        maybe_time_zone = pynini.closure(
            delete_space
            + insert_space
            + pynutil.insert("zone: \"")
            + convert_space(time_zones)
            + pynutil.insert("\""),
            0,
            1,
        )

        # 2:30 pm, 02:30, 2:00
        colon_time = (
            hour_field
            + pynutil.delete(":")
            + (pynutil.delete("00") | insert_space + minute_field)
            + maybe_suffix
            + maybe_time_zone
        )

        # 2.xx pm/am
        dot_time = (
            hour_field
            + pynutil.delete(".")
            + (pynutil.delete("00") | insert_space + minute_field)
            + delete_space
            + insert_space
            + suffix_field
            + maybe_time_zone
        )

        # 2 pm est
        hour_only = hour_field + delete_space + insert_space + suffix_field + maybe_time_zone

        time_graph = (colon_time | hour_only | dot_time).optimize()

        tagged = self.add_tokens(time_graph)
        self.fst = tagged.optimize()
Ejemplo n.º 30
0
    def __init__(self, deterministic: bool = True):
        """
        Build the "telephone" classify transducer.

        Two inputs are tagged:
            - phone numbers: optional country code (up to three digits, with an
              optional leading "+"), a three-digit area part (with "800" also
              read as "eight hundred"), seven further digits/letters, and an
              optional extension of up to four digits
            - IP addresses: four groups of one to three digits separated by
              ".", with each "." read as " dot "
        Digits come from the inverted data/numbers/digit.tsv; "0" may also be
        read as "o".

        Args:
            deterministic: forwarded unchanged to the base class
        """
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        comma_space = pynutil.insert(", ")  # between components
        digit_table = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        digit = pynini.invert(digit_table).optimize() | pynini.cross("0", "o")

        # Country code: optional "+", then one to three digits.
        country_code = (
            pynutil.insert("country_code: \"")
            + pynini.closure(pynutil.delete("+"), 0, 1)
            + pynini.closure(digit + insert_space, 0, 2)
            + digit
            + pynutil.insert("\"")
        )
        maybe_country_code = pynini.closure(
            country_code
            + pynini.closure(pynutil.delete("-"), 0, 1)
            + delete_space
            + insert_space,
            0,
            1,
        )

        # Area part: three digits; "800" as a whole is preferred via the weight.
        eight_hundred = pynutil.add_weight(pynini.cross("800", "eight hundred"), -1.1)
        three_digits = pynini.closure(digit + insert_space, 2, 2) + digit
        area_digits = three_digits | eight_hundred

        # Written either as `NNN-` or as `(NNN) ` / `(NNN)-`.
        dashed_area = area_digits + pynutil.delete("-")
        bracketed_area = (
            pynutil.delete("(")
            + area_digits
            + (pynutil.delete(") ") | pynutil.delete(")-"))
        )
        area = (dashed_area | bracketed_area) + comma_space

        # Exactly seven digits/letters, each optionally followed by "-" or " ".
        maybe_separator = pynini.closure(pynini.union("-", " "), 0, 1)
        seven_symbols = (
            (NEMO_DIGIT + maybe_separator) | (NEMO_ALPHA + maybe_separator)
        ) ** 7
        symbol_words = pynini.closure(
            (NEMO_DIGIT @ digit) + (insert_space | pynini.cross("-", ', '))
            | NEMO_ALPHA
            | (NEMO_ALPHA + pynini.cross("-", ' '))
        )
        local_number = pynini.compose(seven_symbols, symbol_words)

        number_field = (
            pynutil.insert("number_part: \"")
            + (area + local_number)
            + pynutil.insert("\"")
        )

        # Extension: one to four digits.
        extension = (
            pynutil.insert("extension: \"")
            + pynini.closure(digit + insert_space, 0, 3)
            + digit
            + pynutil.insert("\"")
        )
        maybe_extension = pynini.closure(insert_space + extension, 0, 1)

        graph = maybe_country_code + number_field + maybe_extension

        # ip
        ip_group = pynini.compose(
            NEMO_DIGIT ** (1, 3),
            digit + pynini.closure(pynutil.insert(" ") + digit),
        ).optimize()
        ip_address = ip_group + (pynini.cross(".", " dot ") + ip_group) ** 3
        graph |= (
            pynutil.insert("number_part: \"")
            + ip_address.optimize()
            + pynutil.insert("\"")
        )

        tagged = self.add_tokens(graph)
        self.fst = tagged.optimize()