Ejemplo n.º 1
0
    def __init__(self):
        """Build the verbalizer FST for "electronic" tokens.

        Two token layouts are accepted:

        * ``username: "..."`` followed by ``domain: "..."``; the two quoted
          values are emitted joined by an inserted "@".
        * ``protocol: "..."``; the quoted value is emitted as is.

        The result, wrapped by ``self.delete_tokens`` and optimized, is stored
        in ``self.fst``.
        """
        super().__init__(name="electronic", kind="verbalize")
        # Every field has the shape `key: "value"`: the key, the optional
        # whitespace after it and both quotes are deleted, the value is kept.
        # The username field follows exactly the same pattern as the domain
        # and protocol fields below.
        user_name = (pynutil.delete("username:") + delete_space +
                     pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) +
                     pynutil.delete("\""))
        domain = (pynutil.delete("domain:") + delete_space +
                  pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) +
                  pynutil.delete("\""))

        protocol = (pynutil.delete("protocol:") + delete_space +
                    pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) +
                    pynutil.delete("\""))

        # username + "@" + domain, or a protocol value on its own
        graph = user_name + delete_space + pynutil.insert("@") + domain
        graph |= protocol
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Ejemplo n.º 2
0
 def __init__(self, decimal: GraphFst, cardinal: GraphFst):
     """Build the verbalizer FST for "measure" tokens.

     A measure token holds either a ``decimal { ... }`` or a
     ``cardinal { ... }`` group followed by a ``units: "..."`` field. The
     number is emitted first (with "-" when ``negative: "true"`` is present),
     then an inserted space, then the unit text.

     Args:
         decimal: provides ``numbers``, the graph used inside ``decimal { }``
         cardinal: provides ``numbers``, the graph used inside ``cardinal { }``
     """
     super().__init__(name="measure", kind="verbalize")

     # negative: "true" -> "-", at most once
     sign = pynini.closure(pynini.cross("negative: \"true\"", "-"), 0, 1)

     def unwrap_group(opening, numbers):
         # Drop the `<name> {` / `}` wrapper and keep the signed number.
         return (pynutil.delete(opening) + delete_space + sign +
                 delete_space + numbers + delete_space +
                 pynutil.delete("}"))

     # units: "<text>" -> <text>; the unit text may not contain spaces
     unit_text = pynini.closure(NEMO_CHAR - " ", 1)
     unit_field = (pynutil.delete("units:") + delete_space +
                   pynutil.delete("\"") + unit_text +
                   pynutil.delete("\"") + delete_space)

     decimal_value = unwrap_group("decimal {", decimal.numbers)
     cardinal_value = unwrap_group("cardinal {", cardinal.numbers)

     value = cardinal_value | decimal_value
     verbalized = value + delete_space + pynutil.insert(" ") + unit_field
     self.fst = self.delete_tokens(verbalized).optimize()
Ejemplo n.º 3
0
def _get_ties_graph():
    """
    Build the transducer for 20-99, e.g.
    hai ba -> 23
    """
    # "mươi" produces no output and may be left out entirely
    skip_ten_word = pynini.closure(
        delete_space + pynini.cross("mươi", ""), 0, 1)

    # Unit words: the regular digits plus the alternative forms for 1, 4 and 5
    unit_word = (
        graph_digit
        | pynini.cross("mốt", "1")
        | pynini.cross("tư", "4")
        | pynini.cross("lăm", "5")
    )

    # When no unit word follows, a "0" is inserted instead
    units = (delete_space + unit_word) | pynutil.insert("0")

    return ties_graph + skip_ten_word + units
Ejemplo n.º 4
0
    def __init__(self):
        """Build the verbalizer FST for "time" tokens.

        Accepts either a lone ``hours: "..."`` field, whose value is emitted
        as is, or ``hours: "..."`` followed by ``minutes: "..."``, emitted as
        ``<hours>:<minutes>``. The result is wrapped by ``self.delete_tokens``
        and stored optimized in ``self.fst``.
        """
        super().__init__(name="time", kind="verbalize")

        def strip_field(label):
            # `<label>"<value>"` -> `<value>`
            return (pynutil.delete(label) + pynutil.delete("\"") +
                    pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))

        hours_field = strip_field("hours: ")
        minutes_field = strip_field("minutes: ")

        # Only an hour is present
        hours_only = (pynutil.delete("hours: \"") +
                      pynini.closure(NEMO_NOT_QUOTE, 1) +
                      pynutil.delete("\""))

        # Hour and minutes, joined by an inserted ":"
        hours_and_minutes = (hours_field + delete_space +
                             pynutil.insert(":") + minutes_field +
                             delete_space)

        self.fst = self.delete_tokens(hours_only
                                      | hours_and_minutes).optimize()
Ejemplo n.º 5
0
def get_quantity(deci):
    """
    Return an FST that tags a number followed by a quantity word.

    Two shapes are accepted:
      * a cardinal (leading zeros removed, first kept digit non-zero)
        followed by one of "million" ... "sextillion"; it is tagged as
        integer_part: "..." quantity: "..."
      * ``deci`` followed by one of the same words or "thousand"; only the
        quantity: "..." tag is added after it

    Args:
        deci: decimal FST placed in front of the quantity word
    """
    # Strip leading zeros and require at least one non-zero leading digit
    strip_leading_zeros = (pynutil.delete(pynini.closure("0")) +
                           pynini.difference(NEMO_DIGIT, "0") +
                           pynini.closure(NEMO_DIGIT))
    integer = (cardinal.graph_hundred_component_at_least_one_none_zero_digit
               @ strip_leading_zeros)

    magnitude = pynini.union("million", "billion", "trillion", "quadrillion",
                             "quintillion", "sextillion")

    integer_quantity = (pynutil.insert("integer_part: \"") + integer +
                        pynutil.insert("\"") + delete_extra_space +
                        pynutil.insert("quantity: \"") + magnitude +
                        pynutil.insert("\""))

    # "thousand" is accepted only after a decimal
    decimal_quantity = (deci + delete_extra_space +
                        pynutil.insert("quantity: \"") +
                        (magnitude | "thousand") + pynutil.insert("\""))

    return integer_quantity | decimal_quantity
Ejemplo n.º 6
0
    def __init__(self, deterministic: bool = True):
        """Build the verbalizer FST for "electronic" tokens (spelled out).

        Accepted layouts:

        * an optional ``protocol: "..."`` field and a space, then a
          ``domain: "..."`` field
        * a ``username: "..."`` field, a space, then a ``domain: "..."``
          field, with "at " inserted in front of the domain

        Characters are separated by inserted spaces and rewritten using the
        symbol and digit tables loaded from the data files. The unwrapped
        graph is kept in ``self.graph``; ``self.fst`` holds the optimized
        token-level FST.

        Args:
            deterministic: forwarded to the base class constructor
        """
        super().__init__(name="electronic",
                         kind="verbalize",
                         deterministic=deterministic)

        # Lookup tables. The digit and zero tables are inverted after loading.
        digit_table = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        non_zero_digit_words = (pynini.invert(digit_table).optimize()
                                | pynini.cross("1", "eins"))
        zero_table = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        zero_words = pynini.invert(zero_table).optimize()
        digit_words = non_zero_digit_words | zero_words

        symbol_words = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).optimize()
        known_servers = pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv"))
        known_domains = pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))

        # One or more non-space, non-quote characters with a space inserted
        # between neighbours.
        non_space_char = NEMO_NOT_QUOTE - pynini.accep(" ")
        spaced_chars = (pynini.closure(non_space_char + insert_space) +
                        non_space_char)

        # Rewrites every symbol or digit anywhere in the string
        spell_out = pynini.cdrewrite(symbol_words | digit_words, "", "",
                                     NEMO_SIGMA)

        user_name = (pynutil.delete("username: \"") + spaced_chars +
                     pynutil.delete("\"")) @ spell_out

        # A domain piece is a known domain/server name or, at a small extra
        # cost, any single non-quote character.
        domain_piece = (pynutil.add_weight(NEMO_NOT_QUOTE, weight=0.0001)
                        | known_domains
                        | known_servers)
        domain_body = (domain_piece +
                       pynini.closure(insert_space + domain_piece)) @ spell_out
        domain = (pynutil.delete("domain: \"") + domain_body +
                  pynutil.delete("\""))

        # Only symbols (not digits) are rewritten inside the protocol
        protocol_body = spaced_chars @ pynini.cdrewrite(
            symbol_words, "", "", NEMO_SIGMA)
        protocol = (pynutil.delete("protocol: \"") + protocol_body +
                    pynutil.delete("\""))

        url = pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain
        email = (user_name + pynini.accep(" ") + pynutil.insert("at ") +
                 domain)
        self.graph = url | email

        self.fst = self.delete_tokens(self.graph +
                                      delete_preserve_order).optimize()
Ejemplo n.º 7
0
def prepare_labels_for_insertion(file_path: str):
    """
    Load labels from a file and group them into insertion graphs.

    Args:
        file_path: path to a file readable by ``load_labels``; every row is
            unpacked as a (key, label) pair

    Returns a mapping from each key to an fst made of ``insert_space``
    followed by the union of insertions of all labels listed for that key.
    """
    grouped = defaultdict(list)
    for key, label in load_labels(file_path):
        grouped[key].append(label)

    # Replace each list of labels with the graph that inserts one of them
    for key, endings in grouped.items():
        grouped[key] = insert_space + pynini.union(
            *map(pynutil.insert, endings))
    return grouped
Ejemplo n.º 8
0
    def __init__(self, deterministic: bool = True):
        """Build the classifier FST for telephone numbers.

        The graph is: an optional country code, a number part (area code
        followed by a seven-character local number) and an optional
        extension. The pieces are emitted as ``country_code: "..."``,
        ``number_part: "..."`` and ``extension: "..."`` fields, wrapped by
        ``self.add_tokens`` and stored optimized in ``self.fst``.

        Args:
            deterministic: forwarded to the base class constructor
        """
        super().__init__(name="telephone",
                         kind="classify",
                         deterministic=deterministic)

        add_separator = pynutil.insert(", ")  # between components
        # Inverted digit table, plus "0" read as "o"
        digit = pynini.invert(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ).optimize() | pynini.cross("0", "o")

        # Optional "+" (deleted) followed by one to three digits, with a
        # space inserted between consecutive digits
        country_code = (pynutil.insert("country_code: \"") +
                        pynini.closure(pynutil.delete("+"), 0, 1) +
                        pynini.closure(digit + insert_space, 0, 2) + digit +
                        pynutil.insert("\""))
        # The country code may be followed by an optional "-" (deleted)
        optional_country_code = pynini.closure(
            country_code + pynini.closure(pynutil.delete("-"), 0, 1) +
            delete_space + insert_space, 0, 1)

        # "800" gets a dedicated reading, favoured by its negative weight
        area_part_common = pynutil.add_weight(
            pynini.cross("800", "eight hundred"), -1.1)
        # Otherwise exactly three digits, read one by one
        area_part_default = pynini.closure(digit + insert_space, 2, 2) + digit
        area_part = area_part_default | area_part_common

        # Area code written as `NNN-` or `(NNN) ` / `(NNN)-`; the delimiters
        # are deleted and ", " is inserted after the area code in both cases
        area_part = (
            (area_part + pynutil.delete("-"))
            | (pynutil.delete("(") + area_part +
               (pynutil.delete(") ") | pynutil.delete(")-")))) + add_separator

        # Acceptor for exactly seven digits/letters, each optionally followed
        # by a single "-" or " "
        del_separator = pynini.closure(pynini.union("-", " "), 0, 1)
        number_length = ((NEMO_DIGIT + del_separator) |
                         (NEMO_ALPHA + del_separator))**7
        # Digits are verbalized and followed by an inserted space (or "-"
        # rewritten to ", "); letters are kept, with a following "-" rewritten
        # to " ". Note that `+` binds tighter than `|` in this expression.
        number_words = pynini.closure((NEMO_DIGIT @ digit) +
                                      (insert_space | pynini.cross("-", ', '))
                                      | NEMO_ALPHA
                                      | (NEMO_ALPHA + pynini.cross("-", ' ')))
        # Restrict the verbalization above to inputs of the accepted length
        number_words = pynini.compose(number_length, number_words)
        number_part = area_part + number_words
        number_part = pynutil.insert(
            "number_part: \"") + number_part + pynutil.insert("\"")
        # One to four digits, with a space inserted between consecutive digits
        extension = (pynutil.insert("extension: \"") +
                     pynini.closure(digit + insert_space, 0, 3) + digit +
                     pynutil.insert("\""))
        optional_extension = pynini.closure(insert_space + extension, 0, 1)

        graph = optional_country_code + number_part + optional_extension
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
Ejemplo n.º 9
0
def _get_year_graph():
    """
    Build the transducer for years, e.g. hai không hai mươi -> 2020
    """
    ties = _get_ties_graph()

    # A zero word ("linh"/"lẻ" or graph_zero) followed by a single digit
    spoken_zero = pynini.cross((pynini.union("linh", "lẻ")), "0")
    zero_then_digit = pynini.union(
        spoken_zero + delete_space + (graph_digit | pynini.cross("tư", "4")),
        graph_zero + delete_space + graph_digit,
    )
    zero_then_digit.optimize()

    # Trailing two-digit part shared by the alternatives below
    last_two = graph_teen | ties | zero_then_digit

    # <digit> trăm <two digits>
    hundreds = (graph_digit + delete_space + pynutil.delete("trăm") +
                delete_space + last_two)

    # <digit> nghìn/ngàn [<digit or zero> trăm] <two digits>; a "0" is
    # inserted when the hundreds are not spoken
    spoken_hundred = (
        (graph_digit | graph_zero) + delete_space +
        pynutil.delete("trăm")) | pynutil.insert("0")
    thousands = (graph_digit + delete_space +
                 pynutil.delete(pynini.union("nghìn", "ngàn")) +
                 delete_space + spoken_hundred + delete_space + last_two)

    # 20 19, 40 12, 2012, 2 0 0 5, 2 0 17, 938 - assuming no limit on the year
    digit_by_digit = (graph_digit + delete_space +
                      (graph_digit | graph_zero) + delete_space + last_two)
    # <digit> <two digits>, with a "0" inserted after the first digit
    short_form = (graph_digit + pynutil.insert("0") + delete_space +
                  (ties | zero_then_digit | graph_teen))

    year_graph = digit_by_digit | thousands | hundreds | short_form
    year_graph.optimize()
    return year_graph
Ejemplo n.º 10
0
    def __init__(
        self,
        input_case: str,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        deterministic: bool = True,
        whitelist: str = None,
    ):
        """Build (or restore from cache) the tokenize-and-classify FST.

        Args:
            input_case: forwarded to ``WhiteListFst`` and used in the cache
                file name
            cache_dir: directory for the ``.far`` cache file; caching is
                disabled when it is ``None`` or the string ``"None"``
            overwrite_cache: when true, rebuild the grammar even if a cache
                file exists
            deterministic: forwarded to the base class and the sub-grammars
            whitelist: path to a whitelist file; in this method only its base
                name is used, as part of the cache file name
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        caching_enabled = cache_dir is not None and cache_dir != "None"
        if caching_enabled:
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_name = f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            far_file = os.path.join(cache_dir, far_name)

        # Fast path: reuse a previously exported grammar
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
            return

        logging.info("Creating ClassifyFst grammars.")

        word_graph = WordFst(deterministic=deterministic).fst
        whitelist_graph = WhiteListFst(input_case=input_case,
                                       deterministic=deterministic).fst
        punct_graph = PunctuationFst(deterministic=deterministic).fst

        # Whitelist entries are much cheaper than the generic word fallback
        classify = (pynutil.add_weight(whitelist_graph, 1)
                    | pynutil.add_weight(word_graph, 100))

        open_token = "tokens { "
        close_token = " }"
        punct = (pynutil.insert(open_token) +
                 pynutil.add_weight(punct_graph, weight=1.1) +
                 pynutil.insert(close_token))
        token = (pynutil.insert(open_token) + classify +
                 pynutil.insert(close_token))

        # A token may be surrounded by any number of punctuation tokens
        leading_punct = pynini.closure(punct + pynutil.insert(" "))
        trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
        token_plus_punct = leading_punct + token + trailing_punct

        sentence = token_plus_punct + pynini.closure(delete_extra_space +
                                                     token_plus_punct)
        sentence = delete_space + sentence + delete_space

        self.fst = sentence.optimize()

        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
0
    def __init__(self, ordinal: GraphFst):
        """Build the classifier FST for dates.

        Three layouts are accepted:

        * month, optionally followed by a day and/or a year
        * "the" day "of" month, optionally followed by a year
        * a year (or a range produced by ``_get_range_graph``) on its own

        Every result gets `` preserve_order: true`` appended before being
        wrapped by ``self.add_tokens``.

        Args:
            ordinal: provides ``graph``, used to recognize the day
        """
        super().__init__(name="date", kind="classify")

        day_numbers = ordinal.graph

        # weekday, day, month, year, style(depr), text(depr), short_year(depr), era
        year_weight = 0.001
        weighted_year = pynutil.add_weight(_get_year_graph(), year_weight)

        month = (pynutil.insert("month: \"") + _get_month_graph() +
                 pynutil.insert("\""))
        day = (pynutil.insert("day: \"") +
               pynutil.add_weight(day_numbers, -0.7) + pynutil.insert("\""))

        optional_day = pynini.closure(delete_extra_space + day, 0, 1)
        # The year weight is cancelled again when the year follows a month
        optional_year = pynini.closure(
            delete_extra_space + pynutil.insert("year: \"") +
            pynutil.add_weight(weighted_year, -year_weight) +
            pynutil.insert("\""),
            0,
            1,
        )

        month_first = month + optional_day + optional_year
        day_first = (pynutil.delete("the") + delete_space + day +
                     delete_space + pynutil.delete("of") +
                     delete_extra_space + month + optional_year)
        year_only = (pynutil.insert("year: \"") +
                     (weighted_year | _get_range_graph()) +
                     pynutil.insert("\""))

        tagged = (month_first | day_first | year_only) + pynutil.insert(
            " preserve_order: true")
        self.fst = self.add_tokens(tagged).optimize()
Ejemplo n.º 12
0
Archivo: date.py Proyecto: NVIDIA/NeMo
def get_four_digit_year_graph(deterministic: bool = True):
    """
    Return a four digit transducer built from ties/teen or digit graphs
    (using hundred instead of thousand format), e.g.
    1219 -> twelve nineteen
    3900 -> thirty nine hundred

    Args:
        deterministic: passed to ``get_ties_graph``; it also selects how the
            "thousand" readings are merged with the other readings
    """
    tens = get_ties_graph(deterministic)
    teen_or_tens = graph_teen | tens

    # Forms written with a trailing "s": the "0s" / "s" suffix is deleted
    # here and a plural ending is added by the rewrite rule below
    decades = (
        (tens + insert_space + tens)
        | (graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten")))
    ) + pynutil.delete("0s")
    centuries = (teen_or_tens + insert_space +
                 pynini.cross("00", "hundred") + pynutil.delete("s"))
    add_plural_ending = pynini.cdrewrite(
        pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA
    )
    with_s = (decades | centuries) @ add_plural_ending

    # Forms without a trailing "s"
    two_pairs = tens + insert_space + tens
    round_hundreds = teen_or_tens + insert_space + pynini.cross("00", "hundred")

    # X00Y -> "<X> thousand [<Y>]" and X000s -> "<X> thousands"
    thousands = (
        graph_digit
        + insert_space
        + pynini.cross("00", "thousand")
        + (pynutil.delete("0") | insert_space + graph_digit)
    ) | (
        graph_digit
        + insert_space
        + pynini.cross("000", "thousand")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynini.accep("s")
    )

    years = two_pairs | round_hundreds | with_s
    if deterministic:
        # NOTE(review): presumably gives the "thousand" readings precedence
        # over the others - confirm against plurals._priority_union
        years = plurals._priority_union(thousands, years, NEMO_SIGMA)
    else:
        years = years | thousands

    return years.optimize()
Ejemplo n.º 13
0
    def __init__(self):
        """Build the classifier FST for spoken e-mail addresses and URLs.

        * E-mail: ``<username> at <server> dot <domain>`` is tagged as
          ``username: "..." domain: "..."``.
        * URL: an optional spoken "http(s) colon slash slash", then "www",
          "dot" and further parts; tagged as ``protocol: "..."``.

        The union of both is wrapped by ``self.add_tokens`` and stored
        optimized in ``self.fst``.
        """
        super().__init__(name="electronic", kind="classify")

        drop_space = pynutil.delete(" ")
        dot = pynini.cross("dot", ".")

        # Letters plus the entries of the digit and zero tables
        alpha_num = (
            NEMO_ALPHA
            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv")))
        # The symbol table is used in its inverted direction
        symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).invert()
        username_char = alpha_num | symbols

        # ---- e-mail ----
        username = (pynutil.insert("username: \"") + alpha_num +
                    pynini.closure(drop_space + username_char) +
                    pynutil.insert("\""))

        # Space-separated characters collapsed into one string
        spelled = pynini.closure(alpha_num + drop_space) + alpha_num
        server = spelled | pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv"))
        domain = spelled | pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))
        domain_field = (pynutil.insert("domain: \"") + server + drop_space +
                        dot + drop_space + domain + pynutil.insert("\""))

        email = (username + drop_space + pynutil.delete("at") +
                 insert_space + drop_space + domain_field)

        # ---- url ----
        www = pynini.cross(pynini.union("w w w", "www"), "www")
        scheme = (pynini.cross("h t t p", "http")
                  | pynini.cross("h t t p s", "https")) + pynini.cross(
                      " colon slash slash ", "://")
        # .com,
        ending = (drop_space + symbols + drop_space +
                  (domain
                   | pynini.closure(username_char + drop_space) +
                   username_char))

        url = (pynini.closure(scheme, 0, 1) + www + drop_space + dot +
               pynini.closure(drop_space + username_char, 1) +
               pynini.closure(ending, 1))
        url_field = (pynutil.insert("protocol: \"") + url +
                     pynutil.insert("\""))

        self.fst = self.add_tokens(email | url_field).optimize()
Ejemplo n.º 14
0
def get_quantity(decimal: "pynini.FstLike",
                 cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike":
    """
    Return an FST that turns a cardinal or decimal followed by a quantity
    word into a tagged numeral,
    e.g. một triệu -> integer_part: "1" quantity: "triệu"

    After a cardinal and its quantity word, one more spoken digit may follow
    (e.g. "rưỡi" -> 5); it is tagged as fractional_part: "..." after the
    quantity field.

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
    """
    # Leading zeros are removed; the first remaining digit must be non-zero
    strip_leading_zeros = (pynutil.delete(pynini.closure("0")) +
                           pynini.difference(NEMO_DIGIT, "0") +
                           pynini.closure(NEMO_DIGIT))
    integer = cardinal_up_to_hundred @ strip_leading_zeros

    magnitude = pynini.union("triệu", "tỉ", "tỷ", "vạn")

    # Alternative spoken forms for a trailing digit
    four = pynini.cross("tư", "4")
    one = pynini.cross("mốt", "1")
    half = pynini.cross("rưỡi", "5")

    # Regular digit words except "năm", plus the alternative forms above
    excluded_word = pynini.project(pynini.cross("năm", "5"), "input")
    regular_digit = (pynini.project(graph_digit, "input") -
                     excluded_word.arcsort()) @ graph_digit
    trailing_digit = pynini.union(regular_digit, one, four, half)

    optional_fraction = pynini.closure(
        delete_extra_space + pynutil.insert('fractional_part: "') +
        (trailing_digit | half | one | four) + pynutil.insert('"'),
        0,
        1,
    )

    cardinal_quantity = (pynutil.insert('integer_part: "') + integer +
                         pynutil.insert('"') + delete_extra_space +
                         pynutil.insert('quantity: "') + magnitude +
                         pynutil.insert('"') + optional_fraction)
    # "ngàn" / "nghìn" are accepted only after a decimal
    decimal_quantity = (decimal + delete_extra_space +
                        pynutil.insert('quantity: "') +
                        (magnitude | "ngàn" | "nghìn") + pynutil.insert('"'))

    return cardinal_quantity | decimal_quantity
Ejemplo n.º 15
0
    def __init__(self, deterministic: bool = True):
        """Build the verbalizer FST for "date" tokens.

        Two field orders are handled: day-month(-year), where " de " is put
        between day and month, and month-day(-year). The unwrapped graph is
        kept in ``self.graph``; ``self.fst`` holds the optimized token-level
        FST.

        Args:
            deterministic: when true, a day reading "uno" is always rewritten
                to "primero" and the month-first order requires an explicit
                `` preserve_order: true``; otherwise both day readings are
                allowed
        """
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)

        def unquote(prefix):
            # `<prefix>"<value>"` -> `<value>`
            return pynutil.delete(prefix) + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

        day = strip_cardinal_apocope(unquote("day: \""))

        # Primero for first day is traditional, but will vary depending on region
        uno_to_primero = pynini.cdrewrite(pynini.cross("uno", "primero"), "[BOS]", "[EOS]", NEMO_SIGMA)
        if deterministic:
            day = day @ uno_to_primero
        else:
            day = pynini.union(day, day @ uno_to_primero)

        month = unquote("month: \"")

        # Year that already starts with an article and a space: keep it as
        # is, slightly favoured over the alternative below
        year_with_article = pynutil.add_weight(
            pynutil.delete("year: \"")
            + articles
            + NEMO_SPACE
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\""),
            -0.001,
        )
        # Insert preposition if wasn't originally with the year. This would mean a space was present
        year_with_preposition = (
            pynutil.delete("year: \"")
            + pynutil.insert("de ")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        year = year_with_article | year_with_preposition

        # day month year
        day_first = day + pynini.cross(NEMO_SPACE, " de ") + month + pynini.closure(pynini.accep(" ") + year, 0, 1)

        # month day year
        month_first = month + NEMO_SPACE + day + pynini.closure(NEMO_SPACE + year, 0, 1)
        if deterministic:
            # Only accepts this if was explicitly passed
            month_first = month_first + pynutil.delete(" preserve_order: true")

        self.graph = day_first | month_first

        self.fst = self.delete_tokens(self.graph + delete_preserve_order).optimize()
Ejemplo n.º 16
0
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        """Build the classifier FST for decimal numbers.

        Sets three attributes:

        * ``self.graph``: the digit-sequence graph used for the fractional
          part (unioned with ``cardinal.graph`` when not deterministic)
        * ``self.final_graph_wo_negative``: the tagged decimal, optionally
          followed by a quantity (see ``get_quantity``), without sign
        * ``self.fst``: the optimized token-level FST, including the
          optional ``negative: "true"`` field

        Args:
            cardinal: provides ``graph`` and
                ``graph_hundred_component_at_least_one_none_zero_digit``
            deterministic: forwarded to the base class; when false the
                fractional part may also be read with ``cardinal.graph``
        """
        super().__init__(name="decimal",
                         kind="classify",
                         deterministic=deterministic)

        cardinal_graph = cardinal.graph
        hundred_component = (
            cardinal.graph_hundred_component_at_least_one_none_zero_digit)

        # Entries of the digit and zero tables
        single_digit = (
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv")))
        digit_or_o = single_digit | pynini.cross("o", "0")

        # "zero", a single table entry, or two and more entries in a row
        # (where "o" may stand for 0); the whole graph is inverted afterwards
        digit_sequence = (
            pynini.cross("zero", "0")
            | single_digit
            | digit_or_o + pynini.closure(delete_space + digit_or_o, 1))
        self.graph = pynini.invert(digit_sequence).optimize()
        if not deterministic:
            self.graph = self.graph | cardinal_graph

        decimal_point = pynutil.delete(".")
        optional_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        fractional_field = (pynutil.insert("fractional_part: \"") +
                            self.graph + pynutil.insert("\""))
        integer_field = (pynutil.insert("integer_part: \"") +
                         cardinal_graph + pynutil.insert("\""))
        # The integer part is optional
        unsigned = (
            pynini.closure(integer_field + pynutil.insert(" "), 0, 1) +
            decimal_point + pynutil.insert(" ") + fractional_field)

        self.final_graph_wo_negative = unsigned | get_quantity(
            unsigned, hundred_component)

        signed = optional_negative + self.final_graph_wo_negative
        self.fst = self.add_tokens(signed).optimize()
Ejemplo n.º 17
0
    def __init__(
        self,
        itn_cardinal_tagger: GraphFst,
        tn_date_tagger: GraphFst,
        tn_date_verbalizer: GraphFst,
        deterministic: bool = True,
    ):
        """Build the date classifier by inverting text-normalization graphs.

        The inverted ``tn_date_verbalizer.graph`` is composed with a local
        "verbalizer" that turns the resulting day/month/year fields into
        written form. The output is wrapped as ``name: "..."`` and stored
        optimized in ``self.fst``.

        Args:
            itn_cardinal_tagger: provides ``graph_no_exception``, applied to
                the day and to numeric months
            tn_date_tagger: provides ``month_abbr`` and ``year``, both used
                in inverted direction
            tn_date_verbalizer: provides ``graph``, inverted to obtain the
                tagger
            deterministic: forwarded to the base class constructor
        """
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        # Two digits are kept as they are; a single digit gets a leading "0"
        add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
        # Any string, in which spaces may additionally be deleted at a small cost
        optional_delete_space = pynini.closure(NEMO_SIGMA | pynutil.delete(" ", weight=0.0001))
        tagger = tn_date_verbalizer.graph.invert().optimize()

        # Strip the `day: "..."` wrapper, then convert the value with the cardinal graph
        delete_day_marker = (
            pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        ) @ itn_cardinal_tagger.graph_no_exception

        # The month value is read either as a cardinal or through the inverted month_abbr graph
        month_as_number = pynutil.delete("month: \"") + itn_cardinal_tagger.graph_no_exception + pynutil.delete("\"")
        month_as_string = pynutil.delete("month: \"") + tn_date_tagger.month_abbr.invert() + pynutil.delete("\"")

        # Inverse of the year graph, composed with optional_delete_space before inversion
        convert_year = (tn_date_tagger.year @ optional_delete_space).invert().optimize()
        delete_year_marker = (
            pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        ) @ convert_year

        # day. month as string (year)
        verbalizer = (
            pynini.closure(delete_day_marker + pynutil.insert(".") + pynini.accep(" "), 0, 1)
            + month_as_string
            + pynini.closure(pynini.accep(" ") + delete_year_marker, 0, 1)
        )

        # day. month as number (year)
        # `@` binds tighter than `+`, so the leading-zero rule is applied to
        # the day and to the month separately.
        verbalizer |= (
            delete_day_marker @ add_leading_zero_to_double_digit
            + pynutil.insert(".")
            + pynutil.delete(" ")
            + month_as_number @ add_leading_zero_to_double_digit
            + pynutil.insert(".")
            + pynini.closure(pynutil.delete(" ") + delete_year_marker, 0, 1)
        )

        # year
        verbalizer |= delete_year_marker

        final_graph = tagger @ verbalizer

        # Wrap the result as a `name: "..."` field
        graph = pynutil.insert("name: \"") + convert_space(final_graph) + pynutil.insert("\"")
        self.fst = graph.optimize()
Ejemplo n.º 18
0
    def __init__(self):
        """Build the verbalizer FST for "fraction" tokens.

        An optional ``integer_part: "..."`` is emitted followed by a space,
        then the numerator, an inserted "/" and the denominator. The graph
        without sign is kept in ``self.numbers``; ``self.fst`` additionally
        accepts an optional ``negative: "true"`` field, rewritten to "-".
        """
        super().__init__(name="fraction", kind="verbalize")

        def unquote(prefix):
            # `<prefix>"<value>"` -> `<value>`
            return (pynutil.delete(prefix) +
                    pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))

        sign = pynini.closure(
            pynini.cross("negative: \"true\"", "-") + delete_space, 0, 1)

        whole = unquote("integer_part: \"") + insert_space
        numerator = unquote("numerator: \"")
        slash_denominator = (pynutil.insert('/') +
                             pynutil.delete("denominator: \"") +
                             pynini.closure(NEMO_NOT_QUOTE, 1) +
                             pynutil.delete("\""))

        optional_whole = pynini.closure(whole + delete_space, 0, 1)
        self.numbers = (optional_whole + numerator + delete_space +
                        slash_denominator).optimize()

        self.fst = self.delete_tokens(sign + self.numbers).optimize()
Ejemplo n.º 19
0
def singular_to_plural():
    """
    Return a graph that appends a plural ending to a singular noun.

    The ending is chosen from the final letters of the word; entries of
    ``suppletive`` are combined with these suffix rules through
    ``plurals._priority_union``.
    """

    def ending_in(*endings):
        # Any string that ends in one of the given suffixes
        return NEMO_SIGMA + pynini.union(*endings)

    # Plural ending n/en: masculine nouns ending in e, ent, and, ant, ist, or
    plus_n = ending_in("e") + pynutil.insert("n")
    plus_en = (
        ending_in("ent", "and", "ant", "ist", "or", "ion", "ik", "heit", "keit", "schaft", "tät", "ung")
        + pynutil.insert("en")
    )
    # Words ending in "in" take either "e" or "nen"
    plus_e_or_nen = ending_in("in") + (pynutil.insert("e") | pynutil.insert("nen"))
    plus_en_after_ma_um_us = ending_in("ma", "um", "us") + pynutil.insert("en")
    # Masculine nouns ending in eur, ich, ier, ig, ling, ör
    plus_e = ending_in("eur", "ich", "ier", "ig", "ling", "ör") + pynutil.insert("e")
    plus_s = ending_in("a", "i", "o", "u", "y") + pynutil.insert("s")

    suffix_rules = pynini.union(
        plus_n, plus_en, plus_e_or_nen, plus_en_after_ma_um_us, plus_e, plus_s
    )
    return plurals._priority_union(suppletive, suffix_rules, NEMO_SIGMA).optimize()
Ejemplo n.º 20
0
def get_hundreds_graph(deterministic: bool = True):
    """
    Return a four digit transducer built from ties/teen or digit graphs
    (using hundred instead of thousand format), e.g.
    1219 -> twelve nineteen
    3900 -> thirty nine hundred

    Args:
        deterministic: passed to ``get_ties_graph``
    """
    tens = get_ties_graph(deterministic)

    # Two two-digit halves read one after the other
    two_pairs = tens + insert_space + tens

    # <teen>00 -> "<teen> hundred"
    teen_hundred = graph_teen + insert_space + pynini.cross("00", "hundred")

    # Forms ending in "0s": the suffix is deleted and a plural ending is
    # added at the end of the string by the rewrite rule
    add_plural_ending = pynini.cdrewrite(
        pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]",
        NEMO_SIGMA)
    decades = (graph_teen + insert_space +
               (ties_graph | pynini.cross("1", "ten")) +
               pynutil.delete("0s")) @ add_plural_ending

    # X00Y -> "<X> thousand [<Y>]", slightly favoured by a negative weight
    thousands = pynutil.add_weight(
        graph_digit + insert_space + pynini.cross("00", "thousand") +
        (pynutil.delete("0") | insert_space + graph_digit),
        weight=-0.001,
    )

    return two_pairs | teen_hundred | decades | thousands
Ejemplo n.º 21
0
    def __init__(self, decimal: GraphFst, deterministic: bool = True):
        """Build the verbalizer FST for "money" tokens.

        Accepted layouts:

        * integer + major currency
        * integer + major currency + fractional part + minor currency
        * decimal number + major currency
        * fractional part + minor currency

        Args:
            decimal: provides ``integer`` and ``numbers`` graphs
            deterministic: when false, "and " may be inserted before the
                minor amount, and the integer-only layout may also be
                followed by the preserve-order marker
        """
        super().__init__(name="money",
                         kind="verbalize",
                         deterministic=deterministic)

        def unquote(prefix):
            # `<prefix>"<value>"` -> `<value>`
            return (pynutil.delete(prefix) +
                    pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))

        space = pynini.accep(" ")
        major = unquote("currency_maj: \"")
        minor = unquote("currency_min: \"")
        cents = unquote("fractional_part: \"")

        whole = decimal.integer

        #  *** currency_maj
        integer_major = whole + space + major

        # *** current_min
        cents_minor = cents + delete_extra_space + minor

        #  *** currency_maj + (***) | ((and) *** current_min)
        minor_amount = cents_minor
        if not deterministic:
            minor_amount = minor_amount | (pynutil.insert("and ") +
                                           minor_amount)
        integer_major_minor = (integer_major + space + minor_amount +
                               delete_preserve_order)

        # *** point *** currency_maj
        decimal_major = decimal.numbers + space + major

        # The minor-only layout never gets the inserted "and "
        minor_only = cents_minor + delete_preserve_order

        graph = (integer_major | integer_major_minor | decimal_major
                 | minor_only)
        if not deterministic:
            graph = graph | (integer_major + delete_preserve_order)

        self.fst = self.delete_tokens(graph).optimize()
Ejemplo n.º 22
0
    def __init__(self, decimal: GraphFst, deterministic: bool = True):
        """Build the verbalizer FST for "money" tokens.

        Accepted layouts:

        * integer + major currency
        * integer + major currency + fractional part, optionally written as
          "und " + fractional part + minor currency
        * ``decimal.fst`` + major currency
        * fractional part + minor currency

        Args:
            decimal: provides ``fst``, used for the decimal layout
            deterministic: forwarded to the base class constructor
        """
        super().__init__(name="money",
                         kind="verbalize",
                         deterministic=deterministic)

        def unquote(prefix):
            # `<prefix>"<value>"` -> `<value>`
            return (pynutil.delete(prefix) +
                    pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))

        space = pynini.accep(" ")

        major = unquote("currency_maj: \"")
        minor = unquote("currency_min: \"")
        cents = unquote("fractional_part: \"")
        whole = unquote("integer_part: \"")

        maybe_und = pynini.closure(pynutil.insert("und "), 0, 1)

        #  *** currency_maj
        integer_major = whole + space + major

        #  *** currency_maj + (***) | ((und) *** current_min)
        after_major = cents | (maybe_und + cents + space + minor)
        integer_major_minor = (integer_major + space + after_major +
                               delete_preserve_order)

        # *** komma *** currency_maj
        decimal_major = decimal.fst + space + major

        # *** current_min
        minor_only = cents + space + minor + delete_preserve_order

        graph = (integer_major | integer_major_minor | decimal_major
                 | minor_only)

        self.fst = self.delete_tokens(graph).optimize()
Ejemplo n.º 23
0
def prepare_labels_for_insertion(file_path: str):
    """
    Load weighted labels from a file and build one insertion graph per label type.

    Args:
        file_path: path to a file with 3 columns: a label type, e.g.
            "@@decimal_delimiter@@", a label, e.g. "целого", and a weight, e.g. "0.1".

    Returns:
        Dictionary mapping each label type to an optimized fst made of
        ``insert_space`` followed by the union of that type's label insertions,
        each carrying its specified weight.
    """
    mapping = defaultdict(list)
    for label_type, label, weight in load_labels(file_path):
        mapping[label_type].append((label, weight))

    # Replace every collected (label, weight) list with its compiled graph.
    # Only values of existing keys are reassigned, so iterating is safe.
    for label_type, weighted_labels in mapping.items():
        alternatives = [
            pynutil.add_weight(pynutil.insert(label), weight) for label, weight in weighted_labels
        ]
        mapping[label_type] = (insert_space + pynini.union(*alternatives)).optimize()
    return mapping
Ejemplo n.º 24
0
    def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = True):
        """
        Build the "measure" verbalizer graph and store it, optimized, in ``self.fst``.

        The graph reads a ``cardinal { ... }`` or ``decimal { ... }`` sub-token
        followed by a ``units: "..."`` field and emits the number, an inserted
        space and the unit. The cardinal branch is also exposed as
        ``self.graph_cardinal``.

        Args:
            decimal: decimal verbalizer; provides ``numbers``
            cardinal: cardinal verbalizer; provides ``optional_sign`` and ``numbers``
            deterministic: forwarded unchanged to the parent constructor
        """
        super().__init__(name="measure", kind="verbalize", deterministic=deterministic)
        sign = cardinal.optional_sign

        # The unit value is one or more non-space characters, prefixed by an
        # inserted space that separates it from the number.
        spaced_unit_value = pynutil.insert(" ") + pynini.closure(NEMO_CHAR - " ", 1)
        unit_field = pynutil.delete("units: \"") + spaced_unit_value + pynutil.delete("\"") + delete_space

        def braced_number(opening, numbers):
            # Strip "<kind> {" ... "}" around an optionally signed number.
            return (
                pynutil.delete(opening)
                + delete_space
                + sign
                + delete_space
                + numbers
                + delete_space
                + pynutil.delete("}")
            )

        decimal_number = braced_number("decimal {", decimal.numbers)
        self.graph_cardinal = braced_number("cardinal {", cardinal.numbers)

        verbalized = (self.graph_cardinal | decimal_number) + delete_space + unit_field
        self.fst = self.delete_tokens(verbalized).optimize()
Ejemplo n.º 25
0
    def __init__(self, input_case: str):
        """
        Build the top-level "tokenize_and_classify" graph and store it in ``self.fst``.

        Every semiotic-class tagger is instantiated, weighted and unioned into a
        single classifier. Each classified span or punctuation mark is wrapped in
        ``tokens { ... }``, and a sentence is a sequence of such tokens.

        Args:
            input_case: passed through to ``WhiteListFst``
        """
        super().__init__(name="tokenize_and_classify", kind="classify")

        cardinal = CardinalFst()
        ordinal = OrdinalFst(cardinal=cardinal)
        decimal = DecimalFst(cardinal=cardinal)

        cardinal_graph = cardinal.fst
        ordinal_graph = ordinal.fst
        decimal_graph = decimal.fst
        measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
        date_graph = DateFst(cardinal=cardinal).fst
        word_graph = WordFst().fst
        time_graph = TimeFst(cardinal=cardinal).fst
        telephone_graph = TelephoneFst().fst
        electronic_graph = ElectronicFst().fst
        money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
        whitelist_graph = WhiteListFst(input_case=input_case).fst
        punct_graph = PunctuationFst().fst

        # (graph, weight) pairs in union order; the plain-word fallback carries
        # by far the largest weight.
        weighted_branches = [
            (whitelist_graph, 1.01),
            (time_graph, 1.1),
            (date_graph, 1.09),
            (decimal_graph, 1.1),
            (measure_graph, 1.1),
            (cardinal_graph, 1.1),
            (ordinal_graph, 1.1),
            (money_graph, 1.1),
            (telephone_graph, 1.1),
            (electronic_graph, 1.1),
            (word_graph, 100),
        ]
        first_graph, first_weight = weighted_branches[0]
        classify = pynutil.add_weight(first_graph, first_weight)
        for branch, branch_weight in weighted_branches[1:]:
            classify = classify | pynutil.add_weight(branch, branch_weight)
        classify = classify.optimize()

        open_token = "tokens { "
        close_token = " }"
        punct = pynutil.insert(open_token) + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(close_token)
        token = pynutil.insert(open_token) + classify + pynutil.insert(close_token)

        # One token, optionally surrounded by any number of punctuation tokens.
        leading_punct = pynini.closure(punct + pynutil.insert(" "))
        trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
        token_plus_punct = leading_punct + token + trailing_punct

        sentence = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
        sentence = delete_space + sentence + delete_space

        self.fst = sentence.optimize()
Ejemplo n.º 26
0
    def __init__(self, deterministic: bool = True):
        """
        Build the "ordinal" verbalizer graph; store it in ``self.graph`` and,
        wrapped by ``delete_tokens`` and optimized, in ``self.fst``.

        The ``integer`` field is emitted according to the value that opens the
        ``morphosyntactic_features`` field: ``gender_masc``, ``gender_fem`` or
        ``apocope``. In non-deterministic mode a ``/plural`` marker is also
        accepted after the feature value.

        Args:
            deterministic: forwarded to the parent constructor; when False the
                plural branch is added
        """
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        # Right context shared by the rewrites below: a space or end of string.
        word_boundary = NEMO_SPACE | pynini.accep("[EOS]")

        integer = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

        # masculine gender is left as is
        masculine = integer + pynutil.delete(" morphosyntactic_features: \"gender_masc")

        # shift gender: "o" before a word boundary becomes "a"
        o_to_a = pynini.cdrewrite(pynini.cross("o", "a"), "", word_boundary, NEMO_SIGMA)
        feminine = shift_number_gender(integer @ o_to_a) + pynutil.delete(
            " morphosyntactic_features: \"gender_fem"
        )

        # Apocope just changes tercero and primero. May occur if someone wrote 11.er (uncommon),
        # hence the undécimo entry, in case someone wrote 11.er with deterministic.
        apocope_pairs = pynini.union(
            pynini.cross("tercero", "tercer"),
            pynini.cross("primero", "primer"),
            pynini.cross("undécimo", "decimoprimer"),
        )
        apocope = (integer @ pynini.cdrewrite(apocope_pairs, "", "", NEMO_SIGMA)) + pynutil.delete(
            " morphosyntactic_features: \"apocope"
        )

        verbalized = apocope | masculine | feminine

        if not deterministic:
            # Plural graph: add "s" after "o"/"a" in front of a word boundary
            pluralize = pynini.cdrewrite(pynutil.insert("s"), pynini.union("o", "a"), word_boundary, NEMO_SIGMA)
            verbalized = verbalized | ((verbalized @ pluralize) + pynutil.delete("/plural"))

        # Closing quote of the morphosyntactic_features field.
        self.graph = verbalized + pynutil.delete("\"")

        self.fst = self.delete_tokens(self.graph).optimize()
Ejemplo n.º 27
0
    def __init__(self):
        """
        Build the "electronic" verbalizer graph and store it, optimized, in ``self.fst``.

        The graph reads a ``username: "..."`` field followed by a
        ``domain: "..."`` field and emits the username, the inserted word
        "at " and the domain, with "." rendered as "dot ".
        """
        super().__init__(name="electronic", kind="verbalize")
        # Inverted mapping loaded from data/numbers/digit.tsv.
        graph_digit = pynini.invert(
            pynini.string_file(
                get_abs_path("data/numbers/digit.tsv"))).optimize()
        # Username characters: digits go through graph_digit, "." becomes
        # "dot ", any other non-quote character is passed through; the lower
        # weights (1.09) make the digit and dot readings preferred over the
        # generic one (1.1).
        user_name = (
            pynutil.delete("username:") + delete_space + pynutil.delete("\"") +
            (pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(pynini.closure(pynini.cross(".", "dot ")),
                                     1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
            pynutil.delete("\""))

        # Fallback top-level domain: characters, then "." -> "dot ", then at
        # least one more character.
        domain_default = (pynini.closure(NEMO_NOT_QUOTE + insert_space) +
                          pynini.cross(".", "dot ") + NEMO_NOT_QUOTE +
                          pynini.closure(insert_space + NEMO_NOT_QUOTE))

        # Server name: either generic characters or an entry from
        # data/electronic/server_name.tsv.
        server_default = pynini.closure(NEMO_NOT_QUOTE + insert_space)
        server_common = pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")) + insert_space

        # Known domains from data/electronic/domain.tsv, introduced by "dot ".
        domain_common = pynini.cross(".", "dot ") + pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))

        # File-backed readings (1.09) are preferred over the generic ones (1.1).
        domain = (pynutil.delete("domain:") + delete_space +
                  pynutil.delete("\"") +
                  (pynutil.add_weight(server_common, 1.09)
                   | pynutil.add_weight(server_default, 1.1)) +
                  (pynutil.add_weight(domain_common, 1.09)
                   | pynutil.add_weight(domain_default, 1.1)) + delete_space +
                  pynutil.delete("\""))

        graph = user_name + delete_space + pynutil.insert(
            "at ") + delete_space + domain + delete_space

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Ejemplo n.º 28
0
    def __init__(self, cardinal: GraphFst):
        """
        Build the "date" classifier graph and store it, optimized, in ``self.fst``.

        Accepts day + month with an optional year and emits ``day``, ``month``
        and ``year`` fields followed by ``preserve_order: true``.

        Args:
            cardinal: cardinal tagger; its ``graph_no_exception`` is kept as
                ``self.cardinal`` and used for the day and year values
        """
        super().__init__(name="date", kind="classify")

        self.cardinal = cardinal.graph_no_exception

        def quoted_field(opening, value):
            # Wrap a value graph as: <opening>value"
            return pynutil.insert(opening) + value + pynutil.insert("\"")

        year_value = self.cardinal
        month_value = pynini.string_file(get_abs_path("data/months.tsv"))
        # Premier is only ordinal used for dates
        day_value = self.cardinal | pynini.cross("premier", "1")

        month_field = quoted_field("month: \"", month_value)
        day_field = quoted_field("day: \"", day_value)
        optional_year_field = pynini.closure(delete_extra_space + quoted_field("year: \"", year_value), 0, 1)

        day_month_year = day_field + delete_extra_space + month_field + optional_year_field

        tagged = self.add_tokens(day_month_year + pynutil.insert(" preserve_order: true"))
        self.fst = tagged.optimize()
Ejemplo n.º 29
0
 def __init__(self):
     """
     Build the "time" verbalizer graph and store it, optimized, in ``self.fst``.

     Three layouts are accepted, each with an optional trailing zone:
         hours minutes [suffix]
         hours            (the word "o'clock" is inserted)
         hours suffix
     """
     super().__init__(name="time", kind="verbalize")

     def quoted_field(tag):
         # Strip the tag and both quotes; keep the non-empty value.
         return (pynutil.delete(tag) + delete_space + pynutil.delete("\"") +
                 pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))

     hour = quoted_field("hours:")
     minute = quoted_field("minutes:")
     suffix = quoted_field("suffix:")
     zone = quoted_field("zone:")

     optional_suffix = pynini.closure(delete_space + insert_space + suffix, 0, 1)
     optional_zone = pynini.closure(delete_space + insert_space + zone, 0, 1)

     hours_minutes = hour + delete_space + insert_space + minute + optional_suffix + optional_zone
     hours_oclock = hour + insert_space + pynutil.insert("o'clock") + optional_zone
     hours_suffix = hour + delete_space + insert_space + suffix + optional_zone

     verbalized = hours_minutes | hours_oclock | hours_suffix
     self.fst = self.delete_tokens(verbalized).optimize()
Ejemplo n.º 30
0
    def __init__(self, number_names: dict, deterministic: bool = True):
        """
        Build the "telephone" classifier graph.

        A tagger graph (optional country code plus a 3-3-2-2 digit number with
        "-" separators) is composed with a verbalizer graph that strips the
        field markup. The composition is stored in ``self.final_graph``;
        ``self.fst`` wraps it in a ``number_part`` field via ``add_tokens``.

        Args:
            number_names: dictionary of number graphs; the
                "cardinal_names_nominative" entry is used to read the digits
            deterministic: forwarded unchanged to the parent constructor
        """
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        dash_to_space = pynini.cross("-", " ")  # between components
        spoken_number = number_names["cardinal_names_nominative"]

        # A leading "+" may be dropped at a small extra cost.
        optional_plus = pynini.closure(pynutil.add_weight(pynutil.delete("+"), 0.1), 0, 1)
        country_code = (
            pynutil.insert("country_code: \"") + optional_plus + spoken_number + dash_to_space + pynutil.insert("\"")
        )
        optional_country_code = pynini.closure(country_code + insert_space, 0, 1)

        three_digits = NEMO_DIGIT ** 3 @ spoken_number
        two_digits = NEMO_DIGIT ** 2 @ spoken_number
        # In the last group any zeros are read out explicitly before the number.
        last_two_digits = NEMO_DIGIT ** 2 @ (pynini.closure(pynini.cross("0", "ноль ")) + spoken_number)

        digit_groups = (
            three_digits
            + dash_to_space
            + three_digits
            + dash_to_space
            + two_digits
            + dash_to_space
            + last_two_digits
        )
        number_part = pynutil.insert("number_part: \"") + digit_groups + pynutil.insert("\"")
        tagger_graph = (optional_country_code + number_part).optimize()

        # verbalizer
        strip_country_code = pynini.closure(
            pynutil.delete("country_code: \"")
            + pynini.closure(RU_ALPHA_OR_SPACE, 1)
            + pynutil.delete("\"")
            + delete_space,
            0,
            1,
        )
        strip_number_part = (
            pynutil.delete("number_part: \"") + pynini.closure(RU_ALPHA_OR_SPACE, 1) + pynutil.delete("\"")
        )
        verbalizer_graph = (strip_country_code + strip_number_part).optimize()

        self.final_graph = (tagger_graph @ verbalizer_graph).optimize()

        wrapped = pynutil.insert("number_part: \"") + self.final_graph + pynutil.insert("\"")
        self.fst = self.add_tokens(wrapped).optimize()