コード例 #1
0
def get_alternative_formats():
    """
    Build the FSTs that describe alternative written forms of numbers.

    Returns:
        dict with two optimized FSTs:
            'one_thousand_alternative': rewrite rule anchored at "[BOS]", built
                from the rows of data/numbers/cardinals_alternatives.tsv
            'separators': weighted union of the 'dot_thousands', 'no_delimiter'
                and 'space_thousands' grammars from universal_thousands_punct.far
    """
    alternatives_path = get_abs_path('data/numbers/cardinals_alternatives.tsv')
    # Every row is a (default, alternative) pair; the second word of the
    # alternative is mapped onto the complete alternative string.
    second_word_to_alternative = pynini.string_map([
        (alternative.split()[1], alternative)
        for _, alternative in load_labels(alternatives_path)
    ])
    rewrite_at_start = pynini.cdrewrite(second_word_to_alternative, "[BOS]",
                                        "", NEMO_SIGMA)

    # Adapted from
    # https://github.com/google/TextNormalizationCoveringGrammars/blob/master/src/universal/thousands_punct.grm
    # Common ways of delimiting thousands in digit strings; the undelimited
    # form gets the lowest weight.
    punct_far = pynini.Far(
        get_abs_path('data/utils/universal_thousands_punct.far'))
    with_dots = pynutil.add_weight(punct_far['dot_thousands'], 0.1)
    without_delimiter = pynutil.add_weight(punct_far['no_delimiter'], -0.1)
    with_spaces = pynutil.add_weight(punct_far['space_thousands'], 0.1)
    thousands_separators = with_dots | without_delimiter | with_spaces

    return {
        'one_thousand_alternative': rewrite_at_start.optimize(),
        'separators': thousands_separators.optimize(),
    }
コード例 #2
0
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        """
        Build the whitelist classifier FST.

        Args:
            input_case: kept for interface compatibility; the first column of
                every whitelist row is lower-cased regardless of its value
            deterministic: forwarded to the parent constructor
            input_file: optional path to a whitelist file that is used instead
                of the bundled data/whitelist.tsv
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _get_whitelist_graph(input_case, file):
            # NOTE(review): the previous if/else on `input_case` ran the same
            # lower-casing in both branches, so the branch is collapsed here.
            # Confirm whether cased input was meant to keep its original case.
            whitelist = [[x[0].lower()] + x[1:] for x in load_labels(file)]
            return pynini.string_map(whitelist)

        # A user-supplied file replaces the default whitelist, so only one of
        # the two graphs is ever compiled.
        whitelist_file = input_file if input_file else get_abs_path("data/whitelist.tsv")
        graph = _get_whitelist_graph(input_case, whitelist_file)

        units_graph = _get_whitelist_graph(input_case, file=get_abs_path("data/measurements.tsv"))
        # do not replace single letter units, like `м`, `°` and `%` will be replaced
        units_graph = pynini.compose((NEMO_CHAR ** (2, ...) | pynini.difference(NEMO_CHAR, RU_ALPHA)), units_graph)
        graph |= units_graph.optimize()
        # One or more TO_CYRILLIC matches, separated by inserted spaces.
        graph |= TO_CYRILLIC + pynini.closure(pynutil.insert(" ") + TO_CYRILLIC)

        self.final_graph = convert_space(graph)
        self.fst = (pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")).optimize()
コード例 #3
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = False):
        """
        Build the classifier FST for decimal numbers written with a comma.

        Args:
            cardinal: cardinal GraphFst; its cardinal_numbers_default,
                cardinal_numbers_with_leading_zeros and optional_graph_negative
                graphs are reused here
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="decimal",
                         kind="classify",
                         deterministic=deterministic)

        integer_part = cardinal.cardinal_numbers_default
        cardinal_numbers_with_leading_zeros = cardinal.cardinal_numbers_with_leading_zeros

        delimiter_map = prepare_labels_for_insertion(
            get_abs_path("data/numbers/decimal_delimiter.tsv"))
        # The written "," is deleted and replaced by the graph stored under
        # '@@decimal_delimiter@@', optionally followed by an inserted " и"
        # (weight 0.5).
        delimiter = (
            pynini.cross(",", "") + delimiter_map['@@decimal_delimiter@@'] +
            pynini.closure(pynutil.add_weight(pynutil.insert(" и"), 0.5), 0,
                           1)).optimize()

        decimal_endings_map = prepare_labels_for_insertion(
            get_abs_path("data/numbers/decimal_endings.tsv"))

        # Integer part followed by the delimiter; exposed as an attribute.
        self.integer_part = integer_part + delimiter
        graph_integer = pynutil.insert(
            "integer_part: \"") + self.integer_part + pynutil.insert("\"")

        # One branch per fractional length (1 to 4 digits): the digits go
        # through the leading-zero-aware cardinal graph and are followed by the
        # ending stored under the matching power of ten.
        # `@` binds tighter than `+`, so each ending is concatenated after the
        # composition.
        graph_fractional = NEMO_DIGIT @ cardinal_numbers_with_leading_zeros + decimal_endings_map[
            '10']
        graph_fractional |= (
            NEMO_DIGIT + NEMO_DIGIT
        ) @ cardinal_numbers_with_leading_zeros + decimal_endings_map['100']
        graph_fractional |= (
            NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
        ) @ cardinal_numbers_with_leading_zeros + decimal_endings_map['1000']
        graph_fractional |= (
            NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
        ) @ cardinal_numbers_with_leading_zeros + decimal_endings_map['10000']

        self.optional_quantity = pynini.string_file(
            get_abs_path("data/numbers/quantity.tsv")).optimize()

        # Keep the untagged fractional graph as an attribute before wrapping it
        # in the `fractional_part` field.
        self.graph_fractional = graph_fractional
        graph_fractional = pynutil.insert(
            "fractional_part: \"") + graph_fractional + pynutil.insert("\"")
        # Optional `quantity` field: an existing space is accepted (weight -0.1)
        # or `insert_space` is applied before the quantity word.
        optional_quantity = pynini.closure(
            (pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space)
            + pynutil.insert("quantity: \"") + self.optional_quantity +
            pynutil.insert("\""),
            0,
            1,
        )
        # [negative] integer_part fractional_part [quantity]
        self.final_graph = (cardinal.optional_graph_negative + graph_integer +
                            insert_space + graph_fractional +
                            optional_quantity)

        self.final_graph = self.add_tokens(self.final_graph)
        self.fst = self.final_graph.optimize()
コード例 #4
0
ファイル: ordinal.py プロジェクト: NVIDIA/NeMo
    def __init__(self, number_names: dict, alternative_formats: dict, deterministic=False):
        """
        Build the classifier FST for ordinal numbers.

        Args:
            number_names: dict that provides the 'ordinal_number_names' FST
            alternative_formats: dict that provides the
                'one_thousand_alternative' and 'separators' FSTs
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

        one_thousand_alternative = alternative_formats['one_thousand_alternative']
        separators = alternative_formats['separators']

        ordinal = number_names['ordinal_number_names']

        # Also accept the output of the one-thousand rewrite rule applied to the
        # ordinal names.
        ordinal |= ordinal @ one_thousand_alternative
        # Written digits may use any delimiter style accepted by `separators`.
        ordinal_numbers = separators @ ordinal

        # to handle cases like 2-ая
        endings = pynini.string_file(get_abs_path("data/numbers/ordinal_endings.tsv"))
        # Any (possibly empty) run of symbols other than "-".
        not_dash = pynini.closure(pynini.difference(NEMO_SIGMA, "-"))
        # Deletes a trailing "-" plus the dash-free suffix at the end of the string.
        del_ending = pynini.cdrewrite(pynini.cross("-" + not_dash, ""), "", "[EOS]", NEMO_SIGMA)
        # number + "-" + suffix, composed with "anything + endings" and then
        # stripped of the "-suffix" tail.
        # NOTE(review): the column layout of ordinal_endings.tsv is not visible
        # here - confirm what `endings` maps to.
        ordinal_numbers_marked = (
            ((separators @ ordinal).optimize() + pynini.accep("-") + not_dash).optimize()
            @ (NEMO_SIGMA + endings).optimize()
            @ del_ending
        ).optimize()

        self.ordinal_numbers = ordinal_numbers
        # "03" -> remove leading zeros and verbalize
        leading_zeros = pynini.closure(pynini.cross("0", ""))
        self.ordinal_numbers_with_leading_zeros = (leading_zeros + ordinal_numbers).optimize()

        # Plain ordinals or ordinals written with a "-ending" marker, wrapped in
        # the `integer` field.
        final_graph = (ordinal_numbers | ordinal_numbers_marked).optimize()
        final_graph = pynutil.insert("integer: \"") + final_graph + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
コード例 #5
0
 def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
     """
     Compile a whitelist file into a string-map FST.

     Args:
         input_case: kept for interface compatibility; the first column is
             lower-cased regardless of its value
         file: whitelist path, resolved through get_abs_path

     Returns:
         FST built by pynini.string_map from the rows of the file.
     """
     # NOTE(review): the previous if/else on `input_case` ran the same
     # lower-casing in both branches, so the branch is collapsed here.
     # Confirm whether cased input was meant to keep its original case.
     whitelist = [[x[0].lower()] + x[1:] for x in load_labels(get_abs_path(file))]
     return pynini.string_map(whitelist)
コード例 #6
0
    def __init__(self, number_names: dict, alternative_formats: dict, deterministic: bool = False):
        """
        Build the classifier FST for cardinal numbers.

        Args:
            number_names: dict of number-name FSTs, passed on to
                self.get_cardinal_numbers
            alternative_formats: dict of alternative-format FSTs, passed on to
                self.get_cardinal_numbers
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        # Two variants of the cardinal graph: mode="all" and mode="nominative".
        self.cardinal_numbers_default = self.get_cardinal_numbers(number_names, alternative_formats, mode="all")
        self.cardinal_numbers_nominative = self.get_cardinal_numbers(
            number_names, alternative_formats, mode="nominative"
        )
        # Optional leading "-" becomes `negative: "true"` followed by `insert_space`.
        self.optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1
        )

        self.cardinal_numbers_with_optional_negative = (
            self.optional_graph_negative
            + pynutil.insert("integer: \"")
            + self.cardinal_numbers_default
            + pynutil.insert("\"")
        )

        # "03" -> remove leading zeros and verbalize
        leading_zeros = pynini.closure(pynini.cross("0", ""))
        self.cardinal_numbers_with_leading_zeros = (leading_zeros + self.cardinal_numbers_default).optimize()

        # "123" -> "один два три"
        single_digits_graph = pynini.compose(NEMO_DIGIT, self.cardinal_numbers_nominative)
        self.single_digits_graph = single_digits_graph + pynini.closure(insert_space + single_digits_graph)

        # Optional `quantity` field: an existing space is accepted (weight -0.1)
        # or `insert_space` is applied before the quantity word.
        optional_quantity = pynini.string_file(get_abs_path("data/numbers/quantity.tsv")).optimize()
        optional_quantity = pynutil.insert("quantity: \"") + optional_quantity + pynutil.insert("\"")
        optional_quantity = pynini.closure(
            (pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space) + optional_quantity, 0, 1
        )

        serial_graph = self.get_serial_graph()

        # Main branch: [negative] integer [quantity].
        final_graph = (
            self.optional_graph_negative
            + pynutil.insert("integer: \"")
            + self.cardinal_numbers_with_leading_zeros
            + pynutil.insert("\"")
            + optional_quantity
        ).optimize()

        # The main branch gets weight -0.1; the digit-by-digit and serial
        # readings get weight 10.
        final_graph = pynutil.add_weight(final_graph, -0.1)
        final_graph |= (
            pynutil.insert("integer: \"")
            + pynutil.add_weight(self.single_digits_graph | serial_graph, 10)
            + pynutil.insert("\"")
        )
        # Stored here, before the "-х" branch below is unioned into `final_graph`.
        self.final_graph = final_graph

        # to cover cases "2-х" -> "двух" (this is not covered by ordinal endings)
        # The "-х" suffix is deleted from the digits, and only outputs that
        # contain `х"` (a word ending in "х" right before the closing quote)
        # are kept.
        final_graph |= pynini.compose(
            pynini.compose(NEMO_DIGIT ** (1, ...) + pynini.cross('-х', ''), final_graph),
            NEMO_SIGMA + pynini.accep("х\"") + NEMO_SIGMA,
        )
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
コード例 #7
0
 def __init__(self, deterministic: bool = True):
     """
     Build the whitelist classifier FST from the inverted data/whitelist.tsv.

     Args:
         deterministic: forwarded to the parent constructor
     """
     super().__init__(name="whitelist", kind="classify", deterministic=deterministic)
     whitelist_path = get_abs_path("data/whitelist.tsv")
     # The file mapping is inverted so its second column becomes the input side.
     inverted_whitelist = pynini.string_file(whitelist_path).invert()
     name_open = pynutil.insert("name: \"")
     name_value = convert_space(inverted_whitelist)
     name_close = pynutil.insert("\"")
     tagged = name_open + name_value + name_close
     self.fst = tagged.optimize()
コード例 #8
0
def get_alternative_formats():
    """
    Collect the FSTs used to accept alternative written forms of numbers.

    Returns:
        dict with:
            'one_thousand_alternative': rewrite rule anchored at "[BOS]", built
                from the rows of data/numbers/cardinals_alternatives.tsv
            'separators': weighted union of the thousands-delimiter grammars
                read from data/utils/universal_thousands_punct.far
    """
    rows = load_labels(get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    pairs = []
    for _, alternative in rows:
        # Key each alternative by its second word.
        second_word = alternative.split()[1]
        pairs.append((second_word, alternative))
    mapping = pynini.string_map(pairs)
    rewrite_rule = pynini.cdrewrite(mapping, "[BOS]", "", NEMO_SIGMA)

    far = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    # The undelimited spelling gets the lowest weight of the three.
    dot_delimited = pynutil.add_weight(far['dot_thousands'], 0.1)
    undelimited = pynutil.add_weight(far['no_delimiter'], -0.1)
    space_delimited = pynutil.add_weight(far['space_thousands'], 0.1)
    delimiters = dot_delimited | undelimited | space_delimited

    return {
        'one_thousand_alternative': rewrite_rule,
        'separators': delimiters,
    }
コード例 #9
0
def get_number_names():
    """
    Build the number-name FSTs (ordinal, cardinal, nominative up to a thousand).

    The construction follows:
    1) Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
    Transactions of the Association for Computational Linguistics 4: 507-519.
    2) Ng, A. H., Gorman, K., and Sproat, R. 2017.
    Minimally supervised written-to-spoken text normalization. In ASRU, pages 665-670.

    Returns:
        dict with the optimized FSTs 'ordinal_number_names',
        'cardinal_number_names' and 'nominative_up_to_thousand_names'.
    """

    def _lexicon(relative_path):
        # Load one lexicon file as an optimized transducer.
        return pynini.string_file(get_abs_path(relative_path)).optimize()

    def _sequence(leading_word, final_word):
        # Zero or more "<leading_word> " followed by one final word.
        return (pynini.closure(leading_word + pynini.accep(" ")) +
                final_word).optimize()

    a = pynini.Far(get_abs_path('data/utils/util_arithmetic.far'), mode='r')
    delta_star = a['DELTA_STAR']
    arithmetic = a['IARITHMETIC_RESTRICTED']
    g = pynini.Fst.read(get_abs_path('data/utils/g.fst'))
    # Apply the restricted arithmetic three times on top of G, optimizing after
    # every composition, then prefix with DELTA_STAR.
    factorization = (arithmetic @ g).optimize()
    factorization = (arithmetic @ factorization).optimize()
    factorization = (arithmetic @ factorization).optimize()
    fg = (delta_star @ factorization).optimize()
    assert rewrite.top_rewrite("230", fg) == "(+ 200 30 +)"

    # Lexicon transducers (L), one per grammatical case.
    cardinal_name_nominative = _lexicon(
        "data/numbers/1_cardinals_nominative_именительный.tsv")
    other_case_lexicons = [
        _lexicon(path) for path in (
            "data/numbers/2_cardinals_genitive_родительный.tsv",
            "data/numbers/3_cardinals_dative_датильный.tsv",
            "data/numbers/4_cardinals_accusative_винительный.tsv",
            "data/numbers/5_cardinals_instrumental_творительный.tsv",
            "data/numbers/6_cardinals_prepositional_предложный.tsv",
        )
    ]

    # Every word of a number name is taken from one and the same case lexicon.
    cardinal_l = _sequence(cardinal_name_nominative, cardinal_name_nominative)
    for case_lexicon in other_case_lexicons:
        cardinal_l |= _sequence(case_lexicon, case_lexicon)

    # Numbers up to 1000 in nominative case (used, for example, for telephone numbers)
    nominative_up_to_thousand_name = pynini.string_file(
        get_abs_path("data/numbers/cardinals_nominative_case.tsv"))
    nominative_up_to_thousand_name_l = _sequence(
        nominative_up_to_thousand_name, nominative_up_to_thousand_name)

    # Collapse factorizations such as "(* 5 1000 *)" back into "5000" so that
    # complex ordinals are formed as one word ("пятитысячный") rather than the
    # incorrect "пять тысячный". Only 2000..20000 are covered, to keep the WFST
    # from growing even larger.
    thousand_rewrites = [
        pynini.cross(f"(* {number} 1000 *)", f"{number}000")
        for number in range(2, 21)
    ]
    complex_numbers = thousand_rewrites[0]
    for thousand_rewrite in thousand_rewrites[1:]:
        complex_numbers |= thousand_rewrite

    trailing_symbols = pynini.closure(pynini.union(" ", ")", "(", "+", "*"))
    complex_numbers = (NEMO_SIGMA + pynutil.add_weight(complex_numbers, -1) +
                       trailing_symbols)
    collapsed_fg = pynutil.add_weight(pynini.compose(fg, complex_numbers), -1)
    fg_ordinal = collapsed_fg | fg

    ordinal_name = pynini.string_file(
        get_abs_path("data/numbers/ordinals.tsv"))
    # Nominative cardinal words followed by one ordinal word.
    ordinal_l = _sequence(cardinal_name_nominative, ordinal_name)

    # Compose each lexicon with the leaf transducer (P), then with FG.
    p = a['LEAVES']
    return {
        'ordinal_number_names': (fg_ordinal @ (p @ ordinal_l)).optimize(),
        'cardinal_number_names': (fg @ (p @ cardinal_l)).optimize(),
        'nominative_up_to_thousand_names':
            (fg @ (p @ nominative_up_to_thousand_name_l)).optimize(),
    }
コード例 #10
0
ファイル: alphabet.py プロジェクト: mousebaiker/NeMo
        ("Ё́", "Е'"),
        ("И́", "И'"),
        ("О́", "О'"),
        ("У́", "У'"),
        ("Ы́", "Ы'"),
        ("Э́", "Э'"),
        ("Ю́", "Ю'"),
        ("Я́", "Я'"),
        ("а́", "а'"),
        ("е́", "е'"),
        ("ё́", "е'"),
        ("и́", "и'"),
        ("о́", "о'"),
        ("у́", "у'"),
        ("ы́", "ы'"),
        ("э́", "э'"),
        ("ю́", "ю'"),
        ("я́", "я'"),
        ("ё", "е"),
        ("Ё", "Е"),
    ]

    REWRITE_STRESSED = pynini.closure(pynini.string_map(RU_STRESSED_MAP).optimize() | RU_ALPHA).optimize()
    TO_LATIN = pynini.string_file(get_abs_path("data/cyrillic_to_latin.tsv"))
    RU_ALPHA_OR_SPACE = pynini.union(RU_ALPHA, NEMO_SPACE, NEMO_NON_BREAKING_SPACE).optimize()

except (ModuleNotFoundError, ImportError):
    # Create placeholders
    RU_ALPHA = None
    LO_LATIN = None
コード例 #11
0
ファイル: date.py プロジェクト: mousebaiker/NeMo
    def __init__(self, number_names: dict, deterministic: bool):
        """
        Build the date classifier FST.

        A tagger graph (day / month / year fields) is composed with a verbalizer
        graph that strips those fields again; the result is wrapped in a single
        `day` field.

        Args:
            number_names: dict that provides the 'ordinal_number_names' FST
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="date",
                         kind="classify",
                         deterministic=deterministic)

        # Ru format: DD-MM-YYYY or DD-MM-YY
        month_abbr_to_names = pynini.string_file(
            get_abs_path("data/months/abbr_to_name.tsv")).optimize()

        # Field separators are replaced by a space; "." (weight 1.09) is
        # slightly cheaper than "/" or "-" (weight 1.1).
        delete_sep = pynutil.add_weight(pynini.cross(
            ".", " "), 1.09) | pynutil.add_weight(
                pynini.cross(pynini.union("/", "-"), " "), 1.1)

        numbers = number_names['ordinal_number_names']

        # A leading "0" is either dropped (weight -0.1) or read as "ноль "
        # (weight 0.1).
        zero = (pynutil.add_weight(pynini.cross("0", ""),
                                   -0.1)) | (pynutil.add_weight(
                                       pynini.cross("0", "ноль "), 0.1))
        zero_digit = zero + pynini.compose(NEMO_DIGIT, numbers)
        # Day: one digit, or two digits starting with 1, 2 or 3.
        digit_day = (pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT
        digit_day = pynini.compose(digit_day, numbers)
        day = (pynutil.insert("day: \"") + (zero_digit | digit_day) +
               pynutil.insert("\"")).optimize()

        # Month as a number: "0" + digit, or "1" + digit.
        digit_month = zero_digit | pynini.compose(
            pynini.accep("1") + NEMO_DIGIT, numbers)
        month_number_to_abbr = pynini.string_file(
            get_abs_path("data/months/numbers.tsv")).optimize()
        # Restrict the number-to-abbreviation lookup to one or two digits,
        # dropping a leading "0".
        month_number_to_abbr = (((
            (pynutil.add_weight(pynini.cross("0", ""), -0.1)
             | pynini.accep("1")) + NEMO_DIGIT) | NEMO_DIGIT).optimize()
                                @ month_number_to_abbr).optimize()

        # Month name reached from a month number, or directly from an
        # abbreviation (weight 0.1).
        month_name = ((month_number_to_abbr @ month_abbr_to_names) |
                      pynutil.add_weight(month_abbr_to_names, 0.1)).optimize()
        month = (pynutil.insert("month: \"") + (month_name | digit_month) +
                 pynutil.insert("\"")).optimize()
        # Year: four or two digits, or a zero-prefixed single digit.
        year = pynini.compose(((NEMO_DIGIT**4) | (NEMO_DIGIT**2)),
                              numbers).optimize()
        year |= zero_digit
        year_word_singular = ["год", "года", "году", "годом", "годе"]
        # NOTE(review): "годам" is listed twice below; the union is unaffected.
        year_word_plural = [
            "годы", "годов", "годам", "годами", "годам", "годах"
        ]

        # "г." expands to any singular form, "гг." to any plural form.
        year_word = pynini.cross("г.", pynini.union(*year_word_singular))
        year_word |= pynini.cross("гг.", pynini.union(*year_word_plural))
        year_word = (pynutil.add_weight(insert_space, -0.1)
                     | pynutil.add_weight(pynini.accep(" "), 0.1)) + year_word

        year_optional = pynutil.insert("year: \"") + year + pynini.closure(
            year_word, 0, 1) + pynutil.insert("\"")
        year_optional = pynini.closure(delete_sep + year_optional, 0,
                                       1).optimize()
        # A stand-alone year must be followed by the year word.
        year_only = pynutil.insert(
            "year: \"") + year + year_word + pynutil.insert("\"")

        tagger_graph = (day + delete_sep + month + year_optional) | year_only

        # Verbalizer
        # Each field graph deletes the field name and the surrounding quotes and
        # keeps the quoted value.
        day = (pynutil.delete("day:") + delete_space + pynutil.delete("\"") +
               pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
        month = (pynutil.delete("month:") + delete_space +
                 pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) +
                 pynutil.delete("\""))
        year = (pynutil.delete("year:") + delete_space + pynutil.delete("\"") +
                pynini.closure(NEMO_NOT_QUOTE, 1) + delete_space +
                pynutil.delete("\""))
        year_optional = pynini.closure(delete_extra_space + year, 0, 1)
        graph_dmy = day + delete_extra_space + month + year_optional
        verbalizer_graph = (graph_dmy | year) + delete_space

        self.final_graph = pynini.compose(tagger_graph,
                                          verbalizer_graph).optimize()
        # The composed (tagged, then verbalized) text is emitted as the value of
        # one `day` field.
        self.fst = pynutil.insert(
            "day: \"") + self.final_graph + pynutil.insert("\"")
        self.fst = self.add_tokens(self.fst).optimize()
コード例 #12
0
ファイル: electronic.py プロジェクト: manneh/NeMo
    def __init__(self, deterministic: bool = True):
        """
        Build the classifier FST for electronic addresses of the form
        username@domain.

        A tagger graph (username / domain fields) is composed with a verbalizer
        graph, Latin letters are converted through TO_CYRILLIC, and the result
        is wrapped in a single `username` field.

        Args:
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="electronic",
                         kind="classify",
                         deterministic=deterministic)

        # tagger
        accepted_symbols = []
        # The symbols file holds Cyrillic text, so decode it explicitly as
        # UTF-8 instead of relying on the platform default encoding.
        with open(get_abs_path("data/electronic/symbols.tsv"), 'r',
                  encoding="utf-8") as f:
            for line in f:
                symbol, _ = line.split('\t')
                accepted_symbols.append(pynini.accep(symbol))
        # Username: a letter followed by letters, digits or accepted symbols;
        # the "@" is replaced by a space.
        username = (pynutil.insert("username: \"") + NEMO_ALPHA +
                    pynini.closure(NEMO_ALPHA | NEMO_DIGIT
                                   | pynini.union(*accepted_symbols)) +
                    pynutil.insert("\"") + pynini.cross('@', ' '))
        # Domain: starts with a letter, ends with a letter or digit, and may
        # contain "-" and "." in between.
        domain_graph = (
            NEMO_ALPHA +
            (pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep('-')
                            | pynini.accep('.'))) + (NEMO_ALPHA | NEMO_DIGIT))
        domain_graph = pynutil.insert(
            "domain: \"") + domain_graph + pynutil.insert("\"")
        tagger_graph = (username + domain_graph).optimize()

        # verbalizer
        graph_digit = pynini.string_file(
            get_abs_path(
                "data/numbers/digits_nominative_case.tsv")).optimize()
        graph_symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).optimize()
        # The field prefix is stripped the same way as for `domain:` below
        # (restored from a garbled source line).
        user_name = (
            pynutil.delete("username:") + delete_space +
            pynutil.delete("\"") +
            (pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(
                    pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
            pynutil.delete("\""))

        # Fallback domain reading: characters spelled one by one, with "."
        # read as "точка ".
        domain_default = (pynini.closure(NEMO_NOT_QUOTE + insert_space) +
                          pynini.cross(".", "точка ") + NEMO_NOT_QUOTE +
                          pynini.closure(insert_space + NEMO_NOT_QUOTE))

        server_default = (pynini.closure(
            (graph_digit | NEMO_ALPHA) + insert_space, 1) +
                          pynini.closure(graph_symbols + insert_space) +
                          pynini.closure(
                              (graph_digit | NEMO_ALPHA) + insert_space, 1))
        server_common = pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")) + insert_space
        domain_common = pynini.cross(".", "точка ") + pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))
        # Known server and domain names (weight 1.09) are preferred over the
        # generic spell-out (weight 1.1).
        domain = (pynutil.delete("domain:") + delete_space +
                  pynutil.delete("\"") +
                  (pynutil.add_weight(server_common, 1.09)
                   | pynutil.add_weight(server_default, 1.1)) +
                  (pynutil.add_weight(domain_common, 1.09)
                   | pynutil.add_weight(domain_default, 1.1)) + delete_space +
                  pynutil.delete("\""))

        # "собака " is inserted between the user name and the domain.
        graph = user_name + delete_space + pynutil.insert(
            "собака ") + delete_space + domain + delete_space
        # replace all latin letters with their Ru verbalization
        verbalizer_graph = (graph.optimize() @ (pynini.closure(
            TO_CYRILLIC | RU_ALPHA | pynini.accep(" ")))).optimize()
        verbalizer_graph = verbalizer_graph.optimize()

        self.final_graph = (tagger_graph @ verbalizer_graph).optimize()
        self.fst = self.add_tokens(
            pynutil.insert("username: \"") + self.final_graph +
            pynutil.insert("\"")).optimize()
コード例 #13
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """
        Build the money classifier FST.

        A tagger graph (integer or decimal amount plus a `currency` field) is
        composed with a verbalizer graph that strips the field markup; the
        result is wrapped in a single `integer_part` field.

        Args:
            cardinal: cardinal GraphFst; its cardinal_numbers_default graph is used
            decimal: decimal GraphFst; its final_graph is used
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="money",
                         kind="classify",
                         deterministic=deterministic)
        cardinal_graph = cardinal.cardinal_numbers_default
        decimal_graph = decimal.final_graph

        unit_singular = pynini.string_file(
            get_abs_path("data/currency/currency_singular.tsv"))
        unit_plural = pynini.string_file(
            get_abs_path("data/currency/currency_plural.tsv"))

        # adding weight to make sure the space is preserved for ITN
        optional_delimiter = pynini.closure(
            pynutil.add_weight(pynini.cross(NEMO_SPACE, ""), -100), 0, 1)
        graph_unit_singular = (optional_delimiter +
                               pynutil.insert(" currency: \"") +
                               unit_singular + pynutil.insert("\""))
        graph_unit_plural = optional_delimiter + pynutil.insert(
            " currency: \"") + unit_plural + pynutil.insert("\"")

        # The amount "1" takes the singular currency form.
        one = pynini.compose(pynini.accep("1"), cardinal_graph).optimize()
        singular_graph = pynutil.insert(
            "integer_part: \"") + one + pynutil.insert(
                "\"") + graph_unit_singular

        graph_decimal = decimal_graph + graph_unit_plural

        # Every integer amount other than the literal "1" takes the plural form.
        graph_integer = (pynutil.insert("integer_part: \"") +
                         ((NEMO_SIGMA - "1") @ cardinal_graph) +
                         pynutil.insert("\"") + (graph_unit_plural))

        graph_integer |= singular_graph
        tagger_graph = (graph_integer.optimize()
                        | graph_decimal.optimize()).optimize()

        # verbalizer
        # `integer` keeps a quoted value and deletes the quotes around it.
        integer = pynutil.delete("\"") + pynini.closure(
            NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        integer_part = pynutil.delete("integer_part: ") + integer

        unit = (pynutil.delete("currency: ") + pynutil.delete("\"") +
                pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
        unit = pynini.accep(NEMO_SPACE) + unit

        verbalizer_graph_cardinal = (integer_part + unit).optimize()

        fractional_part = pynutil.delete("fractional_part: ") + integer
        optional_quantity = pynini.closure(
            pynini.accep(NEMO_SPACE) + pynutil.delete("quantity: ") + integer,
            0, 1)

        # Decimal amounts arrive wrapped in "decimal { ... }"; the wrapper is
        # removed and the currency follows.
        verbalizer_graph_decimal = (pynutil.delete('decimal { ') +
                                    integer_part + pynini.accep(" ") +
                                    fractional_part + optional_quantity +
                                    pynutil.delete(" }") + unit)

        verbalizer_graph = (verbalizer_graph_cardinal
                            | verbalizer_graph_decimal).optimize()

        self.final_graph = (tagger_graph @ verbalizer_graph).optimize()
        # The composed (tagged, then verbalized) text is emitted as the value of
        # one `integer_part` field.
        self.fst = self.add_tokens(
            pynutil.insert("integer_part: \"") + self.final_graph +
            pynutil.insert("\"")).optimize()
コード例 #14
0
ファイル: measure.py プロジェクト: quuhua911/NeMo
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """
        Build the measure classifier FST (number plus unit).

        A tagger graph (cardinal or decimal amount plus a `units` field) is
        composed with a verbalizer graph that strips the field markup; the
        result is wrapped in "cardinal { integer: ... }".

        Args:
            cardinal: cardinal GraphFst; its cardinal_numbers_default,
                cardinal_numbers_nominative and optional_graph_negative graphs
                are used
            decimal: decimal GraphFst; its final_graph is used
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="measure",
                         kind="classify",
                         deterministic=deterministic)

        # adding weight to make sure the space is preserved for ITN
        # NOTE(review): this local `delete_space` (optionally deletes one regular
        # or non-breaking space) may shadow a module-level name - confirm.
        delete_space = pynini.closure(
            pynutil.add_weight(
                pynutil.delete(
                    pynini.union(NEMO_SPACE, NEMO_NON_BREAKING_SPACE)), -1), 0,
            1)

        cardinal_graph = cardinal.cardinal_numbers_default
        cardinal_graph_nominative = cardinal.cardinal_numbers_nominative
        graph_unit = pynini.string_file(get_abs_path("data/measurements.tsv"))
        optional_graph_negative = cardinal.optional_graph_negative

        # Space placed between unit words: non-breaking (weight -0.1) is
        # preferred over a regular space (weight 0.1).
        space_for_units = (
            pynutil.add_weight(pynutil.insert(NEMO_NON_BREAKING_SPACE), -0.1)
            | pynutil.add_weight(pynutil.insert(NEMO_SPACE), 0.1)).optimize()
        # "/unit" is read as "в unit" or "за unit".
        slash_unit = (pynini.cross("/", "в")
                      | pynini.cross("/", "за")) + space_for_units + graph_unit

        unit_slash_unit = pynutil.add_weight(
            graph_unit + space_for_units + slash_unit, -0.1)
        default_units = pynutil.insert("units: \"") + (
            graph_unit | unit_slash_unit) + pynutil.insert("\"")
        slash_units = pynutil.insert(
            "units: \"") + slash_unit + pynutil.insert("\"")
        subgraph_decimal = decimal.final_graph + (
            (delete_space + default_units) | slash_units)

        # Cardinal amount followed either by default units (after an optional
        # space) or directly by "/unit".
        cardinal_space = (
            pynutil.insert("cardinal { ") + optional_graph_negative +
            pynutil.insert("integer: \"") + cardinal_graph +
            ((delete_space + pynutil.insert("\"") + pynutil.insert(" } ") +
              default_units)
             | (pynutil.insert("\"") + pynutil.insert(" } ") + slash_units)))

        # number[-]letters: the letters become the `units` value.
        cardinal_optional_dash_alpha = (
            pynutil.insert("cardinal { integer: \"") + cardinal_graph +
            pynini.closure(pynini.cross('-', ''), 0, 1) +
            pynutil.insert("\" } units: \"") + pynini.closure(RU_ALPHA, 1) +
            pynutil.insert("\""))

        # letters[-]number: units first, flagged with preserve_order.
        alpha_optional_dash_cardinal = (
            pynutil.insert("units: \"") + pynini.closure(RU_ALPHA, 1) +
            pynini.closure(pynini.cross('-', ''), 0, 1) +
            pynutil.insert("\"") + pynutil.insert(" cardinal { integer: \"") +
            cardinal_graph_nominative +
            pynutil.insert("\" } preserve_order: true"))

        # decimal-letters (the dash is required here).
        decimal_dash_alpha = (decimal.final_graph + pynini.cross('-', '') +
                              pynutil.insert(" units: \"") +
                              pynini.closure(RU_ALPHA, 1) +
                              pynutil.insert("\""))

        # letters-decimal (the dash is required here), flagged with preserve_order.
        alpha_dash_decimal = (pynutil.insert("units: \"") +
                              pynini.closure(RU_ALPHA, 1) +
                              pynini.cross('-', '') + pynutil.insert("\" ") +
                              decimal.final_graph +
                              pynutil.insert(" preserve_order: true"))

        self.tagger_graph_default = (subgraph_decimal
                                     | cardinal_space).optimize()

        tagger_graph = (self.tagger_graph_default
                        | cardinal_optional_dash_alpha
                        | alpha_optional_dash_cardinal
                        | decimal_dash_alpha
                        | alpha_dash_decimal).optimize()

        # verbalizer
        unit = pynutil.delete("units: \"") + pynini.closure(
            NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + delete_space

        optional_sign = pynini.closure(
            pynini.cross("negative: \"true\" ", "минус "), 0, 1)
        # `integer` keeps a quoted value and deletes the leading space and the
        # quotes around it.
        integer = pynutil.delete(" \"") + pynini.closure(
            NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        integer_part = pynutil.delete("integer_part:") + integer
        fractional_part = pynutil.delete("fractional_part:") + integer
        optional_quantity_part = pynini.closure(
            pynini.accep(" ") + pynutil.delete("quantity: \"") +
            pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""),
            0,
            1,
        )
        graph_decimal = optional_sign + integer_part + pynini.accep(
            " ") + fractional_part + optional_quantity_part

        # Strip the "decimal { ... }" wrapper.
        graph_decimal = pynutil.delete(
            "decimal {"
        ) + delete_space + graph_decimal + delete_space + pynutil.delete("}")

        # Strip the "cardinal { ... }" wrapper and the `integer` field markup.
        graph_cardinal = (pynutil.delete("cardinal {") + delete_space +
                          optional_sign + pynutil.delete("integer: \"") +
                          pynini.closure(NEMO_NOT_QUOTE, 1) +
                          pynutil.delete("\"") + delete_space +
                          pynutil.delete("}"))

        # Amount first, then unit.
        verbalizer_graph = (graph_cardinal |
                            graph_decimal) + delete_space + insert_space + unit

        # SH adds "preserve_order: true" by default
        preserve_order = pynutil.delete(
            "preserve_order:") + delete_space + pynutil.delete(
                "true") + delete_space
        # Unit first, then amount; a trailing preserve_order marker is removed.
        verbalizer_graph |= (unit + insert_space +
                             (graph_cardinal | graph_decimal) + delete_space +
                             pynini.closure(preserve_order, 0, 1))
        self.verbalizer_graph = verbalizer_graph.optimize()

        final_graph = (tagger_graph @ verbalizer_graph).optimize()
        # The composed (tagged, then verbalized) text is emitted as the value of
        # the `integer` field inside "cardinal { ... }".
        self.fst = self.add_tokens(
            pynutil.insert("cardinal { integer: \"") + final_graph +
            pynutil.insert("\" }")).optimize()
コード例 #15
0
ファイル: alphabet.py プロジェクト: quuhua911/NeMo
        ("Ы́", "Ы'"),
        ("Э́", "Э'"),
        ("Ю́", "Ю'"),
        ("Я́", "Я'"),
        ("а́", "а'"),
        ("е́", "е'"),
        ("ё́", "е'"),
        ("и́", "и'"),
        ("о́", "о'"),
        ("у́", "у'"),
        ("ы́", "ы'"),
        ("э́", "э'"),
        ("ю́", "ю'"),
        ("я́", "я'"),
        ("ё", "е"),
        ("Ё", "Е"),
    ]

    REWRITE_STRESSED = pynini.closure(
        pynini.string_map(RU_STRESSED_MAP).optimize() | RU_ALPHA).optimize()
    TO_CYRILLIC = pynini.string_file(
        get_abs_path("data/latin_to_cyrillic.tsv")).optimize()
    TO_LATIN = pynini.invert(TO_CYRILLIC).optimize()
    RU_ALPHA_OR_SPACE = pynini.union(RU_ALPHA, NEMO_SPACE,
                                     NEMO_NON_BREAKING_SPACE).optimize()

except (ModuleNotFoundError, ImportError):
    # Create placeholders
    RU_ALPHA = None
    LO_LATIN = None
コード例 #16
0
ファイル: time.py プロジェクト: quuhua911/NeMo
    def __init__(self, number_names: dict, deterministic: bool = True):
        """Build the time classification grammar for "HH:MM" style input.

        Three readings are combined into ``self.final_graph``:

        * hours followed by minutes (tagged ``preserve_order: true``),
        * minutes together with the next hour as an ordinal,
        * minutes remaining until the next hour as a cardinal.

        Args:
            number_names: mapping of number-name grammars; only the
                ``'cardinal_names_nominative'`` entry is read here.
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="time",
                         kind="classify",
                         deterministic=deterministic)

        increment_hour_ordinal = pynini.string_file(
            get_abs_path("data/time/increment_hour_ordinal.tsv"))
        increment_hour_cardinal = pynini.string_file(
            get_abs_path("data/time/increment_hour_cardinal.tsv"))
        convert_hour = pynini.string_file(
            get_abs_path("data/time/time_convert.tsv"))

        # Optionally drop one leading zero before spelling the number out.
        number = pynini.closure(pynini.cross("0", ""), 0,
                                1) + number_names['cardinal_names_nominative']
        hour_options = pynini.project(increment_hour_ordinal, "input")
        hour_options = hour_options | pynini.project(convert_hour, "output")

        # The noun form depends on the number: 01/21 take " час",
        # 02-04/22-23 take " часа", every other hour takes " часов".
        hour_exception_ends_with_one = pynini.union("01", "21")
        hour_exception_ends_rest = pynini.union("02", "03", "04", "22", "23")
        hour_other = pynini.difference(
            hour_options,
            pynini.union(hour_exception_ends_with_one,
                         hour_exception_ends_rest)).optimize()

        hour = hour_exception_ends_with_one @ number + pynutil.insert(" час")
        hour |= hour_exception_ends_rest @ number + pynutil.insert(" часа")
        hour |= hour_other @ number + pynutil.insert(" часов")

        optional_and = pynini.closure(pynutil.insert("и "), 0, 1)
        digits = pynini.union(*"0123456789")
        mins_start = pynini.union(*"012345")
        mins_options = mins_start + digits
        # Minutes ending in 1 take the singular " минута", except 11, which
        # takes the genitive plural " минут" just like 12-14. Removing "11"
        # here lets it fall through to ``mins_other`` below.
        mins_exception_ends_with_one = pynini.difference(
            mins_start + pynini.accep("1"), pynini.accep("11"))
        # Minutes ending in 2-4 take " минуты", except the teens 12-14.
        mins_exception_ends_rest = pynini.difference(
            mins_start + pynini.union(*"234"),
            pynini.union("12", "13", "14"))
        mins_other = pynini.difference(
            mins_options,
            pynini.union(mins_exception_ends_with_one,
                         mins_exception_ends_rest))

        minutes = mins_exception_ends_with_one @ number + pynutil.insert(
            " минута")
        minutes |= mins_exception_ends_rest @ number + pynutil.insert(
            " минуты")
        minutes |= mins_other @ number + pynutil.insert(" минут")
        self.minutes = minutes.optimize()

        # 17:15 -> "семнадцать часов и пятнадцать минут"
        hm = (pynutil.insert("hours: \"") + hour.optimize() +
              pynutil.insert("\"") +
              (pynini.cross(":", " ") + pynutil.insert("minutes: \"") +
               optional_and + minutes.optimize()) + pynutil.insert("\"") +
              pynutil.insert(" preserve_order: true"))
        # Whole hours: the ":00" suffix is consumed and no minutes are emitted.
        h = pynutil.insert("hours: \"") + hour + pynutil.insert(
            "\"") + pynutil.delete(":00")
        self.graph_preserve_order = (hm | h).optimize()

        # 17:15 -> "пятнадцать минут шестого"
        # Requires permutations for the correct verbalization
        self.increment_hour_ordinal = pynini.compose(
            hour_options, increment_hour_ordinal).optimize()
        m_next_h = (pynutil.insert("hours: \"") + self.increment_hour_ordinal +
                    pynutil.insert("\"") + pynini.cross(":", " ") +
                    pynutil.insert("minutes: \"") + minutes +
                    pynutil.insert("\""))

        # 17:45 -> "без пятнадцати минут шесть"
        # Requires permutations for the correct verbalization
        self.mins_to_h = pynini.string_file(
            get_abs_path("data/time/minutes_to_hour.tsv")).optimize()
        self.increment_hour_cardinal = pynini.compose(
            hour_options, increment_hour_cardinal).optimize()
        m_to_h = (pynutil.insert("hours: \"") + self.increment_hour_cardinal +
                  pynutil.insert("\"") + pynini.cross(":", " ") +
                  pynutil.insert("minutes: \"без ") + self.mins_to_h +
                  pynutil.insert("\""))

        self.final_graph = m_next_h | self.graph_preserve_order | m_to_h
        self.fst = self.add_tokens(self.final_graph)
        self.fst = self.fst.optimize()