Ejemplo n.º 1
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """Build the tagger FST for written clock times.

        Accepted shapes (suffix = entry of ``data/time_suffix.tsv``, zone = entry
        of ``data/time_zone.tsv``):

        * ``H:MM`` with optional suffix and zone, e.g. "2:30 pm", "02:30", "2:00"
        * ``H:MM:SS`` with optional suffix and zone, e.g. "10:30:05 pm"
        * ``H.MM`` followed by a mandatory suffix, e.g. "2.30 pm"
        * ``H`` followed by a mandatory suffix, e.g. "2 pm est"

        The result is stored in ``self.fst``.

        Args:
            cardinal: cardinal grammar; its ``graph`` attribute is composed with
                the digit strings to produce the emitted field values.
            deterministic: forwarded unchanged to the base-class constructor.
        """
        super().__init__(name="time", kind="classify", deterministic=deterministic)

        suffix_graph = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
        time_zone_graph = pynini.string_file(get_abs_path("data/time_zone.tsv"))
        number_graph = cardinal.graph

        def quoted_field(name, body):
            # Emit `name: "` before and `"` after whatever `body` produces.
            return pynutil.insert(name + ": \"") + body + pynutil.insert("\"")

        hour_labels = [str(h) for h in range(24)]
        one_digit_labels = [str(m) for m in range(1, 10)]
        two_digit_labels = [str(m) for m in range(10, 60)]

        # Two digits are kept as they are; a single digit may carry a leading
        # "0", which is deleted.
        strip_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT)

        hour_value = (strip_leading_zero @ pynini.union(*hour_labels)) @ number_graph
        one_digit_value = pynini.union(*one_digit_labels) @ number_graph
        two_digit_value = pynini.union(*two_digit_labels) @ number_graph

        # "0" + digit is rendered as "o " + <digit reading>; 10-59 go straight
        # through the cardinal graph. Minutes and seconds share this shape.
        sub_hour_value = (pynini.cross("0", "o") + insert_space + one_digit_value) | two_digit_value

        hours = quoted_field("hours", hour_value)
        minutes = quoted_field("minutes", sub_hour_value)
        seconds = quoted_field("seconds", sub_hour_value)
        suffix = quoted_field("suffix", convert_space(suffix_graph))

        required_suffix = delete_space + insert_space + suffix
        optional_suffix = pynini.closure(required_suffix, 0, 1)
        optional_zone = pynini.closure(
            delete_space + insert_space + quoted_field("zone", convert_space(time_zone_graph)),
            0,
            1,
        )

        # In the hour/minute-only forms "00" minutes are deleted without output.
        minutes_unless_zero = pynutil.delete("00") | (insert_space + minutes)

        # 2:30 pm, 02:30, 2:00
        colon_hm = hours + pynutil.delete(":") + minutes_unless_zero + optional_suffix + optional_zone

        # 10:30:05 pm -- here "00" is emitted as the literal value "zero".
        colon_hms = (
            hours
            + pynutil.delete(":")
            + (pynini.cross("00", " minutes: \"zero\"") | (insert_space + minutes))
            + pynutil.delete(":")
            + (pynini.cross("00", " seconds: \"zero\"") | (insert_space + seconds))
            + optional_suffix
            + optional_zone
        )

        # 2.xx pm/am -- suffix is mandatory
        dot_hm = hours + pynutil.delete(".") + minutes_unless_zero + required_suffix + optional_zone

        # 2 pm est -- suffix is mandatory
        hour_only = hours + required_suffix + optional_zone

        tagged = pynini.union(colon_hm, hour_only, dot_hm, colon_hms).optimize()
        self.fst = self.add_tokens(tagged).optimize()
Ejemplo n.º 2
0
        ("Ё́", "Е'"),
        ("И́", "И'"),
        ("О́", "О'"),
        ("У́", "У'"),
        ("Ы́", "Ы'"),
        ("Э́", "Э'"),
        ("Ю́", "Ю'"),
        ("Я́", "Я'"),
        ("а́", "а'"),
        ("е́", "е'"),
        ("ё́", "е'"),
        ("и́", "и'"),
        ("о́", "о'"),
        ("у́", "у'"),
        ("ы́", "ы'"),
        ("э́", "э'"),
        ("ю́", "ю'"),
        ("я́", "я'"),
        ("ё", "е"),
        ("Ё", "Е"),
    ]

    # Accepts any sequence of symbols in which each symbol is either an entry of
    # RU_STRESSED_MAP (rewritten to its mapped form) or an RU_ALPHA symbol (passed through).
    # NOTE(review): RU_STRESSED_MAP is presumably the pair list that ends just above -- confirm.
    REWRITE_STRESSED = pynini.closure(pynini.string_map(RU_STRESSED_MAP).optimize() | RU_ALPHA).optimize()
    # Mapping loaded from a TSV file; judging by the file name it transliterates
    # Cyrillic to Latin -- verify against the data file.
    TO_LATIN = pynini.string_file(get_abs_path("data/cyrillic_to_latin.tsv"))
    # One symbol that is an RU_ALPHA symbol, a plain space, or a non-breaking space.
    RU_ALPHA_OR_SPACE = pynini.union(RU_ALPHA, NEMO_SPACE, NEMO_NON_BREAKING_SPACE).optimize()

except (ModuleNotFoundError, ImportError):
    # Create placeholders
    RU_ALPHA = None
    LO_LATIN = None
Ejemplo n.º 3
0
    def __init__(self):
        """Build the tagger FST that turns spoken English times into tagged digits.

        Three phrase shapes are accepted, each optionally followed by a suffix
        (``data/time/time_suffix.tsv``) and a time zone
        (``data/time/time_zone.tsv``, inverted):

        * ``<hour> <minutes>`` where the minutes part may be empty, an
          "o'clock" variant, "o <digit>", or a number from ten to fifty-nine
        * ``<minutes | quarter | half> past <hour>``
        * ``quarter to/till <hour>`` (minutes 45, hour looked up in
          ``data/time/time_to.tsv``)

        The emitted fields are ``hours``, ``minutes``, ``suffix`` and ``zone``;
        the result is stored in ``self.fst``.
        """
        super().__init__(name="time", kind="classify")

        suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
        time_zone_graph = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
        time_to_graph = pynini.string_file(get_abs_path("data/time/time_to.tsv"))

        # Cardinal graph without the small-number exception, given a bonus weight.
        cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

        def quoted_field(name, body):
            # Emit `name: "` before and `"` after whatever `body` produces.
            return pynutil.insert(name + ": \"") + body + pynutil.insert("\"")

        hour_words = [num_to_word(n) for n in range(24)]
        single_minute_words = [num_to_word(n) for n in range(1, 10)]
        double_minute_words = [num_to_word(n) for n in range(10, 60)]

        hour_value = pynini.union(*hour_words) @ cardinal
        single_minute = pynini.union(*single_minute_words) @ cardinal
        double_minute = pynini.union(*double_minute_words) @ cardinal
        named_fraction = pynini.union(pynini.cross("half", "30"), pynini.cross("quarter", "15"))
        oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

        hours = quoted_field("hours", hour_value)
        minutes_after_hour = quoted_field(
            "minutes",
            pynini.union(
                pynutil.insert("00"),  # nothing spoken after the hour
                oclock + pynutil.insert("00"),  # five o'clock
                pynutil.delete("o") + delete_space + single_minute,  # two o eight
                double_minute,  # two thirty five
            ),
        )
        suffix = quoted_field("suffix", convert_space(suffix_graph))
        optional_suffix = pynini.closure(delete_space + insert_space + suffix, 0, 1)
        optional_zone = pynini.closure(
            delete_space + insert_space + quoted_field("zone", convert_space(time_zone_graph)),
            0,
            1,
        )

        # five o' clock / two o eight / two thirty five / two (pm/am)
        hour_then_minutes = hours + delete_extra_space + minutes_after_hour

        # ten past four, quarter past four, half past four
        minutes_past_hour = (
            quoted_field("minutes", pynini.union(single_minute, double_minute, named_fraction))
            + delete_space
            + pynutil.delete("past")
            + delete_extra_space
            + hours
        )

        # quarter to/till <hour>
        quarter_to_hour = (
            quoted_field("minutes", pynini.cross("quarter", "45"))
            + delete_space
            + pynutil.delete(pynini.union("to", "till"))
            + delete_extra_space
            + quoted_field("hours", time_to_graph)
        )

        tagged = (
            pynini.union(hour_then_minutes, minutes_past_hour, quarter_to_hour)
            + optional_suffix
            + optional_zone
        ).optimize()

        self.fst = self.add_tokens(tagged).optimize()
Ejemplo n.º 4
0
    # Character classes: each is a union of individual symbols.
    NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
    NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
    NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
    NEMO_HEX = pynini.union(*string.hexdigits).optimize()
    # These two are plain Python strings, not compiled FSTs.
    NEMO_NON_BREAKING_SPACE = u"\u00A0"
    NEMO_SPACE = " "
    # Space, tab, newline, carriage return and non-breaking space.
    NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r",
                                    u"\u00A0").optimize()
    # NEMO_CHAR minus the whitespace symbols / minus the double quote.
    NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
    NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

    # Every character of string.punctuation, passed through pynini.escape
    # before being compiled.
    NEMO_PUNCT = pynini.union(
        *map(pynini.escape, string.punctuation)).optimize()
    NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

    # Zero or more NEMO_CHAR symbols.
    NEMO_SIGMA = pynini.closure(NEMO_CHAR)

    # Deletes zero or more whitespace symbols.
    delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
    # Inserts a single space without consuming input.
    insert_space = pynutil.insert(" ")
    # Replaces a run of one or more whitespace symbols with a single space.
    delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
    # Removes any number of ` preserve_order: true` / ` field_order: "..."` markers.
    # NOTE(review): between the deleted ` field_order: "` and the closing quote,
    # NEMO_NOT_QUOTE is used once and without pynutil.delete, so it looks like
    # exactly one non-quote symbol is matched and kept -- confirm this is intended.
    delete_preserve_order = pynini.closure(
        pynutil.delete(" preserve_order: true")
        | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE +
           pynutil.delete("\"")))

    # Mapping loaded from data/suppletive.tsv.
    suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
    # _v = pynini.union("a", "e", "i", "o", "u")
    # Lower-case ASCII consonants (everything except a, e, i, o, u).
    _c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",
                      "p", "q", "r", "s", "t", "v", "w", "x", "y", "z")
    # Any string ending in consonant + "y": the final "y" is rewritten to "ies".
    _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
    _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x",
Ejemplo n.º 5
0
    def __init__(
        self,
        input_case: str,
        deterministic: bool = False,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        """Assemble (or restore from cache) the Russian tokenize-and-classify FST.

        Args:
            input_case: forwarded to ``WhiteListFst``; also part of the cache file name.
            deterministic: must be falsy -- a truthy value raises ``ValueError``.
            cache_dir: directory for the ``.far`` cache file. ``None`` or the
                string ``"None"`` disables caching. The directory is created if missing.
            overwrite_cache: when true, an existing cache file is ignored and rebuilt.
            whitelist: path forwarded to ``WhiteListFst`` as ``input_file``; its
                base name is part of the cache file name.

        Raises:
            ValueError: if ``deterministic`` is truthy.
        """
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
        if deterministic:
            raise ValueError(
                'Ru TN only supports non-deterministic cases and produces multiple normalization options.'
            )

        caching_enabled = cache_dir is not None and cache_dir != "None"
        far_file = None
        if caching_enabled:
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir, f"_{input_case}_ru_tn_{deterministic}_deterministic{whitelist_file}.far"
            )

        # Cache hit: load the stored grammar and skip the expensive build.
        if far_file and not overwrite_cache and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
            return

        logging.info("Creating ClassifyFst grammars. This might take some time...")
        number_names = get_number_names()
        alternative_formats = get_alternative_formats()

        # Sub-grammars are constructed in a fixed order; later ones reuse earlier ones.
        self.cardinal = CardinalFst(
            number_names=number_names, alternative_formats=alternative_formats, deterministic=deterministic
        )
        self.ordinal = OrdinalFst(
            number_names=number_names, alternative_formats=alternative_formats, deterministic=deterministic
        )
        self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
        self.measure = MeasureFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
        self.date = DateFst(number_names=number_names, deterministic=deterministic)
        word_graph = WordFst(deterministic=deterministic).fst
        self.time = TimeFst(number_names=number_names, deterministic=deterministic)
        self.telephone = TelephoneFst(number_names=number_names, deterministic=deterministic)
        self.electronic = ElectronicFst(deterministic=deterministic)
        self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
        self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
        punct_graph = PunctuationFst(deterministic=deterministic).fst

        # (grammar, weight) pairs; the plain-word fallback carries by far the largest weight.
        weighted_grammars = (
            (self.whitelist.fst, 1.01),
            (self.time.fst, 1.1),
            (self.date.fst, 1.09),
            (self.decimal.fst, 1.1),
            (self.measure.fst, 0.9),
            (self.cardinal.fst, 1.1),
            (self.ordinal.fst, 1.1),
            (self.money.fst, 1.1),
            (self.telephone.fst, 1.1),
            (self.electronic.fst, 1.1),
            (word_graph, 100),
        )
        classify = pynini.union(*(pynutil.add_weight(fst, weight) for fst, weight in weighted_grammars))

        open_token = pynutil.insert("tokens { ")
        close_token = pynutil.insert(" }")
        punct = open_token + pynutil.add_weight(punct_graph, weight=1.1) + close_token
        token = open_token + classify + close_token
        token_plus_punct = (
            pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
        )

        # One token group, then any number of further groups separated by whitespace;
        # whitespace at either end of the input is deleted.
        sentence = token_plus_punct + pynini.closure(pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct)
        sentence = delete_space + sentence + delete_space

        self.fst = sentence.optimize()

        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Ejemplo n.º 6
0
    def __init__(self):
        """Build the tagger FST that converts spelled-out English cardinals to digits.

        Instance attributes set here:

        * ``graph_hundred_component_at_least_one_none_zero_digit`` -- a
          three-digit group containing at least one non-zero digit
        * ``graph_no_exception`` -- the full number graph
        * ``graph`` -- the same graph with the words for 0..12 removed from
          its input side
        * ``fst`` -- the final tagger, with an optional "minus" prefix
        """
        super().__init__(name="cardinal", kind="classify")

        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

        # Hundreds digit: "<digit> hundred" or an inserted "0".
        hundreds_digit = pynini.union(
            graph_digit + delete_space + pynini.cross("hundred", ""),
            pynutil.insert("0"),
        )
        # Last two digits: a teen (or inserted "00"), or ties + digit with "0" filling any gap.
        last_two_digits = pynini.union(
            graph_teen | pynutil.insert("00"),
            (graph_ties | pynutil.insert("0")) + delete_space + (graph_digit | pynutil.insert("0")),
        )
        graph_hundred_component = hundreds_digit + delete_space + last_two_digits

        nonzero_hundred_component = graph_hundred_component @ (
            pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
        )
        self.graph_hundred_component_at_least_one_none_zero_digit = nonzero_hundred_component

        def magnitude_group(scale_word):
            # "<1..999> <scale_word>", or "000" inserted (at a small cost) when the group is absent.
            return pynini.union(
                nonzero_hundred_component + delete_space + pynutil.delete(scale_word),
                pynutil.insert("000", weight=0.1),
            )

        # Groups are chained from the largest scale down to the plain hundreds group.
        number = magnitude_group("sextillion")
        for scale_word in ("quintillion", "quadrillion", "trillion", "billion", "million", "thousand"):
            number = number + delete_space + magnitude_group(scale_word)
        number = number + delete_space + graph_hundred_component

        graph = pynini.union(number, graph_zero)

        # Strip leading zeros, keeping a lone "0".
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"
        )

        exception_words = [num_to_word(n) for n in range(0, 13)]
        graph_exception = pynini.union(*exception_words)

        # Drop an "and" standing between two spaces before the number graph sees the input.
        graph = pynini.cdrewrite(pynutil.delete("and"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA) @ graph

        self.graph_no_exception = graph

        self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph

        optional_minus = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("minus", "\"-\"") + NEMO_SPACE, 0, 1
        )

        tagged = optional_minus + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")

        self.fst = self.add_tokens(tagged).optimize()
Ejemplo n.º 7
0
def lg_containing_str(x, i):
    """Return the minimized language of strings containing ``i`` copies of ``x`` in a row.

    The result is ``sigmaStar + x^i + sigmaStar``, minimized.

    Args:
        x: the symbol/string (anything pynini accepts as an FST) to look for.
        i: exact number of consecutive repetitions of ``x`` required.
    """
    # Use the `x` argument here: the previous body hard-coded the module-level
    # symbol `b`, so the argument was silently ignored.
    return (sigmaStar + pynini.closure(x, i, i) + sigmaStar).minimize()
Ejemplo n.º 8
0
def rewrite(cardinal: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Puts a hyphen-normalizing front end on a French cardinal FST.

    Traditional orthography leaves spaces around the words listed below
    (numbers > 100); reformed orthography hyphenates every word of the number:
        deux mille cent vingt-trois -> deux-mille-cent-vingt-trois

    Two kinds of input are passed on to `cardinal`:
      * input containing no whitespace at all, unchanged;
      * input in which none of the listed words touches a hyphen; every space
        directly before or after such a word is turned into a hyphen.
    Input mixing both orthographies (one of the listed words next to a hyphen
    while spaces are present) has that span replaced by '#' and is then dropped
    by the '#'-free restriction, e.g. "deux-mille cent-vingt-trois".

    Args:
        cardinal: cardinal FST

    Returns:
        the normalizing front end composed with `cardinal`
    """
    # Words around which traditional orthography writes a space instead of a hyphen.
    joinable_words = pynini.string_map([
        "et",  # for 'et un/onze'
        "cent",
        "mille",
        "million",
        "milliard",
        "billion",
        "billiard",
        "trillion",
        "trilliard",
    ])
    joinable_words += pynini.accep("s").ques  # optional plural "s"

    # Numbers already in reformed orthography contain no spaces.
    already_hyphenated = pynini.closure(NEMO_NOT_SPACE)

    # A listed word touching a hyphen does not occur in traditional orthography;
    # such spans are replaced by a marker character.
    hyphen_adjacent = pynini.union(
        "-" + joinable_words,
        "-" + joinable_words + "-",
        joinable_words + "-",
    )
    mark_mixed = pynini.cdrewrite(pynini.cross(hyphen_adjacent, "#"), "", "",
                                  NEMO_SIGMA)

    # Alphabet without the marker. closure() is called for its in-place effect
    # on the FST (its return value is not used).
    marker_free = pynini.difference(NEMO_CHAR, "#")
    marker_free.closure()

    # Keep only strings in which nothing was marked. (This avoids tying two
    # unrelated numbers together.)
    mark_mixed @= marker_free

    # From here on, spaces are only replaced directly next to a listed word.
    hyphen_before_word = pynini.cdrewrite(pynini.cross(" ", "-"), "",
                                          joinable_words, NEMO_SIGMA)
    hyphen_after_word = pynini.cdrewrite(pynini.cross(" ", "-"),
                                         joinable_words, "", NEMO_SIGMA)
    hyphenate = hyphen_before_word @ hyphen_after_word

    normalizer = already_hyphenated | (mark_mixed @ hyphenate)

    return normalizer @ cardinal
Ejemplo n.º 9
0
    def __init__(self, number_names: dict, deterministic: bool):
        """Build the Russian date grammar (tagger composed with its verbalizer).

        The tagger accepts ``day <sep> month [<sep> year]`` where ``<sep>`` is
        ".", "/" or "-", or a year followed by a year word ("г." / "гг.").
        Its output is immediately passed through a verbalizer that strips the
        ``day:``/``month:``/``year:`` markup again; the composed graph is kept
        in ``self.final_graph``. ``self.fst`` wraps that result in a single
        ``day: "..."`` field and adds the token markup.

        Args:
            number_names: dict; its ``'ordinal_number_names'`` entry is used to
                read day, month and year digits.
            deterministic: forwarded unchanged to the base-class constructor.
        """
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        def quoted_field(name, body):
            # Emit `name: "` before and `"` after whatever `body` produces.
            return pynutil.insert(name + ": \"") + body + pynutil.insert("\"")

        # Ru format: DD-MM-YYYY or DD-MM-YY
        abbr_to_month_name = pynini.string_file(get_abs_path("data/months/abbr_to_name.tsv")).optimize()

        # Every separator becomes a space; "." is weighted slightly lower than "/" and "-".
        separator = pynini.union(
            pynutil.add_weight(pynini.cross(".", " "), 1.09),
            pynutil.add_weight(pynini.cross(pynini.union("/", "-"), " "), 1.1),
        )

        ordinals = number_names['ordinal_number_names']

        # A leading "0" is either dropped (weight -0.1) or read out as "ноль " (weight 0.1).
        leading_zero = pynini.union(
            pynutil.add_weight(pynini.cross("0", ""), -0.1),
            pynutil.add_weight(pynini.cross("0", "ноль "), 0.1),
        )
        zero_then_digit = leading_zero + pynini.compose(NEMO_DIGIT, ordinals)

        # ---- Tagger: day ----
        day_digits = (pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT
        day_value = pynini.compose(day_digits, ordinals)
        tagged_day = quoted_field("day", zero_then_digit | day_value).optimize()

        # ---- Tagger: month, either read as a number or replaced by its name ----
        month_as_number = zero_then_digit | pynini.compose(pynini.accep("1") + NEMO_DIGIT, ordinals)
        number_to_abbr = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()
        month_digits = (
            ((pynutil.add_weight(pynini.cross("0", ""), -0.1) | pynini.accep("1")) + NEMO_DIGIT) | NEMO_DIGIT
        ).optimize()
        number_to_abbr = (month_digits @ number_to_abbr).optimize()
        month_as_name = (
            (number_to_abbr @ abbr_to_month_name) | pynutil.add_weight(abbr_to_month_name, 0.1)
        ).optimize()
        tagged_month = quoted_field("month", month_as_name | month_as_number).optimize()

        # ---- Tagger: year (four or two digits, or "0" + digit) and the year word ----
        year_value = pynini.compose(((NEMO_DIGIT**4) | (NEMO_DIGIT**2)), ordinals).optimize()
        year_value |= zero_then_digit
        singular_year_forms = ["год", "года", "году", "годом", "годе"]
        plural_year_forms = [
            "годы", "годов", "годам", "годами", "годам", "годах"
        ]

        year_word = pynini.cross("г.", pynini.union(*singular_year_forms))
        year_word |= pynini.cross("гг.", pynini.union(*plural_year_forms))
        # A space before the year word is inserted (preferred) or accepted from the input.
        year_word = (
            pynutil.add_weight(insert_space, -0.1) | pynutil.add_weight(pynini.accep(" "), 0.1)
        ) + year_word

        tagged_year_in_date = pynini.closure(
            separator + quoted_field("year", year_value + pynini.closure(year_word, 0, 1)), 0, 1
        ).optimize()
        tagged_year_only = quoted_field("year", year_value + year_word)

        tagger_graph = (tagged_day + separator + tagged_month + tagged_year_in_date) | tagged_year_only

        # ---- Verbalizer: strip the field markup again, keeping only the quoted content ----
        field_content = pynini.closure(NEMO_NOT_QUOTE, 1)
        spoken_day = (
            pynutil.delete("day:") + delete_space + pynutil.delete("\"") + field_content + pynutil.delete("\"")
        )
        spoken_month = (
            pynutil.delete("month:") + delete_space + pynutil.delete("\"") + field_content + pynutil.delete("\"")
        )
        spoken_year = (
            pynutil.delete("year:") + delete_space + pynutil.delete("\"") + field_content + delete_space
            + pynutil.delete("\"")
        )
        spoken_year_in_date = pynini.closure(delete_extra_space + spoken_year, 0, 1)
        spoken_dmy = spoken_day + delete_extra_space + spoken_month + spoken_year_in_date
        verbalizer_graph = (spoken_dmy | spoken_year) + delete_space

        self.final_graph = pynini.compose(tagger_graph, verbalizer_graph).optimize()
        # The whole verbalized date is emitted inside one `day` field.
        self.fst = pynutil.insert("day: \"") + self.final_graph + pynutil.insert("\"")
        self.fst = self.add_tokens(self.fst).optimize()
Ejemplo n.º 10
0
    # Character classes: each is a union of individual symbols.
    NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
    NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
    NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
    NEMO_HEX = pynini.union(*string.hexdigits).optimize()
    # These two are plain Python strings, not compiled FSTs.
    NEMO_NON_BREAKING_SPACE = u"\u00A0"
    NEMO_SPACE = " "
    # Space, tab, newline, carriage return and non-breaking space.
    NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r",
                                    u"\u00A0").optimize()
    # NEMO_CHAR minus the whitespace symbols / minus the double quote.
    NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
    NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

    # Every character of string.punctuation, passed through pynini.escape
    # before being compiled.
    NEMO_PUNCT = pynini.union(
        *map(pynini.escape, string.punctuation)).optimize()
    NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

    # Zero or more NEMO_CHAR symbols.
    NEMO_SIGMA = pynini.closure(NEMO_CHAR)

    # Deletes zero or more whitespace symbols.
    delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
    # Inserts a single space without consuming input.
    insert_space = pynutil.insert(" ")
    # Replaces a run of one or more whitespace symbols with a single space.
    delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")

    # Mapping loaded from data/suppletive.tsv.
    suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
    # Plural ending n/en: masculine nouns with the endings e, ent, and, ant, ist, or
    # Any string ending in "e": append "n".
    _n = NEMO_SIGMA + pynini.union("e") + pynutil.insert("n")
    # Any string ending in one of the listed suffixes: append "en".
    _en = (NEMO_SIGMA +
           pynini.union("ent", "and", "ant", "ist", "or", "ion", "ik", "heit",
                        "keit", "schaft", "tät", "ung") + pynutil.insert("en"))
    # Any string ending in "in": append either "e" or "nen" (both alternatives are produced).
    _nen = NEMO_SIGMA + pynini.union("in") + (pynutil.insert("e")
                                              | pynutil.insert("nen"))
    # Any string ending in "ma", "um" or "us": append "en" (the ending itself is kept).
    _fremd = NEMO_SIGMA + pynini.union("ma", "um", "us") + pynutil.insert("en")
    # Masculine nouns with the endings eur, ich, ier, ig, ling, ör
Ejemplo n.º 11
0
    def __init__(self):
        """Build the tagger FST that converts spelled-out French cardinals to digits.

        Besides ``self.fst`` the following graphs are stored on the instance:
        ``graph_hundreds_component_at_least_one_none_zero_digit``,
        ``graph_no_exception``, ``numbers_up_to_thousand``,
        ``numbers_up_to_million`` and ``graph``.
        """
        super().__init__(name="cardinal", kind="classify")
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_teens = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_ties_unique = pynini.string_file(
            get_abs_path("data/numbers/ties_unique.tsv"))

        # Tens components
        # ties followed by "-digit", or by an inserted "0" when no digit follows
        graph_tens_component = graph_ties + (
            (delete_hyphen + graph_digit) | pynutil.insert("0"))
        graph_tens_component = pynini.union(graph_tens_component, graph_teens,
                                            graph_ties_unique)

        # Always two digits: a tens component, or "0" + digit, or "00"
        # (the inserted second zero carries a small weight).
        graph_tens_component_with_leading_zeros = pynini.union(
            graph_tens_component,
            (pynutil.insert("0") +
             (graph_digit | pynutil.insert("0", weight=0.01))))

        # Hundreds components
        graph_cent_singular = pynutil.delete("cent")  # Used in hundreds place
        graph_cent_plural = pynini.cross(
            "cents", "00"
        )  # Only used as terminus of hundred sequence. deux cents -> 200, deux cent un -> 201

        # graph_digit with "un"/"une" removed from its input side
        graph_digit_no_one = pynini.project(pynini.union("un", "une"), 'input')
        graph_digit_no_one = (pynini.project(graph_digit, "input") -
                              graph_digit_no_one.arcsort()) @ graph_digit

        graph_hundreds_component_singular = (
            graph_digit_no_one + delete_hyphen + graph_cent_singular
        )  # Regular way: [1-9] * 100

        # A bare "cent" stands for a hundreds digit of 1.
        graph_hundreds_component_singular = pynini.union(
            graph_hundreds_component_singular, pynini.cross("cent", "1"))
        graph_hundreds_component_singular += delete_hyphen
        graph_hundreds_component_singular += graph_tens_component_with_leading_zeros

        graph_hundreds_component_plural = graph_digit_no_one + delete_hyphen + graph_cent_plural

        # Three digits: "<digit> cent <tens>", "<digit> cents", or "0" + two digits.
        graph_hundreds_component = pynini.union(
            graph_hundreds_component_singular,
            graph_hundreds_component_plural,
            pynutil.insert("0") + graph_tens_component_with_leading_zeros,
        )

        # Restrict the output to digit strings containing at least one non-zero digit.
        graph_hundreds_component_at_least_one_none_zero_digit = graph_hundreds_component @ (
            pynini.closure(NEMO_DIGIT) +
            (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))
        # The stored copy is additionally wrapped by rewrite(); the local one is not.
        self.graph_hundreds_component_at_least_one_none_zero_digit = rewrite(
            graph_hundreds_component_at_least_one_none_zero_digit).optimize()

        # Graph thousands (we'll need this for cases of mille millions, mille milliards...)
        graph_tens_of_hundreds_component_singular = (
            graph_tens_component + delete_hyphen + graph_cent_singular
        )  # Tens of hundreds, e.g. 1900 = "nineteen hundred" / 'dix neuf cents'
        graph_tens_of_hundreds_component_singular += delete_hyphen + graph_tens_component_with_leading_zeros
        graph_tens_of_hundreds_component_plural = graph_tens_component + delete_hyphen + graph_cent_plural
        graph_tens_of_hundred_component = (
            graph_tens_of_hundreds_component_plural
            | graph_tens_of_hundreds_component_singular)

        # Each magnitude group below is either "<1..999> <scale word>" or an
        # inserted "000" (weight 0.1) when that group is absent.
        graph_thousands = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen + pynutil.delete("mille"),
            pynutil.insert("001") +
            pynutil.delete("mille"),  # because 'mille', not 'un mille'
            pynutil.insert("000", weight=0.1),
        )

        # All other large amounts
        graph_millions = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("million") | pynutil.delete("millions")),
            pynutil.insert("000", weight=0.1),
        )

        graph_milliards = pynini.union(  # French for English 'billion'
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("milliard") | pynutil.delete("milliards")),
            pynutil.insert("000", weight=0.1),
        )

        graph_billions = pynini.union(  # NOTE: this is English 'trillion.'
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("billions") | pynutil.delete("billion")),
            pynutil.insert("000", weight=0.1),
        )

        # NOTE(review): graph_mille_billion is built here but is not referenced
        # again anywhere in this method (it is not part of the final union
        # below) -- confirm whether it is dead code or was meant to be included.
        graph_mille_billion = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen + pynutil.delete("mille"),
            pynutil.insert("001") +
            pynutil.delete("mille"),  # because we say 'mille', not 'un mille'
        )
        graph_mille_billion += delete_hyphen + (
            graph_millions | pynutil.insert("000") + pynutil.delete("billions")
        )  # NOTE(review): this comment used to read "allow for 'mil millones'" (Spanish); presumably 'mille millions' is meant
        graph_mille_billion |= pynutil.insert("000000", weight=0.1)

        graph_billiards = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("billiards") | pynutil.delete("billiard")),
            pynutil.insert("000", weight=0.1),
        )

        graph_trillions = pynini.union(  # One thousand English trillions.
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("trillions") | pynutil.delete("trillion")),
            pynutil.insert("000", weight=0.1),
        )

        graph_trilliards = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("trilliards") | pynutil.delete("trilliard")),
            pynutil.insert("000", weight=0.1),
        )

        # Full number: all magnitude groups from the largest scale down to the
        # hundreds group; alternatively a "tens of hundreds" form or zero.
        graph = pynini.union(
            graph_trilliards + delete_hyphen + graph_trillions +
            delete_hyphen + graph_billiards + delete_hyphen + graph_billions +
            delete_hyphen + graph_milliards + delete_hyphen + graph_millions +
            delete_hyphen + graph_thousands + delete_hyphen +
            graph_hundreds_component,
            graph_tens_of_hundred_component,
            graph_zero,
        )

        # Strip leading zeros, keeping a lone "0".
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(
                NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0")

        # Put the orthography-normalizing front end (rewrite) before the number graph.
        graph = rewrite(graph)

        self.graph_no_exception = graph.optimize()

        # save self.numbers_up_to_thousand for use in DecimalFst
        # (restricts the output to one to three digits)
        digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
        numbers_up_to_thousand = pynini.compose(
            graph, digits_up_to_thousand).optimize()
        self.numbers_up_to_thousand = numbers_up_to_thousand

        # save self.numbers_up_to_million for use in DecimalFst
        # (restricts the output to one to six digits)
        digits_up_to_million = (NEMO_DIGIT
                                | (NEMO_DIGIT**2)
                                | (NEMO_DIGIT**3)
                                | (NEMO_DIGIT**4)
                                | (NEMO_DIGIT**5)
                                | (NEMO_DIGIT**6))
        numbers_up_to_million = pynini.compose(
            graph, digits_up_to_million).optimize()
        self.numbers_up_to_million = numbers_up_to_million

        # don't convert cardinals from zero to nine inclusive
        graph_exception = pynini.project(pynini.union(graph_digit, graph_zero),
                                         'input')

        # Remove the exception words from the input side of the graph.
        self.graph = (pynini.project(graph, "input") -
                      graph_exception.arcsort()) @ graph

        # Optional leading "moins " becomes `negative: "-" `.
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("moins", "\"-\"") +
            NEMO_SPACE, 0, 1)

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + self.graph + pynutil.insert("\"")

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Ejemplo n.º 12
0
    def __init__(self, deterministic: bool = True):
        """Build the one-pass grammar for e-mail-like ``user@domain`` strings.

        A tagger splits the written form into ``username: "..."`` and
        ``domain: "..."`` fields; a verbalizer consumes those fields and is
        composed directly onto the tagger. The composition is stored in
        ``self.final_graph`` and, wrapped in a ``username`` field and tokens,
        in ``self.fst``.

        Args:
            deterministic: forwarded unchanged to the base-class constructor.
        """
        super().__init__(name="electronic",
                         kind="classify",
                         deterministic=deterministic)

        # tagger
        accepted_symbols = []
        # Decode as UTF-8 explicitly so that parsing the table does not depend
        # on the platform's default locale encoding.
        with open(get_abs_path("data/electronic/symbols.tsv"), 'r',
                  encoding="utf-8") as f:
            for line in f:
                # Only the first column (the written symbol) is needed here.
                symbol, _ = line.split('\t')
                accepted_symbols.append(pynini.accep(symbol))
        # A username starts with a letter; the '@' is replaced by the space
        # that separates the two tagged fields.
        username = (pynutil.insert("username: \"") + NEMO_ALPHA +
                    pynini.closure(NEMO_ALPHA | NEMO_DIGIT
                                   | pynini.union(*accepted_symbols)) +
                    pynutil.insert("\"") + pynini.cross('@', ' '))
        # A domain starts with a letter and ends with a letter or digit.
        domain_graph = (
            NEMO_ALPHA +
            (pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep('-')
                            | pynini.accep('.'))) + (NEMO_ALPHA | NEMO_DIGIT))
        domain_graph = pynutil.insert(
            "domain: \"") + domain_graph + pynutil.insert("\"")
        tagger_graph = (username + domain_graph).optimize()

        # verbalizer
        graph_digit = pynini.string_file(
            get_abs_path(
                "data/numbers/digits_nominative_case.tsv")).optimize()
        graph_symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).optimize()
        # Strip the `username: "` prefix the same way the `domain: "` prefix is
        # stripped below, then emit the content one space-separated item at a
        # time. Table-driven digit/symbol readings carry a slightly lower
        # weight (1.09) than the raw character fallback (1.1).
        user_name = (
            pynutil.delete("username:") + delete_space +
            pynutil.delete("\"") +
            (pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(
                    pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
            pynutil.delete("\""))

        # Fallback top-level domain: characters separated by spaces, with the
        # dot read as "точка".
        domain_default = (pynini.closure(NEMO_NOT_QUOTE + insert_space) +
                          pynini.cross(".", "точка ") + NEMO_NOT_QUOTE +
                          pynini.closure(insert_space + NEMO_NOT_QUOTE))

        # Fallback server name: alphanumerics, optional symbols, alphanumerics.
        server_default = (pynini.closure(
            (graph_digit | NEMO_ALPHA) + insert_space, 1) +
                          pynini.closure(graph_symbols + insert_space) +
                          pynini.closure(
                              (graph_digit | NEMO_ALPHA) + insert_space, 1))
        server_common = pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")) + insert_space
        domain_common = pynini.cross(".", "точка ") + pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))
        # Entries from the server/domain tables (1.09) outrank the
        # character-by-character fallbacks (1.1).
        domain = (pynutil.delete("domain:") + delete_space +
                  pynutil.delete("\"") +
                  (pynutil.add_weight(server_common, 1.09)
                   | pynutil.add_weight(server_default, 1.1)) +
                  (pynutil.add_weight(domain_common, 1.09)
                   | pynutil.add_weight(domain_default, 1.1)) + delete_space +
                  pynutil.delete("\""))

        # "собака " is inserted between the two fields as the reading of '@'.
        graph = user_name + delete_space + pynutil.insert(
            "собака ") + delete_space + domain + delete_space
        # replace all latin letters with their Ru verbalization
        verbalizer_graph = (graph.optimize() @ (pynini.closure(
            TO_CYRILLIC | RU_ALPHA | pynini.accep(" ")))).optimize()

        self.final_graph = (tagger_graph @ verbalizer_graph).optimize()
        self.fst = self.add_tokens(
            pynutil.insert("username: \"") + self.final_graph +
            pynutil.insert("\"")).optimize()
Ejemplo n.º 13
0
    def __init__(self, cardinal: GraphFst):
        """Build the tagger that turns spoken digit sequences into numbers.

        Four written layouts are accepted, each emitted inside a
        ``number_part: "..."`` field: a 3-3-4 phone number (with an optional
        ``country_code`` field in front), a 4x4 credit-card number, a 3-2-4
        SSN, and four dot-separated IP groups.

        Args:
            cardinal: cardinal grammar; its ``graph_no_exception`` is used to
                read two-digit numbers such as "twenty three".
        """
        super().__init__(name="telephone", kind="classify")

        dash = pynutil.insert("-")
        drop_space = pynutil.delete(" ")

        def as_number_part(body):
            # Wrap an (optimized) digit layout in the number_part field.
            return pynutil.insert(
                "number_part: \"") + body.optimize() + pynutil.insert("\"")

        # digit <-> spoken word, with the extra readings of zero
        zero_readings = pynini.union("o", "oh", "zero")
        digit_to_word = pynini.invert(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ).optimize() | pynini.cross("0", zero_readings)
        word_to_digit = pynini.invert(digit_to_word)

        # "<word> <word>" <-> "double <word>" for every digit
        doubled = []
        for digit in range(10):
            spoken = pynini.project(str(digit) @ digit_to_word, "output")
            doubled.append(
                pynini.cross(
                    spoken + pynini.accep(" ") + spoken,
                    pynutil.insert("double ") + spoken,
                ))
        double_word = pynini.union(*doubled)
        double_word.invert()

        # to handle cases like "one twenty three"
        two_digit_number = pynini.compose(cardinal.graph_no_exception,
                                          NEMO_DIGIT**2)
        pair_to_digits = (pynini.compose(
            double_word, word_to_digit + drop_space + word_to_digit)
                          | two_digit_number)

        one_item = (pair_to_digits | word_to_digit).optimize()
        digit_run = (one_item +
                     pynini.closure(drop_space + one_item)).optimize()

        # phone number: 3-3-4
        phone_layout = pynini.compose(
            digit_run,
            NEMO_DIGIT**3 + dash + NEMO_DIGIT**3 + dash + NEMO_DIGIT**4,
        ).optimize()
        number_part = as_number_part(phone_layout)

        short_number = pynini.compose(digit_run, NEMO_DIGIT**(2, 3))

        # country code: optional "plus", then one to three digits
        spelled_code = pynini.closure(word_to_digit + drop_space, 0,
                                      2) + word_to_digit
        country_code = (pynutil.insert("country_code: \"") +
                        pynini.closure(pynini.cross("plus ", "+"), 0, 1) +
                        (spelled_code | short_number) + pynutil.insert("\""))
        optional_country_code = pynini.closure(
            country_code + drop_space + insert_space, 0, 1).optimize()
        graph = optional_country_code + number_part

        # credit card number: four space-separated groups of four digits
        four_more = insert_space + NEMO_DIGIT**4
        card_layout = pynini.compose(digit_run,
                                     NEMO_DIGIT**4 + four_more**3).optimize()
        graph |= as_number_part(card_layout)

        # SSN: 3-2-4
        ssn_layout = pynini.compose(
            digit_run,
            NEMO_DIGIT**3 + dash + NEMO_DIGIT**2 + dash + NEMO_DIGIT**4,
        ).optimize()
        graph |= as_number_part(ssn_layout)

        # ip: four groups joined by " dot "
        ip_group = pynini.closure(word_to_digit + drop_space, 0,
                                  1) + pair_to_digits
        ip_group |= pair_to_digits + pynini.closure(
            drop_space + word_to_digit, 0, 1)
        ip_group |= word_to_digit + (drop_space + word_to_digit)**(0, 2)
        ip_group |= short_number
        ip_group = ip_group.optimize()

        ip_layout = ip_group + (pynini.cross(" dot ", ".") + ip_group)**3
        graph |= as_number_part(ip_layout)

        self.fst = self.add_tokens(graph).optimize()
Ejemplo n.º 14
0
    def __init__(self,
                 time: GraphFst,
                 date: GraphFst,
                 cardinal: GraphFst,
                 deterministic: bool = True,
                 lm: bool = False):
        """Build the tagger for ranges and simple arithmetic expressions.

        Always covered: time ranges, "~" approximations, year ranges and
        "+" chains. When ``deterministic`` is false or ``lm`` is true the
        grammar also accepts "-"/":" number ranges, "x"/"*" products,
        "No. <number>" and "/" divisions.

        Args:
            time: graph used on both sides of a time range.
            date: graph composed with two- and four-digit years.
            cardinal: cardinal grammar; its ``graph_with_and`` reads numbers.
            deterministic: forwarded to the base class; also gates the
                extended alternatives.
            lm: enables the extended alternatives even when deterministic.
        """
        super().__init__(name="range",
                         kind="classify",
                         deterministic=deterministic)

        optional_space = pynini.closure(pynutil.delete(" "), 0, 1)
        number = cardinal.graph_with_and
        approx = pynini.cross("~", "approximately") + delete_extra_space

        def between(left, separator, right):
            # left <sep> right, tolerating one space on either side of <sep>
            return left + optional_space + separator + optional_space + right

        def chained(symbol, spoken):
            # number (symbol number)+ with every symbol read as `spoken`
            return number + pynini.closure(
                pynini.cross(symbol, spoken) + number, 1)

        # TIME
        combined = between(time, pynini.cross("-", " to "),
                           time) | (approx + time)

        # YEAR
        four_digit_year = (NEMO_DIGIT**4 +
                           pynini.closure(pynini.accep("s"), 0, 1)) @ date
        two_digit_year = (NEMO_DIGIT**2 +
                          pynini.closure(pynini.accep("s"), 0, 1)) @ date
        combined |= between(
            four_digit_year,
            pynini.cross("-", " to "),
            (four_digit_year | two_digit_year | (NEMO_DIGIT**2 @ number)),
        )

        # ADDITION
        arithmetic = chained("+", " plus ")
        arithmetic |= chained(" + ", " plus ")
        arithmetic |= approx + number

        if lm or not deterministic:
            # cardinal ----
            dash_range = between(
                number, pynini.cross("-", pynini.union(" to ", " minus ")),
                number)
            arithmetic |= dash_range | between(
                number, pynini.cross(":", " to "), number)

            # MULTIPLY
            for symbol in (" x ", "x"):
                arithmetic |= chained(symbol,
                                      pynini.union(" by ", " times "))
            for symbol in ("*", " * "):
                arithmetic |= chained(symbol, " times ")

            # supports "No. 12" -> "Number 12"
            number_word = (pynini.cross(pynini.union("NO", "No"), "Number")
                           | pynini.cross("no", "number"))
            arithmetic |= (number_word +
                           pynini.closure(pynini.union(". ", " "), 0, 1) +
                           number)

            # DIVIDE
            for symbol in ("/", " / "):
                arithmetic |= chained(symbol, " divided by ")

        combined |= arithmetic

        self.graph = combined.optimize()
        tagged = pynutil.insert("name: \"") + convert_space(
            self.graph).optimize() + pynutil.insert("\"")
        self.fst = tagged.optimize()
Ejemplo n.º 15
0
    def __init__(self):
        """Build the telephone verbalizer: keep only the quoted number_part text."""
        super().__init__(name="telephone", kind="verbalize")

        field_open = pynutil.delete('number_part: "')
        field_close = pynutil.delete('"')
        # At least one non-quote character must sit between the quotes.
        content = pynini.closure(NEMO_NOT_QUOTE, 1)

        self.fst = self.delete_tokens(field_open + content +
                                      field_close).optimize()
Ejemplo n.º 16
0
    def __init__(self,
                 input_case: str,
                 deterministic: bool = True,
                 cache_dir: str = None,
                 overwrite_cache: bool = False):
        """Assemble (or restore from cache) the sentence-level classifier.

        Args:
            input_case: passed to the whitelist grammar and used in the cache
                file name.
            deterministic: forwarded to every sub-grammar; when false the
                Roman-numeral and abbreviation grammars are added as well.
            cache_dir: directory for the ``.far`` cache; ``None`` or the
                string ``"None"`` disables caching.
            overwrite_cache: rebuild the grammar even if a cache file exists.
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        caching_enabled = cache_dir is not None and cache_dir != "None"
        if caching_enabled:
            os.makedirs(cache_dir, exist_ok=True)
            far_name = f"_{input_case}_en_tn_{deterministic}_deterministic.far"
            far_file = os.path.join(cache_dir, far_name)

        # Cache hit: load the compiled grammar and stop here.
        if far_file and not overwrite_cache and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
            return

        logging.info("Creating ClassifyFst grammars.")

        cardinal = CardinalFst(deterministic=deterministic)
        ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
        decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
        fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
        measure = MeasureFst(cardinal=cardinal,
                             decimal=decimal,
                             fraction=fraction,
                             deterministic=deterministic)
        date_tagger = DateFst(cardinal=cardinal, deterministic=deterministic)
        word_tagger = WordFst(deterministic=deterministic)
        time_tagger = TimeFst(cardinal=cardinal, deterministic=deterministic)
        telephone = TelephoneFst(deterministic=deterministic)
        electronic = ElectronicFst(deterministic=deterministic)
        money = MoneyFst(cardinal=cardinal,
                         decimal=decimal,
                         deterministic=deterministic)
        whitelist = WhiteListFst(input_case=input_case,
                                 deterministic=deterministic)
        punctuation = PunctuationFst(deterministic=deterministic)

        # (grammar, weight) pairs, unioned left to right in this order.
        weighted_grammars = [
            (whitelist.fst, 1.01),
            (time_tagger.fst, 1.1),
            (date_tagger.fst, 1.09),
            (decimal.fst, 1.1),
            (measure.fst, 1.1),
            (cardinal.fst, 1.1),
            (ordinal.fst, 1.1),
            (money.fst, 1.1),
            (telephone.fst, 1.1),
            (electronic.fst, 1.1),
            (fraction.fst, 1.1),
            (word_tagger.fst, 100),
        ]
        if not deterministic:
            # the weight matches the word grammar weight for "I" cases in long sentences with multiple semiotic tokens
            weighted_grammars.append(
                (RomanFst(deterministic=deterministic).fst, 100))
            weighted_grammars.append(
                (AbbreviationFst(deterministic=deterministic).fst, 100))

        classify = None
        for grammar, weight in weighted_grammars:
            branch = pynutil.add_weight(grammar, weight)
            classify = branch if classify is None else classify | branch

        def as_token(body):
            # Surround a classified span with the `tokens { ... }` wrapper.
            return pynutil.insert("tokens { ") + body + pynutil.insert(" }")

        punct = as_token(pynutil.add_weight(punctuation.fst, weight=1.1))
        token = as_token(classify)
        leading_punct = pynini.closure(punct + pynutil.insert(" "))
        trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
        token_plus_punct = leading_punct + token + trailing_punct

        sentence = token_plus_punct + pynini.closure(delete_extra_space +
                                                     token_plus_punct)
        sentence = delete_space + sentence + delete_space

        self.fst = sentence.optimize()

        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Ejemplo n.º 17
0
    def __init__(self, deterministic: bool = True):
        """Build the verbalizer grammar for tagged fractions.

        The grammar consumes ``integer_part``, ``numerator`` and
        ``denominator`` fields (the denominator optionally followed by a
        ``morphosyntactic_features`` field of ``add_root`` or ``ordinal``)
        and emits the spoken fraction. Three results are stored:
        ``self.graph_masc``, ``self.graph_fem`` and their union ``self.graph``;
        ``self.fst`` is ``self.graph`` wrapped by ``delete_tokens``.

        Args:
            deterministic: forwarded to the base class. When false, extra
                alternatives are added: "undécimo"/"duodécimo" exceptions,
                feminine "... parte(s)" renderings, and unmerged stem forms.
        """
        super().__init__(name="fraction",
                         kind="verbalize",
                         deterministic=deterministic)

        # Derivational strings append 'avo' as a suffix. Adding space for processing aid
        fraction_stem = pynutil.insert(" avo")
        plural = pynutil.insert("s")
        conjunction = pynutil.insert(" y ")

        # Content of the integer_part field, passed through
        # strip_cardinal_apocope (helper defined elsewhere).
        integer = (pynutil.delete("integer_part: \"") +
                   strip_cardinal_apocope(pynini.closure(NEMO_NOT_QUOTE)) +
                   pynutil.delete("\""))

        # The numerator is split into the literal "un" and everything else so
        # the two cases can pair with singular and plural denominators below.
        numerator_one = pynutil.delete("numerator: \"") + pynini.accep(
            "un") + pynutil.delete("\" ")
        numerator = (pynutil.delete("numerator: \"") +
                     pynini.difference(pynini.closure(NEMO_NOT_QUOTE), "un") +
                     pynutil.delete("\" "))

        # Three denominator shapes, distinguished by the trailing
        # morphosyntactic_features field: "add_root" appends " avo",
        # "ordinal" is passed through, and no feature means a plain cardinal.
        denominator_add_stem = pynutil.delete("denominator: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) + fraction_stem +
            pynutil.delete("\" morphosyntactic_features: \"add_root\""))
        denominator_ordinal = pynutil.delete("denominator: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) +
            pynutil.delete("\" morphosyntactic_features: \"ordinal\""))
        denominator_cardinal = pynutil.delete("denominator: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\""))

        denominator_singular = pynini.union(denominator_add_stem,
                                            denominator_ordinal)
        if not deterministic:
            # Occasional exceptions
            denominator_singular |= denominator_add_stem @ pynini.string_map(
                [("once avo", "undécimo"), ("doce avo", "duodécimo")])
        denominator_plural = denominator_singular + plural

        # Merging operations
        merge = pynini.cdrewrite(
            pynini.cross(" y ", "i"), "", "", NEMO_SIGMA
        )  # The denominator must be a single word, with the conjunction "y" replaced by i
        # Then apply delete_space wherever the right context is any character
        # in NEMO_CHAR other than the string "parte".
        merge @= pynini.cdrewrite(delete_space, "",
                                  pynini.difference(NEMO_CHAR, "parte"),
                                  NEMO_SIGMA)

        # The merger can produce duplicate vowels. This is not allowed in orthography
        delete_duplicates = pynini.string_map([("aa", "a"),
                                               ("oo", "o")])  # Removes vowels
        delete_duplicates = pynini.cdrewrite(delete_duplicates, "", "",
                                             NEMO_SIGMA)

        # Apply the `accents` mapping (defined elsewhere) only inside a word
        # that ends in one of the listed fraction suffixes.
        remove_accents = pynini.cdrewrite(
            accents,
            pynini.union(NEMO_SPACE, pynini.accep("[BOS]")) +
            pynini.closure(NEMO_NOT_SPACE),
            pynini.closure(NEMO_NOT_SPACE) +
            pynini.union("avo", "ava", "ésimo", "ésima"),
            NEMO_SIGMA,
        )
        merge_into_single_word = merge @ remove_accents @ delete_duplicates

        # Numerator other than "un" + plural, merged denominator.
        fraction_default = numerator + delete_space + insert_space + (
            denominator_plural @ merge_into_single_word)

        # Numerator "un" + singular, merged denominator.
        fraction_with_one = (numerator_one + delete_space + insert_space +
                             (denominator_singular @ merge_into_single_word))

        # Plain "<numerator> sobre <denominator>" reading for denominators
        # that carry no morphosyntactic feature.
        fraction_with_cardinal = strip_cardinal_apocope(numerator
                                                        | numerator_one)
        fraction_with_cardinal += (
            delete_space + pynutil.insert(" sobre ") +
            strip_cardinal_apocope(denominator_cardinal))

        if not deterministic:
            # There is an alternative rendering where ordinals act as adjectives for 'parte'. This requires use of the feminine
            # Other rules will manage use of "un" at end, so just worry about endings
            exceptions = pynini.string_map([("tercia", "tercera")])
            apply_exceptions = pynini.cdrewrite(exceptions, "", "", NEMO_SIGMA)
            # Rewrite a word-final "o" (right context [EOS]) to "a".
            vowel_change = pynini.cdrewrite(pynini.cross("o", "a"), "",
                                            pynini.accep("[EOS]"), NEMO_SIGMA)

            # NOTE(review): shift_cardinal_gender is defined elsewhere;
            # presumably it maps masculine forms to feminine ones — confirm.
            denominator_singular_fem = shift_cardinal_gender(
                denominator_singular) @ vowel_change @ apply_exceptions
            denominator_plural_fem = denominator_singular_fem + plural

            numerator_one_fem = shift_cardinal_gender(numerator_one)
            numerator_fem = shift_cardinal_gender(numerator)

            fraction_with_cardinal |= (
                (numerator_one_fem | numerator_fem) + delete_space +
                pynutil.insert(" sobre ") +
                shift_cardinal_gender(denominator_cardinal))

            # Still need to manage stems
            merge_stem = pynini.cdrewrite(
                delete_space, "", pynini.union("avo", "ava", "avos", "avas"),
                NEMO_SIGMA)  # For managing alternative spacing
            merge_stem @= remove_accents @ delete_duplicates

            fraction_with_one_fem = numerator_one_fem + delete_space + insert_space
            fraction_with_one_fem += pynini.union(
                denominator_singular_fem @ merge_stem, denominator_singular_fem
                @ merge_into_single_word)  # Both forms exists
            fraction_with_one_fem += pynutil.insert(" parte")
            fraction_with_one_fem @= pynini.cdrewrite(
                pynini.cross("una media", "media"), "", "",
                NEMO_SIGMA)  # "media" not "una media"

            fraction_default_fem = numerator_fem + delete_space + insert_space
            fraction_default_fem += pynini.union(
                denominator_plural_fem @ merge_stem,
                denominator_plural_fem @ merge_into_single_word)
            fraction_default_fem += pynutil.insert(" partes")

            fraction_default |= (numerator + delete_space + insert_space +
                                 denominator_plural @ merge_stem
                                 )  # Case of no merger
            fraction_default |= fraction_default_fem

            fraction_with_one |= numerator_one + delete_space + insert_space + denominator_singular @ merge_stem
            fraction_with_one |= fraction_with_one_fem

        fraction_with_one @= pynini.cdrewrite(pynini.cross(
            "un medio", "medio"), "", "", NEMO_SIGMA)  # "medio" not "un medio"

        # The masculine graph is assembled here, before the feminine "media(s)"
        # alternatives are unioned into fraction_default / fraction_with_one
        # below, so those alternatives only feed fraction_fem.
        fraction = fraction_with_one | fraction_default | fraction_with_cardinal
        graph_masc = pynini.closure(integer + delete_space + conjunction, 0,
                                    1) + fraction

        # Manage cases of fem gender (only shows on integer except for "medio")
        integer_fem = shift_cardinal_gender(integer)
        fraction_default |= (
            shift_cardinal_gender(numerator) + delete_space + insert_space +
            (denominator_plural @ pynini.cross("medios", "medias")))
        # NOTE(review): numerator_one already deletes its own field markup and
        # is itself wrapped in pynutil.delete here — confirm this is intended.
        fraction_with_one |= (
            pynutil.delete(numerator_one) + delete_space +
            (denominator_singular @ pynini.cross("medio", "media")))

        fraction_fem = fraction_with_one | fraction_default | fraction_with_cardinal
        graph_fem = pynini.closure(integer_fem + delete_space + conjunction, 0,
                                   1) + fraction_fem

        self.graph_masc = pynini.optimize(graph_masc)
        self.graph_fem = pynini.optimize(graph_fem)

        # Note: the union uses the unoptimized local graphs, not the
        # optimized attributes stored just above.
        self.graph = graph_masc | graph_fem

        delete_tokens = self.delete_tokens(self.graph)
        self.fst = delete_tokens.optimize()
Ejemplo n.º 18
0
def get_pos_string(fsa, min_len, max_len):
    """Slice `fsa` by string length.

    Returns a dict mapping every length in ``min_len..max_len`` (inclusive)
    to the intersection of `fsa` with exactly that many repetitions of
    ``sigma``.
    """
    return {
        length: pynini.intersect(fsa, pynini.closure(sigma, length, length))
        for length in range(min_len, max_len + 1)
    }
Ejemplo n.º 19
0
    def __init__(self, input_case: str, deterministic: bool = True):
        """Assemble the sentence-level classifier from the per-class taggers.

        Args:
            input_case: passed to the whitelist grammar.
            deterministic: forwarded to every sub-grammar; when false the
                serial grammar is added and the union is optimized early.
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        cardinal = CardinalFst(deterministic=deterministic)
        ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
        decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
        measure = MeasureFst(cardinal=cardinal,
                             decimal=decimal,
                             deterministic=deterministic)
        date_tagger = DateFst(cardinal=cardinal, deterministic=deterministic)
        word_tagger = WordFst(deterministic=deterministic)
        time_tagger = TimeFst(cardinal=cardinal, deterministic=deterministic)
        telephone = TelephoneFst(deterministic=deterministic)
        electronic = ElectronicFst(deterministic=deterministic)
        money = MoneyFst(cardinal=cardinal,
                         decimal=decimal,
                         deterministic=deterministic)
        whitelist = WhiteListFst(input_case=input_case,
                                 deterministic=deterministic)
        punctuation = PunctuationFst(deterministic=deterministic)

        # (grammar, weight) pairs, unioned left to right in this order.
        weighted_grammars = (
            (whitelist.fst, 1.01),
            (time_tagger.fst, 1.1),
            (date_tagger.fst, 1.09),
            (decimal.fst, 1.1),
            (measure.fst, 1.1),
            (cardinal.fst, 1.1),
            (ordinal.fst, 1.1),
            (money.fst, 1.1),
            (telephone.fst, 1.1),
            (electronic.fst, 1.1),
            (word_tagger.fst, 100),
        )
        classify = None
        for grammar, weight in weighted_grammars:
            branch = pynutil.add_weight(grammar, weight)
            classify = branch if classify is None else classify | branch

        if not deterministic:
            serial = SerialFst(cardinal, deterministic=deterministic)
            classify = (classify
                        | pynutil.add_weight(serial.fst, 1.1)).optimize()

        open_token = "tokens { "
        close_token = " }"
        punct = (pynutil.insert(open_token) +
                 pynutil.add_weight(punctuation.fst, weight=1.1) +
                 pynutil.insert(close_token))
        token = (pynutil.insert(open_token) + classify +
                 pynutil.insert(close_token))

        # Any amount of punctuation may surround a single classified token.
        leading_punct = pynini.closure(punct + pynutil.insert(" "))
        trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
        token_plus_punct = leading_punct + token + trailing_punct

        sentence = token_plus_punct + pynini.closure(delete_extra_space +
                                                     token_plus_punct)
        self.fst = (delete_space + sentence + delete_space).optimize()
Ejemplo n.º 20
0
def get_neg_string(fsa, min_len, max_len):
    """Slice the complement of `fsa` by string length.

    Returns a dict mapping every length in ``min_len..max_len`` (inclusive)
    to exactly that many repetitions of ``sigma`` minus `fsa`.
    """
    by_length = {}
    for length in range(min_len, max_len + 1):
        all_of_length = pynini.closure(sigma, length, length)
        by_length[length] = pynini.difference(all_of_length, fsa)
    return by_length
Ejemplo n.º 21
0
def lg_containing_ssq(x, i):
    """Return the minimized FST for exactly ``i`` repetitions of ``sigmaStar + x + sigmaStar``.

    Args:
        x: the piece placed between the two module-level ``sigmaStar`` paddings.
        i: used as both the lower and the upper bound of the closure.
    """
    padded = sigmaStar + x + sigmaStar
    repeated = pynini.closure(padded, i, i)
    return repeated.minimize()
Ejemplo n.º 22
0
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        """
        Builds the whitelist tagger: a union of fixed written -> spoken replacements.

        Args:
            input_case: when equal to "lower_cased", the first column of every label file
                (and of the state list) is lower-cased before the graph is built
            deterministic: selects which optional sub-grammars are added (see inline comments)
            input_file: optional path to an extra whitelist file; in deterministic mode it
                REPLACES everything built before it, otherwise it is added to the union

        Sets:
            self.graph: the optimized whitelist graph after convert_space
            self.fst: self.graph wrapped between inserted `name: "` and `"`
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _get_whitelist_graph(input_case, file, keep_punct_add_end: bool = False):
            # Load (x, y) label pairs from `file` and turn them into a string_map graph.
            whitelist = load_labels(file)
            if input_case == "lower_cased":
                whitelist = [[x.lower(), y] for x, y in whitelist]
            else:
                whitelist = [[x, y] for x, y in whitelist]

            # optionally add the variants produced by augment_labels_with_punct_at_end
            if keep_punct_add_end:
                whitelist.extend(augment_labels_with_punct_at_end(whitelist))

            graph = pynini.string_map(whitelist)
            return graph

        # base entries that are always present
        graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist/tts.tsv"))
        graph |= _get_whitelist_graph(input_case, get_abs_path("data/whitelist/symbol.tsv"))

        if deterministic:
            # "st" / "St" / "ST", any number of deleted ".", a space, then an entry of get_names():
            # the abbreviation is rewritten to "Saint"
            names = get_names()
            graph |= (
                pynini.cross(pynini.union("st", "St", "ST"), "Saint")
                + pynini.closure(pynutil.delete("."))
                + pynini.accep(" ")
                + names
            )
        else:
            graph |= _get_whitelist_graph(
                input_case, get_abs_path("data/whitelist/alternatives.tsv"), keep_punct_add_end=True
            )

        # dotted upper-case abbreviations: an upper-case symbol followed by at least two
        # "<separator><upper-case symbol>" groups; the separators ("." or ". ") and one optional
        # trailing "." are deleted
        for x in [".", ". "]:
            graph |= (
                NEMO_UPPER
                + pynini.closure(pynutil.delete(x) + NEMO_UPPER, 2)
                + pynini.closure(pynutil.delete("."), 0, 1)
            )

        if not deterministic:
            multiple_forms_whitelist_graph = get_formats(get_abs_path("data/whitelist/alternatives_all_format.tsv"))
            graph |= multiple_forms_whitelist_graph

            # measurement units (singular and, through SINGULAR_TO_PLURAL, plural);
            # the composition with NEMO_CHAR ** (3, ...) keeps only inputs of three or more symbols
            graph_unit = pynini.string_file(get_abs_path("data/measure/unit.tsv")) | pynini.string_file(
                get_abs_path("data/measure/unit_alternatives.tsv")
            )
            graph_unit_plural = graph_unit @ SINGULAR_TO_PLURAL
            units_graph = pynini.compose(NEMO_CHAR ** (3, ...), convert_space(graph_unit | graph_unit_plural))
            graph |= units_graph

        # convert to states only if comma is present before the abbreviation to avoid converting all caps words,
        # e.g. "IN", "OH", "OK"
        # TODO or only exclude above?
        states = load_labels(get_abs_path("data/address/state.tsv"))
        additional_options = []
        for x, y in states:
            if input_case == "lower_cased":
                x = x.lower()
            # dotted variant of the second column: a "." after its first character
            additional_options.append((x, f"{y[0]}.{y[1:]}"))
            if not deterministic:
                # same variant with a trailing "." as well
                additional_options.append((x, f"{y[0]}.{y[1:]}."))

        states.extend(additional_options)
        state_graph = pynini.string_map(states)
        # the map is inverted, so the second column is matched and the first column is emitted;
        # it must follow one or more non-space symbols and a "," or ", "
        graph |= pynini.closure(NEMO_NOT_SPACE, 1) + pynini.union(", ", ",") + pynini.invert(state_graph).optimize()

        if input_file:
            whitelist_provided = _get_whitelist_graph(input_case, input_file)
            if not deterministic:
                graph |= whitelist_provided
            else:
                # NOTE: in deterministic mode the user file overrides every grammar built above
                graph = whitelist_provided

        self.graph = (convert_space(graph)).optimize()

        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
Ejemplo n.º 23
0
    def __init__(self):
        """Build the fallback tagger: one or more NEMO_NOT_SPACE symbols wrapped in a name field."""
        super().__init__(name="word", kind="classify")

        open_field = pynutil.insert("name: \"")
        token_chars = pynini.closure(NEMO_NOT_SPACE, 1)
        close_field = pynutil.insert("\"")
        self.fst = (open_field + token_chars + close_field).optimize()
Ejemplo n.º 24
0
    def __init__(self,
                 min_word_constraint: str,
                 name: str,
                 cont_classes: List[Tuple[Optional[str], float]],
                 alphabet: Optional[Dict[str, List[str]]] = None,
                 start: bool = False):
        """
        Converts a limited PCRE regex (scope, quantification) to an OpenFst FST.
        Substitutes phoneme classes with symbols.
        Assumes long vowels have been expanded.
        Unlike Slot, a StemGuesser's FST is eagerly evaluated

        Args:
          min_word_constraint: str
            a minimal word constraint expressed as a limited regular expression of phone classes
          name: str
            name of the StemGuesser Slot
          cont_classes: list[tuple[str, float]]
            list of continuation classes and their weights
              example: [('PluralSuffix', 0.8), (None, 0.5)]
              The StemGuesser's destination state is a final state if None is present in the list
              A StemGuesser can be both a terminal and non-terminal class
              Empty list of continuation classes are not allowed
          alphabet: dict[str, list[str]], optional
            dictionary mapping phone classes to list of symbols; if sigma (.) is used in the regex, alphabet is required.
            None (the default) is treated as an empty dictionary
          start: bool, optional
            the slot is one of root slots (root class in LEXC)

        Raises:
          Exception: on an unmatched bracket or parenthesis, a quantifier at the very start of
            the regex, a sigma (.) without an alphabet, or a regex that yields no FST at all
        """
        # None replaces the former `{}` default so no dict object is shared between calls
        if alphabet is None:
            alphabet = {}

        # phone classes could overlap so flatten the phones into a set first
        symbols = {
            symb
            for symbol_class in alphabet.values() for symb in symbol_class
        }

        def class_or_literal(char):
            # a phone-class name expands to the union of its symbols, anything else stays literal
            if char in alphabet:
                return pynini.union(*alphabet[char])
            return char

        stack = []  # currently open brackets, used to check for matching parens
        fst_stack = []  # (kind, fst) pieces, concatenated in order at the end
        regex = min_word_constraint
        last_index = len(regex) - 1

        # () means scope / grouping - concatenation
        # [] means match anything inside - union
        # . means match any character in the alphabet (not including epsilon) - sigma
        # quantifiers: ?, *, +
        for i, char in enumerate(regex):
            if char == '[':
                stack.append(char)
                fst_stack.append(('union', pynini.accep('')))
            elif char == '(':
                stack.append(char)
                fst_stack.append(('scope', pynini.accep('')))
            elif char == ')':
                # an empty stack means a closer without an opener: report it as a mismatch
                # instead of letting list.pop raise IndexError
                if not stack or stack.pop() != '(':
                    raise Exception('Unmatched parentheses')
                fst_stack[-1] = ('processed', fst_stack[-1][1])
            elif char == ']':
                if not stack or stack.pop() != '[':
                    raise Exception('Unmatched brackets')
                fst_stack[-1] = ('processed', fst_stack[-1][1])
            elif fst_stack and fst_stack[-1][0] in ('scope', 'union'):
                # NOTE(review): this branch is tested before sigma and the quantifiers, so inside
                # an open group '.', '?', '*' and '+' are consumed as ordinary characters
                mode, group = fst_stack[-1]
                if mode == 'scope' or group.num_states() == 1:
                    # scope: concatenate the current char; union with a single state: still the
                    # initial empty acceptor, so concatenate to avoid unioning with empty string
                    group.concat(class_or_literal(char))
                else:
                    # union only the current chars within the matching brackets
                    group.union(class_or_literal(char))
            # sigma
            elif char == '.':
                if not alphabet:
                    raise Exception(
                        'Alphabet required if regex includes sigma')
                # build a new union each time to avoid state issues
                fst_stack.append(('sigma', pynini.union(*symbols)))
            # quantification - perform closure on last FST
            elif char in ('?', '*', '+'):
                if i == 0:
                    raise Exception('Empty quantification')
                kind, operand = fst_stack[-1]
                if char == '?':
                    quantified = pynini.closure(operand, 0, 1)
                elif char == '+':
                    quantified = pynini.closure(operand, 1)
                else:
                    quantified = pynini.closure(operand)
                    # if the entire regex is a Kleene closure or previous character is sigma,
                    # accept empty string too
                    if (len(fst_stack) == 1 and i == last_index) or kind == 'sigma':
                        quantified = pynini.union(quantified, '')
                fst_stack[-1] = (kind, quantified)
            elif char in alphabet:
                fst_stack.append(('symbol', pynini.union(*alphabet[char])))
            else:
                fst_stack.append(('symbol', pynini.accep(char)))

        if stack:
            raise Exception('Unmatched brackets')

        # concatenate the pieces left to right; the identity test picks the first piece
        # regardless of how an FST object evaluates in a boolean context
        fst = None
        for _, piece in fst_stack:
            fst = piece if fst is None else fst + piece

        if fst is None:
            # nothing was parsed (empty regex); fail here rather than on None.optimize() below
            raise Exception('Empty regex')

        # upper/lower alphabet symbol transitions and weights not used by compiler
        rules = [('', '', cont_classes, 0.0)]
        super(StemGuesser, self).__init__(name, rules, start)
        self.fst = fst.optimize()
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        """
        Builds the single-pass grammar that tags each token and verbalizes it immediately
        (every tagger is composed with its verbalizer), or restores it from a cached FAR file.

        Args:
            input_case: forwarded to WhiteListFst and embedded in the cache file name
            deterministic: forwarded to every grammar; when False the roman and abbreviation
                grammars are added as well
            cache_dir: directory for the .far cache; None or the string 'None' disables caching
            overwrite_cache: when False and the cache file exists, the FST is loaded from it
                instead of being rebuilt. NOTE: the default is True, so by default the grammar
                is always rebuilt (and re-exported when cache_dir is set)
            whitelist: optional path to a whitelist file, passed to WhiteListFst as input_file;
                its basename is part of the cache file name
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(
                f'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            # from here on `whitelist` is the WhiteListFst instance, no longer the path argument
            whitelist = WhiteListFst(input_case=input_case,
                                     deterministic=deterministic,
                                     input_file=whitelist)
            whitelist_graph = whitelist.graph
            punct_graph = PunctuationFst(deterministic=deterministic).graph

            # VERBALIZERS
            # `cardinal`, `decimal`, `ordinal`, `fraction` and `measure` are rebound to the
            # verbalizer instances below; the tagger graphs were already captured above
            cardinal = vCardinal(deterministic=deterministic)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=deterministic)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=deterministic)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=deterministic).fst
            v_electronic_graph = vElectronic(deterministic=deterministic).fst
            measure = vMeasure(decimal=decimal,
                               cardinal=cardinal,
                               fraction=fraction,
                               deterministic=deterministic)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=deterministic).fst
            v_date_graph = vDate(ordinal=ordinal,
                                 deterministic=deterministic).fst
            v_money_graph = vMoney(decimal=decimal,
                                   deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_abbreviation = vAbbreviation(deterministic=deterministic).fst

            # weighted union of "tagger composed with verbalizer" per semiotic class;
            # the whitelist and word graphs are used as they are, without a verbalizer
            classify_and_verbalize = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(pynini.compose(time_graph, v_time_graph),
                                     1.1)
                | pynutil.add_weight(
                    pynini.compose(decimal_graph, v_decimal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(measure_graph, v_measure_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(cardinal_graph, v_cardinal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(ordinal_graph, v_ordinal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(telephone_graph, v_telephone_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(electronic_graph, v_electronic_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(fraction_graph, v_fraction_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(money_graph, v_money_graph), 1.1)
                | pynutil.add_weight(word_graph, 100)
                | pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                                     1.09)).optimize()

            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(roman_graph, v_roman_graph), 100)

                # `whitelist` here is the WhiteListFst instance created above
                abbreviation_graph = AbbreviationFst(
                    whitelist=whitelist, deterministic=deterministic).fst
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(abbreviation_graph, v_abbreviation), 100)

            # a token may be preceded and followed by any number of punctuation marks,
            # each separated from its neighbour by an inserted space
            punct = pynutil.add_weight(punct_graph, weight=1.1)
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                classify_and_verbalize +
                                pynini.closure(pynutil.insert(" ") + punct))

            # one or more such tokens joined by delete_extra_space, with delete_space
            # applied at both ends of the input
            graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                      token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()
            if far_file:
                # export the freshly built grammar to the cache file
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')
Ejemplo n.º 26
0
    def __init__(self, decimal: GraphFst, cardinal: GraphFst,
                 fraction: GraphFst, deterministic: bool):
        """
        Builds the measure verbalizer: a number (cardinal, decimal or fraction) followed by a
        unit, with separate masculine and feminine paths.

        Args:
            decimal: decimal verbalizer; its graph_masc / graph_fem and delete_tokens are used
            cardinal: cardinal verbalizer; its graph_masc / graph_fem and delete_tokens are used
            fraction: fraction verbalizer; its graph_masc / graph_fem and delete_tokens are used
            deterministic: forwarded to the base class constructor

        The unit graphs (unit_plural_masc, unit_singular_masc, unit_plural_fem,
        unit_singular_fem) and `ones` are defined outside this method.
        """
        super().__init__(name="measure",
                         kind="verbalize",
                         deterministic=deterministic)

        graph_decimal_masc = decimal.delete_tokens(decimal.graph_masc)
        graph_decimal_fem = decimal.delete_tokens(decimal.graph_fem)
        graph_cardinal_masc = cardinal.delete_tokens(cardinal.graph_masc)
        graph_cardinal_fem = cardinal.delete_tokens(cardinal.graph_fem)
        graph_fraction_fem = fraction.delete_tokens(fraction.graph_fem)
        graph_fraction_masc = fraction.delete_tokens(fraction.graph_masc)

        # masculine unit, optionally followed by whitespace + "por" + a non-empty remainder
        unit_masc = (unit_plural_masc | unit_singular_masc) + pynini.closure(
            NEMO_WHITE_SPACE + "por" + pynini.closure(NEMO_NOT_QUOTE, 1), 0, 1)
        # a unit that consists only of "por ..." is accepted on the masculine path
        # (there is no corresponding line for the feminine path below)
        unit_masc |= "por" + pynini.closure(NEMO_NOT_QUOTE, 1)
        # strip the `units: "` ... `"` wrapper around the unit
        unit_masc = pynutil.delete("units: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) @ unit_masc) + pynutil.delete("\"")

        unit_fem = (unit_plural_fem | unit_singular_fem) + pynini.closure(
            NEMO_WHITE_SPACE + "por" + pynini.closure(NEMO_NOT_QUOTE, 1), 0, 1)
        unit_fem = pynutil.delete("units: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) @ unit_fem) + pynutil.delete("\"")

        # cardinal or decimal + unit; fractions get "de " inserted before the unit
        graph_masc = (graph_cardinal_masc
                      | graph_decimal_masc) + NEMO_WHITE_SPACE + unit_masc
        graph_masc |= graph_fraction_masc + NEMO_WHITE_SPACE + pynutil.insert(
            "de ") + unit_masc
        # fractions ending in "medio"/"medios" take the unit directly; the small negative
        # weight makes this path preferred over the "de" path above
        graph_masc |= pynutil.add_weight(
            graph_fraction_masc
            @ (NEMO_SIGMA + pynini.union("medio", "medios")) +
            NEMO_WHITE_SPACE + unit_masc,
            -0.001)  # "medio litro" not "medio de litro"

        # feminine counterpart of the three masculine paths
        graph_fem = (graph_cardinal_fem
                     | graph_decimal_fem) + NEMO_WHITE_SPACE + unit_fem
        graph_fem |= graph_fraction_fem + NEMO_WHITE_SPACE + pynutil.insert(
            "de ") + unit_fem
        graph_fem |= pynutil.add_weight(
            graph_fraction_fem @ (NEMO_SIGMA + pynini.union("media", "medias"))
            + NEMO_WHITE_SPACE + unit_fem, -0.001)

        graph = graph_masc | graph_fem

        # pre-process the input: insert " de" right before the closing quote of a
        # non-empty `quantity: "..."` field
        graph = (pynini.cdrewrite(
            pynutil.insert(" de"), "quantity: \"" +
            pynini.closure(NEMO_NOT_QUOTE, 1), "\"", NEMO_SIGMA) @ graph
                 )  # billones de xyz

        # post-process the output: rewrite `ones` to "uno" when it is followed by
        # whitespace + "por"
        graph @= pynini.cdrewrite(pynini.cross(ones, "uno"), "",
                                  NEMO_WHITE_SPACE + "por", NEMO_SIGMA)

        # To manage alphanumeric combinations ("a-8, 5x"), we let them use a weighted default path.
        alpha_num_unit = pynutil.delete("units: \"") + pynini.closure(
            NEMO_NOT_QUOTE) + pynutil.delete("\"")
        # number followed by unit, or unit followed by number (masculine number forms only)
        graph_alpha_num = pynini.union(
            (graph_cardinal_masc | graph_decimal_masc) + NEMO_SPACE +
            alpha_num_unit,
            alpha_num_unit + delete_extra_space +
            (graph_cardinal_masc | graph_decimal_masc),
        )

        graph |= pynutil.add_weight(graph_alpha_num, 0.01)

        graph += delete_preserve_order

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Ejemplo n.º 27
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """
        Builds the money tagger, which emits `integer_part`, `fractional_part`,
        `morphosyntactic_features` and `currency` fields.

        Args:
            cardinal: cardinal tagger; its graph_no_exception is used for the amounts
            decimal: decimal tagger; its final_graph_wo_negative is used for decimal amounts
        """
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency

        cardinal_graph = cardinal.graph_no_exception
        graph_decimal_final = decimal.final_graph_wo_negative

        # both currency tables are inverted after loading, i.e. their second column is
        # matched and their first column is emitted
        unit_singular = pynini.string_file(
            get_abs_path("data/currency_singular.tsv"))
        unit_singular = pynini.invert(unit_singular)
        unit_plural = pynini.string_file(
            get_abs_path("data/currency_plural.tsv"))
        unit_plural = pynini.invert(unit_plural)

        graph_unit_singular = pynutil.insert("currency: \"") + convert_space(
            unit_singular) + pynutil.insert("\"")
        graph_unit_plural = pynutil.insert("currency: \"") + convert_space(
            unit_plural) + pynutil.insert("\"")

        # two digits pass through unchanged; a single digit gets a "0" inserted in front
        add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynutil.insert("0") + NEMO_DIGIT)
        # twelve dollars (and) fifty cents, zero cents
        # Cents given with an explicit unit word: any cardinal except "un" followed by the
        # plural "centavos"/"céntimos", or "un" (emitted as "01") followed by the singular.
        # `@` binds tighter than `+`, so the zero-padding applies to the weighted cardinal only.
        cents_standalone = (
            pynutil.insert("morphosyntactic_features: \",\""
                           )  # always use a comma in the decimal
            + insert_space + pynutil.insert("fractional_part: \"") +
            pynini.union(
                pynutil.add_weight(
                    ((NEMO_SIGMA - "un") @ cardinal_graph),
                    -0.7) @ add_leading_zero_to_double_digit + delete_space +
                pynutil.delete(pynini.union("centavos", "céntimos")),
                pynini.cross("un", "01") + delete_space +
                pynutil.delete(pynini.union("centavo", "céntimo")),
            ) + pynutil.insert("\""))

        # the cents phrase is optional and may be introduced by a deleted "con" or "y"
        optional_cents_standalone = pynini.closure(
            delete_space + pynini.closure(
                (pynutil.delete("con") | pynutil.delete('y')) + delete_space,
                0, 1) + insert_space + cents_standalone,
            0,
            1,
        )
        # twelve dollars fifty, only after integer
        # e.g. "setenta y cinco dólares con sesenta y tres" ~ $75,63
        # Cents given without a unit word: an optional deleted "con"/"y" and a zero-padded cardinal.
        optional_cents_suffix = pynini.closure(
            delete_extra_space +
            pynutil.insert("morphosyntactic_features: \",\""
                           )  # always use a comma in the decimal
            + insert_space + pynutil.insert("fractional_part: \"") +
            pynini.closure(
                (pynutil.delete("con") | pynutil.delete('y')) + delete_space,
                0, 1) + pynutil.add_weight(
                    cardinal_graph @ add_leading_zero_to_double_digit, -0.7) +
            pynutil.insert("\""),
            0,
            1,
        )

        # any amount other than "un"/"una" takes the plural currency name
        graph_integer = (pynutil.insert("integer_part: \"") +
                         ((NEMO_SIGMA - "un" - "una") @ cardinal_graph) +
                         pynutil.insert("\"") + delete_extra_space +
                         graph_unit_plural +
                         (optional_cents_standalone | optional_cents_suffix))
        # "un"/"una" is emitted as "1" and takes the singular currency name
        graph_integer |= (
            pynutil.insert("integer_part: \"") +
            (pynini.cross("un", "1") | pynini.cross("una", "1")) +
            pynutil.insert("\"") + delete_extra_space + graph_unit_singular +
            (optional_cents_standalone | optional_cents_suffix))
        graph_decimal = graph_decimal_final + delete_extra_space + graph_unit_plural
        # cents with no preceding amount: currency "$" and integer part "0" are inserted
        graph_decimal |= pynutil.insert(
            "currency: \"$\" integer_part: \"0\" ") + cents_standalone
        final_graph = graph_integer | graph_decimal
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Ejemplo n.º 28
0
Archivo: time.py Proyecto: NVIDIA/NeMo
    def __init__(self):
        """
        Builds the time tagger for spoken Vietnamese.

        Accepted shapes (the unit words "giờ" (hour), "phút" (minute) and "giây" (second)
        are consumed and produce no output):
            hour                          -> hours
            hour + minutes or "rưỡi"      -> hours, minutes ("rưỡi" is emitted as "30")
            hour + minutes + seconds      -> hours, minutes, seconds
            minutes + seconds or "rưỡi"   -> minutes, seconds
            hour + "kém" + minutes        -> hours and minutes mapped through the *_to tables
        The first three may be followed by an optional time zone.
        """
        super().__init__(name="time", kind="classify")
        # hours, minutes, seconds, suffix, zone, style, speak_period

        def load(relative_path):
            # read a TSV mapping from the grammar's data directory
            return pynini.string_file(get_abs_path(relative_path))

        def field(opening, value):
            # emit `opening`, then the value, then the closing quote
            return pynutil.insert(opening) + value + pynutil.insert('"')

        hours_to = load("data/time/hours_to.tsv")
        minutes_to = load("data/time/minutes_to.tsv")
        hours = load("data/time/hours.tsv")
        minutes = load("data/time/minutes.tsv")
        zones = pynini.invert(load("data/time/time_zone.tsv"))

        half = pynini.cross("rưỡi", "30")
        hour_word = pynini.cross("giờ", "")
        minute_word = pynini.cross("phút", "")
        second_word = pynini.cross("giây", "")
        maybe_minute_word = pynini.closure(delete_space + minute_word, 0, 1)

        hour_part = field('hours: "', hours) + delete_space + hour_word
        minute_value = minutes + maybe_minute_word
        second_value = minutes + delete_space + second_word

        maybe_zone = pynini.closure(
            delete_space + insert_space + field('zone: "', convert_space(zones)),
            0,
            1,
        )

        # hour followed by minutes (unit word optional) or by "rưỡi"
        hour_minute = hour_part + delete_extra_space + field('minutes: "', minute_value | half)

        # hour, minutes and seconds, every unit word mandatory
        hour_minute_second = (
            hour_part
            + delete_extra_space
            + field('minutes: "', minutes + delete_space + minute_word)
            + delete_extra_space
            + field('seconds: "', second_value)
        )

        # minutes followed by seconds or by "rưỡi"
        minute_second = (
            field('minutes: "', minutes + delete_space + minute_word)
            + delete_extra_space
            + field('seconds: "', second_value | half)
        )

        # "<hour> giờ kém <minutes>": both numbers go through the *_to tables
        hours_to_value = hours @ hours_to
        minutes_to_value = minutes @ minutes_to
        time_to = (
            field('hours: "', hours_to_value)
            + delete_space
            + hour_word
            + delete_space
            + pynutil.delete("kém")
            + delete_extra_space
            + field('minutes: "', minutes_to_value)
            + maybe_minute_word
        )

        zoned = (hour_part | hour_minute | hour_minute_second) + maybe_zone
        tagged = self.add_tokens(zoned | minute_second | time_to)

        self.fst = tagged.optimize()
Ejemplo n.º 29
0
    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        """
        Builds the tokenize-and-classify grammar, or restores it from a cached FAR file.

        Args:
            cache_dir: directory holding "_en_itn.far"; None or the string "None" disables caching
            overwrite_cache: when False and the cache file exists, the FST is loaded from it;
                otherwise the grammar is rebuilt and, if caching is enabled, written back
        """
        super().__init__(name="tokenize_and_classify", kind="classify")

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_en_itn.far")

        # cache hit: load and stop here
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
            return

        logging.info("Creating ClassifyFst grammars.")
        cardinal = CardinalFst()
        cardinal_graph = cardinal.fst

        fraction = FractionFst(cardinal)
        fraction_graph = fraction.fst

        ordinal = OrdinalFst()
        ordinal_graph = ordinal.fst

        decimal = DecimalFst(cardinal)
        decimal_graph = decimal.fst

        measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
        date_graph = DateFst(cardinal=cardinal).fst
        word_graph = WordFst().fst
        time_graph = TimeFst().fst
        money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
        whitelist_graph = WhiteListFst().fst
        punct_graph = PunctuationFst().fst
        electronic_graph = ElectronicFst().fst
        telephone_graph = TelephoneFst().fst

        # (tagger, weight) pairs in union order; the word tagger carries the largest weight
        weighted_taggers = [
            (whitelist_graph, 1.01),
            (time_graph, 1.05),
            (date_graph, 1.09),
            (decimal_graph, 1.08),
            (measure_graph, 1.1),
            (cardinal_graph, 1.1),
            (ordinal_graph, 1.1),
            (fraction_graph, 1.09),
            (money_graph, 1.07),
            (telephone_graph, 1.1),
            (electronic_graph, 1.1),
            (word_graph, 100),
        ]
        classify = None
        for tagger, weight in weighted_taggers:
            branch = pynutil.add_weight(tagger, weight)
            classify = branch if classify is None else classify | branch

        open_token = "tokens { "
        close_token = " }"
        punct = (pynutil.insert(open_token)
                 + pynutil.add_weight(punct_graph, weight=1.1)
                 + pynutil.insert(close_token))
        token = pynutil.insert(open_token) + classify + pynutil.insert(close_token)

        # a token with any number of punctuation tokens on either side
        leading_punct = pynini.closure(punct + pynutil.insert(" "))
        trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
        token_plus_punct = leading_punct + token + trailing_punct

        # one or more of those, joined by delete_extra_space, with delete_space at both ends
        sentence = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
        sentence = delete_space + sentence + delete_space

        self.fst = sentence.optimize()

        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Ejemplo n.º 30
0
 def __init__(self, deterministic: bool = True):
     """
     Builds the time verbalizer from the tagged fields hours, minutes, seconds, suffix and zone.

     Accepted shapes:
         hours minutes [suffix] [zone]          -> "<h> <m> ..."
         hours [zone]                           -> "<h> o'clock ..."
         hours suffix [zone]                    -> "<h> <suffix> ..."
         hours minutes seconds [suffix] [zone]  -> "<h> hours <m> minutes and <s> seconds ..."

     Args:
         deterministic: forwarded to the base class constructor
     """
     super().__init__(name="time", kind="verbalize", deterministic=deterministic)

     def unwrap(label):
         # delete `label`, optional space and the surrounding quotes; keep the non-empty value
         return (
             pynutil.delete(label)
             + delete_space
             + pynutil.delete("\"")
             + pynini.closure(NEMO_NOT_QUOTE, 1)
             + pynutil.delete("\"")
         )

     hour = unwrap("hours:")
     minute = unwrap("minutes:")
     suffix = unwrap("suffix:")
     zone = unwrap("zone:")
     second = unwrap("seconds:")

     optional_suffix = pynini.closure(delete_space + insert_space + suffix, 0, 1)
     optional_zone = pynini.closure(delete_space + insert_space + zone, 0, 1)

     # long form with explicit unit words
     spelled_out = (
         hour
         + pynutil.insert(" hours ")
         + delete_space
         + minute
         + pynutil.insert(" minutes and ")
         + delete_space
         + second
         + pynutil.insert(" seconds")
         + optional_suffix
         + optional_zone
     )
     # at the start of the string or after a space: drop "o " and singularize the unit after "one"
     cleanup = pynini.cdrewrite(
         pynutil.delete("o ")
         | pynini.cross("one minutes", "one minute")
         | pynini.cross("one seconds", "one second")
         | pynini.cross("one hours", "one hour"),
         pynini.union(" ", "[BOS]"),
         "",
         NEMO_SIGMA,
     )
     graph_hms = spelled_out @ cleanup

     hour_minute = hour + delete_space + insert_space + minute + optional_suffix + optional_zone
     hour_oclock = hour + insert_space + pynutil.insert("o'clock") + optional_zone
     hour_suffix = hour + delete_space + insert_space + suffix + optional_zone

     graph = hour_minute | hour_oclock | hour_suffix | graph_hms
     self.fst = self.delete_tokens(graph).optimize()