Exemple #1
0
def get_names():
    """
    Build the graph that matches common male and female names.

    Each name list is read from its label file and extended with an
    all-uppercase copy of the first column of every row, so a name is
    matched both as written in the file and in upper case.

    Returns:
        the optimized male-name graph unioned with the optimized female-name graph
    """

    def _labels_with_upper_case(relative_path):
        # Read the rows, then append an upper-cased single-column row for each one.
        rows = load_labels(get_abs_path(relative_path))
        upper_rows = [[row[0].upper()] for row in rows]
        rows.extend(upper_rows)
        return rows

    male_graph = pynini.string_map(
        _labels_with_upper_case("data/roman/male.tsv")).optimize()
    female_graph = pynini.string_map(
        _labels_with_upper_case("data/roman/female.tsv")).optimize()
    male_graph |= female_graph
    return male_graph
Exemple #2
0
 def _get_whitelist_non_deterministic_graph(
         file="data/whitelist_alternatives.tsv"):
     """
     Build a string-map graph from a two-column whitelist file.

     Every row contributes two mappings: one with both columns lower-cased
     and one with both columns exactly as they appear in the file.

     Args:
         file: whitelist path, resolved through get_abs_path

     Returns:
         pynini.string_map over the lower-cased pairs followed by the original pairs
     """
     lowered_pairs = []
     original_pairs = []
     for written, spoken in load_labels(get_abs_path(file)):
         lowered_pairs.append((written.lower(), spoken.lower()))
         original_pairs.append((written, spoken))
     return pynini.string_map(lowered_pairs + original_pairs)
Exemple #3
0
    def __init__(self, deterministic: bool = True):
        """
        Build the punctuation classifier.

        The punctuation inventory (stored in ``self.punct_marks``) is every
        code point in ``range(sys.maxunicode)`` whose Unicode category starts
        with "P", except "[" and "]", followed by a fixed string of ASCII
        symbols; anything found in the first column of
        data/whitelist/symbol.tsv is removed.

        ``self.graph`` is ``plurals._priority_union`` of an angle-bracket tag
        pattern ("<" ... ">", with an optional "/" right after "<" or right
        before ">") and a run of one or more punctuation marks.
        ``self.fst`` wraps that graph in ``name: "..."``.

        Args:
            deterministic: forwarded unchanged to the parent constructor
        """
        super().__init__(name="punctuation",
                         kind="classify",
                         deterministic=deterministic)

        ascii_symbols = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""
        excluded_symbols = ["[", "]"]

        # Scan the code point range for characters in a "P*" Unicode category.
        unicode_punct = []
        for code_point in range(sys.maxunicode):
            symbol = chr(code_point)
            if not category(symbol).startswith("P"):
                continue
            if symbol in excluded_symbols:
                continue
            unicode_punct.append(symbol)

        whitelisted = [
            row[0]
            for row in load_labels(get_abs_path("data/whitelist/symbol.tsv"))
        ]
        candidates = unicode_punct + list(ascii_symbols)
        self.punct_marks = [
            mark for mark in candidates if mark not in whitelisted
        ]

        one_or_more_marks = pynini.closure(pynini.union(*self.punct_marks), 1)

        def _tag_text():
            # One or more NEMO_NOT_SPACE symbols other than the angle brackets.
            return pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)

        text_then_optional_slash = _tag_text() + pynini.closure(
            pynini.accep("/"), 0, 1)
        slash_then_text = pynini.accep("/") + _tag_text()
        emphasis = (pynini.accep("<") +
                    (text_then_optional_slash | slash_then_text) +
                    pynini.accep(">"))

        self.graph = plurals._priority_union(emphasis, one_or_more_marks,
                                             NEMO_SIGMA)

        opening = pynutil.insert("name: \"")
        closing = pynutil.insert("\"")
        self.fst = (opening + self.graph + closing).optimize()
Exemple #4
0
    def get_serial_graph(self):
        """
        Finite state transducer for classifying serial.
            The serial is a combination of digits, letters and dashes, e.g.:
            c325-b -> tokens { cardinal { integer: "c three two five b" } }

        In deterministic mode letters are matched with NEMO_ALPHA and numbers
        with ``self.single_digits_graph``. Otherwise numbers use ``self.graph``,
        letters may additionally be rewritten through
        data/letter_pronunciation.tsv, and an optional trailing "s" (kept as
        "s" or rewritten to "es") is allowed.

        Returns:
            the optimized serial graph with a weight of 10 added
        """
        if self.deterministic:
            alpha = NEMO_ALPHA
            num_graph = self.single_digits_graph
        else:
            num_graph = self.graph
            letter_pronunciation = pynini.string_map(
                load_labels(get_abs_path("data/letter_pronunciation.tsv")))
            # Build a new FST with the constructive union instead of applying
            # `|=` to an alias of the module-level NEMO_ALPHA: an augmented
            # operator may update its left operand in place, which would leak
            # the letter pronunciations into every other user of NEMO_ALPHA.
            alpha = pynini.union(NEMO_ALPHA, letter_pronunciation)

        # A delimiter is either nothing in the input (a space is inserted) or a
        # "-" / "/" that is rewritten to a space.
        delimiter = insert_space | pynini.cross("-", " ") | pynini.cross(
            "/", " ")
        letter_num = pynini.closure(alpha + delimiter, 1) + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alpha
        num_delimiter_num = pynini.closure(num_graph + delimiter,
                                           1) + num_graph
        # Any number of further delimiter-separated letters or numbers.
        next_alpha_or_num = pynini.closure(delimiter + (alpha | num_graph))
        serial_graph = (letter_num | num_letter
                        | num_delimiter_num) + next_alpha_or_num
        if not self.deterministic:
            serial_graph += pynini.closure(
                pynini.accep("s") | pynini.cross("s", "es"), 0, 1)

        serial_graph.optimize()
        return pynutil.add_weight(serial_graph, 10)
Exemple #5
0
 def _get_whitelist_graph(input_case, file):
     """
     Build a string-map graph from the two-column whitelist at `file`.

     Args:
         input_case: when equal to "lower_cased" the first column is lower-cased
         file: path handed to load_labels as-is

     Returns:
         pynini.string_map over the (first column, second column) pairs
     """
     lower_keys = input_case == "lower_cased"
     pairs = [((written.lower() if lower_keys else written), spoken)
              for written, spoken in load_labels(file)]
     return pynini.string_map(pairs)
Exemple #6
0
 def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
     """
     Build a string-map graph from a two-column whitelist file.

     Args:
         input_case: when equal to "lower_cased" the first column is lower-cased
         file: whitelist path, resolved through get_abs_path

     Returns:
         pynini.string_map over the (first column, second column) pairs
     """
     pairs = []
     for written, spoken in load_labels(get_abs_path(file)):
         if input_case == "lower_cased":
             written = written.lower()
         pairs.append((written, spoken))
     return pynini.string_map(pairs)
Exemple #7
0
        def _get_whitelist_graph(input_case, file, keep_punct_add_end: bool = False):
            """
            Build a string-map graph from the two-column whitelist at `file`.

            Args:
                input_case: when equal to "lower_cased" the first column is lower-cased
                file: path handed to load_labels as-is
                keep_punct_add_end: when true, the rows returned by
                    augment_labels_with_punct_at_end are appended to the list

            Returns:
                pynini.string_map over the resulting rows
            """
            lower_keys = input_case == "lower_cased"
            rows = []
            for written, spoken in load_labels(file):
                rows.append([written.lower() if lower_keys else written, spoken])

            if keep_punct_add_end:
                rows.extend(augment_labels_with_punct_at_end(rows))

            return pynini.string_map(rows)
Exemple #8
0
def prepare_labels_for_insertion(file_path: str):
    """
    Read a label file and build one insertion graph per key.

    Args:
        file_path: path to a file whose rows unpack into a (key, label) pair

    Returns:
        a defaultdict mapping every key to `insert_space` concatenated with a
        union of fsts, each of which inserts one of the labels listed for that key
    """
    mapping = defaultdict(list)
    for key, label in load_labels(file_path):
        mapping[key].append(label)

    # Replace each list of labels with its insertion graph (keys are unchanged).
    for key, endings in list(mapping.items()):
        alternatives = [pynutil.insert(ending) for ending in endings]
        mapping[key] = insert_space + pynini.union(*alternatives)
    return mapping
Exemple #9
0
def get_formats(input_f, input_case="cased", is_default=True):
    """
    Load abbreviations and add extra accepted spellings for each of them.

    For every (written, spoken) row three variants are appended after the
    original rows: written + "." -> spoken, capitalized written ->
    capitalized spoken, and capitalized written + "." -> capitalized spoken
    (e.g. "dr." -> doctor, "Dr" -> Doctor, "Dr." -> Doctor).

    Args:
        input_f: path handed to load_labels
        input_case: when equal to "lower_cased" the written form is lower-cased
            before the variants are generated
        is_default: when false, every output is wrapped as
            |raw_start|<written>|raw_end||norm_start|<spoken>|norm_end|

    Returns:
        pynini.string_map over the original rows plus the generated variants
    """
    entries = load_labels(input_f)
    lower_written = input_case == "lower_cased"

    variants = []
    for written, spoken in entries:
        if lower_written:
            written = written.lower()
        # Only the first character is upper-cased; the rest is left untouched.
        written_cap = written[0].upper() + written[1:]
        spoken_cap = spoken[0].upper() + spoken[1:]
        variants += [
            (f"{written}.", spoken),
            (f"{written_cap}", f"{spoken_cap}"),
            (f"{written_cap}.", f"{spoken_cap}"),
        ]
    entries.extend(variants)

    if not is_default:
        tagged = []
        for written, spoken in entries:
            tagged.append((written, f"|raw_start|{written}|raw_end||norm_start|{spoken}|norm_end|"))
        entries = tagged

    return pynini.string_map(entries)
Exemple #10
0
def prepare_labels_for_insertion(file_path: str):
    """
    Read a label file and build one weighted insertion graph per label type.

    Args:
        file_path: path to a file (3 columns: a label type e.g.
        "@@decimal_delimiter@@", a label e.g. "целого", and a weight e.g. "0.1").

    Returns:
        a defaultdict mapping each label type to an optimized fst:
        `insert_space` concatenated with a union of insertions, one per label,
        each carrying the weight given in the file.
    """
    grouped = defaultdict(list)
    for label_type, label, weight in load_labels(file_path):
        grouped[label_type].append((label, weight))

    # Replace each list of (label, weight) pairs with its insertion graph.
    for label_type, weighted_labels in list(grouped.items()):
        insertions = []
        for label, weight in weighted_labels:
            insertions.append(
                pynutil.add_weight(pynutil.insert(label), weight))
        graph = insert_space + pynini.union(*insertions)
        grouped[label_type] = graph.optimize()
    return grouped
Exemple #11
0
    def __init__(self, deterministic: bool = True, lm: bool = False):
        """
        Build the classifier for roman numerals.

        The final graph is a union of:
          * a name (see get_names) + " " + a numeral taken from the first
            ``ordinal_limit`` rows of roman_to_spoken.tsv, tagged ``key_the_ordinal``
          * a key word from key_word.tsv (as written, capitalized, or
            upper-cased) + " " + any numeral, tagged ``key_cardinal``
          * a stand-alone numeral tagged ``default_cardinal``; this branch is
            built when ``deterministic`` is true, or when both ``deterministic``
            and ``lm`` are false, and is omitted otherwise
          * a stand-alone numeral of at least three letters followed by "th",
            tagged ``default_ordinal``

        Args:
            deterministic: selects the restricted stand-alone cardinal branch
            lm: with ``deterministic=False``, disables the stand-alone cardinal
                branch altogether
        """
        super().__init__(name="roman",
                         kind="classify",
                         deterministic=deterministic)

        roman_dict = load_labels(
            get_abs_path("data/roman/roman_to_spoken.tsv"))
        default_graph = pynini.string_map(roman_dict).optimize()
        default_graph = pynutil.insert(
            "integer: \"") + default_graph + pynutil.insert("\"")
        ordinal_limit = 19

        graph_teens = pynini.string_map(
            [x[0] for x in roman_dict[:ordinal_limit]]).optimize()

        # roman numerals up to ordinal_limit with a preceding name are converted to ordinal form
        names = get_names()
        graph = (pynutil.insert("key_the_ordinal: \"") + names +
                 pynutil.insert("\"") + pynini.accep(" ") +
                 graph_teens @ default_graph).optimize()

        # single symbol roman numerals with preceding key words (multiple formats) are converted to cardinal form
        key_words = []
        for k_word in load_labels(get_abs_path("data/roman/key_word.tsv")):
            key_words.append(k_word)
            key_words.append([k_word[0][0].upper() + k_word[0][1:]])
            key_words.append([k_word[0].upper()])

        key_words = pynini.string_map(key_words).optimize()
        graph |= (pynutil.insert("key_cardinal: \"") + key_words +
                  pynutil.insert("\"") + pynini.accep(" ") +
                  default_graph).optimize()

        # Stays None when deterministic is False and lm is True: neither branch
        # below applies, and the name was previously left unbound in that case,
        # which made the final union raise UnboundLocalError.
        roman_to_cardinal = None
        if deterministic:
            # two digit roman numerals up to 49
            roman_to_cardinal = pynini.compose(
                pynini.closure(NEMO_ALPHA, 2),
                (pynutil.insert("default_cardinal: \"default\" ") +
                 (pynini.string_map([x[0] for x in roman_dict[:50]
                                     ]).optimize()) @ default_graph),
            )
        elif not lm:
            # two or more digit roman numerals
            roman_to_cardinal = pynini.compose(
                pynini.difference(NEMO_SIGMA, "I"),
                (pynutil.insert("default_cardinal: \"default\" integer: \"") +
                 pynini.string_map(roman_dict).optimize() +
                 pynutil.insert("\"")),
            ).optimize()

        # convert three digit roman or up with suffix to ordinal
        roman_to_ordinal = pynini.compose(
            pynini.closure(NEMO_ALPHA, 3),
            (pynutil.insert("default_ordinal: \"default\" ") +
             graph_teens @ default_graph + pynutil.delete("th")),
        )

        if roman_to_cardinal is None:
            graph |= roman_to_ordinal
        else:
            graph |= roman_to_cardinal | roman_to_ordinal
        graph = self.add_tokens(graph)

        self.fst = graph.optimize()
Exemple #12
0
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        """
        Build the whitelist classifier.

        The graph is assembled as a union of whitelist files, dotted
        upper-case letter sequences, and a state pattern preceded by a comma.
        Deterministic mode adds "St"-plus-name; non-deterministic mode adds the
        alternatives files and measurement units instead.

        Args:
            input_case: when equal to "lower_cased", the first column of the
                whitelist files (and of the state rows) is lower-cased
            deterministic: selects which of the optional sources above are used
            input_file: optional path to an extra whitelist; in deterministic
                mode it REPLACES the graph built so far, otherwise it is
                unioned into it
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _get_whitelist_graph(input_case, file, keep_punct_add_end: bool = False):
            # Turn the two-column file into a string map, optionally lower-casing
            # the first column and optionally appending the rows produced by
            # augment_labels_with_punct_at_end.
            whitelist = load_labels(file)
            if input_case == "lower_cased":
                whitelist = [[x.lower(), y] for x, y in whitelist]
            else:
                whitelist = [[x, y] for x, y in whitelist]

            if keep_punct_add_end:
                whitelist.extend(augment_labels_with_punct_at_end(whitelist))

            graph = pynini.string_map(whitelist)
            return graph

        graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist/tts.tsv"))
        graph |= _get_whitelist_graph(input_case, get_abs_path("data/whitelist/symbol.tsv"))

        if deterministic:
            # "st" / "St" / "ST", any number of periods (deleted), a space and a
            # name from get_names() -> "Saint <name>"
            names = get_names()
            graph |= (
                pynini.cross(pynini.union("st", "St", "ST"), "Saint")
                + pynini.closure(pynutil.delete("."))
                + pynini.accep(" ")
                + names
            )
        else:
            graph |= _get_whitelist_graph(
                input_case, get_abs_path("data/whitelist/alternatives.tsv"), keep_punct_add_end=True
            )

        # An upper-case letter followed by at least two more, each preceded by the
        # separator x ("." or ". "), which is deleted; one trailing "." is also
        # deleted when present.
        for x in [".", ". "]:
            graph |= (
                NEMO_UPPER
                + pynini.closure(pynutil.delete(x) + NEMO_UPPER, 2)
                + pynini.closure(pynutil.delete("."), 0, 1)
            )

        if not deterministic:
            # NOTE(review): get_formats is called without input_case, so its own
            # default applies here regardless of this constructor's input_case.
            multiple_forms_whitelist_graph = get_formats(get_abs_path("data/whitelist/alternatives_all_format.tsv"))
            graph |= multiple_forms_whitelist_graph

            graph_unit = pynini.string_file(get_abs_path("data/measure/unit.tsv")) | pynini.string_file(
                get_abs_path("data/measure/unit_alternatives.tsv")
            )
            graph_unit_plural = graph_unit @ SINGULAR_TO_PLURAL
            # only inputs of at least three NEMO_CHAR symbols are accepted as units
            units_graph = pynini.compose(NEMO_CHAR ** (3, ...), convert_space(graph_unit | graph_unit_plural))
            graph |= units_graph

        # convert to states only if comma is present before the abbreviation to avoid converting all caps words,
        # e.g. "IN", "OH", "OK"
        # TODO or only exclude above?
        states = load_labels(get_abs_path("data/address/state.tsv"))
        additional_options = []
        for x, y in states:
            if input_case == "lower_cased":
                x = x.lower()
            # variant of y with a period after its first character
            additional_options.append((x, f"{y[0]}.{y[1:]}"))
            if not deterministic:
                # same variant with an additional trailing period
                additional_options.append((x, f"{y[0]}.{y[1:]}."))

        states.extend(additional_options)
        state_graph = pynini.string_map(states)
        # The map is inverted, so the second column of the rows is what is matched
        # in the input (after one or more non-space symbols and a comma).
        graph |= pynini.closure(NEMO_NOT_SPACE, 1) + pynini.union(", ", ",") + pynini.invert(state_graph).optimize()

        if input_file:
            whitelist_provided = _get_whitelist_graph(input_case, input_file)
            if not deterministic:
                graph |= whitelist_provided
            else:
                # deterministic mode: the provided whitelist replaces everything built above
                graph = whitelist_provided

        self.graph = (convert_space(graph)).optimize()

        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
Exemple #13
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """
        Build the money classifier from data/currency/currency.tsv.

        In deterministic mode the graph emits the currency field first,
        followed by either an integer part or the decimal graph. In
        non-deterministic mode that graph is discarded and replaced by a
        per-currency union of three alternatives (integer, decimal with the
        fraction read through ``decimal.graph_fractional``, and a re-ordered
        form of the default decimal graph).

        Args:
            cardinal: grammar whose ``graph`` attribute verbalizes integers
            decimal: grammar providing ``final_graph_wo_negative``,
                ``graph_integer`` and ``graph_fractional``
            deterministic: selects between the two constructions above
        """
        super().__init__(name="money",
                         kind="classify",
                         deterministic=deterministic)
        cardinal_graph = cardinal.graph
        graph_decimal_final = decimal.final_graph_wo_negative

        unit_singular = pynini.string_file(
            get_abs_path("data/currency/currency.tsv"))
        # the plural form must be derived before unit_singular is rebound below
        unit_plural = convert_space(unit_singular @ SINGULAR_TO_PLURAL)
        unit_singular = convert_space(unit_singular)

        graph_unit_singular = pynutil.insert(
            "currency: \"") + unit_singular + pynutil.insert("\"")
        graph_unit_plural = pynutil.insert(
            "currency: \"") + unit_plural + pynutil.insert("\"")

        # singular currency followed by exactly "1" -> integer_part: "one"
        singular_graph = (graph_unit_singular +
                          pynutil.insert(" integer_part: \"") +
                          pynini.cross("1", "one") + pynutil.insert("\""))

        graph_decimal = graph_unit_plural + insert_space + graph_decimal_final

        if deterministic:
            # any input other than the string "1" goes through the cardinal graph
            graph_integer = (graph_unit_plural +
                             pynutil.insert(" integer_part: \"") +
                             ((NEMO_SIGMA - "1") @ cardinal_graph) +
                             pynutil.insert("\""))
        else:
            graph_integer = (
                graph_unit_plural + pynutil.insert(" integer_part: \"") +
                ((NEMO_SIGMA - "1")
                 @ (get_hundreds_graph(deterministic) | cardinal_graph)) +
                pynutil.insert("\""))
            graph_decimal |= singular_graph + insert_space + graph_decimal_final

        graph_integer |= singular_graph

        final_graph = graph_integer | graph_decimal

        if not deterministic:
            # final_graph built above is replaced at the end of this branch;
            # graph_decimal is still used inside the loop.
            currencies = load_labels(
                get_abs_path("data/currency/currency.tsv"))
            zero_graph = pynini.cross("0", "") | pynini.accep("0")
            # add minor currency part only when there are two digits after the point
            # .01 -> {zero one cent, one cent}, .05 -> {oh five, five cents}
            two_digits_fractional_part = (
                NEMO_SIGMA + pynini.closure(NEMO_DIGIT) +
                ((pynini.accep(".") + (NEMO_DIGIT**(2) | zero_graph +
                                       (NEMO_DIGIT - "0")))
                 | pynutil.delete(".") +
                 pynini.cross(pynini.closure("0", 1), "")))

            # accumulators for the per-currency unions built in the loop below
            integer_graph = None
            decimal_graph_with_minor = None
            decimal_graph_default = None

            for curr_symbol, curr_name in currencies:
                curr_symbol_graph = pynutil.delete(curr_symbol)
                graph_end = pynutil.insert(" currency: \"" + curr_symbol +
                                           "\"")
                preserve_order = pynutil.insert(" preserve_order: True")
                integer_part = decimal.graph_integer + graph_end + preserve_order

                # "$4" -> 'integer_part: "four" currency: "$" preserve_order: True' -> four dollars
                integer_graph_curr = curr_symbol_graph + integer_part
                # remove fractional part if it contains only zeros
                # "$4.00" -> 'integer_part: "four" currency: "$" preserve_order: True' -> four dollars
                integer_graph_curr |= pynini.compose(
                    two_digits_fractional_part, integer_graph_curr)
                decimal_graph_with_minor_curr = (
                    curr_symbol_graph + pynini.closure(integer_part, 0, 1) +
                    pynini.cross(".", " ") + decimal.graph_fractional +
                    graph_end)

                # The three alternatives below rewrite the OUTPUT of graph_decimal
                # (they are composed with it further down): the leading currency
                # field is deleted and re-inserted at the end.
                # "$.5" -> 'fractional_part: "five" currency: "dollars"' -> point five dollars
                decimal_graph_default_curr = (
                    pynutil.delete("currency: \"" +
                                   pynini.compose(curr_symbol, unit_plural) +
                                   "\"") + delete_space +
                    pynini.accep("fractional_part") + NEMO_SIGMA +
                    pynutil.insert(" currency: \"" +
                                   pynini.compose(curr_symbol, unit_plural) +
                                   "\""))

                # "$4.5" -> 'integer_part: "four" fractional_part: "five" currency: "dollars"' -> "four point five dollars"
                decimal_graph_default_curr |= (
                    pynutil.delete("currency: \"" + curr_name +
                                   pynini.closure(NEMO_NOT_QUOTE) + "\"") +
                    delete_space + pynini.accep("integer_part") + NEMO_SIGMA +
                    pynini.accep("fractional_part") + NEMO_SIGMA +
                    pynutil.insert(" currency: \"" +
                                   pynini.compose(curr_symbol, unit_plural) +
                                   "\""))

                # "£4 billion" -> 'integer_part: "four" quantity: "billion" currency: "pounds"' -> "four billion pounds"
                decimal_graph_default_curr |= (
                    pynutil.delete("currency: \"") + pynutil.delete(
                        rewrite.rewrite_lattice(
                            curr_symbol,
                            pynini.compose(curr_symbol, unit_plural)) + "\" ")
                    + pynini.difference(NEMO_SIGMA, "fractional_part") +
                    pynutil.insert(" currency: \"" +
                                   pynini.compose(curr_symbol, unit_plural) +
                                   "\""))

                # restrict the minor-unit reading to inputs matching two_digits_fractional_part
                decimal_graph_with_minor_curr = pynini.compose(
                    two_digits_fractional_part, decimal_graph_with_minor_curr)
                decimal_graph_default_curr = pynini.compose(
                    graph_decimal, decimal_graph_default_curr)

                # fold this currency's graphs into the running unions
                integer_graph = (integer_graph_curr
                                 if integer_graph is None else pynini.union(
                                     integer_graph, integer_graph_curr))
                decimal_graph_with_minor = (decimal_graph_with_minor_curr
                                            if decimal_graph_with_minor is None
                                            else pynini.union(
                                                decimal_graph_with_minor,
                                                decimal_graph_with_minor_curr))
                decimal_graph_default = (
                    decimal_graph_default_curr
                    if decimal_graph_default is None else pynini.union(
                        decimal_graph_default, decimal_graph_default_curr))

            # NOTE(review): the accumulators are still None if currency.tsv has no rows
            final_graph = decimal_graph_with_minor | decimal_graph_default | integer_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemple #14
0
 def _load_roman(file: str):
     """
     Build a string-map graph from a two-column roman numeral file.

     Every row is mapped twice: once with the first column as written and
     once with the first column upper-cased.

     Args:
         file: path resolved through get_abs_path

     Returns:
         pynini.string_map over the as-written pairs followed by the upper-cased pairs
     """
     as_written = []
     upper_cased = []
     for numeral, value in load_labels(get_abs_path(file)):
         as_written.append((numeral, value))
         upper_cased.append((numeral.upper(), value))
     return pynini.string_map(as_written + upper_cased)
Exemple #15
0
)
from nemo_text_processing.text_normalization.en.utils import (
    augment_labels_with_punct_at_end,
    get_abs_path,
    load_labels,
)

# pynini is treated as optional: the module-level graphs are built only when it
# can be imported, and PYNINI_AVAILABLE records whether that succeeded.
try:
    import pynini
    from pynini.lib import pynutil
    from pynini.examples import plurals

    graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/number/teen.tsv"))).optimize()
    graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
    ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/number/ty.tsv"))).optimize()
    # year suffixes, plus the variants produced by augment_labels_with_punct_at_end
    year_suffix = load_labels(get_abs_path("data/date/year_suffix.tsv"))
    year_suffix.extend(augment_labels_with_punct_at_end(year_suffix))
    year_suffix = pynini.string_map(year_suffix).optimize()

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Add placeholders for global variables
    graph_teen = None
    graph_digit = None
    ties_graph = None
    year_suffix = None

    # The import failed, so the flag must report pynini as unavailable.
    PYNINI_AVAILABLE = False


def get_ties_graph(deterministic: bool = True):
    """
Exemple #16
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """
        Build the money classifier with major and minor currency units.

        The graph is a union of:
          * major currency + integer (an all-zero fractional part is deleted)
          * major currency + decimal number
          * per currency symbol: integer + major unit, "." and a two-digit
            fraction + minor unit, emitted with ``preserve_order: true``
          * in non-deterministic mode, additional re-ordered variants

        Args:
            cardinal: grammar providing ``graph_with_and`` and
                ``graph_hundred_component_at_least_one_none_zero_digit``
            decimal: grammar providing ``final_graph_wo_negative_w_abbr``
            deterministic: when false, the extra re-ordered variants are added

        NOTE(review): ``maj_singular``, ``min_singular`` and ``min_plural`` are
        not defined inside this method; presumably they are module-level
        graphs - confirm against the rest of the file.
        """
        super().__init__(name="money",
                         kind="classify",
                         deterministic=deterministic)
        cardinal_graph = cardinal.graph_with_and
        graph_decimal_final = decimal.final_graph_wo_negative_w_abbr

        # rows of the major-currency file; only the first column (the symbol) is used below
        maj_singular_labels = load_labels(
            get_abs_path("data/money/currency_major.tsv"))
        maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
        maj_unit_singular = convert_space(maj_singular)

        graph_maj_singular = pynutil.insert(
            "currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
        graph_maj_plural = pynutil.insert(
            "currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")

        # optionally delete "." followed by one or more zeros
        optional_delete_fractional_zeros = pynini.closure(
            pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1)

        graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross(
            "1", "one") + pynutil.insert("\"")
        # only for decimals where third decimal after comma is non-zero or with quantity
        decimal_delete_last_zeros = (
            pynini.closure(NEMO_DIGIT | pynutil.delete(",")) +
            pynini.accep(".") + pynini.closure(NEMO_DIGIT, 2) +
            (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0")))
        # anything that ends in a NEMO_ALPHA symbol
        decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA

        graph_decimal = (graph_maj_plural + insert_space +
                         (decimal_delete_last_zeros | decimal_with_quantity)
                         @ graph_decimal_final)

        # any input other than the string "1" goes through the cardinal graph
        graph_integer = (pynutil.insert("integer_part: \"") +
                         ((NEMO_SIGMA - "1") @ cardinal_graph) +
                         pynutil.insert("\""))

        graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
        graph_integer_only |= graph_maj_plural + insert_space + graph_integer

        final_graph = (graph_integer_only +
                       optional_delete_fractional_zeros) | graph_decimal

        # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
        # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
        # not accepted: 002, 00, 0,
        two_digits_fractional_part = (
            pynini.closure(NEMO_DIGIT) +
            (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0"))) @ (
                (pynutil.delete("0") + (NEMO_DIGIT - "0"))
                | ((NEMO_DIGIT - "0") + pynutil.insert("0"))
                | ((NEMO_DIGIT - "0") + NEMO_DIGIT))

        graph_min_singular = pynutil.insert(
            " currency_min: \"") + min_singular + pynutil.insert("\"")
        graph_min_plural = pynutil.insert(
            " currency_min: \"") + min_plural + pynutil.insert("\"")
        # format ** dollars ** cent
        # accumulators for the per-currency unions built in the loop below
        decimal_graph_with_minor = None
        integer_graph_reordered = None
        decimal_default_reordered = None
        for curr_symbol, _ in maj_singular_labels:
            preserve_order = pynutil.insert(" preserve_order: true")
            # integer followed by the major unit obtained by feeding the (inserted)
            # currency symbol through the plural / singular major-unit graph
            integer_plus_maj = graph_integer + insert_space + pynutil.insert(
                curr_symbol) @ graph_maj_plural
            integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(
                curr_symbol) @ graph_maj_singular

            # digits with "," separators (deleted), starting with a non-zero digit
            integer_plus_maj_with_comma = pynini.compose(
                NEMO_DIGIT - "0" +
                pynini.closure(NEMO_DIGIT | pynutil.delete(",")),
                integer_plus_maj)
            # any digit string except the single string "0"
            integer_plus_maj = pynini.compose(
                pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
            integer_plus_maj |= integer_plus_maj_with_comma

            # fraction that normalizes to exactly "1" -> singular minor unit
            graph_fractional_one = two_digits_fractional_part @ pynini.cross(
                "1", "one")
            graph_fractional_one = pynutil.insert(
                "fractional_part: \"") + graph_fractional_one + pynutil.insert(
                    "\"")
            # every other one- or two-digit fraction -> plural minor unit
            graph_fractional = (two_digits_fractional_part @ (
                pynini.closure(NEMO_DIGIT, 1, 2) - "1"
            ) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit)
            graph_fractional = pynutil.insert(
                "fractional_part: \"") + graph_fractional + pynutil.insert(
                    "\"")

            fractional_plus_min = graph_fractional + insert_space + pynutil.insert(
                curr_symbol) @ graph_min_plural
            fractional_plus_min |= (
                graph_fractional_one + insert_space +
                pynutil.insert(curr_symbol) @ graph_min_singular)

            decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(
                ".", " ") + fractional_plus_min

            if not deterministic:
                # variant without a minor unit after the fraction, at a small extra weight
                decimal_graph_with_minor_curr |= pynutil.add_weight(
                    integer_plus_maj + pynini.cross(".", " ") +
                    pynutil.insert("fractional_part: \"") +
                    two_digits_fractional_part @ cardinal.
                    graph_hundred_component_at_least_one_none_zero_digit +
                    pynutil.insert("\""),
                    weight=0.0001,
                )
                # used further down in this iteration's non-deterministic block
                default_fraction_graph = (
                    decimal_delete_last_zeros
                    | decimal_with_quantity) @ graph_decimal_final
            # fraction only: an optional "0" and the "." are deleted
            decimal_graph_with_minor_curr |= (
                pynini.closure(pynutil.delete("0"), 0, 1) +
                pynutil.delete(".") + fractional_plus_min)
            decimal_graph_with_minor_curr = (pynutil.delete(curr_symbol) +
                                             decimal_graph_with_minor_curr +
                                             preserve_order)

            # fold this currency's graph into the running union
            decimal_graph_with_minor = (
                decimal_graph_with_minor_curr
                if decimal_graph_with_minor is None else pynini.union(
                    decimal_graph_with_minor,
                    decimal_graph_with_minor_curr).optimize())

            if not deterministic:
                integer_graph_reordered_curr = (pynutil.delete(curr_symbol) +
                                                integer_plus_maj +
                                                preserve_order).optimize()

                integer_graph_reordered = (
                    integer_graph_reordered_curr
                    if integer_graph_reordered is None else pynini.union(
                        integer_graph_reordered,
                        integer_graph_reordered_curr).optimize())
                decimal_default_reordered_curr = (
                    pynutil.delete(curr_symbol) + default_fraction_graph +
                    insert_space +
                    pynutil.insert(curr_symbol) @ graph_maj_plural)

                decimal_default_reordered = (
                    decimal_default_reordered_curr
                    if decimal_default_reordered is None else pynini.union(
                        decimal_default_reordered,
                        decimal_default_reordered_curr)).optimize()

        # weight for SH
        # NOTE(review): decimal_graph_with_minor is still None if the label file has no rows
        final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.0001)

        if not deterministic:
            final_graph |= integer_graph_reordered | decimal_default_reordered
            # to handle "$2.00" cases
            final_graph |= pynini.compose(
                NEMO_SIGMA + pynutil.delete(".") +
                pynini.closure(pynutil.delete("0"), 1),
                integer_graph_reordered)
        final_graph = self.add_tokens(final_graph.optimize())
        self.fst = final_graph.optimize()
Exemple #17
0
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        super().__init__(name="date",
                         kind="classify",
                         deterministic=deterministic)

        month_graph = pynini.string_file(
            get_abs_path("data/months/names.tsv")).optimize()
        month_graph |= (TO_LOWER + pynini.closure(NEMO_CHAR)) @ month_graph
        month_abbr_graph = pynini.string_file(
            get_abs_path("data/months/abbr.tsv")).optimize()
        month_abbr_graph = (month_abbr_graph |
                            (TO_LOWER + pynini.closure(NEMO_CHAR))
                            @ month_abbr_graph) + pynini.closure(
                                pynutil.delete("."), 0, 1)
        month_graph |= month_abbr_graph

        # to support all caps names
        names_all_caps = [[
            x[0].upper()
        ] for x in load_labels(get_abs_path("data/months/names.tsv"))]
        abbr_all_caps = [
            (x.upper(), y)
            for x, y in load_labels(get_abs_path("data/months/abbr.tsv"))
        ]
        month_graph |= pynini.string_map(names_all_caps) | (
            pynini.string_map(abbr_all_caps) +
            pynini.closure(pynutil.delete("."), 0, 1))

        month_numbers_graph = pynini.string_file(
            get_abs_path("data/months/numbers.tsv")).optimize()
        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        year_graph = _get_year_graph(deterministic)

        YEAR_WEIGHT = 0.001
        year_graph_standalone = (pynutil.insert("year: \"") +
                                 pynutil.add_weight(year_graph, YEAR_WEIGHT) +
                                 pynutil.insert("\""))

        month_graph = pynutil.insert(
            "month: \"") + month_graph + pynutil.insert("\"")
        month_numbers_graph = pynutil.insert(
            "month: \"") + month_numbers_graph + pynutil.insert("\"")

        day_graph = (pynutil.insert("day: \"") +
                     ((pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT)
                     @ cardinal_graph + pynutil.insert("\""))
        optional_day_graph = pynini.closure(delete_extra_space + day_graph, 0,
                                            1)

        two_digit_year = NEMO_DIGIT**(2) @ (cardinal.single_digits_graph
                                            | cardinal_graph)
        two_digit_year = pynutil.insert(
            "year: \"") + two_digit_year + pynutil.insert("\"")

        year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert(
            "\"")
        optional_graph_year = pynini.closure(
            delete_extra_space + year_graph,
            0,
            1,
        )
        graph_mdy = (month_graph + optional_day_graph + delete_space +
                     pynini.closure(pynutil.delete(","), 0, 1) +
                     optional_graph_year)

        delete_sep = pynutil.delete(pynini.union("-", "/", "."))
        graph_mdy |= (month_numbers_graph + delete_sep + insert_space +
                      pynini.closure(pynutil.delete("0"), 0, 1) + day_graph +
                      delete_sep + insert_space +
                      (year_graph | two_digit_year))

        graph_dmy = (day_graph + delete_extra_space + month_graph +
                     pynini.closure(pynutil.delete(","), 0, 1) +
                     optional_graph_year)
        graph_ymd = ((year_graph | two_digit_year) + delete_sep +
                     insert_space + month_numbers_graph + delete_sep +
                     insert_space + pynini.closure(pynutil.delete("0"), 0, 1) +
                     day_graph)

        final_graph = graph_mdy | graph_dmy
        if deterministic:
            final_graph += pynutil.insert(" preserve_order: true")
        else:
            final_graph += pynini.closure(
                pynutil.insert(" preserve_order: true"), 0, 1)
        final_graph |= graph_ymd | year_graph_standalone

        if not deterministic:
            ymd_to_mdy_graph = None
            mdy_to_dmy_graph = None

            for month in [
                    x[0]
                    for x in load_labels(get_abs_path("data/months/names.tsv"))
            ]:
                for day in [
                        x[0] for x in load_labels(
                            get_abs_path("data/months/days.tsv"))
                ]:
                    ymd_to_mdy_curr = (
                        pynutil.insert("month: \"" + month + "\" day: \"" +
                                       day + "\" ") + pynini.accep('year:') +
                        NEMO_SIGMA + pynutil.delete(" month: \"" + month +
                                                    "\" day: \"" + day + "\""))

                    # YY-MM-DD -> MM-DD-YY
                    ymd_to_mdy_curr = pynini.compose(final_graph,
                                                     ymd_to_mdy_curr)
                    ymd_to_mdy_graph = (
                        ymd_to_mdy_curr if ymd_to_mdy_graph is None else
                        pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph))

                    mdy_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" +
                                       month + "\" ") +
                        pynutil.delete("month: \"" + month + "\" day: \"" +
                                       day + "\" ") + pynini.accep('year:') +
                        NEMO_SIGMA)

                    # pynini.compose(ymd_to_mdy_curr, mdy_to_dmy_curr) to handle:
                    # YY-MM-DD (input format) -> MM-DD-YY (intermediate ymd_to_mdy_curr representation) -> DD-MM-YY
                    # '2000-01-05' -> 'day: "five" month: "january" year: "two thousand"'
                    # pynini.compose(final_graph, mdy_to_dmy_curr) to handle:
                    # MM-DD-YY (input format) -> DD-MM-YY
                    mdy_to_dmy_curr = pynini.compose(
                        ymd_to_mdy_curr, mdy_to_dmy_curr) | pynini.compose(
                            final_graph, mdy_to_dmy_curr)
                    mdy_to_dmy_graph = (
                        mdy_to_dmy_curr if mdy_to_dmy_graph is None else
                        pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph))

            final_graph |= ymd_to_mdy_graph | mdy_to_dmy_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemple #18
0
    def get_address_graph(self, cardinal):
        """
        Finite state transducer for classifying a street address.
            The address is a house number, an optional compass direction, street words that end
            with an entry of data/address/address_word.tsv and, optionally, a city, a state
            and a five digit zip code, e.g.:
            2788 San Tomas Expy, Santa Clara, CA 95051 ->
                units: "address" cardinal
                { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
                 preserve_order: true

        Args:
            cardinal: cardinal tagger; its graph, single_digits_graph and
                graph_hundred_component_at_least_one_none_zero_digit verbalize the numbers

        Returns:
            the address graph; the tags shown in the example above are not inserted by this method
        """
        # ordinal street names: tag with the ordinal tagger, then pass the result through the ordinal verbalizer
        verbalize_ordinal = OrdinalVerbalizer().graph
        tag_ordinal = OrdinalTagger(cardinal=cardinal).graph
        tagged_ordinal = (pynutil.insert("integer: \"") + tag_ordinal +
                          pynutil.insert("\""))
        ordinal_num = pynini.compose(tagged_ordinal, verbalize_ordinal)

        # three and four digit house numbers are read as two groups: the first one or two digits,
        # then the last two digits (optionally starting with "zero ")
        hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit
        leading_group = NEMO_DIGIT**(1, 2) @ hundred_component
        trailing_group = NEMO_DIGIT**2 @ (
            pynini.closure(pynini.cross("0", "zero "), 0, 1) +
            hundred_component)
        grouped_num = leading_group + (insert_space + trailing_group)
        grouped_num = pynini.compose(NEMO_DIGIT**(3, 4), grouped_num)
        # every other number falls back to the plain cardinal graph
        address_num = plurals._priority_union(grouped_num, cardinal.graph,
                                              NEMO_SIGMA)

        # optional " E" / " S." style direction in front of the street name
        compass = pynini.union(
            pynini.cross("E", "East"),
            pynini.cross("S", "South"),
            pynini.cross("W", "West"),
            pynini.cross("N", "North"),
        )
        compass = compass + pynini.closure(pynutil.delete("."), 0, 1)
        direction = pynini.closure(pynini.accep(NEMO_SPACE) + compass, 0, 1)

        # street: first word (ordinal or capitalized word), any further capitalized words, known address word
        street_suffix = get_formats(
            get_abs_path("data/address/address_word.tsv"))
        first_word = (pynini.closure(ordinal_num, 0, 1)
                      | NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1))
        more_words = pynini.closure(NEMO_UPPER + pynini.closure(NEMO_ALPHA) +
                                    NEMO_SPACE)
        address_words = (pynini.accep(NEMO_SPACE) + first_word + NEMO_SPACE +
                         more_words + street_suffix)

        # optional ", <city>"
        city_name = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
        city = pynini.closure(
            pynini.accep(",") + pynini.accep(NEMO_SPACE) + city_name, 0, 1)

        # optional ", <state>": every label also gets a variant with a period after the first character
        states = load_labels(get_abs_path("data/address/state.tsv"))
        states.extend([(x, f"{y[0]}.{y[1:]}") for x, y in states])
        state = pynini.invert(pynini.string_map(states))
        state = pynini.closure(
            pynini.accep(",") + pynini.accep(NEMO_SPACE) + state, 0, 1)

        # optional zip code: exactly five digits read one by one
        five_digits = pynini.compose(NEMO_DIGIT**5,
                                     cardinal.single_digits_graph)
        zip_code = pynini.closure(
            pynini.closure(pynini.accep(","), 0, 1) +
            pynini.accep(NEMO_SPACE) + five_digits,
            0,
            1,
        )

        street = address_num + direction + address_words
        address = street + pynini.closure(city + state + zip_code, 0, 1)
        # street only, with an optional trailing period that is dropped
        address |= street + pynini.closure(pynini.cross(".", ""), 0, 1)

        return address
Exemple #19
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """
        Builds the classifier for clock times, e.g. "2:30 pm", "02:30", "2:00",
        "10:30:05 pm", "2.30 pm" and "2 pm est".

        Args:
            cardinal: cardinal tagger, its graph verbalizes hours, minutes and seconds
            deterministic: forwarded to the base class
        """
        super().__init__(name="time",
                         kind="classify",
                         deterministic=deterministic)

        def sixty_part(opening):
            # minutes and seconds share one shape: "0" + single digit is read as "o <digit>",
            # two digit values (10-59) are read as a number
            return (pynutil.insert(opening) +
                    (pynini.cross("0", "o") + insert_space + single_digit_value
                     | double_digit_value) + pynutil.insert("\""))

        # suffixes from the data file plus the variants returned by augment_labels_with_punct_at_end
        suffixes = load_labels(get_abs_path("data/time/suffix.tsv"))
        suffixes.extend(augment_labels_with_punct_at_end(suffixes))
        suffix_fst = pynini.string_map(suffixes)

        zone_fst = pynini.string_file(get_abs_path("data/time/zone.tsv"))

        # only values below sixty are ever passed to the cardinal graph here
        number_fst = cardinal.graph

        hour_labels = [str(h) for h in range(24)]
        single_digit_labels = [str(m) for m in range(1, 10)]
        double_digit_labels = [str(m) for m in range(10, 60)]

        # two digits are kept as they are, a single digit may come with a leading zero that is removed
        two_digits = NEMO_DIGIT + NEMO_DIGIT
        one_digit = pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT
        hour_value = (two_digits
                      | one_digit) @ pynini.union(*hour_labels) @ number_fst

        single_digit_value = pynini.union(*single_digit_labels) @ number_fst
        double_digit_value = pynini.union(*double_digit_labels) @ number_fst

        hours = pynutil.insert("hours: \"") + hour_value + pynutil.insert("\"")
        minutes = sixty_part("minutes: \"")
        seconds = sixty_part("seconds: \"")

        suffix = pynutil.insert("suffix: \"") + convert_space(
            suffix_fst) + pynutil.insert("\"")
        spaced_suffix = delete_space + insert_space + suffix
        optional_suffix = pynini.closure(spaced_suffix, 0, 1)
        optional_zone = pynini.closure(
            delete_space + insert_space + pynutil.insert("zone: \"") +
            convert_space(zone_fst) + pynutil.insert("\""),
            0,
            1,
        )

        # 2:30 pm, 02:30, 2:00
        graph_hm = (hours + pynutil.delete(":") +
                    (pynutil.delete("00") | insert_space + minutes) +
                    optional_suffix + optional_zone)

        # 10:30:05 pm,
        graph_hms = (hours + pynutil.delete(":") +
                     (pynini.cross("00", " minutes: \"zero\"")
                      | insert_space + minutes) + pynutil.delete(":") +
                     (pynini.cross("00", " seconds: \"zero\"")
                      | insert_space + seconds) + optional_suffix +
                     optional_zone)

        # 2.xx pm/am, the suffix is mandatory here
        graph_hm2 = (hours + pynutil.delete(".") +
                     (pynutil.delete("00") | insert_space + minutes) +
                     spaced_suffix + optional_zone)

        # 2 pm est
        graph_h = hours + spaced_suffix + optional_zone

        all_times = (graph_hm | graph_h | graph_hm2 | graph_hms).optimize()
        self.fst = self.add_tokens(all_times).optimize()
Exemple #20
0
    def __init__(self, deterministic: bool = True, lm: bool = False):
        """
        Builds the classifier for roman numerals read from data/roman/roman_to_spoken.tsv.

        Covered inputs:
            a name from get_names() followed by one of the first 19 numerals -> key_the_ordinal
            a key word from data/roman/key_word.tsv followed by any numeral -> key_cardinal
            a standalone numeral of at least two letters (first 50 entries) -> default_cardinal
            a standalone numeral (first 19 entries) followed by "th" -> default_ordinal

        Args:
            deterministic: forwarded to the base class
            lm: with deterministic=False and lm=True the standalone default_cardinal reading is left out
        """
        super().__init__(name="roman",
                         kind="classify",
                         deterministic=deterministic)

        roman_dict = load_labels(
            get_abs_path("data/roman/roman_to_spoken.tsv"))
        default_graph = pynini.string_map(roman_dict).optimize()
        default_graph = pynutil.insert(
            "integer: \"") + default_graph + pynutil.insert("\"")
        # acceptor for the first 19 numerals of the table
        graph_teens = pynini.string_map([x[0]
                                         for x in roman_dict[:19]]).optimize()

        # numerals with a preceding name are tagged for the ordinal form
        names = get_names()
        graph = (pynutil.insert("key_the_ordinal: \"") + names +
                 pynutil.insert("\"") + pynini.accep(" ") +
                 graph_teens @ default_graph).optimize()

        # numerals with preceding key words are tagged for the cardinal form
        key_words = pynini.string_map(
            load_labels(get_abs_path("data/roman/key_word.tsv"))).optimize()
        graph |= (pynutil.insert("key_cardinal: \"") + key_words +
                  pynutil.insert("\"") + pynini.accep(" ") +
                  default_graph).optimize()

        # Standalone cardinal reading: at least two letters, limited to the first 50 table entries.
        # The deterministic and the non-deterministic non-lm setups build exactly the same graph,
        # so they share one branch. With deterministic=False and lm=True the reading is skipped;
        # the name used to stay unassigned in that case and the union below raised UnboundLocalError.
        roman_to_cardinal = None
        if deterministic or not lm:
            roman_to_cardinal = pynini.compose(
                pynini.closure(NEMO_ALPHA, 2),
                (pynutil.insert("default_cardinal: \"default\" ") +
                 (pynini.string_map([x[0] for x in roman_dict[:50]
                                     ]).optimize()) @ default_graph),
            )

        # numeral followed by the "th" suffix (three letters or more in total) is tagged as ordinal
        roman_to_ordinal = pynini.compose(
            pynini.closure(NEMO_ALPHA, 3),
            (pynutil.insert("default_ordinal: \"default\" ") +
             graph_teens @ default_graph + pynutil.delete("th")),
        )

        if roman_to_cardinal is None:
            graph |= roman_to_ordinal
        else:
            graph |= roman_to_cardinal | roman_to_ordinal

        graph = self.add_tokens(graph)

        self.fst = graph.optimize()
Exemple #21
0
    def __init__(self,
                 cardinal: GraphFst,
                 ordinal: GraphFst,
                 deterministic: bool = True,
                 lm: bool = False):
        """
        Finite state transducer for classifying serial (handles only cases without delimiters,
        values with delimiters are handled by default).
            The serial is a combination of digits, letters and dashes, e.g.:
            c325b -> tokens { cardinal { integer: "c three two five b" } }

        Args:
            cardinal: cardinal tagger; its graph, single_digits_graph and
                graph_hundred_component_at_least_one_none_zero_digit verbalize the digit groups
            ordinal: ordinal tagger; every input accepted by its graph is excluded from the serial options
            deterministic: forwarded to the base class
            lm: with deterministic=False and lm=False extra readings of the digit groups are added
        """
        # the description above used to follow this call, where it was a discarded expression
        # instead of the method's docstring
        super().__init__(name="integer",
                         kind="classify",
                         deterministic=deterministic)
        # six or more digits are read digit by digit, one to five digits as a number
        num_graph = pynini.compose(NEMO_DIGIT**(6, ...),
                                   cardinal.single_digits_graph).optimize()
        num_graph |= pynini.compose(NEMO_DIGIT**(1, 5),
                                    cardinal.graph).optimize()
        # to handle numbers starting with zero
        num_graph |= pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT),
            cardinal.single_digits_graph).optimize()
        # TODO: "#" doesn't work from the file
        symbols_graph = pynini.string_file(
            get_abs_path("data/whitelist/symbol.tsv")).optimize(
            ) | pynini.cross("#", "hash")
        num_graph |= symbols_graph

        if not self.deterministic and not lm:
            num_graph |= cardinal.single_digits_graph
            # also allow double digits to be pronounced as integer in serial number
            num_graph |= pynutil.add_weight(
                NEMO_DIGIT**2 @ cardinal.
                graph_hundred_component_at_least_one_none_zero_digit,
                weight=0.0001)

        # add space between letter and digit/symbol
        symbols = [
            x[0]
            for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))
        ]
        symbols = pynini.union(*symbols)
        digit_symbol = NEMO_DIGIT | symbols

        # first rewrite: space after a letter/symbol that precedes a digit/symbol,
        # second rewrite: space after a digit/symbol that precedes a letter/symbol
        graph_with_space = pynini.compose(
            pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA | symbols,
                             digit_symbol, NEMO_SIGMA),
            pynini.cdrewrite(pynutil.insert(" "), digit_symbol,
                             NEMO_ALPHA | symbols, NEMO_SIGMA),
        )

        # serial graph with delimiter
        delimiter = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
        alphas = pynini.closure(NEMO_ALPHA, 1)
        letter_num = alphas + delimiter + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
        next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
        next_alpha_or_num |= pynini.closure(
            delimiter + num_graph +
            plurals._priority_union(pynini.accep(" "), pynutil.insert(" "),
                                    NEMO_SIGMA).optimize() + alphas)

        serial_graph = letter_num + next_alpha_or_num
        serial_graph |= num_letter + next_alpha_or_num
        # numbers only with 2+ delimiters
        serial_graph |= (num_graph + delimiter + num_graph + delimiter +
                         num_graph + pynini.closure(delimiter + num_graph))
        # 2+ symbols
        serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA,
                                       num_graph + delimiter + num_graph)

        # exclude ordinal numbers from serial options
        serial_graph = pynini.compose(
            pynini.difference(NEMO_SIGMA,
                              pynini.project(ordinal.graph, "input")),
            serial_graph).optimize()

        serial_graph = pynutil.add_weight(serial_graph, 0.0001)
        # anything without spaces followed by "^2" / "^3" is read as squared / cubed
        serial_graph |= (pynini.closure(NEMO_NOT_SPACE, 1) +
                         (pynini.cross("^2", " squared")
                          | pynini.cross("^3", " cubed")).optimize())

        # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
        serial_graph = (
            pynini.closure((serial_graph | num_graph | alphas) + delimiter) +
            serial_graph + pynini.closure(delimiter +
                                          (serial_graph | num_graph | alphas)))

        # also accept input without delimiters by inserting the spaces first
        serial_graph |= pynini.compose(graph_with_space,
                                       serial_graph.optimize()).optimize()
        # the input has to start with at least two non-space characters
        serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2),
                                      serial_graph).optimize()

        self.graph = serial_graph.optimize()
        graph = pynutil.insert("name: \"") + convert_space(
            self.graph).optimize() + pynutil.insert("\"")
        self.fst = graph.optimize()
Exemple #22
0
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        """
        Builds the date classifier for month-day-year, day-month-year and year-month-day
        inputs as well as a standalone year.

        Args:
            cardinal: cardinal tagger, its graph_hundred_component_at_least_one_none_zero_digit verbalizes the day
            deterministic: forwarded to the base class and to _get_year_graph
        """
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        def quoted(opening, body):
            # puts body between the given `field: "` opening and the closing quote
            return pynutil.insert(opening) + body + pynutil.insert("\"")

        names_path = get_abs_path("data/months/names.tsv")
        abbr_path = get_abs_path("data/months/abbr.tsv")
        optional_period = pynini.closure(pynutil.delete("."), 0, 1)
        # TO_LOWER on the first character, the rest of the word is left untouched
        recase_first = TO_LOWER + pynini.closure(NEMO_CHAR)

        # month names as listed in the file plus the variant with a converted first character
        listed_names = pynini.string_file(names_path).optimize()
        month_graph = listed_names | (recase_first @ listed_names)

        # abbreviations get the same treatment and may end with a period that is removed
        listed_abbr = pynini.string_file(abbr_path).optimize()
        month_graph |= (listed_abbr | recase_first @ listed_abbr) + optional_period

        # to support all caps names
        upper_names = [[label[0].upper()] for label in load_labels(names_path)]
        upper_abbr = [(short.upper(), full) for short, full in load_labels(abbr_path)]
        month_graph |= pynini.string_map(upper_names) | (pynini.string_map(upper_abbr) + optional_period)

        month_numbers_graph = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()
        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit
        plain_year = _get_year_graph(deterministic)

        # a year on its own carries a small extra weight
        standalone_year_weight = 0.001
        year_graph_standalone = quoted("year: \"", pynutil.add_weight(plain_year, standalone_year_weight))

        month_graph = quoted("month: \"", month_graph)
        month_numbers_graph = quoted("month: \"", month_numbers_graph)

        # one digit, or two digits starting with 1, 2 or 3
        day_digits = (pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT
        day_graph = quoted("day: \"", day_digits @ cardinal_graph)
        year_graph = quoted("year: \"", plain_year)

        optional_day = pynini.closure(delete_extra_space + day_graph, 0, 1)
        optional_year = pynini.closure(delete_extra_space + year_graph, 0, 1)
        optional_comma = pynini.closure(pynutil.delete(","), 0, 1)

        # "-", "/" or "." between numeric fields is dropped and a space is inserted instead
        next_field = pynutil.delete(pynini.union("-", "/", ".")) + insert_space
        # numeric day with an optional leading zero removed
        day_without_zero = pynini.closure(pynutil.delete("0"), 0, 1) + day_graph

        graph_mdy = month_graph + optional_day + delete_space + optional_comma + optional_year
        graph_mdy |= month_numbers_graph + next_field + day_without_zero + next_field + year_graph

        graph_dmy = day_graph + delete_extra_space + month_graph + optional_year
        graph_ymd = year_graph + next_field + month_numbers_graph + next_field + day_without_zero

        # only the mdy and dmy readings are marked with preserve_order
        final_graph = (graph_mdy | graph_dmy) + pynutil.insert(" preserve_order: true")
        final_graph |= graph_ymd | year_graph_standalone
        self.fst = self.add_tokens(final_graph).optimize()
Exemple #23
0
    def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False):
        """
        Builds the date classifier for month-day-year, day-month-year and year-month-day
        inputs as well as a standalone year.

        Args:
            cardinal: cardinal tagger; its graph_hundred_component_at_least_one_none_zero_digit and
                single_digits_graph are used for the day and the year
            deterministic: forwarded to the base class and to _get_year_graph; when False the
                additional field-order variants at the end of this method are added
            lm: when True the same additional variants are added even if deterministic is True
        """
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        # january
        month_graph = pynini.string_file(get_abs_path("data/date/month_name.tsv")).optimize()
        # January, JANUARY
        month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
            TO_LOWER ** (2, ...), month_graph
        )

        # jan
        month_abbr_graph = pynini.string_file(get_abs_path("data/date/month_abbr.tsv")).optimize()
        # jan, Jan, JAN
        month_abbr_graph = (
            month_abbr_graph
            | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
            | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
        ) + pynini.closure(pynutil.delete("."), 0, 1)
        month_graph |= month_abbr_graph.optimize()

        month_numbers_labels = pynini.string_file(get_abs_path("data/date/month_number.tsv")).optimize()
        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        year_graph = _get_year_graph(cardinal_graph=cardinal_graph, deterministic=deterministic)

        # three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph
        # year_graph |= three_digit_year

        month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
        month_numbers_graph = pynutil.insert("month: \"") + month_numbers_labels + pynutil.insert("\"")

        # ordinal endings after the day number, lower and upper case; they are deleted below
        endings = ["rd", "th", "st", "nd"]
        endings += [x.upper() for x in endings]
        endings = pynini.union(*endings)

        # day: optional leading "the ", then 1-9, 10-29, 30 or 31 with an optional ordinal ending
        day_graph = (
            pynutil.insert("day: \"")
            + pynini.closure(pynutil.delete("the "), 0, 1)
            + (
                ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
                + pynini.closure(pynutil.delete(endings), 0, 1)
            )
            @ cardinal_graph
            + pynutil.insert("\"")
        )

        two_digit_year = _get_two_digit_year(
            cardinal_graph=cardinal_graph, single_digits_graph=cardinal.single_digits_graph
        )
        two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")

        # if lm:
        #     two_digit_year = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (3), two_digit_year)
        #     year_graph = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (2), year_graph)
        #     year_graph |= pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (4, ...), year_graph)

        # year that follows a space (removed) or a comma with an optional space (both kept)
        graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
        graph_year |= (
            pynutil.insert(" year: \"")
            + pynini.accep(",")
            + pynini.closure(pynini.accep(" "), 0, 1)
            + year_graph
            + pynutil.insert("\"")
        )
        optional_graph_year = pynini.closure(graph_year, 0, 1)

        # from here on year_graph is the tagged form; graph_year above was built from the untagged one
        year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

        # month name followed by a day, a year, or a day and a year
        graph_mdy = month_graph + (
            (delete_extra_space + day_graph)
            | (pynini.accep(" ") + day_graph)
            | graph_year
            | (delete_extra_space + day_graph + graph_year)
        )

        # month name and day joined by "-", optionally followed by "-" and a year
        graph_mdy |= (
            month_graph
            + pynini.cross("-", " ")
            + day_graph
            + pynini.closure(((pynini.cross("-", " ") + NEMO_SIGMA) @ graph_year), 0, 1)
        )

        # numeric month-day-year; one pass per separator, so both separators of a date have to be the same
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_mdy |= (
                month_numbers_graph
                + delete_sep
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
                + delete_sep
                + insert_space
                + (year_graph | two_digit_year)
            )

        graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
        # two digit strings that are not accepted as a month number, read as a day
        day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_dmy |= (
                day_ex_month
                + delete_sep
                + insert_space
                + month_numbers_graph
                + delete_sep
                + insert_space
                + (year_graph | two_digit_year)
            )

        # NOTE(review): the seed is the empty-string acceptor, so graph_ymd also accepts empty input - confirm intended
        graph_ymd = pynini.accep("")
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_ymd |= (
                (year_graph | two_digit_year)
                + delete_sep
                + insert_space
                + month_numbers_graph
                + delete_sep
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
            )

        final_graph = graph_mdy | graph_dmy

        if not deterministic or lm:
            # preserve_order becomes optional and a numeric month-day without a year is accepted
            final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)
            m_sep_d = (
                month_numbers_graph
                + pynutil.delete(pynini.union("-", "/"))
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
            )
            final_graph |= m_sep_d
        else:
            final_graph += pynutil.insert(" preserve_order: true")

        final_graph |= graph_ymd | year_graph

        if not deterministic or lm:
            # additional outputs with the tagged fields reordered, built for every (month, day) label pair;
            # m_sep_d used below is defined in the branch above, which runs under the same condition
            ymd_to_mdy_graph = None
            ymd_to_dmy_graph = None
            mdy_to_dmy_graph = None
            md_to_dm_graph = None

            for month in [x[0] for x in load_labels(get_abs_path("data/date/month_name.tsv"))]:
                for day in [x[0] for x in load_labels(get_abs_path("data/date/day.tsv"))]:
                    # moves the trailing month and day fields in front of the year field
                    ymd_to_mdy_curr = (
                        pynutil.insert("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> MM-DD-YY
                    ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                    ymd_to_mdy_graph = (
                        ymd_to_mdy_curr
                        if ymd_to_mdy_graph is None
                        else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                    )

                    # same as above, but the inserted fields are in day, month order
                    ymd_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> DD-MM-YY
                    ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                    ymd_to_dmy_graph = (
                        ymd_to_dmy_curr
                        if ymd_to_dmy_graph is None
                        else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                    )

                    # swaps the leading month and day fields, the year part is kept as it is
                    mdy_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynutil.delete("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                    ).optimize()
                    # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                    mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                    mdy_to_dmy_graph = (
                        mdy_to_dmy_curr
                        if mdy_to_dmy_graph is None
                        else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                    ).optimize()

                    # MM-DD without a year -> DD-MM
                    md_to_dm_curr = pynutil.insert("day: \"" + day + "\" month: \"" + month + "\"") + pynutil.delete(
                        "month: \"" + month + "\" day: \"" + day + "\""
                    )
                    md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()

                    md_to_dm_graph = (
                        md_to_dm_curr
                        if md_to_dm_graph is None
                        else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                    ).optimize()

            final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemple #24
0
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        month_graph = pynini.string_file(get_abs_path("data/months/names.tsv")).optimize()
        month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
            TO_LOWER ** (2, ...), month_graph
        )

        month_abbr_graph = pynini.string_file(get_abs_path("data/months/abbr.tsv")).optimize()
        month_abbr_graph = (
            month_abbr_graph
            | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
            | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
        ) + pynini.closure(pynutil.delete("."), 0, 1)
        month_graph |= month_abbr_graph.optimize()

        month_numbers_graph = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()
        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        year_graph = _get_year_graph(deterministic)

        YEAR_WEIGHT = 0.001
        year_graph_standalone = (
            pynutil.insert("year: \"") + pynutil.add_weight(year_graph, YEAR_WEIGHT) + pynutil.insert("\"")
        )

        month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
        month_numbers_graph = pynutil.insert("month: \"") + month_numbers_graph + pynutil.insert("\"")

        endings = ["rd", "th", "st", "nd"]
        endings += [x.upper() for x in endings]
        endings = pynini.union(*endings)

        day_graph = (
            pynutil.insert("day: \"")
            + pynini.closure(pynutil.delete("the "), 0, 1)
            + (
                ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
                + pynini.closure(pynutil.delete(endings), 0, 1)
            )
            @ cardinal_graph
            + pynutil.insert("\"")
        )

        two_digit_year = NEMO_DIGIT ** (2) @ (cardinal.single_digits_graph | cardinal_graph)
        two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")
        graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
        optional_graph_year = pynini.closure(graph_year, 0, 1)
        year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

        graph_mdy = month_graph + (
            (delete_extra_space + day_graph)
            | (pynini.accep(" ") + day_graph)
            | graph_year
            | (delete_extra_space + day_graph + graph_year)
        )

        delete_sep = pynutil.delete(pynini.union("-", "/", "."))
        graph_mdy |= (
            month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
            + delete_sep
            + insert_space
            + (year_graph | two_digit_year)
        )

        graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
        graph_ymd = (
            (year_graph | two_digit_year)
            + delete_sep
            + insert_space
            + month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
        )

        final_graph = graph_mdy | graph_dmy

        if deterministic:
            final_graph += pynutil.insert(" preserve_order: true")
        else:
            final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)
            m_sep_d = (
                month_numbers_graph + delete_sep + insert_space + pynini.closure(pynutil.delete("0"), 0, 1) + day_graph
            )
            final_graph |= m_sep_d

        final_graph |= graph_ymd | year_graph_standalone

        if not deterministic:
            ymd_to_mdy_graph = None
            ymd_to_dmy_graph = None
            mdy_to_dmy_graph = None
            md_to_dm_graph = None

            for month in [x[0] for x in load_labels(get_abs_path("data/months/names.tsv"))]:
                for day in [x[0] for x in load_labels(get_abs_path("data/months/days.tsv"))]:
                    ymd_to_mdy_curr = (
                        pynutil.insert("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> MM-DD-YY
                    ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                    ymd_to_mdy_graph = (
                        ymd_to_mdy_curr
                        if ymd_to_mdy_graph is None
                        else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                    )

                    ymd_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> DD-MM-YY
                    ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                    ymd_to_dmy_graph = (
                        ymd_to_dmy_curr
                        if ymd_to_dmy_graph is None
                        else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                    )

                    mdy_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynutil.delete("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                    ).optimize()
                    # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                    mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                    mdy_to_dmy_graph = (
                        mdy_to_dmy_curr
                        if mdy_to_dmy_graph is None
                        else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                    ).optimize()

                    md_to_dm_curr = pynutil.insert("day: \"" + day + "\" month: \"" + month + "\"") + pynutil.delete(
                        "month: \"" + month + "\" day: \"" + day + "\""
                    )
                    md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()

                    md_to_dm_graph = (
                        md_to_dm_curr
                        if md_to_dm_graph is None
                        else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                    ).optimize()

            final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()