Beispiel #1
0
    def __init__(self, deterministic: bool = True):
        """
        Build the telephone classifier and store the optimized result in ``self.fst``.

        The graph accepts three families of input and tags them with the fields
        ``country_code``, ``number_part`` and (phone numbers only) ``extension``:
            * phone numbers: optional country code, area code, a seven-symbol
              number and an optional extension,
            * dotted addresses made of four groups of one to three digits ("# ip"),
            * numbers shaped ###-##-#### ("# ssn").

        Args:
            deterministic: when False, "0" may additionally be read as "o" or "oh".
        """
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        add_separator = pynutil.insert(", ")  # between components
        zero = pynini.cross("0", "zero")
        if not deterministic:
            zero |= pynini.cross("0", pynini.union("o", "oh"))
        # Single digit -> word; digit.tsv is inverted so that the digit is on the input side.
        digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize() | zero

        telephone_prompts = pynini.string_file(get_abs_path("data/telephone/telephone_prompt.tsv"))
        # Country code: optional prompt, optional "+" read as "plus ", then one to
        # three digits read individually, followed by an inserted ",".
        country_code = (
            pynini.closure(telephone_prompts + delete_extra_space, 0, 1)
            + pynini.closure(pynini.cross("+", "plus "), 0, 1)
            + pynini.closure(digit + insert_space, 0, 2)
            + digit
            + pynutil.insert(",")
        )
        # A prompt on its own (no digits) is also accepted as the country_code field.
        country_code |= telephone_prompts
        country_code = pynutil.insert("country_code: \"") + country_code + pynutil.insert("\"")
        # Drop an optional "-" and surrounding space after the country code, emit one space.
        country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space

        # Area code: exactly three digits read one by one ...
        area_part_default = pynini.closure(digit + insert_space, 2, 2) + digit
        # ... except "800", which is read as "eight hundred" (it is excluded from the default path).
        area_part = pynini.cross("800", "eight hundred") | pynini.compose(
            pynini.difference(NEMO_SIGMA, "800"), area_part_default
        )

        # The area code is either followed by "-" / "." or wrapped in parentheses
        # (closing ")" optionally followed by one space, or ")-"); delimiters are deleted.
        area_part = (
            (area_part + (pynutil.delete("-") | pynutil.delete(".")))
            | (
                pynutil.delete("(")
                + area_part
                + ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-"))
            )
        ) + add_separator

        del_separator = pynini.closure(pynini.union("-", " ", "."), 0, 1)
        # Length filter: exactly seven NEMO_DIGIT / NEMO_ALPHA symbols, each optionally
        # followed by one separator. This is an acceptor, it does not rewrite anything.
        number_length = ((NEMO_DIGIT + del_separator) | (NEMO_ALPHA + del_separator)) ** 7
        # Variant with "-" as separator: digits are verbalized, letters are kept as is.
        number_words = pynini.closure(
            (NEMO_DIGIT @ digit) + (insert_space | (pynini.cross("-", ', ')))
            | NEMO_ALPHA
            | (NEMO_ALPHA + pynini.cross("-", ' '))
        )
        # Same, with "." as separator.
        number_words |= pynini.closure(
            (NEMO_DIGIT @ digit) + (insert_space | (pynini.cross(".", ', ')))
            | NEMO_ALPHA
            | (NEMO_ALPHA + pynini.cross(".", ' '))
        )
        # Restrict the verbalization to inputs that pass the seven-symbol filter.
        number_words = pynini.compose(number_length, number_words)
        number_part = area_part + number_words
        number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"")
        # Extension: one to four digits read individually.
        extension = (
            pynutil.insert("extension: \"") + pynini.closure(digit + insert_space, 0, 3) + digit + pynutil.insert("\"")
        )
        extension = pynini.closure(insert_space + extension, 0, 1)

        # Combine the phone-number variants.
        # NOTE(review): plurals._priority_union presumably prefers its first argument
        # where both match - confirm against the helper's definition.
        graph = plurals._priority_union(country_code + number_part, number_part, NEMO_SIGMA).optimize()
        graph = plurals._priority_union(country_code + number_part + extension, graph, NEMO_SIGMA).optimize()
        graph = plurals._priority_union(number_part + extension, graph, NEMO_SIGMA).optimize()

        # ip
        ip_prompts = pynini.string_file(get_abs_path("data/telephone/ip_prompt.tsv"))
        # One to three digits, space-separated in the output.
        digit_to_str_graph = digit + pynini.closure(pynutil.insert(" ") + digit, 0, 2)
        # Four such groups; each "." is read as " dot ".
        ip_graph = digit_to_str_graph + (pynini.cross(".", " dot ") + digit_to_str_graph) ** 3
        # An optional prompt is emitted in the country_code field, the address in number_part.
        graph |= (
            pynini.closure(
                pynutil.insert("country_code: \"") + ip_prompts + pynutil.insert("\"") + delete_extra_space, 0, 1
            )
            + pynutil.insert("number_part: \"")
            + ip_graph.optimize()
            + pynutil.insert("\"")
        )
        # ssn
        ssn_prompts = pynini.string_file(get_abs_path("data/telephone/ssn_prompt.tsv"))
        three_digit_part = digit + (pynutil.insert(" ") + digit) ** 2
        two_digit_part = digit + pynutil.insert(" ") + digit
        four_digit_part = digit + (pynutil.insert(" ") + digit) ** 3
        ssn_separator = pynini.cross("-", ", ")
        # ###-##-#### with each "-" rewritten to ", ".
        ssn_graph = three_digit_part + ssn_separator + two_digit_part + ssn_separator + four_digit_part

        # Same field layout as for the ip case above: optional prompt, then number_part.
        graph |= (
            pynini.closure(
                pynutil.insert("country_code: \"") + ssn_prompts + pynutil.insert("\"") + delete_extra_space, 0, 1
            )
            + pynutil.insert("number_part: \"")
            + ssn_graph.optimize()
            + pynutil.insert("\"")
        )

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
Beispiel #2
0
    def get_address_graph(self, cardinal):
        """
        Finite state transducer for classifying a street address, e.g.:
            2788 San Tomas Expy, Santa Clara, CA 95051 ->
                units: "address" cardinal
                { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
                 preserve_order: true

        The accepted shape is: house number, optional one-letter direction,
        street words ending in a known address word, and then either an
        optional ", city, state zip" tail or an optional trailing period.

        Args:
            cardinal: cardinal tagger; its ``graph``, ``single_digits_graph`` and
                ``graph_hundred_component_at_least_one_none_zero_digit`` are used.

        Returns:
            the address transducer described above
        """
        # Ordinal street names ("5th"): tag, then verbalize in one composed graph.
        ordinal_verbalizer = OrdinalVerbalizer().graph
        ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
        tagged_ordinal = pynutil.insert("integer: \"") + ordinal_tagger + pynutil.insert("\"")
        ordinal_num = pynini.compose(tagged_ordinal, ordinal_verbalizer)

        # House number: three or four digits are split into a leading group of
        # one or two digits and a trailing group of two digits (a leading "0"
        # in the trailing group is read as "zero ").
        hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit
        optional_zero = pynini.closure(pynini.cross("0", "zero "), 0, 1)
        house_number = NEMO_DIGIT ** (1, 2) @ hundred_component
        house_number += insert_space + NEMO_DIGIT ** 2 @ (optional_zero + hundred_component)
        house_number = pynini.compose(NEMO_DIGIT ** (3, 4), house_number)
        # to handle the rest of the numbers
        house_number = plurals._priority_union(house_number, cardinal.graph, NEMO_SIGMA)

        # Optional compass direction, optionally followed by a period that is dropped.
        compass = (
            pynini.cross("E", "East")
            | pynini.cross("S", "South")
            | pynini.cross("W", "West")
            | pynini.cross("N", "North")
        )
        optional_period = pynini.closure(pynutil.delete("."), 0, 1)
        direction = pynini.closure(pynini.accep(NEMO_SPACE) + (compass + optional_period), 0, 1)

        # Street: a first word (optional ordinal or a capitalized word), any
        # number of further capitalized words, and a final known address word.
        street_suffix = get_formats(get_abs_path("data/address/address_word.tsv"))
        first_word = pynini.closure(ordinal_num, 0, 1) | NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1)
        further_words = pynini.closure(NEMO_UPPER + pynini.closure(NEMO_ALPHA) + NEMO_SPACE)
        street = pynini.accep(NEMO_SPACE) + first_word + NEMO_SPACE + further_words + street_suffix

        comma_space = pynini.accep(",") + pynini.accep(NEMO_SPACE)

        # Optional ", <city>" where the city is letters and spaces.
        city_name = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
        city = pynini.closure(comma_space + city_name, 0, 1)

        # Optional ", <state>"; every label is also accepted with a period
        # after its first character.
        states = load_labels(get_abs_path("data/address/state.tsv"))
        states.extend([(x, f"{y[0]}.{y[1:]}") for x, y in states])
        state = pynini.invert(pynini.string_map(states))
        state = pynini.closure(comma_space + state, 0, 1)

        # Optional zip code: exactly five digits read one by one, preceded by
        # an optional comma and a space.
        five_digits = pynini.compose(NEMO_DIGIT ** 5, cardinal.single_digits_graph)
        zip_code = pynini.closure(
            pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + five_digits,
            0,
            1,
        )

        street_line = house_number + direction + street
        address = street_line + pynini.closure(city + state + zip_code, 0, 1)
        # Alternatively the street line may just end with a period, which is removed.
        address |= street_line + pynini.closure(pynini.cross(".", ""), 0, 1)

        return address
Beispiel #3
0
# Tests, for your convenience
# If you have completed the above FSTs, the following asserts should not fail
# Feel free to comment them out while developing the program
# Sanity checks for the number -> words transducer built earlier in this script.
assert (sorted_outputs("1" * numbers_to_words) == ["one"])
assert (sorted_outputs("0" * numbers_to_words) == ["zero"])
assert (sorted_outputs("10" * numbers_to_words) == ["ten"])
assert (sorted_outputs("11" * numbers_to_words) == ["eleven"])
assert (sorted_outputs("21" * numbers_to_words) == ["twenty one"])
assert (sorted_outputs("121" * numbers_to_words) == [
    "hundred twenty one", "one hundred twenty one"
])
assert (sorted_outputs("12.23" *
                       numbers_to_words) == ["twelve point two three"])

# Inverse transducer used by the '-r' mode of the interactive loop.
# The original first bound invert_ultimate to pn.invert(ultimate) alone and
# overwrote it on the next statement; that dead assignment is removed.
invert_ultimate = pn.invert(ultimate) * pn.invert(f)

# Now, the interactive program
while True:
    try:
        number = raw_input(
            "Please enter a number or '-r' for inverted behaviour (Ctrl-C to exit): "
        )
        if number.startswith("-r"):
            number = raw_input("Please write out a number (Ctrl-C to exit): ")
            print("Result in numbers")
            print(sorted_outputs(number * invert_ultimate))
        else:
            print("Result in factorized form")
Beispiel #4
0
from nemo_text_processing.text_normalization.graph_utils import (
    NEMO_CHAR,
    NEMO_DIGIT,
    NEMO_SIGMA,
    TO_LOWER,
    GraphFst,
    delete_extra_space,
    delete_space,
    insert_space,
)

# pynini is an optional dependency: build the shared number graphs when it can
# be imported, otherwise fall back to placeholders and record that it is missing.
try:
    import pynini
    from pynini.lib import pynutil

    graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/numbers/teen.tsv"))).optimize()
    graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/numbers/ties.tsv"))).optimize()

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Add placeholders for global variables
    graph_teen = None
    graph_digit = None
    ties_graph = None

    # The import failed, so the flag must be False; it was True here before,
    # which made the placeholder graphs look usable.
    PYNINI_AVAILABLE = False


def get_ties_graph(deterministic: bool = True):
    """
Beispiel #5
0
    def __init__(self, deterministic: bool = True):
        """
        Build the cardinal classifier and store the optimized result in ``self.fst``.

        Also exposes ``self.graph``, ``self.single_digits_graph`` and
        ``self.graph_hundred_component_at_least_one_none_zero_digit``; in
        non-deterministic mode ``self.range_graph`` is set as well.

        Args:
            deterministic: when False, additional readings are added (digit by
                digit, "oh"/"o" for zero, ranges, leading zeros removed).
        """
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        def _space_separated(unit):
            # One or more repetitions of `unit`, joined by an inserted space.
            return unit + pynini.closure(pynutil.insert(" ") + unit)

        number_names = pynini.Far(get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

        # Two or three digits, or a single non-zero digit.
        non_zero_digit = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3) | non_zero_digit
        ) @ number_names

        # One to three digits followed by groups of three, each optionally
        # preceded by a thousands comma that is removed.
        thousands_group = pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
        self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(thousands_group)) @ number_names

        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

        single_digits_graph = pynini.invert(graph_digit | graph_zero)
        self.single_digits_graph = _space_separated(single_digits_graph)

        if not deterministic:
            # Allow "oh" and "o" as readings of "0".
            single_digits_graph = (
                pynini.invert(graph_digit | graph_zero)
                | pynini.cross("0", "oh")
                | pynini.cross("0", "o")
            )
            self.single_digits_graph = _space_separated(single_digits_graph)

            # Digit-by-digit reading of comma-grouped numbers.
            leading_group = pynini.closure(self.single_digits_graph + pynutil.insert(" "), 1, 3)
            comma_group = (
                pynutil.delete(",")
                + single_digits_graph
                + pynutil.insert(" ")
                + single_digits_graph
                + pynutil.insert(" ")
                + single_digits_graph
            )
            single_digits_graph_with_commas = leading_group + pynini.closure(comma_group, 1)

            self.graph = (
                self.graph
                | self.single_digits_graph
                | get_hundreds_graph()
                | pynutil.add_weight(single_digits_graph_with_commas, 0.001)
            )

            # "<n>-<m>" -> "(from) n to m" / "n m"; "<n>x<m>" -> "n by m".
            dash = pynini.cross("-", " to ") | pynini.cross("-", " ")
            times = pynini.cross("x", " by ") | pynini.cross(" x ", " by ")
            from_to = pynini.closure(pynutil.insert("from "), 0, 1) + self.graph + dash + self.graph
            by = self.graph + times + self.graph
            self.range_graph = (from_to | by).optimize()

        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
        )

        # Five or more digits may be read digit by digit (slightly preferred).
        long_numbers = pynini.compose(NEMO_DIGIT ** (5, ...), self.single_digits_graph).optimize()
        final_graph = self.graph | self.get_serial_graph() | pynutil.add_weight(long_numbers, -0.001)

        if not deterministic:
            final_graph |= self.range_graph
            # Strip leading zeros and read the remainder as a regular number.
            remove_leading_zeros = pynini.closure(pynutil.delete("0"), 1) + pynini.compose(
                pynini.closure(NEMO_DIGIT, 1), self.graph
            )
            final_graph |= remove_leading_zeros

        final_graph = optional_minus_graph + pynutil.insert("integer: \"") + final_graph + pynutil.insert("\"")
        self.fst = self.add_tokens(final_graph).optimize()
Beispiel #6
0
def _get_month_graph():
    """Load ``data/months.tsv`` and return its inverted, optimized transducer."""
    return pynini.invert(pynini.string_file(get_abs_path("data/months.tsv"))).optimize()
Beispiel #7
0
    def __init__(self):
        """
        Build the time classifier (spoken form -> tagged fields) and store the
        optimized result in ``self.fst``.

        Emitted fields are ``hours``, ``minutes`` and optionally ``suffix`` and
        ``zone``. Supported shapes (see the inline examples below): hour followed
        by minutes or "o'clock", minutes "past" hour, "quarter to/till" hour, and
        a bare hour followed by a mandatory suffix.
        """
        super().__init__(name="time", kind="classify")
        # hours, minutes, seconds, suffix, zone, style, speak_period

        suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
        time_zone_graph = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
        time_to_graph = pynini.string_file(get_abs_path("data/time/time_to.tsv"))

        # only used for < 1000 thousand -> 0 weight
        # NOTE(review): graph_no_exception presumably maps number words to digits - confirm in CardinalFst.
        cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

        # Word forms of the allowed values: hours 0-23, minutes 1-9 and 10-59.
        labels_hour = [num_to_word(x) for x in range(0, 24)]
        labels_minute_single = [num_to_word(x) for x in range(1, 10)]
        labels_minute_double = [num_to_word(x) for x in range(10, 60)]

        # Restrict the cardinal graph to the allowed word forms.
        graph_hour = pynini.union(*labels_hour) @ cardinal

        graph_minute_single = pynini.union(*labels_minute_single) @ cardinal
        graph_minute_double = pynini.union(*labels_minute_double) @ cardinal
        graph_minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15")
        # All spellings of "o'clock" are deleted (mapped to the empty string).
        oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

        final_graph_hour = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"")
        # Minutes after the hour: "o'clock" -> "00", "o <single>" (the "o" is dropped),
        # or a two-digit minute value.
        graph_minute = (
            oclock + pynutil.insert("00")
            | pynutil.delete("o") + delete_space + graph_minute_single
            | graph_minute_double
        )
        final_suffix = pynutil.insert("suffix: \"") + convert_space(suffix_graph) + pynutil.insert("\"")
        final_suffix_optional = pynini.closure(delete_space + insert_space + final_suffix, 0, 1)
        final_time_zone_optional = pynini.closure(
            delete_space
            + insert_space
            + pynutil.insert("zone: \"")
            + convert_space(time_zone_graph)
            + pynutil.insert("\""),
            0,
            1,
        )

        # five o' clock
        # two o eight, two thirty five (am/pm)
        # two pm/am
        graph_hm = (
            final_graph_hour + delete_extra_space + pynutil.insert("minutes: \"") + graph_minute + pynutil.insert("\"")
        )
        # 10 past four, quarter past four, half past four
        graph_mh = (
            pynutil.insert("minutes: \"")
            + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose)
            + pynutil.insert("\"")
            + delete_space
            + pynutil.delete("past")
            + delete_extra_space
            + final_graph_hour
        )

        # "quarter to/till <hour>": minutes are fixed to 45 and the hour is mapped
        # through time_to.tsv.
        graph_quarter_time = (
            pynutil.insert("minutes: \"")
            + pynini.cross("quarter", "45")
            + pynutil.insert("\"")
            + delete_space
            + pynutil.delete(pynini.union("to", "till"))
            + delete_extra_space
            + pynutil.insert("hours: \"")
            + time_to_graph
            + pynutil.insert("\"")
        )

        # Hour with a mandatory suffix; minutes default to "00" when none are spoken.
        graph_h = (
            final_graph_hour
            + delete_extra_space
            + pynutil.insert("minutes: \"")
            + (pynutil.insert("00") | graph_minute)
            + pynutil.insert("\"")
            + delete_space
            + insert_space
            + final_suffix
            + final_time_zone_optional
        )
        # For the other shapes both suffix and time zone are optional.
        final_graph = (graph_hm | graph_mh | graph_quarter_time) + final_suffix_optional + final_time_zone_optional
        final_graph |= graph_h

        final_graph = self.add_tokens(final_graph)

        self.fst = final_graph.optimize()
Beispiel #8
0
## Language models (read from the working directory) and the T9 key table
lm_char = pynini.Fst.read("t9.char.lm")
lm_word = pynini.Fst.read("t9.word.lm")
# Key "0" also produces a space (byte 32).
t9 = pynini.transducer("0", "[32]")
t9_relations = [
    "0", "1", "2abc", "3def", "4ghi", "5jkl", "6mno", "7pqrs", "8tuv", "9wxyz"
]

## Every key maps to itself and to each of its letters.
for i, symbols in enumerate(t9_relations):
    for k in symbols:
        t9 = pynini.union(pynini.transducer(str(i), k), t9)
## Key "1" maps to every punctuation character (given by byte value).
for i in string.punctuation:
    t9 = t9 | pynini.transducer("1", f"[{ord(i)}]")
## Closure and optimization (both applied to t9 itself)
t9.closure().optimize()
## Inversion: characters -> keys, used for encoding
encoder = pynini.invert(t9).optimize()


def encode(message):
    """Lower-case *message* and return the key sequence produced by ``encoder``."""
    lowered = message.lower()
    keys = lowered * encoder
    return keys.stringify()


def decode(message):
    """
    Return the most likely text for the key sequence *message*.

    The keys are expanded through ``t9``, the result is projected onto its
    output side and composed with the character language model; the shortest
    path of that lattice is returned as a string.
    """
    candidates = (message * t9).project(True)
    scored = candidates * lm_char
    best = pynini.shortestpath(scored)
    return best.stringify()
Beispiel #9
0
    # _v = pynini.union("a", "e", "i", "o", "u")
    _c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",
                      "p", "q", "r", "s", "t", "v", "w", "x", "y", "z")
    _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
    _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x",
                                    "z") + pynutil.insert("es")
    _s = NEMO_SIGMA + pynutil.insert("s")

    graph_plural = plurals._priority_union(
        suppletive,
        plurals._priority_union(_ies,
                                plurals._priority_union(_es, _s, NEMO_SIGMA),
                                NEMO_SIGMA), NEMO_SIGMA).optimize()

    SINGULAR_TO_PLURAL = graph_plural
    PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
    TO_LOWER = pynini.union(*[
        pynini.cross(x, y)
        for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)
    ])
    TO_UPPER = pynini.invert(TO_LOWER)

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Create placeholders
    NEMO_CHAR = None

    NEMO_DIGIT = None
    NEMO_LOWER = None
    NEMO_UPPER = None
    NEMO_ALPHA = None
Beispiel #10
0
    def __init__(self, deterministic: bool = True, lm: bool = False):
        """
        Build the cardinal classifier and store the optimized result in ``self.fst``.

        Also exposes ``self.graph``, ``self.graph_with_and``,
        ``self.single_digits_graph`` and
        ``self.graph_hundred_component_at_least_one_none_zero_digit``.

        Args:
            deterministic: when False, alternative readings are added with a
                small weight so that the default reading is listed first.
            lm: stored on the instance as ``self.lm``; not used in this method.
        """
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        def _space_separated(unit):
            # One or more repetitions of `unit`, joined by an inserted space.
            return unit + pynini.closure(insert_space + unit)

        self.lm = lm
        self.deterministic = deterministic
        # TODO replace to have "oh" as a default for "0"
        number_names = pynini.Far(get_abs_path("data/number/cardinal_number_name.far")).get_fst()

        # Two or three digits, or a single non-zero digit.
        non_zero_digit = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3) | non_zero_digit
        ) @ number_names

        graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))

        single_digits_graph = pynini.invert(graph_digit | graph_zero)
        self.single_digits_graph = _space_separated(single_digits_graph)

        if not deterministic:
            # for a single token allow only the same normalization
            # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
            single_digits_graph_zero = pynini.invert(graph_digit | graph_zero)
            single_digits_graph_oh = pynini.invert(graph_digit) | pynini.cross("0", "oh")

            self.single_digits_graph = _space_separated(single_digits_graph_zero)
            self.single_digits_graph |= _space_separated(single_digits_graph_oh)

            # Digit-by-digit reading of comma-grouped numbers.
            leading_group = pynini.closure(self.single_digits_graph + insert_space, 1, 3)
            comma_group = (
                pynutil.delete(",")
                + single_digits_graph
                + insert_space
                + single_digits_graph
                + insert_space
                + single_digits_graph
            )
            single_digits_graph_with_commas = leading_group + pynini.closure(comma_group, 1)

        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
        )

        # One to three digits followed by groups of three that are either all
        # comma-separated (commas removed) or all written without commas.
        grouped_tail = pynini.closure(pynutil.delete(",") + NEMO_DIGIT ** 3) | pynini.closure(NEMO_DIGIT ** 3)
        graph = (pynini.closure(NEMO_DIGIT, 1, 3) + grouped_tail) @ number_names

        self.graph = graph
        self.graph_with_and = self.add_optional_and(graph)

        if not deterministic:
            leading_zeros = pynini.compose(pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
            cardinal_with_leading_zeros = (
                leading_zeros
                + pynutil.insert(" ")
                + pynini.compose(pynini.closure(NEMO_DIGIT), self.graph_with_and)
            )

            # add small weight to non-default graphs to make sure the deterministic option is listed first
            final_graph = (
                self.graph_with_and
                | pynutil.add_weight(self.single_digits_graph, 0.0001)
                | get_four_digit_year_graph()  # allows e.g. 4567 be pronounced as forty five sixty seven
                | pynutil.add_weight(single_digits_graph_with_commas, 0.0001)
                | cardinal_with_leading_zeros
            )
        else:
            # Five or more digits are read digit by digit.
            long_numbers = pynini.compose(NEMO_DIGIT ** (5, ...), self.single_digits_graph).optimize()
            final_graph = plurals._priority_union(long_numbers, self.graph_with_and, NEMO_SIGMA).optimize()
            # Numbers starting with "0" are read digit by digit as well.
            cardinal_with_leading_zeros = pynini.compose(
                pynini.accep("0") + pynini.closure(NEMO_DIGIT), self.single_digits_graph
            )
            final_graph |= cardinal_with_leading_zeros

        final_graph = optional_minus_graph + pynutil.insert("integer: \"") + final_graph + pynutil.insert("\"")
        self.fst = self.add_tokens(final_graph).optimize()
    def __init__(self, deterministic: bool = True):
        """
        Build the cardinal classifier and store the optimized result in ``self.fst``.

        Also exposes ``self.graph``, ``self.single_digits_graph`` and
        ``self.graph_hundred_component_at_least_one_none_zero_digit``; in
        non-deterministic mode ``self.range_graph`` is set as well.

        Args:
            deterministic: when False, alternative readings (digit by digit,
                "oh" for zero, ranges, hundreds) are added.
        """
        super().__init__(name="cardinal",
                         kind="classify",
                         deterministic=deterministic)
        # TODO replace to have "oh" as a default for "0"
        graph = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
        # Two or three digits, or a single non-zero digit.
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3)
            | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
        # One to three digits followed by groups of three digits, each group
        # optionally preceded by a comma that is removed.
        self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
            pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT +
            NEMO_DIGIT + NEMO_DIGIT)) @ graph

        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

        # Digit-by-digit reading: one or more digits, space-separated in the output.
        single_digits_graph = pynini.invert(graph_digit | graph_zero)
        self.single_digits_graph = single_digits_graph + pynini.closure(
            insert_space + single_digits_graph)

        if not deterministic:
            # for a single token allow only the same normalization
            # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
            single_digits_graph_zero = pynini.invert(graph_digit | graph_zero)
            single_digits_graph_oh = pynini.invert(graph_digit) | pynini.cross(
                "0", "oh")

            self.single_digits_graph = single_digits_graph_zero + pynini.closure(
                insert_space + single_digits_graph_zero)
            self.single_digits_graph |= single_digits_graph_oh + pynini.closure(
                insert_space + single_digits_graph_oh)

            # Digit-by-digit reading of comma-grouped numbers; the commas are removed.
            single_digits_graph_with_commas = pynini.closure(
                self.single_digits_graph + insert_space, 1,
                3) + pynini.closure(
                    pynutil.delete(",") + single_digits_graph + insert_space +
                    single_digits_graph + insert_space + single_digits_graph,
                    1,
                )

            # "<n>-<m>" -> "from n to m"
            self.range_graph = pynutil.insert(
                "from ") + self.graph + pynini.cross("-", " to ") + self.graph
            # "<n>x<m>" / "<n> x <m>" -> "n by m"
            self.range_graph |= self.graph + (pynini.cross(
                "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
            # Same "from ... to ..." pattern using the graph from get_hundreds_graph().
            self.range_graph |= (pynutil.insert("from ") +
                                 get_hundreds_graph() +
                                 pynini.cross("-", " to ") +
                                 get_hundreds_graph())
            self.range_graph = self.range_graph.optimize()

        serial_graph = self.get_serial_graph()
        # Optional leading "-" is emitted as the field negative: "true".
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        if deterministic:
            # Five or more digits may be read digit by digit (slightly preferred
            # through the negative weight).
            long_numbers = pynini.compose(NEMO_DIGIT**(5, ...),
                                          self.single_digits_graph).optimize()
            final_graph = self.graph | serial_graph | pynutil.add_weight(
                long_numbers, -0.001)
            # Numbers starting with "0" are read digit by digit.
            cardinal_with_leading_zeros = pynini.compose(
                pynini.accep("0") + pynini.closure(NEMO_DIGIT),
                self.single_digits_graph)
            final_graph |= cardinal_with_leading_zeros
        else:

            # Leading zeros are read digit by digit, the remainder as a regular number.
            leading_zeros = pynini.compose(
                pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
            cardinal_with_leading_zeros = (
                leading_zeros + pynutil.insert(" ") +
                pynini.compose(pynini.closure(NEMO_DIGIT), self.graph))

            final_graph = (self.graph
                           | serial_graph
                           | self.range_graph
                           | self.single_digits_graph
                           | get_hundreds_graph()
                           | pynutil.add_weight(
                               single_digits_graph_with_commas, 0.001)
                           | cardinal_with_leading_zeros)

        # Wrap the result in the integer field and the token markup.
        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + final_graph + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)

        self.fst = final_graph.optimize()
Beispiel #12
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """
        Build the time classifier (written form -> tagged fields) and store the
        optimized result in ``self.fst``.

        Emitted fields are ``hours``, ``minutes``, ``seconds``, ``suffix``,
        ``zone`` and, in non-deterministic mode, ``style``. Supported shapes are
        listed in the inline examples below ("02.30 h", "2 h 30 min",
        "2.30 a. m.", and an hour followed by a time indicator).

        Args:
            cardinal: cardinal tagger whose ``graph`` verbalizes the numbers.
            deterministic: when True, " preserve_order: true" is appended; when
                False, shifted-hour variants tagged with a ``style`` are added.
        """
        super().__init__(name="time",
                         kind="classify",
                         deterministic=deterministic)

        # Hours, minutes and seconds may be separated by "." or ":".
        delete_time_delimiter = pynutil.delete(pynini.union(".", ":"))

        # Rewrite "un" / "ún" to "una" anywhere in the cardinal output.
        one = pynini.string_map([("un", "una"), ("ún", "una")])
        change_one = pynini.cdrewrite(one, "", "", NEMO_SIGMA)
        cardinal_graph = cardinal.graph @ change_one

        # NOTE(review): `suffix` is not defined in this method - presumably a module-level graph.
        day_suffix = pynutil.insert("suffix: \"") + suffix + pynutil.insert(
            "\"")
        day_suffix = delete_space + insert_space + day_suffix

        # Unit markers "h", "min" and "s" are removed; spacing is normalized to one space.
        delete_hora_suffix = delete_space + insert_space + pynutil.delete("h")
        delete_minute_suffix = delete_space + insert_space + pynutil.delete(
            "min")
        delete_second_suffix = delete_space + insert_space + pynutil.delete(
            "s")

        # Allowed values: hours 0-24 (24-hour clock), hours 1-12 (12-hour clock),
        # minutes/seconds 1-9 and 10-59.
        labels_hour_24 = [
            str(x) for x in range(0, 25)
        ]  # Can see both systems. Twelve hour requires am/pm for ambiguity resolution
        labels_hour_12 = [str(x) for x in range(1, 13)]
        labels_minute_single = [str(x) for x in range(1, 10)]
        labels_minute_double = [str(x) for x in range(10, 60)]

        # One or two digits; a leading "0" of a two-digit value is dropped
        # ("05" -> "5", "15" -> "15").
        delete_leading_zero_to_double_digit = (
            pynini.closure(pynutil.delete("0") |
                           (NEMO_DIGIT - "0"), 0, 1) + NEMO_DIGIT)

        graph_24 = (pynini.closure(NEMO_DIGIT, 1,
                                   2) @ delete_leading_zero_to_double_digit
                    @ pynini.union(*labels_hour_24))
        graph_12 = (pynini.closure(NEMO_DIGIT, 1,
                                   2) @ delete_leading_zero_to_double_digit
                    @ pynini.union(*labels_hour_12))

        graph_hour_24 = graph_24 @ cardinal_graph
        graph_hour_12 = graph_12 @ cardinal_graph

        graph_minute_single = delete_leading_zero_to_double_digit @ pynini.union(
            *labels_minute_single)
        graph_minute_double = pynini.union(*labels_minute_double)

        # "00" is not covered here; it is handled separately in the graphs below.
        graph_minute = pynini.union(graph_minute_single,
                                    graph_minute_double) @ cardinal_graph

        # Hour-only forms require a time indicator: "h" (24-hour) or a day suffix (12-hour).
        final_graph_hour_only_24 = (pynutil.insert("hours: \"") +
                                    graph_hour_24 + pynutil.insert("\"") +
                                    delete_hora_suffix)
        final_graph_hour_only_12 = pynutil.insert(
            "hours: \"") + graph_hour_12 + pynutil.insert("\"") + day_suffix

        final_graph_hour_24 = pynutil.insert(
            "hours: \"") + graph_hour_24 + pynutil.insert("\"")
        final_graph_hour_12 = pynutil.insert(
            "hours: \"") + graph_hour_12 + pynutil.insert("\"")

        final_graph_minute = pynutil.insert(
            "minutes: \"") + graph_minute + pynutil.insert("\"")
        # Seconds use the same value range as minutes.
        final_graph_second = pynutil.insert(
            "seconds: \"") + graph_minute + pynutil.insert("\"")
        # NOTE(review): `time_zone_graph` is not defined in this method - presumably module-level.
        final_time_zone_optional = pynini.closure(
            delete_space + insert_space + pynutil.insert("zone: \"") +
            time_zone_graph + pynutil.insert("\""),
            0,
            1,
        )

        # 02.30 h
        # Minutes "00" are dropped entirely; seconds "00" are emitted as seconds: "0".
        graph_hm = (
            final_graph_hour_24 + delete_time_delimiter +
            (pynutil.delete("00") |
             (insert_space + final_graph_minute)) + pynini.closure(
                 delete_time_delimiter +
                 (pynini.cross("00", " seconds: \"0\"") |
                  (insert_space + final_graph_second)),
                 0,
                 1,
             )  # For seconds 2.30.35 h
            + pynini.closure(delete_hora_suffix, 0,
                             1)  # 2.30 is valid if unambiguous
            + final_time_zone_optional)

        # 2 h 30 min
        graph_hm |= (
            final_graph_hour_24 + delete_hora_suffix + delete_space +
            (pynutil.delete("00") | (insert_space + final_graph_minute)) +
            delete_minute_suffix + pynini.closure(
                delete_space +
                (pynini.cross("00", " seconds: \"0\"") |
                 (insert_space + final_graph_second)) + delete_second_suffix,
                0,
                1,
            )  # For seconds
            + final_time_zone_optional)

        # 2.30 a. m. (Only for 12 hour clock)
        graph_hm |= (
            final_graph_hour_12 + delete_time_delimiter +
            (pynutil.delete("00") |
             (insert_space + final_graph_minute)) + pynini.closure(
                 delete_time_delimiter +
                 (pynini.cross("00", " seconds: \"0\"") |
                  (insert_space + final_graph_second)),
                 0,
                 1,
             )  # For seconds 2.30.35 a. m.
            + day_suffix + final_time_zone_optional)

        graph_h = (
            pynini.union(final_graph_hour_only_24, final_graph_hour_only_12) +
            final_time_zone_optional
        )  # Should always have a time indicator, else we'll pass to cardinals

        if not deterministic:
            # This includes alternate vocalization (hour menos min, min para hour), here we shift the times and indicate a `style` tag
            hour_shift_24 = pynini.invert(
                pynini.string_file(get_abs_path("data/time/hour_to_24.tsv")))
            hour_shift_12 = pynini.invert(
                pynini.string_file(get_abs_path("data/time/hour_to_12.tsv")))
            minute_shift = pynini.string_file(
                get_abs_path("data/time/minute_to.tsv"))

            # Hours and minutes are first shifted through the tables above, then verbalized.
            graph_hour_to_24 = graph_24 @ hour_shift_24 @ cardinal_graph
            graph_hour_to_12 = graph_12 @ hour_shift_12 @ cardinal_graph

            graph_minute_to = pynini.union(
                graph_minute_single,
                graph_minute_double) @ minute_shift @ cardinal_graph

            final_graph_hour_to_24 = pynutil.insert(
                "hours: \"") + graph_hour_to_24 + pynutil.insert("\"")
            final_graph_hour_to_12 = pynutil.insert(
                "hours: \"") + graph_hour_to_12 + pynutil.insert("\"")

            final_graph_minute_to = pynutil.insert(
                "minutes: \"") + graph_minute_to + pynutil.insert("\"")

            # Either style tag may be appended, so each shifted reading yields both variants.
            graph_menos = pynutil.insert(" style: \"1\"")
            graph_para = pynutil.insert(" style: \"2\"")

            final_graph_style = graph_menos | graph_para

            # 02.30 h (omitting seconds since a bit awkward)
            graph_hm |= (
                final_graph_hour_to_24 + delete_time_delimiter +
                insert_space + final_graph_minute_to + pynini.closure(
                    delete_hora_suffix, 0, 1)  # 2.30 is valid if unambiguous
                + final_time_zone_optional + final_graph_style)

            # 2 h 30 min
            graph_hm |= (final_graph_hour_to_24 + delete_hora_suffix +
                         delete_space + insert_space + final_graph_minute_to +
                         delete_minute_suffix + final_time_zone_optional +
                         final_graph_style)

            # 2.30 a. m. (Only for 12 hour clock)
            graph_hm |= (final_graph_hour_to_12 + delete_time_delimiter +
                         insert_space + final_graph_minute_to + day_suffix +
                         final_time_zone_optional + final_graph_style)

        final_graph = graph_hm | graph_h
        if deterministic:
            # Only the deterministic graph carries the preserve_order flag.
            final_graph = final_graph + pynutil.insert(" preserve_order: true")
        final_graph = final_graph.optimize()
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Beispiel #13
0
    NEMO_DIGIT,
    NEMO_SIGMA,
    NEMO_SPACE,
    NEMO_WHITE_SPACE,
    GraphFst,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.es.graph_utils import cardinal_separator
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# pynini is treated as an optional dependency: the number tables below are only
# built when it can be imported.
try:
    import pynini
    from pynini.lib import pynutil

    # Each table is read from a TSV file under data/numbers/ and then inverted,
    # i.e. the input and output sides of the mapping are swapped.
    # NOTE(review): presumably the TSVs map digits to words, so the inverted
    # graphs map words back to digits; confirm against the data files.
    zero = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/zero.tsv")))
    digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv")))
    teen = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/teen.tsv")))
    ties = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/ties.tsv")))
    twenties = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/twenties.tsv")))
    hundreds = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")))

    # Only reached when the imports and all table loads above succeeded.
    PYNINI_AVAILABLE = True

# ModuleNotFoundError is a subclass of ImportError; both are listed explicitly.
except (ModuleNotFoundError, ImportError):
    # Fallback placeholder so the module-level name exists without pynini.
    zero = None
Beispiel #14
0
    def __init__(self, cardinal_tagger: GraphFst, deterministic: bool = True):
        """Build the verbalizer grammar for ``time`` tokens.

        The grammar consumes the serialized fields ``hours: "<digits>"``,
        ``minutes: "<digits>"``, ``seconds: "<digits>"`` and ``zone: "<name>"``
        and emits spoken forms built from the inserted words " uhr",
        " minuten", " sekunden", "nach ", "vor ", "halb" and "viertel".

        The unweighted readings are hour only, hour + minute, and
        hour + minute + second. Three relative readings (minutes past the
        hour, half hour, minutes to the hour) are added with a small positive
        weight. The union is stored in ``self.graph`` and the token-stripped,
        optimized grammar in ``self.fst``.

        Args:
            cardinal_tagger: grammar object whose ``two_digit_non_zero`` graph
                is used (together with the zero table) to verbalize numbers.
            deterministic: forwarded unchanged to the base-class constructor.
        """
        super().__init__(name="time", kind="verbalize", deterministic=deterministic)

        low_priority = 0.0001

        def inverted_table(relative_path):
            """Load a TSV mapping, swap its two sides and optimize the result."""
            table = pynini.string_file(get_abs_path(relative_path))
            return pynini.invert(table).optimize()

        def digit_field(opening):
            """Delete ``opening`` and the closing quote around one or more digits."""
            return pynutil.delete(opening) + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete("\"")

        def deprioritized(graph):
            """Attach the small penalty weight shared by the relative readings."""
            return pynutil.add_weight(graph, weight=low_priority)

        # Weighted so that, when the grammar is used for inverse text
        # normalization, this conversion is deprioritized.
        night_to_early = deprioritized(inverted_table("data/time/hour_to_night.tsv"))
        hour_to_map = inverted_table("data/time/hour_to.tsv")
        minute_to_map = inverted_table("data/time/minute_to.tsv")

        zone_names = (label[1] for label in load_labels(get_abs_path("data/time/time_zone.tsv")))
        zone_graph = pynini.invert(convert_space(pynini.union(*zone_names)))

        spoken_number = inverted_table("data/numbers/zero.tsv") | cardinal_tagger.two_digit_non_zero

        space = pynini.accep(" ")
        hour = digit_field("hours: \"")
        minute = digit_field("minutes: \"")
        second = digit_field("seconds: \"")

        # A whole-string "eins" becomes "ein" before " uhr" is appended.
        eins_to_ein = pynini.cdrewrite(pynini.cross("eins", "ein"), "[BOS]", "[EOS]", NEMO_SIGMA)
        hour_spoken = (hour @ spoken_number @ eins_to_ein) + pynutil.insert(" uhr")

        zone = pynutil.delete("zone: \"") + zone_graph + pynutil.delete("\"")
        optional_zone = pynini.closure(space + zone, 0, 1)

        # hour + minutes + seconds, with singular unit names after "eins"
        hms = (
            hour_spoken
            + space
            + (minute @ spoken_number)
            + pynutil.insert(" minuten")
            + space
            + (second @ spoken_number)
            + pynutil.insert(" sekunden")
            + optional_zone
        )
        singular_units = pynini.cross("eins minuten", "eine minute") | pynini.cross(
            "eins sekunden", "eine sekunde"
        )
        hms = hms @ pynini.cdrewrite(singular_units, pynini.union(" ", "[BOS]"), "", NEMO_SIGMA)

        minutes_1_to_30 = pynini.union(*map(str, range(1, 31)))
        minutes_1_to_29 = pynini.union(*map(str, range(1, 30)))

        # hour + minutes, read plainly
        hm = hour_spoken + space + (minute @ spoken_number)

        # Shared pieces of the three relative readings.
        number_or_quarter = spoken_number | pynini.cross("15", "viertel")
        early_hour = hour @ pynini.cdrewrite(night_to_early, "[BOS]", "[EOS]", NEMO_SIGMA)

        minutes_past = (
            (minute @ minutes_1_to_30 @ number_or_quarter)
            + space
            + pynutil.insert("nach ")
            + (early_hour @ spoken_number)
        )
        half_hour = (
            (minute @ pynini.cross("30", "halb"))
            + space
            + (early_hour @ hour_to_map @ spoken_number)
        )
        minutes_to = (
            (minute @ minute_to_map @ minutes_1_to_29 @ number_or_quarter)
            + space
            + pynutil.insert("vor ")
            + (early_hour @ hour_to_map @ spoken_number)
        )

        self.graph = (
            hms
            | hour_spoken
            | hm
            | deprioritized(minutes_past)
            | deprioritized(half_hour)
            | deprioritized(minutes_to)
        ) + optional_zone
        self.fst = self.delete_tokens(self.graph + delete_preserve_order).optimize()
Beispiel #15
0
# compose - *
# concat  - +
# union   - |

# Union of two alternatives, grouped explicitly because ``+`` binds tighter
# than ``|``:
#   1. "a" or "e", then "a" rewritten to between zero and five "0" symbols
#   2. any run of "a" rewritten to "0", then the literal "xxx"
fst = (
    ((pn.a("a") | pn.a("e")) + pn.t("a", pn.a("0").closure(0, 5)))
    | (pn.t(pn.a("a").star, "0") + pn.a("xxx"))
).optimize()

# Draw 10000 single random paths, each with a fresh random seed, and keep the
# distinct strings they stringify to.
output_strings = {
    pn.randgen(fst, 1, random.randint(0, 100000)).stringify()
    for _ in range(10000)
}

print(len(output_strings))

for sampled in output_strings:
    print(sampled)


def top_paths(fst, count=100):
    """Return the sorted, de-duplicated strings of the shortest paths of ``fst``.

    Up to ``count`` shortest paths are requested; element 1 of each path tuple
    is collected (presumably the output string; confirm against the pynini
    version in use).
    """
    shortest = pn.shortestpath(fst, nshortest=count)
    unique_strings = {path[1] for path in shortest.paths()}
    return sorted(unique_strings)


print("INPUTS")
# Emit the leading tab WITHOUT a newline: the separator below indents every
# path after the first, so the first one needs its tab printed up front.
# (Previously print("\t") ended the line, leaving the first path unindented.)
print("\t", end="")
# Inverting the FST swaps its sides, so the listed strings are the inputs of
# the original ``fst``.
print(*top_paths(pn.invert(fst), 20), sep="\n\t")
Beispiel #16
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """Build the classifier grammar for ``money`` tokens.

        Accepted shapes (see the inline comments for the spoken examples):
          * a whole amount followed by a currency unit, optionally followed by
            cents, either spelled out ("... cents" / "one cent", with an
            optional "and") or as a bare trailing number;
          * a decimal amount followed by a currency unit;
          * cents on their own, tagged with currency "$" and integer part "0".

        The grammar emits the fields ``integer_part``, ``fractional_part`` and
        ``currency``. The tokenized, optimized result is stored in ``self.fst``.

        Args:
            cardinal: grammar object providing ``graph_no_exception``.
            decimal: grammar object providing ``final_graph_wo_negative``.
        """
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency, style(depr)

        def quoted(opening, body):
            """Wrap ``body`` between an inserted ``opening`` and a closing quote."""
            return pynutil.insert(opening) + body + pynutil.insert("\"")

        number = cardinal.graph_no_exception
        number_except_one = (NEMO_SIGMA - "one") @ number
        decimal_amount = decimal.final_graph_wo_negative

        currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
        units_singular = pynini.invert(currency_table)
        # NOTE(review): the plural-unit graph is produced by a helper named
        # get_singulars; confirm that helper's direction against its definition.
        units_plural = get_singulars(units_singular)

        currency_singular = quoted("currency: \"", convert_space(units_singular))
        currency_plural = quoted("currency: \"", convert_space(units_plural))

        # One digit gets a leading "0"; two digits pass through unchanged.
        two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

        # twelve dollars (and) fifty cents, zero cents
        many_cents = (
            (pynutil.add_weight(number_except_one, -0.7) @ two_digits)
            + delete_space
            + pynutil.delete("cents")
        )
        one_cent = pynini.cross("one", "01") + delete_space + pynutil.delete("cent")
        cents_standalone = quoted("fractional_part: \"", pynini.union(many_cents, one_cent))

        optional_and = pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
        optional_cents_standalone = pynini.closure(
            delete_space + optional_and + insert_space + cents_standalone, 0, 1
        )

        # twelve dollars fifty, only after integer
        optional_cents_suffix = pynini.closure(
            delete_extra_space
            + pynutil.insert("fractional_part: \"")
            + pynutil.add_weight(number @ two_digits, -0.7)
            + pynutil.insert("\""),
            0,
            1,
        )

        optional_cents = optional_cents_standalone | optional_cents_suffix

        # Every amount except "one" pairs with the plural unit graph; "one"
        # pairs with the singular unit graph.
        amount_plural = (
            quoted("integer_part: \"", number_except_one)
            + delete_extra_space
            + currency_plural
            + optional_cents
        )
        amount_singular = (
            quoted("integer_part: \"", pynini.cross("one", "1"))
            + delete_extra_space
            + currency_singular
            + optional_cents
        )
        graph_integer = amount_plural | amount_singular

        with_decimal = decimal_amount + delete_extra_space + currency_plural
        cents_only = pynutil.insert("currency: \"$\" integer_part: \"0\" ") + cents_standalone
        graph_decimal = with_decimal | cents_only

        tokenized = self.add_tokens(graph_integer | graph_decimal)
        self.fst = tokenized.optimize()