def get_names():
    """
    Build the graph matching common male and female names.

    Labels come from data/roman/male.tsv and data/roman/female.tsv; for every
    loaded entry an additional all upper-case spelling is added before the
    graphs are compiled and unioned.
    """
    compiled = []
    for tsv_file in ("data/roman/male.tsv", "data/roman/female.tsv"):
        labels = load_labels(get_abs_path(tsv_file))
        # the comprehension is fully evaluated before the list grows
        labels.extend([[entry[0].upper()] for entry in labels])
        compiled.append(pynini.string_map(labels).optimize())

    names, female_names = compiled
    names |= female_names
    return names
def _get_whitelist_non_deterministic_graph(
        file="data/whitelist_alternatives.tsv"):
    """
    Compile the whitelist alternatives into a single string map.

    Every (written, spoken) pair of the file is included twice: once with
    both sides lower-cased and once exactly as it appears in the file.

    Args:
        file: path of the tsv file, relative to this package (resolved with get_abs_path)
    """
    lowered, as_written = [], []
    for written, spoken in load_labels(get_abs_path(file)):
        lowered.append((written.lower(), spoken.lower()))
        as_written.append((written, spoken))
    return pynini.string_map(lowered + as_written)
def __init__(self, deterministic: bool = True):
    """
    Build the punctuation classifier.

    The set of punctuation marks (self.punct_marks) is every code point whose
    unicode category starts with "P" (square brackets excluded) plus a fixed
    ASCII list, minus the symbols listed in data/whitelist/symbol.tsv.
    The final graph also covers "<...>" emphasis-style tags.

    Args:
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="punctuation",
                     kind="classify",
                     deterministic=deterministic)

    ascii_punct = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""
    excluded_symbols = ["[", "]"]

    unicode_punct = []
    for code_point in range(sys.maxunicode):
        symbol = chr(code_point)
        if category(symbol).startswith("P") and symbol not in excluded_symbols:
            unicode_punct.append(symbol)

    whitelisted = [
        row[0]
        for row in load_labels(get_abs_path("data/whitelist/symbol.tsv"))
    ]
    self.punct_marks = [
        mark for mark in unicode_punct + list(ascii_punct)
        if mark not in whitelisted
    ]

    # one or more punctuation marks in a row
    punct = pynini.closure(pynini.union(*self.punct_marks), 1)

    # "<tag>", "<tag/>" or "</tag>", where tag has no spaces or angle brackets
    opening_or_self_closing = (
        pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1) +
        pynini.closure(pynini.accep("/"), 0, 1))
    closing = (pynini.accep("/") +
               pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1))
    emphasis = (pynini.accep("<") + (opening_or_self_closing | closing) +
                pynini.accep(">"))

    self.graph = plurals._priority_union(emphasis, punct, NEMO_SIGMA)
    self.fst = (pynutil.insert("name: \"") + self.graph +
                pynutil.insert("\"")).optimize()
def get_serial_graph(self):
    """
    Finite state transducer for classifying serial numbers.
    A serial is a combination of digits, letters and "-" or "/" delimiters, e.g.:
        c325-b -> tokens { cardinal { integer: "c three two five b" } }

    Returns the serial graph with a weight of 10 added.
    """
    letters = NEMO_ALPHA
    if self.deterministic:
        numbers = self.single_digits_graph
    else:
        numbers = self.graph
        pronunciation_labels = load_labels(
            get_abs_path("data/letter_pronunciation.tsv"))
        letters |= pynini.string_map(pronunciation_labels)

    # a delimiter is either implicit (a space is inserted) or "-" / "/" turned into a space
    separator = insert_space | pynini.cross("-", " ") | pynini.cross("/", " ")

    letters_then_number = pynini.closure(letters + separator, 1) + numbers
    numbers_then_letter = pynini.closure(numbers + separator, 1) + letters
    numbers_then_number = pynini.closure(numbers + separator, 1) + numbers
    tail = pynini.closure(separator + (letters | numbers))

    serial_graph = (letters_then_number | numbers_then_letter
                    | numbers_then_number) + tail

    if not self.deterministic:
        # optional trailing "s", kept as is or expanded to "es"
        plural_suffix = pynini.accep("s") | pynini.cross("s", "es")
        serial_graph += pynini.closure(plural_suffix, 0, 1)

    serial_graph.optimize()
    return pynutil.add_weight(serial_graph, 10)
def _get_whitelist_graph(input_case, file):
    """
    Compile a whitelist file into a string map.

    Args:
        input_case: when equal to "lower_cased" the written form (first column) is lower-cased
        file: path to a two column file (written form, spoken form)
    """
    lower = input_case == "lower_cased"
    pairs = [(written.lower() if lower else written, spoken)
             for written, spoken in load_labels(file)]
    return pynini.string_map(pairs)
def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
    """
    Compile a packaged whitelist file into a string map.

    Args:
        input_case: when equal to "lower_cased" the written form (first column) is lower-cased
        file: path of the tsv file, relative to this package (resolved with get_abs_path)
    """
    pairs = []
    for written, spoken in load_labels(get_abs_path(file)):
        if input_case == "lower_cased":
            written = written.lower()
        pairs.append((written, spoken))
    return pynini.string_map(pairs)
def _get_whitelist_graph(input_case, file, keep_punct_add_end: bool = False):
    """
    Compile a whitelist file into a string map.

    Args:
        input_case: when equal to "lower_cased" the written form (first column) is lower-cased
        file: path to a two column file (written form, spoken form)
        keep_punct_add_end: when True the entries returned by
            augment_labels_with_punct_at_end are appended to the whitelist
    """
    to_lower = input_case == "lower_cased"
    entries = [[written.lower() if to_lower else written, spoken]
               for written, spoken in load_labels(file)]
    if keep_punct_add_end:
        entries.extend(augment_labels_with_punct_at_end(entries))
    return pynini.string_map(entries)
def prepare_labels_for_insertion(file_path: str):
    """
    Read the file and create one union insertion graph per key

    Args:
        file_path: path to a label file; every row is unpacked as a (key, label) pair

    Returns a mapping from key to an fst made of insert_space followed by the
    insertion of any one of the labels listed for that key
    """
    grouped = defaultdict(list)
    for key, label in load_labels(file_path):
        grouped[key].append(label)

    # replace each list of labels by its compiled graph (the key set does not change)
    for key, endings in grouped.items():
        insertions = [pynutil.insert(ending) for ending in endings]
        grouped[key] = insert_space + pynini.union(*insertions)
    return grouped
def get_formats(input_f, input_case="cased", is_default=True):
    """
    Adds various abbreviation format options to the list of acceptable input forms

    Args:
        input_f: path to a two column file (abbreviation, expansion)
        input_case: when equal to "lower_cased" the abbreviation is lower-cased
            before the extra variants are generated
        is_default: when False every output is wrapped in
            |raw_start|...|raw_end||norm_start|...|norm_end| markers

    Returns a string map over the file entries plus three extra variants per entry
    """
    formats = load_labels(input_f)
    lower = input_case == "lower_cased"

    variants = []
    for abbr, expansion in formats:
        if lower:
            abbr = abbr.lower()
        abbr_capitalized = abbr[0].upper() + abbr[1:]
        expansion_capitalized = expansion[0].upper() + expansion[1:]
        variants += [
            # default "dr" -> doctor, this includes period "dr." -> doctor
            (f"{abbr}.", expansion),
            # "Dr" -> Doctor
            (f"{abbr_capitalized}", f"{expansion_capitalized}"),
            # "Dr." -> Doctor
            (f"{abbr_capitalized}.", f"{expansion_capitalized}"),
        ]
    formats.extend(variants)

    if not is_default:
        formats = [
            (raw, f"|raw_start|{raw}|raw_end||norm_start|{norm}|norm_end|")
            for (raw, norm) in formats
        ]

    return pynini.string_map(formats)
def prepare_labels_for_insertion(file_path: str):
    """
    Read the file and create one weighted union insertion graph per label type

    Args:
        file_path: path to a file (3 columns: a label type e.g.
            "@@decimal_delimiter@@", a label e.g. "целого", and a weight e.g. "0.1").

    Returns a mapping from label type to an optimized fst made of insert_space
    followed by the insertion of any one of the labels, each with its weight.
    """
    grouped = defaultdict(list)
    for label_type, label, weight in load_labels(file_path):
        grouped[label_type].append((label, weight))

    # replace each list of (label, weight) by its compiled graph
    for label_type in grouped:
        weighted_insertions = [
            pynutil.add_weight(pynutil.insert(label), weight)
            for label, weight in grouped[label_type]
        ]
        grouped[label_type] = (
            insert_space + pynini.union(*weighted_insertions)).optimize()
    return grouped
def __init__(self, deterministic: bool = True, lm: bool = False):
    """
    Build the roman numeral classifier.

    Three kinds of input are covered:
        * a name followed by one of the first `ordinal_limit` roman numerals (key_the_ordinal),
        * a key word followed by a roman numeral (key_cardinal),
        * a standalone roman numeral (default_cardinal), or one followed by "th" (default_ordinal).

    Args:
        deterministic: when True the standalone cardinal graph is restricted to
            numerals of at least two letters among the first 50 entries of the table
        lm: when True the same restricted standalone graph is used even if
            deterministic is False
    """
    super().__init__(name="roman",
                     kind="classify",
                     deterministic=deterministic)

    roman_dict = load_labels(get_abs_path("data/roman/roman_to_spoken.tsv"))
    default_graph = pynini.string_map(roman_dict).optimize()
    default_graph = pynutil.insert(
        "integer: \"") + default_graph + pynutil.insert("\"")
    ordinal_limit = 19

    graph_teens = pynini.string_map(
        [x[0] for x in roman_dict[:ordinal_limit]]).optimize()

    # roman numerals up to ordinal_limit with a preceding name are converted to ordinal form
    names = get_names()
    graph = (pynutil.insert("key_the_ordinal: \"") + names +
             pynutil.insert("\"") + pynini.accep(" ") +
             graph_teens @ default_graph).optimize()

    # single symbol roman numerals with preceding key words (multiple formats) are converted to cardinal form
    key_words = []
    for k_word in load_labels(get_abs_path("data/roman/key_word.tsv")):
        key_words.append(k_word)
        # capitalized and all upper-case spellings of the key word
        key_words.append([k_word[0][0].upper() + k_word[0][1:]])
        key_words.append([k_word[0].upper()])

    key_words = pynini.string_map(key_words).optimize()
    graph |= (pynutil.insert("key_cardinal: \"") + key_words +
              pynutil.insert("\"") + pynini.accep(" ") +
              default_graph).optimize()

    # `roman_to_cardinal` must be bound on every path: the former
    # `if deterministic: ... elif not lm: ...` left it undefined for
    # deterministic=False, lm=True and crashed at the union below.
    if deterministic or lm:
        # two digit roman numerals up to 49
        roman_to_cardinal = pynini.compose(
            pynini.closure(NEMO_ALPHA, 2),
            (pynutil.insert("default_cardinal: \"default\" ") +
             (pynini.string_map([x[0] for x in roman_dict[:50]]).optimize())
             @ default_graph),
        )
    else:
        # two or more digit roman numerals
        roman_to_cardinal = pynini.compose(
            pynini.difference(NEMO_SIGMA, "I"),
            (pynutil.insert("default_cardinal: \"default\" integer: \"") +
             pynini.string_map(roman_dict).optimize() +
             pynutil.insert("\"")),
        ).optimize()

    # convert three digit roman or up with suffix to ordinal
    roman_to_ordinal = pynini.compose(
        pynini.closure(NEMO_ALPHA, 3),
        (pynutil.insert("default_ordinal: \"default\" ") +
         graph_teens @ default_graph + pynutil.delete("th")),
    )

    graph |= roman_to_cardinal | roman_to_ordinal

    graph = self.add_tokens(graph)
    self.fst = graph.optimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """
    Build the whitelist classifier.

    Args:
        input_case: when equal to "lower_cased" the written forms are lower-cased
        deterministic: when False additional alternative, multi-format and unit
            entries are added to the graph
        input_file: optional path to a user supplied whitelist; it replaces the
            built-in graph in deterministic mode and is unioned with it otherwise
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    lower_cased = input_case == "lower_cased"

    def _labels_to_graph(case, path, with_end_punct: bool = False):
        # compile a two column label file into a string map
        rows = [
            [written.lower() if case == "lower_cased" else written, spoken]
            for written, spoken in load_labels(path)
        ]
        if with_end_punct:
            rows.extend(augment_labels_with_punct_at_end(rows))
        return pynini.string_map(rows)

    graph = _labels_to_graph(input_case, get_abs_path("data/whitelist/tts.tsv"))
    graph |= _labels_to_graph(input_case, get_abs_path("data/whitelist/symbol.tsv"))

    if not deterministic:
        graph |= _labels_to_graph(
            input_case, get_abs_path("data/whitelist/alternatives.tsv"), with_end_punct=True
        )
    else:
        # "St. <name>" -> "Saint <name>"
        names = get_names()
        saint = pynini.cross(pynini.union("st", "St", "ST"), "Saint")
        graph |= saint + pynini.closure(pynutil.delete(".")) + pynini.accep(" ") + names

    # upper-case letters separated by "." or ". ": the separators are deleted
    for separator in [".", ". "]:
        following_letters = pynini.closure(pynutil.delete(separator) + NEMO_UPPER, 2)
        optional_final_period = pynini.closure(pynutil.delete("."), 0, 1)
        graph |= NEMO_UPPER + following_letters + optional_final_period

    if not deterministic:
        graph |= get_formats(get_abs_path("data/whitelist/alternatives_all_format.tsv"))

        singular_units = pynini.string_file(get_abs_path("data/measure/unit.tsv")) | pynini.string_file(
            get_abs_path("data/measure/unit_alternatives.tsv")
        )
        plural_units = singular_units @ SINGULAR_TO_PLURAL
        # only units written with at least three characters
        graph |= pynini.compose(NEMO_CHAR ** (3, ...), convert_space(singular_units | plural_units))

    # convert to states only if comma is present before the abbreviation to avoid converting all caps words,
    # e.g. "IN", "OH", "OK"
    # TODO or only exclude above?
    states = load_labels(get_abs_path("data/address/state.tsv"))
    dotted_variants = []
    for name, abbr in states:
        if lower_cased:
            name = name.lower()
        dotted_variants.append((name, f"{abbr[0]}.{abbr[1:]}"))
        if not deterministic:
            dotted_variants.append((name, f"{abbr[0]}.{abbr[1:]}."))
    states.extend(dotted_variants)

    state_graph = pynini.string_map(states)
    graph |= pynini.closure(NEMO_NOT_SPACE, 1) + pynini.union(", ", ",") + pynini.invert(state_graph).optimize()

    if input_file:
        whitelist_provided = _labels_to_graph(input_case, input_file)
        if deterministic:
            graph = whitelist_provided
        else:
            graph |= whitelist_provided

    self.graph = (convert_space(graph)).optimize()
    self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """
    Build the money classifier.

    Args:
        cardinal: cardinal GraphFst; its `graph` is used for integer amounts
        decimal: decimal GraphFst; `final_graph_wo_negative`, `graph_integer`
            and `graph_fractional` are used
        deterministic: when False the graph is replaced by a union of
            per-currency alternatives built in the loop below
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)
    cardinal_graph = cardinal.graph
    graph_decimal_final = decimal.final_graph_wo_negative
    unit_singular = pynini.string_file(
        get_abs_path("data/currency/currency.tsv"))
    unit_plural = convert_space(unit_singular @ SINGULAR_TO_PLURAL)
    unit_singular = convert_space(unit_singular)
    graph_unit_singular = pynutil.insert(
        "currency: \"") + unit_singular + pynutil.insert("\"")
    graph_unit_plural = pynutil.insert(
        "currency: \"") + unit_plural + pynutil.insert("\"")
    # amount "1": singular currency name and integer part "one"
    singular_graph = (graph_unit_singular +
                      pynutil.insert(" integer_part: \"") +
                      pynini.cross("1", "one") + pynutil.insert("\""))
    graph_decimal = graph_unit_plural + insert_space + graph_decimal_final
    if deterministic:
        # any amount other than "1" takes the plural currency name
        graph_integer = (graph_unit_plural +
                         pynutil.insert(" integer_part: \"") +
                         ((NEMO_SIGMA - "1") @ cardinal_graph) +
                         pynutil.insert("\""))
    else:
        graph_integer = (
            graph_unit_plural + pynutil.insert(" integer_part: \"") +
            ((NEMO_SIGMA - "1") @
             (get_hundreds_graph(deterministic) | cardinal_graph)) +
            pynutil.insert("\""))
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # listing; confirm this statement belongs to the non-deterministic branch
        graph_decimal |= singular_graph + insert_space + graph_decimal_final
    graph_integer |= singular_graph
    final_graph = graph_integer | graph_decimal
    if not deterministic:
        currencies = load_labels(
            get_abs_path("data/currency/currency.tsv"))
        zero_graph = pynini.cross("0", "") | pynini.accep("0")
        # add minor currency part only when there are two digits after the point
        # .01 -> {zero one cent, one cent}, .05 -> {oh five, five cents}
        two_digits_fractional_part = (
            NEMO_SIGMA + pynini.closure(NEMO_DIGIT) +
            ((pynini.accep(".") +
              (NEMO_DIGIT**(2) | zero_graph + (NEMO_DIGIT - "0")))
             | pynutil.delete(".") +
             pynini.cross(pynini.closure("0", 1), "")))
        # accumulators for the per-currency graphs (None until the first currency)
        integer_graph = None
        decimal_graph_with_minor = None
        decimal_graph_default = None
        for curr_symbol, curr_name in currencies:
            curr_symbol_graph = pynutil.delete(curr_symbol)
            graph_end = pynutil.insert(" currency: \"" + curr_symbol + "\"")
            preserve_order = pynutil.insert(" preserve_order: True")
            integer_part = decimal.graph_integer + graph_end + preserve_order
            # "$4" -> 'integer_part: "four" currency: "$" preserve_order: True' -> four dollars
            integer_graph_curr = curr_symbol_graph + integer_part
            # remove fractional part if it contains only zeros
            # "$4.00" -> 'integer_part: "four" currency: "$" preserve_order: True' -> four dollars
            integer_graph_curr |= pynini.compose(
                two_digits_fractional_part, integer_graph_curr)
            decimal_graph_with_minor_curr = (
                curr_symbol_graph + pynini.closure(integer_part, 0, 1) +
                pynini.cross(".", " ") + decimal.graph_fractional +
                graph_end)
            # the three graphs below rewrite the tagged output of graph_decimal:
            # the leading currency field is deleted and re-inserted at the end
            # "$.5" -> 'fractional_part: "five" currency: "dollars"' -> point five dollars
            decimal_graph_default_curr = (
                pynutil.delete("currency: \"" +
                               pynini.compose(curr_symbol, unit_plural) +
                               "\"") + delete_space +
                pynini.accep("fractional_part") + NEMO_SIGMA +
                pynutil.insert(" currency: \"" +
                               pynini.compose(curr_symbol, unit_plural) +
                               "\""))
            # "$4.5" -> 'integer_part: "four" fractional_part: "five" currency: "dollars"' -> "four point five dollars"
            decimal_graph_default_curr |= (
                pynutil.delete("currency: \"" + curr_name +
                               pynini.closure(NEMO_NOT_QUOTE) + "\"") +
                delete_space + pynini.accep("integer_part") + NEMO_SIGMA +
                pynini.accep("fractional_part") + NEMO_SIGMA +
                pynutil.insert(" currency: \"" +
                               pynini.compose(curr_symbol, unit_plural) +
                               "\""))
            # "£4 billion" -> 'integer_part: "four" quantity: "billion" currency: "pounds"' -> "four billion pounds"
            decimal_graph_default_curr |= (
                pynutil.delete("currency: \"") + pynutil.delete(
                    rewrite.rewrite_lattice(
                        curr_symbol,
                        pynini.compose(curr_symbol, unit_plural)) + "\" ") +
                pynini.difference(NEMO_SIGMA, "fractional_part") +
                pynutil.insert(" currency: \"" +
                               pynini.compose(curr_symbol, unit_plural) +
                               "\""))
            decimal_graph_with_minor_curr = pynini.compose(
                two_digits_fractional_part, decimal_graph_with_minor_curr)
            decimal_graph_default_curr = pynini.compose(
                graph_decimal, decimal_graph_default_curr)
            # fold the graphs of this currency into the accumulators
            integer_graph = (integer_graph_curr
                             if integer_graph is None else pynini.union(
                                 integer_graph, integer_graph_curr))
            decimal_graph_with_minor = (decimal_graph_with_minor_curr
                                        if decimal_graph_with_minor is None
                                        else pynini.union(
                                            decimal_graph_with_minor,
                                            decimal_graph_with_minor_curr))
            decimal_graph_default = (
                decimal_graph_default_curr
                if decimal_graph_default is None else pynini.union(
                    decimal_graph_default, decimal_graph_default_curr))
        # replaces the graph built above
        final_graph = decimal_graph_with_minor | decimal_graph_default | integer_graph
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def _load_roman(file: str):
    """
    Compile a roman numeral table into a string map.

    Every (numeral, spoken) pair of the file is included as written and with
    the numeral upper-cased.

    Args:
        file: path of the tsv file, relative to this package (resolved with get_abs_path)
    """
    as_written, upper_cased = [], []
    for numeral, spoken in load_labels(get_abs_path(file)):
        as_written.append((numeral, spoken))
        upper_cased.append((numeral.upper(), spoken))
    return pynini.string_map(as_written + upper_cased)
) from nemo_text_processing.text_normalization.en.utils import ( augment_labels_with_punct_at_end, get_abs_path, load_labels, ) try: import pynini from pynini.lib import pynutil from pynini.examples import plurals graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/number/teen.tsv"))).optimize() graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize() ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/number/ty.tsv"))).optimize() year_suffix = load_labels(get_abs_path("data/date/year_suffix.tsv")) year_suffix.extend(augment_labels_with_punct_at_end(year_suffix)) year_suffix = pynini.string_map(year_suffix).optimize() PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Add placeholders for global variables graph_teen = None graph_digit = None ties_graph = None PYNINI_AVAILABLE = True def get_ties_graph(deterministic: bool = True): """
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """
    Build the money classifier with major and minor currency units.

    Args:
        cardinal: cardinal GraphFst; `graph_with_and` and
            `graph_hundred_component_at_least_one_none_zero_digit` are used
        decimal: decimal GraphFst; `final_graph_wo_negative_w_abbr` is used
        deterministic: when False additional reordered alternatives
            (tagged with preserve_order) are added

    NOTE(review): `maj_singular`, `min_singular` and `min_plural` are not
    defined in this method; presumably module-level graphs - confirm.
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)
    cardinal_graph = cardinal.graph_with_and
    graph_decimal_final = decimal.final_graph_wo_negative_w_abbr
    maj_singular_labels = load_labels(
        get_abs_path("data/money/currency_major.tsv"))
    maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
    maj_unit_singular = convert_space(maj_singular)
    graph_maj_singular = pynutil.insert(
        "currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
    graph_maj_plural = pynutil.insert(
        "currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")
    # optionally drop a fractional part made only of zeros, e.g. ".00"
    optional_delete_fractional_zeros = pynini.closure(
        pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1)
    graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross(
        "1", "one") + pynutil.insert("\"")
    # only for decimals where third decimal after comma is non-zero or with quantity
    decimal_delete_last_zeros = (
        pynini.closure(NEMO_DIGIT | pynutil.delete(",")) +
        pynini.accep(".") + pynini.closure(NEMO_DIGIT, 2) +
        (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0")))
    decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA
    graph_decimal = (graph_maj_plural + insert_space +
                     (decimal_delete_last_zeros | decimal_with_quantity)
                     @ graph_decimal_final)
    # any integer amount other than "1"
    graph_integer = (pynutil.insert("integer_part: \"") +
                     ((NEMO_SIGMA - "1") @ cardinal_graph) +
                     pynutil.insert("\""))
    graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
    graph_integer_only |= graph_maj_plural + insert_space + graph_integer
    final_graph = (graph_integer_only +
                   optional_delete_fractional_zeros) | graph_decimal
    # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
    # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
    # not accepted: 002, 00, 0,
    two_digits_fractional_part = (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") +
        pynini.closure(pynutil.delete("0"))) @ (
            (pynutil.delete("0") + (NEMO_DIGIT - "0"))
            | ((NEMO_DIGIT - "0") + pynutil.insert("0"))
            | ((NEMO_DIGIT - "0") + NEMO_DIGIT))
    graph_min_singular = pynutil.insert(
        " currency_min: \"") + min_singular + pynutil.insert("\"")
    graph_min_plural = pynutil.insert(
        " currency_min: \"") + min_plural + pynutil.insert("\"")
    # format ** dollars ** cent
    # accumulators for the per-currency graphs (None until the first currency)
    decimal_graph_with_minor = None
    integer_graph_reordered = None
    decimal_default_reordered = None
    for curr_symbol, _ in maj_singular_labels:
        preserve_order = pynutil.insert(" preserve_order: true")
        # integer part followed by the major currency looked up from the inserted symbol
        integer_plus_maj = graph_integer + insert_space + pynutil.insert(
            curr_symbol) @ graph_maj_plural
        integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(
            curr_symbol) @ graph_maj_singular
        # input starting with a non-zero digit, "," separators are deleted
        integer_plus_maj_with_comma = pynini.compose(
            NEMO_DIGIT - "0" +
            pynini.closure(NEMO_DIGIT | pynutil.delete(",")),
            integer_plus_maj)
        # any digit string except the single digit "0"
        integer_plus_maj = pynini.compose(
            pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
        integer_plus_maj |= integer_plus_maj_with_comma
        graph_fractional_one = two_digits_fractional_part @ pynini.cross(
            "1", "one")
        graph_fractional_one = pynutil.insert(
            "fractional_part: \"") + graph_fractional_one + pynutil.insert(
                "\"")
        # fractional amounts other than "1"
        graph_fractional = (
            two_digits_fractional_part @ (
                pynini.closure(NEMO_DIGIT, 1, 2) - "1"
            ) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit)
        graph_fractional = pynutil.insert(
            "fractional_part: \"") + graph_fractional + pynutil.insert(
                "\"")
        fractional_plus_min = graph_fractional + insert_space + pynutil.insert(
            curr_symbol) @ graph_min_plural
        fractional_plus_min |= (
            graph_fractional_one + insert_space +
            pynutil.insert(curr_symbol) @ graph_min_singular)
        decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(
            ".", " ") + fractional_plus_min
        if not deterministic:
            # alternative without a minor currency name, slightly penalized
            decimal_graph_with_minor_curr |= pynutil.add_weight(
                integer_plus_maj + pynini.cross(".", " ") +
                pynutil.insert("fractional_part: \"") +
                two_digits_fractional_part @ cardinal.
                graph_hundred_component_at_least_one_none_zero_digit +
                pynutil.insert("\""),
                weight=0.0001,
            )
            default_fraction_graph = (
                decimal_delete_last_zeros
                | decimal_with_quantity) @ graph_decimal_final
        # amounts without an integer part: an optional "0" and the "." are deleted
        decimal_graph_with_minor_curr |= (
            pynini.closure(pynutil.delete("0"), 0, 1) +
            pynutil.delete(".") + fractional_plus_min)
        decimal_graph_with_minor_curr = (pynutil.delete(curr_symbol) +
                                         decimal_graph_with_minor_curr +
                                         preserve_order)
        decimal_graph_with_minor = (
            decimal_graph_with_minor_curr
            if decimal_graph_with_minor is None else pynini.union(
                decimal_graph_with_minor,
                decimal_graph_with_minor_curr).optimize())
        if not deterministic:
            integer_graph_reordered_curr = (pynutil.delete(curr_symbol) +
                                            integer_plus_maj +
                                            preserve_order).optimize()
            integer_graph_reordered = (
                integer_graph_reordered_curr
                if integer_graph_reordered is None else pynini.union(
                    integer_graph_reordered,
                    integer_graph_reordered_curr).optimize())
            decimal_default_reordered_curr = (
                pynutil.delete(curr_symbol) + default_fraction_graph +
                insert_space +
                pynutil.insert(curr_symbol) @ graph_maj_plural)
            decimal_default_reordered = (
                decimal_default_reordered_curr
                if decimal_default_reordered is None else pynini.union(
                    decimal_default_reordered,
                    decimal_default_reordered_curr)).optimize()
    # weight for SH
    final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.0001)
    if not deterministic:
        final_graph |= integer_graph_reordered | decimal_default_reordered
        # to handle "$2.00" cases
        final_graph |= pynini.compose(
            NEMO_SIGMA + pynutil.delete(".") +
            pynini.closure(pynutil.delete("0"), 1),
            integer_graph_reordered)
    final_graph = self.add_tokens(final_graph.optimize())
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """
    Build the date classifier.

    Supported layouts: month-day-year (month written out or numeric),
    day-month-year, numeric year-month-day and a standalone year.

    Args:
        cardinal: cardinal GraphFst; `graph_hundred_component_at_least_one_none_zero_digit`
            and `single_digits_graph` are used
        deterministic: when False `preserve_order` becomes optional and
            reordered variants of every (month, day) pair are added
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    month_graph = pynini.string_file(
        get_abs_path("data/months/names.tsv")).optimize()
    # also accept the entries with a different case on the first character
    month_graph |= (TO_LOWER + pynini.closure(NEMO_CHAR)) @ month_graph
    month_abbr_graph = pynini.string_file(
        get_abs_path("data/months/abbr.tsv")).optimize()
    month_abbr_graph = (month_abbr_graph |
                        (TO_LOWER + pynini.closure(NEMO_CHAR))
                        @ month_abbr_graph) + pynini.closure(
                            pynutil.delete("."), 0, 1)
    month_graph |= month_abbr_graph

    # to support all caps names
    names_all_caps = [[
        x[0].upper()
    ] for x in load_labels(get_abs_path("data/months/names.tsv"))]
    abbr_all_caps = [
        (x.upper(), y)
        for x, y in load_labels(get_abs_path("data/months/abbr.tsv"))
    ]
    month_graph |= pynini.string_map(names_all_caps) | (
        pynini.string_map(abbr_all_caps) +
        pynini.closure(pynutil.delete("."), 0, 1))

    month_numbers_graph = pynini.string_file(
        get_abs_path("data/months/numbers.tsv")).optimize()
    cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit
    year_graph = _get_year_graph(deterministic)

    YEAR_WEIGHT = 0.001
    year_graph_standalone = (pynutil.insert("year: \"") +
                             pynutil.add_weight(year_graph, YEAR_WEIGHT) +
                             pynutil.insert("\""))

    month_graph = pynutil.insert(
        "month: \"") + month_graph + pynutil.insert("\"")
    month_numbers_graph = pynutil.insert(
        "month: \"") + month_numbers_graph + pynutil.insert("\"")

    # one digit, or two digits starting with 1, 2 or 3
    day_graph = (pynutil.insert("day: \"") +
                 ((pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT)
                 @ cardinal_graph + pynutil.insert("\""))
    optional_day_graph = pynini.closure(delete_extra_space + day_graph, 0, 1)

    two_digit_year = NEMO_DIGIT**(2) @ (cardinal.single_digits_graph
                                        | cardinal_graph)
    two_digit_year = pynutil.insert(
        "year: \"") + two_digit_year + pynutil.insert("\"")

    year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert(
        "\"")
    optional_graph_year = pynini.closure(
        delete_extra_space + year_graph,
        0,
        1,
    )

    # month written out, optional day, optional "," and optional year
    graph_mdy = (month_graph + optional_day_graph + delete_space +
                 pynini.closure(pynutil.delete(","), 0, 1) +
                 optional_graph_year)

    delete_sep = pynutil.delete(pynini.union("-", "/", "."))
    # numeric month, day and year separated by "-", "/" or "."
    graph_mdy |= (month_numbers_graph + delete_sep + insert_space +
                  pynini.closure(pynutil.delete("0"), 0, 1) + day_graph +
                  delete_sep + insert_space + (year_graph | two_digit_year))

    graph_dmy = (day_graph + delete_extra_space + month_graph +
                 pynini.closure(pynutil.delete(","), 0, 1) +
                 optional_graph_year)

    graph_ymd = ((year_graph | two_digit_year) + delete_sep + insert_space +
                 month_numbers_graph + delete_sep + insert_space +
                 pynini.closure(pynutil.delete("0"), 0, 1) + day_graph)

    final_graph = graph_mdy | graph_dmy

    if deterministic:
        final_graph += pynutil.insert(" preserve_order: true")
    else:
        final_graph += pynini.closure(
            pynutil.insert(" preserve_order: true"), 0, 1)

    final_graph |= graph_ymd | year_graph_standalone

    if not deterministic:
        ymd_to_mdy_graph = None
        mdy_to_dmy_graph = None

        # the label files do not depend on the loop variables: read them once
        # instead of re-reading days.tsv for every month
        month_labels = [
            x[0] for x in load_labels(get_abs_path("data/months/names.tsv"))
        ]
        day_labels = [
            x[0] for x in load_labels(get_abs_path("data/months/days.tsv"))
        ]

        for month in month_labels:
            for day in day_labels:
                ymd_to_mdy_curr = (
                    pynutil.insert("month: \"" + month + "\" day: \"" + day +
                                   "\" ") + pynini.accep('year:') +
                    NEMO_SIGMA + pynutil.delete(" month: \"" + month +
                                                "\" day: \"" + day + "\""))

                # YY-MM-DD -> MM-DD-YY
                ymd_to_mdy_curr = pynini.compose(final_graph,
                                                 ymd_to_mdy_curr)
                ymd_to_mdy_graph = (ymd_to_mdy_curr
                                    if ymd_to_mdy_graph is None else
                                    pynini.union(ymd_to_mdy_curr,
                                                 ymd_to_mdy_graph))

                mdy_to_dmy_curr = (
                    pynutil.insert("day: \"" + day + "\" month: \"" + month +
                                   "\" ") +
                    pynutil.delete("month: \"" + month + "\" day: \"" + day +
                                   "\" ") + pynini.accep('year:') +
                    NEMO_SIGMA)
                # pynini.compose(ymd_to_mdy_curr, mdy_to_dmy_curr) to handle:
                # YY-MM-DD (input format) -> MM-DD-YY (intermediate ymd_to_mdy_curr representation) -> DD-MM-YY
                # '2000-01-05' -> 'day: "five" month: "january" year: "two thousand"'
                # pynini.compose(final_graph, mdy_to_dmy_curr) to handle:
                # MM-DD-YY (input format) -> DD-MM-YY
                mdy_to_dmy_curr = pynini.compose(
                    ymd_to_mdy_curr, mdy_to_dmy_curr) | pynini.compose(
                        final_graph, mdy_to_dmy_curr)
                mdy_to_dmy_graph = (mdy_to_dmy_curr
                                    if mdy_to_dmy_graph is None else
                                    pynini.union(mdy_to_dmy_curr,
                                                 mdy_to_dmy_graph))

        final_graph |= ymd_to_mdy_graph | mdy_to_dmy_graph

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def get_address_graph(self, cardinal):
    """
    Finite state transducer for classifying addresses.
    An address is a house number, an optional direction, street words ending with
    a known address word and, optionally, city, state and zip code, e.g.:
        2788 San Tomas Expy, Santa Clara, CA 95051 ->
            units: "address" cardinal
            { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
            preserve_order: true

    Args:
        cardinal: cardinal GraphFst
    """
    ordinal_verbalizer = OrdinalVerbalizer().graph
    ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
    tagged_ordinal = pynutil.insert(
        "integer: \"") + ordinal_tagger + pynutil.insert("\"")
    ordinal_num = pynini.compose(tagged_ordinal, ordinal_verbalizer)

    non_zero_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    # three and four digit numbers are split into a leading part and the last two digits
    house_number = NEMO_DIGIT**(1, 2) @ non_zero_component
    house_number += insert_space + NEMO_DIGIT**2 @ (
        pynini.closure(pynini.cross("0", "zero "), 0, 1) + non_zero_component)
    # to handle the rest of the numbers
    house_number = pynini.compose(NEMO_DIGIT**(3, 4), house_number)
    house_number = plurals._priority_union(house_number, cardinal.graph,
                                           NEMO_SIGMA)

    compass = (pynini.cross("E", "East") | pynini.cross("S", "South")
               | pynini.cross("W", "West")
               | pynini.cross("N", "North"))
    direction = compass + pynini.closure(pynutil.delete("."), 0, 1)
    direction = pynini.closure(pynini.accep(NEMO_SPACE) + direction, 0, 1)

    street_type = get_formats(get_abs_path("data/address/address_word.tsv"))
    first_street_word = (pynini.closure(ordinal_num, 0, 1)
                         | NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1))
    more_street_words = pynini.closure(NEMO_UPPER +
                                       pynini.closure(NEMO_ALPHA) +
                                       NEMO_SPACE)
    street = (pynini.accep(NEMO_SPACE) + first_street_word + NEMO_SPACE +
              more_street_words + street_type)

    city = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
    city = pynini.closure(
        pynini.accep(",") + pynini.accep(NEMO_SPACE) + city, 0, 1)

    # state labels plus a variant with "." after the first letter of the second column
    states = load_labels(get_abs_path("data/address/state.tsv"))
    states.extend([(name, f"{abbr[0]}.{abbr[1:]}") for name, abbr in states])
    state = pynini.invert(pynini.string_map(states))
    state = pynini.closure(
        pynini.accep(",") + pynini.accep(NEMO_SPACE) + state, 0, 1)

    zip_code = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
    zip_code = pynini.closure(
        pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) +
        zip_code,
        0,
        1,
    )

    address = house_number + direction + street + pynini.closure(
        city + state + zip_code, 0, 1)
    # street-only address with an optional final period removed
    address |= house_number + direction + street + pynini.closure(
        pynini.cross(".", ""), 0, 1)
    return address
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """
    Build the time classifier.

    Accepted layouts: "h:mm", "h:mm:ss", "h.mm <suffix>" and "h <suffix>",
    each with an optional time zone; the suffix is optional for the ":" layouts.

    Args:
        cardinal: cardinal GraphFst, its `graph` is used to read hours, minutes and seconds
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    suffix_labels = load_labels(get_abs_path("data/time/suffix.tsv"))
    suffix_labels.extend(augment_labels_with_punct_at_end(suffix_labels))
    suffix_graph = pynini.string_map(suffix_labels)

    time_zone_graph = pynini.string_file(get_abs_path("data/time/zone.tsv"))

    # only used for < 1000 thousand -> 0 weight
    cardinal = cardinal.graph

    hour_values = pynini.union(*[str(value) for value in range(0, 24)])
    single_digit_values = pynini.union(*[str(value) for value in range(1, 10)])
    double_digit_values = pynini.union(*[str(value) for value in range(10, 60)])

    # two digits are kept, a single digit may lose one leading zero
    delete_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
        pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT)

    graph_hour = delete_leading_zero_to_double_digit @ hour_values @ cardinal
    graph_single = single_digit_values @ cardinal
    graph_double = double_digit_values @ cardinal

    def _sixty_field(opening):
        # "0x" is read as "o x", 10..59 as a cardinal
        return (pynutil.insert(opening) +
                (pynini.cross("0", "o") + insert_space + graph_single
                 | graph_double) + pynutil.insert("\""))

    final_graph_hour = pynutil.insert(
        "hours: \"") + graph_hour + pynutil.insert("\"")
    final_graph_minute = _sixty_field("minutes: \"")
    final_graph_second = _sixty_field("seconds: \"")

    final_suffix = pynutil.insert("suffix: \"") + convert_space(
        suffix_graph) + pynutil.insert("\"")
    final_suffix_optional = pynini.closure(
        delete_space + insert_space + final_suffix, 0, 1)
    final_time_zone_optional = pynini.closure(
        delete_space + insert_space + pynutil.insert("zone: \"") +
        convert_space(time_zone_graph) + pynutil.insert("\""),
        0,
        1,
    )

    # 2:30 pm, 02:30, 2:00
    minutes_or_none = pynutil.delete("00") | insert_space + final_graph_minute
    graph_hm = (final_graph_hour + pynutil.delete(":") + minutes_or_none +
                final_suffix_optional + final_time_zone_optional)

    # 10:30:05 pm,
    minutes_or_zero = (pynini.cross("00", " minutes: \"zero\"")
                       | insert_space + final_graph_minute)
    seconds_or_zero = (pynini.cross("00", " seconds: \"zero\"")
                       | insert_space + final_graph_second)
    graph_hms = (final_graph_hour + pynutil.delete(":") + minutes_or_zero +
                 pynutil.delete(":") + seconds_or_zero +
                 final_suffix_optional + final_time_zone_optional)

    # 2.xx pm/am
    graph_hm2 = (final_graph_hour + pynutil.delete(".") +
                 (pynutil.delete("00") | insert_space + final_graph_minute) +
                 delete_space + insert_space + final_suffix +
                 final_time_zone_optional)

    # 2 pm est
    graph_h = (final_graph_hour + delete_space + insert_space + final_suffix +
               final_time_zone_optional)

    final_graph = (graph_hm | graph_h | graph_hm2 | graph_hms).optimize()

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True, lm: bool = False):
    """Build the classifier FST for roman numerals.

    Tagged inputs:
      * a name from ``get_names()``, a space, and one of the first 19 numerals
        listed in data/roman/roman_to_spoken.tsv -> ``key_the_ordinal``
      * a key word from data/roman/key_word.tsv, a space, and any listed
        numeral -> ``key_cardinal``
      * a bare numeral of at least two NEMO_ALPHA symbols among the first 50
        listed numerals -> ``default_cardinal``
      * one of the first 19 listed numerals followed by "th", where the whole
        input is at least three NEMO_ALPHA symbols -> ``default_ordinal``

    Args:
        deterministic: forwarded to the base class constructor
        lm: accepted for interface compatibility; the graph built here does
            not depend on it
    """
    super().__init__(name="roman", kind="classify", deterministic=deterministic)

    roman_dict = load_labels(get_abs_path("data/roman/roman_to_spoken.tsv"))
    default_graph = pynini.string_map(roman_dict).optimize()
    default_graph = pynutil.insert("integer: \"") + default_graph + pynutil.insert("\"")
    # Acceptor over the written form of the first 19 entries of the table.
    graph_teens = pynini.string_map([x[0] for x in roman_dict[:19]]).optimize()

    # up to five digit roman numerals with a preceding name are converted to ordinal form
    names = get_names()
    graph = (
        pynutil.insert("key_the_ordinal: \"")
        + names
        + pynutil.insert("\"")
        + pynini.accep(" ")
        + graph_teens @ default_graph
    ).optimize()

    # single symbol roman numerals with preceding key words are converted to cardinal form
    key_words = pynini.string_map(load_labels(get_abs_path("data/roman/key_word.tsv"))).optimize()
    graph |= (
        pynutil.insert("key_cardinal: \"")
        + key_words
        + pynutil.insert("\"")
        + pynini.accep(" ")
        + default_graph
    ).optimize()

    # Bare numerals of two or more letters, restricted to the first 50 table entries.
    # This used to be assigned only under `if deterministic:` / `elif not lm:` with
    # identical bodies, which left the name unbound for deterministic=False, lm=True
    # and made the union below raise. It is now built unconditionally.
    roman_to_cardinal = pynini.compose(
        pynini.closure(NEMO_ALPHA, 2),
        (
            pynutil.insert("default_cardinal: \"default\" ")
            + (pynini.string_map([x[0] for x in roman_dict[:50]]).optimize()) @ default_graph
        ),
    )

    # convert three digit roman or up with suffix to ordinal
    # (the length constraint applies to the whole input, "th" included)
    roman_to_ordinal = pynini.compose(
        pynini.closure(NEMO_ALPHA, 3),
        (pynutil.insert("default_ordinal: \"default\" ") + graph_teens @ default_graph + pynutil.delete("th")),
    )

    graph |= roman_to_cardinal | roman_to_ordinal

    # # add a higher weight when roman number consists of a single symbol
    # graph = pynini.compose(pynini.closure(NEMO_CHAR, 2), graph) | pynutil.add_weight(
    #     pynini.compose(NEMO_CHAR, graph), 101
    # )
    # graph = graph.optimize() + pynini.closure(pynutil.delete("."), 0, 1)
    # graph = pynutil.insert("integer: \"") + graph + pynutil.insert("\"")

    graph = self.add_tokens(graph)
    self.fst = graph.optimize()
def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = True, lm: bool = False):
    """Build the classifier FST for serial strings (mixed digits, letters, symbols).

    The resulting acceptor/transducer is stored un-tagged in ``self.graph`` and,
    wrapped as ``name: "<...>"``, in ``self.fst``.

    Args:
        cardinal: cardinal tagger; ``single_digits_graph``, ``graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit`` are read from it
        ordinal: ordinal tagger; every input accepted by ``ordinal.graph`` is
            excluded from the delimiter-based serial patterns
        deterministic: forwarded to the base class constructor
        lm: when True, the extra non-deterministic digit readings are not added
    """
    super().__init__(name="integer", kind="classify", deterministic=deterministic)
    """ Finite state transducer for classifying serial (handles only cases without delimiters, values with delimiters are handled by default). The serial is a combination of digits, letters and dashes, e.g.: c325b -> tokens { cardinal { integer: "c three two five b" } } """
    # Runs of six or more digits are read through the single-digit graph.
    num_graph = pynini.compose(NEMO_DIGIT**(6, ...), cardinal.single_digits_graph).optimize()
    # Runs of one to five digits are read through the cardinal graph.
    num_graph |= pynini.compose(NEMO_DIGIT**(1, 5), cardinal.graph).optimize()
    # to handle numbers starting with zero
    num_graph |= pynini.compose(
        pynini.accep("0") + pynini.closure(NEMO_DIGIT), cardinal.single_digits_graph).optimize()
    # TODO: "#" doesn't work from the file
    symbols_graph = pynini.string_file(
        get_abs_path("data/whitelist/symbol.tsv")).optimize() | pynini.cross("#", "hash")
    num_graph |= symbols_graph
    # NOTE(review): indentation was lost in this copy of the file; both statements
    # below are assumed to belong to the `if` body -- confirm against the original.
    if not self.deterministic and not lm:
        num_graph |= cardinal.single_digits_graph
        # also allow double digits to be pronounced as integer in serial number
        num_graph |= pynutil.add_weight(
            NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001)
    # add space between letter and digit/symbol
    symbols = [
        x[0] for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))
    ]
    symbols = pynini.union(*symbols)
    digit_symbol = NEMO_DIGIT | symbols
    # Two context-dependent rewrites applied in sequence: a space is inserted between
    # (letter or symbol) and (digit or symbol), then between (digit or symbol) and
    # (letter or symbol).
    graph_with_space = pynini.compose(
        pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA | symbols, digit_symbol, NEMO_SIGMA),
        pynini.cdrewrite(pynutil.insert(" "), digit_symbol, NEMO_ALPHA | symbols, NEMO_SIGMA),
    )
    # serial graph with delimiter
    delimiter = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
    alphas = pynini.closure(NEMO_ALPHA, 1)
    letter_num = alphas + delimiter + num_graph
    num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
    next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
    # Alternative continuation: delimiter, number, then letters, with a space kept
    # if present and inserted otherwise (priority union of accept-space / insert-space).
    next_alpha_or_num |= pynini.closure(
        delimiter + num_graph +
        plurals._priority_union(pynini.accep(" "), pynutil.insert(" "), NEMO_SIGMA).optimize() + alphas)
    serial_graph = letter_num + next_alpha_or_num
    serial_graph |= num_letter + next_alpha_or_num
    # numbers only with 2+ delimiters
    serial_graph |= (num_graph + delimiter + num_graph + delimiter + num_graph +
                     pynini.closure(delimiter + num_graph))
    # 2+ symbols
    serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA, num_graph + delimiter + num_graph)
    # exclude ordinal numbers from serial options
    serial_graph = pynini.compose(
        pynini.difference(NEMO_SIGMA, pynini.project(ordinal.graph, "input")), serial_graph).optimize()
    serial_graph = pynutil.add_weight(serial_graph, 0.0001)
    # Any non-space run followed by "^2" / "^3" is read as "... squared" / "... cubed".
    # This branch is added after the ordinal exclusion and the weight above.
    serial_graph |= (pynini.closure(NEMO_NOT_SPACE, 1) +
                     (pynini.cross("^2", " squared") | pynini.cross("^3", " cubed")).optimize())
    # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
    serial_graph = (
        pynini.closure((serial_graph | num_graph | alphas) + delimiter) + serial_graph +
        pynini.closure(delimiter + (serial_graph | num_graph | alphas)))
    # Also accept inputs without delimiters by first inserting spaces at
    # letter/digit/symbol boundaries and then applying the graph built so far.
    serial_graph |= pynini.compose(graph_with_space, serial_graph.optimize()).optimize()
    # Restrict to inputs of at least two non-space symbols.
    serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2), serial_graph).optimize()
    self.graph = serial_graph.optimize()
    graph = pynutil.insert("name: \"") + convert_space(self.graph).optimize() + pynutil.insert("\"")
    self.fst = graph.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """Build the classifier FST for dates.

    Accepted forms:
      * month name or abbreviation, optional day, optional ",", optional year
      * numeric month, day and year separated by "-", "/" or "."
      * day followed by a month name and an optional year
      * year, numeric month and day separated by "-", "/" or "."
      * a standalone year (with a small extra weight)

    The first three forms are tagged with ``preserve_order: true``.

    Args:
        cardinal: cardinal tagger; its
            ``graph_hundred_component_at_least_one_none_zero_digit`` reads the day
        deterministic: forwarded to the base class and to ``_get_year_graph``
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    def field(key, value):
        # Emits `key: "<value>"` on the output side around `value`.
        return pynutil.insert(key + ": \"") + value + pynutil.insert("\"")

    names_path = get_abs_path("data/months/names.tsv")
    abbr_path = get_abs_path("data/months/abbr.tsv")

    trailing_period = pynini.closure(pynutil.delete("."), 0, 1)
    leading_zero = pynini.closure(pynutil.delete("0"), 0, 1)
    # TO_LOWER on the first symbol, anything after it unchanged.
    lower_first = TO_LOWER + pynini.closure(NEMO_CHAR)

    # Month names as listed in the file, plus the variant with TO_LOWER applied to the first symbol.
    months = pynini.string_file(names_path).optimize()
    months |= lower_first @ months

    # Abbreviations get the same treatment and may be followed by a period.
    abbreviations = pynini.string_file(abbr_path).optimize()
    abbreviations = (abbreviations | lower_first @ abbreviations) + trailing_period
    months |= abbreviations

    # to support all caps names
    upper_names = [[row[0].upper()] for row in load_labels(names_path)]
    upper_abbr = [(short.upper(), full) for short, full in load_labels(abbr_path)]
    months |= pynini.string_map(upper_names) | (pynini.string_map(upper_abbr) + trailing_period)

    numeric_months = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()
    day_reader = cardinal.graph_hundred_component_at_least_one_none_zero_digit
    plain_year = _get_year_graph(deterministic)

    # A bare year is accepted on its own, at a slightly higher cost.
    standalone_year = field("year", pynutil.add_weight(plain_year, 0.001))

    month_field = field("month", months)
    numeric_month_field = field("month", numeric_months)
    # One digit, or two digits starting with 1, 2 or 3.
    day_digits = (pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT
    day_field = field("day", day_digits @ day_reader)
    year_field = field("year", plain_year)

    maybe_day = pynini.closure(delete_extra_space + day_field, 0, 1)
    maybe_year = pynini.closure(delete_extra_space + year_field, 0, 1)
    maybe_comma = pynini.closure(pynutil.delete(","), 0, 1)
    separator = pynutil.delete(pynini.union("-", "/", "."))

    month_first = month_field + maybe_day + delete_space + maybe_comma + maybe_year
    month_first |= (
        numeric_month_field
        + separator
        + insert_space
        + leading_zero
        + day_field
        + separator
        + insert_space
        + year_field
    )

    day_first = day_field + delete_extra_space + month_field + maybe_year

    year_first = (
        year_field
        + separator
        + insert_space
        + numeric_month_field
        + separator
        + insert_space
        + leading_zero
        + day_field
    )

    tagged = (month_first | day_first) + pynutil.insert(" preserve_order: true")
    tagged |= year_first | standalone_year
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False):
    """Build the classifier FST for dates.

    Accepted forms include month-day-year (month by name, abbreviation or number),
    day-month-year, year-month-day and a standalone year. When ``deterministic``
    is False or ``lm`` is True, a numeric month/day form without a year is added,
    ``preserve_order: true`` becomes optional, and re-ordered variants of the
    tagged output (month/day swapped, year moved last) are added as alternatives.

    Args:
        cardinal: cardinal tagger; its
            ``graph_hundred_component_at_least_one_none_zero_digit`` and
            ``single_digits_graph`` are used for days and years
        deterministic: forwarded to the base class and to ``_get_year_graph``
        lm: enables the same extra alternatives as ``deterministic=False``
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    # january
    month_graph = pynini.string_file(get_abs_path("data/date/month_name.tsv")).optimize()
    # January, JANUARY
    month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
        TO_LOWER ** (2, ...), month_graph
    )

    # jan
    month_abbr_graph = pynini.string_file(get_abs_path("data/date/month_abbr.tsv")).optimize()
    # jan, Jan, JAN
    month_abbr_graph = (
        month_abbr_graph
        | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
        | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
    ) + pynini.closure(pynutil.delete("."), 0, 1)
    month_graph |= month_abbr_graph.optimize()

    month_numbers_labels = pynini.string_file(get_abs_path("data/date/month_number.tsv")).optimize()
    cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    year_graph = _get_year_graph(cardinal_graph=cardinal_graph, deterministic=deterministic)

    # three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph
    # year_graph |= three_digit_year

    month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
    month_numbers_graph = pynutil.insert("month: \"") + month_numbers_labels + pynutil.insert("\"")

    # Ordinal endings after the day number ("1st", "2ND", ...) are deleted.
    endings = ["rd", "th", "st", "nd"]
    endings += [x.upper() for x in endings]
    endings = pynini.union(*endings)

    # Day: optional leading "the ", then 0-9, 10-29, 30 or 31, then an optional ending.
    day_graph = (
        pynutil.insert("day: \"")
        + pynini.closure(pynutil.delete("the "), 0, 1)
        + (
            ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
            + pynini.closure(pynutil.delete(endings), 0, 1)
        )
        @ cardinal_graph
        + pynutil.insert("\"")
    )

    two_digit_year = _get_two_digit_year(
        cardinal_graph=cardinal_graph, single_digits_graph=cardinal.single_digits_graph
    )
    two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")

    # if lm:
    #     two_digit_year = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (3), two_digit_year)
    #     year_graph = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (2), year_graph)
    #     year_graph |= pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (4, ...), year_graph)

    # Year following a month/day: either " <year>" or ",[ ]<year>" (the comma and
    # optional space are kept on the output side in the second variant).
    graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
    graph_year |= (
        pynutil.insert(" year: \"")
        + pynini.accep(",")
        + pynini.closure(pynini.accep(" "), 0, 1)
        + year_graph
        + pynutil.insert("\"")
    )
    optional_graph_year = pynini.closure(graph_year, 0, 1)

    year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

    graph_mdy = month_graph + (
        (delete_extra_space + day_graph)
        | (pynini.accep(" ") + day_graph)
        | graph_year
        | (delete_extra_space + day_graph + graph_year)
    )

    # month-day[-year] with "-" between the parts; each "-" becomes a space
    graph_mdy |= (
        month_graph
        + pynini.cross("-", " ")
        + day_graph
        + pynini.closure(((pynini.cross("-", " ") + NEMO_SIGMA) @ graph_year), 0, 1)
    )

    # Numeric month<sep>day<sep>year; a separate branch per separator, so both
    # separators within one date must be the same character.
    for x in ["-", "/", "."]:
        delete_sep = pynutil.delete(x)
        graph_mdy |= (
            month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
            + delete_sep
            + insert_space
            + (year_graph | two_digit_year)
        )

    graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
    # Two-digit days that are not also valid month numbers (unambiguous day-first input).
    day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph
    for x in ["-", "/", "."]:
        delete_sep = pynutil.delete(x)
        graph_dmy |= (
            day_ex_month
            + delete_sep
            + insert_space
            + month_numbers_graph
            + delete_sep
            + insert_space
            + (year_graph | two_digit_year)
        )

    graph_ymd = pynini.accep("")
    for x in ["-", "/", "."]:
        delete_sep = pynutil.delete(x)
        graph_ymd |= (
            (year_graph | two_digit_year)
            + delete_sep
            + insert_space
            + month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
        )

    final_graph = graph_mdy | graph_dmy

    if not deterministic or lm:
        final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)
        # numeric month<sep>day without a year
        m_sep_d = (
            month_numbers_graph
            + pynutil.delete(pynini.union("-", "/"))
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
        )
        final_graph |= m_sep_d
    else:
        final_graph += pynutil.insert(" preserve_order: true")

    final_graph |= graph_ymd | year_graph

    if not deterministic or lm:
        ymd_to_mdy_graph = None
        ymd_to_dmy_graph = None
        mdy_to_dmy_graph = None
        md_to_dm_graph = None

        # Both label lists are loop-invariant, so each file is read exactly once
        # here instead of re-reading the day file for every month.
        months = [x[0] for x in load_labels(get_abs_path("data/date/month_name.tsv"))]
        days = [x[0] for x in load_labels(get_abs_path("data/date/day.tsv"))]

        for month in months:
            for day in days:
                # Tagged fragments for this (month, day) pair in both orders.
                month_day = "month: \"" + month + "\" day: \"" + day + "\""
                day_month = "day: \"" + day + "\" month: \"" + month + "\""

                ymd_to_mdy_curr = (
                    pynutil.insert(month_day + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                    + pynutil.delete(" " + month_day)
                )

                # YY-MM-DD -> MM-DD-YY
                ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                ymd_to_mdy_graph = (
                    ymd_to_mdy_curr
                    if ymd_to_mdy_graph is None
                    else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                )

                ymd_to_dmy_curr = (
                    pynutil.insert(day_month + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                    + pynutil.delete(" " + month_day)
                )

                # YY-MM-DD -> DD-MM-YY
                ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                ymd_to_dmy_graph = (
                    ymd_to_dmy_curr
                    if ymd_to_dmy_graph is None
                    else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                )

                mdy_to_dmy_curr = (
                    pynutil.insert(day_month + " ")
                    + pynutil.delete(month_day + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                ).optimize()
                # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                mdy_to_dmy_graph = (
                    mdy_to_dmy_curr
                    if mdy_to_dmy_graph is None
                    else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                ).optimize()

                # MM-DD -> DD-MM (no year)
                md_to_dm_curr = pynutil.insert(day_month) + pynutil.delete(month_day)
                md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()
                md_to_dm_graph = (
                    md_to_dm_curr
                    if md_to_dm_graph is None
                    else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                ).optimize()

        final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """Build the classifier FST for dates.

    Accepted forms include month-day-year (month by name, abbreviation or number),
    day-month-year, year-month-day and a standalone year (with a small extra
    weight). When ``deterministic`` is False, a numeric month/day form without a
    year is added, ``preserve_order: true`` becomes optional, and re-ordered
    variants of the tagged output (month/day swapped, year moved last) are added
    as alternatives.

    Args:
        cardinal: cardinal tagger; its
            ``graph_hundred_component_at_least_one_none_zero_digit`` and
            ``single_digits_graph`` are used for days and two-digit years
        deterministic: forwarded to the base class and to ``_get_year_graph``
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    # Month names as listed, with TO_LOWER applied to the first symbol only,
    # or with TO_LOWER applied to every symbol (two or more symbols).
    month_graph = pynini.string_file(get_abs_path("data/months/names.tsv")).optimize()
    month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
        TO_LOWER ** (2, ...), month_graph
    )

    # Abbreviations in the same three casings, optionally followed by a period.
    month_abbr_graph = pynini.string_file(get_abs_path("data/months/abbr.tsv")).optimize()
    month_abbr_graph = (
        month_abbr_graph
        | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
        | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
    ) + pynini.closure(pynutil.delete("."), 0, 1)
    month_graph |= month_abbr_graph.optimize()

    month_numbers_graph = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()
    cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    year_graph = _get_year_graph(deterministic)

    # A bare year is accepted on its own, at a slightly higher cost.
    YEAR_WEIGHT = 0.001
    year_graph_standalone = (
        pynutil.insert("year: \"") + pynutil.add_weight(year_graph, YEAR_WEIGHT) + pynutil.insert("\"")
    )

    month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
    month_numbers_graph = pynutil.insert("month: \"") + month_numbers_graph + pynutil.insert("\"")

    # Ordinal endings after the day number ("1st", "2ND", ...) are deleted.
    endings = ["rd", "th", "st", "nd"]
    endings += [x.upper() for x in endings]
    endings = pynini.union(*endings)

    # Day: optional leading "the ", then 0-9, 10-29, 30 or 31, then an optional ending.
    day_graph = (
        pynutil.insert("day: \"")
        + pynini.closure(pynutil.delete("the "), 0, 1)
        + (
            ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
            + pynini.closure(pynutil.delete(endings), 0, 1)
        )
        @ cardinal_graph
        + pynutil.insert("\"")
    )

    # Two-digit years may be read digit by digit or as a number.
    two_digit_year = NEMO_DIGIT ** (2) @ (cardinal.single_digits_graph | cardinal_graph)
    two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")

    # Year following a month/day: the separating space is consumed and re-inserted
    # in front of the tag.
    graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
    optional_graph_year = pynini.closure(graph_year, 0, 1)

    year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

    graph_mdy = month_graph + (
        (delete_extra_space + day_graph)
        | (pynini.accep(" ") + day_graph)
        | graph_year
        | (delete_extra_space + day_graph + graph_year)
    )

    # A single deleter accepting any of the three separators, so mixed
    # separators within one date are accepted.
    delete_sep = pynutil.delete(pynini.union("-", "/", "."))
    graph_mdy |= (
        month_numbers_graph
        + delete_sep
        + insert_space
        + pynini.closure(pynutil.delete("0"), 0, 1)
        + day_graph
        + delete_sep
        + insert_space
        + (year_graph | two_digit_year)
    )

    graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year

    graph_ymd = (
        (year_graph | two_digit_year)
        + delete_sep
        + insert_space
        + month_numbers_graph
        + delete_sep
        + insert_space
        + pynini.closure(pynutil.delete("0"), 0, 1)
        + day_graph
    )

    final_graph = graph_mdy | graph_dmy

    if deterministic:
        final_graph += pynutil.insert(" preserve_order: true")
    else:
        final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)
        # numeric month<sep>day without a year
        m_sep_d = (
            month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
        )
        final_graph |= m_sep_d

    final_graph |= graph_ymd | year_graph_standalone

    if not deterministic:
        ymd_to_mdy_graph = None
        ymd_to_dmy_graph = None
        mdy_to_dmy_graph = None
        md_to_dm_graph = None

        # Both label lists are loop-invariant, so each file is read exactly once
        # here instead of re-reading the day file for every month.
        months = [x[0] for x in load_labels(get_abs_path("data/months/names.tsv"))]
        days = [x[0] for x in load_labels(get_abs_path("data/months/days.tsv"))]

        for month in months:
            for day in days:
                # Tagged fragments for this (month, day) pair in both orders.
                month_day = "month: \"" + month + "\" day: \"" + day + "\""
                day_month = "day: \"" + day + "\" month: \"" + month + "\""

                ymd_to_mdy_curr = (
                    pynutil.insert(month_day + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                    + pynutil.delete(" " + month_day)
                )

                # YY-MM-DD -> MM-DD-YY
                ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                ymd_to_mdy_graph = (
                    ymd_to_mdy_curr
                    if ymd_to_mdy_graph is None
                    else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                )

                ymd_to_dmy_curr = (
                    pynutil.insert(day_month + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                    + pynutil.delete(" " + month_day)
                )

                # YY-MM-DD -> DD-MM-YY
                ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                ymd_to_dmy_graph = (
                    ymd_to_dmy_curr
                    if ymd_to_dmy_graph is None
                    else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                )

                mdy_to_dmy_curr = (
                    pynutil.insert(day_month + " ")
                    + pynutil.delete(month_day + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                ).optimize()
                # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                mdy_to_dmy_graph = (
                    mdy_to_dmy_curr
                    if mdy_to_dmy_graph is None
                    else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                ).optimize()

                # MM-DD -> DD-MM (no year)
                md_to_dm_curr = pynutil.insert(day_month) + pynutil.delete(month_day)
                md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()
                md_to_dm_graph = (
                    md_to_dm_curr
                    if md_to_dm_graph is None
                    else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                ).optimize()

        final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()