def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Build the "measure" classifier from a spoken quantity followed by a unit.

    A quantity is either a decimal or a cardinal, optionally preceded by the
    word "minus". The cardinal "one" is paired with singular unit names; every
    other quantity is paired with plural unit names.

    Args:
        cardinal: tagger whose ``graph_no_exception`` reads the integer part.
        decimal: tagger whose ``final_graph_wo_negative`` reads decimals.

    The finished grammar is stored in ``self.fst``.
    """
    super().__init__(name="measure", kind="classify")

    integer_reader = cardinal.graph_no_exception

    unit_table = pynini.string_file(get_abs_path("data/measurements.tsv"))
    singular_to_abbr = pynini.invert(unit_table)  # singular -> abbr
    plural_to_abbr = get_singulars(singular_to_abbr)  # plural -> abbr

    # Optional leading "minus" becomes the field `negative: "true"`.
    minus_field = (
        pynutil.insert("negative: ")
        + pynini.cross("minus", "\"true\"")
        + delete_extra_space
    )
    optional_negative = pynini.closure(minus_field, 0, 1)

    singular_names = convert_space(singular_to_abbr)
    plural_names = convert_space(plural_to_abbr)

    # "per <unit>" is rewritten as "/<abbr>".
    per_unit = (
        pynutil.insert("/")
        + pynutil.delete("per")
        + delete_space
        + convert_space(singular_to_abbr)
    )

    def _units_field(unit_names):
        # A unit alone, a bare "per <unit>", or "<unit> per <unit>"; the
        # compound form carries a small extra weight of 0.01.
        compound = pynutil.add_weight(unit_names + delete_space + per_unit, 0.01)
        alternatives = unit_names | per_unit | compound
        return pynutil.insert("units: \"") + alternatives + pynutil.insert("\"")

    units_singular = _units_field(singular_names)
    units_plural = _units_field(plural_names)

    def _cardinal_measure(integer_graph, units_graph):
        # cardinal { [negative] integer: "<n>" } units: "<u>"
        return (
            pynutil.insert("cardinal { ")
            + optional_negative
            + pynutil.insert("integer: \"")
            + integer_graph
            + pynutil.insert("\"")
            + pynutil.insert(" }")
            + delete_extra_space
            + units_graph
        )

    decimal_measure = (
        pynutil.insert("decimal { ")
        + optional_negative
        + decimal.final_graph_wo_negative
        + pynutil.insert(" }")
        + delete_extra_space
        + units_plural
    )

    # Everything except "one" takes a plural unit; "one" takes a singular unit.
    cardinal_measure = _cardinal_measure((NEMO_SIGMA - "one") @ integer_reader, units_plural)
    cardinal_measure |= _cardinal_measure(pynini.cross("one", "1"), units_singular)

    tagged = self.add_tokens(decimal_measure | cardinal_measure)
    self.fst = tagged.optimize()
def __init__(self):
    """Build the "whitelist" classifier from ``data/whitelist.tsv``.

    The label file is compiled to a string-mapping FST and inverted, passed
    through ``convert_space``, and wrapped in a ``name: "..."`` field. The
    optimized result is stored in ``self.fst``.
    """
    super().__init__(name="whitelist", kind="classify")

    table = pynini.string_file(get_abs_path("data/whitelist.tsv"))
    inverted = table.invert()

    name_open = pynutil.insert("name: \"")
    name_close = pynutil.insert("\"")
    tagged = name_open + convert_space(inverted) + name_close

    self.fst = tagged.optimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """Build the "whitelist" classifier from two-column label files.

    Args:
        input_case: when equal to ``"lower_cased"``, the first column of each
            whitelist row is lower-cased before the mapping is compiled; any
            other value leaves the rows as loaded.
        deterministic: when False, the rows of
            ``data/whitelist_alternatives.tsv`` are added too, both as loaded
            and with both columns lower-cased.
        input_file: optional path to an extra whitelist. In deterministic mode
            it replaces the mapping built from ``data/whitelist.tsv``;
            otherwise it is unioned with what has been built so far.

    Sets ``self.graph`` (the mapping after ``convert_space``) and ``self.fst``
    (that mapping wrapped in a ``name: "..."`` field).
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    lower_keys = input_case == "lower_cased"

    def _compile(path):
        # Unpacking each row into exactly two columns mirrors the loader contract.
        pairs = [
            (key.lower() if lower_keys else key, value)
            for key, value in load_labels(path)
        ]
        return pynini.string_map(pairs)

    mapping = _compile(get_abs_path("data/whitelist.tsv"))

    if not deterministic:
        alternatives = [
            (key, value)
            for key, value in load_labels(get_abs_path("data/whitelist_alternatives.tsv"))
        ]
        lowered = [(key.lower(), value.lower()) for key, value in alternatives]
        mapping |= pynini.string_map(lowered + alternatives)

    if input_file:
        user_mapping = _compile(input_file)
        # Deterministic: the user file wins outright. Otherwise: keep both.
        mapping = user_mapping if deterministic else (mapping | user_mapping)

    self.graph = convert_space(mapping).optimize()

    tagged = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Build the "money" classifier from spoken amounts with a currency name.

    Emitted fields: ``integer_part``, ``fractional_part``, ``currency``.
    Spoken shapes covered, per the inline examples: "twelve dollars (and)
    fifty cents", "twelve dollars fifty", a decimal followed by a plural
    currency name, and a cents-only amount (tagged with currency "$" and
    integer part "0").

    Args:
        cardinal: tagger whose ``graph_no_exception`` reads integers.
        decimal: tagger whose ``final_graph_wo_negative`` reads decimals.

    The finished grammar is stored in ``self.fst``.
    """
    super().__init__(name="money", kind="classify")

    number_reader = cardinal.graph_no_exception
    decimal_reader = decimal.final_graph_wo_negative

    def _currency_field(names):
        return pynutil.insert("currency: \"") + convert_space(names) + pynutil.insert("\"")

    currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
    singular_names = pynini.invert(currency_table)
    plural_names = get_singulars(singular_names)
    currency_singular = _currency_field(singular_names)
    currency_plural = _currency_field(plural_names)

    # Two digits pass through unchanged; a single digit gets a "0" in front.
    pad_to_two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

    # twelve dollars (and) fifty cents, zero cents
    many_cents = (
        pynutil.add_weight((NEMO_SIGMA - "one") @ number_reader, -0.7) @ pad_to_two_digits
        + delete_space
        + pynutil.delete("cents")
    )
    one_cent = pynini.cross("one", "01") + delete_space + pynutil.delete("cent")
    cents_standalone = (
        pynutil.insert("fractional_part: \"")
        + pynini.union(many_cents, one_cent)
        + pynutil.insert("\"")
    )

    optional_and = pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
    optional_cents_standalone = pynini.closure(
        delete_space + optional_and + insert_space + cents_standalone, 0, 1
    )

    # twelve dollars fifty, only after integer
    optional_cents_suffix = pynini.closure(
        delete_extra_space
        + pynutil.insert("fractional_part: \"")
        + pynutil.add_weight(number_reader @ pad_to_two_digits, -0.7)
        + pynutil.insert("\""),
        0,
        1,
    )

    optional_cents = optional_cents_standalone | optional_cents_suffix

    def _integer_amount(integer_graph, currency_graph):
        return (
            pynutil.insert("integer_part: \"")
            + integer_graph
            + pynutil.insert("\"")
            + delete_extra_space
            + currency_graph
            + optional_cents
        )

    # "one" pairs with the singular currency name; anything else with the plural.
    integer_amount = _integer_amount((NEMO_SIGMA - "one") @ number_reader, currency_plural)
    integer_amount |= _integer_amount(pynini.cross("one", "1"), currency_singular)

    decimal_amount = decimal_reader + delete_extra_space + currency_plural
    # Cents on their own are tagged as "$" with integer part "0".
    decimal_amount |= pynutil.insert("currency: \"$\" integer_part: \"0\" ") + cents_standalone

    tagged = self.add_tokens(integer_amount | decimal_amount)
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """Build the "time" classifier for digit-written clock times.

    Written shapes covered, per the inline examples: ``2:30 pm``, ``02:30``,
    ``2:00``, ``10:30:05 pm``, ``2.xx pm/am`` and ``2 pm est``.
    Emitted fields: ``hours``, ``minutes``, ``seconds``, ``suffix``, ``zone``.

    Args:
        cardinal: tagger whose ``graph`` verbalizes the numbers.
        deterministic: forwarded unchanged to the base-class constructor.

    The finished grammar is stored in ``self.fst``.
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    def _quoted(opening, body):
        return pynutil.insert(opening) + body + pynutil.insert("\"")

    suffixes = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
    zones = pynini.string_file(get_abs_path("data/time_zone.tsv"))

    # only used for < 1000 thousand -> 0 weight
    number_reader = cardinal.graph

    # Two digits pass through unchanged; one leading "0" before a single digit is dropped.
    strip_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (
        pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT
    )

    hour_reading = strip_leading_zero @ pynini.union(*map(str, range(24))) @ number_reader
    single_digit_reading = pynini.union(*map(str, range(1, 10))) @ number_reader
    double_digit_reading = pynini.union(*map(str, range(10, 60))) @ number_reader

    # 01-09 read as "o <digit>"; 10-59 read as plain numbers. Minutes and
    # seconds share this reading.
    sixty_reading = (
        pynini.cross("0", "o") + insert_space + single_digit_reading
    ) | double_digit_reading

    hours = _quoted("hours: \"", hour_reading)
    minutes = _quoted("minutes: \"", sixty_reading)
    seconds = _quoted("seconds: \"", sixty_reading)

    suffix = _quoted("suffix: \"", convert_space(suffixes))
    optional_suffix = pynini.closure(delete_space + insert_space + suffix, 0, 1)
    optional_zone = pynini.closure(
        delete_space
        + insert_space
        + pynutil.insert("zone: \"")
        + convert_space(zones)
        + pynutil.insert("\""),
        0,
        1,
    )

    # A "00" minute part is dropped when no seconds follow.
    minutes_or_nothing = pynutil.delete("00") | insert_space + minutes

    # 2:30 pm, 02:30, 2:00
    hour_minute = (
        hours + pynutil.delete(":") + minutes_or_nothing + optional_suffix + optional_zone
    )

    # 10:30:05 pm,
    hour_minute_second = (
        hours
        + pynutil.delete(":")
        + (pynini.cross("00", " minutes: \"zero\"") | insert_space + minutes)
        + pynutil.delete(":")
        + (pynini.cross("00", " seconds: \"zero\"") | insert_space + seconds)
        + optional_suffix
        + optional_zone
    )

    # 2.xx pm/am -- here the suffix is required
    hour_dot_minute = (
        hours
        + pynutil.delete(".")
        + minutes_or_nothing
        + delete_space
        + insert_space
        + suffix
        + optional_zone
    )

    # 2 pm est
    hour_only = hours + delete_space + insert_space + suffix + optional_zone

    combined = (hour_minute | hour_only | hour_dot_minute | hour_minute_second).optimize()
    tagged = self.add_tokens(combined)
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
    """Build the "measure" classifier for written quantities with units.

    Alternatives unioned into the final grammar:
      * decimal / cardinal / fraction followed by a unit ("1" takes the
        singular unit reading, everything else the plural one);
      * ``<number>-<letters>`` and ``<letters>-<number>`` for cardinals and
        decimals;
      * a decimal directly followed by ``x`` or ``X``;
      * whatever ``self.get_address_graph`` returns, tagged with units
        "address";
      * ``a <op> b = c`` expressions, tagged with units "math".

    Args:
        cardinal: tagger providing ``graph`` (and ``range_graph`` when
            ``deterministic`` is False); also handed to ``get_address_graph``.
        decimal: tagger providing ``final_graph_wo_negative``.
        fraction: tagger providing ``graph``.
        deterministic: forwarded to the base class; when False the cardinal
            graph is extended with ``cardinal.range_graph``.

    The finished grammar is stored in ``self.fst``.
    """
    super().__init__(name="measure", kind="classify", deterministic=deterministic)

    number_graph = cardinal.graph
    if not deterministic:
        number_graph |= cardinal.range_graph

    decimal_graph = decimal.final_graph_wo_negative
    letters = pynini.closure(NEMO_ALPHA, 1)
    nbsp = pynutil.insert(NEMO_NON_BREAKING_SPACE)

    unit_table = pynini.string_file(get_abs_path("data/measurements.tsv"))
    # Also accept inputs that reach a table entry through TO_LOWER on their
    # leading characters.
    case_folding = pynini.closure(TO_LOWER, 1) + pynini.closure(NEMO_ALPHA)
    unit_table |= pynini.compose(case_folding, unit_table)
    units_plural = convert_space(unit_table @ SINGULAR_TO_PLURAL)
    units_singular = convert_space(unit_table)

    optional_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    # "/<unit>" is read as "per <unit>".
    per_unit = pynini.cross("/", "per") + delete_space + nbsp + units_singular
    optional_per_unit = pynini.closure(delete_space + nbsp + per_unit, 0, 1)

    def _units_field(unit_names):
        body = unit_names + optional_per_unit | per_unit
        return pynutil.insert("units: \"") + body + pynutil.insert("\"")

    field_plural = _units_field(units_plural)
    field_singular = _units_field(units_singular)

    def _cardinal_with_unit(integer_graph, units_field):
        return (
            pynutil.insert("cardinal { ")
            + optional_negative
            + pynutil.insert("integer: \"")
            + integer_graph
            + delete_space
            + pynutil.insert("\"")
            + pynutil.insert(" } ")
            + units_field
        )

    decimal_with_unit = (
        pynutil.insert("decimal { ")
        + optional_negative
        + decimal_graph
        + delete_space
        + pynutil.insert(" } ")
        + field_plural
    )

    cardinal_with_unit = _cardinal_with_unit((NEMO_SIGMA - "1") @ number_graph, field_plural)
    cardinal_with_unit |= _cardinal_with_unit(pynini.cross("1", "one"), field_singular)

    # <cardinal>-<letters>: the dash stays inside the integer field.
    cardinal_dash_alpha = (
        pynutil.insert("cardinal { integer: \"")
        + number_graph
        + pynini.accep('-')
        + pynutil.insert("\" } units: \"")
        + letters
        + pynutil.insert("\"")
    )

    # <letters>-<cardinal>: the dash stays inside the units field.
    alpha_dash_cardinal = (
        pynutil.insert("units: \"")
        + letters
        + pynini.accep('-')
        + pynutil.insert("\"")
        + pynutil.insert(" cardinal { integer: \"")
        + number_graph
        + pynutil.insert("\" } preserve_order: true")
    )

    # <decimal>-<letters>: here the dash is dropped.
    decimal_dash_alpha = (
        pynutil.insert("decimal { ")
        + decimal_graph
        + pynini.cross('-', '')
        + pynutil.insert(" } units: \"")
        + letters
        + pynutil.insert("\"")
    )

    # <decimal>x or <decimal>X, both emitted with units "x".
    decimal_times = (
        pynutil.insert("decimal { ")
        + decimal_graph
        + pynutil.insert(" } units: \"")
        + pynini.cross(pynini.union('x', "X"), 'x')
        + pynutil.insert("\"")
    )

    alpha_dash_decimal = (
        pynutil.insert("units: \"")
        + letters
        + pynini.accep('-')
        + pynutil.insert("\"")
        + pynutil.insert(" decimal { ")
        + decimal_graph
        + pynutil.insert(" } preserve_order: true")
    )

    fraction_with_unit = (
        pynutil.insert("fraction { ")
        + fraction.graph
        + delete_space
        + pynutil.insert(" } ")
        + field_plural
    )

    address_tagged = (
        pynutil.insert("units: \"address\" cardinal { integer: \"")
        + self.get_address_graph(cardinal)
        + pynutil.insert("\" } preserve_order: true")
    )

    operators = pynini.string_file(get_abs_path("data/math_operations.tsv"))
    # A space between operands is optional on input and always present on output.
    gap = pynini.accep(" ") | pynutil.insert(" ")
    equation = (
        number_graph
        + gap
        + operators
        + gap
        + number_graph
        + gap
        + pynini.cross("=", "equals")
        + gap
        + number_graph
    )
    equation_tagged = (
        pynutil.insert("units: \"math\" cardinal { integer: \"")
        + equation
        + pynutil.insert("\" } preserve_order: true")
    )

    combined = (
        decimal_with_unit
        | cardinal_with_unit
        | cardinal_dash_alpha
        | alpha_dash_cardinal
        | decimal_dash_alpha
        | decimal_times
        | alpha_dash_decimal
        | fraction_with_unit
        | address_tagged
        | equation_tagged
    )
    tagged = self.add_tokens(combined)
    self.fst = tagged.optimize()
def __init__(self):
    """Build the "time" classifier for spoken clock times.

    Fields listed by the original author: hours, minutes, seconds, suffix,
    zone, style, speak_period. This constructor emits ``hours``, ``minutes``,
    ``suffix`` and ``zone``.

    Spoken shapes covered, per the inline examples: "five o' clock",
    "two o eight", "two thirty five (am/pm)", "two pm/am",
    "10 past four", "quarter past four", "half past four", and
    "quarter to/till <hour>".

    The finished grammar is stored in ``self.fst``.
    """
    super().__init__(name="time", kind="classify")

    def _quoted(opening, body):
        return pynutil.insert(opening) + body + pynutil.insert("\"")

    suffixes = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
    zones = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
    hour_before = pynini.string_file(get_abs_path("data/time/time_to.tsv"))

    # only used for < 1000 thousand -> 0 weight
    number_reader = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

    hour_words = [num_to_word(value) for value in range(24)]
    single_minute_words = [num_to_word(value) for value in range(1, 10)]
    double_minute_words = [num_to_word(value) for value in range(10, 60)]

    hour_value = pynini.union(*hour_words) @ number_reader
    minute_single = pynini.union(*single_minute_words) @ number_reader
    minute_double = pynini.union(*double_minute_words) @ number_reader
    minute_named = pynini.cross("half", "30") | pynini.cross("quarter", "15")

    # Every spelling of "o'clock" is consumed and produces no output itself.
    oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

    hours = _quoted("hours: \"", hour_value)

    # "o'clock" -> 00, "o <digit>" -> that digit, otherwise a two-digit minute.
    minute_value = (
        oclock + pynutil.insert("00")
        | pynutil.delete("o") + delete_space + minute_single
        | minute_double
    )

    suffix = _quoted("suffix: \"", convert_space(suffixes))
    optional_suffix = pynini.closure(delete_space + insert_space + suffix, 0, 1)
    optional_zone = pynini.closure(
        delete_space
        + insert_space
        + pynutil.insert("zone: \"")
        + convert_space(zones)
        + pynutil.insert("\""),
        0,
        1,
    )

    # five o' clock
    # two o eight, two thirty five (am/pm)
    # two pm/am
    hour_then_minute = hours + delete_extra_space + _quoted("minutes: \"", minute_value)

    # 10 past four, quarter past four, half past four
    minute_past_hour = (
        _quoted("minutes: \"", pynini.union(minute_single, minute_double, minute_named))
        + delete_space
        + pynutil.delete("past")
        + delete_extra_space
        + hours
    )

    # quarter to/till <hour>: minutes become 45 and the hour goes through the
    # time_to.tsv mapping.
    quarter_to_hour = (
        _quoted("minutes: \"", pynini.cross("quarter", "45"))
        + delete_space
        + pynutil.delete(pynini.union("to", "till"))
        + delete_extra_space
        + _quoted("hours: \"", hour_before)
    )

    # Hour with a mandatory suffix; minutes default to "00" when not spoken.
    hour_with_suffix = (
        hours
        + delete_extra_space
        + _quoted("minutes: \"", pynutil.insert("00") | minute_value)
        + delete_space
        + insert_space
        + suffix
        + optional_zone
    )

    combined = (hour_then_minute | minute_past_hour | quarter_to_hour) + optional_suffix + optional_zone
    combined |= hour_with_suffix

    tagged = self.add_tokens(combined)
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """Build the "money" classifier for written amounts with a currency symbol.

    Emitted fields: ``currency_maj``, ``integer_part``, ``fractional_part``,
    ``currency_min`` and, for the reordered "** dollars ** cent" readings,
    ``preserve_order: true``.

    Args:
        cardinal: tagger providing ``graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit``.
        decimal: tagger providing ``final_graph_wo_negative``.
        deterministic: forwarded to the base class; when False, additional
            reordered integer/decimal readings are added.

    The finished grammar is stored in ``self.fst``.

    NOTE(review): ``maj_singular``, ``min_singular`` and ``min_plural`` are
    not defined inside this method; they are presumably module-level FSTs --
    confirm against the rest of the file.
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)
    cardinal_graph = cardinal.graph
    graph_decimal_final = decimal.final_graph_wo_negative

    maj_singular_labels = load_labels(get_abs_path("data/currency/currency.tsv"))
    maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
    maj_unit_singular = convert_space(maj_singular)

    graph_maj_singular = pynutil.insert("currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
    graph_maj_plural = pynutil.insert("currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")

    # Optional ".0", ".00", ... after an integer amount is dropped.
    optional_delete_fractional_zeros = pynini.closure(
        pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1
    )

    graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross("1", "one") + pynutil.insert("\"")

    # only for decimals where third decimal after comma is non-zero or with quantity
    decimal_delete_last_zeros = (
        pynini.closure(NEMO_DIGIT | pynutil.delete(","))
        + pynini.accep(".")
        + pynini.closure(NEMO_DIGIT, 2)
        + (NEMO_DIGIT - "0")
        + pynini.closure(pynutil.delete("0"))
    )
    decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA

    graph_decimal = (
        graph_maj_plural + insert_space + (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final
    )

    graph_integer = (
        pynutil.insert("integer_part: \"") + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert("\"")
    )

    # "1" pairs with the singular currency name, anything else with the plural.
    graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
    graph_integer_only |= graph_maj_plural + insert_space + graph_integer

    final_graph = (graph_integer_only + optional_delete_fractional_zeros) | graph_decimal

    # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
    # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
    # not accepted: 002, 00, 0,
    two_digits_fractional_part = (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0"))
    ) @ (
        (pynutil.delete("0") + (NEMO_DIGIT - "0"))
        | ((NEMO_DIGIT - "0") + pynutil.insert("0"))
        | ((NEMO_DIGIT - "0") + NEMO_DIGIT)
    )

    graph_min_singular = pynutil.insert(" currency_min: \"") + min_singular + pynutil.insert("\"")
    graph_min_plural = pynutil.insert(" currency_min: \"") + min_plural + pynutil.insert("\"")

    # --- Pieces that do not depend on the currency symbol --------------------
    # These were rebuilt on every loop iteration; they are only ever used as
    # operands of non-mutating operators (+, @, |), so building them once
    # yields the same grammar with less construction work.
    preserve_order = pynutil.insert(" preserve_order: true")

    graph_fractional_one = two_digits_fractional_part @ pynini.cross("1", "one")
    graph_fractional_one = pynutil.insert("fractional_part: \"") + graph_fractional_one + pynutil.insert("\"")

    graph_fractional = (
        two_digits_fractional_part
        @ (pynini.closure(NEMO_DIGIT, 1, 2) - "1")
        @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
    )
    graph_fractional = pynutil.insert("fractional_part: \"") + graph_fractional + pynutil.insert("\"")

    # Only consumed by the non-deterministic reordered decimal reading below.
    default_fraction_graph = None
    if not deterministic:
        default_fraction_graph = (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final

    # format ** dollars ** cent
    decimal_graph_with_minor = None
    integer_graph_reordered = None
    decimal_default_reordered = None
    for curr_symbol, _ in maj_singular_labels:
        # Integer part followed by the major-currency name looked up from the symbol.
        integer_plus_maj = graph_integer + insert_space + pynutil.insert(curr_symbol) @ graph_maj_plural
        integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular

        # Restrict the input side: digits only (not the single string "0"),
        # or a non-zero leading digit followed by digits with commas removed.
        integer_plus_maj_with_comma = pynini.compose(
            NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), integer_plus_maj
        )
        integer_plus_maj = pynini.compose(pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
        integer_plus_maj |= integer_plus_maj_with_comma

        # Fractional part followed by the minor-currency name.
        fractional_plus_min = graph_fractional + insert_space + pynutil.insert(curr_symbol) @ graph_min_plural
        fractional_plus_min |= (
            graph_fractional_one + insert_space + pynutil.insert(curr_symbol) @ graph_min_singular
        )

        decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(".", " ") + fractional_plus_min

        if not deterministic:
            # Extra reading without a minor-currency name, at a small penalty.
            decimal_graph_with_minor_curr |= pynutil.add_weight(
                integer_plus_maj
                + pynini.cross(".", " ")
                + pynutil.insert("fractional_part: \"")
                + two_digits_fractional_part @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
                + pynutil.insert("\""),
                weight=0.0001,
            )

        # Amounts with no integer part, with or without a single leading "0".
        decimal_graph_with_minor_curr |= (
            pynini.closure(pynutil.delete("0"), 0, 1) + pynutil.delete(".") + fractional_plus_min
        )
        decimal_graph_with_minor_curr = (
            pynutil.delete(curr_symbol) + decimal_graph_with_minor_curr + preserve_order
        )

        decimal_graph_with_minor = (
            decimal_graph_with_minor_curr
            if decimal_graph_with_minor is None
            else pynini.union(decimal_graph_with_minor, decimal_graph_with_minor_curr).optimize()
        )

        if not deterministic:
            integer_graph_reordered_curr = (
                pynutil.delete(curr_symbol) + integer_plus_maj + preserve_order
            ).optimize()

            integer_graph_reordered = (
                integer_graph_reordered_curr
                if integer_graph_reordered is None
                else pynini.union(integer_graph_reordered, integer_graph_reordered_curr).optimize()
            )

            decimal_default_reordered_curr = (
                pynutil.delete(curr_symbol)
                + default_fraction_graph
                + insert_space
                + pynutil.insert(curr_symbol) @ graph_maj_plural
            )

            decimal_default_reordered = (
                decimal_default_reordered_curr
                if decimal_default_reordered is None
                else pynini.union(decimal_default_reordered, decimal_default_reordered_curr)
            ).optimize()

    # weight for SH
    final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.001)

    if not deterministic:
        final_graph |= integer_graph_reordered | decimal_default_reordered
        # to handle "$2.00" cases
        final_graph |= pynini.compose(
            NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), integer_graph_reordered
        )

    final_graph = self.add_tokens(final_graph.optimize())
    self.fst = final_graph.optimize()