def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """
    Assemble the classifier for decimals that use "," as the decimal separator.

    The integer part is tagged with ``cardinal.graph``; the fractional part is
    tagged digit by digit. An optional leading "-" becomes ``negative: "true"``.

    Args:
        cardinal: cardinal tagger; supplies ``graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit``
        deterministic: passed through to the ``GraphFst`` base class
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)

    # Inverted digit and zero tables plus an explicit "1" -> "eins" mapping.
    digit_names = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
    digit_names |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()
    digit_names |= pynini.cross("1", "eins")

    # One or more digits, separated by inserted spaces.
    self.graph = digit_names + pynini.closure(insert_space + digit_names).optimize()

    decimal_comma = pynutil.delete(",")
    sign = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "),
        0,
        1,
    )

    self.graph_fractional = (
        pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"")
    )
    self.graph_integer = (
        pynutil.insert("integer_part: \"") + cardinal.graph + pynutil.insert("\"")
    )

    unsigned = self.graph_integer + decimal_comma + insert_space + self.graph_fractional
    with_quantity = get_quantity(
        unsigned,
        cardinal.graph_hundred_component_at_least_one_none_zero_digit,
    )
    self.final_graph_wo_negative = unsigned | with_quantity

    tagged = sign + self.final_graph_wo_negative
    tagged += pynutil.insert(" preserve_order: true")
    tagged = self.add_tokens(tagged)
    self.fst = tagged.optimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """
    Assemble the whitelist classifier from TSV label files.

    Args:
        input_case: when equal to "lower_cased", the first column of every
            loaded row is lower-cased before the mapping is built
        deterministic: when False, extra alternatives are unioned in (a
            lower-cased copy of the default whitelist, the measurement
            units) and a user file extends the default instead of replacing it
        input_file: optional path to a user-provided whitelist file
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _load_graph(case, path):
        # Rows come back as lists; only the first column is case-folded.
        rows = load_labels(path)
        if case == "lower_cased":
            rows = [[row[0].lower()] + row[1:] for row in rows]
        return pynini.string_map(rows)

    graph = _load_graph(input_case, get_abs_path("data/whitelist.tsv"))

    if not deterministic and input_case != "lower_cased":
        # Slightly penalised lower-cased variant of the default whitelist.
        lowered = _load_graph("lower_cased", get_abs_path("data/whitelist.tsv"))
        graph |= pynutil.add_weight(lowered, weight=0.0001)

    if input_file:
        whitelist_provided = _load_graph(input_case, input_file)
        if deterministic:
            graph = whitelist_provided
        else:
            graph |= whitelist_provided

    if not deterministic:
        graph |= _load_graph(input_case, get_abs_path("data/measure/measurements.tsv"))

    self.graph = graph
    self.final_graph = convert_space(self.graph).optimize()
    self.fst = (
        pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")
    ).optimize()
def __init__(self, deterministic: bool = True):
    """
    Assemble the ordinal verbalizer.

    The quoted ``integer`` field is unwrapped and then rewritten at the end of
    the string: an optional ordinal stem followed by an inserted, weighted
    ending taken from "ten", "tem", "ter", "tes", "te".

    Args:
        deterministic: passed through to the ``GraphFst`` base class
    """
    super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

    stem_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")).invert()
    stem_ties = pynini.string_file(get_abs_path("data/ordinals/ties.tsv")).invert()
    stem_thousands = pynini.string_file(get_abs_path("data/ordinals/thousands.tsv")).invert()

    # Strips the `integer: "..."` wrapper and keeps the quoted content.
    integer_field = (
        pynutil.delete("integer: \"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )

    endings = pynini.union("ten", "tem", "ter", "tes", "te")
    append_ending = pynutil.insert(endings, weight=0.01)

    self.ordinal_stem = stem_digit | stem_ties | stem_thousands

    # Applied only at the end of the string ("[EOS]" right context).
    suffix = pynini.cdrewrite(
        pynini.closure(self.ordinal_stem, 0, 1) + append_ending,
        "",
        "[EOS]",
        NEMO_SIGMA,
    ).optimize()

    self.graph = pynini.compose(integer_field, suffix)
    self.suffix = suffix
    self.fst = self.delete_tokens(self.graph).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """
    Assemble the telephone-number classifier.

    Output has the shape ``country_code: "..." number_part: "..."`` with the
    two input parts separated by a single space.

    Args:
        cardinal: cardinal tagger; supplies ``two_digit_non_zero``
        deterministic: passed through to the ``GraphFst`` base class
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    zero = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    ).optimize()
    nonzero_digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize() | pynini.cross("1", "eins")
    any_digit = nonzero_digit | zero

    # Digits read one at a time, separated by inserted spaces.
    digit_sequence = pynini.closure(any_digit + insert_space) + any_digit
    two_digits_or_zero = ((NEMO_DIGIT ** 2) @ cardinal.two_digit_non_zero) | zero

    # NOTE: a variant that grouped the number part into two-digit chunks was
    # previously sketched here as commented-out code; only the single-digit
    # reading is active.

    # Three accepted prefixes: optional "+" with two digits (or zero),
    # "(0...)" in parentheses, or a bare leading zero followed by digits.
    country_code = pynini.closure(pynini.cross("+", "plus "), 0, 1) + two_digits_or_zero
    country_code |= (
        pynutil.delete("(")
        + zero
        + insert_space
        + digit_sequence
        + pynutil.delete(")")
    )
    country_code |= zero + insert_space + digit_sequence
    country_code = (
        pynutil.insert("country_code: \"") + country_code + pynutil.insert("\"")
    )

    separator = pynini.cross(pynini.union("-", " "), " ")
    numbers = digit_sequence + pynini.closure(separator + digit_sequence, 0, 1)

    # Require at least seven characters drawn from digits and - ( ) space.
    allowed_char = NEMO_DIGIT | pynini.union("-", " ", ")", "(")
    min_length = pynini.closure(allowed_char, 7)
    number_part = pynini.compose(min_length, numbers)
    number = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"")

    self.graph = country_code + pynini.accep(" ") + number
    tagged = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
    self.fst = tagged.optimize()
def __init__(self, deterministic: bool = True):
    """
    Assemble the classifier for e-mail addresses and URLs.

    Emits ``username``/``domain`` fields for e-mails, a bare ``domain`` field
    for strings containing one of the listed common domains, and
    ``protocol`` + ``domain`` for URLs.

    Args:
        deterministic: passed through to the ``GraphFst`` base class
    """
    super().__init__(name="electronic", kind="classify", deterministic=deterministic)

    period = pynini.accep(".")

    domain_rows = load_labels(get_abs_path("data/electronic/domain.tsv"))
    known_domains = pynini.union(*[row[0] for row in domain_rows])

    symbol_rows = load_labels(get_abs_path("data/electronic/symbols.tsv"))
    # The dot is handled separately, so it is removed from the symbol set.
    symbols = pynini.union(*[row[0] for row in symbol_rows]) - period

    chars = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | symbols)

    # e-mail: <chars>@<chars>.<chars>
    username = (
        pynutil.insert("username: \"")
        + chars
        + pynutil.insert("\"")
        + pynini.cross('@', ' ')
    )
    dotted_domain = chars + period + chars
    domain_field = pynutil.insert("domain: \"") + dotted_domain + pynutil.insert("\"")

    # Anything ending in a listed domain, optionally followed by one more
    # symbol-or-dot separated segment.
    trailing_segment = pynini.closure(
        (symbols | period) + pynini.closure(chars, 1),
        0,
        1,
    )
    common_domain_field = (
        pynutil.insert("domain: \"")
        + chars
        + known_domains
        + trailing_segment
        + pynutil.insert("\"")
    )

    graph = (username + domain_field) | common_domain_field

    # url: "http(s)://", "www." or both in sequence
    scheme = pynini.accep("https://") | pynini.accep("http://")
    www = pynini.accep("www.")
    protocol = scheme | www | (scheme + www)
    protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
    graph |= protocol + insert_space + (domain_field | common_domain_field)

    self.graph = graph
    tagged = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
    self.fst = tagged.optimize()
def __init__(self, deterministic: bool = True):
    """
    Assemble the verbalizer for e-mail addresses and URLs.

    Accepts either an optional ``protocol`` followed by a ``domain``, or a
    ``username`` followed by a ``domain`` (with "at " inserted in between).

    Args:
        deterministic: passed through to the ``GraphFst`` base class
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    nonzero_digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize() | pynini.cross("1", "eins")
    zero = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    ).optimize()
    any_digit = nonzero_digit | zero

    symbol_names = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize()
    server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
    domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    def _spaced_chars():
        # One or more non-quote, non-space characters with a space inserted
        # between consecutive characters.
        return pynini.closure(
            (NEMO_NOT_QUOTE - pynini.accep(" ")) + insert_space
        ) + (NEMO_NOT_QUOTE - pynini.accep(" "))

    # Rewrites symbols and digits wherever they occur.
    spell_symbols_and_digits = pynini.cdrewrite(symbol_names | any_digit, "", "", NEMO_SIGMA)

    user_name = (
        pynutil.delete("username: \"") + _spaced_chars() + pynutil.delete("\"")
    )
    user_name @= spell_symbols_and_digits

    # Single characters are slightly penalised relative to the table entries.
    domain_piece = (
        pynutil.add_weight(NEMO_NOT_QUOTE, weight=0.0001) | domain_common | server_common
    )
    domain = domain_piece + pynini.closure(insert_space + domain_piece)
    domain @= spell_symbols_and_digits
    domain = pynutil.delete("domain: \"") + domain + pynutil.delete("\"")

    # Protocol characters are spaced out; only symbols (not digits) are rewritten.
    spelled_protocol = _spaced_chars() @ pynini.cdrewrite(symbol_names, "", "", NEMO_SIGMA)
    protocol = (
        pynutil.delete("protocol: \"") + spelled_protocol + pynutil.delete("\"")
    )

    url_graph = pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain
    email_graph = user_name + pynini.accep(" ") + pynutil.insert("at ") + domain
    self.graph = url_graph | email_graph

    self.fst = self.delete_tokens(self.graph + delete_preserve_order).optimize()
def __init__(self, ordinal: GraphFst, deterministic: bool = True):
    """
    Assemble the date verbalizer.

    Accepted field layouts are day-month(-year), month-year, and year alone.
    The day, and any month that is not one of the listed month names, have
    their trailing stem rewritten with ``ordinal.ordinal_stem`` and get
    "ter" appended.

    Args:
        ordinal: ordinal verbalizer; supplies ``ordinal_stem``
        deterministic: passed through to the ``GraphFst`` base class
    """
    super().__init__(name="date", kind="verbalize", deterministic=deterministic)

    # Rewrites an ordinal stem at the end of the string.
    stem_at_end = pynini.cdrewrite(ordinal.ordinal_stem, "", "[EOS]", NEMO_SIGMA)

    day_field = (
        pynutil.delete("day: \"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    day = (day_field @ stem_at_end) + pynutil.insert("ter")

    month_rows = load_labels(get_abs_path("data/months/abbr_to_name.tsv"))
    months_names = pynini.union(*[row[1] for row in month_rows])

    month_field = (
        pynutil.delete("month: \"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    # A month name passes through unchanged ...
    final_month = month_field @ months_names
    # ... anything else is treated like the day.
    not_a_month_name = pynini.difference(NEMO_SIGMA, months_names)
    final_month |= (month_field @ not_a_month_name @ stem_at_end) + pynutil.insert("ter")

    year = (
        pynutil.delete("year: \"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )

    # day month [year]
    graph_dmy = (
        day
        + pynini.accep(" ")
        + final_month
        + pynini.closure(pynini.accep(" ") + year, 0, 1)
    )
    # month year
    graph_dmy |= final_month + pynini.accep(" ") + year

    self.graph = graph_dmy | year
    self.fst = self.delete_tokens(self.graph + delete_preserve_order).optimize()
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from nemo_text_processing.text_normalization.de.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, insert_space try: import pynini from pynini.lib import pynutil quantities = pynini.string_file( get_abs_path("data/numbers/quantities.tsv")) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): PYNINI_AVAILABLE = False quantities = None def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike': """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, e.g. 1 million -> integer_part: "eine" quantity: "million" e.g. 1.4 million -> integer_part: "eins" fractional_part: "vier" quantity: "million" Args:
from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, NEMO_NON_BREAKING_SPACE, NEMO_SIGMA, GraphFst, convert_space, insert_space, ) try: import pynini from pynini.lib import pynutil from pynini.examples import plurals unit_singular = pynini.string_file(get_abs_path("data/measure/measurements.tsv")) suppletive = pynini.string_file(get_abs_path("data/measure/suppletive.tsv")) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): PYNINI_AVAILABLE = False unit_singular = None suppletive = None def singular_to_plural(): # plural endung n/en maskuline Nomen mit den Endungen e, ent, and, ant, ist, or _n = NEMO_SIGMA + pynini.union("e") + pynutil.insert("n") _en = ( NEMO_SIGMA + pynini.union("ent", "and", "ant", "ist", "or", "ion", "ik", "heit", "keit", "schaft", "tät", "ung")
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """
    Assemble the date classifier.

    Accepted layouts: "<day>. <month name> [<year>]", "<day>.<month>[.<year>]",
    "<year>-<month>[-<day>]", and a year on its own.

    Args:
        cardinal: cardinal tagger; supplies
            ``graph_hundred_component_at_least_one_none_zero_digit`` and is
            passed to ``get_year_graph``
        deterministic: passed through to the ``GraphFst`` base class
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    abbr_rows = load_labels(get_abs_path("data/months/abbr_to_name.tsv"))
    number_to_month = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()

    month_graph = pynini.union(*[row[1] for row in abbr_rows]).optimize()

    # Abbreviation -> name, also accepted with the first character mapped
    # through TO_LOWER, and with an optional trailing "." removed.
    abbr_map = pynini.string_map(abbr_rows)
    abbr_any_case = pynutil.add_weight(abbr_map, weight=0.0001) | (
        (TO_LOWER + pynini.closure(NEMO_CHAR)) @ abbr_map
    )
    month_abbr_graph = abbr_any_case + pynini.closure(
        pynutil.delete(".", weight=-0.0001), 0, 1
    )
    self.month_abbr = month_abbr_graph

    month_graph |= (TO_LOWER + pynini.closure(NEMO_CHAR)) @ month_graph
    # jan. -> januar, Jan -> januar, januar -> januar
    month_graph |= month_abbr_graph

    numbers = cardinal.graph_hundred_component_at_least_one_none_zero_digit
    # 01, 31, 1
    optional_leading_zero = delete_leading_zero | NEMO_DIGIT

    valid_days = pynini.union(*[str(d) for d in range(1, 32)])
    digit_day = optional_leading_zero @ valid_days @ numbers
    day = (
        pynutil.insert("day: \"") + digit_day + pynutil.insert("\"")
    ).optimize()

    valid_months = pynini.union(*[str(m) for m in range(1, 13)])
    digit_month = optional_leading_zero @ valid_months
    # Month number -> month name, via the numbers table.
    number_to_month = digit_month @ number_to_month
    # Month number -> spoken number.
    digit_month @= numbers

    month_name = (
        pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
    ).optimize()
    month_number = (
        pynutil.insert("month: \"")
        + (pynutil.add_weight(digit_month, weight=0.0001) | number_to_month)
        + pynutil.insert("\"")
    ).optimize()

    # prefer cardinal over year
    year = pynutil.add_weight(get_year_graph(cardinal=cardinal), weight=0.001)
    self.year = year
    year_only = pynutil.insert("year: \"") + year + pynutil.insert("\"")

    # "<day>. <month name> [<year>]"
    graph_dmy = (
        day
        + pynutil.delete(".")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + insert_space
        + month_name
        + pynini.closure(pynini.accep(" ") + year_only, 0, 1)
    )

    # "<day><sep><month number>[<sep><year>]"
    for sep in ["."]:
        trailing_year = pynini.closure(pynini.cross(sep, " ") + year_only, 0, 1)
        graph_dmy |= day + pynini.cross(sep, " ") + month_number + trailing_year

    # "<year>-<month number>[-<day>]"
    dash = "-"
    trailing_day = pynini.closure(pynini.cross(dash, " ") + day, 0, 1)
    graph_ymd = year_only + pynini.cross(dash, " ") + month_number + trailing_day

    # Only the day-first layouts carry the preserve_order marker.
    final_graph = graph_dmy + pynutil.insert(" preserve_order: true")
    final_graph |= year_only
    final_graph |= graph_ymd

    self.final_graph = final_graph.optimize()
    self.fst = self.add_tokens(self.final_graph).optimize()
from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_CHAR,
    NEMO_DIGIT,
    TO_LOWER,
    GraphFst,
    insert_space,
)

# pynini is an optional dependency: the module-level graphs are built only
# when it can be imported, and PYNINI_AVAILABLE records the outcome.
try:
    import pynini
    from pynini.lib import pynutil

    graph_teen = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
    ).optimize()
    graph_digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize()
    ties_graph = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    ).optimize()
    # Exactly two digits on input; a leading "0" is dropped on output.
    delete_leading_zero = (pynutil.delete("0") | (NEMO_DIGIT - "0")) + NEMO_DIGIT

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    graph_teen = None
    graph_digit = None
    ties_graph = None
    delete_leading_zero = None

    # The import failed, so the flag must say so (it previously stayed True,
    # reporting pynini as available while every graph above was None).
    PYNINI_AVAILABLE = False
def __init__(self, deterministic: bool = True):
    """
    Assemble the time classifier.

    Accepted layouts (each followed by the clock-word suffix and an optional
    time zone): "HH:MM", "HH:MM:SS", and a bare hour.

    Args:
        deterministic: passed through to the ``GraphFst`` base class
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    # The clock word is removed from the output. `+` binds tighter than `|`,
    # so the previous un-parenthesised expression accepted " Uhr" and "uhr"
    # but not " uhr". Accept the space before either casing; the unspaced
    # lower-case form is kept so inputs accepted before are still accepted.
    clock_word = pynutil.delete("Uhr") | pynutil.delete("uhr")
    final_suffix = (pynutil.delete(" ") + clock_word) | pynutil.delete("uhr")

    time_zone_graph = pynini.string_file(get_abs_path("data/time/time_zone.tsv"))

    labels_hour = [str(x) for x in range(0, 25)]
    labels_minute_single = [str(x) for x in range(1, 10)]
    labels_minute_double = [str(x) for x in range(10, 60)]

    # Exactly two digits on input; a leading "0" is dropped on output.
    delete_leading_zero_to_double_digit = (
        pynutil.delete("0") | (NEMO_DIGIT - "0")
    ) + NEMO_DIGIT

    graph_hour = pynini.union(*labels_hour)
    graph_minute_single = pynini.union(*labels_minute_single)
    graph_minute_double = pynini.union(*labels_minute_double)

    # Hour as written (used for the bare-hour layout).
    final_graph_hour_only = (
        pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"")
    )
    # Two-digit hour with the leading zero stripped (used before ":").
    final_graph_hour = (
        pynutil.insert("hours: \"")
        + (delete_leading_zero_to_double_digit @ graph_hour)
        + pynutil.insert("\"")
    )
    # "0d" -> "d" for 1-9, or 10-59 unchanged; seconds use the same ranges.
    final_graph_minute = (
        pynutil.insert("minutes: \"")
        + ((pynutil.delete("0") + graph_minute_single) | graph_minute_double)
        + pynutil.insert("\"")
    )
    final_graph_second = (
        pynutil.insert("seconds: \"")
        + ((pynutil.delete("0") + graph_minute_single) | graph_minute_double)
        + pynutil.insert("\"")
    )
    final_time_zone_optional = pynini.closure(
        pynini.accep(" ")
        + pynutil.insert("zone: \"")
        + convert_space(time_zone_graph)
        + pynutil.insert("\""),
        0,
        1,
    )

    # 02:30 Uhr ("00" minutes are dropped entirely)
    graph_hm = (
        final_graph_hour
        + pynutil.delete(":")
        + (pynutil.delete("00") | (insert_space + final_graph_minute))
        + final_suffix
        + final_time_zone_optional
    )

    # 10:30:05 Uhr ("00" minutes/seconds are kept as explicit "0" fields)
    graph_hms = (
        final_graph_hour
        + pynutil.delete(":")
        + (pynini.cross("00", " minutes: \"0\"") | (insert_space + final_graph_minute))
        + pynutil.delete(":")
        + (pynini.cross("00", " seconds: \"0\"") | (insert_space + final_graph_second))
        + final_suffix
        + final_time_zone_optional
        + pynutil.insert(" preserve_order: true")
    )

    # 2 Uhr est
    graph_h = final_graph_hour_only + final_suffix + final_time_zone_optional

    final_graph = (graph_hm | graph_h | graph_hms).optimize()
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal_tagger: GraphFst, deterministic: bool = True):
    """
    Assemble the time verbalizer.

    Builds several alternative readings of the ``hours``/``minutes``/``seconds``
    fields (plain "<hour> uhr [<minute>]", hour-minute-second, and the
    "nach" / "halb" / "vor" phrasings) and unions them; the three phrasing
    variants each carry a weight of 0.0001.

    Args:
        cardinal_tagger: cardinal tagger; supplies ``two_digit_non_zero``
        deterministic: passed through to the ``GraphFst`` base class
    """
    super().__init__(name="time", kind="verbalize", deterministic=deterministic)

    # add weight so when using inverse text normalization this conversion is deprioritized
    night_to_early = pynutil.add_weight(pynini.invert(
        pynini.string_file(
            get_abs_path("data/time/hour_to_night.tsv"))).optimize(),
        weight=0.0001)
    hour_to = pynini.invert(
        pynini.string_file(
            get_abs_path("data/time/hour_to.tsv"))).optimize()
    minute_to = pynini.invert(
        pynini.string_file(
            get_abs_path("data/time/minute_to.tsv"))).optimize()
    # Built from the second column of the time-zone table only.
    time_zone_graph = pynini.invert(
        convert_space(
            pynini.union(*[
                x[1] for x in load_labels(
                    get_abs_path("data/time/time_zone.tsv"))
            ])))

    graph_zero = pynini.invert(
        pynini.string_file(
            get_abs_path("data/numbers/zero.tsv"))).optimize()
    number_verbalization = graph_zero | cardinal_tagger.two_digit_non_zero

    # Each field graph strips its `name: "..."` wrapper and keeps the digits.
    hour = pynutil.delete("hours: \"") + pynini.closure(
        NEMO_DIGIT, 1) + pynutil.delete("\"")
    # A verbalized hour that is exactly "eins" becomes "ein" before " uhr" is appended.
    hour_verbalized = hour @ number_verbalization @ pynini.cdrewrite(
        pynini.cross("eins", "ein"), "[BOS]", "[EOS]",
        NEMO_SIGMA) + pynutil.insert(" uhr")
    minute = pynutil.delete("minutes: \"") + pynini.closure(
        NEMO_DIGIT, 1) + pynutil.delete("\"")
    zone = pynutil.delete("zone: \"") + time_zone_graph + pynutil.delete(
        "\"")
    optional_zone = pynini.closure(pynini.accep(" ") + zone, 0, 1)
    second = pynutil.delete("seconds: \"") + pynini.closure(
        NEMO_DIGIT, 1) + pynutil.delete("\"")

    # "<hour> uhr <minute> minuten <second> sekunden [<zone>]"
    graph_hms = (hour_verbalized + pynini.accep(" ") +
                 minute @ number_verbalization +
                 pynutil.insert(" minuten") + pynini.accep(" ") +
                 second @ number_verbalization +
                 pynutil.insert(" sekunden") + optional_zone)
    # Singular forms: "eins minuten" -> "eine minute", "eins sekunden" -> "eine sekunde",
    # only when preceded by a space or the start of the string.
    graph_hms @= pynini.cdrewrite(
        pynini.cross("eins minuten", "eine minute")
        | pynini.cross("eins sekunden", "eine sekunde"),
        pynini.union(" ", "[BOS]"),
        "",
        NEMO_SIGMA,
    )

    # Acceptors for the minute ranges 1-30 and 1-29.
    min_30 = [str(x) for x in range(1, 31)]
    min_30 = pynini.union(*min_30)
    min_29 = [str(x) for x in range(1, 30)]
    min_29 = pynini.union(*min_29)

    # "<hour> uhr"
    graph_h = hour_verbalized
    # "<hour> uhr <minute>"
    graph_hm = hour_verbalized + pynini.accep(
        " ") + minute @ number_verbalization
    # "<minute> nach <hour>" for minutes 1-30; 15 may also be read as "viertel".
    graph_m_past_h = (
        minute @ min_30 @ (number_verbalization
                           | pynini.cross("15", "viertel")) +
        pynini.accep(" ") + pynutil.insert("nach ")
        # + hour @ number_verbalization
        + hour @ pynini.cdrewrite(night_to_early, "[BOS]", "[EOS]",
                                  NEMO_SIGMA) @ number_verbalization)
    # "halb <hour>" for minute 30; the hour is additionally mapped through hour_to.
    graph_m30_h = (minute @ pynini.cross("30", "halb") +
                   pynini.accep(" ") + hour @ pynini.cdrewrite(
                       night_to_early, "[BOS]", "[EOS]",
                       NEMO_SIGMA) @ hour_to @ number_verbalization)
    # "<minute> vor <hour>": the minute goes through minute_to and must land in 1-29;
    # the hour is mapped through hour_to.
    graph_m_to_h = (
        minute @ minute_to @ min_29 @ (number_verbalization
                                       | pynini.cross("15", "viertel")) +
        pynini.accep(" ") + pynutil.insert("vor ") +
        hour @ pynini.cdrewrite(night_to_early, "[BOS]", "[EOS]",
                                NEMO_SIGMA) @ hour_to @ number_verbalization)

    # NOTE(review): graph_hms already ends with optional_zone and the union is
    # followed by optional_zone again -- confirm the repetition is intended.
    self.graph = (graph_hms | graph_h | graph_hm
                  | pynutil.add_weight(graph_m_past_h, weight=0.0001)
                  | pynutil.add_weight(graph_m30_h, weight=0.0001)
                  | pynutil.add_weight(graph_m_to_h,
                                       weight=0.0001)) + optional_zone
    delete_tokens = self.delete_tokens(self.graph + delete_preserve_order)
    self.fst = delete_tokens.optimize()