def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """
    Build the whitelist classification graph.

    Args:
        input_case: casing mode; when it equals "lower_cased" the first column of every
            loaded entry is lower-cased before the graph is compiled
        deterministic: if False, extra alternatives are unioned into the graph (a weighted
            lower-cased copy of the default whitelist, the provided whitelist on top of the
            default one, and the entries of data/measures/measurements.tsv)
        input_file: optional path to a provided whitelist; in deterministic mode it
            replaces the default whitelist instead of extending it
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    default_whitelist = "data/whitelist.tsv"

    def _compile_labels(case, path):
        # Load the label rows from `path` and compile them into a string map.
        rows = load_labels(path)
        if case == "lower_cased":
            rows = [[row[0].lower()] + row[1:] for row in rows]
        return pynini.string_map(rows)

    graph = _compile_labels(input_case, get_abs_path(default_whitelist))

    if not deterministic:
        if input_case != "lower_cased":
            # Also accept the lower-cased default entries, with a small weight attached.
            lower_cased_default = _compile_labels("lower_cased", get_abs_path(default_whitelist))
            graph |= pynutil.add_weight(lower_cased_default, weight=0.0001)
        if input_file:
            graph |= _compile_labels(input_case, input_file)
        graph |= _compile_labels(input_case, get_abs_path("data/measures/measurements.tsv"))
    elif input_file:
        # Deterministic mode: the provided whitelist takes the place of the default one.
        graph = _compile_labels(input_case, input_file)

    self.graph = graph
    self.final_graph = convert_space(self.graph).optimize()

    tagged = pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")
    self.fst = tagged.optimize()
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst, insert_space from nemo_text_processing.text_normalization.es.graph_utils import ones from nemo_text_processing.text_normalization.es.utils import get_abs_path try: import pynini from pynini.lib import pynutil graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")) graph_twenties = pynini.string_file( get_abs_path("data/numbers/twenties.tsv")) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): graph_digit = None graph_ties = None graph_teen = None graph_twenties = None PYNINI_AVAILABLE = False
# limitations under the License. from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_preserve_order, delete_space, insert_space, ) from nemo_text_processing.text_normalization.es.utils import get_abs_path try: import pynini from pynini.lib import pynutil alt_minutes = pynini.string_file(get_abs_path("data/time/alt_minutes.tsv")) morning_times = pynini.string_file(get_abs_path("data/time/morning_times.tsv")) afternoon_times = pynini.string_file(get_abs_path("data/time/afternoon_times.tsv")) evening_times = pynini.string_file(get_abs_path("data/time/evening_times.tsv")) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): alt_minutes = None morning_times = None afternoon_times = None evening_times = None PYNINI_AVAILABLE = False
# limitations under the License. from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_preserve_order, insert_space, ) from nemo_text_processing.text_normalization.es.utils import get_abs_path try: import pynini from pynini.lib import pynutil digit_no_zero = pynini.invert( pynini.string_file(get_abs_path("data/numbers/digit.tsv"))) zero = pynini.invert( pynini.string_file(get_abs_path("data/numbers/zero.tsv"))) graph_symbols = pynini.string_file( get_abs_path("data/electronic/symbols.tsv")) server_common = pynini.string_file( get_abs_path("data/electronic/server_name.tsv")) domain_common = pynini.string_file( get_abs_path("data/electronic/domain.tsv")) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): digit_no_zero = None zero = None
delete_space, insert_space, ) from nemo_text_processing.text_normalization.es.graph_utils import ( cardinal_separator, decimal_separator, strip_cardinal_apocope, ) from nemo_text_processing.text_normalization.es.utils import get_abs_path try: import pynini from pynini.lib import pynutil quantities = pynini.string_file( get_abs_path("data/numbers/quantities.tsv")) digit = pynini.invert( pynini.string_file(get_abs_path("data/numbers/digit.tsv"))) zero = pynini.invert( pynini.string_file(get_abs_path("data/numbers/zero.tsv"))) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): quantities = None digit = None zero = None PYNINI_AVAILABLE = False
NEMO_SIGMA, NEMO_SPACE, NEMO_WHITE_SPACE, GraphFst, delete_extra_space, delete_preserve_order, ) from nemo_text_processing.text_normalization.es.graph_utils import ones from nemo_text_processing.text_normalization.es.utils import get_abs_path try: import pynini from pynini.lib import pynutil unit_plural_fem = pynini.string_file( get_abs_path("data/measures/measurements_plural_fem.tsv")) unit_plural_masc = pynini.string_file( get_abs_path("data/measures/measurements_plural_masc.tsv")) unit_singular_fem = pynini.project(unit_plural_fem, "input") unit_singular_masc = pynini.project(unit_plural_masc, "input") unit_plural_fem = pynini.project(unit_plural_fem, "output") unit_plural_masc = pynini.project(unit_plural_masc, "output") PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): unit_plural_fem = None unit_plural_masc = None
from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_SIGMA, NEMO_SPACE, GraphFst, delete_space, ) from nemo_text_processing.text_normalization.es.graph_utils import roman_to_int, strip_accent from nemo_text_processing.text_normalization.es.utils import get_abs_path try: import pynini from pynini.lib import pynutil digit = pynini.invert( pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))) teens = pynini.invert( pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))) twenties = pynini.invert( pynini.string_file(get_abs_path("data/ordinals/twenties.tsv"))) ties = pynini.invert( pynini.string_file(get_abs_path("data/ordinals/ties.tsv"))) hundreds = pynini.invert( pynini.string_file(get_abs_path("data/ordinals/hundreds.tsv"))) PYNINI_AVAILABLE = True except (ImportError, ModuleNotFoundError): digit = None teens = None twenties = None
NEMO_NON_BREAKING_SPACE, NEMO_SIGMA, NEMO_SPACE, GraphFst, convert_space, delete_space, insert_space, ) from nemo_text_processing.text_normalization.es.graph_utils import strip_cardinal_apocope from nemo_text_processing.text_normalization.es.utils import get_abs_path try: import pynini from pynini.lib import pynutil unit = pynini.string_file(get_abs_path("data/measures/measurements.tsv")) unit_plural_fem = pynini.string_file(get_abs_path("data/measures/measurements_plural_fem.tsv")) unit_plural_masc = pynini.string_file(get_abs_path("data/measures/measurements_plural_masc.tsv")) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): unit = None unit_plural_fem = None unit_plural_masc = None PYNINI_AVAILABLE = False class MeasureFst(GraphFst): """
# limitations under the License. from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, NEMO_SIGMA, NEMO_SPACE, GraphFst, ) from nemo_text_processing.text_normalization.es.utils import get_abs_path try: import pynini from pynini.lib import pynutil ordinal_exceptions = pynini.string_file( get_abs_path("data/fractions/ordinal_exceptions.tsv")) higher_powers_of_ten = pynini.string_file( get_abs_path("data/fractions/powers_of_ten.tsv")) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): ordinal_exceptions = None higher_powers_of_ten = None PYNINI_AVAILABLE = False class FractionFst(GraphFst): """ Finite state transducer for classifying fraction
from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, NEMO_SPACE, GraphFst, delete_preserve_order, ) from nemo_text_processing.text_normalization.es.graph_utils import shift_cardinal_gender, strip_cardinal_apocope from nemo_text_processing.text_normalization.es.utils import get_abs_path try: import pynini from pynini.lib import pynutil fem = pynini.string_file( (get_abs_path("data/money/currency_plural_fem.tsv"))) masc = pynini.string_file( (get_abs_path("data/money/currency_plural_masc.tsv"))) fem_singular = pynini.project(fem, "input") masc_singular = pynini.project(masc, "input") fem_plural = pynini.project(fem, "output") masc_plural = pynini.project(masc, "output") PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): fem_plural = None masc_plural = None
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels try: import pynini from pynini.lib import pynutil common_domains = [ x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv")) ] symbols = [ x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv")) ] PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): common_domains = None symbols = None PYNINI_AVAILABLE = False class ElectronicFst(GraphFst):
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """
    Assemble the time classification graph.

    Args:
        cardinal: GraphFst whose `graph` attribute turns the hour/minute/second digits into words
        deterministic: if True, ` preserve_order: true` is appended to every match; if False,
            alternatives that first pass hours and minutes through the hour_to_24 / hour_to_12 /
            minute_to tables and end with a `style` tag ("1" or "2") are added as well
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    def _field(label, graph):
        # Emits `label: "` before and `"` after whatever `graph` produces.
        return pynutil.insert(label + ": \"") + graph + pynutil.insert("\"")

    separator = pynutil.delete(pynini.union(".", ":"))

    # Every "un" / "ún" in the cardinal output is rewritten to "una".
    to_una = pynini.cdrewrite(pynini.string_map([("un", "una"), ("ún", "una")]), "", "", NEMO_SIGMA)
    spoken = cardinal.graph @ to_una

    respace = delete_space + insert_space
    day_suffix = respace + _field("suffix", suffix)
    drop_h = respace + pynutil.delete("h")
    drop_min = respace + pynutil.delete("min")
    drop_s = respace + pynutil.delete("s")

    # Both clock systems can appear; the twelve hour one needs am/pm to resolve ambiguity.
    hours_24 = list(map(str, range(0, 25)))
    hours_12 = list(map(str, range(1, 13)))
    minutes_one_digit = list(map(str, range(1, 10)))
    minutes_two_digit = list(map(str, range(10, 60)))

    strip_leading_zero = pynini.closure(pynutil.delete("0") | (NEMO_DIGIT - "0"), 0, 1) + NEMO_DIGIT
    up_to_two_digits = pynini.closure(NEMO_DIGIT, 1, 2) @ strip_leading_zero

    graph_24 = up_to_two_digits @ pynini.union(*hours_24)
    graph_12 = up_to_two_digits @ pynini.union(*hours_12)

    minute_single = strip_leading_zero @ pynini.union(*minutes_one_digit)
    minute_double = pynini.union(*minutes_two_digit)
    any_minute = pynini.union(minute_single, minute_double)
    spoken_minute = any_minute @ spoken

    hour_24 = _field("hours", graph_24 @ spoken)
    hour_12 = _field("hours", graph_12 @ spoken)
    minute = _field("minutes", spoken_minute)
    # Seconds accept the same digit range as minutes.
    second = _field("seconds", spoken_minute)

    hour_only_24 = hour_24 + drop_h
    hour_only_12 = hour_12 + day_suffix

    zone_optional = pynini.closure(
        respace + pynutil.insert("zone: \"") + time_zone_graph + pynutil.insert("\""), 0, 1,
    )

    # "00" minutes are dropped; "00" seconds are emitted as an explicit zero.
    minutes_or_none = pynutil.delete("00") | (insert_space + minute)
    seconds_or_zero = pynini.cross("00", " seconds: \"0\"") | (insert_space + second)
    delimited_seconds_optional = pynini.closure(separator + seconds_or_zero, 0, 1)
    h_optional = pynini.closure(drop_h, 0, 1)

    # 02.30 h
    graph_hm = (
        hour_24
        + separator
        + minutes_or_none
        + delimited_seconds_optional  # seconds, e.g. 2.30.35 h
        + h_optional  # 2.30 is valid if unambiguous
        + zone_optional
    )

    # 2 h 30 min
    graph_hm |= (
        hour_24
        + drop_h
        + delete_space
        + minutes_or_none
        + drop_min
        + pynini.closure(delete_space + seconds_or_zero + drop_s, 0, 1)  # seconds
        + zone_optional
    )

    # 2.30 a. m. (Only for 12 hour clock)
    graph_hm |= (
        hour_12
        + separator
        + minutes_or_none
        + delimited_seconds_optional  # seconds, e.g. 2.30.35 a. m.
        + day_suffix
        + zone_optional
    )

    # Hour-only forms always carry a time indicator; otherwise they are left to cardinals.
    graph_h = pynini.union(hour_only_24, hour_only_12) + zone_optional

    if not deterministic:
        # Alternate vocalization (hour menos min, min para hour): the times are shifted
        # and a `style` tag is attached.
        shift_24 = pynini.invert(pynini.string_file(get_abs_path("data/time/hour_to_24.tsv")))
        shift_12 = pynini.invert(pynini.string_file(get_abs_path("data/time/hour_to_12.tsv")))
        minute_shift = pynini.string_file(get_abs_path("data/time/minute_to.tsv"))

        hour_to_24 = _field("hours", graph_24 @ shift_24 @ spoken)
        hour_to_12 = _field("hours", graph_12 @ shift_12 @ spoken)
        minute_to = _field("minutes", any_minute @ minute_shift @ spoken)

        style_menos = pynutil.insert(" style: \"1\"")
        style_para = pynutil.insert(" style: \"2\"")
        style = style_menos | style_para

        # 02.30 h (omitting seconds since a bit awkward)
        graph_hm |= hour_to_24 + separator + insert_space + minute_to + h_optional + zone_optional + style

        # 2 h 30 min
        graph_hm |= (
            hour_to_24 + drop_h + delete_space + insert_space + minute_to + drop_min + zone_optional + style
        )

        # 2.30 a. m. (Only for 12 hour clock)
        graph_hm |= hour_to_12 + separator + insert_space + minute_to + day_suffix + zone_optional + style

    classified = graph_hm | graph_h
    if deterministic:
        classified = classified + pynutil.insert(" preserve_order: true")
    classified = self.add_tokens(classified.optimize())
    self.fst = classified.optimize()
# limitations under the License. from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, GraphFst, delete_space, insert_space, ) from nemo_text_processing.text_normalization.es.utils import get_abs_path try: import pynini from pynini.lib import pynutil time_zone_graph = pynini.string_file( get_abs_path("data/time/time_zone.tsv")) suffix = pynini.string_file(get_abs_path("data/time/time_suffix.tsv")) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): time_zone_graph = None suffix = None PYNINI_AVAILABLE = False class TimeFst(GraphFst): """ Finite state transducer for classifying time, e.g. "02:15 est" -> time { hours: "dos" minutes: "quince" zone: "e s t"}
def _load_roman(file: str):
    """
    Compile a string map from the label pairs stored in `file`.

    The path is resolved with `get_abs_path`. Each loaded pair is included once as
    written and once with its first element upper-cased; both map to the same second element.

    Args:
        file: path of the label file, passed to `get_abs_path`

    Returns:
        the result of `pynini.string_map` over the as-written pairs followed by the
        upper-cased pairs
    """
    as_written = []
    upper_cased = []
    for numeral, value in load_labels(get_abs_path(file)):
        as_written.append((numeral, value))
        upper_cased.append((numeral.upper(), value))
    return pynini.string_map(as_written + upper_cased)
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, NEMO_SPACE from nemo_text_processing.text_normalization.es import LOCALIZATION from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels try: import pynini from pynini.lib import pynutil digits = pynini.project( pynini.string_file(get_abs_path("data/numbers/digit.tsv")), "input") tens = pynini.project( pynini.string_file(get_abs_path("data/numbers/ties.tsv")), "input") teens = pynini.project( pynini.string_file(get_abs_path("data/numbers/teen.tsv")), "input") twenties = pynini.project( pynini.string_file(get_abs_path("data/numbers/twenties.tsv")), "input") hundreds = pynini.project( pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")), "input") accents = pynini.string_map([("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u")]) if LOCALIZATION == "am": # Setting localization for central and northern america formatting cardinal_separator = pynini.string_map([",", NEMO_SPACE]) decimal_separator = pynini.accep(".")