def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """
    Assemble the "measure" classifier on top of the cardinal and decimal taggers.

    Three shapes are accepted; each is wrapped by ``add_tokens`` afterwards:
        * ``decimal { ... }`` followed by ``units: "..."``
        * ``cardinal { integer: "..." }`` followed by ``units: "..."``
        * the cardinal shape followed by a trailing ``fractional_part: "..."``
    An optional leading "âm"/"trừ" is emitted as ``negative: "true"``.

    Args:
        cardinal: tagger providing ``graph_no_exception``
        decimal: tagger providing ``final_graph_wo_negative``
    """
    super().__init__(name="measure", kind="classify")

    number_graph = cardinal.graph_no_exception

    # Single tokens allowed as the fraction that trails the unit.
    digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    four = pynini.cross("tư", "4")
    one = pynini.cross("mốt", "1")
    half = pynini.cross("rưỡi", "5")

    # singular -> abbr
    spoken_to_abbr = pynini.invert(
        pynini.string_file(get_abs_path("data/measurements.tsv")))

    optional_negative = pynini.closure(
        pynutil.insert("negative: ")
        + pynini.cross(pynini.union("âm", "trừ"), '"true"')
        + delete_extra_space,
        0,
        1,
    )

    # A plain unit, a "trên <unit>" part rendered as "/<unit>", or both in a row
    # (the combined form carries a small extra weight).
    plain_unit = convert_space(spoken_to_abbr)
    per_unit = (
        pynutil.insert("/")
        + pynutil.delete("trên")
        + delete_space
        + convert_space(spoken_to_abbr)
    )
    combined_unit = pynutil.add_weight(plain_unit + delete_space + per_unit, 0.01)
    units_field = (
        pynutil.insert('units: "')
        + (plain_unit | per_unit | combined_unit)
        + pynutil.insert('"')
    )

    decimal_branch = (
        pynutil.insert("decimal { ")
        + optional_negative
        + decimal.final_graph_wo_negative
        + pynutil.insert(" }")
        + delete_extra_space
        + units_field
    )

    # Opening of the cardinal message, shared by both cardinal shapes.
    cardinal_open = (
        pynutil.insert("cardinal { ")
        + optional_negative
        + pynutil.insert('integer: "')
        + number_graph
    )
    cardinal_plain = (
        cardinal_open
        + pynutil.insert('"')
        + pynutil.insert(" }")
        + delete_extra_space
        + units_field
    )

    trailing_fraction = (
        delete_extra_space
        + pynutil.insert('fractional_part: "')
        + (digit | half | one | four)
        + pynutil.insert('"')
    )
    cardinal_with_fraction = (
        cardinal_open
        + pynutil.insert('" }')
        + delete_extra_space
        + units_field
        + trailing_fraction
    )

    cardinal_branch = cardinal_plain | cardinal_with_fraction
    tagged = self.add_tokens(decimal_branch | cardinal_branch)
    self.fst = tagged.optimize()
def __init__(self):
    """
    Build the "electronic" classifier for spoken e-mail addresses and URLs.

    Two branches are unioned and wrapped by ``add_tokens``:
        * e-mail: ``username: "..."``, one of "a còng" / "a móc" / "a vòng"
          (deleted), then ``domain: "..."``
        * URL: ``protocol: "..."`` holding an optional scheme, "www", and the
          dotted remainder

    Characters are spoken one at a time separated by single spaces, which are
    deleted; "chấm" is rewritten to ".".
    """
    super().__init__(name="electronic", kind="classify")

    # Local override: inside this grammar exactly one literal space is removed.
    delete_extra_space = pynutil.delete(" ")

    alpha_num = (
        NEMO_ALPHA
        | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    )
    symbols = pynini.string_file(
        get_abs_path("data/electronic/symbols.tsv")).invert()
    accepted_username = alpha_num | symbols
    process_dot = pynini.cross("chấm", ".")

    # The username is transduced from the spoken characters: it starts and
    # ends with an alphanumeric and may contain symbols in between.
    # Previously this was a bare insert of a fixed masked literal that
    # consumed no input, so the e-mail branch could never match a username.
    username = (
        pynutil.insert('username: "')
        + alpha_num
        + delete_extra_space
        + pynini.closure(accepted_username + delete_extra_space)
        + alpha_num
        + pynutil.insert('"')
    )

    single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
    server = single_alphanum | pynini.string_file(
        get_abs_path("data/electronic/server_name.tsv"))
    domain = single_alphanum | pynini.string_file(
        get_abs_path("data/electronic/domain.tsv"))

    # One or more "chấm <domain>" parts, e.g. ".a.b".
    multi_domain = (
        pynini.closure(
            process_dot + delete_extra_space + domain + delete_extra_space)
        + process_dot
        + delete_extra_space
        + domain
    )
    domain_graph = (
        pynutil.insert('domain: "')
        + server
        + delete_extra_space
        + multi_domain
        + pynutil.insert('"')
    )

    graph = (
        username
        + delete_extra_space
        + pynutil.delete(pynini.union("a còng", "a móc", "a vòng"))
        + insert_space
        + delete_extra_space
        + domain_graph
    )

    # ---- url ----
    protocol_end = pynini.cross(pynini.union("w w w", "www"), "www")
    protocol_start = (
        pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")
    ) + pynini.cross(" hai chấm sẹc sẹc ", "://")

    # .com,
    ending = (
        delete_extra_space
        + symbols
        + delete_extra_space
        + (
            domain
            | pynini.closure(accepted_username + delete_extra_space)
            + accepted_username
        )
    )

    protocol = (
        pynini.closure(protocol_start, 0, 1)
        + protocol_end
        + delete_extra_space
        + process_dot
        + pynini.closure(delete_extra_space + accepted_username, 1)
        + pynini.closure(ending, 1, 2)
    )
    protocol = pynutil.insert('protocol: "') + protocol + pynutil.insert('"')
    graph |= protocol

    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
def _get_month_graph():
    """
    Load the month mapping stored in ``data/months.tsv``.

    Returns:
        the optimized FST compiled from that file
    """
    months_path = get_abs_path("data/months.tsv")
    return pynini.string_file(months_path).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """
    Build the "money" classifier.

    An amount is either an ``integer_part: "..."`` taken from the cardinal
    graph or the decimal tagger's unsigned graph. It is followed by
    ``currency: "..."`` and an optional ``fractional_part: "..."``.

    Args:
        cardinal: tagger providing ``graph_no_exception``
        decimal: tagger providing ``final_graph_wo_negative``
    """
    super().__init__(name="money", kind="classify")
    # quantity, integer_part, fractional_part, currency

    amount = cardinal.graph_no_exception
    decimal_amount = decimal.final_graph_wo_negative
    half = pynini.cross("rưỡi", "5")

    spoken_currency = pynini.invert(
        pynini.string_file(get_abs_path("data/currency.tsv")))
    currency_field = (
        pynutil.insert('currency: "')
        + convert_space(spoken_currency)
        + pynutil.insert('"')
    )

    # Two digits pass through unchanged; a lone digit gets a "0" in front.
    pad_to_two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (
        pynutil.insert("0") + NEMO_DIGIT)

    # twelve dollars fifty, only after integer
    cents_value = pynutil.add_weight(amount @ pad_to_two_digits, -0.7) | half
    optional_cents = pynini.closure(
        delete_extra_space
        + pynutil.insert('fractional_part: "')
        + cents_value
        + pynutil.insert('"'),
        0,
        1,
    )

    integer_branch = (
        pynutil.insert('integer_part: "')
        + amount
        + pynutil.insert('"')
        + delete_extra_space
        + currency_field
        + optional_cents
    )
    decimal_branch = (
        decimal_amount + delete_extra_space + currency_field + optional_cents
    )

    tagged = self.add_tokens(integer_branch | decimal_branch)
    self.fst = tagged.optimize()
def __init__(self):
    """
    Build the "telephone" classifier.

    The number is a run of at least two spoken digits followed by one final
    digit. The final digit may also be "mốt" (1), "tư" (4) or "lăm" (5).
    The result is emitted as ``number_part: "..."``.
    """
    super().__init__(name="telephone", kind="classify")

    zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    nonzero = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    any_digit = nonzero | zero

    # Alternate readings are accepted only in the final position.
    final_digit = (
        any_digit
        | pynini.cross("mốt", "1")
        | pynini.cross("tư", "4")
        | pynini.cross("lăm", "5")
    )

    digits = pynini.closure(any_digit + delete_space, 2) + final_digit
    number_field = (
        pynutil.insert('number_part: "') + digits + pynutil.insert('"')
    )

    self.fst = self.add_tokens(number_field).optimize()
def __init__(self):
    """
    Build the "whitelist" classifier.

    The mapping in ``data/whitelist.tsv`` is inverted and its result is
    emitted as ``name: "..."``. Unlike the other taggers, the graph is not
    passed through ``add_tokens``.
    """
    super().__init__(name="whitelist", kind="classify")

    entries = pynini.string_file(get_abs_path("data/whitelist.tsv"))
    inverted_entries = entries.invert()

    name_field = (
        pynutil.insert('name: "')
        + convert_space(inverted_entries)
        + pynutil.insert('"')
    )
    self.fst = name_field.optimize()
def __init__(self):
    """
    Build the "ordinal" classifier.

    Input is the marker "thứ" (removed) followed by an entry of
    ``data/ordinals/digit.tsv``; the result is emitted as ``integer: "..."``.
    The bare mapping is also exposed as ``self.graph``.
    """
    super().__init__(name="ordinal", kind="classify")

    self.graph = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
    drop_marker = pynini.cross("thứ", "")

    integer_field = (
        pynutil.insert('integer: "')
        + drop_marker
        + delete_space
        + self.graph
        + pynutil.insert('"')
    )

    self.fst = self.add_tokens(integer_field).optimize()
def __init__(self, cardinal: GraphFst):
    """
    Build the "decimal" classifier.

    A decimal is an optional ``integer_part: "..."``, the separator word
    "chấm" or "phẩy" (deleted), and a ``fractional_part: "..."`` read digit by
    digit. ``get_quantity`` adds the variants that carry a quantity word, and
    an optional leading "âm"/"trừ" is emitted as ``negative: "true"``.

    Attributes set:
        graph: the digit-by-digit fractional graph
        final_graph_wo_negative: unsigned decimal, with or without quantity

    Args:
        cardinal: tagger providing ``graph_no_exception`` and
            ``graph_hundred_component_at_least_one_none_zero_digit``
    """
    super().__init__(name="decimal", kind="classify")

    integer_graph = cardinal.graph_no_exception

    single_digit = graph_digit | pynini.string_file(
        get_abs_path("data/numbers/zero.tsv"))
    one = pynini.cross("mốt", "1")
    four = pynini.cross("tư", "4")
    five = pynini.cross("lăm", "5")

    # One digit, a lone "tư", or several digits whose last one may use an
    # alternate reading.
    fraction_digits = pynini.union(
        single_digit,
        four,
        pynini.closure(single_digit + delete_space, 1)
        + (single_digit | four | five | one),
    )
    self.graph = fraction_digits

    point = pynutil.delete("chấm") | pynutil.delete("phẩy")

    optional_negative = pynini.closure(
        pynutil.insert("negative: ")
        + pynini.cross(pynini.union("âm", "trừ"), '"true"')
        + delete_extra_space,
        0,
        1,
    )

    fractional_field = (
        pynutil.insert('fractional_part: "')
        + fraction_digits
        + pynutil.insert('"')
    )
    integer_field = (
        pynutil.insert('integer_part: "')
        + integer_graph
        + pynutil.insert('"')
    )

    unsigned = (
        pynini.closure(integer_field + delete_extra_space, 0, 1)
        + point
        + delete_extra_space
        + fractional_field
    )

    self.final_graph_wo_negative = unsigned | get_quantity(
        unsigned,
        cardinal.graph_hundred_component_at_least_one_none_zero_digit,
    )

    signed = optional_negative + unsigned
    signed |= optional_negative + get_quantity(
        unsigned,
        cardinal.graph_hundred_component_at_least_one_none_zero_digit,
    )

    self.fst = self.add_tokens(signed).optimize()
def __init__(self):
    """
    Build the "cardinal" classifier.

    Attributes set:
        graph_hundred_component_at_least_one_none_zero_digit: three-digit
            group whose output contains at least one non-zero digit
        graph_no_exception: the full number graph, leading zeros stripped
        graph: ``graph_no_exception`` minus the inputs of the single-digit
            and zero tables
        fst: ``graph`` wrapped as ``integer: "..."`` with an optional
            ``negative: "-"`` prefix for a leading "âm"/"trừ"
    """
    super().__init__(name="cardinal", kind="classify")

    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

    # Alternate readings of single digits.
    graph_one = pynini.cross("mốt", "1")
    graph_four = pynini.cross("tư", "4")
    graph_five = pynini.cross("lăm", "5")
    graph_half = pynini.cross("rưỡi", "5")

    # Scale / filler words.
    drop_hundred = pynini.cross("trăm", "")
    drop_ten = pynini.cross("mươi", "")
    linh = pynini.cross(pynini.union("linh", "lẻ"), "0")

    optional_ten = pynini.closure(delete_space + drop_ten, 0, 1)
    last_digit = graph_digit | graph_one | graph_four | graph_five

    # Pieces shared by the "with hundreds" and "without hundreds" forms.
    ties_and_unit = graph_ties + optional_ten + (
        (delete_space + last_digit) | pynutil.insert("0"))
    linh_and_unit = linh + delete_space + (graph_digit | graph_four)

    # "<digit> trăm" followed by a tens/units part ("00" when nothing follows).
    with_hundreds = (
        (graph_digit | graph_zero)
        + delete_space
        + drop_hundred
        + delete_space
        + pynini.union(
            graph_teen,
            ties_and_unit,
            (graph_half | graph_four | graph_one) + pynutil.insert("0"),
            linh_and_unit,
            graph_digit,
            pynutil.insert("00"),
        )
    )
    # No hundreds spoken: a "0" is inserted for the hundreds position.
    without_hundreds = (
        pynutil.insert("0")
        + delete_space
        + pynini.union(
            graph_teen,
            ties_and_unit,
            linh_and_unit,
            graph_digit,
        )
    )
    graph_hundred_component = with_hundreds | without_hundreds

    nonzero_hundreds = graph_hundred_component @ (
        pynini.closure(NEMO_DIGIT)
        + (NEMO_DIGIT - "0")
        + pynini.closure(NEMO_DIGIT)
    )
    self.graph_hundred_component_at_least_one_none_zero_digit = nonzero_hundreds

    delete_thousand_word = pynutil.delete(pynini.union("nghìn", "ngàn"))

    # Every scale group is either spoken or filled with weighted zeros.
    graph_thousands = pynini.union(
        nonzero_hundreds + delete_space + delete_thousand_word,
        pynutil.insert("000", weight=0.1),
    )
    graph_ten_thousand = pynini.union(
        nonzero_hundreds + delete_space + pynutil.delete("vạn"),
        pynutil.insert("0000", weight=0.1),
    )
    graph_ten_thousand_suffix = pynini.union(
        graph_digit + delete_space + delete_thousand_word,
        pynutil.insert("0", weight=0.1),
    )
    graph_million = pynini.union(
        nonzero_hundreds + delete_space + pynutil.delete("triệu"),
        pynutil.insert("000", weight=0.1),
    )
    graph_billion = pynini.union(
        nonzero_hundreds
        + delete_space
        + pynutil.delete(pynini.union("tỉ", "tỷ")),
        pynutil.insert("000", weight=0.1),
    )

    billions_layout = (
        graph_billion
        + delete_space
        + graph_million
        + delete_space
        + graph_thousands
        + delete_space
        + graph_hundred_component
    )
    ten_thousands_layout = (
        graph_ten_thousand
        + delete_space
        + graph_ten_thousand_suffix
        + delete_space
        + graph_hundred_component
    )
    # A thousand group followed by a single digit, which is padded with "00".
    short_thousands_layout = (
        nonzero_hundreds
        + delete_space
        + delete_thousand_word
        + delete_space
        + ((last_digit + pynutil.insert("00")) | graph_hundred_component)
    )

    graph = pynini.union(
        billions_layout,
        ten_thousands_layout,
        short_thousands_layout,
        graph_zero,
    )

    # Strip leading zeros; a bare "0" passes through unchanged.
    graph = graph @ pynini.union(
        pynutil.delete(pynini.closure("0"))
        + pynini.difference(NEMO_DIGIT, "0")
        + pynini.closure(NEMO_DIGIT),
        "0",
    )

    # don't convert cardinals from zero to nine inclusive
    graph_exception = pynini.project(
        pynini.union(graph_digit, graph_zero), 'input')

    self.graph_no_exception = graph
    self.graph = (
        pynini.project(graph, "input") - graph_exception.arcsort()
    ) @ graph

    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ")
        + pynini.cross(pynini.union("âm", "trừ"), '"-"')
        + NEMO_SPACE,
        0,
        1,
    )

    integer_field = (
        optional_minus_graph
        + pynutil.insert('integer: "')
        + self.graph
        + pynutil.insert('"')
    )
    self.fst = self.add_tokens(integer_field).optimize()
def __init__(self):
    """
    Build the "time" classifier.

    Accepted shapes (each wrapped by ``add_tokens``):
        * hours; hours + minutes; hours + minutes + seconds, each optionally
          followed by ``zone: "..."``
        * minutes + seconds
        * "<hour> giờ kém <minutes>", with both values mapped through the
          ``hours_to`` / ``minutes_to`` tables

    "rưỡi" is accepted as "30" in the minutes slot of the hour forms and in
    the seconds slot of the minutes + seconds form.
    """
    super().__init__(name="time", kind="classify")
    # hours, minutes, seconds, suffix, zone, style, speak_period

    def quoted(prefix, body):
        # Wrap *body* between an inserted field prefix and a closing quote.
        return pynutil.insert(prefix) + body + pynutil.insert('"')

    hours_to = pynini.string_file(get_abs_path("data/time/hours_to.tsv"))
    minutes_to = pynini.string_file(get_abs_path("data/time/minutes_to.tsv"))
    hours = pynini.string_file(get_abs_path("data/time/hours.tsv"))
    minutes = pynini.string_file(get_abs_path("data/time/minutes.tsv"))
    zones = pynini.invert(
        pynini.string_file(get_abs_path("data/time/time_zone.tsv")))

    half = pynini.cross("rưỡi", "30")
    drop_hour_word = pynini.cross("giờ", "")
    drop_minute_word = pynini.cross("phút", "")
    drop_second_word = pynini.cross("giây", "")
    optional_minute_word = pynini.closure(delete_space + drop_minute_word, 0, 1)

    hour_only = quoted('hours: "', hours) + delete_space + drop_hour_word

    minute_value = minutes + optional_minute_word
    second_value = minute_value + delete_space + drop_second_word

    optional_zone = pynini.closure(
        delete_space + insert_space + quoted('zone: "', convert_space(zones)),
        0,
        1,
    )

    hour_minute = (
        hour_only
        + delete_extra_space
        + quoted('minutes: "', minute_value | half)
    )
    hour_minute_second = (
        hour_minute + delete_extra_space + quoted('seconds: "', second_value)
    )
    minute_second = (
        quoted('minutes: "', minute_value)
        + delete_extra_space
        + quoted('seconds: "', second_value | half)
    )

    # "kém" form: both values are re-mapped by the *_to tables.
    hours_before = hours @ hours_to
    minutes_before = minutes @ minutes_to
    time_to = (
        quoted('hours: "', hours_before)
        + delete_space
        + drop_hour_word
        + delete_space
        + pynutil.delete("kém")
        + delete_extra_space
        + quoted('minutes: "', minutes_before + optional_minute_word)
    )

    final_graph = (hour_only | hour_minute | hour_minute_second) + optional_zone
    final_graph |= minute_second
    final_graph |= time_to

    self.fst = self.add_tokens(final_graph).optimize()
NEMO_DIGIT, GraphFst, delete_extra_space, delete_space, ) from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path try: import pynini from pynini.lib import pynutil PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): PYNINI_AVAILABLE = False graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike': """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, e.g. một triệu -> integer_part: "1" quantity: "triệu" e.g. một tỷ rưỡi -> integer_part: "1" fractional_part: "5" quantity: "tỷ" Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ numbers = cardinal_up_to_hundred @ (pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") +
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_extra_space, delete_space from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path try: import pynini from pynini.lib import pynutil graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")).optimize() graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize() graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).optimize() ties_graph = pynini.string_file(get_abs_path("data/numbers/ties.tsv")).optimize() PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): graph_teen = None graph_digit = None graph_zero = None ties_graph = None PYNINI_AVAILABLE = True def _get_month_graph():
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_extra_space, delete_space
from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path

try:
    import pynini
    from pynini.lib import pynutil

    # Module-level number tables, compiled once at import time.
    graph_teen = pynini.string_file(
        get_abs_path("data/numbers/teen.tsv")).optimize()
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv")).optimize()
    graph_zero = pynini.string_file(
        get_abs_path("data/numbers/zero.tsv")).optimize()
    ties_graph = pynini.string_file(
        get_abs_path("data/numbers/ties.tsv")).optimize()

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Placeholders so the module-level names exist when pynini is missing.
    graph_teen = None
    graph_digit = None
    graph_zero = None
    ties_graph = None

    # The import failed, so the flag must report pynini as unavailable.
    # It was previously set to True here, which made the flag always True.
    PYNINI_AVAILABLE = False