def __init__(self, deterministic: bool = True):
    """Build the tagger FST for electronic tokens (emails, domains, URLs).

    Three shapes are unioned into the final graph:
      * ``username@domain`` -> ``username: "..." domain: "..."``
      * a word followed by a suffix listed in ``data/electronic/domain.tsv``
      * a protocol prefix (``http://``, ``https://``, ``www.`` or a scheme
        followed by ``www.``) followed by a domain

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="electronic", kind="classify", deterministic=deterministic)

    def get_input_symbols(path):
        """Return one acceptor per non-empty first-column entry of the TSV at ``path``."""
        acceptors = []
        with open(path, 'r', encoding='utf-8') as tsv:
            for line in tsv:
                # Only the first column is needed. partition() tolerates rows with a
                # missing or extra tab, where `symbol, _ = line.split('\t')` raised.
                symbol = line.rstrip('\r\n').partition('\t')[0]
                if symbol:  # skip blank lines instead of crashing on them
                    acceptors.append(pynini.accep(symbol))
        return acceptors

    symbol_acceptors = get_input_symbols(get_abs_path("data/electronic/symbols.tsv"))
    accepted_common_domains = get_input_symbols(get_abs_path("data/electronic/domain.tsv"))

    # A "word": one letter, then any mix of letters, digits and listed symbols.
    accepted_symbols = NEMO_ALPHA + pynini.closure(
        NEMO_ALPHA | NEMO_DIGIT | pynini.union(*symbol_acceptors)
    )
    # Full transducer loaded from the same symbols file (both columns).
    graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize()

    # The '@' separator is rewritten to a single space after the closing quote.
    username = (
        pynutil.insert("username: \"")
        + accepted_symbols
        + pynutil.insert("\"")
        + pynini.cross('@', ' ')
    )

    domain_graph = accepted_symbols + pynini.accep('.') + accepted_symbols
    domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")

    domain_common_graph = (
        pynutil.insert("domain: \"")
        + accepted_symbols
        + pynini.union(*accepted_common_domains)
        + pynutil.insert("\"")
    )

    protocol_start = pynini.accep("https://") | pynini.accep("http://")
    # Letters pass through; symbol rewrites (and ':' -> "colon") carry a -0.1 weight.
    # Every emitted unit is followed by an inserted space.
    protocol_symbols = pynini.closure(
        (NEMO_ALPHA | pynutil.add_weight(graph_symbols | pynini.cross(":", "colon"), -0.1))
        + pynutil.insert(" ")
    )
    protocol_end = pynini.accep("www.")
    protocol = protocol_start | protocol_end | (protocol_start + protocol_end)
    protocol = pynini.compose(protocol, protocol_symbols)
    protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")

    graph = username + domain_graph
    graph |= domain_common_graph
    graph |= protocol + pynutil.insert(" ") + domain_graph

    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal, deterministic: bool = True):
    """Build the tagger FST for fractions and expose it as ``self.graph`` / ``self.fst``.

    The graph accepts an optional integer part followed by ``numerator/denominator``
    (with or without spaces around the slash) and emits the ``integer_part``,
    ``numerator`` and ``denominator`` fields.

    Args:
        cardinal: object whose ``graph`` attribute verbalizes each number.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="fraction", kind="classify", deterministic=deterministic)

    number = cardinal.graph
    closing_quote = pynutil.insert("\"")

    # Optional suffix after the denominator (lower- or upper-case); removed from the output.
    lower_suffixes = ("rd", "th", "st", "nd")
    suffixes = [*lower_suffixes, *(s.upper() for s in lower_suffixes)]
    drop_suffix = pynini.closure(pynini.cross(pynini.union(*suffixes), ""), 0, 1)

    whole = pynutil.insert("integer_part: \"") + number + closing_quote
    # The slash closes the numerator's quote and leaves one separating space.
    slash = pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")
    top = pynutil.insert("numerator: \"") + number + slash
    bottom = pynutil.insert("denominator: \"") + number + drop_suffix + closing_quote
    ratio = top + bottom

    # Rewrite table applied before the numerator/denominator grammar
    # (presumably maps single-glyph fractions to "a/b" - confirm against the data file).
    spelled = pynini.string_file(get_abs_path("data/number/fraction.tsv"))

    whole_then_space = whole + pynini.accep(" ")
    # For table-driven input the space after the integer may be absent and is inserted.
    whole_then_any_space = whole + (pynini.accep(" ") | pynutil.insert(" "))

    graph = pynini.closure(whole_then_space, 0, 1) + ratio
    graph |= pynini.closure(whole_then_any_space, 0, 1) + pynini.compose(spelled, ratio)

    self.graph = graph
    self.fst = self.add_tokens(self.graph).optimize()
def _get_minor_currencies(file):
    """Return one ``pynutil.insert`` FST per line of a minor-currency list file.

    Args:
        file: path of the list file, resolved through ``get_abs_path``.

    Returns:
        A list with one inserter per line, built from the line with surrounding
        whitespace stripped.
    """
    # Decode explicitly as UTF-8: without it the platform locale decides the
    # encoding, and any non-ASCII entry can be misread or fail to decode.
    with open(get_abs_path(file), 'r', encoding='utf-8') as f:
        return [pynutil.insert(line.strip()) for line in f]
def __init__(self, deterministic: bool = True):
    """Build the tagger FST for emails and bare domains.

    Accepts an optional ``username@`` part followed by a domain of the form
    ``name.tld`` and emits ``username: "..." domain: "..."`` (or the domain
    field alone).

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="electronic", kind="classify", deterministic=deterministic)

    accepted_symbols = []
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(get_abs_path("data/electronic/symbols.tsv"), 'r', encoding='utf-8') as f:
        for line in f:
            # Only the first column is needed. partition() tolerates rows with a
            # missing or extra tab, where `symbol, _ = line.split('\t')` raised.
            symbol = line.rstrip('\r\n').partition('\t')[0]
            if symbol:  # skip blank lines instead of crashing on them
                accepted_symbols.append(pynini.accep(symbol))

    # One letter, then letters / digits / listed symbols; '@' becomes a single space.
    username = (
        pynutil.insert("username: \"")
        + NEMO_ALPHA
        + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.union(*accepted_symbols))
        + pynutil.insert("\"")
        + pynini.cross('@', ' ')
    )

    # name ('-' allowed after the first letter) + '.' + at least one letter
    domain_graph = (
        NEMO_ALPHA
        + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep('-'))
        + pynini.accep('.')
        + pynini.closure(NEMO_ALPHA, 1)
    )
    domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")

    graph = pynini.closure(username, 0, 1) + domain_graph

    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True):
    """Build the tagger FST for emails, common-suffix domains and URLs.

    Three shapes are unioned into the final graph:
      * ``username@domain``
      * a word followed by a suffix from ``data/electronic/domain.tsv``,
        optionally followed by a symbol and further characters
      * a protocol prefix (``file:///``, ``http(s)://``, ``www.``) followed by a domain

    Domains whose input begins with a protocol prefix are excluded from the
    plain-domain graphs so that such input is only tagged through the
    protocol branch.

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="electronic", kind="classify", deterministic=deterministic)

    # Input sides only: the sets of accepted symbols and common domain suffixes.
    accepted_symbols = pynini.project(
        pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input"
    )
    accepted_common_domains = pynini.project(
        pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input"
    )
    all_accepted_symbols = NEMO_ALPHA + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
    # Full transducer (both columns) used to rewrite symbols inside the protocol part.
    graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()

    username = (
        pynutil.insert("username: \"")
        + all_accepted_symbols
        + pynutil.insert("\"")
        + pynini.cross('@', ' ')
    )
    domain_graph = all_accepted_symbols + pynini.accep('.') + all_accepted_symbols

    # ':' is a colon. The previous "semicolon" output verbalized it incorrectly.
    protocol_symbols = pynini.closure(
        (graph_symbols | pynini.cross(":", "colon")) + pynutil.insert(" ")
    )
    protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross("http", "HTTP ")) + (
        pynini.accep("://") @ protocol_symbols
    )
    protocol_file_start = pynini.accep("file") + insert_space + (
        pynini.accep(":///") @ protocol_symbols
    )
    # `@` binds tighter than `+`; the parentheses only make the original grouping explicit.
    protocol_end = pynini.cross("www", "WWW ") + (pynini.accep(".") @ protocol_symbols)
    protocol = protocol_file_start | protocol_start | protocol_end | (protocol_start + protocol_end)

    # Anything that starts with a protocol prefix must not match as a plain domain.
    starts_with_protocol = pynini.project(protocol, "input") + NEMO_SIGMA

    domain_graph = (
        pynutil.insert("domain: \"")
        + pynini.difference(domain_graph, starts_with_protocol)
        + pynutil.insert("\"")
    )
    domain_common_graph = (
        pynutil.insert("domain: \"")
        + pynini.difference(
            all_accepted_symbols
            + accepted_common_domains
            + pynini.closure(
                accepted_symbols + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols), 0, 1
            ),
            starts_with_protocol,
        )
        + pynutil.insert("\"")
    )

    protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")

    # email
    graph = username + domain_graph
    # abc.com, abc.com/123-sm
    graph |= domain_common_graph
    # www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad
    graph |= protocol + pynutil.insert(" ") + domain_graph

    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, TO_UPPER, GraphFst, get_abs_path try: import pynini from pynini.lib import pynutil delete_space = pynutil.delete(" ") quantities = pynini.string_file(get_abs_path("data/number/thousand.tsv")) quantities_abbr = pynini.string_file( get_abs_path("data/number/quantity_abbr.tsv")) quantities_abbr |= TO_UPPER @ quantities_abbr PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): PYNINI_AVAILABLE = False def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike', include_abbr: bool) -> 'pynini.FstLike': """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, e.g. 1 million -> integer_part: "one" quantity: "million"
def __init__(self, decimal: GraphFst, deterministic: bool = True):
    """
    Build the money verbalizer FST and store it in ``self.fst``.

    Args:
        decimal: GraphFst whose ``numbers`` and ``integer`` graphs verbalize the amount
        deterministic: forwarded to the base class; when False, additional paths that
            rewrite the currency field through the ``data/currency`` tables are
            unioned into the graph
    """
    super().__init__(name="money", kind="verbalize", deterministic=deterministic)
    # remove the `currency: "` ... `"` wrapper and keep the NEMO_NOT_QUOTE run in between
    unit = (pynutil.delete("currency:") + delete_space + pynutil.delete("\"") +
            pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
    # default path: the amount, one inserted space, then the currency text
    graph = decimal.numbers + delete_space + pynutil.insert(" ") + unit

    if not deterministic:
        # For non-deterministic case, the currency symbol was not changed in the tagger, so here we need to
        # create a transducer to replace the currency symbol with the correct spoken equivalent

        # the graph matches a fractional part that is NOT `.01` (any quoted content except the
        # listed spellings of "one"); the `.01` case needs the singular minor currency instead
        fractional_non_one = (
            pynutil.delete("fractional_part: \"") + pynini.difference(
                pynini.closure(NEMO_NOT_QUOTE),
                pynini.union("oh one", "o one", "zero one", "one")) +
            pynutil.delete("\""))
        # the `preserve_order: True` marker is consumed and produces no output
        preserve_order = pynutil.delete("preserve_order: True")

        # Create units graph for major and minor currencies in both singular and plural forms
        unit_major_sing = pynini.string_file(
            get_abs_path("data/currency/currency.tsv"))
        # plural form is derived by composing the singular table with SINGULAR_TO_PLURAL;
        # this must be built before `unit_major_sing` is wrapped below
        unit_major_plural = (
            pynutil.delete("currency: \"") +
            pynini.compose(unit_major_sing, SINGULAR_TO_PLURAL) +
            pynutil.delete("\""))
        unit_major_sing = pynutil.delete(
            "currency: \"") + unit_major_sing + pynutil.delete("\"")
        unit_minor_sing = pynini.string_file(
            get_abs_path("data/currency/currency_minor_singular.tsv"))
        unit_minor_sing = pynutil.delete(
            "currency: \"") + unit_minor_sing + pynutil.delete("\"")
        unit_minor_plural = pynini.string_file(
            get_abs_path("data/currency/currency_minor_plural.tsv"))
        unit_minor_plural = pynutil.delete(
            "currency: \"") + unit_minor_plural + pynutil.delete("\"")

        # for the integer part of the money graph, find cases where the integer part is one;
        # this is needed to add a singular currency value, e.g. `$1` -> `one dollar` not `one dollars`
        integer_one = pynini.compose(decimal.integer, pynini.accep("one"))
        # graph for integer values that are not `1`, we need to use plural currency form for such cases
        integer_not_one = pynini.compose(
            decimal.integer,
            pynini.difference(NEMO_SIGMA, pynini.accep("one")))
        graph_integer = integer_one + delete_space + insert_space + unit_major_sing + delete_space + preserve_order
        graph_integer |= (integer_not_one + delete_space + insert_space +
                          unit_major_plural + delete_space + preserve_order)

        # find when the fractional part is equal to `.01` -> to use singular form of the minor currency;
        # the field label and the leading "o "/"oh "/"zero " are deleted, only "one" is kept
        fractional_part_sing = (
            delete_space +
            pynutil.delete("fractional_part: \"" + pynini.union("o ", "oh ", "zero ")) +
            pynini.accep("one") + pynutil.delete("\"") + delete_space +
            insert_space + unit_minor_sing)
        # verbalize money values with .01 in the fractional part and use singular form of the minor currency
        # e.g. '$12.01' -> 'twelve dollars (and) one cent'
        graph_decimal_with_minor = (
            graph_integer + delete_space + insert_space +
            pynini.closure(pynutil.insert("and "), 0, 1) +
            fractional_part_sing)
        fractional_part_plural = (delete_space + fractional_non_one + delete_space +
                                  insert_space + unit_minor_plural)
        # verbalize money values with the fractional part not equal to '.01' and
        # use plural form of the minor currency
        # e.g. '$12.56' -> 'twelve dollars (and) fifty six cents'
        graph_decimal_with_minor |= (
            graph_integer + delete_space + insert_space +
            pynini.closure(pynutil.insert("and "), 0, 1) +
            fractional_part_plural)
        # handle cases when there is no integer part
        graph_decimal_with_minor |= fractional_part_sing | fractional_part_plural

        # to make sure no texts with remaining currency symbol bypass the verbalizer:
        # the default graph only accepts input made of letters, `:`, `"`, `{`, `}`, `_` and whitespace
        graph = pynini.compose(
            pynini.closure(NEMO_ALPHA | ":" | "\"" | "{" | "}" | "_" | NEMO_WHITE_SPACE),
            graph)
        graph |= graph_integer | graph_decimal_with_minor

    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()