Exemple #1
0
    def __init__(self, deterministic: bool = True):
        """Build the "electronic" classify-kind tagger FST.

        The resulting graph is the union of three shapes:
        a username followed by '@' and a domain, a token followed by one of
        the entries of ``data/electronic/domain.tsv``, and a protocol prefix
        ("http://", "https://", "www." or a scheme followed by "www.")
        followed by a domain.

        Args:
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="electronic", kind="classify",
                         deterministic=deterministic)

        def load_first_column(path):
            # Every line must split into exactly two tab-separated fields
            # (the unpacking raises ValueError otherwise); only the first
            # field is turned into an acceptor.
            with open(path, 'r', encoding='utf-8') as handle:
                fields = (row.split('\t') for row in handle)
                return [pynini.accep(first) for first, _ in fields]

        symbol_acceptors = load_first_column(
            get_abs_path("data/electronic/symbols.tsv"))
        common_domain_acceptors = load_first_column(
            get_abs_path("data/electronic/domain.tsv"))

        # A token starts with a letter and continues with letters, digits or
        # any symbol listed in symbols.tsv.
        token_tail = NEMO_ALPHA | NEMO_DIGIT | pynini.union(*symbol_acceptors)
        token = NEMO_ALPHA + pynini.closure(token_tail)

        # Full two-column mapping from symbols.tsv, used to rewrite symbols
        # inside the protocol part.
        symbol_map = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).optimize()

        # username: "<token>" and the '@' separator rewritten to a space.
        username = (pynutil.insert("username: \"") + token +
                    pynutil.insert("\"") + pynini.cross('@', ' '))

        # domain: "<token>.<token>"
        dotted_domain = token + pynini.accep('.') + token
        domain_graph = (pynutil.insert("domain: \"") + dotted_domain +
                        pynutil.insert("\""))

        # domain: "<token><entry of domain.tsv>"
        domain_common_graph = (pynutil.insert("domain: \"") + token +
                               pynini.union(*common_domain_acceptors) +
                               pynutil.insert("\""))

        # Accepted protocol prefixes on the input side.
        scheme = pynini.accep("https://") | pynini.accep("http://")
        www = pynini.accep("www.")
        protocol_input = scheme | www | (scheme + www)

        # Character-by-character rewrite of the prefix: letters pass through,
        # symbols (and ':' -> "colon") are mapped with a -0.1 weight, and a
        # space is inserted after every unit.
        weighted_symbols = pynutil.add_weight(
            symbol_map | pynini.cross(":", "colon"), -0.1)
        spell_out = pynini.closure((NEMO_ALPHA | weighted_symbols) +
                                   pynutil.insert(" "))

        protocol = (pynutil.insert("protocol: \"") +
                    pynini.compose(protocol_input, spell_out) +
                    pynutil.insert("\""))

        email = username + domain_graph
        url = protocol + pynutil.insert(" ") + domain_graph
        graph = email | domain_common_graph | url

        self.fst = self.add_tokens(graph).optimize()
Exemple #2
0
    def __init__(self, cardinal, deterministic: bool = True):
        """Build the "fraction" classify-kind tagger FST.

        Args:
            cardinal: object exposing a ``graph`` FST that is reused for the
                integer part, the numerator and the denominator.
            deterministic: forwarded unchanged to the parent constructor.

        Sets ``self.graph`` (the untokenized fraction graph) and ``self.fst``
        (the same graph passed through ``self.add_tokens`` and optimized).
        """
        super().__init__(name="fraction", kind="classify",
                         deterministic=deterministic)
        number = cardinal.graph

        # integer_part: "<number>"
        integer = (pynutil.insert("integer_part: \"") + number +
                   pynutil.insert("\""))

        # numerator: "<number>" where the slash (with or without surrounding
        # spaces) is rewritten to the closing quote plus a space.
        slash = pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")
        numerator = pynutil.insert("numerator: \"") + number + slash

        # Optional ordinal-style suffix after the denominator, in lower or
        # upper case, which is removed from the output.
        lower_suffixes = ["rd", "th", "st", "nd"]
        suffixes = lower_suffixes + [s.upper() for s in lower_suffixes]
        optional_end = pynini.closure(
            pynini.cross(pynini.union(*suffixes), ""), 0, 1)

        # denominator: "<number>"
        denominator = (pynutil.insert("denominator: \"") + number +
                       optional_end + pynutil.insert("\""))

        fraction = numerator + denominator

        # "<integer> <numerator>/<denominator>" with the integer part optional.
        spelled_out = pynini.closure(integer + pynini.accep(" "), 0,
                                     1) + fraction

        # Inputs listed in fraction.tsv are first rewritten by that mapping and
        # then parsed as a fraction; here the space after the integer part may
        # be absent from the input, in which case it is inserted.
        optional_integer = pynini.closure(
            integer + (pynini.accep(" ") | pynutil.insert(" ")), 0, 1)
        from_table = pynini.compose(
            pynini.string_file(get_abs_path("data/number/fraction.tsv")),
            fraction)

        self.graph = spelled_out | (optional_integer + from_table)
        self.fst = self.add_tokens(self.graph).optimize()
Exemple #3
0
 def _get_minor_currencies(file):
     """Return one ``pynutil.insert`` FST per non-blank line of a data file.

     Args:
         file: path handed to ``get_abs_path`` to locate the file.

     Returns:
         A list of FSTs, in file order, each inserting the whitespace-stripped
         text of one line.
     """
     # Decode explicitly as UTF-8 so the result does not depend on the
     # platform's default locale encoding.
     with open(get_abs_path(file), 'r', encoding='utf-8') as f:
         names = (line.strip() for line in f)
         # Blank lines are skipped: inserting an empty string would add an
         # "insert nothing" alternative to whatever union the caller builds.
         return [pynutil.insert(name) for name in names if name]
Exemple #4
0
    def __init__(self, deterministic: bool = True):
        """Build the "electronic" classify-kind tagger FST.

        The graph accepts an optional ``<username>@`` part followed by a
        domain of the form ``<name>.<letters>`` and emits
        ``username: "..."`` / ``domain: "..."`` fields.

        Args:
            deterministic: forwarded unchanged to the parent constructor.

        Raises:
            ValueError: if a non-empty line of ``symbols.tsv`` does not consist
                of exactly two tab-separated fields.
        """
        super().__init__(name="electronic", kind="classify", deterministic=deterministic)

        accepted_symbols = []
        # Decode explicitly as UTF-8 so that parsing does not depend on the
        # platform's default locale encoding.
        with open(get_abs_path("data/electronic/symbols.tsv"), 'r', encoding='utf-8') as f:
            for line in f:
                if not line.rstrip('\r\n'):
                    # Tolerate empty lines (e.g. a trailing one) instead of
                    # failing on the tuple unpacking below.
                    continue
                # Only the first column (the written symbol) is needed here.
                symbol, _ = line.split('\t')
                accepted_symbols.append(pynini.accep(symbol))

        # username: starts with a letter, continues with letters, digits or
        # listed symbols; the '@' separator is rewritten to a space.
        username = (
            pynutil.insert("username: \"")
            + NEMO_ALPHA
            + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.union(*accepted_symbols))
            + pynutil.insert("\"")
            + pynini.cross('@', ' ')
        )
        # domain: letter, then letters/digits/hyphens, a dot, and at least one
        # letter after the dot.
        domain_graph = (
            NEMO_ALPHA
            + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep('-'))
            + pynini.accep('.')
            + pynini.closure(NEMO_ALPHA, 1)
        )
        domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")
        # The username part is optional, so a bare domain is accepted too.
        graph = pynini.closure(username, 0, 1) + domain_graph

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
Exemple #5
0
    def __init__(self, deterministic: bool = True):
        """Build the "electronic" classify-kind tagger FST.

        The resulting graph is the union of three shapes: an e-mail address
        (username, '@', domain), a token followed by a common domain from
        ``data/electronic/domain.tsv`` with an optional trailing part, and a
        protocol prefix (``http://``, ``https://``, ``file:///``, ``www.``)
        followed by a domain.

        Args:
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="electronic",
                         kind="classify",
                         deterministic=deterministic)

        # Input sides only: which symbols / common domains may appear.
        accepted_symbols = pynini.project(
            pynini.string_file(get_abs_path("data/electronic/symbol.tsv")),
            "input")
        accepted_common_domains = pynini.project(
            pynini.string_file(get_abs_path("data/electronic/domain.tsv")),
            "input")
        # A token starts with a letter and continues with letters, digits or
        # accepted symbols.
        all_accepted_symbols = NEMO_ALPHA + pynini.closure(NEMO_ALPHA
                                                           | NEMO_DIGIT
                                                           | accepted_symbols)
        # Full mapping from symbol.tsv, used to rewrite symbols inside the
        # protocol part.
        graph_symbols = pynini.string_file(
            get_abs_path("data/electronic/symbol.tsv")).optimize()

        username = (pynutil.insert("username: \"") + all_accepted_symbols +
                    pynutil.insert("\"") + pynini.cross('@', ' '))
        domain_graph = all_accepted_symbols + pynini.accep(
            '.') + all_accepted_symbols
        # ':' is a colon, so it is verbalized as "colon" (it was previously
        # mapped to "semicolon" by mistake). A space follows every unit.
        protocol_symbols = pynini.closure((graph_symbols
                                           | pynini.cross(":", "colon")) +
                                          pynutil.insert(" "))
        protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross(
            "http", "HTTP ")) + (pynini.accep("://") @ protocol_symbols)
        protocol_file_start = pynini.accep("file") + insert_space + (
            pynini.accep(":///") @ protocol_symbols)

        # `@` binds tighter than `+`; the parentheses only make the existing
        # grouping explicit.
        protocol_end = pynini.cross(
            "www", "WWW ") + (pynini.accep(".") @ protocol_symbols)
        protocol = protocol_file_start | protocol_start | protocol_end | (
            protocol_start + protocol_end)

        # Exclude from the domain graphs every string that begins with
        # something the protocol graph accepts on its input side (followed by
        # NEMO_SIGMA), so a protocol prefix is not absorbed into the domain.
        domain_graph = (
            pynutil.insert("domain: \"") +
            pynini.difference(domain_graph,
                              pynini.project(protocol, "input") + NEMO_SIGMA) +
            pynutil.insert("\""))
        domain_common_graph = (
            pynutil.insert("domain: \"") + pynini.difference(
                all_accepted_symbols + accepted_common_domains +
                pynini.closure(
                    accepted_symbols +
                    pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols),
                    0, 1),
                pynini.project(protocol, "input") + NEMO_SIGMA,
            ) + pynutil.insert("\""))

        protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert(
            "\"")
        # email
        graph = username + domain_graph
        # abc.com, abc.com/123-sm
        graph |= domain_common_graph
        # www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad
        graph |= protocol + pynutil.insert(" ") + domain_graph

        final_graph = self.add_tokens(graph)

        self.fst = final_graph.optimize()
Exemple #6
0
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, TO_UPPER, GraphFst, get_abs_path

# Optional-dependency guard: pynini is imported here and the module-level FSTs
# are built only if that import succeeds. PYNINI_AVAILABLE records the outcome.
try:
    import pynini
    from pynini.lib import pynutil

    # Deletes a single space from the input.
    delete_space = pynutil.delete(" ")
    # Mappings loaded from the tab-separated data files.
    quantities = pynini.string_file(get_abs_path("data/number/thousand.tsv"))
    quantities_abbr = pynini.string_file(
        get_abs_path("data/number/quantity_abbr.tsv"))
    # Also accept inputs that reach an abbreviation after passing through
    # TO_UPPER (presumably a case-mapping FST from graph_utils - confirm).
    quantities_abbr |= TO_UPPER @ quantities_abbr

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Without pynini the names defined in the try body stay undefined; callers
    # are expected to check this flag first.
    PYNINI_AVAILABLE = False


def get_quantity(decimal: 'pynini.FstLike',
                 cardinal_up_to_hundred: 'pynini.FstLike',
                 include_abbr: bool) -> 'pynini.FstLike':
    """
    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
    e.g. 1 million -> integer_part: "one" quantity: "million"
Exemple #7
0
    def __init__(self, decimal: GraphFst, deterministic: bool = True):
        """Build the "money" verbalize-kind FST.

        Args:
            decimal: graph object whose ``numbers`` FST verbalizes the amount
                and, in the non-deterministic branch, whose ``integer`` FST
                verbalizes the integer part.
            deterministic: when True only the basic "<amount> <currency>" graph
                is built; when False, singular/plural major and minor currency
                variants are unioned in as well.
        """
        super().__init__(name="money", kind="verbalize",
                         deterministic=deterministic)

        # currency: "<text>"  ->  <text>
        unit = (pynutil.delete("currency:") + delete_space +
                pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) +
                pynutil.delete("\""))
        # Basic form: the amount, a single space, then the currency text.
        graph = decimal.numbers + delete_space + pynutil.insert(" ") + unit

        if not deterministic:
            # In this branch the currency field is mapped through the currency
            # data files here, rather than copied through verbatim.

            def unquote_currency(inner):
                # Strip the `currency: "` prefix and the closing quote around
                # whatever `inner` produces.
                return (pynutil.delete("currency: \"") + inner +
                        pynutil.delete("\""))

            drop_preserve_order = pynutil.delete("preserve_order: True")

            # Major currency in singular form and, via SINGULAR_TO_PLURAL, in
            # plural form; minor currency forms come from their own files.
            major_names = pynini.string_file(
                get_abs_path("data/currency/currency.tsv"))
            major_plural = unquote_currency(
                pynini.compose(major_names, SINGULAR_TO_PLURAL))
            major_singular = unquote_currency(major_names)
            minor_singular = unquote_currency(
                pynini.string_file(
                    get_abs_path("data/currency/currency_minor_singular.tsv")))
            minor_plural = unquote_currency(
                pynini.string_file(
                    get_abs_path("data/currency/currency_minor_plural.tsv")))

            # Integer part verbalized as exactly "one" takes the singular major
            # currency; any other integer part takes the plural form.
            int_is_one = pynini.compose(decimal.integer, pynini.accep("one"))
            int_not_one = pynini.compose(
                decimal.integer,
                pynini.difference(NEMO_SIGMA, pynini.accep("one")))
            singular_amount = (int_is_one + delete_space + insert_space +
                               major_singular + delete_space +
                               drop_preserve_order)
            plural_amount = (int_not_one + delete_space + insert_space +
                             major_plural + delete_space +
                             drop_preserve_order)
            graph_integer = singular_amount | plural_amount

            # Fractional part that is none of the listed "one" spellings;
            # paired with the plural minor currency below.
            fraction_not_one = (
                pynutil.delete("fractional_part: \"") +
                pynini.difference(
                    pynini.closure(NEMO_NOT_QUOTE),
                    pynini.union("oh one", "o one", "zero one", "one")) +
                pynutil.delete("\""))

            # Fractional part "o one" / "oh one" / "zero one": the leading
            # zero word is dropped, "one" is kept and the singular minor
            # currency is used.
            fraction_singular = (
                delete_space +
                pynutil.delete("fractional_part: \"" +
                               pynini.union("o ", "oh ", "zero ")) +
                pynini.accep("one") + pynutil.delete("\"") + delete_space +
                insert_space + minor_singular)
            fraction_plural = (delete_space + fraction_not_one +
                               delete_space + insert_space + minor_plural)

            # Integer amount followed by an optional inserted "and " and the
            # fractional amount, for both the singular and plural minor forms.
            integer_then = (graph_integer + delete_space + insert_space +
                            pynini.closure(pynutil.insert("and "), 0, 1))
            with_integer = ((integer_then + fraction_singular) |
                            (integer_then + fraction_plural))
            # The fractional amount may also stand alone, without an integer
            # part.
            graph_decimal_with_minor = with_integer | (fraction_singular |
                                                       fraction_plural)

            # Restrict the basic graph to inputs made only of letters, the
            # listed punctuation and whitespace, so input still containing
            # other characters cannot take that path.
            allowed_input = pynini.closure(NEMO_ALPHA | ":" | "\"" | "{" | "}"
                                           | "_" | NEMO_WHITE_SPACE)
            graph = pynini.compose(allowed_input, graph)
            graph = graph | (graph_integer | graph_decimal_with_minor)

        self.fst = self.delete_tokens(graph).optimize()