Example #1
    def __init__(self,
                 deterministic: bool = True,
                 cache_dir: str = None,
                 overwrite_cache: bool = False):
        super().__init__(name="verbalize_final",
                         kind="verbalize",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(
                cache_dir,
                f"de_tn_{deterministic}_deterministic_verbalizer.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["verbalize"]
            logging.info(
                f'VerbalizeFinalFst graph was restored from {far_file}.')
        else:
            verbalize = VerbalizeFst(deterministic=deterministic).fst
            word = WordFst(deterministic=deterministic).fst

            types = verbalize | word
            graph = (pynutil.delete("tokens") + delete_space +
                     pynutil.delete("{") + delete_space + types +
                     delete_space + pynutil.delete("}"))
            graph = delete_space + pynini.closure(
                graph + delete_extra_space) + graph + delete_space

            self.fst = graph.optimize()
            if far_file:
                generator_main(far_file, {"verbalize": self.fst})
                logging.info(
                    f"VerbalizeFinalFst grammars are saved to {far_file}.")
Example #2
    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify", kind="classify")

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_en_itn.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info(f"Creating ClassifyFst grammars.")
            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal)
            decimal_graph = decimal.fst

            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
            date_graph = DateFst(ordinal=ordinal).fst
            word_graph = WordFst().fst
            time_graph = TimeFst().fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
            whitelist_graph = WhiteListFst().fst
            punct_graph = PunctuationFst().fst
            electronic_graph = ElectronicFst().fst
            telephone_graph = TelephoneFst(cardinal).fst

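            # Union of all semiotic-class taggers. The weights are costs in the
            # tropical semiring, so lower values are preferred during shortest-path
            # decoding; the generic word tagger (100) acts as a fallback.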
            classify = (pynutil.add_weight(whitelist_graph, 1.01)
                        | pynutil.add_weight(time_graph, 1.1)
                        | pynutil.add_weight(date_graph, 1.09)
                        | pynutil.add_weight(decimal_graph, 1.1)
                        | pynutil.add_weight(measure_graph, 1.1)
                        | pynutil.add_weight(cardinal_graph, 1.1)
                        | pynutil.add_weight(ordinal_graph, 1.1)
                        | pynutil.add_weight(money_graph, 1.1)
                        | pynutil.add_weight(telephone_graph, 1.1)
                        | pynutil.add_weight(electronic_graph, 1.1)
                        | pynutil.add_weight(word_graph, 100))

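            # Wrap every classified span in a `tokens { ... }` block; punctuation
            # is tokenized separately and may attach before or after a token,
            # separated by a single inserted space.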
            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                      token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Example #3
    def __init__(
        self,
        input_case: str,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        deterministic: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f"Creating ClassifyFst grammars.")

            word_graph = WordFst(deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(input_case=input_case, deterministic=deterministic).fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

            classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(word_graph, 100)

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                | (pynutil.insert(" ") + punct),
                1,
            )
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )

            graph = (
                token_plus_punct
                + pynini.closure(
                    (
                        pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                        | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                    )
                    + token_plus_punct
                ).optimize()
            )

            graph = delete_space + graph + delete_space
            graph |= punct

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Example #4
    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "en_tn_post_processing.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
            logging.info(
                f'Post processing graph was restored from {far_file}.')
        else:
            self.set_punct_dict()
            self.fst = self.get_punct_postprocess_graph()

            if far_file:
                generator_main(far_file, {"post_process_graph": self.fst})
Example #5
def export_grammars(output_dir, grammars):
    """
    Exports tokenize_and_classify and verbalize FSTs as OpenFst finite state archive (FAR) files.

    Args:
        output_dir: directory to export FAR files to. Subdirectories will be created for tagger and verbalizer respectively.
        grammars: grammars to be exported
    """

    for category, graphs in grammars.items():
        out_dir = os.path.join(output_dir, category)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
            time.sleep(1)
        if category == "classify":
            category = "tokenize_and_classify"
        generator_main(f"{out_dir}/{category}.far", graphs)
Example #6
    def __init__(
        self,
        input_case: str,
        deterministic: bool = False,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir, f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info(f"Creating ClassifyFst grammars. This might take some time...")

            self.cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = self.cardinal.fst

            self.ordinal = OrdinalFst(cardinal=self.cardinal, deterministic=deterministic)
            ordinal_graph = self.ordinal.fst

            self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
            decimal_graph = self.decimal.fst

            self.fraction = FractionFst(cardinal=self.cardinal, deterministic=deterministic)
            fraction_graph = self.fraction.fst
            self.measure = MeasureFst(
                cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic
            )
            measure_graph = self.measure.fst
            self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic)
            date_graph = self.date.fst
            word_graph = WordFst(deterministic=deterministic).fst
            self.time = TimeFst(deterministic=deterministic)
            time_graph = self.time.fst
            self.telephone = TelephoneFst(cardinal=self.cardinal, deterministic=deterministic)
            telephone_graph = self.telephone.fst
            self.electronic = ElectronicFst(deterministic=deterministic)
            electronic_graph = self.electronic.fst
            self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
            money_graph = self.money.fst
            self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
            whitelist_graph = self.whitelist.fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(decimal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(word_graph, 100)
            )

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )

            graph = token_plus_punct + pynini.closure(pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Example #7
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            no_digits = pynini.closure(pynini.difference(
                NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(
                'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            deterministic_ordinal = OrdinalFst(cardinal=cardinal,
                                               deterministic=True)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            whitelist = WhiteListFst(input_case=input_case,
                                     deterministic=deterministic,
                                     input_file=whitelist)
            whitelist_graph = whitelist.graph
            punct_graph = PunctuationFst(deterministic=deterministic).graph
            serial_graph = SerialFst(cardinal=cardinal,
                                     ordinal=deterministic_ordinal,
                                     deterministic=deterministic).fst

            # VERBALIZERS
            cardinal = vCardinal(deterministic=deterministic)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=deterministic)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=deterministic)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=deterministic).fst
            v_electronic_graph = vElectronic(deterministic=deterministic).fst
            measure = vMeasure(decimal=decimal,
                               cardinal=cardinal,
                               fraction=fraction,
                               deterministic=deterministic)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=deterministic).fst
            v_date_graph = vDate(ordinal=ordinal,
                                 deterministic=deterministic).fst
            v_money_graph = vMoney(decimal=decimal,
                                   deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_abbreviation = vAbbreviation(deterministic=deterministic).fst

            det_v_time_graph = vTime(deterministic=True).fst
            det_v_date_graph = vDate(ordinal=vOrdinal(deterministic=True),
                                     deterministic=True).fst
            time_final = pynini.compose(time_graph, det_v_time_graph)
            date_final = pynini.compose(date_graph, det_v_date_graph)
            range_graph = RangeFst(time=time_final,
                                   date=date_final,
                                   cardinal=CardinalFst(deterministic=True),
                                   deterministic=deterministic).fst
            v_word_graph = vWord(deterministic=deterministic).fst

            sem_w = 1
            word_w = 100
            punct_w = 2
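            # Costs in the tropical semiring: semiotic classes (sem_w = 1) are
            # strongly preferred over the plain-word fallback (word_w = 100),
            # with punctuation (punct_w = 2) in between.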
            classify_and_verbalize = (
                pynutil.add_weight(whitelist_graph, sem_w)
                | pynutil.add_weight(pynini.compose(time_graph, v_time_graph),
                                     sem_w)
                | pynutil.add_weight(
                    pynini.compose(decimal_graph, v_decimal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(measure_graph, v_measure_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(cardinal_graph, v_cardinal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(ordinal_graph, v_ordinal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(telephone_graph, v_telephone_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(electronic_graph, v_electronic_graph),
                    sem_w)
                | pynutil.add_weight(
                    pynini.compose(fraction_graph, v_fraction_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(money_graph, v_money_graph), sem_w)
                | pynutil.add_weight(word_graph, word_w)
                | pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                                     sem_w - 0.01)
                | pynutil.add_weight(pynini.compose(range_graph, v_word_graph),
                                     sem_w)
                | pynutil.add_weight(
                    pynini.compose(serial_graph, v_word_graph),
                    1.1001)  # should be higher than the rest of the classes
            ).optimize()

            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(roman_graph, v_roman_graph), word_w)

                abbreviation_graph = AbbreviationFst(
                    whitelist=whitelist, deterministic=deterministic).fst
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(abbreviation_graph, v_abbreviation), word_w)

            punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                               delete_extra_space)
                | (pynutil.insert(" ") + punct_only),
                1,
            )

            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                classify_and_verbalize +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(
                (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                                delete_extra_space)
                 | (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
                token_plus_punct)

            graph |= punct_only + pynini.closure(punct)
            graph = delete_space + graph + delete_space

            remove_extra_spaces = pynini.closure(
                NEMO_NOT_SPACE,
                1) + pynini.closure(delete_extra_space +
                                    pynini.closure(NEMO_NOT_SPACE, 1))
            remove_extra_spaces |= (
                pynini.closure(pynutil.delete(" "), 1) +
                pynini.closure(NEMO_NOT_SPACE, 1) +
                pynini.closure(delete_extra_space +
                               pynini.closure(NEMO_NOT_SPACE, 1)))

            graph = pynini.compose(graph.optimize(),
                                   remove_extra_spaces).optimize()
            self.fst = graph
            no_digits = pynini.closure(pynini.difference(
                NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(graph, no_digits).optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')
Example #8
def get_alternative_formats():
    """
    Utils to get alternative formats for numbers.
    """
    one_alternatives = load_labels(
        get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    one_thousand_map = []
    for k in one_alternatives:
        default, alternative = k
        one_thousand_map.append((alternative.split()[1], alternative))
    one_thousand_map = pynini.string_map(one_thousand_map)

    one_thousand_alternative = pynini.cdrewrite(one_thousand_map, "[BOS]", "",
                                                NEMO_SIGMA)

    t = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    separators = (pynutil.add_weight(t['dot_thousands'], 0.1)
                  | pynutil.add_weight(t['no_delimiter'], -0.1)
                  | pynutil.add_weight(t['space_thousands'], 0.1))
    alternative_formats = {}
    alternative_formats['one_thousand_alternative'] = one_thousand_alternative
    alternative_formats['separators'] = separators
    return alternative_formats


if __name__ == '__main__':
    from nemo_text_processing.text_normalization.en.graph_utils import generator_main

    numbers = get_number_names()
    for k, v in numbers.items():
        generator_main(f'{k}.far', {k: v})
Example #9
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f"Creating ClassifyFst grammars.")
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).fst
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(input_case=input_case,
                                           deterministic=deterministic,
                                           input_file=whitelist).fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst
            serial_graph = SerialFst(cardinal=cardinal,
                                     ordinal=ordinal,
                                     deterministic=deterministic).fst

            v_time_graph = vTimeFst(deterministic=deterministic).fst
            v_ordinal_graph = vOrdinalFst(deterministic=deterministic)
            v_date_graph = vDateFst(ordinal=v_ordinal_graph,
                                    deterministic=deterministic).fst
            time_final = pynini.compose(time_graph, v_time_graph)
            date_final = pynini.compose(date_graph, v_date_graph)
            range_graph = RangeFst(time=time_final,
                                   date=date_final,
                                   cardinal=cardinal,
                                   deterministic=deterministic).fst

            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.1)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.1)
                | pynutil.add_weight(range_graph, 1.1)
                | pynutil.add_weight(
                    serial_graph,
                    1.1001)  # should be higher than the rest of the classes
            )

            # roman_graph = RomanFst(deterministic=deterministic).fst
            # classify |= pynutil.add_weight(roman_graph, 1.1)

            if not deterministic:
                abbreviation_graph = AbbreviationFst(
                    deterministic=deterministic).fst
                classify |= pynutil.add_weight(abbreviation_graph, 100)

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=2.1) + pynutil.insert(" }")
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                               delete_extra_space)
                | (pynutil.insert(" ") + punct),
                1,
            )

            classify |= pynutil.add_weight(word_graph, 100)
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(
                (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                                delete_extra_space)
                 | (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
                token_plus_punct)

            graph = delete_space + graph + delete_space
            graph |= punct

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Example #10
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(
                'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            whitelist = WhiteListFst(input_case=input_case,
                                     deterministic=deterministic,
                                     input_file=whitelist)
            whitelist_graph = whitelist.graph
            punct_graph = PunctuationFst(deterministic=deterministic).graph

            # VERBALIZERS
            cardinal = vCardinal(deterministic=deterministic)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=deterministic)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=deterministic)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=deterministic).fst
            v_electronic_graph = vElectronic(deterministic=deterministic).fst
            measure = vMeasure(decimal=decimal,
                               cardinal=cardinal,
                               fraction=fraction,
                               deterministic=deterministic)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=deterministic).fst
            v_date_graph = vDate(ordinal=ordinal,
                                 deterministic=deterministic).fst
            v_money_graph = vMoney(decimal=decimal,
                                   deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_abbreviation = vAbbreviation(deterministic=deterministic).fst

            classify_and_verbalize = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(pynini.compose(time_graph, v_time_graph),
                                     1.1)
                | pynutil.add_weight(
                    pynini.compose(decimal_graph, v_decimal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(measure_graph, v_measure_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(cardinal_graph, v_cardinal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(ordinal_graph, v_ordinal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(telephone_graph, v_telephone_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(electronic_graph, v_electronic_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(fraction_graph, v_fraction_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(money_graph, v_money_graph), 1.1)
                | pynutil.add_weight(word_graph, 100)
                | pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                                     1.09)).optimize()

            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(roman_graph, v_roman_graph), 100)

                abbreviation_graph = AbbreviationFst(
                    whitelist=whitelist, deterministic=deterministic).fst
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(abbreviation_graph, v_abbreviation), 100)

            punct = pynutil.add_weight(punct_graph, weight=1.1)
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                classify_and_verbalize +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                      token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()
            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')
Example #11
    def __init__(self,
                 input_case: str,
                 deterministic: bool = True,
                 cache_dir: str = None,
                 overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f"Creating ClassifyFst grammars.")
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).fst
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(input_case=input_case,
                                           deterministic=deterministic).fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

            classify = (pynutil.add_weight(whitelist_graph, 1.01)
                        | pynutil.add_weight(time_graph, 1.1)
                        | pynutil.add_weight(date_graph, 1.09)
                        | pynutil.add_weight(decimal_graph, 1.1)
                        | pynutil.add_weight(measure_graph, 1.1)
                        | pynutil.add_weight(cardinal_graph, 1.1)
                        | pynutil.add_weight(ordinal_graph, 1.1)
                        | pynutil.add_weight(money_graph, 1.1)
                        | pynutil.add_weight(telephone_graph, 1.1)
                        | pynutil.add_weight(electronic_graph, 1.1)
                        | pynutil.add_weight(fraction_graph, 1.1)
                        | pynutil.add_weight(word_graph, 100))

            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify |= pynutil.add_weight(roman_graph, 100)

                abbreviation_graph = AbbreviationFst(
                    deterministic=deterministic).fst
                classify |= pynutil.add_weight(abbreviation_graph, 100)

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                      token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Example #12
    def __init__(self,
                 input_case: str,
                 deterministic: bool = False,
                 cache_dir: str = None,
                 overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)
        if deterministic:
            raise ValueError(
                'Ru TN only supports non-deterministic cases and produces multiple normalization options.'
            )
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_ru_tn_{deterministic}_deterministic.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info(
                f"Creating ClassifyFst grammars. This might take some time...")
            number_names = get_number_names()
            alternative_formats = get_alternative_formats()

            self.cardinal = CardinalFst(
                number_names=number_names,
                alternative_formats=alternative_formats,
                deterministic=deterministic)
            cardinal_graph = self.cardinal.fst

            self.ordinal = OrdinalFst(number_names=number_names,
                                      alternative_formats=alternative_formats,
                                      deterministic=deterministic)
            ordinal_graph = self.ordinal.fst

            self.decimal = DecimalFst(cardinal=self.cardinal,
                                      deterministic=deterministic)
            decimal_graph = self.decimal.fst

            self.measure = MeasureFst(cardinal=self.cardinal,
                                      decimal=self.decimal,
                                      deterministic=deterministic)
            measure_graph = self.measure.fst
            self.date = DateFst(number_names=number_names,
                                deterministic=deterministic)
            date_graph = self.date.fst
            word_graph = WordFst(deterministic=deterministic).fst
            self.time = TimeFst(number_names=number_names,
                                deterministic=deterministic)
            time_graph = self.time.fst
            self.telephone = TelephoneFst(number_names=number_names,
                                          deterministic=deterministic)
            telephone_graph = self.telephone.fst
            self.electronic = ElectronicFst(deterministic=deterministic)
            electronic_graph = self.electronic.fst
            self.money = MoneyFst(cardinal=self.cardinal,
                                  decimal=self.decimal,
                                  deterministic=deterministic)
            money_graph = self.money.fst
            self.whitelist = WhiteListFst(input_case=input_case,
                                          deterministic=deterministic)
            whitelist_graph = self.whitelist.fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

            classify = (pynutil.add_weight(whitelist_graph, 1.01)
                        | pynutil.add_weight(time_graph, 1.1)
                        | pynutil.add_weight(date_graph, 1.09)
                        | pynutil.add_weight(decimal_graph, 1.1)
                        | pynutil.add_weight(measure_graph, 0.9)
                        | pynutil.add_weight(cardinal_graph, 1.1)
                        | pynutil.add_weight(ordinal_graph, 1.1)
                        | pynutil.add_weight(money_graph, 1.1)
                        | pynutil.add_weight(telephone_graph, 1.1)
                        | pynutil.add_weight(electronic_graph, 1.1)
                        | pynutil.add_weight(word_graph, 100))

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(
                pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Example #13
    def __init__(self,
                 cache_dir: str = None,
                 overwrite_cache: bool = False,
                 deterministic: bool = True):
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_de_itn.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info(f"Creating ClassifyFst grammars.")
            tn_cardinal_tagger = TNCardinalTagger(deterministic=False)
            tn_date_tagger = TNDateTagger(cardinal=tn_cardinal_tagger,
                                          deterministic=False)
            tn_decimal_tagger = TNDecimalTagger(cardinal=tn_cardinal_tagger,
                                                deterministic=False)
            tn_ordinal_verbalizer = TNOrdinalVerbalizer(deterministic=False)
            tn_fraction_verbalizer = TNFractionVerbalizer(
                ordinal=tn_ordinal_verbalizer, deterministic=False)
            tn_time_verbalizer = TNTimeVerbalizer(
                cardinal_tagger=tn_cardinal_tagger, deterministic=False)
            tn_date_verbalizer = TNDateVerbalizer(
                ordinal=tn_ordinal_verbalizer, deterministic=False)
            tn_electronic_tagger = TNElectronicTagger(deterministic=False)
            tn_electronic_verbalizer = TNElectronicVerbalizer(
                deterministic=False)
            tn_whitelist_tagger = TNWhitelistTagger(input_case="cased",
                                                    deterministic=False)

            cardinal = CardinalFst(tn_cardinal_tagger=tn_cardinal_tagger)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(itn_cardinal_tagger=cardinal,
                                 tn_ordinal_verbalizer=tn_ordinal_verbalizer)
            ordinal_graph = ordinal.fst
            decimal = DecimalFst(itn_cardinal_tagger=cardinal,
                                 tn_decimal_tagger=tn_decimal_tagger)
            decimal_graph = decimal.fst

            fraction = FractionFst(
                itn_cardinal_tagger=cardinal,
                tn_fraction_verbalizer=tn_fraction_verbalizer)
            fraction_graph = fraction.fst

            measure_graph = MeasureFst(itn_cardinal_tagger=cardinal,
                                       itn_decimal_tagger=decimal,
                                       itn_fraction_tagger=fraction).fst
            date_graph = DateFst(itn_cardinal_tagger=cardinal,
                                 tn_date_verbalizer=tn_date_verbalizer,
                                 tn_date_tagger=tn_date_tagger).fst
            word_graph = WordFst().fst
            time_graph = TimeFst(tn_time_verbalizer=tn_time_verbalizer).fst
            money_graph = MoneyFst(itn_cardinal_tagger=cardinal,
                                   itn_decimal_tagger=decimal).fst
            whitelist_graph = WhiteListFst(
                tn_whitelist_tagger=tn_whitelist_tagger).fst
            punct_graph = PunctuationFst().fst
            electronic_graph = ElectronicFst(
                tn_electronic_tagger=tn_electronic_tagger,
                tn_electronic_verbalizer=tn_electronic_verbalizer).fst
            telephone_graph = TelephoneFst(
                tn_cardinal_tagger=tn_cardinal_tagger).fst

            classify = (pynutil.add_weight(cardinal_graph, 1.1)
                        | pynutil.add_weight(whitelist_graph, 1.0)
                        | pynutil.add_weight(time_graph, 1.1)
                        | pynutil.add_weight(date_graph, 1.1)
                        | pynutil.add_weight(decimal_graph, 1.1)
                        | pynutil.add_weight(measure_graph, 1.1)
                        | pynutil.add_weight(ordinal_graph, 1.1)
                        | pynutil.add_weight(fraction_graph, 1.1)
                        | pynutil.add_weight(money_graph, 1.1)
                        | pynutil.add_weight(telephone_graph, 1.1)
                        | pynutil.add_weight(electronic_graph, 1.1)
                        | pynutil.add_weight(word_graph, 100))

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                      token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")