Beispiel #1
0
    def __init__(
        self,
        input_case: str,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        deterministic: bool = True,
        whitelist: str = None,
    ):
        """
        Build (or restore from cache) the final tokenize-and-classify FST.

        Args:
            input_case: casing of the input text (forwarded to WhiteListFst)
            cache_dir: directory for caching the compiled .far file; None disables caching
            overwrite_cache: if True, rebuild the grammars even when a cached .far exists
            deterministic: if True, provide a single normalization option
            whitelist: path to a whitelist file; its basename becomes part of the cache key
        """
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

        far_file = None
        # The literal string "None" can arrive from CLI parsing; treat it like None.
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info("ClassifyFst.fst was restored from %s.", far_file)
        else:
            logging.info("Creating ClassifyFst grammars.")

            word_graph = WordFst(deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(input_case=input_case, deterministic=deterministic).fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

            # Lower weight wins: whitelist matches (1) beat the generic word fallback (100).
            classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(word_graph, 100)

            # Punctuation gets its own "tokens { ... }" group.
            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
            # One or more punctuation tokens, each either consuming existing
            # whitespace or getting a single inserted separator space.
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                | (pynutil.insert(" ") + punct),
                1,
            )
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )

            # A sentence is one token group followed by any number of
            # (separator + token group) repetitions.
            graph = (
                token_plus_punct
                + pynini.closure(
                    (
                        pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                        | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                    )
                    + token_plus_punct
                ).optimize()
            )

            graph = delete_space + graph + delete_space
            # Also accept input consisting of punctuation only.
            graph |= punct

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info("ClassifyFst grammars are saved to %s.", far_file)
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        """
        Build (or restore from cache) a single FST that classifies and
        verbalizes input in one pass: each semiotic-class tagger is composed
        with its verbalizer before being unioned into the final graph.

        Args:
            input_case: casing of the input text (forwarded to WhiteListFst)
            deterministic: if True, produce a single normalization option
            cache_dir: directory for caching the compiled .far file; None disables caching
            overwrite_cache: if True, rebuild grammars even when a cached .far exists
            whitelist: path to a whitelist file; its basename becomes part of the cache key
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        # The literal string 'None' can arrive from CLI parsing; treat it like None.
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            # Secondary FST whose output side accepts no digits (all digits verbalized).
            no_digits = pynini.closure(pynini.difference(
                NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(
                f'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            # A separate deterministic ordinal feeds the serial grammar below.
            deterministic_ordinal = OrdinalFst(cardinal=cardinal,
                                               deterministic=True)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            # NOTE(review): uses .graph rather than .fst here — confirm intentional.
            word_graph = WordFst(deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            # Deliberately rebinds the `whitelist` parameter (a file path) to the
            # WhiteListFst instance; the instance is passed to AbbreviationFst below.
            whitelist = WhiteListFst(input_case=input_case,
                                     deterministic=deterministic,
                                     input_file=whitelist)
            whitelist_graph = whitelist.graph
            punct_graph = PunctuationFst(deterministic=deterministic).graph
            serial_graph = SerialFst(cardinal=cardinal,
                                     ordinal=deterministic_ordinal,
                                     deterministic=deterministic).fst

            # VERBALIZERS
            # The tagger locals (cardinal, decimal, ordinal, fraction, measure)
            # are rebound to their verbalizer counterparts from here on.
            cardinal = vCardinal(deterministic=deterministic)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=deterministic)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=deterministic)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=deterministic).fst
            v_electronic_graph = vElectronic(deterministic=deterministic).fst
            measure = vMeasure(decimal=decimal,
                               cardinal=cardinal,
                               fraction=fraction,
                               deterministic=deterministic)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=deterministic).fst
            v_date_graph = vDate(ordinal=ordinal,
                                 deterministic=deterministic).fst
            v_money_graph = vMoney(decimal=decimal,
                                   deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_abbreviation = vAbbreviation(deterministic=deterministic).fst

            # Deterministic time/date verbalizations feed the range grammar.
            det_v_time_graph = vTime(deterministic=True).fst
            det_v_date_graph = vDate(ordinal=vOrdinal(deterministic=True),
                                     deterministic=True).fst
            time_final = pynini.compose(time_graph, det_v_time_graph)
            date_final = pynini.compose(date_graph, det_v_date_graph)
            range_graph = RangeFst(time=time_final,
                                   date=date_final,
                                   cardinal=CardinalFst(deterministic=True),
                                   deterministic=deterministic).fst
            v_word_graph = vWord(deterministic=deterministic).fst

            # Weights: semiotic classes (sem_w) are strongly preferred over the
            # plain-word fallback (word_w); lower weight wins.
            sem_w = 1
            word_w = 100
            punct_w = 2
            classify_and_verbalize = (
                pynutil.add_weight(whitelist_graph, sem_w)
                | pynutil.add_weight(pynini.compose(time_graph, v_time_graph),
                                     sem_w)
                | pynutil.add_weight(
                    pynini.compose(decimal_graph, v_decimal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(measure_graph, v_measure_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(cardinal_graph, v_cardinal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(ordinal_graph, v_ordinal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(telephone_graph, v_telephone_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(electronic_graph, v_electronic_graph),
                    sem_w)
                | pynutil.add_weight(
                    pynini.compose(fraction_graph, v_fraction_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(money_graph, v_money_graph), sem_w)
                | pynutil.add_weight(word_graph, word_w)
                # Dates get a slight preference over other semiotic classes.
                | pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                                     sem_w - 0.01)
                | pynutil.add_weight(pynini.compose(range_graph, v_word_graph),
                                     sem_w)
                | pynutil.add_weight(
                    pynini.compose(serial_graph, v_word_graph),
                    1.1001)  # should be higher than the rest of the classes
            ).optimize()

            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(roman_graph, v_roman_graph), word_w)

                abbreviation_graph = AbbreviationFst(
                    whitelist=whitelist, deterministic=deterministic).fst
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(abbreviation_graph, v_abbreviation), word_w)

            punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
            # One or more punctuation tokens, each either consuming existing
            # whitespace or getting a single inserted separator space.
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                               delete_extra_space)
                | (pynutil.insert(" ") + punct_only),
                1,
            )

            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                classify_and_verbalize +
                                pynini.closure(pynutil.insert(" ") + punct))

            # A sentence: one token group plus any number of separated groups.
            graph = token_plus_punct + pynini.closure(
                (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                                delete_extra_space)
                 | (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
                token_plus_punct)

            # Also accept punctuation-only input.
            graph |= punct_only + pynini.closure(punct)
            graph = delete_space + graph + delete_space

            # Collapse any runs of spaces left over in the output,
            # with or without leading spaces on the input side.
            remove_extra_spaces = pynini.closure(
                NEMO_NOT_SPACE,
                1) + pynini.closure(delete_extra_space +
                                    pynini.closure(NEMO_NOT_SPACE, 1))
            remove_extra_spaces |= (
                pynini.closure(pynutil.delete(" "), 1) +
                pynini.closure(NEMO_NOT_SPACE, 1) +
                pynini.closure(delete_extra_space +
                               pynini.closure(NEMO_NOT_SPACE, 1)))

            graph = pynini.compose(graph.optimize(),
                                   remove_extra_spaces).optimize()
            self.fst = graph
            # Digit-free variant of the final FST (all digits verbalized).
            no_digits = pynini.closure(pynini.difference(
                NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(graph, no_digits).optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')
Beispiel #3
0
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        """
        Build (or restore from cache) the tokenize-and-classify FST that tags
        semiotic tokens and wraps each one in a "tokens { ... }" group.

        Args:
            input_case: casing of the input text (forwarded to WhiteListFst)
            deterministic: if True, produce a single normalization option
            cache_dir: directory for caching the compiled .far file; None disables caching
            overwrite_cache: if True, rebuild grammars even when a cached .far exists
            whitelist: path to a whitelist file; its basename becomes part of the cache key
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        # The literal string "None" can arrive from CLI parsing; treat it like None.
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info("ClassifyFst.fst was restored from %s.", far_file)
        else:
            logging.info("Creating ClassifyFst grammars.")
            # TAGGERS
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).fst
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(input_case=input_case,
                                           deterministic=deterministic,
                                           input_file=whitelist).fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst
            serial_graph = SerialFst(cardinal=cardinal,
                                     ordinal=ordinal,
                                     deterministic=deterministic).fst

            # Deterministically verbalized time/date feed the range grammar.
            v_time_graph = vTimeFst(deterministic=deterministic).fst
            v_ordinal_graph = vOrdinalFst(deterministic=deterministic)
            v_date_graph = vDateFst(ordinal=v_ordinal_graph,
                                    deterministic=deterministic).fst
            time_final = pynini.compose(time_graph, v_time_graph)
            date_final = pynini.compose(date_graph, v_date_graph)
            range_graph = RangeFst(time=time_final,
                                   date=date_final,
                                   cardinal=cardinal,
                                   deterministic=deterministic).fst

            # Union of all semiotic classes; lower weight wins, so the whitelist
            # (1.01) and dates (1.09) are slightly preferred over the rest (1.1).
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.1)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.1)
                | pynutil.add_weight(range_graph, 1.1)
                | pynutil.add_weight(
                    serial_graph,
                    1.1001)  # should be higher than the rest of the classes
            )

            if not deterministic:
                abbreviation_graph = AbbreviationFst(
                    deterministic=deterministic).fst
                classify |= pynutil.add_weight(abbreviation_graph, 100)

            # Punctuation gets its own "tokens { ... }" group; one or more such
            # tokens, each either consuming existing whitespace or getting a
            # single inserted separator space.
            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=2.1) + pynutil.insert(" }")
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                               delete_extra_space)
                | (pynutil.insert(" ") + punct),
                1,
            )

            # Plain words are the fallback of last resort.
            classify |= pynutil.add_weight(word_graph, 100)
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            # A sentence: one token group plus any number of separated groups.
            graph = token_plus_punct + pynini.closure(
                (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                                delete_extra_space)
                 | (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
                token_plus_punct)

            graph = delete_space + graph + delete_space
            # Also accept punctuation-only input.
            graph |= punct

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info("ClassifyFst grammars are saved to %s.", far_file)
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        """
        Build (or restore from cache) a single FST that classifies and
        verbalizes input in one pass: each semiotic-class tagger is composed
        with its verbalizer before being unioned into the final graph.

        Args:
            input_case: casing of the input text (forwarded to WhiteListFst)
            deterministic: if True, produce a single normalization option
            cache_dir: directory for caching the compiled .far file; None disables caching
            overwrite_cache: if True, rebuild grammars even when a cached .far exists
            whitelist: path to a whitelist file; its basename becomes part of the cache key
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        # The literal string 'None' can arrive from CLI parsing; treat it like None.
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(
                f'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            # NOTE(review): uses .graph rather than .fst here — confirm intentional.
            word_graph = WordFst(deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            # Deliberately rebinds the `whitelist` parameter (a file path) to the
            # WhiteListFst instance; the instance is passed to AbbreviationFst below.
            whitelist = WhiteListFst(input_case=input_case,
                                     deterministic=deterministic,
                                     input_file=whitelist)
            whitelist_graph = whitelist.graph
            punct_graph = PunctuationFst(deterministic=deterministic).graph

            # VERBALIZERS
            # The tagger locals (cardinal, decimal, ordinal, fraction, measure)
            # are rebound to their verbalizer counterparts from here on.
            cardinal = vCardinal(deterministic=deterministic)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=deterministic)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=deterministic)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=deterministic).fst
            v_electronic_graph = vElectronic(deterministic=deterministic).fst
            measure = vMeasure(decimal=decimal,
                               cardinal=cardinal,
                               fraction=fraction,
                               deterministic=deterministic)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=deterministic).fst
            v_date_graph = vDate(ordinal=ordinal,
                                 deterministic=deterministic).fst
            v_money_graph = vMoney(decimal=decimal,
                                   deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_abbreviation = vAbbreviation(deterministic=deterministic).fst

            # Union of tagger∘verbalizer compositions; lower weight wins, so
            # the whitelist (1.01) and dates (1.09) are slightly preferred over
            # the other classes (1.1), and plain words (100) are the fallback.
            classify_and_verbalize = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(pynini.compose(time_graph, v_time_graph),
                                     1.1)
                | pynutil.add_weight(
                    pynini.compose(decimal_graph, v_decimal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(measure_graph, v_measure_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(cardinal_graph, v_cardinal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(ordinal_graph, v_ordinal_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(telephone_graph, v_telephone_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(electronic_graph, v_electronic_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(fraction_graph, v_fraction_graph), 1.1)
                | pynutil.add_weight(
                    pynini.compose(money_graph, v_money_graph), 1.1)
                | pynutil.add_weight(word_graph, 100)
                | pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                                     1.09)).optimize()

            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(roman_graph, v_roman_graph), 100)

                abbreviation_graph = AbbreviationFst(
                    whitelist=whitelist, deterministic=deterministic).fst
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(abbreviation_graph, v_abbreviation), 100)

            # Optional punctuation may surround every token.
            punct = pynutil.add_weight(punct_graph, weight=1.1)
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                classify_and_verbalize +
                                pynini.closure(pynutil.insert(" ") + punct))

            # A sentence: one token plus any number of space-separated tokens.
            graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                      token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()
            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')
Beispiel #5
0
    def __init__(
        self,
        input_case: str,
        deterministic: bool = False,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        """
        Build (or restore from cache) the Russian tokenize-and-classify FST.

        Args:
            input_case: casing of the input text (forwarded to WhiteListFst)
            deterministic: must be False; Russian TN only supports the
                non-deterministic mode with multiple normalization options
            cache_dir: directory for caching the compiled .far file; None disables caching
            overwrite_cache: if True, rebuild grammars even when a cached .far exists
            whitelist: path to a whitelist file; its basename becomes part of the cache key

        Raises:
            ValueError: if deterministic is True.
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)
        if deterministic:
            raise ValueError(
                'Ru TN only supports non-deterministic cases and produces multiple normalization options.'
            )
        far_file = None
        # The literal string "None" can arrive from CLI parsing; treat it like None.
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_ru_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info("ClassifyFst.fst was restored from %s.", far_file)
        else:
            logging.info(
                "Creating ClassifyFst grammars. This might take some time...")
            # Shared number-name data reused by several grammars below.
            number_names = get_number_names()
            alternative_formats = get_alternative_formats()

            self.cardinal = CardinalFst(
                number_names=number_names,
                alternative_formats=alternative_formats,
                deterministic=deterministic)
            cardinal_graph = self.cardinal.fst

            self.ordinal = OrdinalFst(number_names=number_names,
                                      alternative_formats=alternative_formats,
                                      deterministic=deterministic)
            ordinal_graph = self.ordinal.fst

            self.decimal = DecimalFst(cardinal=self.cardinal,
                                      deterministic=deterministic)
            decimal_graph = self.decimal.fst

            self.measure = MeasureFst(cardinal=self.cardinal,
                                      decimal=self.decimal,
                                      deterministic=deterministic)
            measure_graph = self.measure.fst
            self.date = DateFst(number_names=number_names,
                                deterministic=deterministic)
            date_graph = self.date.fst
            word_graph = WordFst(deterministic=deterministic).fst
            self.time = TimeFst(number_names=number_names,
                                deterministic=deterministic)
            time_graph = self.time.fst
            self.telephone = TelephoneFst(number_names=number_names,
                                          deterministic=deterministic)
            telephone_graph = self.telephone.fst
            self.electronic = ElectronicFst(deterministic=deterministic)
            electronic_graph = self.electronic.fst
            self.money = MoneyFst(cardinal=self.cardinal,
                                  decimal=self.decimal,
                                  deterministic=deterministic)
            money_graph = self.money.fst
            self.whitelist = WhiteListFst(input_case=input_case,
                                          deterministic=deterministic,
                                          input_file=whitelist)
            whitelist_graph = self.whitelist.fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

            # Union of all semiotic classes; lower weight wins, so measure (0.9),
            # whitelist (1.01) and date (1.09) are preferred over the rest (1.1),
            # with plain words (100) as the fallback of last resort.
            classify = (pynutil.add_weight(whitelist_graph, 1.01)
                        | pynutil.add_weight(time_graph, 1.1)
                        | pynutil.add_weight(date_graph, 1.09)
                        | pynutil.add_weight(decimal_graph, 1.1)
                        | pynutil.add_weight(measure_graph, 0.9)
                        | pynutil.add_weight(cardinal_graph, 1.1)
                        | pynutil.add_weight(ordinal_graph, 1.1)
                        | pynutil.add_weight(money_graph, 1.1)
                        | pynutil.add_weight(telephone_graph, 1.1)
                        | pynutil.add_weight(electronic_graph, 1.1)
                        | pynutil.add_weight(word_graph, 100))

            # Wrap punctuation and classified tokens in "tokens { ... }" groups.
            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            # A sentence: one token group plus any number of space-separated groups.
            graph = token_plus_punct + pynini.closure(
                pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info("ClassifyFst grammars are saved to %s.", far_file)
Beispiel #6
0
    def __init__(self,
                 input_case: str,
                 deterministic: bool = True,
                 cache_dir: str = None,
                 overwrite_cache: bool = False):
        """
        Build (or restore from a .far cache) the top-level FST that tokenizes
        input text and classifies each token into a semiotic class
        (cardinal, ordinal, decimal, fraction, measure, date, time, telephone,
        electronic, money, whitelist, punctuation, or plain word).

        Args:
            input_case: casing of the input text, forwarded to WhiteListFst
                (presumably "lower_cased" or "cased" — TODO confirm upstream).
            deterministic: if True, build a single-transduction grammar; if
                False, additionally admit Roman-numeral and abbreviation
                readings with multiple output options.
            cache_dir: directory holding a compiled .far file; the grammar is
                rebuilt and saved there when absent. None (or "None") disables
                caching.
            overwrite_cache: if True, rebuild the grammar even when a cached
                .far file exists.
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Restoring the compiled grammar is much faster than rebuilding it.
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info("Creating ClassifyFst grammars.")
            # CardinalFst is built first because most other grammars reuse it.
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).fst
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(input_case=input_case,
                                           deterministic=deterministic).fst
            punct_graph = PunctuationFst(deterministic=deterministic).fst

            # Union of all classifiers; lower weight wins, so whitelist (1.01)
            # and date (1.09) are preferred over the rest (1.1), and the plain
            # word fallback (100) is used only when nothing else matches.
            classify = (pynutil.add_weight(whitelist_graph, 1.01)
                        | pynutil.add_weight(time_graph, 1.1)
                        | pynutil.add_weight(date_graph, 1.09)
                        | pynutil.add_weight(decimal_graph, 1.1)
                        | pynutil.add_weight(measure_graph, 1.1)
                        | pynutil.add_weight(cardinal_graph, 1.1)
                        | pynutil.add_weight(ordinal_graph, 1.1)
                        | pynutil.add_weight(money_graph, 1.1)
                        | pynutil.add_weight(telephone_graph, 1.1)
                        | pynutil.add_weight(electronic_graph, 1.1)
                        | pynutil.add_weight(fraction_graph, 1.1)
                        | pynutil.add_weight(word_graph, 100))

            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify |= pynutil.add_weight(roman_graph, 100)

                abbreviation_graph = AbbreviationFst(
                    deterministic=deterministic).fst
                classify |= pynutil.add_weight(abbreviation_graph, 100)

            # Wrap punctuation and classified tokens in "tokens { ... }" markup
            # and allow optional punctuation before/after each token.
            punct = pynutil.insert("tokens { ") + pynutil.add_weight(
                punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(
                " }")
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                token +
                                pynini.closure(pynutil.insert(" ") + punct))

            # A sentence is one or more tokens separated by (collapsed) spaces,
            # with leading/trailing whitespace removed.
            graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                      token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Beispiel #7
0
    def __init__(self, input_case: str, deterministic: bool = True):
        """
        Build the top-level FST that tokenizes input text and classifies each
        token into a semiotic class (cardinal, ordinal, decimal, fraction,
        measure, date, time, telephone, electronic, money, whitelist,
        punctuation, or plain word). No .far caching in this variant.

        Args:
            input_case: casing of the input text, forwarded to WhiteListFst
                (presumably "lower_cased" or "cased" — TODO confirm upstream).
            deterministic: if True, build a single-transduction grammar; if
                False, additionally admit Roman-numeral readings with
                multiple output options.
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        # CardinalFst is built first because most other grammars reuse it.
        cardinal = CardinalFst(deterministic=deterministic)
        cardinal_graph = cardinal.fst

        ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
        ordinal_graph = ordinal.fst

        decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
        decimal_graph = decimal.fst
        fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
        fraction_graph = fraction.fst

        measure = MeasureFst(cardinal=cardinal,
                             decimal=decimal,
                             fraction=fraction,
                             deterministic=deterministic)
        measure_graph = measure.fst
        date_graph = DateFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        word_graph = WordFst(deterministic=deterministic).fst
        time_graph = TimeFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        telephone_graph = TelephoneFst(deterministic=deterministic).fst
        electronic_graph = ElectronicFst(deterministic=deterministic).fst
        money_graph = MoneyFst(cardinal=cardinal,
                               decimal=decimal,
                               deterministic=deterministic).fst
        whitelist_graph = WhiteListFst(input_case=input_case,
                                       deterministic=deterministic).fst
        punct_graph = PunctuationFst(deterministic=deterministic).fst

        # Union of all classifiers; lower weight wins, so whitelist (1.01)
        # and date (1.09) are preferred over the rest (1.1), and the plain
        # word fallback (100) is used only when nothing else matches.
        classify = (pynutil.add_weight(whitelist_graph, 1.01)
                    | pynutil.add_weight(time_graph, 1.1)
                    | pynutil.add_weight(date_graph, 1.09)
                    | pynutil.add_weight(decimal_graph, 1.1)
                    | pynutil.add_weight(measure_graph, 1.1)
                    | pynutil.add_weight(cardinal_graph, 1.1)
                    | pynutil.add_weight(ordinal_graph, 1.1)
                    | pynutil.add_weight(money_graph, 1.1)
                    | pynutil.add_weight(telephone_graph, 1.1)
                    | pynutil.add_weight(electronic_graph, 1.1)
                    | pynutil.add_weight(fraction_graph, 1.1)
                    | pynutil.add_weight(word_graph, 100))

        if not deterministic:
            roman_graph = RomanFst(deterministic=deterministic).fst
            # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
            classify |= pynutil.add_weight(roman_graph, 100)

        # Wrap punctuation and classified tokens in "tokens { ... }" markup
        # and allow optional punctuation before/after each token.
        punct = pynutil.insert("tokens { ") + pynutil.add_weight(
            punct_graph, weight=1.1) + pynutil.insert(" }")
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
        token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                            token +
                            pynini.closure(pynutil.insert(" ") + punct))

        # A sentence is one or more tokens separated by (collapsed) spaces,
        # with leading/trailing whitespace removed.
        graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                  token_plus_punct)
        graph = delete_space + graph + delete_space

        self.fst = graph.optimize()