Example No. 1
    # Assumes module-level imports of os and json, plus the project's
    # FileRepository class (a file-backed repository with .load() and .data).
    def items_for_normalized(self):
        references_path = self.app_config.REFERENCES_PATH
        items_for_normalized = []
        # Walk REFERENCES_PATH recursively; any file whose name contains the
        # configured extension is parsed as JSON.
        for root, _dirs, files in os.walk(references_path):
            for file in files:
                if self._ext in file:
                    filename = os.path.join(root, file)
                    rep = FileRepository(filename=filename, loader=json.loads)
                    rep.load()
                    data = rep.data
                    items = self._split_items(
                        self._collect_items_for_normalized(data))
                    items_for_normalized.extend(items)
        return items_for_normalized
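
The loop above assumes only that FileRepository takes a filename plus a loader callable and that load() fills in .data. A minimal stand-in inferred from that usage (hypothetical; the project's real class will differ):

# Hypothetical FileRepository stand-in, reconstructed from the call sites
# above. The loader receives the raw file contents as a string.
class FileRepository:
    def __init__(self, filename, loader):
        self.filename = filename
        self.loader = loader
        self.data = None

    def load(self):
        with open(self.filename, encoding="utf-8") as f:
            self.data = self.loader(f.read())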

    # Assumes module-level imports: nltk, FileRepository, the ordered_json and
    # reverse_json_dict loaders, and the tokenizer/converter classes below.
    def __load_everything(self):
        nltk.download('punkt')  # sentence tokenizer models used by NLTK

        # Local import: the app config is resolved lazily, at load time.
        from smart_kit.configs import get_app_config
        app_config = get_app_config()
        # Base normalizer settings; the resource dictionaries loaded below
        # are merged into this repository's data.
        text_normalizer_params = FileRepository(
            f"{app_config.STATIC_PATH}/.text_normalizer_resources/static_workdata.json",
            loader=ordered_json,
        )
        text_normalizer_params.load()

        # Synonym dictionary; the reverse_json_dict loader name suggests the
        # JSON mapping is inverted on load.
        synonyms = FileRepository(
            f"{app_config.STATIC_PATH}/.text_normalizer_resources/dict_synonyms.json",
            loader=reverse_json_dict,
        )
        synonyms.load()
        text_normalizer_params.data["synonyms"] = synonyms.data

        # Word-to-number mapping consumed by the text2num steps below.
        text2num_dict = FileRepository(
            f"{app_config.STATIC_PATH}/.text_normalizer_resources/text2num_dict.json",
            loader=ordered_json,
        )
        text2num_dict.load()
        text_normalizer_params.data["text2num_dict"] = text2num_dict.data

        # From here on, work with the parsed payload instead of the repository
        # object; the two keys below are mandatory and raise KeyError if the
        # file is empty.
        text_normalizer_params = text_normalizer_params.data or {}
        self.convert_plan = text_normalizer_params["convert_plan"]
        self.processor_pipeline = text_normalizer_params["processor_pipeline"]

        word_false_stoppings = text_normalizer_params.get("word_false_stoppings", [])
        words_without_splitting_point = text_normalizer_params.get("word_no_splitting_point", [])
        synonyms = text_normalizer_params.get("synonyms", {})
        text2num = text_normalizer_params.get("text2num_dict", {})
        sberbank_phones = text_normalizer_params.get("sberbank_phones", [])
        unicode_symbols = text_normalizer_params.get("unicode_symbols", {})

        self.sentence_tokenizer = ru_sent_tokenize  # Russian sentence splitter
        self.word_tokenizer = NLTKWordTokenizer(word_false_stoppings, words_without_splitting_point)

        skip_func = lambda x: x  # identity pass-through for optional steps
        # Step names are human-readable Russian labels; English glosses in
        # the trailing comments.
        self.converter_pipeline = {
            'Объединение цифр после stt': NumbersUnionAfterSTT(text2num),  # merge digits after STT
            'Конверсия юникодовых символов': UnicodeSymbolsConverter(unicode_symbols),  # convert unicode symbols
            'Цифры и буквы отдельно': unmerge_numbers_and_letters,  # split digits from letters
            'Номера телефонов': NormalizePhoneNumbers(),  # phone numbers
            'Номера телефонов из голоса': NormalizePhoneNumbersVoice(),  # phone numbers from voice
            'Номера карт': MergeCardNumbers(),  # card numbers
            'Номера карт из голоса': MergeCardNumbersVoice(),  # card numbers from voice
            'Объединение сумм': merge_numbers,  # merge amounts
            'Символы валют': replace_currencies_symbols,  # currency symbols
            'Претокенизация математических операций': AdditionalMathSplitter(),  # pre-tokenize math operations
        }
        self.tokens_processor = {
            "Synonyms": ReplaceSynonyms(synonyms) if synonyms else skip_func,
            "Text2Num": Text2Num(text2num, sberbank_phones) if text2num else skip_func,
            "Currency": CurrencyTokensOneIterationMerger(),
            "Grammemes": self.morph,
        }

        self.__ready_to_use = True  # resources loaded; normalizer is ready
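
For context, one plausible way __load_everything is wired in, assuming (the snippet does not show this) that the owning class initializes itself lazily on first use via the __ready_to_use flag:

# Hypothetical wiring, for illustration only: resources are loaded once, on
# the first call, guarded by the private __ready_to_use flag.
class TextNormalizer:
    def __init__(self):
        self.__ready_to_use = False

    def __load_everything(self):
        ...  # body as shown above; it ends by setting self.__ready_to_use = True

    def normalize(self, text):
        # Load tokenizers, converters and dictionaries on first use only.
        if not self.__ready_to_use:
            self.__load_everything()
        # ... apply self.converter_pipeline, tokenize, then run the
        # self.tokens_processor steps over the tokens ...
        return text

The double-underscore names are mangled per class, which keeps both the flag and the loader private to whichever class actually hosts them.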