def __init__(self):
    # Tokenization and lemmatization of Russian text
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lemmatizer = Mystem()
    # Mapping from word forms to their lemmas
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    # Part-of-speech tagger
    self.postagger = rupostagger.RuPosTagger()
    # Dictionaries shared by the generative grammars
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    # Words seen in the training datasets
    self.known_words = set()
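# --- Usage sketch (not in the original source): assuming Mystem above is
# pymystem3.Mystem, lemmatizing a raw string looks like this. ---
from pymystem3 import Mystem

mystem = Mystem()
# lemmatize() returns lemmas interleaved with whitespace tokens, so filter out blanks
lemmas = [w for w in mystem.lemmatize('мама мыла раму') if w.strip()]
print(lemmas)  # ['мама', 'мыть', 'рама']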
def __init__(self):
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    self.postagger = rupostagger.RuPosTagger()
    self.chunker = ruchunker.Chunker()
    self.word2tags = ruword2tags.RuWord2Tags()
    self.flexer = ruword2tags.RuFlexer()
    self.syntan = None
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    self.known_words = set()
    # self.lemmatizer = Mystem()
    self.lemmatizer = rulemma.Lemmatizer()
    self.word_embeddings = None
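# --- Usage sketch (not in the original source): the tokenizer, POS tagger and
# rulemma lemmatizer initialized above chain together as tokenize -> tag -> lemmatize.
# This follows the public rutokenizer/rupostagger/rulemma APIs; treat the exact
# tuple layout of the results as an assumption. ---
import rutokenizer
import rupostagger
import rulemma

tokenizer = rutokenizer.Tokenizer()
tokenizer.load()
tagger = rupostagger.RuPosTagger()
tagger.load()
lemmatizer = rulemma.Lemmatizer()
lemmatizer.load()

tokens = tokenizer.tokenize('голодные кошки ловят мышек')
tags = tagger.tag(tokens)            # [(word, 'POS|Feature=Value|...'), ...]
lemmas = lemmatizer.lemmatize(tags)  # [(word, tags, lemma, ...), ...]
for word, word_tags, lemma, *rest in lemmas:
    print(word, lemma, word_tags)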
import gc
import io
import os

if __name__ == '__main__':
    model_folder = '../../../tmp'
    tmp_folder = '../../../tmp'
    data_folder = '../../../data'

    # List of words mentioned in the datasets
    known_words = set()
    with io.open(os.path.join(tmp_folder, 'dataset_words.txt'), 'r', encoding='utf-8') as rdr:
        for line in rdr:
            known_words.add(line.strip())

    # Dictionaries shared by several grammars
    print('Build dictionaries...')
    dictionaries = GenerativeGrammarDictionaries()
    dictionaries.prepare(data_folder, max_ngram_gap=1, use_assocs=False,
                         lexicon_words=known_words, use_verb_prep_case=True)
    dictionaries.save(os.path.join(model_folder, 'generative_grammar_dictionaries.bin'))

    # Now build the answer generator
    print('Build answer grammar...')
    answer_generator = AnswerGeneratorEngine()
    answer_generator.set_dictionaries(dictionaries)
    answer_generator.compile_grammar(data_folder, tmp_folder)
    del answer_generator
    gc.collect()
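# --- Runtime counterpart (a sketch, not in the original script): assuming
# GenerativeGrammarDictionaries exposes a load() symmetric to the save() call
# above, the compiled dictionaries would be restored at startup like this. ---
dictionaries = GenerativeGrammarDictionaries()
dictionaries.load(os.path.join(model_folder, 'generative_grammar_dictionaries.bin'))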