コード例 #1
0
 def __init__(self) -> None:
     """Create the text-processing components and store them on the instance.

     Only the tokenizer is explicitly loaded here (via ``load()``); every
     other component is merely constructed. Whether those constructors
     load their own resources cannot be determined from this method.
     """
     self.tokenizer = Tokenizer()
     # The tokenizer is the one component that gets an explicit load() call.
     self.tokenizer.load()
     self.lemmatizer = Mystem()
     self.lexicon = Word2Lemmas()
     self.language_resources = LanguageResources()
     self.postagger = rupostagger.RuPosTagger()
     self.gg_dictionaries = GenerativeGrammarDictionaries()
     # Starts out empty; presumably populated later by code outside this
     # method -- TODO confirm against the rest of the class.
     self.known_words = set()
コード例 #2
0
ファイル: text_utils.py プロジェクト: midvix/chatbot
 def __init__(self) -> None:
     """Create the text-processing components and store them on the instance.

     Only the tokenizer is explicitly loaded here (via ``load()``); the
     remaining components are merely constructed. ``syntan`` and
     ``word_embeddings`` are left as ``None`` by this method.
     """
     self.tokenizer = Tokenizer()
     # The tokenizer is the one component that gets an explicit load() call.
     self.tokenizer.load()
     self.lexicon = Word2Lemmas()
     self.language_resources = LanguageResources()
     self.postagger = rupostagger.RuPosTagger()
     self.chunker = ruchunker.Chunker()
     self.word2tags = ruword2tags.RuWord2Tags()
     self.flexer = ruword2tags.RuFlexer()
     # Not created here; presumably assigned elsewhere when needed -- TODO confirm.
     self.syntan = None
     self.gg_dictionaries = GenerativeGrammarDictionaries()
     # Starts out empty; presumably populated later by code outside this
     # method -- TODO confirm against the rest of the class.
     self.known_words = set()
     # The Mystem-based lemmatizer below is disabled (kept commented out);
     # rulemma.Lemmatizer is the one actually instantiated.
     #self.lemmatizer = Mystem()
     self.lemmatizer = rulemma.Lemmatizer()
     # Not created here; presumably assigned elsewhere when needed -- TODO confirm.
     self.word_embeddings = None
コード例 #3
0
# Script entry point: builds the shared generative-grammar dictionaries,
# saves them to the model folder, then compiles the answer grammar.
if __name__ == '__main__':
    model_folder = '../../../tmp'
    tmp_folder = '../../../tmp'
    data_folder = '../../../data'

    # Words that are mentioned in the datasets, one per line of the file.
    words_path = os.path.join(tmp_folder, 'dataset_words.txt')
    with io.open(words_path, 'r', encoding='utf-8') as rdr:
        known_words = {line.strip() for line in rdr}

    # Dictionaries shared by several grammars.
    print('Build dictionaries...')
    prepare_options = dict(max_ngram_gap=1,
                           use_assocs=False,
                           lexicon_words=known_words,
                           use_verb_prep_case=True)
    dictionaries = GenerativeGrammarDictionaries()
    dictionaries.prepare(data_folder, **prepare_options)
    dictionaries_path = os.path.join(model_folder,
                                     'generative_grammar_dictionaries.bin')
    dictionaries.save(dictionaries_path)

    # Now the answer generator.
    print('Build answer grammar...')
    answer_generator = AnswerGeneratorEngine()
    answer_generator.set_dictionaries(dictionaries)
    answer_generator.compile_grammar(data_folder, tmp_folder)
    # Drop the generator and force a collection once the grammar is compiled.
    del answer_generator
    gc.collect()