def load(self, word2tags_path=None):
    module_folder = str(pathlib.Path(__file__).resolve().parent)
    data_folder = os.path.join(module_folder, '../tmp')
    config_path = os.path.join(data_folder, 'rupostagger.config')
    if not os.path.exists(config_path):
        # Fall back to the module folder when ../tmp has no config.
        data_folder = module_folder
        config_path = os.path.join(data_folder, 'rupostagger.config')

    with open(config_path, 'r') as rdr:
        self.config = json.load(rdr)
        self.winspan = self.config['winspan']
        self.use_gren = self.config['use_gren']
        self.use_w2v = self.config['use_w2v']
        self.use_syllabs = self.config['use_syllabs']
        self.ending_len = self.config['ending_len']

    # Grammatical dictionary used to build features for the CRF model.
    self.word2tags = ruword2tags.RuWord2Tags()
    self.word2tags.load(word2tags_path)

    model_path = os.path.join(data_folder, 'rupostagger.model')
    self.tagger = pycrfsuite.Tagger()
    self.tagger.open(model_path)
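# Usage sketch for the tagger loaded above (not part of the original source).
# tag() takes a list of tokens; it is consumed as list(tagger.tag(words))
# elsewhere in this codebase, and the (word, label) pair format is assumed here.
import rupostagger

tagger = rupostagger.RuPosTagger()
tagger.load()
for word, label in tagger.tag(u'кошка пьёт молоко'.split()):
    print(word, label)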
def __init__(self):
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    self.postagger = rupostagger.RuPosTagger()
    self.chunker = ruchunker.Chunker()
    self.word2tags = ruword2tags.RuWord2Tags()
    self.flexer = ruword2tags.RuFlexer()
    self.syntan = None
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    self.known_words = set()
    #self.lemmatizer = Mystem()
    self.lemmatizer = rulemma.Lemmatizer()
    self.word_embeddings = None
def load(self, model_dir=None):
    if model_dir is None:
        module_folder = str(pathlib.Path(__file__).resolve().parent)
        model_dir = os.path.join(module_folder, '../tmp')
        if not os.path.exists(model_dir):
            model_dir = module_folder

    config_path = os.path.join(model_dir, 'chunker_NP.config')
    self.chunker_params = ChunkerCrfParams.load(config_path)

    if self.chunker_params.use_gren:
        self.word2tags = ruword2tags.RuWord2Tags()
        self.word2tags.load()

    if self.chunker_params.use_postagger:
        self.postagger = rupostagger.RuPosTagger()
        self.postagger.load()

    self.crf_tagger = pycrfsuite.Tagger()
    self.crf_tagger.open(os.path.join(model_dir, self.chunker_params.model_filename))
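# Usage sketch (not part of the original source). Chunker() and load() appear
# elsewhere in this codebase; only those two calls are assumed here, nothing
# beyond them.
import ruchunker

chunker = ruchunker.Chunker()
chunker.load()                      # resolve the model directory automatically
# chunker.load('/path/to/models')  # or point at an explicit model directory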
logging.root.removeHandler(absl.logging._absl_handler)
absl.logging._warn_preinit_stderr = False
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')

# Duplicate log records to a file, at INFO level and above.
logfile_path = os.path.join(tmp_dir, 'rupostagger2.trainer.log')
lf = logging.FileHandler(logfile_path, mode='w')
lf.setLevel(logging.INFO)
lf.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
logging.getLogger('').addHandler(lf)

logging.info('STARTED')

trainer = Trainer()

logging.info('Loading dictionary...')
word2tags = ruword2tags.RuWord2Tags()
word2tags.load()

w2v = None
if use_w2v:
    w2v_path = os.path.join(tmp_dir, 'w2v.kv')
    #w2v_path = os.path.join('/home/inkoziev/polygon/w2v/fasttext.CBOW=1_WIN=5_DIM=64')
    wordchar2vector_path = '~/polygon/chatbot/data/wordchar2vector.dat'
    if 'fasttext' in w2v_path:
        w2v = FastText.load_fasttext_format(w2v_path)
    else:
        if use_wc2v:
            logging.info(u'Loading the wordchar2vector model from "%s"', wordchar2vector_path)
            wc2v = gensim.models.KeyedVectors.load_word2vec_format(
                wordchar2vector_path,
                binary=False)  # binary=False assumed (text-format vectors)
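# A minimal sketch (not part of the original source) of loading and querying
# word vectors with gensim's public API. The paths are placeholders; whether
# w2v.kv is in gensim's native format (load) or in word2vec text format
# (load_word2vec_format) depends on how it was saved.
from gensim.models import KeyedVectors

kv = KeyedVectors.load('w2v.kv')                                      # native gensim format
# kv = KeyedVectors.load_word2vec_format('vectors.txt', binary=False)  # word2vec text format
print(kv.most_similar('кошка', topn=5))                               # nearest neighbours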
if len(phrase2) == 0 or phrase1 == phrase2:
    no_expansion_phrases.append(u' '.join(tokenizer.tokenize(phrase1)))

lines = []
print('{} samples, {} no-expansion phrases'.format(len(samples), len(no_expansion_phrases)))
with io.open(no_expansion_path, 'w', encoding='utf-8') as wrt:
    for phrase in no_expansion_phrases:
        wrt.write(phrase + '\n')

tagger = rupostagger.RuPosTagger()
tagger.load()

gren = ruword2tags.RuWord2Tags()
gren.load()

all_templates = set()
template2freq = collections.Counter()
template2sample = dict()
all_terms = collections.Counter()
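# Sketch (not part of the original source) of how the counters above could be
# filled: reduce each phrase to a sequence of POS tags (a "template") and count
# template frequencies. The template format and the idea that samples are plain
# phrase strings are assumptions; the real script may use richer features.
def count_templates(phrases, tokenizer, tagger):
    import collections
    template2freq = collections.Counter()
    template2sample = {}
    for phrase in phrases:
        words = tokenizer.tokenize(phrase)
        # rupostagger labels look like 'NOUN|Case=Nom|...'; keep the POS part only.
        template = u' '.join(label.split('|')[0] for _, label in tagger.tag(words))
        template2freq[template] += 1
        template2sample.setdefault(template, phrase)
    return template2freq, template2sample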
logging.info('Start "prepare_answer_relevancy_dataset.py"')

tokenizer = Tokenizer()
tokenizer.load()

samples = load_samples(input_paths, tokenizer)

logging.info('Loading dictionaries...')
thesaurus = Thesaurus()
thesaurus.load(os.path.join(data_folder, 'dict/links.csv'))

lexicon = Word2Lemmas()
lexicon.load(os.path.join(data_folder, 'dict/word2lemma.dat'))

grdict = ruword2tags.RuWord2Tags()
grdict.load()

flexer = ruword2tags.RuFlexer()
flexer.load()

# Augmentation: generate negative samples by picking word-form variants that
# differ from the ones used in the valid answer.
logging.info('Generating negative samples...')
all_keys = set(sample.get_key() for sample in samples)
neg_samples = []
for sample in samples:
    if sample.label == 1:
        answer_words = tokenizer.tokenize(sample.answer)
        answer_len = len(answer_words)
        if answer_len == 1:
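# Sketch (not part of the original source) of the augmentation idea described in
# the comment above: swap one word of a valid answer for a different word form
# and keep the candidate only if it does not collide with a known positive
# sample. alternative_forms() is a hypothetical helper standing in for the
# flexer/grdict machinery, and make_key() is assumed to mirror sample.get_key().
def make_negative_answer(answer_words, alternative_forms, all_keys, make_key):
    for i, word in enumerate(answer_words):
        for alt in alternative_forms(word):  # hypothetical: other forms of this word
            if alt == word:
                continue
            candidate = list(answer_words)
            candidate[i] = alt
            if make_key(candidate) not in all_keys:
                return u' '.join(candidate)
    return None  # no usable variant found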