def load(self, rdr):
    # Read one evaluation group from the file: 'T:' premise lines followed by
    # 'Q:' question lines, terminated by a blank line or end of file.
    for line in rdr:
        if len(line) == 0:
            eof_reached = True
            break

        line = line.strip()
        if len(line) == 0:
            if len(self.premises) > 0:
                break
            else:
                continue

        if line.startswith(u'T:'):
            if len(self.questions) > 0:
                self.invalid_format()

            premise = line.replace(u'T:', u'').replace(u'ё', u'е').lower().strip()
            premise_words = lpad_wordseq(self.tokenizer.tokenize(premise), self.max_wordseq_len)
            self.premises_str.append(u' '.join(premise_words))
            self.premises.append(premise_words)
        elif line.startswith(u'Q:'):
            question = line.replace(u'Q:', u'').replace(u'ё', u'е').strip()
            question = lpad_wordseq(self.tokenizer.tokenize(question), self.max_wordseq_len)
            self.questions.append(question)
        else:
            self.invalid_format()

    eof_reached = len(self.premises) == 0
    return eof_reached
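# For reference, a minimal sketch of the group format this parser expects: each group
# consists of one or more 'T:' premise lines followed by one or more 'Q:' question
# lines, and groups are separated by blank lines. The sentences below are purely
# illustrative, not taken from the actual evaluation file:
#
#   T: кошка ловит мышей      (the cat catches mice)
#   Q: кто ловит мышей        (who catches mice)
#
#   T: ...
#   Q: ...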
def load(self, data_folder):
    eval_path = os.path.join(data_folder, 'evaluate_relevancy.txt')
    self.eval_data = []  # list of EvaluationGroup
    with codecs.open(eval_path, 'r', 'utf-8') as rdr:
        while True:
            group = EvaluationGroup(self.max_wordseq_len, self.tokenizer)
            eof_reached = group.load(rdr)
            if eof_reached:
                break
            if not group.is_empty():
                self.eval_data.append(group)

    # We also need a pool of irrelevant premises.
    self.all_premises = []
    if False:
        # Take them from the training set.
        # Strictly speaking, collisions are possible: a question may have several
        # relevant premises, so a relevant premise could accidentally be picked as a negative.
        df = pd.read_csv(os.path.join(data_folder, 'premise_question_answer.csv'),
                         encoding='utf-8', delimiter='\t', quoting=3)
        for premise in df['premise'].unique():
            premise_words = self.tokenizer.tokenize(premise)
            if u'кого' not in premise_words:
                premise = lpad_wordseq(premise_words, self.max_wordseq_len)
                self.all_premises.append(premise)
    else:
        # Take the irrelevant premises from the chatbot's fact base.
        with codecs.open(os.path.join(data_folder, 'premises.txt'), 'r', 'utf-8') as rdr:
            for line in rdr:
                line = line.strip()
                if len(line) > 0:
                    premise = lpad_wordseq(self.tokenizer.tokenize(line), self.max_wordseq_len)
                    self.all_premises.append(premise)
def load_dataset(params):
    tokenizer = Tokenizer()
    tokenizer.load()

    # The dataset must be prepared beforehand by the script
    # ./preparation/prepare_req_interpretation_classif.py
    # (data_folder is assumed to be defined at module level in the original script).
    df = pd.read_csv(os.path.join(data_folder, 'req_interpretation_dataset.csv'), sep='\t', encoding='utf-8')
    samples = [Sample(row['text'], int(row['label'])) for i, row in df.iterrows()]

    # Tokenize the samples
    for sample in samples:
        sample.words = tokenizer.tokenize(sample.phrase)

    nb_0 = sum(sample.y == 0 for sample in samples)
    nb_1 = sum(sample.y == 1 for sample in samples)
    logging.info('nb_0={} nb_1={}'.format(nb_0, nb_1))

    max_wordseq_len = max(len(sample.words) for sample in samples)
    logging.info('max_wordseq_len={}'.format(max_wordseq_len))

    if params['padding'] == 'left':
        for sample in samples:
            sample.words = lpad_wordseq(sample.words, max_wordseq_len)
    else:
        for sample in samples:
            sample.words = rpad_wordseq(sample.words, max_wordseq_len)

    computed_params = {'max_wordseq_len': max_wordseq_len,
                       'nb_0': nb_0,
                       'nb_1': nb_1}

    return samples, computed_params
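# The Sample class used by load_dataset() is not shown in this excerpt; a minimal
# sketch of what the code above assumes about it (constructor signature and attribute
# names are inferred from usage, not copied from the original project):

class Sample(object):
    def __init__(self, phrase, y):
        self.phrase = phrase  # raw sample text from the 'text' column
        self.y = y            # binary class label from the 'label' column (0 or 1)
        self.words = None     # set later to the tokenized (and padded) word list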
def extract_entity(self, entity_name, phrase, text_utils, embeddings):
    # TODO: pick the model for the given entity_name once there are multiple entity types
    self.X_probe.fill(0)
    words = text_utils.tokenize(phrase)
    if self.padding == 'right':
        words = rpad_wordseq(words, self.max_inputseq_len)
    else:
        words = lpad_wordseq(words, self.max_inputseq_len)

    embeddings.vectorize_words(self.w2v_filename, words, self.X_probe, 0)

    inputs = dict()
    inputs['input'] = self.X_probe

    y = self.model.predict(x=inputs)[0]
    predicted_labels = np.argmax(y, axis=-1)

    # Keep the words tagged with label 1 as the extracted entity text.
    selected_words = [word for word, label in zip(words, predicted_labels) if label == 1]
    entity_text = u' '.join(selected_words).strip()
    return entity_text
def pad_wordseq(self, words, n):
    if self.padding == 'left':
        return lpad_wordseq(words, n)
    else:
        return rpad_wordseq(words, n)
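# lpad_wordseq() and rpad_wordseq() are defined elsewhere in the project; a minimal
# sketch of the behavior the calling code assumes (the choice of an empty-string
# padding token is an assumption for illustration):

def lpad_wordseq(words, n):
    """Left-pad the word list with padding tokens up to length n."""
    return [u''] * max(0, n - len(words)) + list(words)


def rpad_wordseq(words, n):
    """Right-pad the word list with padding tokens up to length n."""
    return list(words) + [u''] * max(0, n - len(words))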
computed_params['embeddings'] = embeddings
computed_params['word_dims'] = embeddings.vector_size

print('Restoring model architecture from {}'.format(arch_filepath))
with open(arch_filepath, 'r') as f:
    model = model_from_json(f.read())

print('Loading model weights from {}'.format(weights_path))
model.load_weights(weights_path)

tokenizer = Tokenizer()
tokenizer.load()

while True:
    phrase = utils.console_helpers.input_kbd(':> ').strip()
    sample1 = Sample(phrase, 0)
    sample1.words = tokenizer.tokenize(phrase)
    if padding == 'left':
        sample1.words = lpad_wordseq(sample1.words, max_wordseq_len)
    else:
        sample1.words = rpad_wordseq(sample1.words, max_wordseq_len)

    for istep, xy in enumerate(generate_rows([sample1], 2, computed_params, 1)):
        x = xy[0]
        y_pred = model.predict(x=x, verbose=0)[0]
        print('y==0 --> {}'.format(y_pred[0]))
        print('y==1 --> {}'.format(y_pred[1]))
        break
nb_0 = sum(sample.y == 0 for sample in samples)
nb_1 = sum(sample.y == 1 for sample in samples)
logging.info('nb_0={} nb_1={}'.format(nb_0, nb_1))

max_wordseq_len = 0
for sample in samples:
    for phrase in [sample.phrase1, sample.phrase2]:
        words = tokenizer.tokenize(phrase)
        max_wordseq_len = max(max_wordseq_len, len(words))

logging.info('max_wordseq_len={}'.format(max_wordseq_len))

if padding == 'left':
    for sample in samples:
        sample.words1 = lpad_wordseq(tokenizer.tokenize(sample.phrase1), max_wordseq_len)
        sample.words2 = lpad_wordseq(tokenizer.tokenize(sample.phrase2), max_wordseq_len)
else:
    for sample in samples:
        sample.words1 = rpad_wordseq(tokenizer.tokenize(sample.phrase1), max_wordseq_len)
        sample.words2 = rpad_wordseq(tokenizer.tokenize(sample.phrase2), max_wordseq_len)

# Total number of additional features fed to the network
# besides the two separate sentences.
nb_addfeatures = 0
if net_arch == 'cnn2':
    # pairwise similarities of words in the two sentences.