def on_epoch_end(self, epoch, logs=None):
    # Globals (model, val_* arrays, eval_file, ems, f1s, do_ema) come from the
    # enclosing training script.
    if not do_ema:
        results = model.predict([val_context_word, val_question_word,
                                 val_context_char, val_question_char],
                                verbose=1, batch_size=64)
        _, _, y_start_pred, y_end_pred = results
        y_start_pred = np.reshape(y_start_pred, (-1))
        y_end_pred = np.reshape(y_end_pred, (-1))
        answer_dict, remapped_dict = util.convert_tokens(
            eval_file, val_qid.tolist(), y_start_pred.tolist(), y_end_pred.tolist())
        metrics = util.evaluate(eval_file, answer_dict)
        print("Exact Match: {}, F1: {}".format(metrics['exact_match'], metrics['f1']))
        ems.append(metrics['exact_match'])
        f1s.append(metrics['f1'])
        if metrics['f1'] > self.best_f1:
            self.best_f1 = metrics['f1']
            model.save_weights('model/QANet_v99.h5')
        if epoch + 1 == 25:
            model.save_weights('model/QANet_v99_60k.h5')
    else:
        # Validate with EMA weights: back up the raw weights first.
        print('saving temp weights...')
        model.save_weights('temp_model2.h5')
        ExponentialMovingAverage_EpochEnd(model, self.ema_trainable_weights_vals)
        results = model.predict([val_context_word, val_question_word,
                                 val_context_char, val_question_char],
                                verbose=1, batch_size=64)
        _, _, y_start_pred, y_end_pred = results
        y_start_pred = np.reshape(y_start_pred, (-1))
        y_end_pred = np.reshape(y_end_pred, (-1))
        answer_dict, remapped_dict = util.convert_tokens(
            eval_file, val_qid.tolist(), y_start_pred.tolist(), y_end_pred.tolist())
        metrics_ema = util.evaluate(eval_file, answer_dict)
        print("After EMA, Exact Match: {}, F1: {}".format(
            metrics_ema['exact_match'], metrics_ema['f1']))
        ems.append(metrics_ema['exact_match'])
        f1s.append(metrics_ema['f1'])
        if metrics_ema['f1'] > self.best_f1:
            self.best_f1 = metrics_ema['f1']
            model.save_weights('model/QANet_ema_v99.h5')
        if epoch + 1 == 25:
            model.save_weights('model/QANet_ema_v99_60k.h5')
        # Restore the raw (non-EMA) weights so training continues unaffected.
        print('loading temp weights...')
        model.load_weights('temp_model2.h5')
    result = pd.DataFrame([ems, f1s], index=['em', 'f1']).transpose()
    result.to_csv('log/result2.csv', index=None)

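# The callback above assumes two EMA helpers that are not shown. A minimal
# sketch of what they might look like (the helper names, the name-keyed
# dictionary, and the 0.999 decay are assumptions, not the original code;
# ema_vals must be seeded with the weights' starting values):
import keras.backend as K

def update_ema_trainable_weights_vals(model, ema_vals, decay=0.999):
    # Blend the current weights into the running averages (call after each batch).
    for w in model.trainable_weights:
        ema_vals[w.name] = decay * ema_vals[w.name] + (1.0 - decay) * K.get_value(w)

def ExponentialMovingAverage_EpochEnd(model, ema_vals):
    # Overwrite the live weights with their moving averages before validation.
    for w in model.trainable_weights:
        K.set_value(w, ema_vals[w.name])
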
def _get_predictions(self, observations):
    datatype = 'public'
    word2idx_dict = self.model_dicts['word2idx_dict']
    char2idx_dict = self.model_dicts['char2idx_dict']
    bpe2idx_dict = self.model_dicts['bpe2idx_dict']
    pos2idx_dict = self.model_dicts['pos2idx_dict']
    data_examples, data_eval = process_file(
        self.model_config, observations, datatype,
        remove_unicode=self.model_config.remove_unicode,
        bpe_model=self.bpe_model, is_test=True)
    data_features, data_meta = build_features_notfdata(
        self.model_config, data_examples, datatype,
        word2idx_dict, char2idx_dict, bpe2idx_dict, pos2idx_dict, is_test=True)
    total = data_meta["total"]
    answer_dict = {}
    remapped_dict = {}
    print(len(data_features))
    # hotfix: pad data_features so its length is a multiple of config.batch_size
    while len(data_features) % self.model_config.batch_size != 0:
        data_features.append(data_features[-1])
    print(len(data_features))
    for step in tqdm(range(total // self.model_config.batch_size + 1)):
        def get_batch():
            batch_items = data_features[step * self.model_config.batch_size:
                                        (step + 1) * self.model_config.batch_size]
            batch = dict()
            for key in batch_items[0].keys():
                batch[key] = np.stack([el[key] for el in batch_items])
            return batch
        batch = get_batch()
        qa_id, loss, yp1, yp2 = self.tf_session.run(
            [self.model.qa_id, self.model.loss, self.model.yp1, self.model.yp2],
            feed_dict={
                self.model.c_ph: batch['context_idxs'],
                self.model.q_ph: batch['ques_idxs'],
                self.model.ch_ph: batch['context_char_idxs'],
                self.model.qh_ph: batch['ques_char_idxs'],
                self.model.cb_ph: batch['context_bpe_idxs'],
                self.model.qb_ph: batch['ques_bpe_idxs'],
                self.model.cp_ph: batch['context_pos_idxs'],
                self.model.qp_ph: batch['ques_pos_idxs'],
                self.model.y1_ph: batch['y1'],
                self.model.y2_ph: batch['y2'],
                self.model.qa_id: batch['id'],
            })
        answer_dict_, remapped_dict_ = convert_tokens(
            data_eval, qa_id.tolist(), yp1.tolist(), yp2.tolist())
        answer_dict.update(answer_dict_)
        remapped_dict.update(remapped_dict_)
    return remapped_dict

def evaluate_batch(model, num_batches, eval_file, sess, data_type, handle, str_handle):
    """
    Evaluate a batch while training.

    :param model: The model object.
    :param num_batches: Number of batches to evaluate.
    :param eval_file: The file with the correct answers.
    :param sess: The session.
    :param data_type: The type of the data (train/dev/test).
    :param handle: The feedable iterator handle placeholder.
    :param str_handle: The string handle of the iterator to draw batches from.
    :return: Metrics dictionary and a list of TensorFlow summaries.
    """
    answer_dict = {}
    losses = []
    for _ in tqdm(range(1, num_batches + 1)):
        qa_id, loss, yp1, yp2 = sess.run(
            [model.qa_id, model.loss, model.yp1, model.yp2],
            feed_dict={handle: str_handle})
        answer_dict_, _ = convert_tokens(
            eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
        answer_dict.update(answer_dict_)
        losses.append(loss)
    loss = np.mean(losses)
    metrics = evaluate(eval_file, answer_dict)
    metrics["loss"] = loss
    loss_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/loss".format(data_type), simple_value=metrics["loss"])])
    f1_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/f1".format(data_type), simple_value=metrics["f1"])])
    em_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/em".format(data_type), simple_value=metrics["exact_match"])])
    return metrics, [loss_sum, f1_sum, em_sum]

def evaluate_batch(model, num_batches, eval_file, sess, data_type, handle, str_handle):
    answer_dict = {}
    losses = []
    for _ in tqdm(range(1, num_batches + 1)):
        qa_id, loss, yp1, yp2 = sess.run(
            [model.qa_id, model.loss, model.yp1, model.yp2],
            feed_dict={handle: str_handle})
        answer_dict_, _ = convert_tokens(
            eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
        answer_dict.update(answer_dict_)
        losses.append(loss)
    loss = np.mean(losses)
    metrics = evaluate(eval_file, answer_dict)
    metrics["loss"] = loss
    loss_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/loss".format(data_type), simple_value=metrics["loss"])])
    f1_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/f1".format(data_type), simple_value=metrics["f1"])])
    em_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/em".format(data_type), simple_value=metrics["exact_match"])])
    return metrics, [loss_sum, f1_sum, em_sum]

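# These evaluate_batch variants rely on TF 1.x feedable iterator handles.
# A hedged sketch of the wiring they assume (train_dataset/dev_dataset and the
# batch count are illustrative, not taken from any of these repos):
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(
    handle, train_dataset.output_types, train_dataset.output_shapes)
train_iterator = train_dataset.make_one_shot_iterator()
dev_iterator = dev_dataset.make_initializable_iterator()

with tf.Session() as sess:
    train_handle = sess.run(train_iterator.string_handle())
    dev_handle = sess.run(dev_iterator.string_handle())
    sess.run(dev_iterator.initializer)
    metrics, summaries = evaluate_batch(
        model, 100, eval_file, sess, "dev", handle, dev_handle)
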
def evaluate_batch(data_source, model, max_batches, eval_file):
    answer_dict = {}
    total_loss, step_cnt = 0, 0
    for step, data in enumerate(data_source):
        if step >= max_batches and max_batches > 0:
            break
        # volatile=True marks inference-only Variables (pre-0.4 PyTorch API).
        context_idxs = Variable(data['context_idxs'], volatile=True)
        ques_idxs = Variable(data['ques_idxs'], volatile=True)
        context_char_idxs = Variable(data['context_char_idxs'], volatile=True)
        ques_char_idxs = Variable(data['ques_char_idxs'], volatile=True)
        context_lens = Variable(data['context_lens'], volatile=True)
        y1 = Variable(data['y1'], volatile=True)
        y2 = Variable(data['y2'], volatile=True)
        graph = data['graph']
        graph_q = data['graph_q']
        elmo = data['elmo']
        elmo_q = data['elmo_q']
        if elmo is not None:
            elmo.volatile = True
            elmo_q.volatile = True
        logit1, logit2, yp1, yp2 = model(
            context_idxs, ques_idxs, context_char_idxs, ques_char_idxs,
            context_lens, return_yp=True, pre_att=graph, pre_att_q=graph_q,
            elmo=elmo, elmo_q=elmo_q)
        loss = criterion(logit1, y1) + criterion(logit2, y2)
        answer_dict_, _ = convert_tokens(
            eval_file, data['ids'],
            yp1.data.cpu().numpy().tolist(), yp2.data.cpu().numpy().tolist())
        answer_dict.update(answer_dict_)
        total_loss += loss.data[0]  # loss.item() on PyTorch >= 0.4
        step_cnt += 1
    loss = total_loss / step_cnt
    metrics = evaluate(eval_file, answer_dict)
    metrics['loss'] = loss
    return metrics

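# Several snippets in this collection use the pre-0.4 Variable(..., volatile=True)
# idiom, which newer PyTorch ignores. A minimal sketch of the modern equivalent
# for the same loop (assumes the same model signature and helpers as above):
import torch

def evaluate_batch_modern(data_source, model, max_batches, eval_file):
    answer_dict, total_loss, step_cnt = {}, 0.0, 0
    model.eval()
    with torch.no_grad():  # replaces volatile=True
        for step, data in enumerate(data_source):
            if 0 < max_batches <= step:
                break
            logit1, logit2, yp1, yp2 = model(
                data['context_idxs'], data['ques_idxs'],
                data['context_char_idxs'], data['ques_char_idxs'],
                data['context_lens'], return_yp=True,
                pre_att=data['graph'], pre_att_q=data['graph_q'],
                elmo=data['elmo'], elmo_q=data['elmo_q'])
            loss = criterion(logit1, data['y1']) + criterion(logit2, data['y2'])
            answer_dict_, _ = convert_tokens(
                eval_file, data['ids'], yp1.cpu().tolist(), yp2.cpu().tolist())
            answer_dict.update(answer_dict_)
            total_loss += loss.item()  # .item() replaces loss.data[0]
            step_cnt += 1
    metrics = evaluate(eval_file, answer_dict)
    metrics['loss'] = total_loss / step_cnt
    return metrics
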
def evaluate_batch(data_source, model, max_batches, eval_file, config):
    answer_dict = {}
    total_loss, step_cnt = 0, 0
    for step, data in enumerate(data_source):
        if step >= max_batches and max_batches > 0:
            break
        context_idxs = Variable(data['context_idxs'], volatile=True)
        ques_idxs = Variable(data['ques_idxs'], volatile=True)
        context_char_idxs = Variable(data['context_char_idxs'], volatile=True)
        ques_char_idxs = Variable(data['ques_char_idxs'], volatile=True)
        context_lens = Variable(data['context_lens'], volatile=True)
        y1 = Variable(data['y1'], volatile=True)
        y2 = Variable(data['y2'], volatile=True)
        q_type = Variable(data['q_type'], volatile=True)
        is_support = Variable(data['is_support'], volatile=True)
        is_support_word = Variable(data['is_support_word'], volatile=True)
        start_mapping = Variable(data['start_mapping'], volatile=True)
        end_mapping = Variable(data['end_mapping'], volatile=True)
        all_mapping = Variable(data['all_mapping'], volatile=True)
        logit1, logit2, predict_type, predict_support, yp1, yp2 = model(
            context_idxs, ques_idxs, context_char_idxs, ques_char_idxs,
            context_lens, start_mapping, end_mapping, all_mapping,
            is_support_word, return_yp=True)
        loss = (nll_sum(predict_type, q_type) + nll_sum(logit1, y1)
                + nll_sum(logit2, y2)) / context_idxs.size(0) \
            + config.sp_lambda * nll_average(
                predict_support.view(-1, 2), is_support.view(-1))
        answer_dict_ = convert_tokens(
            eval_file, data['ids'],
            yp1.data.cpu().numpy().tolist(), yp2.data.cpu().numpy().tolist(),
            np.argmax(predict_type.data.cpu().numpy(), 1))
        answer_dict.update(answer_dict_)  # equivalent to answer_dict[id] = answer
        total_loss += loss.data.item()
        step_cnt += 1
    loss = total_loss / step_cnt
    metrics = evaluate(eval_file, answer_dict)
    metrics['loss'] = loss
    return metrics

def test(config):
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.test_eval_file, "r") as fh:
        eval_file = json.load(fh)
    with open(config.test_meta, "r") as fh:
        meta = json.load(fh)

    total = meta["total"]

    print("Loading model...")
    test_batch = get_dataset(config.test_record_file,
                             get_record_parser(config, is_test=True),
                             config).make_one_shot_iterator()

    model = Model(config, test_batch, word_mat, char_mat, trainable=False)
    graph_handler = GraphHandler(config, model)

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    with tf.Session(config=sess_config) as sess:
        sess.run(tf.global_variables_initializer())
        graph_handler.initialize(sess)  # restores weights (replaces a plain tf.train.Saver)
        sess.run(tf.assign(model.is_train, tf.constant(False, dtype=tf.bool)))
        losses = []
        answer_dict = {}
        remapped_dict = {}
        ensemble_dict = {}
        for step in tqdm(range(total // config.batch_size + 1)):
            start_logits, stop_logits, qa_id, loss, yp1, yp2 = sess.run([
                model.start_logits, model.stop_logits, model.qa_id,
                model.loss, model.yp1, model.yp2])
            answer_dict_, remapped_dict_ = convert_tokens(
                eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
            answer_dict.update(answer_dict_)
            remapped_dict.update(remapped_dict_)
            losses.append(loss)
            # Keep per-question start/stop logits for later ensembling.
            for id, start, stop in zip(qa_id, start_logits, stop_logits):
                ensemble_dict[str(id)] = {'yp1': start, 'yp2': stop}
        loss = np.mean(losses)
        metrics = evaluate(eval_file, answer_dict)
        answer_path = config.answer_file + "_" + str(config.load_step)
        with open(answer_path, "w") as fh:
            json.dump(remapped_dict, fh)
        print("Exact Match: {}, F1: {}".format(metrics['exact_match'], metrics['f1']))
        ensemble_dict['loss'] = loss
        ensemble_dict['exact_match'] = metrics['exact_match']
        ensemble_dict['f1'] = metrics['f1']
        file_name = config.model_name + '_' + config.run_id + '.pklz'
        save_path = os.path.join(config.result_path, file_name)
        with gzip.open(save_path, 'wb', compresslevel=3) as fh:
            pickle.dump(ensemble_dict, fh)

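# The .pklz files above store raw start/stop logits per question so several
# runs can be combined later. A hedged sketch of how such files might be
# ensembled (the averaging scheme and function names are assumptions, not the
# repository's actual ensembling script):
import gzip
import pickle
import numpy as np

def load_logit_runs(paths):
    runs = []
    for p in paths:
        with gzip.open(p, 'rb') as fh:
            runs.append(pickle.load(fh))
    return runs

def ensemble_spans(paths):
    runs = load_logit_runs(paths)
    qids = [k for k in runs[0] if k not in ('loss', 'exact_match', 'f1')]
    spans = {}
    for qid in qids:
        # Average the per-position logits across runs, then take the argmaxes.
        start = np.mean([r[qid]['yp1'] for r in runs], axis=0)
        stop = np.mean([r[qid]['yp2'] for r in runs], axis=0)
        spans[qid] = (int(np.argmax(start)), int(np.argmax(stop)))
    return spans
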
def predict(data_source, model, eval_file, config, prediction_file):
    answer_dict = {}
    sp_dict = {}
    sp_th = config.sp_threshold
    for step, data in enumerate(tqdm(data_source)):
        context_idxs = Variable(data['context_idxs'], volatile=True)
        ques_idxs = Variable(data['ques_idxs'], volatile=True)
        context_char_idxs = Variable(data['context_char_idxs'], volatile=True)
        ques_char_idxs = Variable(data['ques_char_idxs'], volatile=True)
        context_lens = Variable(data['context_lens'], volatile=True)
        start_mapping = Variable(data['start_mapping'], volatile=True)
        end_mapping = Variable(data['end_mapping'], volatile=True)
        all_mapping = Variable(data['all_mapping'], volatile=True)
        is_support = Variable(data['is_support'], volatile=True)
        is_support_word = Variable(data['is_support_word'], volatile=True)
        logit1, logit2, predict_type, predict_support, yp1, yp2 = model(
            context_idxs, ques_idxs, context_char_idxs, ques_char_idxs,
            context_lens, start_mapping, end_mapping, all_mapping,
            is_support_word, return_yp=True)
        answer_dict_ = convert_tokens(
            eval_file, data['ids'],
            yp1.data.cpu().numpy().tolist(), yp2.data.cpu().numpy().tolist(),
            np.argmax(predict_type.data.cpu().numpy(), 1))
        answer_dict.update(answer_dict_)
        predict_support_np = torch.sigmoid(predict_support[:, :, 1]).data.cpu().numpy()
        for i in range(predict_support_np.shape[0]):
            cur_sp_pred = []
            cur_id = data['ids'][i]
            for j in range(predict_support_np.shape[1]):
                if j >= len(eval_file[cur_id]['sent2title_ids']):
                    break
                if predict_support_np[i, j] > sp_th:
                    cur_sp_pred.append(eval_file[cur_id]['sent2title_ids'][j])
            sp_dict.update({cur_id: cur_sp_pred})
    prediction = {'answer': answer_dict, 'sp': sp_dict}
    with open(prediction_file, 'w') as f:
        json.dump(prediction, f)

def test(self):
    # Load the numericalized test data.
    test_loader = DataLoader(
        dataset=MyDataset(self.config.test_data_file, self.digital_keys),
        batch_size=self.config.val_num_batches * self.device_count)
    # Load the raw evaluation data.
    with open(self.config.test_eval_file, "r") as fh:
        test_eval_file = json.load(fh)
    answer_dict = {}
    self.logger.info('testing model...')
    answer_save_file = open(self.config.answer_file, 'w', encoding='utf-8')
    self.model.is_train = False
    self.model.eval()
    for batch in test_loader:
        logits1, logits2 = self.model(batch[0].to(self.device),
                                      batch[1].to(self.device),
                                      batch[2].to(self.device),
                                      batch[3].to(self.device))
        loss = self.calc_loss(logits1, logits2,
                              batch[4].to(self.device), batch[5].to(self.device))
        p1 = logits1.argmax(dim=1)  # start positions
        p2 = logits2.argmax(dim=1)  # end positions
        answer_dict_, remapped_dict = convert_tokens(
            test_eval_file, batch[6].to(self.device).tolist(),
            p1.tolist(), p2.tolist())
        answer_dict.update(answer_dict_)
        uuid = test_eval_file[str(batch[6].tolist()[0])]["uuid"]
        # save answer
        answer_save_file.write(str(uuid + ":" + remapped_dict[uuid] + "\n"))
    metrics = evaluate(test_eval_file, answer_dict)
    self.logger.info('test exact_match:{},f1:{},loss:{}'.format(
        metrics['exact_match'], metrics['f1'], loss))

def test(config):
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.test_eval_file, "r") as fh:
        eval_file = json.load(fh)
    with open(config.test_meta, "r") as fh:
        meta = json.load(fh)

    total = meta["total"]

    graph = tf.Graph()
    print("Loading model...")
    with graph.as_default() as g:
        test_batch = get_dataset(config.test_record_file,
                                 get_record_parser(config, is_test=True),
                                 config).make_one_shot_iterator()
        model = Model(config, test_batch, word_mat, char_mat,
                      trainable=False, graph=g)

        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True

        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(config.save_dir))
            if config.decay < 1.0:
                sess.run(model.assign_vars)
            losses = []
            answer_dict = {}
            remapped_dict = {}
            for step in tqdm(range(total // config.batch_size + 1)):
                qa_id, loss, yp1, yp2 = sess.run(
                    [model.qa_id, model.loss, model.yp1, model.yp2])
                answer_dict_, remapped_dict_ = convert_tokens(
                    eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
                answer_dict.update(answer_dict_)
                remapped_dict.update(remapped_dict_)
                losses.append(loss)
            loss = np.mean(losses)
            # This test set has no gold answers, so evaluate() is skipped here;
            # predictions are written out as a CSV instead.
            with open(config.answer_csv, 'w') as f:
                print('dumping ans file to : %s' % str(config.answer_csv))
                s = csv.writer(f, delimiter=',', lineterminator='\n')
                for i in sorted(remapped_dict):
                    s.writerow([remapped_dict[i]])

def evaluate_batch(model, num_batches, eval_file, sess, data_type, handle, str_handle):
    answer_dict = {}
    losses = []
    outlier_count = 0
    for _ in tqdm(range(1, num_batches + 1)):
        qa_id, loss, yp1, yp2 = sess.run(
            [model.qa_id, model.loss, model.yp1, model.yp2],
            feed_dict={handle: str_handle})
        answer_dict_, _, outlier = convert_tokens(
            eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
        if outlier:
            outlier_count += 1
            continue
        answer_dict.update(answer_dict_)
        losses.append(loss)
    loss = np.mean(losses)
    metrics = evaluate(eval_file, answer_dict)
    metrics["loss"] = loss
    loss_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/loss".format(data_type), simple_value=metrics["loss"])])
    f1_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/f1".format(data_type), simple_value=metrics["f1"])])
    em_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/em".format(data_type), simple_value=metrics["exact_match"])])
    rouge_l_f = tf.Summary(value=[tf.Summary.Value(
        tag="{}/rouge-l-f".format(data_type), simple_value=metrics["rouge-l-f"])])
    rouge_l_p = tf.Summary(value=[tf.Summary.Value(
        tag="{}/rouge-l-p".format(data_type), simple_value=metrics["rouge-l-p"])])
    rouge_l_r = tf.Summary(value=[tf.Summary.Value(
        tag="{}/rouge-l-r".format(data_type), simple_value=metrics["rouge-l-r"])])
    outlier_c = tf.Summary(value=[tf.Summary.Value(
        tag="{}/outlier_count".format(data_type), simple_value=outlier_count)])
    return metrics, [loss_sum, f1_sum, em_sum,
                     rouge_l_f, rouge_l_p, rouge_l_r, outlier_c]

def evaluate_batch(model, num_batches, eval_file, sess, data_type, handle, str_handle):
    answer_dict = {}
    losses_esp = []
    losses_ee = []
    for _ in tqdm(range(1, num_batches + 1)):
        qa_id, loss, yp1, yp2, ee_loss = sess.run(
            [model.qa_id, model.loss, model.yp1, model.yp2, model.ee_loss],
            feed_dict={handle: str_handle})
        # Outlier skipping is disabled in this variant; the flag is ignored.
        answer_dict_, _, outlier = convert_tokens(
            eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
        answer_dict.update(answer_dict_)
        losses_esp.append(loss)
        losses_ee.append(ee_loss)
    loss_esp = np.mean(losses_esp)
    loss_ee = np.mean(losses_ee)
    metrics = evaluate(eval_file, answer_dict)
    metrics["ee_loss"] = loss_ee
    metrics["esp_loss"] = loss_esp
    loss_sum1 = tf.Summary(value=[tf.Summary.Value(
        tag="{}/loss_esp".format(data_type), simple_value=metrics["esp_loss"])])
    loss_sum2 = tf.Summary(value=[tf.Summary.Value(
        tag="{}/loss_ee".format(data_type), simple_value=metrics["ee_loss"])])
    f1_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/f1".format(data_type), simple_value=metrics["f1"])])
    em_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/em".format(data_type), simple_value=metrics["exact_match"])])
    rouge_l_f = tf.Summary(value=[tf.Summary.Value(
        tag="{}/ROUGE-L-F1".format(data_type), simple_value=metrics["rouge-l-f"])])
    return metrics, [loss_sum1, loss_sum2, f1_sum, em_sum, rouge_l_f]

def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward: log_p has shape (batch, context_len, max_len), one
            # start-position distribution per candidate answer length.
            log_p = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            ans_lens = y2 - y1
            loss = 0
            for i in range(max_len):
                mask = ((torch.ones_like(y1) * i) == ans_lens).type(
                    torch.cuda.LongTensor)
                y = y1 * mask
                loss += F.nll_loss(log_p[:, :, i], y)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores: best length per position, then best start,
            # then derive the end index from the chosen length.
            log_p, ans_len = torch.max(log_p, dim=-1)
            starts = torch.max(log_p, dim=-1)[1]
            ends = starts.clone()  # clone: `ends = starts` would alias and corrupt starts
            for i in range(starts.size(0)):
                ends[i] += ans_len.type(torch.cuda.LongTensor)[i, starts[i]]

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict

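# The per-example loop above that derives end indices can be vectorized.
# A sketch assuming the same (batch, context_len, max_len) log_p layout:
# best_over_len, best_len = log_p.max(dim=-1)   # best answer length per start
# starts = best_over_len.argmax(dim=-1)         # best start per example
# ends = starts + best_len.gather(1, starts.unsqueeze(1)).squeeze(1)
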
def evaluate_batch(model, num_batches, eval_file, sess, data_type,
                   handle, str_handle, config):
    answer_dict = {}
    # Note: the original `losses_esp = losses_pr = losses_ee = []` aliased all
    # three names to a single list; they must be separate lists.
    losses_esp, losses_pr, losses_ee = [], [], []
    outlier_count = 0
    for _ in tqdm(range(1, num_batches + 1)):
        if config.with_passage_ranking:
            qa_id, loss_esp, loss_pr, loss_ee, yp1, yp2 = sess.run(
                [model.qa_id, model.loss, model.pr_loss, model.e_loss,
                 model.yp1, model.yp2],
                feed_dict={handle: str_handle})
        else:
            qa_id, loss_esp, yp1, yp2 = sess.run(
                [model.qa_id, model.loss, model.yp1, model.yp2],
                feed_dict={handle: str_handle})
        answer_dict_, _, outlier = convert_tokens(
            config, eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
        if outlier:
            outlier_count += 1
            continue
        answer_dict.update(answer_dict_)
        if loss_esp < 100:
            losses_esp.append(loss_esp)
        if config.with_passage_ranking:
            losses_pr.append(loss_pr)
            losses_ee.append(loss_ee)
    loss_esp = np.mean(losses_esp)
    print("dev_loss:", loss_esp)
    if config.with_passage_ranking:
        loss_pr = np.mean(losses_pr)
        loss_ee = np.mean(losses_ee)
    metrics = evaluate(eval_file, answer_dict)
    metrics["loss_esp"] = loss_esp
    metrics["loss_ee"] = loss_esp  # fallback; overwritten below with passage ranking
    if config.with_passage_ranking:
        metrics["loss_pr"] = loss_pr
        metrics["loss_ee"] = loss_ee
    loss_sum1 = tf.Summary(value=[tf.Summary.Value(
        tag="{}/loss_esp".format(data_type), simple_value=metrics["loss_esp"])])
    if config.with_passage_ranking:
        loss_sum2 = tf.Summary(value=[tf.Summary.Value(
            tag="{}/loss_pr".format(data_type), simple_value=metrics["loss_pr"])])
        loss_sum3 = tf.Summary(value=[tf.Summary.Value(
            tag="{}/loss_ee".format(data_type), simple_value=metrics["loss_ee"])])
    f1_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/f1".format(data_type), simple_value=metrics["f1"])])
    em_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/em".format(data_type), simple_value=metrics["exact_match"])])
    rouge_l_f = tf.Summary(value=[tf.Summary.Value(
        tag="{}/ROUGE-L".format(data_type), simple_value=metrics["rouge-l-f"])])
    rouge_l_p = tf.Summary(value=[tf.Summary.Value(
        tag="{}/rouge-l-p".format(data_type), simple_value=metrics["rouge-l-p"])])
    rouge_l_r = tf.Summary(value=[tf.Summary.Value(
        tag="{}/rouge-l-r".format(data_type), simple_value=metrics["rouge-l-r"])])
    outlier_c = tf.Summary(value=[tf.Summary.Value(
        tag="{}/outlier_count".format(data_type), simple_value=outlier_count)])
    if config.with_passage_ranking:
        return metrics, [loss_sum1, loss_sum2, loss_sum3, rouge_l_f]
    return metrics, [loss_sum1, rouge_l_f]

def test(config, dataset="test"):
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)

    if dataset == "test":
        test_eval_file = config.test_eval_file
        test_meta = config.test_meta
        test_record_file = config.test_record_file
    elif dataset == "addsent":
        test_eval_file = config.addsent_eval_file
        test_meta = config.addsent_meta
        test_record_file = config.addsent_record_file
    elif dataset == "addonesent":
        test_eval_file = config.addonesent_eval_file
        test_meta = config.addonesent_meta
        test_record_file = config.addonesent_record_file

    with open(test_eval_file, "r") as fh:
        eval_file = json.load(fh)
    with open(test_meta, "r") as fh:
        meta = json.load(fh)

    total = meta["total"]

    print("Loading model...")
    test_batch = get_dataset(test_record_file,
                             get_record_parser(config, is_test=True),
                             config).make_one_shot_iterator()

    model = Model(config, test_batch, word_mat, char_mat, trainable=False)

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    with tf.Session(config=sess_config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(config.save_dir))
        sess.run(tf.assign(model.is_train, tf.constant(False, dtype=tf.bool)))
        losses = []
        answer_dict = {}
        remapped_dict = {}
        for step in tqdm(range(total // config.batch_size + 1)):
            qa_id, loss, yp1, yp2 = sess.run(
                [model.qa_id, model.loss, model.yp1, model.yp2])
            answer_dict_, remapped_dict_ = convert_tokens(
                eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
            answer_dict.update(answer_dict_)
            remapped_dict.update(remapped_dict_)
            losses.append(loss)
        loss = np.mean(losses)
        metrics = evaluate(eval_file, answer_dict)
        with open(config.answer_file, "w") as fh:
            json.dump(remapped_dict, fh)
        print("Exact Match: {}, F1: {}".format(metrics['exact_match'], metrics['f1']))

def evaluate_batch(data_source, model, max_batches, eval_file, config):
    answer_dict = {}
    total_loss, step_cnt = 0, 0
    for step, data in enumerate(data_source):
        if step >= max_batches and max_batches > 0:
            break
        context_idxs = Variable(data['context_idxs'], volatile=True)
        ques_idxs = Variable(data['ques_idxs'], volatile=True)
        context_char_idxs = Variable(data['context_char_idxs'], volatile=True)
        ques_char_idxs = Variable(data['ques_char_idxs'], volatile=True)
        context_lens = Variable(data['context_lens'], volatile=True)
        y1 = Variable(data['y1'], volatile=True)
        y2 = Variable(data['y2'], volatile=True)
        q_type = Variable(data['q_type'], volatile=True)
        is_support = Variable(data['is_support'], volatile=True)
        start_mapping = Variable(data['start_mapping'], volatile=True)
        end_mapping = Variable(data['end_mapping'], volatile=True)
        all_mapping = Variable(data['all_mapping'], volatile=True)
        subject_y1 = Variable(data['subject_y1'])
        subject_y2 = Variable(data['subject_y2'])
        object_y1 = Variable(data['object_y1'])
        object_y2 = Variable(data['object_y2'])
        relations = Variable(data['relations'])

        model_results = model(context_idxs, ques_idxs, context_char_idxs,
                              ques_char_idxs, relations, context_lens,
                              start_mapping, end_mapping, all_mapping,
                              return_yp=True)
        (logit1, logit2, predict_type, predict_support,
         logit_subject_start, logit_subject_end,
         logit_object_start, logit_object_end,
         k_relations, loss_relation, yp1, yp2, sy1, sy2, oy1, oy2) = model_results

        loss_1 = (nll_sum(predict_type, q_type) + nll_sum(logit1, y1)
                  + nll_sum(logit2, y2)) / context_idxs.size(0)
        loss_2 = nll_average(predict_support.view(-1, 2), is_support.view(-1))
        loss_3_r = torch.sum(loss_relation)
        loss_3_s = (nll_sum(logit_subject_start, subject_y1)
                    + nll_sum(logit_subject_end, subject_y2)) / context_idxs.size(0)
        loss_3_o = (nll_sum(logit_object_start, object_y1)
                    + nll_sum(logit_object_end, object_y2)) / context_idxs.size(0)
        loss = loss_1 + config.sp_lambda * loss_2 \
            + config.evi_lambda * (loss_3_s + loss_3_r + loss_3_o)

        answer_dict_ = convert_tokens(
            eval_file, data['ids'],
            yp1.data.cpu().numpy().tolist(), yp2.data.cpu().numpy().tolist(),
            np.argmax(predict_type.data.cpu().numpy(), 1))
        answer_dict.update(answer_dict_)
        total_loss += loss.item()
        step_cnt += 1
    loss = total_loss / step_cnt
    metrics = evaluate(eval_file, answer_dict)
    metrics['loss'] = loss
    return metrics

def test(config):
    gpu_options = tf.GPUOptions(visible_device_list="2")
    sess_config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
    sess_config.gpu_options.allow_growth = True

    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.test_eval_file, "r") as fh:
        eval_file = json.load(fh)
    with open(config.test_meta, "r") as fh:
        meta = json.load(fh)

    total = meta["total"]

    print("Loading model...")
    test_batch = get_dataset(config.test_record_file,
                             get_record_parser(config, is_test=True),
                             config).make_one_shot_iterator()

    model = Model(config, test_batch, word_mat, char_mat, trainable=False)

    with tf.Session(config=sess_config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(config.save_dir))
        sess.run(tf.assign(model.is_train, tf.constant(False, dtype=tf.bool)))
        losses = []
        answer_dict = {}
        remapped_dict = {}
        for step in tqdm(range(total // config.batch_size + 1)):
            qa_id, loss, yp1, yp2 = sess.run(
                [model.qa_id, model.loss, model.yp1, model.yp2])
            answer_dict_, remapped_dict_, outlier = convert_tokens(
                eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
            answer_dict.update(answer_dict_)
            remapped_dict.update(remapped_dict_)
            losses.append(loss)
            print("\n", loss)
            if loss > 50:
                # Inspect suspiciously high-loss batches.
                for i, j, k in zip(qa_id.tolist(), yp1.tolist(), yp2.tolist()):
                    print(answer_dict[str(i)], j, k)
        loss = np.mean(losses)
        # Evaluate with answer_dict here; evaluate-v1.1.py evaluates with
        # remapped_dict, since only that is saved. The two differ slightly.
        metrics = evaluate(eval_file, answer_dict)
        with open(config.answer_file, "w") as fh:
            json.dump(remapped_dict, fh)
        print("Exact Match: {}, F1: {}, Rouge-l-f: {}, Rouge-l-p: {}, Rouge-l-r: {}".format(
            metrics['exact_match'], metrics['f1'], metrics['rouge-l-f'],
            metrics['rouge-l-p'], metrics['rouge-l-r']))

def predict(config):
    prepro_predict(config)
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.bpe_emb_file, "r") as fh:
        bpe_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.pos_emb_file, "r") as fh:
        pos_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.predict_eval_file, "r") as fh:
        predict_eval_file = json.load(fh)
    with open(config.predict_meta, "r") as fh:
        meta = json.load(fh)

    total = meta["total"]

    print("Loading model...")
    test_batch = get_dataset(config.predict_record_file,
                             get_record_parser(config, is_test=True),
                             config).make_one_shot_iterator()

    model = Model(config, test_batch, word_mat, char_mat, bpe_mat, pos_mat,
                  trainable=False)

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    with tf.Session(config=sess_config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        # TODO: add restoring from best model or from model name
        saver.restore(sess, tf.train.latest_checkpoint(config.save_dir))
        print('Restoring from: {}'.format(tf.train.latest_checkpoint(config.save_dir)))
        sess.run(tf.assign(model.is_train, tf.constant(False, dtype=tf.bool)))
        answer_dict = {}
        remapped_dict = {}
        for step in tqdm(range(total // config.batch_size + 1)):
            qa_id, loss, yp1, yp2 = sess.run(
                [model.qa_id, model.loss, model.yp1, model.yp2])
            answer_dict_, remapped_dict_ = convert_tokens(
                predict_eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
            answer_dict.update(answer_dict_)
            remapped_dict.update(remapped_dict_)
        path_to_save_answer = config.predict_file + '_ans'
        with open(path_to_save_answer, "w") as fh:
            json.dump(remapped_dict, fh)
            print("Answer dumped: {}".format(path_to_save_answer))

def evaluate(args, model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, \
                cqw_extra, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cw_pos = cw_pos.to(device)
            cw_ner = cw_ner.to(device)
            cw_freq = cw_freq.to(device)
            cqw_extra = cqw_extra.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            if args.model == 'bidaf':
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
            else:
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cw_pos, cw_ner,
                                       cw_freq, cqw_extra)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict

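# Many of these evaluate loops call util.discretize(p1, p2, max_len, use_squad_v2)
# to turn two probability vectors into a legal span. A hedged sketch of the
# usual implementation (written from memory of the CS224n starter code, not
# copied from it; treat the no-answer handling as an assumption):
import torch

def discretize(p_start, p_end, max_len=15, no_answer=False):
    # Joint probability of every (start, end) pair: (batch, c_len, c_len).
    p_joint = torch.matmul(p_start.unsqueeze(2), p_end.unsqueeze(1))
    c_len = p_joint.size(1)
    # Keep only spans with start <= end and end - start < max_len.
    band = torch.triu(torch.ones(c_len, c_len, device=p_joint.device)) \
        - torch.triu(torch.ones(c_len, c_len, device=p_joint.device),
                     diagonal=max_len)
    if no_answer:
        band[0, :] = 0.0  # position 0 is reserved for "no answer"
        band[:, 0] = 0.0
        band[0, 0] = 1.0
    p_joint = p_joint * band
    # Argmax over the (start, end) grid: best end per start, then best start.
    max_over_end, end_idxs = torch.max(p_joint, dim=2)
    _, start_idxs = torch.max(max_over_end, dim=1)
    end_idxs = end_idxs.gather(1, start_idxs.unsqueeze(1)).squeeze(1)
    return start_idxs, end_idxs
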
def evaluate_batch(data_source, model, max_batches, eval_file, config):
    answer_dict = {}
    total_loss, step_cnt = 0, 0
    for step, data in enumerate(data_source):
        if step >= max_batches and max_batches > 0:
            break
        with torch.no_grad():
            if config.cuda:
                data = {k: (data[k].cuda() if k != 'ids' else data[k])
                        for k in data}
            context_idxs = data['context_idxs']
            ques_idxs = data['ques_idxs']
            context_char_idxs = data['context_char_idxs']
            ques_char_idxs = data['ques_char_idxs']
            context_lens = data['context_lens']
            y1 = data['y1']
            y2 = data['y2']
            q_type = data['q_type']
            is_support = data['is_support']
            start_mapping = data['start_mapping']
            end_mapping = data['end_mapping']
            all_mapping = data['all_mapping']

            logit1, logit2, predict_type, predict_support, yp1, yp2 = model(
                context_idxs, ques_idxs, context_char_idxs, ques_char_idxs,
                context_lens, start_mapping, end_mapping, all_mapping,
                context_lens.sum(1).max().item(), return_yp=True)

            loss = (nll_sum(predict_type, q_type) + nll_sum(logit1, y1)
                    + nll_sum(logit2, y2)) / context_idxs.size(0) \
                + config.sp_lambda * nll_average(
                    predict_support.view(-1, 2), is_support.view(-1))

            answer_dict_ = convert_tokens(
                eval_file, data['ids'],
                yp1.data.cpu().numpy().tolist(), yp2.data.cpu().numpy().tolist(),
                np.argmax(predict_type.data.cpu().numpy(), 1))
            answer_dict.update(answer_dict_)

            total_loss += loss.item()
            step_cnt += 1
    loss = total_loss / step_cnt
    metrics = evaluate(eval_file, answer_dict)
    metrics['loss'] = loss
    return metrics

def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2, args):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            ## Additions for BERT ##
            max_context_len, max_question_len = args.para_limit, args.ques_limit
            if "bert" in args.model_type:
                bert_dev_embeddings = get_embeddings("dev", ids,
                                                     args.para_limit,
                                                     args.ques_limit)
            else:
                bert_dev_embeddings = None

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_dev_embeddings,
                                   max_context_len, max_question_len, device)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict

def test(config):
    os.environ["CUDA_VISIBLE_DEVICES"] = config.choose_gpu
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.test_eval_file, "r") as fh:
        eval_file = json.load(fh)
    with open(config.test_meta, "r") as fh:
        meta = json.load(fh)

    total = meta["total"]

    graph = tf.Graph()
    print("Loading model...")
    with graph.as_default() as g:
        test_batch = get_dataset(config.test_record_file,
                                 get_record_parser(config, is_test=True),
                                 config).make_one_shot_iterator()
        model = QANet(config, test_batch, word_mat, char_mat,
                      trainable=False, graph=g)

        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True
        sess_config.gpu_options.per_process_gpu_memory_fraction = \
            config.gpu_memory_fraction

        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(config.save_dir))
            if config.decay < 1.0:
                sess.run(model.assign_vars)
            losses = []
            answer_dict = {}
            remapped_dict = {}
            for step in tqdm(range(total // config.batch_size + 1)):
                qa_id, loss, yp1, yp2 = sess.run(
                    [model.qa_id, model.loss, model.yp1, model.yp2])
                answer_dict_, remapped_dict_ = convert_tokens(
                    eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
                answer_dict.update(answer_dict_)
                remapped_dict.update(remapped_dict_)
                losses.append(loss)
            loss = np.mean(losses)
            metrics = evaluate(eval_file, answer_dict)
            with open(config.answer_file, "w") as fh:
                json.dump(remapped_dict, fh)
            print("Exact Match: {}, F1: {}".format(
                metrics['exact_match'], metrics['f1']))

def SelfEvaluate(self, batches, eval_file=None, answer_file=None,
                 drop_file=None, dev=None):
    print('Starting evaluation')
    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_file = json.load(f)
    with open(dev, 'r', encoding='utf-8') as f:
        dev = json.load(f)
    answer_dict = {}
    mapped_dict = {}
    for batch in batches:
        data = prepare_data(batch)
        full_p_states, p_mask, full_q_states, q_mask = self.encode(data)
        logits1, logits2, ans_log = self.decode(full_p_states, p_mask,
                                                full_q_states, q_mask)
        y1, y2, has_ans = get_predictions(logits1, logits2, ans_log)
        qa_id = data['id']
        answer_dict_, mapped_dict_ = convert_tokens(eval_file, qa_id, y1, y2, has_ans)
        answer_dict.update(answer_dict_)
        mapped_dict.update(mapped_dict_)
        del full_p_states, p_mask, full_q_states, q_mask, y1, y2, \
            answer_dict_, mapped_dict_, has_ans, ans_log, logits1, logits2
    # Dropped (malformed or over-long) examples get an empty answer.
    with open(drop_file, 'r', encoding='utf-8') as f:
        drop = json.load(f)
    for i in drop['drop_ids']:
        uuid = eval_file[str(i)]["uuid"]
        answer_dict[str(i)] = ''
        mapped_dict[uuid] = ''
    with open(answer_file, 'w', encoding='utf-8') as f:
        json.dump(mapped_dict, f)
    metrics = evaluate(dev, mapped_dict)
    print("EM: {}, F1: {}, Has answer: {}, No answer: {}".format(
        metrics['exact'], metrics['f1'],
        metrics['HasAns_f1'], metrics['NoAns_f1']))
    return metrics['exact'], metrics['f1']

def predict(data_source, model, eval_file, config, prediction_file):
    answer_dict = {}
    sp_dict = {}
    sp_th = config.sp_threshold
    for step, data in enumerate(tqdm(data_source)):
        with torch.no_grad():
            if config.cuda:
                data = {k: (data[k].cuda() if k != 'ids' else data[k])
                        for k in data}
            context_idxs = data['context_idxs']
            ques_idxs = data['ques_idxs']
            context_char_idxs = data['context_char_idxs']
            ques_char_idxs = data['ques_char_idxs']
            context_lens = data['context_lens']
            start_mapping = data['start_mapping']
            end_mapping = data['end_mapping']
            all_mapping = data['all_mapping']

            logit1, logit2, predict_type, predict_support, yp1, yp2 = model(
                context_idxs, ques_idxs, context_char_idxs, ques_char_idxs,
                context_lens, start_mapping, end_mapping, all_mapping,
                context_lens.sum(1).max().item(), return_yp=True)

            answer_dict_ = convert_tokens(
                eval_file, data['ids'],
                yp1.data.cpu().numpy().tolist(), yp2.data.cpu().numpy().tolist(),
                np.argmax(predict_type.data.cpu().numpy(), 1))
            answer_dict.update(answer_dict_)

            predict_support_np = torch.sigmoid(
                predict_support[:, :, 1] - predict_support[:, :, 0]).data.cpu().numpy()
            for i in range(predict_support_np.shape[0]):
                cur_sp_pred = []
                cur_id = data['ids'][i]
                for j in range(predict_support_np.shape[1]):
                    if j >= len(eval_file[cur_id]['sent2title_ids']):
                        break
                    if predict_support_np[i, j] > sp_th:
                        cur_sp_pred.append(eval_file[cur_id]['sent2title_ids'][j])
                sp_dict.update({cur_id: cur_sp_pred})
    prediction = {'answer': answer_dict, 'sp': sp_dict}
    with open(prediction_file, 'w') as f:
        json.dump(prediction, f)

def test(config):
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.test_eval_file, "r") as fh:
        eval_file = json.load(fh)
    with open(config.test_meta, "r") as fh:
        meta = json.load(fh)

    total = meta["num_batches"]

    print("Loading model...")
    test_batch = get_batch_dataset(config.test_record_file,
                                   get_record_parser(config, is_test=True),
                                   config, is_test=True).make_one_shot_iterator()

    model = Model(config, test_batch, word_mat, char_mat, trainable=False)

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    with tf.Session(config=sess_config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(config.save_dir))
        sess.run(tf.assign(model.is_train, tf.constant(False, dtype=tf.bool)))
        losses = []
        answer_dict = {}
        select_right = []
        for step in tqdm(range(1, total + 1)):
            qa_id, loss, yp1, yp2, y1, y2, is_select_p, is_select = sess.run(
                [model.qa_id, model.loss, model.yp1, model.yp2,
                 model.y1, model.y2, model.is_select_p, model.is_select])
            y1 = np.argmax(y1, axis=-1)
            y2 = np.argmax(y2, axis=-1)
            # Predicted and gold passage selections, offset into the flat batch.
            sp = np.argmax(is_select_p, axis=-1)
            s = np.argmax(is_select, axis=-1)
            sp = [n + i * config.passage_num for i, n in enumerate(sp.tolist())]
            s = [m + i * config.passage_num for i, m in enumerate(s.tolist())]
            select_right.append(len(set(s).intersection(set(sp))))
            answer_dict_, _ = convert_tokens(
                eval_file,
                [qa_id[n] for n in sp], [yp1[n] for n in sp], [yp2[n] for n in sp],
                [y1[n] for n in sp], [y2[n] for n in sp], sp, s)
            answer_dict.update(answer_dict_)
            losses.append(loss)
        loss = np.mean(losses)
        select_accu = sum(select_right) / (len(select_right)
                                           * (config.batch_size / config.passage_num))
        write_prediction(eval_file, answer_dict, 'answer_for_evl.json', config)
        metrics = evaluate(eval_file, answer_dict, filter=False)
        metrics['Selection Accuracy'] = select_accu
        print("Exact Match: {}, F1: {}, selection accuracy: {}".format(
            metrics['exact_match'], metrics['f1'], metrics['Selection Accuracy']))

def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward. The original forward/loss lines were garbled in the
            # source ("******" secret-masking); the three lines below are a
            # reconstruction that mirrors the sibling evaluate() functions in
            # this collection.
            log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)
            print('Max start idx score: ', torch.max(starts))
            print('Max End idx score: ', torch.max(ends))

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict

def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = util.AverageMeter()
    loss_f = torch.nn.CrossEntropyLoss()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    mems = (tuple(), tuple(), tuple())
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2, mems = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs, *mems)
            y1, y2 = y1.to(device), y2.to(device)
            loss = torch.mean(loss_f(log_p1, y1) + loss_f(log_p2, y2))
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict

def test(config):
    with open(config.word_emb_file, "r", encoding="utf-8") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.test_eval_file, "r", encoding="utf-8") as fh:
        eval_file = json.load(fh)
    with open(config.test_meta, "r", encoding="utf-8") as fh:
        meta = json.load(fh)

    total = meta["total"]

    print("Loading model...")
    test_batch = get_dataset(config.test_record_file,
                             get_record_parser(config, is_test=True),
                             config).make_one_shot_iterator()

    model = Model(config, test_batch, word_mat, trainable=False)

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    with tf.Session(config=sess_config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(config.save_dir))
        sess.run(tf.assign(model.is_train, tf.constant(False, dtype=tf.bool)))
        answer_dict = {}
        remapped_dict = {}
        for _ in tqdm(range(total // config.batch_size + 1)):
            qa_id, loss, yp = sess.run([model.qa_id, model.loss, model.yp])
            remapped_dict_, answer_dict_ = convert_tokens(
                eval_file, qa_id.tolist(), yp.tolist())
            answer_dict.update(answer_dict_)
            remapped_dict.update(remapped_dict_)

        f = open(config.answer_file, "w", encoding="utf-8")
        for key in answer_dict:
            f.write(str(key) + "\t" + answer_dict[key] + "\n")
        # Handle invalid (dropped) test samples:
        # default to the first alternative answer.
        ans_list = list(answer_dict.keys())
        with open(config.test_file, "r", encoding="utf-8") as fh:
            for line in fh:
                sample = json.loads(line)
                if sample["query_id"] not in ans_list:
                    f.write(str(sample["query_id"]) + "\t"
                            + sample['alternatives'].split("|")[0] + "\n")
        f.close()

def evaluate_batch(model, num_batches, eval_file, sess, data_type, handle, str_handle):
    """Evaluate batches while training (variant with a third pointer yp3)."""
    answer_dict = {}
    losses = []
    for _ in range(1, num_batches + 1):
        qa_id, loss, yp1, yp2, yp3, y1, y2, y3, logging, logging2, q = sess.run(
            [model.qa_id, model.loss, model.yp1, model.yp2, model.yp3,
             model.y1, model.y2, model.y3, model.logging, model.logging2,
             model.q],
            feed_dict={handle: str_handle})
        answer_dict_, _ = convert_tokens(eval_file, qa_id.tolist(), yp1.tolist(),
                                         yp2.tolist(), yp3.tolist())
        answer_dict.update(answer_dict_)
        losses.append(loss)
    loss = np.mean(losses)
    metrics = evaluate(eval_file, answer_dict)
    print(metrics)
    metrics["loss"] = loss
    loss_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/loss".format(data_type), simple_value=metrics["loss"])])
    f1_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/f1".format(data_type), simple_value=metrics["f1"])])
    em_sum = tf.Summary(value=[tf.Summary.Value(
        tag="{}/em".format(data_type), simple_value=metrics["exact_match"])])
    return metrics, [loss_sum, f1_sum, em_sum]

def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = util.AverageMeter()  # keep track of average values over time

    model.eval()  # set the module to evaluation mode
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()  # e^log_p1, e^log_p2
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            # Convert predictions to tokens from the context.
            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict

def test(config):
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.test_eval_file, "r") as fh:
        eval_file = json.load(fh)
    with open(config.test_meta, "r") as fh:
        meta = json.load(fh)

    total = meta["total"]

    graph = tf.Graph()
    print("Loading model...")
    with graph.as_default() as g:
        test_batch = get_dataset(config.test_record_file,
                                 get_record_parser(config, is_test=True),
                                 config).make_one_shot_iterator()
        model = Model(config, test_batch, word_mat, char_mat,
                      trainable=False, graph=g)

        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True

        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(config.save_dir))
            if config.decay < 1.0:
                sess.run(model.assign_vars)
            losses = []
            answer_dict = {}
            remapped_dict = {}
            for step in tqdm(range(total // config.batch_size + 1)):
                qa_id, loss, yp1, yp2 = sess.run(
                    [model.qa_id, model.loss, model.yp1, model.yp2])
                answer_dict_, remapped_dict_ = convert_tokens(
                    eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
                answer_dict.update(answer_dict_)
                remapped_dict.update(remapped_dict_)
                losses.append(loss)
            loss = np.mean(losses)
            metrics = evaluate(eval_file, answer_dict)
            with open(config.answer_file, "w") as fh:
                json.dump(remapped_dict, fh)
            print("Exact Match: {}, F1: {}".format(
                metrics['exact_match'], metrics['f1']))
