def main():
    log.info('[program starts.]')
    checkpoint = torch.load(args.model)
    opt = checkpoint['config']
    opt['task_name'] = 'CoQA'
    opt['cuda'] = args.cuda
    opt['seed'] = args.seed
    opt['mask_prev_ans'] = args.mask_prev_ans
    opt['no_yes_no'] = args.no_yes_no
    opt['remove_indicator'] = args.remove_indicator
    if opt.get('do_hierarchical_query') is None:
        opt['do_hierarchical_query'] = False
    state_dict = checkpoint['state_dict']
    log.info('[model loaded.]')

    test, test_embedding = load_dev_data(opt)
    model = QAModel(opt, state_dict=state_dict)
    CoQAEval = CoQAEvaluator(os.path.join(args.dev_dir, 'dev.json'))
    log.info('[Data loaded.]')

    model.setup_eval_embed(test_embedding)
    if args.cuda:
        model.cuda()

    batches = BatchGen_CoQA(test,
                            batch_size=args.batch_size,
                            evaluation=True,
                            gpu=args.cuda,
                            dialog_ctx=opt['explicit_dialog_ctx'],
                            precompute_elmo=16 // args.batch_size)
    sample_idx = random.sample(range(len(batches)), args.show)

    with open(os.path.join(args.dev_dir, 'dev.json'), 'r', encoding='utf8') as f:
        dev_data = json.load(f)

    # Collect (story id, turn id) pairs in dev-set order.
    list_of_ids = []
    for article in dev_data['data']:
        id = article['id']
        for Qs in article['questions']:
            tid = Qs['turn_id']
            list_of_ids.append((id, tid))

    # Compute gradients for the first few batches, moving them to CPU so they
    # can be pickled without holding GPU memory.
    gradients = []
    for i, batch in enumerate(batches):
        grad = model.calc_grad(batch)
        gradients.append(grad.cpu())
        if i > 10:
            break

    output_path = os.path.join(args.output_dir, 'grad.pkl')
    print('Saving to {}...'.format(output_path))
    with open(output_path, 'wb') as f:
        pickle.dump(gradients, f)
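# Hedged usage sketch (not part of the repo): reload the pickled gradients for
# offline inspection. Assumes only that grad.pkl holds a list of CPU tensors,
# as produced by main() above; prefix the path with your --output_dir.
import pickle

with open('grad.pkl', 'rb') as f:
    gradients = pickle.load(f)
print(len(gradients), gradients[0].shape)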
import json
import argparse
from collections import Counter

import numpy as np

from CoQA_eval import CoQAEvaluator

parser = argparse.ArgumentParser()
parser.add_argument('--predict_file')
parser.add_argument('--dev_file')
args = parser.parse_args()
opt = vars(args)

with open(opt['predict_file'], 'r') as f:
    predictions = json.load(f)
with open(opt['dev_file'], 'r') as f:
    dev = json.load(f)
evaluator = CoQAEvaluator(dev)

# Counters for span-boundary error statistics.
span_start_dis_cnt = Counter()
span_end_dis_cnt = Counter()
wrong_type = 0
dis_diff_cnt = dict()
f1_length = dict()
matching_more_set = []
for data in predictions:
    spans = data['spans']
    truth_s = spans[0]
    truth_e = spans[1]
    predict_s = spans[2]
def main():
    log.info('[program starts.]')
    checkpoint = torch.load(args.model)
    opt = checkpoint['config']
    opt['task_name'] = 'CoQA'
    opt['cuda'] = args.cuda
    opt['seed'] = args.seed
    if opt.get('do_hierarchical_query') is None:
        opt['do_hierarchical_query'] = False
    state_dict = checkpoint['state_dict']
    log.info('[model loaded.]')

    test, test_embedding = load_dev_data(opt)
    model = QAModel(opt, state_dict=state_dict)
    CoQAEval = CoQAEvaluator("CoQA/coqa-dev-v1.0.json")
    log.info('[Data loaded.]')

    model.setup_eval_embed(test_embedding)
    if args.cuda:
        model.cuda()

    batches = BatchGen_CoQA(test,
                            batch_size=args.batch_size,
                            evaluation=True,
                            gpu=args.cuda,
                            dialog_ctx=opt['explicit_dialog_ctx'],
                            precompute_elmo=16 // args.batch_size)
    sample_idx = random.sample(range(len(batches)), args.show)

    with open("CoQA/coqa-dev-v1.0.json", "r", encoding="utf8") as f:
        dev_data = json.load(f)

    # Collect (story id, turn id) pairs in dev-set order.
    list_of_ids = []
    for article in dev_data['data']:
        id = article["id"]
        for Qs in article["questions"]:
            tid = Qs["turn_id"]
            list_of_ids.append((id, tid))

    predictions = []
    for i, batch in enumerate(batches):
        prediction = model.predict(batch)
        predictions.extend(prediction)
        if i not in sample_idx:
            continue
        # Print a few randomly sampled dialogs for manual inspection.
        print("Story: ", batch[-4][0])
        for j in range(len(batch[-2][0])):
            print("Q: ", batch[-2][0][j])
            print("A: ", prediction[j])
            print("Gold A: ", batch[-1][0][j])
            print("---")
        print("")

    assert len(list_of_ids) == len(predictions)
    official_predictions = []
    for ids, pred in zip(list_of_ids, predictions):
        official_predictions.append({
            "id": ids[0],
            "turn_id": ids[1],
            "answer": pred
        })
    with open("model_prediction.json", "w", encoding="utf8") as f:
        json.dump(official_predictions, f)

    f1 = CoQAEval.compute_turn_score_seq(predictions)
    log.warning("Test F1: {:.3f}".format(f1 * 100.0))
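# Hedged usage sketch (not part of the repo): model_prediction.json written
# above follows the official CoQA submission format, a JSON list of records
# shaped like {"id": <story id>, "turn_id": <int>, "answer": <str>}.
import json

with open('model_prediction.json', 'r', encoding='utf8') as f:
    official_predictions = json.load(f)
for record in official_predictions[:3]:
    print(record['id'], record['turn_id'], record['answer'])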
# Fragment: runs inside an outer loop over dialog turns; `turn`/`turn_id`,
# `preds`, `evals`, `golds`, `pds`, `predicted_answers`, and `gold_answers`
# are bound by the enclosing scope.
for article in data:
    if turn_id > len(article['questions']):
        continue
    gold = [article['answers'][turn_id - 1]['input_text']]
    gold += [
        article['additional_answers'][key][turn_id - 1]['input_text']
        for key in article['additional_answers']
    ]
    golds.append(gold)
    pds.append(preds[article['id']][turn_id])
    print('gold_answer: %s\nprediction: %s\n' % (gold, pds[-1]))

evals[turn] = dict()
evals[turn]['F1'] = CoQAEvaluator.compute_turn_score_seq(golds, pds)
output = get_yesno_recall_precision(pds, golds)
evals[turn].update(output)
# print(json.dumps(evals[turn_id], indent=4))
predicted_answers.extend(pds)
gold_answers.extend(golds)

print('=============================Single Turn Evaluation==========================================')
print(json.dumps(evals, indent=4))
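# Hedged sketch (an assumption, not the repo's get_yesno_recall_precision):
# one plausible implementation, classifying each prediction and each gold
# answer set into y/n/u/x with judge_yesno (defined elsewhere in this repo)
# and reporting per-class precision and recall.
def get_yesno_recall_precision(preds, golds):
    pred_types = [judge_yesno([p]) for p in preds]
    gold_types = [judge_yesno(g) for g in golds]
    out = dict()
    for t in ('y', 'n', 'u'):
        true_pos = sum(1 for p, g in zip(pred_types, gold_types) if p == g == t)
        n_pred = pred_types.count(t)
        n_gold = gold_types.count(t)
        out['%s_precision' % t] = true_pos / n_pred if n_pred else 0.0
        out['%s_recall' % t] = true_pos / n_gold if n_gold else 0.0
    return out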
def main():
    log.info('[program starts.]')
    opt = vars(args)  # changing opt will change args
    train, train_embedding, opt = load_train_data(opt)
    dev, dev_embedding = load_dev_data(opt)
    opt['num_features'] += args.explicit_dialog_ctx * 3  # dialog_act + previous answer
    if not opt['use_elmo']:
        opt['elmo_batch_size'] = 0
    CoQAEval = CoQAEvaluator("CoQA/coqa-dev-v1.0.json")
    log.info('[Data loaded.]')

    if args.resume:
        log.info('[loading previous model...]')
        checkpoint = torch.load(args.resume)
        if args.resume_options:
            opt = checkpoint['config']
        state_dict = checkpoint['state_dict']
        model = QAModel(opt, train_embedding, state_dict)
        epoch_0 = checkpoint['epoch'] + 1
        for i in range(checkpoint['epoch']):
            random.shuffle(list(range(len(train))))  # synchronize random seed
        if args.reduce_lr:
            lr_decay(model.optimizer, lr_decay=args.reduce_lr)
    else:
        model = QAModel(opt, train_embedding)
        epoch_0 = 1

    if args.pretrain:
        pretrain_model = torch.load(args.pretrain)
        state_dict = pretrain_model['state_dict']['network']
        model.get_pretrain(state_dict)

    model.setup_eval_embed(dev_embedding)
    log.info("[dev] Total number of params: {}".format(model.total_param))
    if args.cuda:
        model.cuda()

    if args.resume:
        # Re-evaluate the resumed model to initialize the best dev score.
        batches = BatchGen_CoQA(dev,
                                batch_size=args.batch_size,
                                evaluation=True,
                                gpu=args.cuda,
                                dialog_ctx=args.explicit_dialog_ctx)
        predictions = []
        for batch in batches:
            phrases, noans = model.predict(batch)
            predictions.extend(phrases)
        f1 = CoQAEval.compute_turn_score_seq(predictions)
        log.info("[dev F1: {:.3f}]".format(f1))
        best_val_score = f1
    else:
        best_val_score = 0.0

    for epoch in range(epoch_0, epoch_0 + args.epoches):
        log.warning('Epoch {}'.format(epoch))
        # train
        batches = BatchGen_CoQA(train,
                                batch_size=args.batch_size,
                                gpu=args.cuda,
                                dialog_ctx=args.explicit_dialog_ctx,
                                precompute_elmo=args.elmo_batch_size // args.batch_size)
        start = datetime.now()
        for i, batch in enumerate(batches):
            model.update(batch)
            if i % args.log_per_updates == 0:
                log.info('updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format(
                    model.updates, model.train_loss.avg,
                    str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0]))
        # eval
        if epoch % args.eval_per_epoch == 0:
            batches = BatchGen_CoQA(dev,
                                    batch_size=args.batch_size,
                                    evaluation=True,
                                    gpu=args.cuda,
                                    dialog_ctx=args.explicit_dialog_ctx,
                                    precompute_elmo=args.elmo_batch_size // args.batch_size)
            predictions = []
            for batch in batches:
                phrases = model.predict(batch)
                predictions.extend(phrases)
            f1 = CoQAEval.compute_turn_score_seq(predictions)
        # save
        if args.save_best_only:
            if f1 > best_val_score:
                best_val_score = f1
                model_file = os.path.join(model_dir, 'best_model.pt')
                model.save(model_file, epoch)
                log.info('[new best model saved.]')
        else:
            model_file = os.path.join(model_dir, 'checkpoint_epoch_{}.pt'.format(epoch))
            model.save(model_file, epoch)
            if f1 > best_val_score:
                best_val_score = f1
                # model_file already includes model_dir, so copy it directly.
                copyfile(model_file, os.path.join(model_dir, 'best_model.pt'))
                log.info('[new best model saved.]')
        log.warning("Epoch {} - dev F1: {:.3f} (Best F1: {:.3f})".format(
            epoch, f1 * 100.0, best_val_score * 100.0))
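# Hedged sketch (an assumption, not the repo's actual helper): an lr_decay
# function matching the call in main() above typically scales every param
# group's learning rate of a torch.optim optimizer in place.
def lr_decay(optimizer, lr_decay):
    for param_group in optimizer.param_groups:
        param_group['lr'] *= lr_decay
    return optimizer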
def main():
    log.info("\n\n\nScores on the dev set are meaningless.\n"
             "Dev data has been added to train data for pretraining.\n"
             "Do not use this script to train CoQA.\n\n\n")
    log.info('[program starts.]')
    log.info('seed: {}'.format(args.seed))
    log.info(str(vars(args)))

    opt = vars(args)  # changing opt will change args
    train, train_embedding, opt = load_train_data(opt)
    dev, dev_embedding = load_dev_data(opt)
    opt['num_features'] += args.explicit_dialog_ctx * 3  # dialog_act + previous answer
    if not opt['use_elmo']:
        opt['elmo_batch_size'] = 0
    CoQAEval = CoQAEvaluator("CoQA/dev.json")
    log.info('[Data loaded.]')

    if args.resume:
        log.info('[loading previous model...]')
        # Remap checkpoint storages to the device we are actually running on.
        if args.cuda:
            checkpoint = torch.load(args.resume, map_location={'cpu': 'cuda:0'})
        else:
            checkpoint = torch.load(args.resume, map_location={'cuda:0': 'cpu'})
        if args.resume_options:
            opt = checkpoint['config']
        state_dict = checkpoint['state_dict']
        model = QAModel(opt, train_embedding, state_dict)
        epoch_0 = checkpoint['epoch'] + 1
        for i in range(checkpoint['epoch']):
            random.shuffle(list(range(len(train))))  # synchronize random seed
        if args.reduce_lr:
            lr_decay(model.optimizer, lr_decay=args.reduce_lr)
    else:
        model = QAModel(opt, train_embedding)
        epoch_0 = 1

    if args.pretrain:
        pretrain_model = torch.load(args.pretrain)
        state_dict = pretrain_model['state_dict']['network']
        model.get_pretrain(state_dict)

    model.setup_eval_embed(dev_embedding)
    log.info("[dev] Total number of params: {}".format(model.total_param))
    if args.cuda:
        model.cuda()

    if args.resume:
        batches = BatchGen_CoQA(dev,
                                batch_size=args.batch_size,
                                evaluation=True,
                                gpu=args.cuda,
                                dialog_ctx=args.explicit_dialog_ctx,
                                use_bert=args.use_bert)
        predictions = []
        for batch in batches:
            if batch is None:
                continue
            phrases, noans = model.predict(batch)
            predictions.extend(phrases)
        f1 = CoQAEval.compute_turn_score_seq(predictions)
        log.info("[dev F1: {:.3f}]".format(f1))
        best_val_score = f1
    else:
        best_val_score = 0.0

    # When fine-tuning BERT, gradients are accumulated over several batches
    # before each optimizer step.
    aggregate_grad_steps = 1
    if opt['use_bert']:
        aggregate_grad_steps = opt['aggregate_grad_steps']

    for epoch in range(epoch_0, epoch_0 + args.epoches):
        log.warning('Epoch {}'.format(epoch))
        # train
        batches = BatchGen_CoQA(train,
                                batch_size=args.batch_size,
                                gpu=args.cuda,
                                dialog_ctx=args.explicit_dialog_ctx,
                                precompute_elmo=args.elmo_batch_size // args.batch_size,
                                use_bert=args.use_bert)
        start = datetime.now()
        total_batches = len(batches)
        loss = 0
        model.optimizer.zero_grad()
        if opt['finetune_bert']:
            model.bertadam.zero_grad()
        for i, batch in enumerate(batches):
            if batch is None:
                continue
            model.update(batch)
            if (i + 1) % aggregate_grad_steps == 0 or total_batches == (i + 1):
                # Apply the accumulated gradients.
                model.take_step()
                loss = 0
            if i % args.log_per_updates == 0:
                log.info('updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format(
                    model.updates, model.train_loss.avg,
                    str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0]))
        # eval
        if epoch % args.eval_per_epoch == 0:
            batches = BatchGen_CoQA(dev,
                                    batch_size=args.batch_size,
                                    evaluation=True,
                                    gpu=args.cuda,
                                    dialog_ctx=args.explicit_dialog_ctx,
                                    precompute_elmo=args.elmo_batch_size // args.batch_size,
                                    use_bert=args.use_bert)
            predictions = []
            for batch in batches:
                if batch is None:
                    continue
                phrases = model.predict(batch)
                predictions.extend(phrases)
            f1 = CoQAEval.compute_turn_score_seq(predictions)
        # save
        if args.save_best_only:
            if f1 > best_val_score:
                best_val_score = f1
                model_file = os.path.join(model_dir, 'best_model.pt')
                model.save(model_file, epoch)
                log.info('[new best model saved.]')
        else:
            model_file = os.path.join(model_dir, 'checkpoint_epoch_{}.pt'.format(epoch))
            model.save(model_file, epoch)
            if f1 > best_val_score:
                best_val_score = f1
                # model_file already includes model_dir, so copy it directly.
                copyfile(model_file, os.path.join(model_dir, 'best_model.pt'))
                log.info('[new best model saved.]')
        log.warning("Epoch {} - dev F1: {:.3f} (Best F1: {:.3f})".format(
            epoch, f1 * 100.0, best_val_score * 100.0))
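# Hedged sketch (an assumption, not this repo's model.update/model.take_step):
# the gradient-accumulation pattern used above, written out in plain PyTorch.
# The loss is scaled by the number of accumulation steps so the accumulated
# gradient matches that of one large batch.
def train_epoch(model, optimizer, batches, loss_fn, aggregate_grad_steps=4):
    optimizer.zero_grad()
    total_batches = len(batches)
    for i, (inputs, targets) in enumerate(batches):
        loss = loss_fn(model(inputs), targets) / aggregate_grad_steps
        loss.backward()  # gradients accumulate across backward() calls
        if (i + 1) % aggregate_grad_steps == 0 or (i + 1) == total_batches:
            optimizer.step()
            optimizer.zero_grad()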
# Fragment: body of a loop over articles; `rows`, `article`, `context`, `out`,
# `ex`, and `golds` are bound by the enclosing scope.
# Normalize capitalized Yes/No/Unknown answers to the evaluator's lowercase form.
e_tmp = list()
for row in rows:
    if row[2] == 'Unknown':
        e_tmp.append('unknown')
    elif row[2] == 'Yes':
        e_tmp.append('yes')
    elif row[2] == 'No':
        e_tmp.append('no')
    else:
        e_tmp.append(row[2])
ex.extend(e_tmp)

q_text = [row[1] for row in rows]
answer = [row[2] for row in rows]
answer_start = [row[3] for row in rows]
answer_end = [row[4] for row in rows]
rationale = [row[5] for row in rows]
rationale_start = [row[6] for row in rows]
rationale_end = [row[7] for row in rows]
answer_choice = [row[8] for row in rows]
out.append({
    'context': context,
    'story_id': article['id'],
    'q_text': q_text,
    'answer': answer,
    'answer_start': answer_start,
    'answer_end': answer_end,
    'rationale': rationale,
    'rationale_start': rationale_start,
    'rationale_end': rationale_end,
    'answer_choice': answer_choice
})

F1 = CoQAEvaluator.compute_turn_score_seq(golds, ex)
print('F1: %f' % F1)
with open(args.output_file, 'w') as f:
    json.dump(out, f, indent=2)
import json
import argparse
from collections import Counter

from CoQA_eval import CoQAEvaluator

parser = argparse.ArgumentParser()
parser.add_argument('-p', '--predict_file')
parser.add_argument('-d', '--dev_file')
args = parser.parse_args()
opt = vars(args)

with open(opt['dev_file'], 'r') as f:
    dev_file = json.load(f)
evaluator = CoQAEvaluator(dev_file)


def judge_yesno(gold_list):
    """Return the majority answer type among the gold answers:
    'y' (yes), 'n' (no), 'u' (unknown), or 'x' (free-form)."""
    yesno_dict = Counter()
    for gold in gold_list:
        norm_text = CoQAEvaluator.normalize_answer(gold)
        if norm_text == 'yes':
            yesno_dict['y'] += 1
        elif norm_text == 'no':
            yesno_dict['n'] += 1
        elif norm_text == 'unknown':
            yesno_dict['u'] += 1
        else:
            yesno_dict['x'] += 1
    return yesno_dict.most_common(1)[0][0]
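# Hedged usage sketch: judge_yesno majority-votes the answer type over one
# turn's gold answers. The strings below are made-up examples, not CoQA data.
print(judge_yesno(['Yes', 'yes', 'no']))         # -> 'y'
print(judge_yesno(['Unknown', 'unknown']))       # -> 'u'
print(judge_yesno(['in the garden', 'garden']))  # -> 'x'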
with open(args.data_file, 'r') as f:
    data_file = json.load(f)['data']

# For each turn, gather the original gold answer plus all additional answers.
data = []
for article in data_file:
    story_id = article['id']
    answers = article['answers']
    additional_answers = article['additional_answers']
    for i, answer in enumerate(answers):
        tmp = [answer['input_text']]
        tmp.extend([
            additional_answers[key][i]['input_text']
            for key in additional_answers
        ])
        data.append(tmp)

with open(args.pred_file, 'r') as f:
    pred_file = json.load(f)

# Collect the turns where the prediction scores at most 0.7 F1.
output = []
for i, (gold, pre) in enumerate(zip(data, pred_file)):
    f1 = CoQAEvaluator._compute_turn_score(gold, pre['answer'])['f1']
    if f1 <= 0.7:
        output.append({
            'id': pre['id'],
            'turn_id': pre['turn_id'],
            'gold': gold,
            'pred': pre['answer']
        })
print(json.dumps(output, indent=2))
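# Hedged sketch (an assumption, not CoQA_eval's actual code): the SQuAD-style
# token-overlap F1 that scores like the one above are built on. The real
# evaluator also normalizes answers (lowercasing, stripping articles and
# punctuation), which is omitted here.
from collections import Counter

def token_f1(pred, gold):
    pred_toks, gold_toks = pred.split(), gold.split()
    common = Counter(pred_toks) & Counter(gold_toks)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    return 2 * precision * recall / (precision + recall)

def best_f1_over_golds(pred, golds):
    # A turn's score is the best match against any of its gold answers.
    return max(token_f1(pred, g) for g in golds)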
import json
import argparse
import logging

from CoQA_eval import CoQAEvaluator

logger = logging.getLogger()

parser = argparse.ArgumentParser(description='experiments on datasets')
parser.add_argument('--pred_file')
parser.add_argument('--data_file')
args = parser.parse_args()

# Predictions are stored as a mapping; keep the values in iteration order.
with open(args.pred_file, 'r') as f:
    data = json.load(f)
predictions = []
for answer in data:
    predictions.append(data[answer])

with open(args.data_file, 'r') as f:
    data = json.load(f)['data']

# For each turn, gather the original gold answer plus the additional answers.
ground_truth = []
for article in data:
    answers = [[answer['input_text']] for answer in article['answers']]
    add_answers = article['additional_answers']
    for key in add_answers:
        for i, additional_answer in enumerate(add_answers[key]):
            answers[i].append(additional_answer['input_text'])
    ground_truth.extend(answers)

F1 = CoQAEvaluator.compute_turn_score_seq(ground_truth, predictions)
print("F1: %f" % F1)
parser = argparse.ArgumentParser()
parser.add_argument('--with_flow')
parser.add_argument('--no_flow')
parser.add_argument('--dev')
parser.add_argument('--output_dir')
args = parser.parse_args()
opt = vars(args)

with open(opt['with_flow'], 'r') as f:
    with_flow = json.load(f)
with open(opt['no_flow'], 'r') as f:
    no_flow = json.load(f)
with open(opt['dev'], 'r') as f:
    dev_set = json.load(f)
evaluator = CoQAEvaluator(dev_set)

# Map (story id, turn id) to the question/answer text for quick lookup.
dev_map = dict()
for data in dev_set['data']:
    questions = data['questions']
    answers = data['answers']
    for question, answer in zip(questions, answers):
        dev_map[(data['id'], question['turn_id'])] = (question['input_text'],
                                                      answer['input_text'])

# Buckets for turns where the flow model wins, loses, or ties.
flow_higher = []
no_flow_higher = []
equality = []
max_f1 = AverageMeter()
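# Hedged sketch (an assumption about this repo's utility class): AverageMeter
# is the common PyTorch-example helper that tracks a running value, sum,
# count, and average.
class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.sum = 0
        self.count = 0
        self.avg = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count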