def evaluate(model, args, dev_features, device, global_steps):
    # Eval!
    print("***** Running evaluation *****")
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            input_ids, input_mask, segment_ids, example_indices = batch
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': segment_ids}
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = dev_features[example_index.item()]
            unique_id = str(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               example_id=eval_feature.example_index,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]),
                               answer_type_logits=to_list(outputs[2][i]))
            all_results.append(result)

    pickle.dump(all_results, open(os.path.join(args.output_dir, 'RawResults.pkl'), 'wb'))
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults.pkl'), 'rb'))

    # print("Going to candidates file")
    candidates_dict = read_candidates_from_one_split(args.predict_file)

    # print("Compute_pred_dict")
    nq_pred_dict = compute_pred_dict(candidates_dict, dev_features,
                                     [r._asdict() for r in all_results],
                                     args.n_best_size, args.max_answer_length)
    output_prediction_file = os.path.join(
        args.output_dir, 'predictions' + str(global_steps) + '.json')

    # print("Saving predictions to", output_prediction_file)
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)

    print("Computing f1 score")
    results = get_metrics_as_dict(args.predict_file, output_prediction_file)
    print('Steps:{}'.format(global_steps))
    print(json.dumps(results, indent=2))
    model.train()
    return results
def evaluate(model, args, dev_features, device, global_steps):
    # Eval!
    print("***** Running evaluation *****")
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            input_ids, input_mask, segment_ids, example_indices = batch
            inputs = {
                'input_ids': input_ids,
                'attention_mask': input_mask,
                'token_type_ids': segment_ids
            }
            start_logits, end_logits, answer_type_logits = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = dev_features[example_index.item()]
            unique_id = str(eval_feature.unique_id)
            result = RawResult(
                unique_id=unique_id,
                short_start_logits=start_logits[i].cpu().numpy(),
                short_end_logits=end_logits[i].cpu().numpy(),
                answer_type_logits=answer_type_logits[i].cpu().numpy())
            all_results.append(result)

    pickle.dump(all_results, open(os.path.join(args.output_dir, 'RawResults.pkl'), 'wb'))
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults.pkl'), 'rb'))

    nq_pred_dict = compute_short_pred(dev_features, all_results,
                                      args.n_best_size, args.max_answer_length)
    nq_pred_dict = convert_short_pred(nq_pred_dict)
    output_prediction_file = os.path.join(
        args.output_dir, 'predictions' + str(global_steps) + '.json')
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)

    results = get_metrics_as_dict(args.predict_file, output_prediction_file)
    print('Steps:{}'.format(global_steps))
    print(json.dumps(results, indent=2))
    model.train()
    return results
def evaluate(model, args, dev_features, device):
    # Eval!
    print("***** Running evaluation *****")
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            input_ids, input_mask, segment_ids, example_indices = batch
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': segment_ids}
            start_logits, end_logits = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = dev_features[example_index.item()]
            unique_id = str(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               long_start_logits=start_logits[i].cpu().numpy(),
                               long_end_logits=end_logits[i].cpu().numpy())
            all_results.append(result)

    if args.is_test:
        pickle.dump(all_results, open(os.path.join(args.output_dir, 'test_long_RawResults.pkl'), 'wb'))
    else:
        pickle.dump(all_results, open(os.path.join(args.output_dir, 'dev_long_RawResults.pkl'), 'wb'))
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'dev_long_RawResults.pkl'), 'rb'))

    for th in args.thresholds:
        print('UNK type threshold:', th)
        ground_truth_dict = load_all_annotations_from_dev(args.predict_file, is_test=args.is_test)
        nq_pred_dict = compute_long_pred(ground_truth_dict, dev_features, all_results,
                                         args.n_best_size, th)

        if args.is_test:
            output_prediction_file = os.path.join(args.output_dir, 'test_long_predictions.json')
        else:
            output_prediction_file = os.path.join(args.output_dir, 'dev_long_predictions.json')
        with open(output_prediction_file, 'w') as f:
            json.dump({'predictions': list(nq_pred_dict.values())}, f)

        if not args.is_test:
            results = get_metrics_as_dict(args.predict_file, output_prediction_file)
            print(json.dumps(results, indent=2))
def get_ensemble_result(args):
    ensemble_names = [
        init_restore_dir.split('/')[-2].split('-')[-1]
        for init_restore_dir in args.init_restore_dir
    ]
    if args.is_test:
        all_preds = [
            os.path.join(args.output_dir, 'test_short_predictions_' + e + '.json')
            for e in ensemble_names
        ]
    else:
        all_preds = [
            os.path.join(args.output_dir, 'dev_short_predictions_' + e + '.json')
            for e in ensemble_names
        ]

    output_prediction_file = None
    for th in args.thresholds:
        for yesno_th in args.yesno_thresholds:
            print('UNK type threshold:', th, 'YESNO threshold:', yesno_th)
            ensemble_pred_dict = short_ensemble_combine(
                all_preds, th, yesno_th, args.long_pred_file)
            long_short_combined_pred = combine_long_short(
                ensemble_pred_dict, args.long_pred_file)

            if args.is_test:
                output_prediction_file = os.path.join(
                    args.output_dir, 'all_test_short_predictions.json')
            else:
                output_prediction_file = os.path.join(
                    args.output_dir, 'all_dev_short_predictions.json')
            with open(output_prediction_file, 'w') as f:
                json.dump({'predictions': long_short_combined_pred}, f)

            if not args.is_test:
                results = get_metrics_as_dict(args.predict_file, output_prediction_file)
                print(json.dumps(results, indent=2))

    if args.is_test:
        make_submission(output_prediction_file, args.output_dir)
def get_ensemble_result(args):
    ensemble_names = [
        init_restore_dir.split('/')[-2].split('-')[-1]
        for init_restore_dir in args.init_restore_dir
    ]
    if args.is_test:
        all_preds = [
            os.path.join(args.output_dir, 'test_long_predictions_' + e + '.json')
            for e in ensemble_names
        ]
    else:
        all_preds = [
            os.path.join(args.output_dir, 'dev_long_predictions_' + e + '.json')
            for e in ensemble_names
        ]

    output_prediction_file = None
    for th1 in args.thresholds1:
        for th2 in args.thresholds2:
            print('UNK threshold:', th1, 'SHORT threshold', th2)
            ensemble_ls_pred_dict = ls_ensemble_combine(all_preds, th1, th2)

            if args.is_test:
                output_prediction_file = os.path.join(
                    args.output_dir, 'all_test_ls_predictions.json')
            else:
                output_prediction_file = os.path.join(
                    args.output_dir, 'all_dev_ls_predictions.json')
            with open(output_prediction_file, 'w') as f:
                json.dump({'predictions': list(ensemble_ls_pred_dict.values())}, f)

            if not args.is_test:
                results = get_metrics_as_dict(args.predict_file, output_prediction_file)
                print(json.dumps(results, indent=2))
for i, example_index in enumerate(example_indices):
    eval_feature = features[example_index.item()]
    unique_id = int(eval_feature.unique_id)
    result = RawResult(unique_id=unique_id,
                       start_logits=to_list(outputs[0][i]),
                       end_logits=to_list(outputs[1][i]),
                       answer_type_logits=to_list(outputs[2][i]))
    all_results.append(result)

pickle.dump(all_results, open(os.path.join(args.output_dir, 'RawResults.pkl'), 'wb'))
# all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults.pkl'), 'rb'))

print("Going to candidates file")
candidates_dict = read_candidates_from_one_split(args.predict_file)

print("Compute_pred_dict")
nq_pred_dict = compute_pred_dict(candidates_dict, features,
                                 [r._asdict() for r in all_results],
                                 args.n_best_size, args.max_answer_length)
output_prediction_file = os.path.join(args.output_dir, 'predictions.json')

print("Saving predictions to", output_prediction_file)
with open(output_prediction_file, 'w') as f:
    json.dump({'predictions': list(nq_pred_dict.values())}, f)

print("Computing f1 score")
results = get_metrics_as_dict(args.predict_file, output_prediction_file)
print(json.dumps(results, indent=2))
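# The snippets above rely on a couple of helpers that are defined elsewhere in
# the repo. A minimal sketch of what they might look like is given below; the
# to_list() body and the RawResult field list are assumptions inferred from how
# they are used in the snippet above (other evaluate() variants in this file use
# RawResult namedtuples with different fields).
import collections


def to_list(tensor):
    # Assumed helper: detach a tensor, move it to the CPU, and return a plain Python list.
    return tensor.detach().cpu().tolist()


RawResult = collections.namedtuple(
    'RawResult',
    ['unique_id', 'start_logits', 'end_logits', 'answer_type_logits'])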
def evaluate(model, args, dev_features, device, global_steps):
    # Eval!
    print("***** Running evaluation *****")
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            input_ids, input_mask, segment_ids, example_indices = batch
            inputs = {
                'input_ids': input_ids,
                'attention_mask': input_mask,
                'token_type_ids': segment_ids
            }
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = dev_features[example_index.item()]
            unique_id = str(eval_feature.unique_id)
            result = RawResult(
                unique_id=unique_id,
                # [topk]
                long_start_topk_logits=outputs['long_start_topk_logits'][i].cpu().numpy(),
                long_start_topk_index=outputs['long_start_topk_index'][i].cpu().numpy(),
                long_end_topk_logits=outputs['long_end_topk_logits'][i].cpu().numpy(),
                long_end_topk_index=outputs['long_end_topk_index'][i].cpu().numpy(),
                # [topk, topk]
                short_start_topk_logits=outputs['short_start_topk_logits'][i].cpu().numpy(),
                short_start_topk_index=outputs['short_start_topk_index'][i].cpu().numpy(),
                short_end_topk_logits=outputs['short_end_topk_logits'][i].cpu().numpy(),
                short_end_topk_index=outputs['short_end_topk_index'][i].cpu().numpy(),
                answer_type_logits=to_list(outputs['answer_type_logits'][i]),
                long_cls_logits=outputs['long_cls_logits'][i].cpu().numpy(),
                short_cls_logits=outputs['short_cls_logits'][i].cpu().numpy())
            all_results.append(result)

    pickle.dump(all_results,
                open(os.path.join(args.output_dir, 'RawResults_dev.pkl'), 'wb'))
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults_dev.pkl'), 'rb'))

    candidates_dict = read_candidates_from_one_split(args.predict_file)
    nq_pred_dict = compute_pred_dict(candidates_dict, dev_features,
                                     [r._asdict() for r in all_results],
                                     args.n_best_size, args.max_answer_length,
                                     topk_pred=True, long_n_top=5, short_n_top=5)
    output_prediction_file = os.path.join(
        args.output_dir, 'predictions' + str(global_steps) + '.json')
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)

    results = get_metrics_as_dict(args.predict_file, output_prediction_file)
    print('Steps:{}'.format(global_steps))
    print(json.dumps(results, indent=2))
    model.train()
    return results
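# Note: eval_dataloader is used inside the evaluate() functions above but is
# neither a parameter nor defined locally, so it must exist in the enclosing
# scope. A minimal sketch of how it might be built from dev_features is given
# below; the feature attribute names (input_ids, input_mask, segment_ids) and
# the batch layout are assumptions inferred from the batch unpacking above.
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset


def build_eval_dataloader(dev_features, eval_batch_size):
    # Stack per-feature tensors; an example index tensor is appended so each
    # batch element can be mapped back to its entry in dev_features, matching
    # the example_indices unpacked in evaluate().
    all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    # SequentialSampler keeps evaluation order deterministic.
    return DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=eval_batch_size)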