import json
import os
import pickle

import torch
from tqdm import tqdm

# RawResult, to_list, read_candidates_from_one_split, compute_pred_dict,
# get_metrics_as_dict and make_submission are project-level helpers defined
# elsewhere in this repo.


def evaluate(model, args, dev_features, device, global_steps):
    # Eval!
    print("***** Running evaluation *****")
    model.eval()
    all_results = []
    # `eval_dataloader` is expected to come from the enclosing scope, built from
    # `dev_features` (see the sketch after this function).
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            input_ids, input_mask, segment_ids, example_indices = batch
            inputs = {
                'input_ids': input_ids,
                'attention_mask': input_mask,
                'token_type_ids': segment_ids
            }
            outputs = model(**inputs)
        for i, example_index in enumerate(example_indices):
            eval_feature = dev_features[example_index.item()]
            unique_id = str(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]),
                               answer_type_logits=to_list(outputs[2][i]))
            all_results.append(result)

    # Cache raw model outputs so predictions can be recomputed without rerunning inference.
    with open(os.path.join(args.output_dir, 'RawResults.pkl'), 'wb') as f:
        pickle.dump(all_results, f)
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults.pkl'), 'rb'))

    candidates_dict = read_candidates_from_one_split(args.predict_file)
    nq_pred_dict = compute_pred_dict(candidates_dict, dev_features,
                                     [r._asdict() for r in all_results],
                                     args.n_best_size, args.max_answer_length)
    output_prediction_file = os.path.join(
        args.output_dir, 'predictions' + str(global_steps) + '.json')
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)

    results = get_metrics_as_dict(args.predict_file, output_prediction_file)
    print('Steps:{}'.format(global_steps))
    print(json.dumps(results, indent=2))
    model.train()
    return results
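# --- Illustrative sketch, not part of the original file. ---
# `evaluate` above reads `eval_dataloader` from the enclosing scope. A minimal
# sketch of how it could be built from `dev_features` is given below; the
# feature attribute names (`input_ids`, `input_mask`, `segment_ids`) and the
# `eval_batch_size` argument are assumptions based on the field names the code
# above uses, not the repo's confirmed API.
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset


def build_eval_dataloader(dev_features, eval_batch_size):
    """Hypothetical helper: wrap padded dev features into a sequential DataLoader."""
    all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
    # The fourth tensor is the feature index used above to look up dev_features[example_index.item()].
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    return DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=eval_batch_size)
# (`to_list(t)` used throughout is typically just `t.detach().cpu().tolist()`.)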
# Tail of the test-time prediction path. The start of this routine (the model
# loop and the top-k keyword arguments of this RawResult call) is truncated in
# this excerpt; only the closing fields and the post-processing survive.
            result = RawResult(
                # ... truncated top-k start/end logits and index fields ...
                answer_type_logits=to_list(outputs['answer_type_logits'][i]),
                long_cls_logits=outputs['long_cls_logits'][i].cpu().numpy(),
                short_cls_logits=outputs['short_cls_logits'][i].cpu().numpy())
            all_results.append(result)

    # pickle.dump(all_results, open(os.path.join(args.output_dir, 'RawResults_test.pkl'), 'wb'))
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults_test.pkl'), 'rb'))

    print("Going to candidates file")
    candidates_dict = read_candidates_from_one_split(args.predict_file)
    print("Compute_pred_dict")
    nq_pred_dict = compute_pred_dict(candidates_dict, features,
                                     [r._asdict() for r in all_results],
                                     args.n_best_size, args.max_answer_length,
                                     topk_pred=True, long_n_top=5, short_n_top=5)
    output_prediction_file = os.path.join(args.output_dir, 'test_predictions.json')
    print("Saving predictions to", output_prediction_file)
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)
    make_submission(output_prediction_file, args.output_dir)
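# --- Illustrative only: quick sanity check of the file written above. ---
# `test_predictions.json` holds a single JSON object with a "predictions" list,
# exactly as written by the code above; `peek_test_predictions` itself is a
# hypothetical helper, not part of the repo.
def peek_test_predictions(output_dir):
    with open(os.path.join(output_dir, 'test_predictions.json')) as f:
        preds = json.load(f)['predictions']
    print('num test predictions:', len(preds))
    return preds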
# Tail of another evaluation variant; the enclosing batch loop is truncated in
# this excerpt. Note that here the features list is named `features` (not
# `dev_features`) and `unique_id` is kept as an int.
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]),
                               answer_type_logits=to_list(outputs[2][i]))
            all_results.append(result)

    with open(os.path.join(args.output_dir, 'RawResults.pkl'), 'wb') as f:
        pickle.dump(all_results, f)
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults.pkl'), 'rb'))

    print("Going to candidates file")
    candidates_dict = read_candidates_from_one_split(args.predict_file)
    print("Compute_pred_dict")
    nq_pred_dict = compute_pred_dict(candidates_dict, features,
                                     [r._asdict() for r in all_results],
                                     args.n_best_size, args.max_answer_length)
    output_prediction_file = os.path.join(args.output_dir, 'predictions.json')
    print("Saving predictions to", output_prediction_file)
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)

    print("Computing f1 score")
    results = get_metrics_as_dict(args.predict_file, output_prediction_file)
    print(json.dumps(results, indent=2))
def evaluate(model, args, dev_features, device, ei):
    # Eval!
    # Reuse cached raw results for ensemble member `ei` if they exist; otherwise
    # run inference and cache them.
    raw_results_path = os.path.join(args.output_dir,
                                    'RawResults_ensemble{}.pkl'.format(ei))
    if os.path.exists(raw_results_path):
        with open(raw_results_path, 'rb') as f:
            all_results = pickle.load(f)
    else:
        all_results = []
        model.eval()
        for batch in tqdm(eval_dataloader, desc="Evaluating Ensemble-{}".format(ei)):
            batch = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                input_ids, input_mask, segment_ids, example_indices = batch
                inputs = {
                    'input_ids': input_ids,
                    'attention_mask': input_mask,
                    'token_type_ids': segment_ids
                }
                outputs = model(**inputs)
            for i, example_index in enumerate(example_indices):
                eval_feature = dev_features[example_index.item()]
                unique_id = str(eval_feature.unique_id)
                result = RawResult(
                    unique_id=unique_id,
                    # [topk]
                    long_start_topk_logits=outputs['long_start_topk_logits'][i].cpu().numpy(),
                    long_start_topk_index=outputs['long_start_topk_index'][i].cpu().numpy(),
                    long_end_topk_logits=outputs['long_end_topk_logits'][i].cpu().numpy(),
                    long_end_topk_index=outputs['long_end_topk_index'][i].cpu().numpy(),
                    # [topk, topk]
                    short_start_topk_logits=outputs['short_start_topk_logits'][i].cpu().numpy(),
                    short_start_topk_index=outputs['short_start_topk_index'][i].cpu().numpy(),
                    short_end_topk_logits=outputs['short_end_topk_logits'][i].cpu().numpy(),
                    short_end_topk_index=outputs['short_end_topk_index'][i].cpu().numpy(),
                    answer_type_logits=to_list(outputs['answer_type_logits'][i]),
                    long_cls_logits=outputs['long_cls_logits'][i].cpu().numpy(),
                    short_cls_logits=outputs['short_cls_logits'][i].cpu().numpy())
                all_results.append(result)

        with open(raw_results_path, 'wb') as f:
            pickle.dump(all_results, f)

    candidates_dict = read_candidates_from_one_split(args.predict_file)
    nq_pred_dict = compute_pred_dict(candidates_dict, dev_features,
                                     [r._asdict() for r in all_results],
                                     args.n_best_size, args.max_answer_length,
                                     topk_pred=True, long_n_top=5, short_n_top=5,
                                     ensemble=True)
    output_prediction_file = os.path.join(args.output_dir,
                                          'predictions{}.json'.format(ei))
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)
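# --- Illustrative sketch, not part of the original file. ---
# A minimal driver for the ensemble `evaluate` above, one call per ensemble
# member. Only the `evaluate(model, args, dev_features, device, ei)` signature
# comes from the code above; how checkpoints are located and loaded
# (`checkpoint_paths`, `load_model`) is an assumption, not the repo's API. Each
# call writes `predictions{ei}.json` to `args.output_dir` for downstream merging.
def run_ensemble_evaluation(args, dev_features, device, checkpoint_paths, load_model):
    """Hypothetical driver: `load_model(path)` is assumed to return a ready model."""
    for ei, ckpt in enumerate(checkpoint_paths):
        model = load_model(ckpt)
        model.to(device)
        evaluate(model, args, dev_features, device, ei)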