                      'attention_mask': batch[1],
                      'token_type_ids': batch[2]}
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = "predictions_.json"
    output_nbest_file = "nbest_predictions_.json"
    write_predictions(examples, features, all_results, 20, max_answer_length,
                      do_lower_case, output_prediction_file, output_nbest_file,
                      output_null_log_odds_file, False, False,
                      null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=dev_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)
    print(results)
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and (args.local_rank in [-1, 0] or args.no_distributed_training):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if (args.local_rank == -1 or args.no_distributed_training) else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM doesn't use segment_ids
                      }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(unique_id=unique_id,
                                           start_top_log_probs=to_list(outputs[0][i]),
                                           start_top_index=to_list(outputs[1][i]),
                                           end_top_log_probs=to_list(outputs[2][i]),
                                           end_top_index=to_list(outputs[3][i]),
                                           cls_logits=to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(examples, features, all_results, args.n_best_size,
                                   args.max_answer_length, output_prediction_file,
                                   output_nbest_file, output_null_log_odds_file,
                                   args.predict_file, model.config.start_n_top,
                                   model.config.end_n_top, args.version_2_with_negative,
                                   tokenizer, args.verbose_logging)
    else:
        write_predictions(examples, features, all_results, args.n_best_size,
                          args.max_answer_length, args.do_lower_case,
                          output_prediction_file, output_nbest_file,
                          output_null_log_odds_file, args.verbose_logging,
                          args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)
    return results
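# All of the variants in this listing rely on a small to_list() helper that is
# not shown here. A minimal sketch (an assumption in this listing, following the
# usual convention in HuggingFace's run_squad.py) would be:
def to_list(tensor):
    # Detach from the autograd graph, move to CPU, and convert to a plain
    # Python list so the logits can be serialized by the prediction writers.
    return tensor.detach().cpu().tolist()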
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                    )

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            if args.model_type in ["xlnet", "xlm"]:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]),
                )
            else:
                result = RawResult(
                    unique_id=unique_id,
                    start_logits=to_list(outputs[0][i]),
                    end_logits=to_list(outputs[1][i]),
                )

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ["xlnet", "xlm"]:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(
            examples, features, all_results, args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.predict_file, model.config.start_n_top, model.config.end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging,
        )
    else:
        write_predictions(
            examples, features, all_results, args.n_best_size, args.max_answer_length,
            args.do_lower_case, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold,
        )

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(
        data_file=args.predict_file,
        pred_file=output_prediction_file,
        na_prob_file=output_null_log_odds_file,
    )
    results = evaluate_on_squad(evaluate_options)
    return results
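# The RawResult / RawResultExtended containers referenced throughout are plain
# namedtuples. A sketch consistent with the fields used above (an assumption
# here; the exact definitions live in the accompanying utils_squad-style module):
import collections

RawResult = collections.namedtuple(
    "RawResult", ["unique_id", "start_logits", "end_logits"])

RawResultExtended = collections.namedtuple(
    "RawResultExtended",
    ["unique_id", "start_top_log_probs", "start_top_index",
     "end_top_log_probs", "end_top_index", "cls_logits"])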
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if args.local_rank in [-1, 0] else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': None if args.model_type == 'xlm' else batch[2],
                      }
            if args.model_type in ["distilbert"]:
                del inputs["token_type_ids"]
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    write_predictions(examples, features, all_results, args.n_best_size,
                      args.max_answer_length, True, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file,
                      args.verbose_logging, args.version_2_with_negative,
                      args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)
    return results
def evaluate(args, model, tokenizer, prefix=""):
    global_rank = -1 if args.local_rank == -1 else torch.distributed.get_rank()
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
    if args.preprocess_only:
        return

    write_dir = args.output_dir if args.write_dir is None else args.write_dir
    if not os.path.exists(write_dir) and global_rank in [-1, 0]:
        os.makedirs(write_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly.
    # No distributed eval, so that we evaluate on the full dev set:
    eval_sampler = SequentialSampler(dataset)  # if global_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': None if args.model_type in ['xlm', 'roberta'] else batch[2]  # XLM doesn't use segment_ids
                      }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(unique_id=unique_id,
                                           start_top_log_probs=to_list(outputs[0][i]),
                                           start_top_index=to_list(outputs[1][i]),
                                           end_top_log_probs=to_list(outputs[2][i]),
                                           end_top_index=to_list(outputs[3][i]),
                                           cls_logits=to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(write_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(write_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(write_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        all_predictions, all_nbest_predictions, all_null_odds = write_predictions_extended(
            examples, features, all_results, args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.predict_file, model.config.start_n_top, model.config.end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        all_predictions, all_nbest_predictions, all_null_odds = write_predictions(
            examples, features, all_results, args.n_best_size, args.max_answer_length,
            args.do_lower_case, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    def filter_keys(qid_to_val, task_name):
        task_name = task_name.lower()
        assert task_name in {'hotpot', 'squad'}, 'task_name {} not implemented.'.format(task_name)
        return {qid: val for qid, val in qid_to_val.items()
                if len(qid.split('.')) == (2 if task_name == 'hotpot' else 1)}

    if len(filter_keys(all_predictions, 'squad')) == 0:
        results = {}  # No SQuAD data in evaluation set
    else:
        squad_output_prediction_file = os.path.join(write_dir, "squad_predictions_{}.json".format(prefix))
        with open(squad_output_prediction_file, 'w') as writer:
            writer.write(json.dumps(filter_keys(all_predictions, 'squad'), indent=2))
        squad_output_nbest_file = os.path.join(write_dir, "squad_nbest_predictions_{}.json".format(prefix))
        with open(squad_output_nbest_file, 'w') as writer:
            writer.write(json.dumps(filter_keys(all_nbest_predictions, 'squad'), indent=2))
        if args.version_2_with_negative:
            squad_output_null_log_odds_file = os.path.join(write_dir, "squad_null_odds_{}.json".format(prefix))
            with open(squad_output_null_log_odds_file, 'w') as writer:
                writer.write(json.dumps(filter_keys(all_null_odds, 'squad'), indent=2))
        else:
            squad_output_null_log_odds_file = None
        predict_file_parts = args.predict_file.split('/')
        squad_predict_file = '/'.join(predict_file_parts[:-2] + ['squad', predict_file_parts[-1]])
        evaluate_options = EVAL_OPTS(data_file=squad_predict_file,
                                     pred_file=squad_output_prediction_file,
                                     na_prob_file=squad_output_null_log_odds_file)
        results = evaluate_on_squad(evaluate_options)

    # Check if HotpotQA answer file exists to do HotpotQA evaluation
    hotpot_answer_file_parts = args.predict_file.split('/')
    hotpot_answer_file_parts[-2] = 'hotpot-orig'
    hotpot_answer_file = '/'.join(hotpot_answer_file_parts)
    if (not args.no_answer_file) and (not os.path.exists(hotpot_answer_file)):
        with open(os.path.join(write_dir, "squad_results_{}.json".format(prefix)), "w") as writer:
            writer.write(json.dumps(results, indent=2, sort_keys=True))
        return results

    # Evaluate with official HotpotQA script
    nbest_predictions = filter_keys(all_nbest_predictions, 'hotpot')
    null_odds = filter_keys(all_null_odds, 'hotpot')
    qids = {single_hop_qid.split('.')[0] for single_hop_qid in nbest_predictions.keys()}
    pred_answers_and_sps = {'answer': {}, 'sp': {}}
    globally_normed_pred_answers_and_sps = {'answer': {}, 'sp': {}}
    pred_infos = {}
    globally_normed_pred_infos = {}
    max_num_paragraphs = 10
    for qid in qids:
        # Find paragraph with answer prediction
        min_null_odds = float('inf')
        max_logit_sum = float('-inf')
        best_single_hop_qid = None
        for paragraph_no in range(max_num_paragraphs):
            single_hop_qid = qid + '.' + str(paragraph_no)
            if (single_hop_qid in null_odds) and (null_odds[single_hop_qid] < min_null_odds):
                best_single_hop_qid = single_hop_qid
                min_null_odds = null_odds[single_hop_qid]
            if single_hop_qid in nbest_predictions:
                for nbest_prediction in nbest_predictions[single_hop_qid]:
                    if (len(nbest_prediction['text']) > 0) and (args.model_type not in ['xlnet', 'xlm']):
                        logit_sum = (nbest_prediction['start_logit'] + nbest_prediction['end_logit']
                                     - null_odds[single_hop_qid])
                        if logit_sum > max_logit_sum:
                            globally_normed_pred_answers_and_sps['answer'][qid] = nbest_prediction['text']
                            globally_normed_pred_infos[qid] = nbest_prediction
                            max_logit_sum = logit_sum

        # Find/store answer and supporting fact
        pred_answers_and_sps['sp'][qid] = []  # NB: Dummy supporting fact for now
        globally_normed_pred_answers_and_sps['sp'][qid] = []  # NB: Dummy supporting fact for now
        for nbest_prediction in nbest_predictions[best_single_hop_qid]:
            if len(nbest_prediction['text']) > 0:
                pred_answers_and_sps['answer'][qid] = nbest_prediction['text']
                pred_infos[qid] = nbest_prediction
                break
        assert qid in pred_answers_and_sps['answer'], 'Error: No predicted answer found.'
        # assert qid in globally_normed_pred_answers_and_sps['answer'], 'Error: No globally normed predicted answer found.'

    hotpot_output_prediction_file = os.path.join(write_dir, "hotpot_predictions_{}.json".format(prefix))
    with open(hotpot_output_prediction_file, "w") as writer:
        writer.write(json.dumps(pred_answers_and_sps, indent=2))
    hotpot_results = evaluate_on_hotpot(hotpot_output_prediction_file, hotpot_answer_file) if not args.no_answer_file else {}
    with open(os.path.join(write_dir, "hotpot_predictions_info_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(pred_infos, indent=2))

    hotpot_output_prediction_gn_file = os.path.join(write_dir, "hotpot_predictions_gn_{}.json".format(prefix))
    with open(hotpot_output_prediction_gn_file, "w") as writer:
        writer.write(json.dumps(globally_normed_pred_answers_and_sps, indent=2))
    hotpot_gn_results = evaluate_on_hotpot(hotpot_output_prediction_gn_file, hotpot_answer_file) \
        if ((not args.no_answer_file) and (args.model_type not in ['xlnet', 'xlm'])) else {}
    with open(os.path.join(write_dir, "hotpot_predictions_gn_info_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(globally_normed_pred_infos, indent=2))

    hotpot_results = {k: v * 100. for k, v in hotpot_results.items()}
    hotpot_gn_results = {'gn_' + k: v * 100. for k, v in hotpot_gn_results.items()}
    results = {'squad_' + k: v for k, v in results.items()}
    results.update(hotpot_results)
    results.update(hotpot_gn_results)
    with open(os.path.join(write_dir, "hotpot_results_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(results, indent=2, sort_keys=True))
    return results
def evaluate(args, model, tokenizer, checkpoint_id=None, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if args.eval_data_subset > 0:
        dataset = Subset(dataset, list(range(min(args.eval_data_subset, len(dataset)))))

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    start_time = timeit.default_timer()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      }
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(
        args.output_dir, "predictions_{}.json".format(checkpoint_id if checkpoint_id is not None else prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(checkpoint_id if checkpoint_id is not None else prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(checkpoint_id if checkpoint_id is not None else prefix))
    else:
        output_null_log_odds_file = None

    write_predictions(examples, features, all_results, args.n_best_size,
                      args.max_answer_length, args.do_lower_case,
                      output_prediction_file, output_nbest_file,
                      output_null_log_odds_file, args.verbose_logging,
                      args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)

    output_eval_file = os.path.join(
        args.output_dir,
        "eval_results_{}.txt".format(checkpoint_id) if checkpoint_id is not None else "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
            writer.write("%s = %s\n" % (key, str(results[key])))
    return results
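# Rough sketch of how an evaluate() variant like the one above is typically
# driven from a main() loop over saved checkpoints (hypothetical glue code;
# model_class, tokenizer, and args are assumed to be set up by the caller):
if args.do_eval and args.local_rank in [-1, 0]:
    results = {}
    for checkpoint in [args.output_dir]:
        global_step = checkpoint.split("-")[-1] if "-" in checkpoint else ""
        model = model_class.from_pretrained(checkpoint)
        model.to(args.device)
        result = evaluate(args, model, tokenizer, prefix=global_step)
        results.update({k + ("_" + global_step if global_step else ""): v
                        for k, v in result.items()})
    logger.info("Results: {}".format(results))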
def evaluate(self, dataset, examples, features, model, tokenizer, prefix=1):
    # args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    eval_batch_size = 8  # by default  # args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)  # if args.local_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)
    s_pred_file = os.path.join(os.getcwd(), "predict.json")

    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(self._device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'cls_index': batch[4],
                      'p_mask': batch[5]
                      }
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            # XLNet uses a more complex post-processing procedure
            result = RawResultExtended(unique_id=unique_id,
                                       start_top_log_probs=self._to_list(outputs[0][i]),
                                       start_top_index=self._to_list(outputs[1][i]),
                                       end_top_log_probs=self._to_list(outputs[2][i]),
                                       end_top_index=self._to_list(outputs[3][i]),
                                       cls_logits=self._to_list(outputs[4][i]))
            all_results.append(result)

    # Compute predictions
    output_dir = os.getcwd()
    output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
    version_2_with_negative = True
    output_null_log_odds_file = None
    if version_2_with_negative:
        output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet uses a more complex post-processing procedure
    n_best_size = 20
    max_answer_length = 30
    verbose_logging = True
    predict_file = "predict.json"
    write_predictions_extended(examples, features, all_results, n_best_size,
                               max_answer_length, output_prediction_file,
                               output_nbest_file, output_null_log_odds_file,
                               predict_file, model.config.start_n_top,
                               model.config.end_n_top, version_2_with_negative,
                               tokenizer, verbose_logging)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=s_pred_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)
    # print("Coding: final result ", results)
    return results