def _evaluate(self, dataset: DataLoader, features: InputFeatures):
    logger.info("***** Running inference *****")
    # logger.info("  Batch size: {}".format(dataset.batch_size))
    # logger.info("  Num examples = %d", len(dataset))
    eval_results = []
    # for batch in tqdm(dataset, desc="Evaluating"):
    for batch in dataset:
        self.model.eval()
        batch = tuple(t.to(self.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': None if self.model_type == 'xlm' else batch[2]
            }
            example_indices = batch[3]
            if self.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = self.model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if self.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=outputs[0][i].detach().cpu().tolist(),
                    start_top_index=outputs[1][i].detach().cpu().tolist(),
                    end_top_log_probs=outputs[2][i].detach().cpu().tolist(),
                    end_top_index=outputs[3][i].detach().cpu().tolist(),
                    cls_logits=outputs[4][i].detach().cpu().tolist())
            else:
                result = RawResult(
                    unique_id=unique_id,
                    start_logits=outputs[0][i].detach().cpu().tolist(),
                    end_logits=outputs[1][i].detach().cpu().tolist())
            eval_results.append(result)
    return eval_results
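# The functions in this file refer to RawResult, RawResultExtended and to_list
# without defining them. A minimal sketch of those helpers is given below; the
# definitions are assumed to follow the older transformers `utils_squad` /
# `run_squad.py` examples that this code appears to be adapted from, so verify
# them against the actual project before relying on them.
import collections

RawResult = collections.namedtuple(
    "RawResult", ["unique_id", "start_logits", "end_logits"])

RawResultExtended = collections.namedtuple(
    "RawResultExtended",
    ["unique_id", "start_top_log_probs", "start_top_index",
     "end_top_log_probs", "end_top_index", "cls_logits"])


def to_list(tensor):
    # Detach a tensor from the graph, move it to the CPU and return a plain list.
    return tensor.detach().cpu().tolist()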
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and (args.local_rank in [-1, 0] or args.no_distributed_training):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if (
        args.local_rank == -1 or args.no_distributed_training) else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM does not use segment_ids
            }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(examples, features, all_results, args.n_best_size,
                                   args.max_answer_length, output_prediction_file,
                                   output_nbest_file, output_null_log_odds_file, args.predict_file,
                                   model.config.start_n_top, model.config.end_n_top,
                                   args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        write_predictions(examples, features, all_results, args.n_best_size,
                          args.max_answer_length, args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                          args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)
    return results
# Fragment of an evaluation loop (the enclosing function and the loop over the
# evaluation dataloader are not part of this excerpt).
batch = tuple(t.to(device) for t in batch)
with torch.no_grad():
    inputs = {
        'input_ids': batch[0],
        'attention_mask': batch[1],
        'token_type_ids': batch[2]
    }
    example_indices = batch[3]
    outputs = model(**inputs)

for i, example_index in enumerate(example_indices):
    eval_feature = features[example_index.item()]
    unique_id = int(eval_feature.unique_id)
    result = RawResult(unique_id=unique_id,
                       start_logits=to_list(outputs[0][i]),
                       end_logits=to_list(outputs[1][i]))
    all_results.append(result)

# Compute predictions
output_prediction_file = "predictions_.json"
output_nbest_file = "nbest_predictions_.json"
write_predictions(examples, features, all_results, 20, max_answer_length, do_lower_case,
                  output_prediction_file, output_nbest_file, output_null_log_odds_file,
                  False, False, null_score_diff_threshold)

# Evaluate with the official SQuAD script
evaluate_options = EVAL_OPTS(data_file=dev_file,
                             pred_file=output_prediction_file,
                             na_prob_file=output_null_log_odds_file)
results = evaluate_on_squad(evaluate_options)
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                    )

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            if args.model_type in ["xlnet", "xlm"]:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]),
                )
            else:
                result = RawResult(
                    unique_id=unique_id,
                    start_logits=to_list(outputs[0][i]),
                    end_logits=to_list(outputs[1][i]),
                )

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ["xlnet", "xlm"]:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(
            examples, features, all_results, args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.predict_file, model.config.start_n_top, model.config.end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging,
        )
    else:
        write_predictions(
            examples, features, all_results, args.n_best_size, args.max_answer_length,
            args.do_lower_case, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold,
        )

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(
        data_file=args.predict_file,
        pred_file=output_prediction_file,
        na_prob_file=output_null_log_odds_file,
    )
    results = evaluate_on_squad(evaluate_options)
    return results
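# A minimal, hypothetical driver for the evaluate() function above. The
# argument helper, model class and checkpoint path are illustrative
# assumptions and are not part of the original script.
if __name__ == "__main__":
    args = parse_args()  # hypothetical argparse helper providing the fields used above
    model = AutoModelForQuestionAnswering.from_pretrained(args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    model.to(args.device)
    results = evaluate(args, model, tokenizer, prefix="dev")
    logger.info("Eval results: %s", results)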
def evaluate(args, model, tokenizer, prefix=""):
    global_rank = -1 if args.local_rank == -1 else torch.distributed.get_rank()
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True,
                                                          output_examples=True)
    if args.preprocess_only:
        return

    write_dir = args.output_dir if args.write_dir is None else args.write_dir
    if not os.path.exists(write_dir) and global_rank in [-1, 0]:
        os.makedirs(write_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    # No distributed eval, to eval on the full dev set
    eval_sampler = SequentialSampler(dataset)  # if global_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': None if args.model_type in ['xlm', 'roberta'] else batch[2]  # XLM does not use segment_ids
            }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(unique_id=unique_id,
                                           start_top_log_probs=to_list(outputs[0][i]),
                                           start_top_index=to_list(outputs[1][i]),
                                           end_top_log_probs=to_list(outputs[2][i]),
                                           end_top_index=to_list(outputs[3][i]),
                                           cls_logits=to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(write_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(write_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(write_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        all_predictions, all_nbest_predictions, all_null_odds = write_predictions_extended(
            examples, features, all_results, args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.predict_file, model.config.start_n_top, model.config.end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        all_predictions, all_nbest_predictions, all_null_odds = write_predictions(
            examples, features, all_results, args.n_best_size, args.max_answer_length,
            args.do_lower_case, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    def filter_keys(qid_to_val, task_name):
        task_name = task_name.lower()
        assert task_name in {'hotpot', 'squad'}, 'task_name {} not implemented.'.format(task_name)
        return {qid: val for qid, val in qid_to_val.items()
                if len(qid.split('.')) == (2 if task_name == 'hotpot' else 1)}

    if len(filter_keys(all_predictions, 'squad')) == 0:
        results = {}  # No SQuAD data in evaluation set
    else:
        squad_output_prediction_file = os.path.join(write_dir, "squad_predictions_{}.json".format(prefix))
        with open(squad_output_prediction_file, 'w') as writer:
            writer.write(json.dumps(filter_keys(all_predictions, 'squad'), indent=2))
        squad_output_nbest_file = os.path.join(write_dir, "squad_nbest_predictions_{}.json".format(prefix))
        with open(squad_output_nbest_file, 'w') as writer:
            writer.write(json.dumps(filter_keys(all_nbest_predictions, 'squad'), indent=2))
        if args.version_2_with_negative:
            squad_output_null_log_odds_file = os.path.join(write_dir, "squad_null_odds_{}.json".format(prefix))
            with open(squad_output_null_log_odds_file, 'w') as writer:
                writer.write(json.dumps(filter_keys(all_null_odds, 'squad'), indent=2))
        else:
            squad_output_null_log_odds_file = None
        predict_file_parts = args.predict_file.split('/')
        squad_predict_file = '/'.join(predict_file_parts[:-2] + ['squad', predict_file_parts[-1]])
        evaluate_options = EVAL_OPTS(data_file=squad_predict_file,
                                     pred_file=squad_output_prediction_file,
                                     na_prob_file=squad_output_null_log_odds_file)
        results = evaluate_on_squad(evaluate_options)

    # Check if HotpotQA answer file exists to do HotpotQA evaluation
    hotpot_answer_file_parts = args.predict_file.split('/')
    hotpot_answer_file_parts[-2] = 'hotpot-orig'
    hotpot_answer_file = '/'.join(hotpot_answer_file_parts)
    if (not args.no_answer_file) and (not os.path.exists(hotpot_answer_file)):
        with open(os.path.join(write_dir, "squad_results_{}.json".format(prefix)), "w") as writer:
            writer.write(json.dumps(results, indent=2, sort_keys=True))
        return results

    # Evaluate with official HotpotQA script
    nbest_predictions = filter_keys(all_nbest_predictions, 'hotpot')
    null_odds = filter_keys(all_null_odds, 'hotpot')
    qids = {single_hop_qid.split('.')[0] for single_hop_qid in nbest_predictions.keys()}
    pred_answers_and_sps = {'answer': {}, 'sp': {}}
    globally_normed_pred_answers_and_sps = {'answer': {}, 'sp': {}}
    pred_infos = {}
    globally_normed_pred_infos = {}
    max_num_paragraphs = 10
    for qid in qids:
        # Find paragraph with answer prediction
        min_null_odds = float('inf')
        max_logit_sum = float('-inf')
        best_single_hop_qid = None
        for paragraph_no in range(max_num_paragraphs):
            single_hop_qid = qid + '.' + str(paragraph_no)
            if (single_hop_qid in null_odds) and (null_odds[single_hop_qid] < min_null_odds):
                best_single_hop_qid = single_hop_qid
                min_null_odds = null_odds[single_hop_qid]
            if single_hop_qid in nbest_predictions:
                for nbest_prediction in nbest_predictions[single_hop_qid]:
                    if (len(nbest_prediction['text']) > 0) and (args.model_type not in ['xlnet', 'xlm']):
                        logit_sum = (nbest_prediction['start_logit'] + nbest_prediction['end_logit']
                                     - null_odds[single_hop_qid])
                        if logit_sum > max_logit_sum:
                            globally_normed_pred_answers_and_sps['answer'][qid] = nbest_prediction['text']
                            globally_normed_pred_infos[qid] = nbest_prediction
                            max_logit_sum = logit_sum

        # Find/store answer and supporting fact
        pred_answers_and_sps['sp'][qid] = []  # NB: Dummy supporting fact for now
        globally_normed_pred_answers_and_sps['sp'][qid] = []  # NB: Dummy supporting fact for now
        for nbest_prediction in nbest_predictions[best_single_hop_qid]:
            if len(nbest_prediction['text']) > 0:
                pred_answers_and_sps['answer'][qid] = nbest_prediction['text']
                pred_infos[qid] = nbest_prediction
                break
        assert qid in pred_answers_and_sps['answer'], 'Error: No predicted answer found.'
        # assert qid in globally_normed_pred_answers_and_sps['answer'], 'Error: No globally normed predicted answer found.'

    hotpot_output_prediction_file = os.path.join(write_dir, "hotpot_predictions_{}.json".format(prefix))
    with open(hotpot_output_prediction_file, "w") as writer:
        writer.write(json.dumps(pred_answers_and_sps, indent=2))
    hotpot_results = evaluate_on_hotpot(hotpot_output_prediction_file, hotpot_answer_file) \
        if not args.no_answer_file else {}
    with open(os.path.join(write_dir, "hotpot_predictions_info_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(pred_infos, indent=2))

    hotpot_output_prediction_gn_file = os.path.join(write_dir, "hotpot_predictions_gn_{}.json".format(prefix))
    with open(hotpot_output_prediction_gn_file, "w") as writer:
        writer.write(json.dumps(globally_normed_pred_answers_and_sps, indent=2))
    hotpot_gn_results = evaluate_on_hotpot(hotpot_output_prediction_gn_file, hotpot_answer_file) \
        if ((not args.no_answer_file) and (args.model_type not in ['xlnet', 'xlm'])) else {}
    with open(os.path.join(write_dir, "hotpot_predictions_gn_info_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(globally_normed_pred_infos, indent=2))

    hotpot_results = {k: v * 100. for k, v in hotpot_results.items()}
    hotpot_gn_results = {'gn_' + k: v * 100. for k, v in hotpot_gn_results.items()}
    results = {'squad_' + k: v for k, v in results.items()}
    results.update(hotpot_results)
    results.update(hotpot_gn_results)
    with open(os.path.join(write_dir, "hotpot_results_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(results, indent=2, sort_keys=True))
    return results
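# Illustration of the filter_keys() helper defined inside evaluate() above:
# HotpotQA-style question ids are assumed to look like "<qid>.<paragraph_no>"
# (two dot-separated parts), while SQuAD ids contain no dot, so e.g.
#   filter_keys({"abc.3": 1.0, "xyz": 2.0}, "hotpot") -> {"abc.3": 1.0}
#   filter_keys({"abc.3": 1.0, "xyz": 2.0}, "squad")  -> {"xyz": 2.0}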
def do_prediction(model_dir):
    # 1. Load a trained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertForQuestionAnswering.from_pretrained(model_dir)
    model.to(device)
    model.eval()

    # 2. Load and pre-process the test set
    dev_file = "data/sfu.json"
    predict_batch_size = 2
    max_seq_length = 384

    eval_examples = read_squad_examples(input_file=dev_file, is_training=False,
                                        version_2_with_negative=False)
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=128,
        max_query_length=64,
        is_training=False)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=predict_batch_size)

    # 3. Run inference on the test set
    all_results = []
    for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            # Pass the mask and segment ids by keyword so they cannot be swapped
            # if the model's positional argument order differs (the original
            # positional call relied on a specific ordering).
            batch_start_logits, batch_end_logits = model(input_ids,
                                                         attention_mask=input_mask,
                                                         token_type_ids=segment_ids)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits))

    output_prediction_file = os.path.join(model_dir, "predictions_sfu.json")
    output_nbest_file = os.path.join(model_dir, "nbest_predictions_sfu.json")
    output_null_log_odds_file = os.path.join(model_dir, "null_odds_sfu.json")
    preds = write_predictions(eval_examples, eval_features, all_results, 20, 30, True,
                              output_prediction_file, output_nbest_file,
                              output_null_log_odds_file, True, False, 0.0)
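# Hypothetical usage of do_prediction() above: point it at a fine-tuned BERT
# checkpoint directory and read back the prediction file it writes. The path
# below is an assumption, not taken from the original code.
if __name__ == "__main__":
    import json

    checkpoint_dir = "models/bert-base-squad"
    do_prediction(checkpoint_dir)
    with open(os.path.join(checkpoint_dir, "predictions_sfu.json")) as f:
        predictions = json.load(f)
    print("Loaded {} predictions".format(len(predictions)))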
def evaluate(args, model, tokenizer, checkpoint_id=None, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True,
                                                          output_examples=True)

    if args.eval_data_subset > 0:
        dataset = Subset(dataset, list(range(min(args.eval_data_subset, len(dataset)))))

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    start_time = timeit.default_timer()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      }
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    file_id = checkpoint_id if checkpoint_id is not None else prefix
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(file_id))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(file_id))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(file_id))
    else:
        output_null_log_odds_file = None

    write_predictions(examples, features, all_results, args.n_best_size,
                      args.max_answer_length, args.do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                      args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)

    output_eval_file = os.path.join(args.output_dir,
                                    "eval_results_{}.txt".format(checkpoint_id)
                                    if checkpoint_id is not None else "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
            writer.write("%s = %s\n" % (key, str(results[key])))
    return results
def predict(args, model, tokenizer, prefix="test"):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True,
                                                          output_examples=True)
    pred_dataloader = DataLoader(dataset, batch_size=args.pred_batch_size, shuffle=False)

    logger.info("***** Running prediction {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.pred_batch_size)
    all_results = []
    for batch in tqdm(pred_dataloader, desc="Predicting"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM does not use segment_ids
            }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            pred_feature = features[example_index.item()]
            unique_id = int(pred_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.save_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.save_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(examples, features, all_results, args.n_best_size,
                                   args.max_answer_length, output_prediction_file,
                                   output_nbest_file, output_null_log_odds_file, args.predict_file,
                                   model.config.start_n_top, model.config.end_n_top,
                                   args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        write_predictions(examples, features, all_results, args.n_best_size,
                          args.max_answer_length, args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                          args.version_2_with_negative, args.null_score_diff_threshold)

    output_pred_file = os.path.join(args.save_dir, 'submit.csv')
    convert_json_to_csv(output_nbest_file, output_pred_file, args.max_answer_length)
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    write_predictions(examples, features, all_results, args.n_best_size, args.max_answer_length,
                      args.do_lower_case, output_prediction_file, output_nbest_file,
                      output_null_log_odds_file, args.verbose_logging,
                      args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)
    return results
def decode_output(outputs, features, examples, example_indices):
    """Decode the model output into n-best probabilities and start/end word indexes."""
    eval_feature = features[example_indices[0].item()]
    unique_id = int(eval_feature.unique_id)
    result = RawResult(unique_id=unique_id,
                       start_logits=to_list(outputs[0][0]),
                       end_logits=to_list(outputs[1][0]))

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])

    start_indexes = _get_best_indexes(to_list(outputs[0][0]), 20)
    end_indexes = _get_best_indexes(to_list(outputs[1][0]), 20)
    feature = eval_feature
    prelim_predictions = []
    for start_index in start_indexes:
        for end_index in end_indexes:
            # We could hypothetically create invalid predictions, e.g., predict
            # that the start of the span is in the question. We throw out all
            # invalid predictions.
            if start_index >= len(feature.tokens):
                continue
            if end_index >= len(feature.tokens):
                continue
            if start_index not in feature.token_to_orig_map:
                continue
            if end_index not in feature.token_to_orig_map:
                continue
            if not feature.token_is_max_context.get(start_index, False):
                continue
            if end_index < start_index:
                continue
            length = end_index - start_index + 1
            if length > 30:
                continue
            prelim_predictions.append(
                _PrelimPrediction(feature_index=0,
                                  start_index=start_index,
                                  end_index=end_index,
                                  start_logit=result.start_logits[start_index],
                                  end_logit=result.end_logits[end_index]))

    # Sort candidate spans by total logit score so the best span comes first
    # (without this, the first entry of the n-best list is not the top answer).
    prelim_predictions = sorted(prelim_predictions,
                                key=lambda x: (x.start_logit + x.end_logit),
                                reverse=True)

    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "start_logit", "end_logit"])
    seen_predictions = {}
    nbest = []
    for pred in prelim_predictions:
        if len(nbest) >= 30:
            break
        feature = features[pred.feature_index]
        if pred.start_index > 0:  # this is a non-null prediction
            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
            orig_doc_start = feature.token_to_orig_map[pred.start_index]
            orig_doc_end = feature.token_to_orig_map[pred.end_index]
            orig_tokens = examples[0].doc_tokens[orig_doc_start:(orig_doc_end + 1)]
            tok_text = " ".join(tok_tokens)

            # De-tokenize WordPieces that have been split off.
            tok_text = tok_text.replace(" ##", "")
            tok_text = tok_text.replace("##", "")

            # Clean whitespace
            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)

            final_text = get_final_text(tok_text, orig_text, True, True)
            if final_text in seen_predictions:
                continue
            seen_predictions[final_text] = True
        else:
            final_text = ""
            seen_predictions[final_text] = True

        nbest.append(
            _NbestPrediction(text=final_text,
                             start_logit=pred.start_logit,
                             end_logit=pred.end_logit))

    total_scores = []
    best_non_null_entry = None
    for entry in nbest:
        total_scores.append(entry.start_logit + entry.end_logit)
        if not best_non_null_entry:
            if entry.text:
                best_non_null_entry = entry

    probs = _compute_softmax(total_scores)
    nbest_json = []
    for (i, entry) in enumerate(nbest):
        output = collections.OrderedDict()
        output["text"] = entry.text
        output["probability"] = probs[i]
        output["start_logit"] = entry.start_logit
        output["end_logit"] = entry.end_logit
        nbest_json.append(output)

    return nbest_json, nbest_json[0]["text"]


# Example usage (commented out in the original source):
# context = ("Super Bowl 50 was an American football game to determine the champion of the "
#            "National Football League (NFL) for the 2015 season. The American Football Conference "
#            "(AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion "
#            "Carolina Panthers 24\u201310 to earn their third Super Bowl title. The game was played "
#            "on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, "
#            "California. As this was the 50th Super Bowl, the league emphasized the \"golden "
#            "anniversary\" with various gold-themed initiatives, as well as temporarily suspending "
#            "the tradition of naming each Super Bowl game with Roman numerals (under which the game "
#            "would have been known as \"Super Bowl L\"), so that the logo could prominently feature "
#            "the Arabic numerals 50.")
# question = "Which NFL team represented the AFC at Super Bowl 50?"
# model_path = "pretrained/"
# model = load_model(model_path)
# inputs, features, examples, example_indices = feature_extract(context, question)
# outputs = model(**inputs)
# nbest, best = decode_output(outputs, features, examples, example_indices)
# print(best)