# Common imports these snippets assume (exact import paths vary slightly
# across transformers versions; the ones below match the 2.x/3.x run_squad
# examples the snippets were written against):
import collections
import json
import logging
import os
import timeit

import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm
from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)
from transformers.data.processors.squad import (
    SquadExample,
    SquadResult,
    squad_convert_examples_to_features,
)

logger = logging.getLogger(__name__)


async def _custom_accuracy(self, examples, features, dataset, prefix=""):
    if not os.path.exists(self.parent.config.output_dir) and self.parent.config.local_rank in [-1, 0]:
        os.makedirs(self.parent.config.output_dir)

    self.parent.config.eval_batch_size = (
        self.parent.config.per_gpu_eval_batch_size * max(1, self.parent.config.n_gpu)
    )
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(
        dataset,
        sampler=eval_sampler,
        batch_size=self.parent.config.eval_batch_size,
    )

    # multi-gpu evaluate
    if self.parent.config.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        self.model = torch.nn.DataParallel(self.model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", self.parent.config.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        self.model.eval()
        batch = tuple(t.to(self.parent.config.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if self.parent.config.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if self.parent.config.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(self.model, "config") and hasattr(self.model.config, "lang2id"):
                    inputs.update({
                        "langs": (
                            torch.ones(batch[0].shape, dtype=torch.int64)
                            * self.parent.config.lang_id
                        ).to(self.parent.config.device)
                    })

            outputs = self.model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [self.to_list(output[i]) for output in outputs]

            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(
        "  Evaluation done in total %f secs (%f sec per example)",
        evalTime,
        evalTime / len(dataset),
    )

    # Compute predictions
    output_prediction_file = os.path.join(
        self.parent.config.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        self.parent.config.output_dir, "nbest_predictions_{}.json".format(prefix))

    # XLNet and XLM use a more complex post-processing procedure
    if self.parent.config.model_type in ["xlnet", "xlm"]:
        start_n_top = (self.model.config.start_n_top if hasattr(self.model, "config")
                       else self.model.module.config.start_n_top)
        end_n_top = (self.model.config.end_n_top if hasattr(self.model, "config")
                     else self.model.module.config.end_n_top)
        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            self.parent.config.n_best_size, self.parent.config.max_answer_length,
            output_prediction_file, output_nbest_file,
            None,  # no null-odds file
            start_n_top, end_n_top,
            False,  # version_2_with_negative
            self.tokenizer,
            True,  # verbose_logging
        )
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            self.parent.config.n_best_size, self.parent.config.max_answer_length,
            self.parent.config.do_lower_case,
            output_prediction_file, output_nbest_file,
            None,  # no null-odds file
            True,  # verbose_logging
            False,  # version_2_with_negative
            self.parent.config.null_score_diff_threshold,
            self.tokenizer,
        )
    return predictions
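
# Most of the snippets below call a module-level `to_list` helper without
# defining it. A minimal sketch, assuming the same helper as Hugging Face's
# run_squad.py example (plain PyTorch tensors in, nested Python lists out):
def to_list(tensor):
    # Detach from the autograd graph, move to CPU, and convert to Python lists.
    return tensor.detach().cpu().tolist()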
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    dataset_cached = "./dataset_cached"
    if not os.path.exists(dataset_cached):
        os.makedirs(dataset_cached)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Calibrate on roughly 5% of the dataset.
    calibration_iteration = int((len(dataset) * 0.05 + args.eval_batch_size - 1) / args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    print("  Batch size = %d" % args.eval_batch_size)

    if args.mkldnn_eval:
        from torch.utils import mkldnn as mkldnn_utils
        model = mkldnn_utils.to_mkldnn(model)
        print(model)

    all_results = []
    evalTime = 0
    nb_eval_steps = 0
    perf = 0  # stays 0 if we never get past warmup

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        if calibration and nb_eval_steps >= calibration_iteration:
            break

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            if args.model_type != "distilbert":
                # XLM doesn't use segment_ids
                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]
            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

            if nb_eval_steps >= args.warmup:
                start_time = timeit.default_timer()
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

        if nb_eval_steps >= args.warmup:
            evalTime += timeit.default_timer() - start_time
        nb_eval_steps += 1
        if args.iter > 0 and nb_eval_steps >= (args.warmup + args.iter):
            break

    if nb_eval_steps >= args.warmup:
        perf = (nb_eval_steps - args.warmup) * args.eval_batch_size / evalTime
        if args.eval_batch_size == 1:
            print("Latency: %.3f ms" % (evalTime / (nb_eval_steps - args.warmup) * 1000))
        print("Evaluation done in total %f secs (Throughput: %f samples/sec)" % (evalTime, perf))
    else:
        logger.info("***** No performance numbers; please check dataset length and warmup count *****")

    # Compute predictions
    output_prediction_file = os.path.join(dataset_cached, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(dataset_cached, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(dataset_cached, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top, args.version_2_with_negative,
            tokenizer, args.verbose_logging)
    elif not calibration and args.iter == 0:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.version_2_with_negative,
            args.null_score_diff_threshold)

    # Compute the F1 and exact scores.
    if not calibration and args.iter == 0:
        results = squad_evaluate(examples, predictions)
        bert_task_acc_keys = ["best_f1", "f1", "mcc", "spearmanr", "acc"]
        acc = 0.0  # default if no accuracy-like key is present
        for key in bert_task_acc_keys:
            if key in results.keys():
                acc = results[key]
                break
        print("Accuracy: %.5f" % acc)
    else:
        results = None
    return results, perf
def evaluate(args, model, tokenizer, device, prefix=""):
    eval_dataset, examples, features = data.load_and_cache_examples(
        args.validation,
        tokenizer,
        args,
        evaluate=True,
        output_examples=True,
    )
    eval_dataloader = data.get_dataloader(eval_dataset, args.per_gpu_eval_batch_size, evaluate=True)

    all_results = []
    start_time = timeit.default_timer()
    eval_batches = 0

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        eval_batches += 1

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update({
                        "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / (eval_batches * args.per_gpu_eval_batch_size))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_data_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_data_dir, "nbest_predictions_{}.json".format(prefix))

    if args.has_unanswerable:
        output_null_log_odds_file = os.path.join(args.output_data_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
        predictions = squad_metrics.compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_len,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top, args.has_unanswerable,
            tokenizer,
            logger.level < logging.INFO,
        )
    else:
        predictions = squad_metrics.compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_len, args.uncased_model,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            logger.level < logging.INFO,
            args.has_unanswerable, args.null_score_diff_thresh,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_metrics.squad_evaluate(examples, predictions)
    return results
def find_answer(self, question, context, n_best_size=20, max_answer_length=30, full_sentence=False):
    # heavily inspired by
    # "https://github.com/huggingface/transformers/blob/v2.3.0/examples/run_squad.py#L212-L317"
    example_id = "55555"
    example = SquadExample(example_id, question, context, None, None, None)
    features, dataset = squad_convert_examples_to_features(
        [example], self.tokenizer, self.max_seq_length, self.doc_stride,
        self.max_query_length, False, return_dataset="pt")

    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=1)

    all_results = []
    for batch in dataloader:
        self.model.eval()
        batch = tuple(t.to(self.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if self.model_type in {"xlm", "roberta", "distilbert"}:
                del inputs["token_type_ids"]

            example_index = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if self.model_type in {"xlnet", "xlm"}:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

            outputs = self.model(**inputs)

        output = [o.detach().cpu().tolist() for o in outputs]
        unique_id = int(features[example_index].unique_id)

        # Some models (XLNet, XLM) use 5 arguments for their predictions,
        # while the other "simpler" models only use two.
        if len(output) >= 5:
            start_logits = output[0]
            start_top_index = output[1]
            end_logits = output[2]
            end_top_index = output[3]
            cls_logits = output[4]
            squad_result = SquadResult(
                unique_id, start_logits[0], end_logits[0],
                start_top_index=start_top_index[0],
                end_top_index=end_top_index[0],
                cls_logits=cls_logits[0],
            )
        else:
            start_logits, end_logits = output
            squad_result = SquadResult(unique_id, start_logits[0], end_logits[0])

        all_results.append(squad_result)

    # XLNet and XLM use a more complex post-processing procedure
    if self.model_type in {"xlnet", "xlm"}:
        if hasattr(self.model, "config"):  # was `hasattr(model, ...)`; `model` is undefined here
            start_n_top = self.model.config.start_n_top
            end_n_top = self.model.config.end_n_top
        else:
            start_n_top = self.model.module.config.start_n_top
            end_n_top = self.model.module.config.end_n_top
        predictions = compute_predictions_log_probs(
            [example], features, all_results, n_best_size, max_answer_length,
            "/tmp/pred.out", "/tmp/nbest.out", "/tmp/null.out",
            start_n_top, end_n_top, self.version_2_with_negative,
            self.tokenizer,  # was the undefined name `tokenizer`
            self.verbose,
        )
    else:
        predictions = compute_predictions_logits(
            [example], features, all_results, n_best_size, max_answer_length,
            self.do_lower_case, "/tmp/pred.out", "/tmp/nbest.out", "/tmp/null.out",
            self.verbose, self.version_2_with_negative, self.null_score_diff_threshold,
        )

    prediction = predictions[example_id]
    logger.debug(f'found prediction: "{prediction}"')

    # empty prediction indicates unknown answer
    if not prediction:
        logger.debug("empty prediction")
        return None

    if full_sentence:
        doc = self.nlp(context)
        for sent in doc.sents:
            if prediction in sent.text:
                prediction = sent.text
                break

    return prediction
def predict(self, id_, question, paragraph_texts, paragraph_scores):
    # dataset, examples, features = load_and_cache_examples(self.args, self.tokenizer, evaluate=True, output_examples=True)
    # processor = SquadV2Processor() if self.args.version_2_with_negative else SquadV1Processor()
    # todo convert to single query examples
    examples = create_inference_examples(
        question, paragraph_texts, paragraph_scores,
        chinese=self.args.chinese, tokenizer=self.tokenizer)
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=self.tokenizer,
        max_seq_length=self.args.max_seq_length,
        doc_stride=self.args.doc_stride,
        max_query_length=self.args.max_query_length,
        is_training=False,  # inference only (was `not evaluate`, which referenced an undefined name)
        return_dataset="pt",
        threads=self.args.threads,
        tqdm_enabled=False,
    )

    # if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
    #     os.makedirs(args.output_dir)

    self.args.eval_batch_size = self.args.per_gpu_eval_batch_size * max(1, self.args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size)

    # multi-gpu evaluate
    if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        self.model = torch.nn.DataParallel(self.model)

    # Eval!
    # logger.info("***** Running evaluation {} *****".format(prefix))
    # logger.info("  Num examples = %d", len(dataset))
    # logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    # start_time = timeit.default_timer()

    for batch in eval_dataloader:
        self.model.eval()
        batch = tuple(t.to(self.args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            # if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
            #     del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            # if args.model_type in ["xlnet", "xlm"]:
            #     inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
            #     # for lang_id-sensitive xlm models
            #     if hasattr(model, "config") and hasattr(model.config, "lang2id"):
            #         inputs.update(
            #             {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
            #         )

            outputs = self.model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    # Compute predictions
    prefix = ""
    output_prediction_file = os.path.join(self.args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(self.args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if self.args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(self.args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if self.args.model_type in ["xlnet", "xlm"]:
        start_n_top = (self.model.config.start_n_top if hasattr(self.model, "config")
                       else self.model.module.config.start_n_top)
        end_n_top = (self.model.config.end_n_top if hasattr(self.model, "config")
                     else self.model.module.config.end_n_top)
        answers, nbest_answers = compute_predictions_log_probs(
            examples, features, all_results,
            self.args.n_best_size, self.args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top, self.args.version_2_with_negative,
            self.tokenizer, self.args.verbose_logging, self.args.chinese)
    else:
        answers, nbest_answers = compute_predictions_logits(
            examples, features, all_results,
            self.args.n_best_size, self.args.max_answer_length, self.args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            self.args.verbose_logging, self.args.version_2_with_negative,
            self.args.null_score_diff_threshold, self.tokenizer, self.args.chinese)

    all_answers = []
    for answer_id, ans in enumerate(answers):
        ans_dict = {
            "id": id_,
            "answer": answers[ans][0],
            "phrase_score": answers[ans][1],
            "paragraph_score": paragraph_scores[answer_id],
        }
        all_answers.append(ans_dict)
    return all_answers
def evaluate_ensemble(args, checkpoints, tokenizer, model_class, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    model_list = []
    for ckpt in checkpoints:
        logger.info("Evaluate the following fine_tuned_model: %s", ckpt)
        model = model_class.from_pretrained(ckpt)
        # multi-gpu evaluate (the original checked an undefined `model` before any
        # checkpoint was loaded; wrap each loaded model instead)
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)
        model_list.append(model)

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if args.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                raise NotImplementedError
                # inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # # for lang_id-sensitive xlm models
                # if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                #     inputs.update(
                #         {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                #     )

            outputs_list = []
            for model in model_list:
                model.to(args.device)
                model.eval()
                with torch.no_grad():
                    outputs = model(**inputs)
                outputs_list.append(outputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            start_logits_list, end_logits_list = [], []
            for outputs in outputs_list:
                output = [to_list(output[i]) for output in outputs]

                # Some models (XLNet, XLM) use 5 arguments for their predictions,
                # while the other "simpler" models only use two.
                if len(output) >= 5:
                    raise NotImplementedError
                    # start_logits = output[0]
                    # start_top_index = output[1]
                    # end_logits = output[2]
                    # end_top_index = output[3]
                    # cls_logits = output[4]
                    # result = SquadResult(
                    #     unique_id,
                    #     start_logits,
                    #     end_logits,
                    #     start_top_index=start_top_index,
                    #     end_top_index=end_top_index,
                    #     cls_logits=cls_logits,
                    # )
                else:
                    start_logits, end_logits = output
                    start_logits_list.append(start_logits)
                    end_logits_list.append(end_logits)

            if args.model_type in ["xlnet", "xlm"]:
                raise NotImplementedError
            else:
                start_logits_list = np.array(start_logits_list)
                end_logits_list = np.array(end_logits_list)
                # Ensembling method (e.g. max/avg/etc.)
                start_logits = list(start_logits_list.mean(axis=0))
                end_logits = list(end_logits_list.mean(axis=0))
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        raise NotImplementedError
        # start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        # end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
        # The original called compute_predictions_log_probs here, after the raise,
        # with start_n_top/end_n_top undefined; kept commented out for reference:
        # predictions = compute_predictions_log_probs(
        #     examples, features, all_results,
        #     args.n_best_size, args.max_answer_length,
        #     output_prediction_file, output_nbest_file, output_null_log_odds_file,
        #     start_n_top, end_n_top, args.version_2_with_negative,
        #     tokenizer, args.verbose_logging,
        # )
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.version_2_with_negative,
            args.null_score_diff_threshold, tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
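
# The ensemble above takes the per-position mean of each model's logits; the
# inline comment notes that max/avg/etc. are interchangeable here. A hedged
# sketch of the element-wise max alternative, under the same assumptions
# (each list holds one [seq_len] logit list per ensemble member):
import numpy as np

def max_ensemble(start_logits_list, end_logits_list):
    # Keep, per position, the highest logit any ensemble member produced.
    start_logits = list(np.max(np.asarray(start_logits_list), axis=0))
    end_logits = list(np.max(np.asarray(end_logits_list), axis=0))
    return start_logits, end_logits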
def test_epoch_end(self, outputs):
    example_indices = torch.cat([x["example_indices"] for x in outputs]).detach().cpu().tolist()
    start_logits = torch.cat([x["start_logits"] for x in outputs]).detach().cpu().tolist()
    end_logits = torch.cat([x["end_logits"] for x in outputs]).detach().cpu().tolist()

    if "cls_logits" in list(outputs[0].keys()):
        start_top_index = torch.cat([x["start_top_index"] for x in outputs]).detach().cpu().tolist()
        end_top_index = torch.cat([x["end_top_index"] for x in outputs]).detach().cpu().tolist()
        cls_logits = torch.cat([x["cls_logits"] for x in outputs]).detach().cpu().tolist()

    examples = self.trainer.datamodule.test_examples
    features = self.trainer.datamodule.test_features

    # Some models (XLNet, XLM) use 5 arguments for their predictions,
    # while the other "simpler" models only use two.
    from transformers.data.processors.squad import SquadResult

    all_results = []
    for i, example_index in enumerate(example_indices):
        eval_feature = features[example_index]
        unique_id = int(eval_feature.unique_id)

        if "cls_logits" in list(outputs[0].keys()):
            result = SquadResult(
                unique_id, start_logits[i], end_logits[i],
                start_top_index=start_top_index[i],
                end_top_index=end_top_index[i],
                cls_logits=cls_logits[i],
            )
        else:
            result = SquadResult(unique_id, start_logits[i], end_logits[i])

        all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(self.trainer.checkpoint_callback.dirpath, "predictions_eval.json")
    output_nbest_file = os.path.join(self.trainer.checkpoint_callback.dirpath, "nbest_predictions_eval.json")

    if self.version_2_with_negative:
        output_null_log_odds_file = os.path.join(self.trainer.checkpoint_callback.dirpath, "null_odds_eval.json")
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if self.hparams.model_type in ["xlnet", "xlm"]:
        start_n_top = (self.model.config.start_n_top if hasattr(self.model, "config")
                       else self.model.module.config.start_n_top)
        end_n_top = (self.model.config.end_n_top if hasattr(self.model, "config")
                     else self.model.module.config.end_n_top)

        from transformers.data.metrics.squad_metrics import compute_predictions_log_probs
        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            self.hparams.n_best_size, self.hparams.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top, self.version_2_with_negative,
            self.trainer.datamodule.tokenizer,
            False,  # no verbose logging
        )
    else:
        from transformers.data.metrics.squad_metrics import compute_predictions_logits
        predictions = compute_predictions_logits(
            examples, features, all_results,
            self.hparams.n_best_size, self.hparams.max_answer_length, self.hparams.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            False,  # no verbose logging
            self.version_2_with_negative, self.hparams.null_score_diff_threshold,
            self.trainer.datamodule.tokenizer,
        )

    # Compute the F1 and exact scores.
    from transformers.data.metrics.squad_metrics import squad_evaluate
    results = squad_evaluate(examples, predictions)
    return results
def evaluate(args, config, model, tokenizer, prefix="", global_step=0):
    dataset, examples, features = load_and_cache_examples(args, config, tokenizer, evaluate=True, output_examples=True)

    config["eval"]["eval_batch_size"] = config.eval.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=config.eval.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", config.eval.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if config.model.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if config.model.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update({
                        "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * config.input.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(config.output.log_dir, f"predictions_{prefix}.json")
    output_nbest_file = os.path.join(
        config.output.log_dir, f"nbest_{config.model.n_best_size}_predictions_{prefix}.json")

    if config.input.version_2_with_negative:
        output_null_log_odds_file = os.path.join(config.output.log_dir, f"null_odds_{prefix}.json")
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if config.model.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            config.model.n_best_size, config.model.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top, config.input.version_2_with_negative,
            tokenizer, config.output.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            config.model.n_best_size, config.model.max_answer_length, config.model.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            config.output.verbose_logging, config.input.version_2_with_negative,
            config.model.null_score_diff_threshold, tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # Save eval results to an output file as well.
    if prefix == "-1":
        # evaluated at the end of training: store directly in the log_dir
        output_eval_file = os.path.join(config.output.log_dir, "eval_results.tsv")
    else:
        # there is a 'prefix' subfolder
        output_eval_file = os.path.join(config.output.log_dir, prefix, "eval_results.tsv")

    if not os.path.exists(output_eval_file):
        # file does not exist yet: write the header first
        with open(output_eval_file, "a") as writer:
            writer.write("global_step\t" + "\t".join(results.keys()) + "\n")

    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
        line = [str(global_step)] + [str(r) for r in results.values()]
        writer.write("\t".join(line) + "\n")

    return results
def evaluate(args: Args, model, tokenizer, dataset, examples, features, suffix="", return_raw=False):
    if args.no_cuda is None:
        args.no_cuda = not _is_gpu_available()
    if args.predictions_folder:
        assert args.eval_file, "Need name of the eval file to save predictions!"

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    eval_batch_size = args.per_gpu_eval_batch_size * max(1, n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    model.to(device)

    # multi-gpu evaluate
    if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    click.echo(f"Generating predictions for model {click.style(args.model_path, fg='blue')}, "
               f"running on {click.style(str(device), fg='green')}")
    click.echo("  Num examples = %d" % len(dataset))
    click.echo("  Batch size = %d" % eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update({
                        "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    eval_time = timeit.default_timer() - start_time
    logger.info(f"Evaluation done in total {eval_time} secs ({eval_time / len(dataset)} sec per example)")

    eval_file = args.eval_file
    predictions_folder = args.predictions_folder
    v2 = args.v2

    if predictions_folder:
        out_file = get_output_predictions_file_name(eval_file, predictions_folder, suffix)
        logger.info(f"Saving predictions in {out_file}")

        # Compute predictions
        file_name = os.path.basename(out_file)
        output_prediction_file = os.path.join(predictions_folder, file_name)
        # output_nbest_file = os.path.join(predictions_folder, f"nbest-{file_name}")
        output_nbest_file = None
        if v2:
            output_null_log_odds_file = os.path.join(predictions_folder, f"null-odds-{file_name}")
        else:
            output_null_log_odds_file = None
    else:
        logger.info("Not saving predictions...")
        output_prediction_file = None
        output_nbest_file = None
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top, args.v2,
            tokenizer, args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.v2, args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    # results = squad_evaluate(examples, predictions)
    # return results
    if return_raw:
        return predictions
    else:
        return squad_evaluate(examples, predictions)
def answer_question(self, ranked_examples):
    squad_examples = [
        SquadExample(
            qas_id=str(x["id"]),
            question_text=x["question"],
            context_text=x["document"],
            answer_text=None,
            start_position_character=None,
            title="",
            answers=[],
        )
        for x in ranked_examples
    ]
    squad_features, squad_dataset = squad_convert_examples_to_features(
        examples=squad_examples,
        tokenizer=self.tokenizer,
        max_seq_length=512,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=cpu_count(),
    )

    eval_batch_size = self.per_gpu_eval_batch_size * max(1, self.n_gpu)
    eval_sampler = SequentialSampler(squad_dataset)
    eval_dataloader = DataLoader(squad_dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    # multi-gpu evaluate
    if self.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        self.model = torch.nn.DataParallel(self.model)

    # Eval!
    logger.info("***** Running evaluation of QA *****")
    logger.info("  Num examples = %d", len(squad_dataset))
    logger.info("  Batch size = %d", eval_batch_size)

    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating reader"):
        self.model.eval()
        batch = tuple(t.to(self.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if self.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if self.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(self.model, "config") and hasattr(self.model.config, "lang2id"):
                    inputs.update({
                        "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * self.lang_id).to(self.device)
                    })

            outputs = self.model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = squad_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    # Compute predictions (SQuAD v2-style unanswerable handling is always on here;
    # the original guarded this with a dead `if True:` branch)
    output_prediction_file = os.path.join(self.model_name_or_path, "predictions.json")
    output_nbest_file = os.path.join(self.model_name_or_path, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(self.model_name_or_path, "null_odds.json")

    # XLNet and XLM use a more complex post-processing procedure
    if self.model_type in ["xlnet", "xlm"]:
        start_n_top = (self.model.config.start_n_top if hasattr(self.model, "config")
                       else self.model.module.config.start_n_top)
        end_n_top = (self.model.config.end_n_top if hasattr(self.model, "config")
                     else self.model.module.config.end_n_top)
        predictions = compute_predictions_log_probs(
            squad_examples,
            squad_features,
            all_results,
            n_best_size=self.n_best_size,
            max_answer_length=self.max_answer_length,
            output_prediction_file=output_prediction_file,
            output_nbest_file=output_nbest_file,
            output_null_log_odds_file=output_null_log_odds_file,
            start_n_top=start_n_top,
            end_n_top=end_n_top,
            version_2_with_negative=True,
            tokenizer=self.tokenizer,  # was `self.okenizer`, a typo
            verbose_logging=True,
        )
    else:
        predictions = compute_predictions_logits(
            squad_examples,
            squad_features,
            all_results,
            n_best_size=self.n_best_size,
            max_answer_length=self.max_answer_length,
            do_lower_case=True,
            output_prediction_file=output_prediction_file,
            output_nbest_file=output_nbest_file,
            output_null_log_odds_file=output_null_log_odds_file,
            verbose_logging=True,
            version_2_with_negative=True,
            null_score_diff_threshold=0.0,
            tokenizer=self.tokenizer,
        )

    logger.info("predictions: {}".format(predictions))
    with open(output_nbest_file) as f:
        output_nbest = json.load(f)
    return output_nbest
def __get_predictions(self, dataloader, features, samples, prefix=""):
    self.model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    all_results = []
    for batch in tqdm(dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if self.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if self.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

            outputs = self.model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(self.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(self.output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(self.output_dir, "null_odds_{}.json".format(prefix))

    # XLNet and XLM use a more complex post-processing procedure
    if self.model_type in ["xlnet", "xlm"]:
        start_n_top = (self.model.config.start_n_top if hasattr(self.model, "config")
                       else self.model.module.config.start_n_top)
        end_n_top = (self.model.config.end_n_top if hasattr(self.model, "config")
                     else self.model.module.config.end_n_top)
        predictions = compute_predictions_log_probs(
            samples, features, all_results,
            self.n_best_size, self.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top,
            True,  # version_2_with_negative
            self.tokenizer,
            False,  # verbose_logging
        )
    else:
        predictions = compute_predictions_logits(
            samples, features, all_results,
            self.n_best_size, self.max_answer_length,
            True,  # do_lower_case
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            False,  # verbose_logging
            True,  # version_2_with_negative
            self.null_score_diff_threshold, self.tokenizer)

    return predictions
def evaluate(model, tokenizer, output_dir, prefix="", bs=2):
    # NOTE: this function relies on module-level `device` and (in the lang_id
    # branch) `args`, and hardcodes XLNet/XLM-style inputs and post-processing.
    dataset, examples, features = load_and_cache_examples(tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    eval_batch_size = bs

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    # Eval!
    print("***** Running evaluation {} *****".format(prefix))
    print("  Num examples = %d" % len(dataset))
    print("  Batch size = %d" % eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

            # for lang_id-sensitive xlm models
            if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                inputs.update({
                    "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(device)
                })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    print("  Evaluation done in total %f secs (%f sec per example)" % (evalTime, evalTime / len(dataset)))

    # Compute predictions
    output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))

    # XLNet and XLM use a more complex post-processing procedure
    start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
    end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
    predictions = compute_predictions_log_probs(
        examples, features, all_results,
        20,  # n_best_size
        30,  # max_answer_length
        output_prediction_file, output_nbest_file, output_null_log_odds_file,
        start_n_top, end_n_top,
        True,  # version_2_with_negative
        tokenizer,
        False,  # verbose_logging
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            if args.model_type != "distilbert":
                # XLM doesn't use segment_ids
                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]
            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top, args.version_2_with_negative,
            tokenizer, args.verbose_logging)
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.version_2_with_negative,
            args.null_score_diff_threshold)

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def evaluate(args, model, tokenizer, prefix="", save_dir="", save_log_path=None):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    # (was `if not save_dir ...: os.makedirs(save_dir)`, which would try to create "")
    if save_dir and not os.path.exists(save_dir) and args.local_rank in [-1, 0]:
        os.makedirs(save_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    # y_cls_correct = 0
    # y_cls_incorrect = 0
    y_cls_tp, y_cls_tn, y_cls_fp, y_cls_fn = 0, 0, 0, 0

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update({
                        "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            is_impossible = eval_feature.is_impossible

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits, logits_cls, prob_cls = output
                prob_cls = np.asarray(prob_cls, dtype=float)  # np.float is deprecated
                predict_cls = np.argmax(prob_cls)
                # Tally the answerability classifier against the gold label.
                if predict_cls == int(not is_impossible):
                    if is_impossible:
                        y_cls_tn += 1
                    else:
                        y_cls_tp += 1
                else:
                    if is_impossible:
                        y_cls_fp += 1
                    else:
                        y_cls_fn += 1
                result = SquadResult(unique_id, start_logits, end_logits)
                # Add cls prediction
                if args.force_cls_pred:
                    result.prob_cls = prob_cls

            all_results.append(result)

    # print(y_cls_correct, y_cls_incorrect)
    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(save_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(save_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top, args.version_2_with_negative,
            tokenizer, args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.version_2_with_negative,
            args.null_score_diff_threshold, tokenizer,
        )

    if args.force_cls_pred:
        example_index_to_features = collections.defaultdict(list)
        for feature in features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

        # Force an empty ("no answer") prediction when the classifier is confident.
        n_force = 0
        for example_index, example in enumerate(examples):
            eval_features = example_index_to_features[example_index]
            prob = []
            for eval_feature in eval_features:
                eval_result = unique_id_to_result[eval_feature.unique_id]
                prob.append(eval_result.prob_cls[0])
            if np.mean(prob) >= 0.8:
                predictions[example.qas_id] = ""
                n_force += 1
        print("\n")
        print("num of forced predictions:", n_force)

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    cls_accuracy = (y_cls_tn + y_cls_tp) / (y_cls_tn + y_cls_tp + y_cls_fn + y_cls_fp)
    cls_no_ans_accuracy = y_cls_tn / (y_cls_tn + y_cls_fp)
    cls_has_ans_accuracy = y_cls_tp / (y_cls_tp + y_cls_fn)

    # Add CLS accuracy to result
    results.update({
        "cls_accuracy": cls_accuracy,
        "cls_no_ans_accuracy": cls_no_ans_accuracy,
        "cls_has_ans_accuracy": cls_has_ans_accuracy,
    })

    # save log to file
    if save_log_path:
        util.save_json_file(save_log_path, results)

    return results
def evaluate(args, model, tokenizer, prefix="", adapter_names=None):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "adapter_names": adapter_names,
            }
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update({
                        "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top, args.version_2_with_negative,
            tokenizer, args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.version_2_with_negative,
            args.null_score_diff_threshold, tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def evaluate(self, dataset, examples, features, prefix=""):
    eval_batch_size = 8
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        self._model.eval()
        batch = tuple(t.to(self._device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            if self._model_name in ["xlnet"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
            print("Coding: inputs ", inputs)
            outputs = self._model(**inputs)

        # feature is needed
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            print("Coding: unique_id ", unique_id)

            output = [self._to_list(output[i]) for output in outputs]
            # XLNet-style outputs: five prediction tensors per example.
            start_logits = output[0]
            start_top_index = output[1]
            end_logits = output[2]
            end_top_index = output[3]
            cls_logits = output[4]
            result = SquadResult(
                unique_id, start_logits, end_logits,
                start_top_index=start_top_index,
                end_top_index=end_top_index,
                cls_logits=cls_logits,
            )
            all_results.append(result)

    # Compute predictions
    output_dir = os.getcwd()
    output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))

    version_2_with_negative = True
    if version_2_with_negative:
        output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    n_best_size = 20
    max_answer_length = 30
    verbose_logging = True

    if self._model_name in ["xlnet"]:
        start_n_top = self._model.config.start_n_top
        end_n_top = self._model.config.end_n_top
        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            n_best_size, max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top, version_2_with_negative,
            self._tokenizer, verbose_logging)
    else:
        # Only XLNet-style post-processing is implemented here; without this
        # guard, `predictions` would be undefined below.
        raise NotImplementedError("only XLNet-style post-processing is supported")

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results