def compute_predictions_logits(self, all_results, prefix=""):
    output_prediction_file = os.path.join(self.args.save_dir, "predictions.json")
    output_nbest_file = os.path.join(self.args.save_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(self.args.save_dir, "null_odds.json")

    # The bare name below resolves to the module-level compute_predictions_logits
    # imported from transformers, not to this method.
    predictions = compute_predictions_logits(
        self.examples,
        self.features,
        all_results,
        self.args.n_best_size,
        self.args.max_answer_length,
        True,   # do_lower_case
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        False,  # version_2_with_negative
        0.0,    # null_score_diff_threshold
        self.tokenizer,
    )
    results = squad_evaluate(self.examples, predictions)
    return results
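# The snippets below all share the same post-processing pattern: collect one
# SquadResult per feature, convert the logits to text predictions with
# compute_predictions_logits(), then score them with squad_evaluate(). A
# minimal, self-contained sketch of that pattern follows; the helper name
# score_squad is illustrative, and passing None for the output files simply
# skips writing the prediction JSONs in recent transformers versions.
from transformers.data.metrics.squad_metrics import (
    compute_predictions_logits,
    squad_evaluate,
)


def score_squad(examples, features, all_results, tokenizer):
    """Turn raw SquadResult logits into text answers and SQuAD EM/F1 scores."""
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=True,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=False,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer,
    )
    return squad_evaluate(examples, predictions)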
def evaluate(model, tokenizer):
    # Evaluate
    dataset, examples, features = load_and_cache_examples(tokenizer, is_training=False)
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=8)

    # Eval!
    print("***** Running evaluation *****")
    print(" Num examples = ", len(dataset))

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm_notebook(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [output[i].detach().cpu().tolist() for output in outputs]
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    print(" Evaluation done in total %f secs (%f sec per example)"
          % (evalTime, evalTime / len(dataset)))

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=False,
        output_prediction_file="predictions.json",
        output_nbest_file="nbest_predictions.json",
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=False,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer,  # required by recent transformers versions
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
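# A minimal usage sketch for the notebook-style evaluate() above. The checkpoint
# path is illustrative, and `device` plus load_and_cache_examples are assumed to
# be defined in the surrounding notebook, exactly as in the snippet itself.
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/finetuned-checkpoint")
model = AutoModelForQuestionAnswering.from_pretrained("path/to/finetuned-checkpoint").to(device)
results = evaluate(model, tokenizer)
print("EM = %.2f, F1 = %.2f" % (results["exact"], results["f1"]))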
def aggregate_reader_metrics(examples, reader_predictions, reader_metrics):
    '''Scores and aggregates reader metrics'''
    reader_eval = dict(squad_evaluate(examples, reader_predictions))
    reader_took = np.mean(reader_metrics)
    reader_eval['Average Prediction Duration'] = reader_took
    return reader_eval
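# Hypothetical call to aggregate_reader_metrics(): `examples` is the list of
# SquadExample objects being scored, reader_predictions maps each qas_id to its
# predicted answer text, and reader_metrics holds per-question prediction
# durations in seconds (the values below are made up for illustration).
reader_predictions = {ex.qas_id: "a predicted answer" for ex in examples}
reader_metrics = [0.042, 0.051, 0.038]
reader_eval = aggregate_reader_metrics(examples, reader_predictions, reader_metrics)
print(reader_eval["exact"], reader_eval["f1"], reader_eval["Average Prediction Duration"])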
def evaluate_full_dataset(self, data_loader: DataLoader): all_results = [] for batch in data_loader: inputs = { "input_ids": batch[0].cuda(), "attention_mask": batch[1].cuda(), "token_type_ids": batch[2].cuda(), } feature_indices = batch[3] outputs = self.model(**inputs) for i, feature_index in enumerate(feature_indices): eval_feature = self.validation_features[feature_index.item()] unique_id = int(eval_feature.unique_id) output = [ output[i].detach().cpu().tolist() for output in outputs ] start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) output_prediction_file = None output_nbest_file = None output_null_log_odds_file = None task = self.context.get_data_config().get("task") if task == "SQuAD1.1": version_2_with_negative = False elif task == "SQuAD2.0": version_2_with_negative = True else: raise NameError(f"Incompatible dataset '{task}' detected") # TODO: Make verbose logging configurable verbose_logging = False predictions = compute_predictions_logits( self.validation_examples, self.validation_features, all_results, self.context.get_hparam("n_best_size"), self.context.get_hparam("max_answer_length"), self.context.get_hparam("do_lower_case"), output_prediction_file, output_nbest_file, output_null_log_odds_file, verbose_logging, version_2_with_negative, self.context.get_hparam("null_score_diff_threshold"), self.tokenizer, ) results = squad_evaluate(self.validation_examples, predictions) return results
def evaluate(self, model, dataset, examples, features):
    eval_batch_size, eval_dataloader = self.get_dataloader_sampler(dataset)

    # multi-gpu evaluate
    if self.args_dict[N_GPU] > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(self.global_step))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(self.args_dict[DEVICE]) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

    eval_time = timeit.default_timer() - start_time
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                eval_time, eval_time / len(dataset))

    # Compute predictions
    predictions = self.calcuate_predictions(all_results, examples, features)

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results, eval_time
def qa_evaluate(lang, test_set, model_type, loader, bert_model, learner, save_dir): all_results, loss, uids = [], [], [] examples = test_set.examples features = test_set.features for batch in loader: with torch.no_grad(): input_ids, attention_mask, token_type_ids, labels, unique_ids = ( batch[0], batch[1], batch[2], batch[3], batch[4], ) bert_output = bert_model(input_ids, attention_mask, token_type_ids) outputs = learner(bert_output, labels=labels, attention_mask=attention_mask) loss.append(outputs.loss.mean().item()) for i, uid in enumerate(unique_ids): unique_id = int(uid.item()) start_logits = outputs.start_logits[i].detach().cpu().tolist() end_logits = outputs.end_logits[i].detach().cpu().tolist() result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) uids.append(unique_id) save_dir = os.path.join(save_dir, "result") os.makedirs(save_dir, exist_ok=True) output_prediction_file = os.path.join(save_dir, f"{lang}.predictions") output_nbest_file = os.path.join(save_dir, f"{lang}.nbest_predictions") features = [f for f in features if f.unique_id in uids] qas_ids = list(dict.fromkeys([f.qas_id for f in features])) predictions = compute_predictions_logits( examples, features, all_results, n_best_size=20, max_answer_length=30, do_lower_case=False, output_prediction_file=output_prediction_file, output_nbest_file=output_nbest_file, output_null_log_odds_file=None, verbose_logging=True, version_2_with_negative=False, null_score_diff_threshold=-np.inf, tokenizer=AutoTokenizer.from_pretrained(model_type), ) results = squad_evaluate(test_set.get_by_ids(qas_ids), predictions) return torch.tensor(loss), dict(results)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--predict_file",
        default="data/squad/dev-v2.0.json",
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json",
    )
    parser.add_argument("--predict_tag_file", default="data/squad/dev-v2.0_tag", type=str)
    parser.add_argument("--prediction_file", type=str)
    args = parser.parse_args()

    eval_examples = read_squad_examples(
        input_file=args.predict_file,
        input_tag_file=args.predict_tag_file,
        is_training=False,
    )
    with open(args.prediction_file) as f:
        result = json.load(f)
    print(json.dumps(dict(squad_evaluate(eval_examples, result)), indent=4))
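# Example invocation (the script name and prediction path are illustrative);
# the script rescores an existing predictions JSON against the SQuAD dev set:
#
#   python evaluate_predictions.py \
#       --predict_file data/squad/dev-v2.0.json \
#       --predict_tag_file data/squad/dev-v2.0_tag \
#       --prediction_file output/predictions.json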
def evaluate_full_dataset(self, data_loader: DataLoader):
    all_results = []
    for batch in data_loader:
        inputs = {
            "input_ids": batch[0].cuda(),
            "attention_mask": batch[1].cuda(),
            "token_type_ids": batch[2].cuda(),
        }
        feature_indices = batch[3]
        outputs = self.model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = self.validation_features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)
            output = [output[i].detach().cpu().tolist() for output in outputs]
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

    output_prediction_file = None
    output_nbest_file = None
    output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        self.validation_examples,
        self.validation_features,
        all_results,
        self.context.get_hparam("n_best_size"),
        self.context.get_hparam("max_answer_length"),
        True,   # do_lower_case
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        True,   # verbose_logging
        False,  # version_2_with_negative
        self.context.get_hparam("null_score_diff_threshold"),
        self.tokenizer,
    )
    results = squad_evaluate(self.validation_examples, predictions)
    return results
async def accuracy(self, sources: Sources): if not os.path.isfile( os.path.join(self.parent.config.output_dir, "pytorch_model.bin")): raise ModelNotTrained("Train model before assessing for accuracy.") self.tokenizer = AutoTokenizer.from_pretrained( self.parent.config.output_dir, do_lower_case=self.parent.config.do_lower_case, ) eval_examples = await self._preprocess_data(sources) features, dataset = squad_convert_examples_to_features( examples=eval_examples, tokenizer=self.tokenizer, max_seq_length=self.parent.config.max_seq_length, doc_stride=self.parent.config.doc_stride, max_query_length=self.parent.config.max_query_length, is_training=False, return_dataset="pt", ) results = {} if self.parent.config.local_rank in [-1, 0]: logger.info( "Loading checkpoints saved during training for evaluation") self.model = AutoModelForQuestionAnswering.from_pretrained( self.parent.config.output_dir) self.model.to(self.parent.config.device) # Evaluate predictions = await self._custom_accuracy(eval_examples, features, dataset) results = squad_evaluate(eval_examples, predictions) logger.info("Results: {}".format(results)) # return results return Accuracy(results["f1"])
def compute_predictions_logits(self, all_results, prefix=""):
    output_prediction_file = os.path.join(
        self.args.save_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        self.args.save_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(
        self.args.save_dir, "null_odds_{}.json".format(prefix))

    predictions = compute_predictions_logits(
        self.examples,
        self.features,
        all_results,
        20,     # n_best_size
        30,     # max_answer_length
        True,   # do_lower_case
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,   # version_2_with_negative
        0.0,    # null_score_diff_threshold
        self.tokenizer,
    )
    results = squad_evaluate(self.examples, predictions)
    return results
def evaluate(args, model, tokenizer, prefix="", global_step=None): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) all_results = [] start_time = timeit.default_timer() for batch in progress_bar(eval_dataloader): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], } if args.model_type in [ "xlm", "roberta", "distilbert", "distilkobert", "xlm-roberta" ]: del inputs["token_type_ids"] # reforbert인 경우 if args.model_type in ["reforbert"]: del inputs["attention_mask"] example_indices = batch[3] outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. if len(output) >= 5: start_logits = output[0] start_top_index = output[1] end_logits = output[2] end_top_index = output[3] cls_logits = output[4] result = SquadResult( unique_id, start_logits, end_logits, start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits, ) else: start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join( args.output_dir, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join( args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) # Write the result # Write the evaluation result on file output_dir = os.path.join(args.output_dir, 'eval') if not os.path.exists(output_dir): os.makedirs(output_dir) output_eval_file = os.path.join( output_dir, "eval_result_{}_{}.txt".format( list(filter(None, args.model_name_or_path.split("/"))).pop(), global_step)) logger.info("***** Official Eval results *****") with open(output_eval_file, "w", encoding='utf-8') as f: official_eval_results = eval_during_train(args) for key in sorted(official_eval_results.keys()): logger.info(" %s = %s", key, str(official_eval_results[key])) f.write(" {} = {}\n".format(key, str(official_eval_results[key]))) return results
def evaluate(args, model, tokenizer, prefix=""): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) all_results = [] start_time = timeit.default_timer() for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() seq_lens = torch.sum((batch[0] != 0).to(torch.int32), dim=1).numpy() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): # inputs = { # "input_ids": batch[0], # "attention_mask": batch[1].half() if args.data_type == 'fp16' else batch[1], # "token_type_ids": batch[2], # } inputs = [ batch[0], batch[1].half() if args.data_type == 'fp16' else batch[1], batch[2] ] example_indices = batch[3] # outputs = model(**inputs) outputs = model(*inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. if len(output) >= 5: start_logits = output[0] start_top_index = output[1] end_logits = output[2] end_top_index = output[3] cls_logits = output[4] result = SquadResult( unique_id, start_logits, end_logits, start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits, ) else: start_logits, end_logits = output result = SquadResult(unique_id, start_logits[:seq_lens[i]], end_logits[:seq_lens[i]]) all_results.append(result) evalTime = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join( args.output_dir, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join( args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) return results
def evaluate(args, model, tokenizer, device, prefix=""): eval_dataset, examples, features = data.load_and_cache_examples( args.validation, tokenizer, args, evaluate=True, output_examples=True, ) eval_dataloader = data.get_dataloader(eval_dataset, args.per_gpu_eval_batch_size, evaluate=True) all_results = [] start_time = timeit.default_timer() eval_batches = 0 for batch in eval_dataloader: model.eval() batch = tuple(t.to(device) for t in batch) eval_batches += 1 with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], } if args.model_type in [ "xlm", "roberta", "distilbert", "camembert" ]: del inputs["token_type_ids"] feature_indices = batch[3] # XLNet and XLM use more arguments for their predictions if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) # for lang_id-sensitive xlm models if hasattr(model, "config") and hasattr( model.config, "lang2id"): inputs.update({ "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(device) }) outputs = model(**inputs) for i, feature_index in enumerate(feature_indices): # TODO: i and feature_index are the same number! Simplify by removing enumerate? eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. if len(output) >= 5: start_logits = output[0] start_top_index = output[1] end_logits = output[2] end_top_index = output[3] cls_logits = output[4] result = SquadResult( unique_id, start_logits, end_logits, start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits, ) else: start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / (eval_batches * args.per_gpu_eval_batch_size)) # Compute predictions output_prediction_file = os.path.join(args.output_data_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join( args.output_data_dir, "nbest_predictions_{}.json".format(prefix)) if args.has_unanswerable: output_null_log_odds_file = os.path.join( args.output_data_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None # XLNet and XLM use a more complex post-processing procedure if args.model_type in ["xlnet", "xlm"]: start_n_top = model.config.start_n_top if hasattr( model, "config") else model.module.config.start_n_top end_n_top = model.config.end_n_top if hasattr( model, "config") else model.module.config.end_n_top predictions = squad_metrics.compute_predictions_log_probs( examples, features, all_results, args.n_best_size, args.max_answer_len, output_prediction_file, output_nbest_file, output_null_log_odds_file, start_n_top, end_n_top, args.has_unanswerable, tokenizer, logger.level < logging.INFO, ) else: predictions = squad_metrics.compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_len, args.uncased_model, output_prediction_file, output_nbest_file, output_null_log_odds_file, logger.level < logging.INFO, args.has_unanswerable, args.null_score_diff_thresh, tokenizer, ) # Compute the F1 and exact scores. results = squad_metrics.squad_evaluate(examples, predictions) return results
def QA_evaluate(args, model, tokenizer, prefix=""): dataset, examples, features = squad_load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) #eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, shuffle=False) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) all_results = [] start_time = timeit.default_timer() for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], } example_indices = batch[3] outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions output_prediction_file = os.path.join(args.output_dir, "squad_predictions_{}.json".format(prefix)) output_nbest_file = os.path.join(args.output_dir, "nbest_squad_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join(args.output_dir, "squad_null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) return results
def adv_evaluate(self, prefix: str, args, tokenizer, dataset, examples, features) -> torch.Tensor: """Performs PGD attack on each example in the evaluation dataset, recording aggregate metrics Parameters ---------- prefix : str The model to be used for training args : tokenizer : The tokenizer used to preprocess the data. dataset : List(torch.utils.data.TensorDataset) The evaluation dataset examples : List(torch.utils.data.TensorDataset) The examples in the evaluation dataset features : List(torch.utils.data.TensorDataset) SQuAD-like features corresponding to the evalaution dataset Returns ------- torch.Tensor The evaluation metrics (Exact Match (EM) and F1-score) """ if not os.path.exists( self.args.output_dir) and self.args.local_rank in [-1, 0]: os.makedirs(self.args.output_dir) # TODO Add batch attacks for eval eval_batch_size = max(1, self.args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size) # multi-gpu evaluate if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel): self.model = torch.nn.DataParallel(self.model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", eval_batch_size) all_results = [] start_time = timeit.default_timer() if self.params.model_type == 'bert': _embed_layer = self.model.bert.get_input_embeddings() elif self.params.model_type == 'distilbert': _embed_layer = self.model.distilbert.get_input_embeddings() elif self.params.model_type == 'albert': _embed_layer = self.model.albert.get_input_embeddings() self.model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(self.args.device) for t in batch) adv_outputs = [] # (k_iter, batch_size) # Set static embedding layer _delta = None for i_iter in range(args.K): input_embedding = torch.stack( [_embed_layer(x) for x in batch[0]]) if not _delta: m = torch.distributions.multivariate_normal.MultivariateNormal( torch.zeros(768), torch.eye(768) * (args.sigma**2)) _sample = m.sample((args.max_seq_length, )) _delta = torch.tensor(_sample, requires_grad=True, device=self.args.device) adv_input_embedding = input_embedding + _delta inputs = { "input_ids": None, "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], "inputs_embeds": adv_input_embedding, } if self.params.model_type in ["xlm", "roberta", "distilbert"]: del inputs["token_type_ids"] intermed_adv_outputs = self.model(**inputs) adv_loss = intermed_adv_outputs[0] adv_loss.backward() # Calculate g_adv and update delta g_adv = _delta.grad.data.detach() _delta = self._adv_sgn_attack(_delta, args.eps, args.eta, 'inf') del g_adv _delta.grad.zero_() # TODO: Check inf/NaN. How should we proceed with eval if NaNs? 
with torch.no_grad(): # Generate adversarial loss with perturbed inputs against predicted logits inputs = { "input_ids": None, "attention_mask": batch[1], "token_type_ids": batch[2], "inputs_embeds": input_embedding + _delta } if self.params.model_type in ["xlm", "roberta", "distilbert"]: del inputs["token_type_ids"] adv_outputs = self.model(**inputs) example_indices = batch[5] for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) adv_output = [ tensor_to_list(output[i]) for output in adv_outputs ] start_logits, end_logits = adv_output result = SquadResult(unique_id, start_logits, end_logits) example_id = example_index.item() all_results.append(result) eval_time = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", eval_time, eval_time / len(dataset)) predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, None, None, None, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, ) alum_results = squad_evaluate(examples, predictions) print('===alum_results: ', alum_results) return alum_results
def evaluate(args, model, tokenizer, prefix=""): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) all_results = [] start_time = timeit.default_timer() for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = {'input_ids': batch[0], 'attention_mask': batch[1]} if args.model_type != 'distilbert': inputs[ 'token_type_ids'] = None if args.model_type == 'xlm' else batch[ 2] # XLM don't use segment_ids example_indices = batch[3] # XLNet and XLM use more arguments for their predictions if args.model_type in ['xlnet', 'xlm']: inputs.update({'cls_index': batch[4], 'p_mask': batch[5]}) outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. if len(output) >= 5: start_logits = output[0] start_top_index = output[1] end_logits = output[2] end_top_index = output[3] cls_logits = output[4] result = SquadResult(unique_id, start_logits, end_logits, start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits) else: start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join( args.output_dir, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join( args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None # XLNet and XLM use a more complex post-processing procedure if args.model_type in ['xlnet', 'xlm']: start_n_top = model.config.start_n_top if hasattr( model, "config") else model.module.config.start_n_top end_n_top = model.config.end_n_top if hasattr( model, "config") else model.module.config.end_n_top predictions = compute_predictions_log_probs( examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, start_n_top, end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging) else: predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) # Compute the F1 and exact scores. 
results = squad_evaluate(examples, predictions) return results
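# For reference, squad_evaluate() returns an OrderedDict whose main keys are
# "exact", "f1", and "total", plus HasAns_*/NoAns_* breakdowns and
# best_exact/best_f1 thresholds when unanswerable questions are scored, e.g.:
#
#   results = evaluate(args, model, tokenizer)
#   print("EM = {:.2f}  F1 = {:.2f}  ({} questions)".format(
#       results["exact"], results["f1"], results["total"]))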
def evaluate_ensemble(args, checkpoints, tokenizer, model_class, prefix=""): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) all_results = [] start_time = timeit.default_timer() model_list = [] for ckpt in checkpoints: logger.info("Evaluate the following fine_tuned_model: %s", ckpt) model_list.append(model_class.from_pretrained(ckpt)) for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], } if args.model_type in ["xlm", "roberta", "distilbert"]: del inputs["token_type_ids"] example_indices = batch[3] # XLNet and XLM use more arguments for their predictions if args.model_type in ["xlnet", "xlm"]: raise NotImplementedError # inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) # # for lang_id-sensitive xlm models # if hasattr(model, "config") and hasattr(model.config, "lang2id"): # inputs.update( # {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} # ) outputs_list = [] for model in model_list: model.to(args.device) model.eval() with torch.no_grad(): outputs = model(**inputs) outputs_list.append(outputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) start_logits_list, end_logits_list = [], [] for outputs in outputs_list: output = [to_list(output[i]) for output in outputs] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. 
if len(output) >= 5: raise NotImplementedError # start_logits = output[0] # start_top_index = output[1] # end_logits = output[2] # end_top_index = output[3] # cls_logits = output[4] # result = SquadResult( # unique_id, # start_logits, # end_logits, # start_top_index=start_top_index, # end_top_index=end_top_index, # cls_logits=cls_logits, # ) else: start_logits, end_logits = output start_logits_list.append(start_logits) end_logits_list.append(end_logits) if args.model_type in ["xlnet", "xlm"]: raise NotImplementedError else: start_logits_list = np.array(start_logits_list) end_logits_list = np.array(end_logits_list) #Ensembling method (eg max/avg/etc) start_logits = list(start_logits_list.mean(axis=0)) end_logits = list(end_logits_list.mean(axis=0)) result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None # XLNet and XLM use a more complex post-processing procedure if args.model_type in ["xlnet", "xlm"]: raise NotImplementedError # start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top # end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top predictions = compute_predictions_log_probs( examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, start_n_top, end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging, ) else: predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) return results
def ensemble_vote(args, save_dir='', save_log_path=None, prefix='', predict_prob_mode='add'): examples, all_model_features, all_model_results, tokenizers = load_saved_examples( args, evaluate=True) if not save_dir and args.local_rank in [-1, 0]: os.makedirs(save_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly # eval_sampler = SequentialSampler(dataset) # eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! logger.info(f"***** Running ensemble {prefix}*****") logger.info(" Num examples = %d", len(examples)) logger.info(" Batch size = %d", args.eval_batch_size) # We do pure voting now, not taking new inputs # start_time = timeit.default_timer() # evalTime = timeit.default_timer() - start_time # logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions output_prediction_file = os.path.join(save_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join( save_dir, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join( save_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None all_predictions = [] all_probs = [] logger.info(f'predict_prob_mode: {predict_prob_mode}') for model_idx in tqdm(range(len(tokenizers)), desc="Predicting"): features = all_model_features[model_idx] all_results = all_model_results[model_idx] tokenizer = tokenizers[model_idx] predictions, probs = hack.compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, prob_mode=predict_prob_mode) all_predictions.append(predictions) all_probs.append(probs) # continue # num of predictions num_of_predicions = len(all_predictions[0]) logger.info(f'Number of predicions {num_of_predicions}') final_predictions = collections.OrderedDict() output_result = collections.OrderedDict() # Grid Search if args.do_grid_search: grid_search_results = collections.OrderedDict() grid_search_predictions = collections.OrderedDict() for weights in product(np.arange(6), repeat=len(all_probs)): if weights == (0, 0, 0, 0, 0): continue for qas_id in all_predictions[0].keys(): probs = np.array([d_prob[qas_id] for d_prob in all_probs]) for i, w in enumerate(weights): probs[i] *= w idx = np.argmax(probs) final_predictions[qas_id] = all_predictions[idx][qas_id] """ logger.info('Model individual results') for i in range(len(tokenizers)): results = squad_evaluate(examples, all_predictions[i]) logger.info(results) """ # Compute the F1 and exact scores. 
logger.info(f'Weights: {weights}') logger.info('Ensemble results') final_results = squad_evaluate(examples, final_predictions) logger.info(final_results) if len(grid_search_results) == 0: best_weights = weights grid_search_results = final_results grid_search_predictions = final_predictions else: if grid_search_results['exact'] + grid_search_results[ 'f1'] < final_results['exact'] + final_results['f1']: best_weights = weights grid_search_results = final_results grid_search_predictions = final_predictions # save log to file logger.info(f'Best Weights: {best_weights}') output_result[best_weights] = grid_search_results util.save_json_file(os.path.join(save_dir, 'eval_results.json'), output_result) # save prediction to file # TODO save grid search best util.save_json_file(os.path.join(save_dir, 'predictions_.json'), grid_search_predictions) util.convert_submission_format_and_save( save_dir, prediction_file_path=os.path.join(save_dir, 'predictions_.json')) return grid_search_results else: for qas_id in all_predictions[0].keys(): probs = np.array([d_prob[qas_id] for d_prob in all_probs]) idx = np.argmax(probs) final_predictions[qas_id] = all_predictions[idx][qas_id] logger.info('Model individual results') for i in range(len(tokenizers)): results = squad_evaluate(examples, all_predictions[i]) logger.info(results) # Compute the F1 and exact scores. logger.info('Ensemble results') final_results = squad_evaluate(examples, final_predictions) logger.info(final_results) # save log to file util.save_json_file(os.path.join(save_dir, 'eval_results.json'), final_results) util.save_json_file(os.path.join(save_dir, 'predictions_.json'), final_predictions) util.convert_submission_format_and_save( save_dir, prediction_file_path=os.path.join(save_dir, 'predictions_.json')) return final_results
def evaluate(args: Args, model, tokenizer, dataset, examples, features, suffix="", return_raw=False): if args.no_cuda is None: args.no_cuda = not _is_gpu_available() if args.predictions_folder: assert args.eval_file, "Need name of the eval file to save predictions!" device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = 0 if args.no_cuda else torch.cuda.device_count() eval_batch_size = args.per_gpu_eval_batch_size * max(1, n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size) model.to(device) # multi-gpu evaluate if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! click.echo( f"Generating predictions for model {click.style(args.model_path, fg='blue')}, " f"running on {click.style(str(device), fg='green')}") click.echo(" Num examples = %d" % len(dataset)) click.echo(" Batch size = %d" % eval_batch_size) all_results = [] start_time = timeit.default_timer() for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], } if args.model_type in [ "xlm", "roberta", "distilbert", "camembert" ]: del inputs["token_type_ids"] feature_indices = batch[3] # XLNet and XLM use more arguments for their predictions if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) # for lang_id-sensitive xlm models if hasattr(model, "config") and hasattr( model.config, "lang2id"): inputs.update({ "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(device) }) outputs = model(**inputs) for i, feature_index in enumerate(feature_indices): eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. 
if len(output) >= 5: start_logits = output[0] start_top_index = output[1] end_logits = output[2] end_top_index = output[3] cls_logits = output[4] result = SquadResult( unique_id, start_logits, end_logits, start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits, ) else: start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) eval_time = timeit.default_timer() - start_time logger.info( f"Evaluation done in total {eval_time} secs ({eval_time / len(dataset)} sec per example)" ) eval_file = args.eval_file predictions_folder = args.predictions_folder v2 = args.v2 if predictions_folder: out_file = get_output_predictions_file_name(eval_file, predictions_folder, suffix) logger.info(f"Saving predictions in {out_file}") # Compute predictions file_name = os.path.basename(out_file) output_prediction_file = os.path.join(predictions_folder, file_name) # output_nbest_file = os.path.join(predictions_folder, f"nbest-{file_name}") output_nbest_file = None if v2: output_null_log_odds_file = os.path.join(predictions_folder, f"null-odds-{file_name}") else: output_null_log_odds_file = None else: logger.info("Not saving predictions...") output_prediction_file = None output_nbest_file = None output_null_log_odds_file = None # XLNet and XLM use a more complex post-processing procedure if args.model_type in ["xlnet", "xlm"]: start_n_top = model.config.start_n_top if hasattr( model, "config") else model.module.config.start_n_top end_n_top = model.config.end_n_top if hasattr( model, "config") else model.module.config.end_n_top predictions = compute_predictions_log_probs( examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, start_n_top, end_n_top, args.v2, tokenizer, args.verbose_logging, ) else: predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.v2, args.null_score_diff_threshold, tokenizer, ) # Compute the F1 and exact scores. # results = squad_evaluate(examples, predictions) # return results if return_raw: return predictions else: return squad_evaluate(examples, predictions)
def evaluate_bert(self, model): """After the completion of each training epoch, measure the model's performance on our validation set. """ # Put the model into the evaluation mode. The dropout layers are disabled during # the test time. model.eval() datasets = self.textData.datasets['dev'] features = datasets['features'] eval_sampler = SequentialSampler(datasets['dataset']) dev_dataloader = DataLoader(datasets['dataset'], sampler=eval_sampler, batch_size=args['batchSize']) n_iters = len(datasets['dataset']) # Tracking variables val_accuracy = [] val_loss = [] prefix = "pp" output_prediction_file = os.path.join( args['rootDir'], "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join( args['rootDir'], "nbest_predictions_{}.json".format(prefix)) output_null_log_odds_file = os.path.join( args['rootDir'], "null_odds_{}.json".format(prefix)) all_results = [] # For each batch in our validation set... for batch in dev_dataloader: batch = tuple(t.to(args['device']) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], # "start_positions": batch[3], # "end_positions": batch[4], } # Compute logits with torch.no_grad(): start_logits, end_logits = model.predict(inputs) feature_indices = batch[3] for i, feature_index in enumerate(feature_indices): eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) # output = [to_list(output[i]) for output in outputs] # start_logits, end_logits = output result = SquadResult(unique_id, self.to_list(start_logits[i]), self.to_list(end_logits[i])) all_results.append(result) # val_loss.append(loss.item()) # print(preds, batch.label) # Calculate the accuracy rate # accuracy = (preds.cpu() == torch.LongTensor(batch.label)).numpy().mean() * 100 # val_accuracy.append(accuracy) predictions = compute_predictions_logits( datasets['examples'], datasets['features'], all_results, 20, 30, True, output_prediction_file, output_nbest_file, output_null_log_odds_file, True, True, 0.0, self.textData.tokenizer, ) results = squad_evaluate(datasets['examples'], predictions) # print(results) # Compute the average accuracy and loss over the validation set. # val_loss = np.mean(val_loss) # val_accuracy = np.mean(val_accuracy) return -1, results
def evaluate(args, model, tokenizer, prefix="", save_dir='', save_log_path=None): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not save_dir and args.local_rank in [-1, 0]: os.makedirs(save_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) all_results = [] start_time = timeit.default_timer() # y_cls_correct = 0 # y_cls_incorrect = 0 y_cls_tp, y_cls_tn, y_cls_fp, y_cls_fn = 0, 0, 0, 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], } if args.model_type in [ "xlm", "roberta", "distilbert", "camembert" ]: del inputs["token_type_ids"] example_indices = batch[3] # XLNet and XLM use more arguments for their predictions if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) # for lang_id-sensitive xlm models if hasattr(model, "config") and hasattr( model.config, "lang2id"): inputs.update({ "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device) }) outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) is_impossible = eval_feature.is_impossible output = [to_list(output[i]) for output in outputs] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. 
if len(output) >= 5: start_logits = output[0] start_top_index = output[1] end_logits = output[2] end_top_index = output[3] cls_logits = output[4] result = SquadResult( unique_id, start_logits, end_logits, start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits, ) else: start_logits, end_logits, logits_cls, prob_cls = output prob_cls = np.asarray(prob_cls, dtype=np.float) predict_cls = np.argmax(prob_cls) if predict_cls == int(not is_impossible): if is_impossible: y_cls_tn += 1 else: y_cls_tp += 1 else: if is_impossible: y_cls_fp += 1 else: y_cls_fn += 1 result = SquadResult(unique_id, start_logits, end_logits) # Add cls prediction if args.force_cls_pred: result.prob_cls = prob_cls all_results.append(result) # print(y_cls_correct, y_cls_incorrect) evalTime = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions output_prediction_file = os.path.join(save_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join( save_dir, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join( save_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None # XLNet and XLM use a more complex post-processing procedure if args.model_type in ["xlnet", "xlm"]: start_n_top = model.config.start_n_top if hasattr( model, "config") else model.module.config.start_n_top end_n_top = model.config.end_n_top if hasattr( model, "config") else model.module.config.end_n_top predictions = compute_predictions_log_probs( examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, start_n_top, end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging, ) else: predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, ) if args.force_cls_pred: example_index_to_features = collections.defaultdict(list) for feature in features: example_index_to_features[feature.example_index].append(feature) unique_id_to_result = {} for result in all_results: unique_id_to_result[result.unique_id] = result n_force = 0 for example_index, example in enumerate(examples): eval_features = example_index_to_features[example_index] prob = [] for eval_feature in eval_features: eval_result = unique_id_to_result[eval_feature.unique_id] prob.append(eval_result.prob_cls[0]) if np.mean(prob) >= 0.8: predictions[example.qas_id] = "" n_force += 1 print("\n") print("num of force prediction:", n_force) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) cls_accuracy = (y_cls_tn + y_cls_tp) / (y_cls_tn + y_cls_tp + y_cls_fn + y_cls_fp) cls_no_ans_accuracy = y_cls_tn / (y_cls_tn + y_cls_fp) cls_has_ans_accuracy = y_cls_tp / (y_cls_tp + y_cls_fn) # Add CLS accuracy to result results.update({ 'cls_accuracy': cls_accuracy, 'cls_no_ans_accuracy': cls_no_ans_accuracy, 'cls_has_ans_accuracy': cls_has_ans_accuracy }) # save log to file if save_log_path: util.save_json_file(save_log_path, results) return results
def train(EXP: str, MODEL_NAME: str, DELTA: float, WEIGHT_DECAY: float, DEVICE: str) -> float: EPOCHS = 3 BATCH_SIZE = 13 SAMPLES = 10 FREEZE = True LOGS = "logs" DOC_STRIDE = 128 MAX_SEQ_LENGTH = 384 MAX_QUERY_LENGTH = 64 MAX_ANSWER_LENGTH = 30 N_BEST_SIZE = 20 NULL_SCORE_THRESH = 0.0 LOWER_CASE = True THREADS = 4 LOADER_OPTIONS = { "num_workers": 10, "pin_memory": True } LR = 5e-5 ADAM_EPSILON = 1e-8 N_WARMUP_STEPS = 0 MAX_GRAD_NORM = 1 DATA_DIR = os.path.join("./dataset/squadv1") dumper = Dumper(f'dumps/dump_{EXP}_{MODEL_NAME}_{DELTA}.dump') os.makedirs(LOGS, exist_ok=True) writer_name = f"bayeformers_bert_squad.{EXP}" writer_path = os.path.join(LOGS, writer_name) writer_suff = f".DELTA_{DELTA}.WEIGHT_DECAY_{WEIGHT_DECAY}" writer = SummaryWriter(writer_path + writer_suff) o_model, tokenizer = setup_model(MODEL_NAME, LOWER_CASE) o_model = torch.nn.DataParallel(o_model, device_ids=[0, 1, 2, 3]) o_model.to(DEVICE) squadv1 = { "max_seq_length" : MAX_SEQ_LENGTH, "doc_stride" : DOC_STRIDE, "max_query_length": MAX_QUERY_LENGTH, "threads" : THREADS } train_dataset, train_examples, train_features = setup_squadv1_dataset(DATA_DIR, tokenizer=tokenizer, test=False, **squadv1) test_dataset, test_examples, test_features = setup_squadv1_dataset(DATA_DIR, tokenizer=tokenizer, test=True, **squadv1) train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, **LOADER_OPTIONS) test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, **LOADER_OPTIONS) decay = [param for name, param in o_model.named_parameters() if name in ["bias", "LayerNorm.weight"]] no_decay = [param for name, param in o_model.named_parameters() if name not in ["bias", "LayerNorm.weight"]] params_decay = { "params": decay, "weight_decay": WEIGHT_DECAY } params_no_decay = { "params": no_decay, "weight_decay": 0.0 } parameters = [params_decay, params_no_decay] criterion = nn.CrossEntropyLoss().to(DEVICE) optim = AdamW(parameters, lr=LR, eps=ADAM_EPSILON) scheduler = get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS) # =========================== FREQUENTIST ================================== report = Report() with dumper("frequentist_train"): for epoch in tqdm(range(EPOCHS), desc="Epoch"): # ============================ TRAIN ====================================== o_model.train() report.reset() with dumper("epoch", epoch): pbar = tqdm(train_loader, desc="Train") for inputs in pbar: inputs = setup_inputs(inputs, MODEL_NAME, o_model) inputs = dic2cuda(inputs, DEVICE) start_positions = inputs["start_positions"] end_positions = inputs["end_positions"] optim.zero_grad() outputs = o_model(**inputs) start_logits = outputs[1] end_logits = outputs[2] ignored_idx = start_logits.size(1) start_logits = start_logits.clamp_(0, ignored_idx) end_logits = end_logits.clamp_(0, ignored_idx) criterion.ignore_index = ignored_idx with dumper(): dumper['start_positions'] = start_positions dumper['end_positions'] = end_positions dumper['start_logits'] = start_logits dumper['end_logits'] = end_logits start_loss = criterion(start_logits, start_positions) end_loss = criterion( end_logits, end_positions) start_acc = (torch.argmax(start_logits, dim=1) == start_positions).float().sum() end_acc = (torch.argmax( end_logits, dim=1) == end_positions).float().sum() loss = 0.5 * (start_loss + end_loss) acc = 0.5 * (start_acc + end_acc) loss.backward() nn.utils.clip_grad_norm_(o_model.parameters(), MAX_GRAD_NORM) optim.step() report.total += loss.item() / len(train_loader) report.acc += acc.item() * 100 / len(train_dataset) 
pbar.set_postfix(total=report.total, acc=report.acc) scheduler.step() writer.add_scalar("train_nll", report.total, epoch) writer.add_scalar("train_acc", report.acc, epoch) # ============================ TEST ======================================= o_model.eval() report.reset() with dumper.section("frequentist_test"): with torch.no_grad(): results = [] pbar = tqdm(test_loader, desc="Test") for inputs in pbar: inputs = setup_inputs(inputs, MODEL_NAME, o_model, True) inputs = dic2cuda(inputs, DEVICE) feature_indices = inputs["feature_indices"] del inputs["feature_indices"] outputs = o_model(**inputs) for i, feature_idx in enumerate(feature_indices): eval_feature = test_features[feature_idx.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) results.append(result) with dumper(): dumper['unique_id'] = unique_id dumper['start_logits'] = start_logits dumper['end_logits'] = end_logits predictions = compute_predictions_logits( test_examples, test_features, results, N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE, os.path.join(LOGS, f"preds.frequentist.test.{writer_name + writer_suff}.json"), os.path.join(LOGS, f"nbestpreds.frequentist.test.{writer_name + writer_suff}.json"), None, True, False, NULL_SCORE_THRESH, tokenizer, ) results = squad_evaluate(test_examples, predictions) report.em = results["exact"] report.f1 = results["f1"] report.total = results["total"] print(f'em={report.em}, f1={report.f1}, total={report.total}') writer.add_scalar("test_em", report.em, epoch) writer.add_scalar("test_f1", report.f1, epoch) writer.add_scalar("test_total", report.total, epoch) # ============================ EVALUTATION ==================================== b_model = to_bayesian(o_model, delta=DELTA, freeze=FREEZE) b_model = b_model.to(DEVICE) b_model.eval() report.reset() with dumper("bayesian_eval_before_train"): with torch.no_grad(): results = [] pbar = tqdm(test_loader, desc="Bayesian Eval") for inputs in pbar: inputs = setup_inputs(inputs, MODEL_NAME, o_model, True) inputs = dic2cuda(inputs, DEVICE) feature_indices = inputs["feature_indices"] B = inputs["input_ids"].size(0) del inputs["feature_indices"] samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE) _, _, start_logits, end_logits, log_prior, log_variational_posterior = samples start_logits_list = start_logits.tolist() end_logits_list = end_logits.tolist() for i, feature_idx in enumerate(feature_indices): eval_feature = test_features[feature_idx.item()] unique_id = int(eval_feature.unique_id) result = SquadResult(unique_id, start_logits_list[i], end_logits_list[i]) results.append(result) with dumper(): dumper['unique_id'] = unique_id dumper['start_logits'] = start_logits_list[i] dumper['end_logits'] = end_logits_list[i] predictions = compute_predictions_logits( test_examples, test_features, results, N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE, os.path.join(LOGS, f"preds.bayesian.eval.{writer_name + writer_suff}.json"), os.path.join(LOGS, f"nbestpreds.bayesian.eval.{writer_name + writer_suff}.json"), None, True, False, NULL_SCORE_THRESH, tokenizer, ) results = squad_evaluate(test_examples, predictions) report.em = results["exact"] report.f1 = results["f1"] report.total = results["total"] print(f'em={report.em}, f1={report.f1}, total={report.total}') writer.add_scalar("bayesian_eval_em", report.em, epoch) writer.add_scalar("bayesian_eval_f1", report.f1, epoch) 
writer.add_scalar("bayesian_eval_total", report.total, epoch) # ============================ BAYESIAN ====================================== decay = [param for name, param in b_model.named_parameters() if name in ["bias", "LayerNorm.weight"]] no_decay = [param for name, param in b_model.named_parameters() if name not in ["bias", "LayerNorm.weight"]] params_decay = { "params": decay, "weight_decay": WEIGHT_DECAY } params_no_decay = { "params": no_decay, "weight_decay": 0.0 } parameters = [params_decay, params_no_decay] criterion = nn.CrossEntropyLoss().to(DEVICE) optim = AdamW(parameters, lr=LR, eps=ADAM_EPSILON) scheduler = get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS) with dumper("bayesian_train"): for epoch in tqdm(range(EPOCHS), desc="Bayesian Epoch"): with dumper("epoch", epoch): # ============================ TRAIN ====================================== b_model.train() report.reset() pbar = tqdm(train_loader, desc="Bayesian Train") for inputs in pbar: inputs = setup_inputs(inputs, MODEL_NAME, o_model) inputs = dic2cuda(inputs, DEVICE) start_positions = inputs["start_positions"] end_positions = inputs["end_positions"] B = inputs["input_ids"].size(0) optim.zero_grad() samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE) raw_start_logits, raw_end_logits, start_logits, end_logits, log_prior, log_variational_posterior = samples ignored_idx = start_logits.size(1) start_logits = start_logits.clamp_(0, ignored_idx) end_logits = end_logits.clamp_(0, ignored_idx) criterion.ignore_index = ignored_idx with dumper(): dumper['start_positions'] = start_positions dumper['end_positions'] = end_positions dumper['start_logits'] = start_logits dumper['end_logits'] = end_logits dumper['log_prior'] = log_prior dumper['log_variational_posterior'] = log_variational_posterior start_loss = criterion(start_logits, start_positions) end_loss = criterion( end_logits, end_positions) start_acc = (torch.argmax(start_logits, dim=1) == start_positions).float().sum() end_acc = (torch.argmax( end_logits, dim=1) == end_positions).float().sum() start_acc_std = np.std([(torch.argmax(start_logits.clamp(0, ignored_idx), dim=1) == start_positions).float().sum().item() for start_logits in raw_start_logits]) end_acc_std = np.std([(torch.argmax( end_logits.clamp(0, ignored_idx), dim=1) == end_positions).float().sum().item() for end_logits in raw_end_logits]) nll = 0.5 * (start_loss + end_loss) acc = 0.5 * (start_acc + end_acc) acc_std = 0.5 * (start_acc_std + end_acc_std) loss = (log_variational_posterior - log_prior) / len(train_loader) + nll loss.backward() nn.utils.clip_grad_norm_(b_model.parameters(), MAX_GRAD_NORM) optim.step() report.total += loss.item() / len(train_loader) report.nll += nll.item() / len(train_loader) report.log_prior += log_prior.item() / len(train_loader) report.log_variational_posterior += log_variational_posterior.item() / len(train_loader) report.acc += acc.item() * 100 / len(train_dataset) report.acc_std += acc_std / len(train_loader) pbar.set_postfix( total=report.total, nll=report.nll, log_prior=report.log_prior, log_variational_posterior=report.log_variational_posterior, acc=report.acc, acc_std=report.acc_std, ) scheduler.step() writer.add_scalar("bayesian_train_nll", report.nll, epoch) writer.add_scalar("bayesian_train_acc", report.acc, epoch) writer.add_scalar("bayesian_train_acc_std", report.acc_std, epoch) # ============================ TEST ======================================= b_model.eval() report.reset() with dumper("bayesian_test_after_train"): 
with torch.no_grad(): results = [] pbar = tqdm(test_loader, desc="Bayesian Test") for inputs in pbar: inputs = setup_inputs(inputs, MODEL_NAME, o_model, True) inputs = dic2cuda(inputs, DEVICE) feature_indices = inputs["feature_indices"] B = inputs["input_ids"].size(0) del inputs["feature_indices"] samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE) _, _, start_logits, end_logits, log_prior, log_variational_posterior = samples start_logits_list = start_logits.tolist() end_logits_list = end_logits.tolist() for i, feature_idx in enumerate(feature_indices): eval_feature = test_features[feature_idx.item()] unique_id = int(eval_feature.unique_id) result = SquadResult(unique_id, start_logits_list[i], end_logits_list[i]) results.append(result) with dumper(): dumper['unique_id'] = unique_id dumper['start_logits'] = start_logits_list[i] dumper['end_logits'] = end_logits_list[i] predictions = compute_predictions_logits( test_examples, test_features, results, N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE, os.path.join(LOGS, f"preds.bayesian.test.{writer_name + writer_suff}.json"), os.path.join(LOGS, f"nbestpreds.bayesian.test.{writer_name + writer_suff}.json"), None, True, False, NULL_SCORE_THRESH, tokenizer, ) results = squad_evaluate(test_examples, predictions) report.em = results["exact"] report.f1 = results["f1"] report.total = results["total"] print(f'em={report.em}, f1={report.f1}, total={report.total}') writer.add_scalar("bayesian_test_em", report.em, epoch) writer.add_scalar("bayesian_test_f1", report.f1, epoch) writer.add_scalar("bayesian_test_total", report.total, epoch) # ============================ SAVE ======================================= torch.save({ "weight_decay": WEIGHT_DECAY, "delta" : DELTA, "model" : b_model.state_dict(), "em" : report.em, "f1" : report.f1, "total" : report.total, }, f"{writer_path + writer_suff}.pth") return report.acc
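# A minimal driver sketch for the train() routine above. The experiment tag, model
# identifier, and hyperparameter values below are hypothetical, and the sketch assumes the
# same multi-GPU setup that train() hard-codes via DataParallel(device_ids=[0, 1, 2, 3]).
import itertools
import torch

if __name__ == "__main__":
    DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
    for delta, weight_decay in itertools.product([0.05, 0.1], [0.0, 0.01]):
        acc = train(
            EXP="squadv1_sweep",             # hypothetical experiment tag
            MODEL_NAME="bert-base-uncased",  # hypothetical checkpoint
            DELTA=delta,
            WEIGHT_DECAY=weight_decay,
            DEVICE=DEVICE,
        )
        print(f"delta={delta} weight_decay={weight_decay} -> acc={acc:.2f}")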
def evaluate(self, prefix: str, args, tokenizer, dataset, examples, features) -> dict: """Performs evaluation on the dataset Parameters ---------- prefix : str Prefix used to tag this evaluation run in logs and output file names args : Evaluation arguments (batch size, prediction post-processing options, ...) tokenizer : The tokenizer used to preprocess the data. dataset : torch.utils.data.TensorDataset The evaluation dataset examples : List[SquadExample] The examples in the evaluation dataset features : List[SquadFeatures] SQuAD-like features corresponding to the evaluation dataset Returns ------- dict The evaluation metrics (Exact Match (EM) and F1-score) """ if not os.path.exists(self.args.output_dir) and self.args.local_rank in [-1, 0]: os.makedirs(self.args.output_dir) eval_batch_size = self.args.per_device_eval_batch_size * max(1, self.args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size) # multi-gpu evaluate if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel): self.model = torch.nn.DataParallel(self.model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", eval_batch_size) all_results = [] start_time = timeit.default_timer() for batch in tqdm(eval_dataloader, desc="Evaluating"): self.model.eval() batch = tuple(t.to(self.args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], } if self.params.model_type in ["xlm", "roberta", "distilbert"]: del inputs["token_type_ids"] example_indices = batch[3] outputs = self.model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) output = [tensor_to_list(output[i]) for output in outputs] start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) eval_time = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", eval_time, eval_time / len(dataset)) # Compute predictions predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, None, None, None, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) return results
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""): eval_dataset,features, examples = load_and_cache_examples( args, tokenizer, labels, pad_token_label_id, mode=mode ) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = ( SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) ) eval_dataloader = DataLoader( eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size ) # Eval! logger.info("***** Running evaluation %s *****", prefix) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None all_results = [] start_time = timeit.default_timer() model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], } inputs["bbox"] = batch[5] inputs["token_type_ids"] = (batch[6]) outputs = model(**inputs) example_indices = batch[7] for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(eval_dataset)) output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) predictions = compute_predictions_logits( examples, features, all_results, 20, 30, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, True, True, 0.0, tokenizer, ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) return results
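# The evaluation loop above reads the LayoutLM-style extras positionally: bounding boxes
# from batch[5], token type ids from batch[6], and the feature index from batch[7], so the
# cached TensorDataset must be built in exactly that order. A sketch of that assumed layout
# follows; the tensors parked at positions 2-4 are placeholders, since the real
# load_and_cache_examples may store different fields there.
import torch
from torch.utils.data import TensorDataset

def build_eval_tensor_dataset(input_ids, attention_mask, labels, start_positions, end_positions, bbox, token_type_ids):
    # Positions must line up with the indices used in the evaluation loop above.
    example_index = torch.arange(input_ids.size(0), dtype=torch.long)
    return TensorDataset(
        input_ids,        # batch[0]
        attention_mask,   # batch[1]
        labels,           # batch[2] (placeholder, unused by the eval loop)
        start_positions,  # batch[3] (placeholder)
        end_positions,    # batch[4] (placeholder)
        bbox,             # batch[5]
        token_type_ids,   # batch[6]
        example_index,    # batch[7]
    )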
def evaluate(args, model_path1, model1, model2, model3, tokenizer, prefix=""): dataset, examples, features = load_and_cache_examples(args, model_path1, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model1, torch.nn.DataParallel): model1 = torch.nn.DataParallel(model1) if args.n_gpu > 1 and not isinstance(model2, torch.nn.DataParallel): model2 = torch.nn.DataParallel(model2) if args.n_gpu > 1 and not isinstance(model3, torch.nn.DataParallel): model3 = torch.nn.DataParallel(model3) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) all_results = [] start_time = timeit.default_timer() for batch in tqdm(eval_dataloader, desc="Evaluating"): model1.eval() model2.eval() model3.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], } feature_indices = batch[3] outputs1 = model1(**inputs) outputs2 = model2(**inputs) outputs3 = model3(**inputs) for i, feature_index in enumerate(feature_indices): # TODO: i and feature_index are the same number! Simplify by removing enumerate? eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) output1 = [to_list(output1[i]) for output1 in outputs1] output2 = [to_list(output2[i]) for output2 in outputs2] output3 = [to_list(output3[i]) for output3 in outputs3] start_logits1, end_logits1 = output1 start_logits2, end_logits2 = output2 start_logits3, end_logits3 = output3 # Ensemble option 1: weighted sum of the per-model logits weights = [0.4, 0.2, 0.4] start_logits = [ weights[0] * log1 + weights[1] * log2 + weights[2] * log3 for log1, log2, log3 in zip(start_logits1, start_logits2, start_logits3) ] end_logits = [ weights[0] * log1 + weights[1] * log2 + weights[2] * log3 for log1, log2, log3 in zip(end_logits1, end_logits2, end_logits3) ] # # Ensemble option 2: arithmetic mean of the per-model logits # start_logits = [ # (log1 + log2 + log3) / 3 # for log1, log2, log3 in zip(start_logits1, start_logits2, start_logits3) # ] # end_logits = [ # (log1 + log2 + log3) / 3 # for log1, log2, log3 in zip(end_logits1, end_logits2, end_logits3) # ] # # Ensemble option 3: element-wise maximum of the per-model logits # start_logits = [ # max(log1, log2, log3) # for log1, log2, log3 in zip(start_logits1, start_logits2, start_logits3) # ] # end_logits = [ # max(log1, log2, log3) # for log1, log2, log3 in zip(end_logits1, end_logits2, end_logits3) # ] result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time logger.info( " Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset), ) # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) return results
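# The three ensembling strategies in evaluate() above differ only in how the per-token
# start/end logits of the three models are merged. A standalone toy illustration (values
# made up) of what each variant computes:
start_logits1 = [2.0, -1.0, 0.5]
start_logits2 = [1.0, 0.0, 0.2]
start_logits3 = [0.0, 3.0, 0.1]
weights = [0.4, 0.2, 0.4]

weighted = [weights[0] * a + weights[1] * b + weights[2] * c
            for a, b, c in zip(start_logits1, start_logits2, start_logits3)]
mean = [(a + b + c) / 3 for a, b, c in zip(start_logits1, start_logits2, start_logits3)]
maximum = [max(a, b, c) for a, b, c in zip(start_logits1, start_logits2, start_logits3)]

print(weighted)  # approximately [1.0, 0.8, 0.28]
print(mean)      # approximately [1.0, 0.67, 0.27]
print(maximum)   # [2.0, 3.0, 0.5]
# The weighted sum lets a stronger model dominate, the mean treats all three models equally,
# and the element-wise max is the most aggressive: a single overconfident model can decide
# the answer span on its own.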
def get_evaluation_metrics( model, tokenizer, data_dir: str, filename: str, per_gpu_batch_size: int = 32, num_batches: int = None, disable_tqdm: bool = False, ) -> Dict[str, "Number"]: """ Return an OrderedDict in the format: { 'exact': 0.8169797018445212, 'f1': 4.4469722448269335, 'total': 11873, 'HasAns_exact': 0.15182186234817813, 'HasAns_f1': 7.422216845956518, 'HasAns_total': 5928, 'NoAns_exact': 1.4802354920100924, 'NoAns_f1': 1.4802354920100924, 'NoAns_total': 5945, 'best_exact': 50.07159100480081, 'best_exact_thresh': 0.0, 'best_f1': 50.0772059855695, 'best_f1_thresh': 0.0 } """ # These are not used in inference, only for scoring in `compute_predictions_logits()`. processor = SquadV2Processor() examples: List[SquadExample] = processor.get_dev_examples( data_dir, filename=filename) features: List[SquadFeatures] = get_dataset( tokenizer=tokenizer, processor=processor, data_dir=data_dir, filename=filename, per_gpu_batch_size=per_gpu_batch_size, shard=False, shuffle=False, drop_remainder=False, return_raw_features=True, ) # Here we get the dataset instead of just the features, with return_raw_features=False. dataset: tf.data.Dataset = get_dataset( tokenizer=tokenizer, processor=processor, data_dir=data_dir, filename=filename, per_gpu_batch_size=per_gpu_batch_size, shard=False, shuffle=False, drop_remainder=False, return_raw_features=False, ) results: List[SquadResult] = get_squad_results( model=model, dataset=dataset, features=features, per_gpu_batch_size=per_gpu_batch_size, num_batches=num_batches, disable_tqdm=disable_tqdm, ) write_prediction_files = False if write_prediction_files: output_predictions_file = f"/fsx/{args.checkpoint}_predictions.json" output_nbest_file = f"/fsx/{args.checkpoint}_nbest_predictions.json" output_null_log_odds_file = f"/fsx/{args.checkpoint}_null_odds.json" else: output_predictions_file = None output_nbest_file = None output_null_log_odds_file = None predictions = compute_predictions_logits( all_examples=examples, all_features=features, all_results=results, n_best_size=20, max_answer_length=30, do_lower_case=True, output_prediction_file=output_predictions_file, output_nbest_file=output_nbest_file, output_null_log_odds_file=output_null_log_odds_file, verbose_logging=False, version_2_with_negative=True, null_score_diff_threshold=0.0, tokenizer=tokenizer, ) results: collections.OrderedDict = squad_evaluate(examples, predictions) return results
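# A hypothetical driver for get_evaluation_metrics(). The checkpoint name and data paths are
# placeholders; get_dataset() and get_squad_results() are the project helpers used inside the
# function, and the model is a TensorFlow QA head since the function consumes a tf.data.Dataset.
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = TFAutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

metrics = get_evaluation_metrics(
    model=model,
    tokenizer=tokenizer,
    data_dir="squad",           # directory holding the SQuAD v2 dev file
    filename="dev-v2.0.json",
    per_gpu_batch_size=32,
    num_batches=None,           # evaluate the full dev set
)
print(metrics["exact"], metrics["f1"])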
def evaluate(args, model, tokenizer, prefix="", calibration=False): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) dataset_cached = "./dataset_cached" if not os.path.exists(dataset_cached): os.makedirs(dataset_cached) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) calibration_iteration = int((len(dataset) * 0.05 + args.eval_batch_size - 1) / args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) print(" Batch size = %d" % args.eval_batch_size) if args.mkldnn_eval: from torch.utils import mkldnn as mkldnn_utils model = mkldnn_utils.to_mkldnn(model) print(model) all_results = [] evalTime = 0 nb_eval_steps = 0 perf = 0 # default when the warmup threshold is never reached for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) if calibration and nb_eval_steps >= calibration_iteration: break with torch.no_grad(): inputs = {'input_ids': batch[0], 'attention_mask': batch[1]} if args.model_type != 'distilbert': inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM doesn't use segment_ids example_indices = batch[3] # XLNet and XLM use more arguments for their predictions if args.model_type in ['xlnet', 'xlm']: inputs.update({'cls_index': batch[4], 'p_mask': batch[5]}) if nb_eval_steps >= args.warmup: start_time = timeit.default_timer() outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. 
if len(output) >= 5: start_logits = output[0] start_top_index = output[1] end_logits = output[2] end_top_index = output[3] cls_logits = output[4] result = SquadResult(unique_id, start_logits, end_logits, start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits) else: start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) if nb_eval_steps >= args.warmup: evalTime += (timeit.default_timer() - start_time) nb_eval_steps += 1 if args.iter > 0 and nb_eval_steps >= (args.warmup + args.iter): break if nb_eval_steps >= args.warmup: perf = (nb_eval_steps - args.warmup) * args.eval_batch_size / evalTime if args.eval_batch_size == 1: print('Latency: %.3f ms' % (evalTime / (nb_eval_steps - args.warmup) * 1000)) print("Evaluation done in total %f secs (Throughput: %f samples/sec)" % (evalTime, perf)) else: logger.info( "*****no performance, please check dataset length and warmup number *****" ) # Compute predictions output_prediction_file = os.path.join(dataset_cached, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join( dataset_cached, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join( dataset_cached, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None # XLNet and XLM use a more complex post-processing procedure if args.model_type in ['xlnet', 'xlm']: start_n_top = model.config.start_n_top if hasattr( model, "config") else model.module.config.start_n_top end_n_top = model.config.end_n_top if hasattr( model, "config") else model.module.config.end_n_top predictions = compute_predictions_log_probs( examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, start_n_top, end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging) elif not calibration and args.iter == 0: predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) # Compute the F1 and exact scores. if not calibration and args.iter == 0: results = squad_evaluate(examples, predictions) bert_task_acc_keys = ['best_f1', 'f1', 'mcc', 'spearmanr', 'acc'] for key in bert_task_acc_keys: if key in results.keys(): acc = results[key] break print("Accuracy: %.5f" % acc) else: results = None return results, perf
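# Standalone illustration of the throughput/latency bookkeeping in evaluate() above: only
# steps past args.warmup fall into the timed window, throughput is samples per second over
# that window, and per-sample latency is only reported at batch size 1. Names below are
# illustrative, not part of the original script.
def summarize_perf(nb_eval_steps, warmup, eval_batch_size, eval_time_secs):
    timed_steps = nb_eval_steps - warmup
    if timed_steps <= 0 or eval_time_secs <= 0:
        return None  # mirrors the "no performance" branch above
    throughput = timed_steps * eval_batch_size / eval_time_secs  # samples / sec
    latency_ms = eval_time_secs / timed_steps * 1000 if eval_batch_size == 1 else None
    return throughput, latency_ms

print(summarize_perf(nb_eval_steps=110, warmup=10, eval_batch_size=8, eval_time_secs=25.0))
# (32.0, None): 100 timed steps * 8 samples / 25 s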
def evaluate(args, model, tokenizer, prefix="", global_step=None): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # Eval! logger.info(f"***** Running evaluation {prefix} *****") logger.info(f" Num examples = {len(dataset)}") logger.info(f" Batch size = {args.eval_batch_size}") all_results = [] start_time = timeit.default_timer() for batch in tqdm(eval_dataloader, desc="Eval"): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], } example_indices = batch[3] outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time logger.info( f" Evaluation done in total {evalTime} secs ({evalTime / len(dataset)} sec per example)" ) # Compute predictions output_prediction_file = os.path.join(args.output_dir, f"predictions_{prefix}.json") output_nbest_file = os.path.join(args.output_dir, f"nbest_predictions_{prefix}.json") if args.version_2_with_negative: output_null_log_odds_file = os.path.join(args.output_dir, f"null_odds_{prefix}.json") else: output_null_log_odds_file = None predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, False, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) # Write the result # Write the evaluation result on file output_dir = os.path.join(args.output_dir, "eval") if not os.path.exists(output_dir): os.makedirs(output_dir) output_eval_file = os.path.join(output_dir, f"eval_result_{global_step}.txt") logger.info("***** Official Eval results *****") with open(output_eval_file, "w", encoding="utf-8") as f: official_eval_results = eval_during_train(args) for key in sorted(official_eval_results.keys()): logger.info(f" {key} = {official_eval_results[key]}") f.write(f" {key} = {official_eval_results[key]}\n") return results
def evaluate(args, model, tokenizer, prefix="", adapter_names=None): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) all_results = [] start_time = timeit.default_timer() for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "adapter_names": adapter_names, } if args.model_type in [ "xlm", "roberta", "distilbert", "camembert" ]: del inputs["token_type_ids"] feature_indices = batch[3] # XLNet and XLM use more arguments for their predictions if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) # for lang_id-sensitive xlm models if hasattr(model, "config") and hasattr( model.config, "lang2id"): inputs.update({ "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device) }) outputs = model(**inputs) for i, feature_index in enumerate(feature_indices): # TODO: i and feature_index are the same number! Simplify by removing enumerate? eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. 
if len(output) >= 5: start_logits = output[0] start_top_index = output[1] end_logits = output[2] end_top_index = output[3] cls_logits = output[4] result = SquadResult( unique_id, start_logits, end_logits, start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits, ) else: start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join( args.output_dir, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join( args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None # XLNet and XLM use a more complex post-processing procedure if args.model_type in ["xlnet", "xlm"]: start_n_top = model.config.start_n_top if hasattr( model, "config") else model.module.config.start_n_top end_n_top = model.config.end_n_top if hasattr( model, "config") else model.module.config.end_n_top predictions = compute_predictions_log_probs( examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, start_n_top, end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging, ) else: predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) return results
def test_epoch_end(self, outputs): example_indices = torch.cat([x["example_indices"] for x in outputs ]).detach().cpu().tolist() start_logits = torch.cat([x["start_logits"] for x in outputs]).detach().cpu().tolist() end_logits = torch.cat([x["end_logits"] for x in outputs]).detach().cpu().tolist() if "cls_logits" in list(outputs[0].keys()): start_top_index = torch.cat([ x["start_top_index"] for x in outputs ]).detach().cpu().tolist() end_top_index = torch.cat([x["end_top_index"] for x in outputs ]).detach().cpu().tolist() cls_logits = torch.cat([x["cls_logits"] for x in outputs]).detach().cpu().tolist() examples = self.trainer.datamodule.test_examples features = self.trainer.datamodule.test_features all_results = [] for i, example_index in enumerate(example_indices): eval_feature = features[example_index] unique_id = int(eval_feature.unique_id) # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. from transformers.data.processors.squad import SquadResult if "cls_logits" in list(outputs[0].keys()): result = SquadResult( unique_id, start_logits[i], end_logits[i], start_top_index=start_top_index[i], end_top_index=end_top_index[i], cls_logits=cls_logits[i], ) else: result = SquadResult(unique_id, start_logits[i], end_logits[i]) all_results.append(result) # Compute predictions output_prediction_file = os.path.join( self.trainer.checkpoint_callback.dirpath, "predictions_eval.json") output_nbest_file = os.path.join( self.trainer.checkpoint_callback.dirpath, "nbest_predictions_eval.json") if self.version_2_with_negative: output_null_log_odds_file = os.path.join( self.trainer.checkpoint_callback.dirpath, "null_odds_eval.json") else: output_null_log_odds_file = None # XLNet and XLM use a more complex post-processing procedure if self.hparams.model_type in ["xlnet", "xlm"]: start_n_top = self.model.config.start_n_top if hasattr( self.model, "config") else self.model.module.config.start_n_top end_n_top = self.model.config.end_n_top if hasattr( self.model, "config") else self.model.module.config.end_n_top from transformers.data.metrics.squad_metrics import compute_predictions_log_probs predictions = compute_predictions_log_probs( examples, features, all_results, self.hparams.n_best_size, self.hparams.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, start_n_top, end_n_top, self.version_2_with_negative, self.trainer.datamodule.tokenizer, False # Not want to do verbose logging ) else: from transformers.data.metrics.squad_metrics import compute_predictions_logits predictions = compute_predictions_logits( examples, features, all_results, self.hparams.n_best_size, self.hparams.max_answer_length, self.hparams.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, False, # Not want to do verbose logging self.version_2_with_negative, self.hparams.null_score_diff_threshold, self.trainer.datamodule.tokenizer) # Compute the F1 and exact scores. from transformers.data.metrics.squad_metrics import squad_evaluate results = squad_evaluate(examples, predictions) return results
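# test_epoch_end() above expects every test_step() to return a dict with "example_indices",
# "start_logits" and "end_logits" (plus "start_top_index"/"end_top_index"/"cls_logits" for
# XLNet/XLM-style heads). A minimal compatible test_step() sketch follows; the batch layout
# with the feature index at position 3 mirrors the other evaluation loops in this file and
# is an assumption, not taken from the original module.
def test_step(self, batch, batch_idx):
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "token_type_ids": batch[2],
    }
    outputs = self.model(**inputs)
    start_logits, end_logits = outputs[0], outputs[1]
    return {
        "example_indices": batch[3],
        "start_logits": start_logits,
        "end_logits": end_logits,
    }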