# Shared imports for the evaluation variants below. `f1_score` is assumed to
# come from scikit-learn; `TransformerModelWrapper`, `InputExample`,
# `EvalConfig` and the metric helpers are provided by the surrounding codebase.
from typing import Dict, List

import numpy as np
import torch
from sklearn.metrics import f1_score


def eval_dev(self, dev_data, eval_config, n_gpu):
    """Evaluate the wrapped model on the dev set and return a dict of metric scores."""
    self.model.eval()
    results = self.eval(dev_data,
                        per_gpu_eval_batch_size=eval_config.per_gpu_eval_batch_size,
                        n_gpu=n_gpu)
    predictions = np.argmax(results['logits'], axis=1)

    scores = {}
    metrics = eval_config.metrics if eval_config.metrics else ['acc']
    for metric in metrics:
        if metric == 'acc':
            scores[metric] = simple_accuracy(predictions, results['labels'])
        elif metric == 'f1':
            scores[metric] = f1_score(results['labels'], predictions)
        elif metric == 'f1-macro':
            scores[metric] = f1_score(results['labels'], predictions, average='macro')
        elif metric == 'em':
            scores[metric] = exact_match(predictions, results['labels'], results['question_ids'])
        else:
            raise ValueError(f"Metric '{metric}' not implemented")
    return scores

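# All variants in this section rely on metric helpers that are not defined
# here. A minimal sketch of plausible implementations follows; the exact
# originals may differ. In particular, `exact_match` is assumed to group
# candidate predictions by `question_ids` and count a question as correct
# only if every one of its candidates is correct (as in tasks like MultiRC).
from collections import defaultdict


def simple_accuracy(preds, labels):
    # Fraction of predictions that match the gold labels.
    return (np.asarray(preds) == np.asarray(labels)).mean()


def exact_match(preds, labels, question_ids):
    # A question counts as correct only if *all* of its candidate
    # predictions match the corresponding labels.
    per_question = defaultdict(list)
    for pred, label, qid in zip(preds, labels, question_ids):
        per_question[qid].append(pred == label)
    return np.mean([all(v) for v in per_question.values()])
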
def evaluate(model: TransformerModelWrapper, eval_data: List[InputExample],
             config: EvalConfig) -> Dict:
    """Evaluate a model and return a dict with its logits, predictions and scores."""
    metrics = config.metrics if config.metrics else ['acc']
    results = model.eval(eval_data=eval_data,
                         per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
                         n_gpu=config.n_gpu)
    predictions = np.argmax(results['logits'], axis=1)

    scores = {}
    for metric in metrics:
        if metric == 'acc':
            scores[metric] = simple_accuracy(predictions, results['labels'])
        elif metric == 'f1':
            scores[metric] = f1_score(results['labels'], predictions)
        elif metric == 'f1-macro':
            scores[metric] = f1_score(results['labels'], predictions, average='macro')
        elif metric == 'em':
            scores[metric] = exact_match(predictions, results['labels'], results['question_ids'])
        else:
            raise ValueError(f"Metric '{metric}' not implemented")

    results['scores'] = scores
    results['predictions'] = predictions
    return results

def evaluate(model: TransformerModelWrapper, eval_data: List[InputExample],
             config: EvalConfig, priming_data: List[InputExample] = None,
             local_rank=-1) -> Dict:
    """
    Evaluate a model.

    :param model: the model to evaluate
    :param eval_data: the examples for evaluation
    :param config: the evaluation config
    :param priming_data: an optional list of priming data to use
    :return: a dictionary containing the model's logits, predictions and
             (if any metrics are given) scores
    """
    if config.priming:
        for example in eval_data:
            example.meta["priming_data"] = priming_data

    metrics = config.metrics if config.metrics else ["acc"]
    device = torch.device(config.device if config.device
                          else "cuda" if torch.cuda.is_available() else "cpu")
    model.model.to(device)

    results = model.eval(eval_data, device,
                         per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
                         n_gpu=config.n_gpu,
                         decoding_strategy=config.decoding_strategy,
                         priming=config.priming,
                         local_rank=local_rank)
    predictions = np.argmax(results["logits"], axis=1)

    scores = {}
    for metric in metrics:
        if metric == "acc":
            scores[metric] = simple_accuracy(predictions, results["labels"])
        elif metric == "f1":
            scores[metric] = f1_score(results["labels"], predictions)
        elif metric == "f1-macro":
            scores[metric] = f1_score(results["labels"], predictions, average="macro")
        elif metric == "em":
            scores[metric] = exact_match(predictions, results["labels"], results["question_ids"])
        else:
            raise ValueError(f"Metric '{metric}' not implemented")

    results["scores"] = scores
    results["predictions"] = predictions
    return results

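# A hedged usage sketch for the `evaluate` variant above. The `EvalConfig`
# constructor shown is hypothetical: only the fields the function reads
# (metrics, device, n_gpu, per_gpu_eval_batch_size, decoding_strategy,
# priming) are assumed, and `wrapper` and `dev_examples` are assumed to be
# built elsewhere in the codebase.
config = EvalConfig(device="cuda", n_gpu=1, per_gpu_eval_batch_size=8,
                    metrics=["acc", "f1-macro"], decoding_strategy="default",
                    priming=False)
results = evaluate(wrapper, dev_examples, config)
print(results["scores"])           # e.g. {'acc': ..., 'f1-macro': ...}
print(results["predictions"][:10])
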
def in_training_eval(self, eval_kwargs):
    """Run a single in-training evaluation pass and return one scalar score.

    If several metrics are configured, only the first matching one
    (f1, f1-macro, em) is used; otherwise accuracy is the fallback.
    """
    eval_results = self.eval(**eval_kwargs)
    predictions = np.argmax(eval_results["logits"], axis=1)
    if eval_kwargs["metrics"]:
        if "f1" in eval_kwargs["metrics"]:
            score = f1_score(eval_results["labels"], predictions)
        elif "f1-macro" in eval_kwargs["metrics"]:
            score = f1_score(eval_results["labels"], predictions, average="macro")
        elif "em" in eval_kwargs["metrics"]:
            score = exact_match(predictions, eval_results["labels"], eval_results["question_ids"])
        else:
            score = simple_accuracy(predictions, eval_results["labels"])
    else:
        score = simple_accuracy(predictions, eval_results["labels"])
    return score

def evaluate(model: TransformerModelWrapper, eval_data: List[InputExample],
             config: EvalConfig, priming_data: List[InputExample] = None) -> Dict:
    """
    Evaluate a model.

    :param model: the model to evaluate
    :param eval_data: the examples for evaluation
    :param config: the evaluation config
    :param priming_data: an optional list of priming data to use
    :return: a dictionary containing the model's logits, predictions and
             (if any metrics are given) scores
    """
    if config.priming:
        for example in eval_data:
            example.meta['priming_data'] = priming_data

    metrics = config.metrics if config.metrics else ['acc']
    device = torch.device(config.device if config.device
                          else "cuda" if torch.cuda.is_available() else "cpu")
    model.model.to(device)

    results = model.eval(eval_data, device,
                         per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
                         n_gpu=config.n_gpu,
                         decoding_strategy=config.decoding_strategy,
                         priming=config.priming)
    predictions = np.argmax(results['logits'], axis=1)

    scores = {}
    for metric in metrics:
        if metric == 'acc':
            scores[metric] = simple_accuracy(predictions, results['labels'])
        elif metric == 'f1':
            scores[metric] = f1_score(results['labels'], predictions)
        elif metric == 'f1-macro':
            scores[metric] = f1_score(results['labels'], predictions, average='macro')
        elif metric == 'em':
            scores[metric] = exact_match(predictions, results['labels'], results['question_ids'])
        elif metric == 'dist-loss':
            # Distillation loss against per-example teacher logits, if available.
            if eval_data[0].logits is not None:
                scores[metric] = distillation_loss(
                    torch.tensor(results['logits']),
                    torch.stack([torch.tensor(ex.logits, dtype=torch.float32)
                                 for ex in eval_data]),
                    config.temperature)
            else:
                scores[metric] = 0.
        else:
            raise ValueError(f"Metric '{metric}' not implemented")

    results['scores'] = scores
    results['predictions'] = predictions
    return results
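
# The 'dist-loss' branch above calls a `distillation_loss` helper that is not
# defined in this section. Below is a minimal sketch using the standard
# temperature-scaled KL-divergence formulation; the actual helper may differ
# in its reduction or scaling.
import torch.nn.functional as F


def distillation_loss(predictions, targets, temperature):
    # Soften both distributions with the temperature, then take the KL
    # divergence between student and teacher; the temperature**2 factor
    # keeps magnitudes comparable across temperatures (Hinton et al., 2015).
    p = F.log_softmax(predictions / temperature, dim=1)
    q = F.softmax(targets / temperature, dim=1)
    return F.kl_div(p, q, reduction='sum') * (temperature ** 2) / predictions.shape[0]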