import math

import pytest

from farm.evaluation.metrics import compute_metrics


def test_compute_metrics_basic():
    # mismatched lengths of preds and labels must raise; we only check for
    # some Exception, since it may not always be the AssertionError we get now
    with pytest.raises(Exception):
        compute_metrics("acc", ["x"] * 10, [""] * 11)

    # empty inputs yield NaN rather than an error
    ret = compute_metrics("acc", [], [])
    assert isinstance(ret, dict)
    assert "acc" in ret
    assert math.isnan(ret["acc"])

    # unknown metric names must raise
    with pytest.raises(Exception):
        compute_metrics("asdfasdf", ["a"], ["b"])

    ls = ["a"] * 5
    ls.extend(["b"] * 5)
    ps = ["a"] * 10

    # acc and f1_macro are symmetric in preds and labels for this data
    ret = compute_metrics("acc", ps, ls)
    assert ret["acc"] == 0.5
    ret = compute_metrics("acc", ls, ps)
    assert ret["acc"] == 0.5
    ret = compute_metrics("f1_macro", ps, ls)
    assert ret["f1_macro"] == 1 / 3
    ret = compute_metrics("f1_macro", ls, ps)
    assert ret["f1_macro"] == 1 / 3

    # a list of metric names yields one entry per metric
    ret = compute_metrics(["f1_macro", "acc"], ps, ls)
    assert isinstance(ret, dict)
    assert len(ret) == 2
    assert "acc" in ret
    assert "f1_macro" in ret
    assert ret["f1_macro"] == 1 / 3
    assert ret["acc"] == 0.5

    # duplicate metric names are collapsed
    ret = compute_metrics(["f1_macro", "acc", "acc"], ps, ls)
    assert isinstance(ret, dict)
    assert len(ret) == 2
    assert "acc" in ret
    assert "f1_macro" in ret
    assert ret["f1_macro"] == 1 / 3
    assert ret["acc"] == 0.5

    # nested lists of metric names are flattened
    ret = compute_metrics(["f1_macro", ["acc"]], ps, ls)
    assert isinstance(ret, dict)
    assert len(ret) == 2
    assert "acc" in ret
    assert "f1_macro" in ret
    assert ret["f1_macro"] == 1 / 3
    assert ret["acc"] == 0.5
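
# --- Illustrative sketch (not the library's implementation) -------------------
# A minimal compute_metrics consistent with what the test above asserts: it
# accepts a single metric name or a (possibly nested) list of names, collapses
# duplicates, returns a dict keyed by metric name, yields NaN on empty inputs,
# and raises for length mismatches or unknown metric names. The sklearn-backed
# "registered" metrics below are assumptions for illustration only.
from sklearn.metrics import f1_score


def _sketch_compute_metrics(metric, preds, labels):
    # mismatched lengths must raise; the test only demands *some* Exception
    assert len(preds) == len(labels)
    registered = {
        "acc": lambda p, g: float("nan") if not g
        else sum(int(x == y) for x, y in zip(p, g)) / len(g),
        "f1_macro": lambda p, g: f1_score(g, p, average="macro"),
    }
    # accept a single name or a (possibly nested) list of names
    names = [metric] if isinstance(metric, str) else list(metric)
    flat = []
    for name in names:
        flat.extend([name] if isinstance(name, str) else name)
    # unknown names raise KeyError; duplicate names collapse via the dict keys
    return {name: registered[name](preds, labels) for name in flat}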
def eval(self, model, return_preds_and_labels=False, calibrate_conf_scores=False):
    """
    Performs evaluation on a given model.

    :param model: The model on which to perform evaluation
    :type model: AdaptiveModel
    :param return_preds_and_labels: Whether to add preds and labels to the returned dicts
    :type return_preds_and_labels: bool
    :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores
    :type calibrate_conf_scores: bool
    :return all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
                         and reports generated during evaluation.
    :rtype all_results: list of dicts
    """
    model.eval()

    # init empty lists per prediction head
    loss_all = [0 for _ in model.prediction_heads]
    preds_all = [[] for _ in model.prediction_heads]
    label_all = [[] for _ in model.prediction_heads]
    ids_all = [[] for _ in model.prediction_heads]
    passage_start_t_all = [[] for _ in model.prediction_heads]
    logits_all = [[] for _ in model.prediction_heads]

    for step, batch in enumerate(tqdm(self.data_loader, desc="Evaluating", mininterval=10)):
        batch = {key: batch[key].to(self.device) for key in batch}

        with torch.no_grad():
            logits = model.forward(**batch)
            losses_per_head = model.logits_to_loss_per_head(logits=logits, **batch)
            preds = model.logits_to_preds(logits=logits, **batch)
            labels = model.prepare_labels(**batch)

        # stack results of all batches per prediction head
        for head_num, head in enumerate(model.prediction_heads):
            loss_all[head_num] += np.sum(to_numpy(losses_per_head[head_num]))
            preds_all[head_num] += list(to_numpy(preds[head_num]))
            label_all[head_num] += list(to_numpy(labels[head_num]))
            if head.model_type == "span_classification":
                ids_all[head_num] += list(to_numpy(batch["id"]))
                passage_start_t_all[head_num] += list(to_numpy(batch["passage_start_t"]))
                if calibrate_conf_scores:
                    logits_all[head_num] += list(to_numpy(logits))

    # Evaluate per prediction head
    all_results = []
    for head_num, head in enumerate(model.prediction_heads):
        if head.model_type == "multilabel_text_classification":
            # converting from string preds back to multi-hot encoding
            from sklearn.preprocessing import MultiLabelBinarizer
            mlb = MultiLabelBinarizer(classes=head.label_list)
            # TODO check why .fit() should be called on predictions, rather than on labels
            preds_all[head_num] = mlb.fit_transform(preds_all[head_num])
            label_all[head_num] = mlb.transform(label_all[head_num])

        if head.model_type == "span_classification" and calibrate_conf_scores:
            temperature_previous = head.temperature_for_confidence.item()
            logger.info(f"temperature used for confidence scores before calibration: {temperature_previous}")
            head.calibrate_conf(logits_all[head_num], label_all[head_num])
            temperature_current = head.temperature_for_confidence.item()
            logger.info(f"temperature used for confidence scores after calibration: {temperature_current}")
            temperature_change = (abs(temperature_current - temperature_previous) / temperature_previous) * 100.0
            if temperature_change > 50:
                logger.warning(
                    f"temperature used for calibration of confidence scores changed by {temperature_change} percent"
                )

        if hasattr(head, 'aggregate_preds'):
            # Needed to convert NQ ids from np arrays to strings
            ids_all_str = [x.astype(str) for x in ids_all[head_num]]
            ids_all_list = [list(x) for x in ids_all_str]
            head_ids = ["-".join(x) for x in ids_all_list]
            preds_all[head_num], label_all[head_num] = head.aggregate_preds(
                preds=preds_all[head_num],
                labels=label_all[head_num],
                passage_start_t=passage_start_t_all[head_num],
                ids=head_ids,
            )

        result = {"loss": loss_all[head_num] / len(self.data_loader.dataset),
                  "task_name": head.task_name}
        result.update(compute_metrics(metric=head.metric,
                                      preds=preds_all[head_num],
                                      labels=label_all[head_num]))

        # Select type of report depending on prediction head output type
        if self.report:
            try:
                result["report"] = compute_report_metrics(head, preds_all[head_num], label_all[head_num])
            except Exception:
                logger.error(
                    f"Couldn't create eval report for head {head_num} with following preds and labels:"
                    f"\n Preds: {preds_all[head_num]} \n Labels: {label_all[head_num]}"
                )
                result["report"] = "Error"

        if return_preds_and_labels:
            result["preds"] = preds_all[head_num]
            result["labels"] = label_all[head_num]

        all_results.append(result)

    return all_results
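
# --- Illustrative sketch (assumption, not FARM's calibrate_conf) --------------
# Temperature scaling as used conceptually by head.calibrate_conf above: a
# single scalar temperature is fit on held-out logits/labels by minimizing the
# NLL, and logits are later divided by it before the softmax. The helper name
# and its LBFGS settings are hypothetical.
import torch
from torch import nn


def _sketch_fit_temperature(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    temperature = nn.Parameter(torch.ones(1))
    nll = nn.CrossEntropyLoss()
    optimizer = torch.optim.LBFGS([temperature], lr=0.01, max_iter=50)

    def closure():
        # NLL of the temperature-scaled logits; gradients flow only into temperature
        optimizer.zero_grad()
        loss = nll(logits / temperature, labels)
        loss.backward()
        return loss

    optimizer.step(closure)
    return temperature.detach()


# Usage sketch: calibrated confidence scores are softmax(logits / temperature).
# t = _sketch_fit_temperature(torch.randn(32, 3), torch.randint(0, 3, (32,)))
# calibrated = torch.softmax(torch.randn(8, 3) / t, dim=-1)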
def eval(self, model, return_preds_and_labels=False):
    """
    Performs evaluation on a given model.

    :param model: The model on which to perform evaluation
    :type model: AdaptiveModel
    :param return_preds_and_labels: Whether to add preds and labels to the returned dicts
    :type return_preds_and_labels: bool
    :return all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
                         and reports generated during evaluation.
    :rtype all_results: list of dicts
    """
    model.eval()

    # init empty lists per prediction head
    loss_all = [0 for _ in model.prediction_heads]
    preds_all = [[] for _ in model.prediction_heads]
    label_all = [[] for _ in model.prediction_heads]
    ids_all = [[] for _ in model.prediction_heads]
    passage_start_t_all = [[] for _ in model.prediction_heads]

    for step, batch in enumerate(tqdm(self.data_loader, desc="Evaluating", mininterval=10)):
        batch = {key: batch[key].to(self.device) for key in batch}

        with torch.no_grad():
            logits = model.forward(**batch)
            losses_per_head = model.logits_to_loss_per_head(logits=logits, **batch)
            preds = model.logits_to_preds(logits=logits, **batch)
            labels = model.prepare_labels(**batch)

        # stack results of all batches per prediction head
        for head_num, head in enumerate(model.prediction_heads):
            loss_all[head_num] += np.sum(to_numpy(losses_per_head[head_num]))
            preds_all[head_num] += list(to_numpy(preds[head_num]))
            label_all[head_num] += list(to_numpy(labels[head_num]))
            if head.model_type == "span_classification":
                ids_all[head_num] += list(to_numpy(batch["id"]))
                passage_start_t_all[head_num] += list(to_numpy(batch["passage_start_t"]))

    # Evaluate per prediction head
    all_results = []
    for head_num, head in enumerate(model.prediction_heads):
        if head.model_type == "multilabel_text_classification":
            # converting from string preds back to multi-hot encoding
            from sklearn.preprocessing import MultiLabelBinarizer
            mlb = MultiLabelBinarizer(classes=head.label_list)
            # TODO check why .fit() should be called on predictions, rather than on labels
            preds_all[head_num] = mlb.fit_transform(preds_all[head_num])
            label_all[head_num] = mlb.transform(label_all[head_num])

        if hasattr(head, 'aggregate_preds'):
            preds_all[head_num], label_all[head_num] = head.aggregate_preds(
                preds=preds_all[head_num],
                labels=label_all[head_num],
                passage_start_t=passage_start_t_all[head_num],
                ids=ids_all[head_num],
            )

        result = {"loss": loss_all[head_num] / len(self.data_loader.dataset),
                  "task_name": head.task_name}
        result.update(compute_metrics(metric=head.metric,
                                      preds=preds_all[head_num],
                                      labels=label_all[head_num]))

        # Select type of report depending on prediction head output type
        if self.report:
            if head.ph_output_type == "per_token":
                report_fn = token_classification_report
            elif head.ph_output_type == "per_sequence":
                report_fn = classification_report
            elif head.ph_output_type == "per_token_squad":
                report_fn = lambda *args, **kwargs: "not Implemented"
            elif head.ph_output_type == "per_sequence_continuous":
                report_fn = r2_score
            else:
                raise NotImplementedError

            # not all report_fn accept the same parameters (e.g. digits)
            if head.ph_output_type in ["per_sequence_continuous", "per_token"]:
                result["report"] = report_fn(label_all[head_num], preds_all[head_num])
            else:
                # supply all possible labels, because if the ground truth labels do not cover
                # all values in label_list (maybe the dev set is small), the report will break
                if head.model_type == "multilabel_text_classification":
                    # For multilabel classification, we don't eval with string labels here, but with multi-hot vectors.
                    # Therefore we need to supply all possible label ids instead of label values.
                    all_possible_labels = list(range(len(head.label_list)))
                else:
                    all_possible_labels = head.label_list
                result["report"] = report_fn(label_all[head_num],
                                             preds_all[head_num],
                                             digits=4,
                                             labels=all_possible_labels,
                                             target_names=head.label_list)

        if return_preds_and_labels:
            result["preds"] = preds_all[head_num]
            result["labels"] = label_all[head_num]

        all_results.append(result)

    return all_results
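
# --- Illustrative sketch (assumption, not the library's code) -----------------
# The newer eval() above delegates report building to compute_report_metrics();
# this sketch shows how the inline if/elif dispatch of this older eval() could
# be factored into such a helper. The seqeval import and the helper signature
# are assumptions.
from sklearn.metrics import classification_report, r2_score
from seqeval.metrics import classification_report as token_classification_report


def _sketch_compute_report_metrics(head, preds, labels):
    dispatch = {
        "per_token": token_classification_report,
        "per_sequence": classification_report,
        "per_token_squad": lambda *args, **kwargs: "not Implemented",
        "per_sequence_continuous": r2_score,
    }
    try:
        report_fn = dispatch[head.ph_output_type]
    except KeyError:
        raise NotImplementedError(head.ph_output_type)
    # r2_score and the token-level report are called without digits/labels/target_names
    if head.ph_output_type in ("per_sequence_continuous", "per_token"):
        return report_fn(labels, preds)
    # supply every possible label so a small dev set cannot break the report
    if head.model_type == "multilabel_text_classification":
        all_possible_labels = list(range(len(head.label_list)))
    else:
        all_possible_labels = head.label_list
    return report_fn(labels, preds, digits=4,
                     labels=all_possible_labels, target_names=head.label_list)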