Example #1
import math

import pytest

# NOTE: the import path below is an assumption; point it at wherever
# compute_metrics lives in your code base (e.g. farm.evaluation.metrics in FARM).
from farm.evaluation.metrics import compute_metrics


def test_compute_metrics_basic():
    # mismatched preds/labels lengths should raise some exception
    # (currently an AssertionError, but the exact type may change)
    with pytest.raises(Exception):
        compute_metrics("acc", ["x"] * 10, [""] * 11)
    ret = compute_metrics("acc", [], [])
    assert isinstance(ret, dict)
    assert "acc" in ret
    assert math.isnan(ret["acc"])
    with pytest.raises(Exception):
        compute_metrics("asdfasdf", ["a"], ["b"])
    # labels: half "a", half "b"; predictions: all "a"
    # -> acc = 0.5 and f1_macro = mean(2/3, 0) = 1/3
    ls = ["a"] * 5 + ["b"] * 5
    ps = ["a"] * 10
    ret = compute_metrics("acc", ps, ls)
    assert ret["acc"] == 0.5
    ret = compute_metrics("acc", ls, ps)
    assert ret["acc"] == 0.5
    ret = compute_metrics("f1_macro", ps, ls)
    assert ret["f1_macro"] == 1 / 3
    ret = compute_metrics("f1_macro", ls, ps)
    assert ret["f1_macro"] == 1 / 3
    ret = compute_metrics(["f1_macro", "acc"], ps, ls)
    assert isinstance(ret, dict)
    assert len(ret) == 2
    assert "acc" in ret
    assert "f1_macro" in ret
    assert ret["f1_macro"] == 1 / 3
    assert ret["acc"] == 0.5
    ret = compute_metrics(["f1_macro", "acc", "acc"], ps, ls)
    assert isinstance(ret, dict)
    assert len(ret) == 2
    assert "acc" in ret
    assert "f1_macro" in ret
    assert ret["f1_macro"] == 1 / 3
    assert ret["acc"] == 0.5
    ret = compute_metrics(["f1_macro", ["acc"]], ps, ls)
    assert isinstance(ret, dict)
    assert len(ret) == 2
    assert "acc" in ret
    assert "f1_macro" in ret
    assert ret["f1_macro"] == 1 / 3
    assert ret["acc"] == 0.5
Example #2
    def eval(self,
             model,
             return_preds_and_labels=False,
             calibrate_conf_scores=False):
        """
        Performs evaluation on a given model.

        :param model: The model on which to perform evaluation
        :type model: AdaptiveModel
        :param return_preds_and_labels: Whether to add preds and labels to the returned dicts
        :type return_preds_and_labels: bool
        :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores
        :type calibrate_conf_scores: bool
        :return all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
                             and reports generated during evaluation.
        :rtype all_results: list of dicts
        """
        model.eval()

        # init empty lists per prediction head
        loss_all = [0 for _ in model.prediction_heads]
        preds_all = [[] for _ in model.prediction_heads]
        label_all = [[] for _ in model.prediction_heads]
        ids_all = [[] for _ in model.prediction_heads]
        passage_start_t_all = [[] for _ in model.prediction_heads]
        logits_all = [[] for _ in model.prediction_heads]

        for step, batch in enumerate(
                tqdm(self.data_loader, desc="Evaluating", mininterval=10)):
            batch = {key: batch[key].to(self.device) for key in batch}

            with torch.no_grad():

                logits = model.forward(**batch)
                losses_per_head = model.logits_to_loss_per_head(logits=logits,
                                                                **batch)
                preds = model.logits_to_preds(logits=logits, **batch)
                labels = model.prepare_labels(**batch)

            # stack results of all batches per prediction head
            for head_num, head in enumerate(model.prediction_heads):
                loss_all[head_num] += np.sum(
                    to_numpy(losses_per_head[head_num]))
                preds_all[head_num] += list(to_numpy(preds[head_num]))
                label_all[head_num] += list(to_numpy(labels[head_num]))
                if head.model_type == "span_classification":
                    ids_all[head_num] += list(to_numpy(batch["id"]))
                    passage_start_t_all[head_num] += list(
                        to_numpy(batch["passage_start_t"]))
                    if calibrate_conf_scores:
                        logits_all[head_num] += list(to_numpy(logits))

        # Evaluate per prediction head
        all_results = []
        for head_num, head in enumerate(model.prediction_heads):
            if head.model_type == "multilabel_text_classification":
                # converting from string preds back to multi-hot encoding
                from sklearn.preprocessing import MultiLabelBinarizer
                mlb = MultiLabelBinarizer(classes=head.label_list)
                # TODO check why .fit() should be called on predictions, rather than on labels
                preds_all[head_num] = mlb.fit_transform(preds_all[head_num])
                label_all[head_num] = mlb.transform(label_all[head_num])
            if head.model_type == "span_classification" and calibrate_conf_scores:
                temperature_previous = head.temperature_for_confidence.item()
                logger.info(
                    f"temperature used for confidence scores before calibration: {temperature_previous}"
                )
                head.calibrate_conf(logits_all[head_num], label_all[head_num])
                temperature_current = head.temperature_for_confidence.item()
                logger.info(
                    f"temperature used for confidence scores after calibration: {temperature_current}"
                )
                temperature_change = (
                    abs(temperature_current - temperature_previous) /
                    temperature_previous) * 100.0
                if temperature_change > 50:
                    logger.warning(
                        f"temperature used for confidence scores changed by {temperature_change} percent, i.e. more than 50 percent"
                    )
            if hasattr(head, 'aggregate_preds'):
                # Needed to convert NQ ids from np arrays to strings
                ids_all_str = [x.astype(str) for x in ids_all[head_num]]
                ids_all_list = [list(x) for x in ids_all_str]
                head_ids = ["-".join(x) for x in ids_all_list]
                preds_all[head_num], label_all[head_num] = head.aggregate_preds(
                    preds=preds_all[head_num],
                    labels=label_all[head_num],
                    passage_start_t=passage_start_t_all[head_num],
                    ids=head_ids)

            result = {
                "loss": loss_all[head_num] / len(self.data_loader.dataset),
                "task_name": head.task_name
            }
            result.update(
                compute_metrics(metric=head.metric,
                                preds=preds_all[head_num],
                                labels=label_all[head_num]))

            # Build the evaluation report for this head (if reporting is enabled)
            if self.report:
                try:
                    result["report"] = compute_report_metrics(
                        head, preds_all[head_num], label_all[head_num])
                except Exception:
                    logger.error(
                        f"Couldn't create eval report for head {head_num} with following preds and labels:"
                        f"\n Preds: {preds_all[head_num]} \n Labels: {label_all[head_num]}"
                    )
                    result["report"] = "Error"

            if return_preds_and_labels:
                result["preds"] = preds_all[head_num]
                result["labels"] = label_all[head_num]

            all_results.append(result)

        return all_results
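
The calibrate_conf_scores branch above relies on temperature scaling: head.calibrate_conf tunes head.temperature_for_confidence so that the softmax of the scaled logits (logits / T) gives better-calibrated confidence scores, while the argmax predictions stay unchanged. The snippet below is a self-contained sketch of that idea in PyTorch; it is not FARM's calibrate_conf implementation, and the class and method names are made up for illustration.

import torch
from torch import nn, optim


class TemperatureScaler(nn.Module):
    def __init__(self):
        super().__init__()
        # A single scalar temperature, initialised to 1.0 (i.e. no rescaling).
        self.temperature = nn.Parameter(torch.ones(1))

    def forward(self, logits):
        # Softmax of the scaled logits gives the calibrated confidence scores.
        return logits / self.temperature

    def fit(self, logits, labels, lr=0.01, max_iter=50):
        # Fit the temperature by minimising the negative log-likelihood on
        # held-out logits (shape: n_samples x n_classes) and integer labels.
        nll = nn.CrossEntropyLoss()
        optimizer = optim.LBFGS([self.temperature], lr=lr, max_iter=max_iter)

        def closure():
            optimizer.zero_grad()
            loss = nll(self.forward(logits), labels)
            loss.backward()
            return loss

        optimizer.step(closure)
        return self.temperature.item()
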
Example #3
    def eval(self, model, return_preds_and_labels=False):
        """
        Performs evaluation on a given model.

        :param model: The model on which to perform evaluation
        :type model: AdaptiveModel
        :param return_preds_and_labels: Whether to add preds and labels to the returned dicts
        :type return_preds_and_labels: bool
        :return all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
                             and reports generated during evaluation.
        :rtype all_results: list of dicts
        """
        model.eval()

        # init empty lists per prediction head
        loss_all = [0 for _ in model.prediction_heads]
        preds_all = [[] for _ in model.prediction_heads]
        label_all = [[] for _ in model.prediction_heads]
        ids_all = [[] for _ in model.prediction_heads]
        passage_start_t_all = [[] for _ in model.prediction_heads]

        for step, batch in enumerate(
                tqdm(self.data_loader, desc="Evaluating", mininterval=10)):
            batch = {key: batch[key].to(self.device) for key in batch}

            with torch.no_grad():

                logits = model.forward(**batch)
                losses_per_head = model.logits_to_loss_per_head(logits=logits,
                                                                **batch)
                preds = model.logits_to_preds(logits=logits, **batch)
                labels = model.prepare_labels(**batch)

            # stack results of all batches per prediction head
            for head_num, head in enumerate(model.prediction_heads):
                loss_all[head_num] += np.sum(
                    to_numpy(losses_per_head[head_num]))
                preds_all[head_num] += list(to_numpy(preds[head_num]))
                label_all[head_num] += list(to_numpy(labels[head_num]))
                if head.model_type == "span_classification":
                    ids_all[head_num] += list(to_numpy(batch["id"]))
                    passage_start_t_all[head_num] += list(
                        to_numpy(batch["passage_start_t"]))

        # Evaluate per prediction head
        all_results = []
        for head_num, head in enumerate(model.prediction_heads):
            if head.model_type == "multilabel_text_classification":
                # converting from string preds back to multi-hot encoding
                from sklearn.preprocessing import MultiLabelBinarizer
                mlb = MultiLabelBinarizer(classes=head.label_list)
                # TODO check why .fit() should be called on predictions, rather than on labels
                preds_all[head_num] = mlb.fit_transform(preds_all[head_num])
                label_all[head_num] = mlb.transform(label_all[head_num])
            if hasattr(head, 'aggregate_preds'):
                preds_all[head_num], label_all[head_num] = head.aggregate_preds(
                    preds=preds_all[head_num],
                    labels=label_all[head_num],
                    passage_start_t=passage_start_t_all[head_num],
                    ids=ids_all[head_num])

            result = {
                "loss": loss_all[head_num] / len(self.data_loader.dataset),
                "task_name": head.task_name
            }
            result.update(
                compute_metrics(metric=head.metric,
                                preds=preds_all[head_num],
                                labels=label_all[head_num]))

            # Select type of report depending on prediction head output type
            if self.report:
                if head.ph_output_type == "per_token":
                    report_fn = token_classification_report
                elif head.ph_output_type == "per_sequence":
                    report_fn = classification_report
                elif head.ph_output_type == "per_token_squad":
                    report_fn = lambda *args, **kwargs: "not Implemented"
                elif head.ph_output_type == "per_sequence_continuous":
                    report_fn = r2_score
                else:
                    raise NotImplementedError

                # Not every report_fn accepts the same parameters (e.g. `digits`)
                if head.ph_output_type in [
                        "per_sequence_continuous", "per_token"
                ]:
                    result["report"] = report_fn(label_all[head_num],
                                                 preds_all[head_num])
                else:
                    # supply labels as all possible combination because if ground truth labels do not cover
                    # all values in label_list (maybe dev set is small), the report will break
                    if head.model_type == "multilabel_text_classification":
                        # For multilabel classification, we don't eval with string labels here, but with multihot vectors.
                        # Therefore we need to supply all possible label ids instead of label values.
                        all_possible_labels = list(range(len(head.label_list)))
                    else:
                        all_possible_labels = head.label_list

                    result["report"] = report_fn(label_all[head_num],
                                                 preds_all[head_num],
                                                 digits=4,
                                                 labels=all_possible_labels,
                                                 target_names=head.label_list)

            if return_preds_and_labels:
                result["preds"] = preds_all[head_num]
                result["labels"] = label_all[head_num]

            all_results.append(result)

        return all_results
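
A hypothetical usage sketch of the method above (the names evaluator and model are assumptions; evaluator stands for an already constructed instance of the class this method belongs to):

results = evaluator.eval(model, return_preds_and_labels=True)
for head_result in results:
    # One dict per prediction head, as described in the docstring.
    print(head_result["task_name"], head_result["loss"])
    if "report" in head_result:
        print(head_result["report"])
    # preds/labels are only present because return_preds_and_labels=True.
    assert len(head_result["preds"]) == len(head_result["labels"])
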