Example #1
    def eval(self, model):
        """
        Performs evaluation on a given model.

        :param model: The model on which to perform evaluation
        :type model: AdaptiveModel
        :return all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
                             and reports generated during evaluation.
        :rtype all_results: list of dicts
        """
        model.eval()  # put the model into evaluation mode (e.g. disables dropout)

        # init empty lists per prediction head
        loss_all = [0 for _ in model.prediction_heads]
        preds_all = [[] for _ in model.prediction_heads]
        label_all = [[] for _ in model.prediction_heads]

        for step, batch in enumerate(
            tqdm(self.data_loader, desc="Evaluating", mininterval=10)
        ):
            batch = {key: batch[key].to(self.device) for key in batch}

            with torch.no_grad():  # no gradient tracking needed during evaluation

                logits = model.forward(**batch)
                # TODO logits_to_loss should be a single, overloaded function
                losses_per_head = model.logits_to_loss_per_head(logits=logits, **batch)
                preds = model.logits_to_preds(logits=logits, **batch)

                labels = model.prepare_labels(**batch)

            # stack results of all batches per prediction head
            for head_num, head in enumerate(model.prediction_heads):
                loss_all[head_num] += np.sum(to_numpy(losses_per_head[head_num]))
                if head.model_type == "span_classification":
                    # TODO check why the adaptive model doesn't pack preds into a list of lists of tuples
                    preds_all[head_num] += preds
                    label_all[head_num] += labels
                else:
                    preds_all[head_num] += list(to_numpy(preds[head_num]))
                    label_all[head_num] += list(to_numpy(labels[head_num]))

        # Evaluate per prediction head
        all_results = []
        for head_num, head in enumerate(model.prediction_heads):
            if head.model_type == "multilabel_text_classification":
                # converting from string preds back to multi-hot encoding
                from sklearn.preprocessing import MultiLabelBinarizer
                mlb = MultiLabelBinarizer(classes=head.label_list)
                # TODO check why .fit() should be called on predictions, rather than on labels
                preds_all[head_num] = mlb.fit_transform(preds_all[head_num])
                label_all[head_num] = mlb.transform(label_all[head_num])
            elif head.model_type == "span_classification":
                temp = head._aggregate_preds(preds_all[head_num])
                preds_all[head_num] = temp

            result = {"loss": loss_all[head_num] / len(self.data_loader.dataset),
                      "task_name": head.task_name}
            result.update(
                compute_metrics(metric=head.metric, preds=preds_all[head_num], labels=label_all[head_num])
            )

            # Select type of report depending on prediction head output type
            if self.classification_report:
                if head.ph_output_type == "per_token":
                    report_fn = token_classification_report
                elif head.ph_output_type == "per_sequence":
                    report_fn = classification_report
                elif head.ph_output_type == "per_token_squad":
                    report_fn = lambda *args, **kwargs: "not Implemented"
                elif head.ph_output_type == "per_sequence_continuous":
                    report_fn = r2_score
                else:
                    raise NotImplementedError

                # Adjust parameters per report type: not every report_fn accepts the digits/labels kwargs
                if head.ph_output_type in ["per_sequence_continuous", "per_token"]:
                    result["report"] = report_fn(
                        label_all[head_num], preds_all[head_num]
                    )
                else:
                    # supply the full label_list explicitly: if the ground-truth labels do not cover
                    # every value in label_list (e.g. because the dev set is small), the report would break
                    result["report"] = report_fn(
                        label_all[head_num],
                        preds_all[head_num],
                        digits=4,
                        labels=head.label_list,
                        target_names=head.label_list,
                    )

            all_results.append(result)

        return all_results
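
The multi-label branch above converts string predictions back into multi-hot vectors with scikit-learn's MultiLabelBinarizer before metrics are computed. Below is a minimal, self-contained sketch of that round trip on toy data; the class names are made up for illustration and are not taken from the library:

from sklearn.preprocessing import MultiLabelBinarizer

# Toy data: one list of string labels per sample (hypothetical classes)
label_list = ["sports", "politics", "tech"]
preds = [["sports"], ["politics", "tech"], []]
labels = [["sports"], ["tech"], ["politics"]]

mlb = MultiLabelBinarizer(classes=label_list)
preds_hot = mlb.fit_transform(preds)   # [[1 0 0], [0 1 1], [0 0 0]]
labels_hot = mlb.transform(labels)     # [[1 0 0], [0 0 1], [0 1 0]]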
Example #2
    def eval(self, model):
        """
        Performs evaluation on a given model.

        :param model: The model on which to perform evaluation
        :type model: AdaptiveModel
        :return all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
                             and reports generated during evaluation.
        :rtype all_results: list of dicts
        """
        model.eval()  # put the model into evaluation mode (e.g. disables dropout)

        # init empty lists per prediction head
        loss_all = [0 for _ in model.prediction_heads]
        preds_all = [[] for _ in model.prediction_heads]
        label_all = [[] for _ in model.prediction_heads]

        for step, batch in enumerate(
                tqdm(self.data_loader, desc="Evaluating", mininterval=10)):
            batch = {key: batch[key].to(self.device) for key in batch}

            with torch.no_grad():  # no gradient tracking needed during evaluation

                logits = model.forward(**batch)
                # TODO logits_to_loss should be a single, overloaded function
                losses_per_head = model.logits_to_loss_per_head(logits=logits,
                                                                **batch)
                preds = model.logits_to_preds(logits=logits, **batch)

                labels = model.prepare_labels(**batch)

            # stack results of all batches per prediction head
            for head_num, head in enumerate(model.prediction_heads):
                loss_all[head_num] += np.sum(
                    to_numpy(losses_per_head[head_num]))
                preds_all[head_num] += list(to_numpy(preds[head_num]))
                label_all[head_num] += list(to_numpy(labels[head_num]))

        # Evaluate per prediction head
        all_results = []
        for head_num, head in enumerate(model.prediction_heads):
            result = {
                "loss": loss_all[head_num] / len(self.data_loader.dataset),
                "task_name": head.task_name
            }
            result.update(
                compute_metrics(metric=head.metric,
                                preds=preds_all[head_num],
                                labels=label_all[head_num]))

            # Select type of report depending on prediction head output type
            if self.classification_report:
                if head.ph_output_type == "per_token":
                    report_fn = token_classification_report
                elif head.ph_output_type == "per_sequence":
                    report_fn = classification_report
                elif head.ph_output_type == "per_token_squad":
                    report_fn = lambda *args, **kwargs: "not Implemented"
                elif head.ph_output_type == "per_sequence_continuous":
                    report_fn = r2_score
                else:
                    raise NotImplementedError

                # Adjust parameters per report type: not every report_fn accepts the digits kwarg
                if head.ph_output_type == "per_sequence_continuous":
                    result["report"] = report_fn(label_all[head_num],
                                                 preds_all[head_num])
                else:
                    result["report"] = report_fn(label_all[head_num],
                                                 preds_all[head_num],
                                                 digits=4)

            all_results.append(result)

        return all_results
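
Both examples are methods of an evaluator class that, judging from the method body, holds a data_loader, a device and a classification_report flag. The sketch below shows one way the return value could be consumed; the construction of the evaluator itself is not part of the snippets and is only assumed here:

# `evaluator` is an instance of the evaluator class shown above, constructed
# elsewhere with a data_loader, device and classification_report flag (assumed).
all_results = evaluator.eval(model)

for head_result in all_results:        # one dict per prediction head
    print(head_result["task_name"], head_result["loss"])
    if "report" in head_result:        # present only when classification_report is enabled
        print(head_result["report"])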