Example #1
import pytest

# NOTE: `confusion_matrix` below is the implementation under test; import it from
# the module being tested (its exact import path is project-specific).

def test_confusion_matrix_with_multioutput_samples():
    # Given
    input_solution = [["cat", "ant", "cat"]]
    input_prediction = [["ant", "ant", "cat"]]

    # When-Then
    with pytest.raises(ValueError):
        observed_output = confusion_matrix(input_solution, input_prediction)

def test_confusion_matrix_with_invalid_weights():
    # Given
    input_solution = ["cat", "ant", "cat", "cat", "ant", "bird"]
    input_prediction = ["ant", "ant", "cat", "cat", "ant", "cat"]
    weights = [[1, 2], 0.1, [0.1], 3, 1]

    # When-Then
    with pytest.raises(ValueError):
        observed_output = confusion_matrix(input_solution, input_prediction, weights=weights)

def test_confusion_matrix_with_multidimensional_labels():
    # Given
    input_solution = ["cat", "ant", "cat", "cat", "ant", "bird"]
    input_prediction = ["ant", "ant", "cat", "cat", "ant", "cat"]
    labels = [["ant", "bird"], "cat"]

    # When-Then
    with pytest.raises(ValueError):
        observed_output = confusion_matrix(input_solution, input_prediction, labels=labels)
Example #4
import numpy as np  # plus the `confusion_matrix` under test, imported from the module being tested

def test_confusion_matrix_with_valid_inputs_without_labels_and_weights():
    # Given
    input_solution = [2, 0, 2, 2, 0, 1]
    input_prediction = [0, 0, 2, 2, 0, 2]
    expected_output = np.array([[2, 0, 0], [0, 0, 1], [1, 0, 2]])

    # When
    observed_output = confusion_matrix(input_solution, input_prediction)

    # Then
    assert np.array_equal(expected_output, observed_output)

def test_confusion_matrix_with_valid_inputs_with_lesser_number_of_labels_and_without_weights():
    # Given
    input_solution = ["cat", "ant", "cat", "cat", "ant", "bird"]
    input_prediction = ["ant", "ant", "cat", "cat", "ant", "cat"]
    labels = ["bird", "cat"]
    expected_output = np.array([[0, 1], [0, 2]])

    # When
    observed_output = confusion_matrix(input_solution, input_prediction, labels=labels)

    # Then
    assert np.array_equal(expected_output, observed_output)

def test_confusion_matrix_with_empty_inputs():
    # Given
    input_solution = []
    input_prediction = []
    labels = ["bird", "cat"]
    expected_output = np.array([[0, 0], [0, 0]])

    # When
    observed_output = confusion_matrix(input_solution, input_prediction, labels=labels)

    # Then
    assert np.array_equal(expected_output, observed_output)

def test_confusion_matrix_with_valid_inputs_with_labels_and_with_weights():
    # Given
    input_solution = ["cat", "ant", "cat", "cat", "ant", "bird"]
    input_prediction = ["ant", "ant", "cat", "cat", "ant", "cat"]
    labels = ["ant", "bird", "cat"]
    weights = [0.1, 0.3, 1.0, 0.8, 0.2, 2.0]
    expected_output = np.array([[0.5, 0.0, 0.0], [0.0, 0.0, 2.0], [0.1, 0.0, 1.8]])
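    # Each cell sums the weights of the samples that fall into it, e.g.
    # "ant"->"ant" = 0.3 + 0.2 = 0.5, "cat"->"ant" = 0.1, "cat"->"cat" = 1.0 + 0.8 = 1.8.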

    # When
    observed_output = confusion_matrix(input_solution, input_prediction, labels=labels, weights=weights)

    # Then
    assert np.array_equal(expected_output, observed_output)
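For reference, the weighted expectation above matches what scikit-learn's confusion_matrix produces when the same per-sample weights are passed as sample_weight (a standalone cross-check, not part of the test suite above):

from sklearn.metrics import confusion_matrix as sk_confusion_matrix

y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
cm = sk_confusion_matrix(y_true, y_pred,
                         labels=["ant", "bird", "cat"],
                         sample_weight=[0.1, 0.3, 1.0, 0.8, 0.2, 2.0])
print(cm)  # [[0.5 0.  0. ]
           #  [0.  0.  2. ]
           #  [0.1 0.  1.8]]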
Example #8
    def evaluate_predictions(self, y_true, y_pred, sample_weight=None, silent=False, auxiliary_metrics=True, detailed_report=False):
        """ Evaluate predictions. Does not support sample weights since this method reports a variety of metrics.
            Args:
                silent (bool): If True, suppress printing of which metric is being used and the resulting performance.
                auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric?
                detailed_report (bool): Should we compute more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True).

            Returns a dict where keys = metric names, values = performance along each metric.
            If auxiliary_metrics=False, only the eval_metric is scored.
        """

        is_proba = False
        assert isinstance(y_true, (np.ndarray, pd.Series))
        assert isinstance(y_pred, (np.ndarray, pd.Series, pd.DataFrame))
        self._validate_class_labels(y_true)
        if isinstance(y_pred, np.ndarray):
            if self.problem_type == QUANTILE:
                y_pred = pd.DataFrame(data=y_pred, columns=self.quantile_levels)
            elif len(y_pred.shape) > 1:
                y_pred = pd.DataFrame(data=y_pred, columns=self.class_labels)

        if isinstance(y_pred, pd.DataFrame):
            is_proba = True
        elif not self.eval_metric.needs_pred:
            raise AssertionError(f'`evaluate_predictions` requires y_pred_proba input '
                                 f'when evaluating "{self.eval_metric.name}"... Please generate valid input via `predictor.predict_proba(data)`.\n'
                                 f'This may have occurred if you passed in predict input instead of predict_proba input, '
                                 f'or if you specified `as_multiclass=False` to `predictor.predict_proba(data, as_multiclass=False)`, '
                                 f'which is not supported by `evaluate_predictions`.')
        if is_proba:
            y_pred_proba = y_pred
            y_pred = get_pred_from_proba_df(y_pred_proba, problem_type=self.problem_type)
            if self.problem_type == BINARY:
                # roc_auc crashes if this isn't done
                y_pred_proba = y_pred_proba[self.positive_class]
        else:
            y_pred_proba = None
            y_pred = pd.Series(y_pred)
        if y_pred_proba is not None:
            y_pred_proba_internal = self.label_cleaner.transform_proba(y_pred_proba, as_pandas=True)
        else:
            y_pred_proba_internal = None
        y_true_internal = self.label_cleaner.transform(y_true)  # Get labels in numeric order
        y_true_internal = y_true_internal.fillna(-1)
        y_pred_internal = self.label_cleaner.transform(y_pred)  # Get labels in numeric order

        # Compute auxiliary metrics:
        auxiliary_metrics_lst = [self.eval_metric]
        performance_dict = {}

        if auxiliary_metrics:
            if self.problem_type == REGRESSION:  # Adding regression metrics
                auxiliary_metrics_lst += [
                    'root_mean_squared_error',
                    'mean_squared_error',
                    'mean_absolute_error',
                    'r2',
                    'pearsonr',
                    'median_absolute_error',
                ]
            if self.problem_type in [BINARY, MULTICLASS]:  # Adding classification metrics
                auxiliary_metrics_lst += [
                    'accuracy',
                    'balanced_accuracy',
                    # 'log_loss',  # Don't include as it probably adds more confusion to novice users (can be infinite)
                    'mcc',
                ]
            if self.problem_type == BINARY:  # binary-specific metrics
                auxiliary_metrics_lst += [
                    'roc_auc',
                    'f1',
                    'precision',
                    'recall',
                ]

        scoring_args = dict(
            y=y_true,
            y_internal=y_true_internal,
            weight_evaluation=False,
        )

        if sample_weight is not None:
            scoring_args['sample_weight'] = sample_weight
            scoring_args['weight_evaluation'] = True

        for aux_metric in auxiliary_metrics_lst:
            if isinstance(aux_metric, str):
                aux_metric = get_metric(metric=aux_metric, problem_type=self.problem_type, metric_type='aux_metric')
            if not aux_metric.needs_pred and y_pred_proba_internal is None:
                logger.log(15, f'Skipping {aux_metric.name} because no prediction probabilities are available to score.')
                continue

            if aux_metric.name not in performance_dict:
                if y_pred_proba_internal is not None:
                    score = self._score_with_pred_proba(
                        y_pred_proba_internal=y_pred_proba_internal,
                        metric=aux_metric,
                        **scoring_args
                    )
                else:
                    score = self._score_with_pred(
                        y_pred_internal=y_pred_internal,
                        metric=aux_metric,
                        **scoring_args
                    )
                performance_dict[aux_metric.name] = score

        if self.eval_metric.name in performance_dict:
            score_eval = performance_dict[self.eval_metric.name]
            score_eval_flipped = self.eval_metric.convert_score_to_sklearn_val(score_eval)  # flip negative once again back to positive (so higher is no longer necessarily better)
            if score_eval_flipped != score_eval:
                flipped = True
            else:
                flipped = False
            if not silent:
                logger.log(20, f"Evaluation: {self.eval_metric.name} on test data: {score_eval}")
                if flipped:
                    logger.log(20, f"\tNote: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.")

        if not silent:
            logger.log(20, "Evaluations on test data:")
            logger.log(20, json.dumps(performance_dict, indent=4))

        if detailed_report and (self.problem_type != REGRESSION):
            # Construct confusion matrix
            try:
                performance_dict['confusion_matrix'] = confusion_matrix(y_true, y_pred, labels=self.label_cleaner.ordered_class_labels, output_format='pandas_dataframe')
            except ValueError:
                pass
            # One final set of metrics to report
            cl_metric = lambda y_true, y_pred: classification_report(y_true, y_pred, output_dict=True)
            metric_name = 'classification_report'
            if metric_name not in performance_dict:
                try:  # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error)
                    performance_dict[metric_name] = cl_metric(y_true, y_pred)
                except ValueError:
                    pass
                if not silent and metric_name in performance_dict:
                    logger.log(20, "Detailed (per-class) classification report:")
                    logger.log(20, json.dumps(performance_dict[metric_name], indent=4))
        return performance_dict
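In the method above, class predictions are derived from the probability DataFrame via `get_pred_from_proba_df`; conceptually this is a per-row argmax over the class columns, as in this minimal pandas sketch (toy data and names, not the library helper itself):

import pandas as pd

# One column per class, one row per sample.
proba = pd.DataFrame({"ant": [0.7, 0.2], "bird": [0.1, 0.1], "cat": [0.2, 0.7]})
pred = proba.idxmax(axis=1)  # column name with the highest probability per row
print(pred.tolist())         # ['ant', 'cat']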
Example #9
    def evaluate_predictions(self, y_true, y_pred, silent=False, auxiliary_metrics=False, detailed_report=True, high_always_good=False):
        """ Evaluate predictions. Does not support sample weights since this method reports a variety of metrics.
            Args:
                silent (bool): If True, suppress printing of which metric is being used and the resulting performance.
                auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric?
                detailed_report (bool): Should we compute more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True).
                high_always_good (bool): If True, this means higher values of returned metric are ALWAYS superior (so metrics like MSE should be returned negated)

            Returns single performance-value if auxiliary_metrics=False.
            Otherwise returns dict where keys = metrics, values = performance along each metric.
        """
        is_proba = False
        assert isinstance(y_true, (np.ndarray, pd.Series))
        assert isinstance(y_pred, (np.ndarray, pd.Series, pd.DataFrame))
        self._validate_class_labels(y_true)
        if isinstance(y_pred, np.ndarray):
            if self.problem_type == QUANTILE:
                y_pred = pd.DataFrame(data=y_pred, columns=self.quantile_levels)
            elif len(y_pred.shape) > 1:
                y_pred = pd.DataFrame(data=y_pred, columns=self.class_labels)
        if self.problem_type == BINARY:
            if isinstance(y_pred, pd.DataFrame):
                # roc_auc crashes if this isn't done
                y_pred = y_pred[self.positive_class]
                is_proba = True
            elif not self.eval_metric.needs_pred:
                raise AssertionError(f'`evaluate_predictions` requires y_pred_proba input for binary classification '
                                     f'when evaluating "{self.eval_metric.name}"... Please generate valid input via `predictor.predict_proba(data)`.\n'
                                     f'This may have occurred if you passed in predict input instead of predict_proba input, '
                                     f'or if you specified `as_multiclass=False` to `predictor.predict_proba(data, as_multiclass=False)`, '
                                     f'which is not supported by `evaluate_predictions`.')
        elif self.problem_type == MULTICLASS:
            if isinstance(y_pred, pd.DataFrame):
                is_proba = True
        if is_proba and self.eval_metric.needs_pred:
            if self.problem_type == BINARY:
                y_pred = get_pred_from_proba(y_pred_proba=y_pred, problem_type=self.problem_type)
                y_pred = self.label_cleaner.inverse_transform(y_pred)
            else:
                y_pred = get_pred_from_proba_df(y_pred_proba=y_pred, problem_type=self.problem_type)
        if not self.eval_metric.needs_pred:
            y_true = self.label_cleaner.transform(y_true)  # Get labels in numeric order
            performance = self.eval_metric(y_true, y_pred)
        elif self.problem_type == BINARY:
            # Use 1 and 0, otherwise f1 can crash due to unknown pos_label.
            y_true_internal = self.label_cleaner.transform(y_true)
            y_pred_internal = self.label_cleaner.transform(y_pred)
            performance = self.eval_metric(y_true_internal, y_pred_internal)
        else:
            performance = self.eval_metric(y_true, y_pred)

        metric = self.eval_metric.name

        if not high_always_good:
            performance = self.eval_metric.convert_score_to_sklearn_val(performance)  # flip negative once again back to positive (so higher is no longer necessarily better)

        if not silent:
            logger.log(20, f"Evaluation: {metric} on test data: {performance}")

        if not auxiliary_metrics:
            return performance

        # Otherwise compute auxiliary metrics:
        auxiliary_metrics = []
        if self.problem_type == REGRESSION:  # Adding regression metrics
            pearson_corr = lambda x, y: corrcoef(x, y)[0][1]
            pearson_corr.__name__ = 'pearson_correlation'
            auxiliary_metrics += [
                mean_absolute_error, explained_variance_score, r2_score, pearson_corr, mean_squared_error, median_absolute_error,
                # max_error
            ]
        else:  # Adding classification metrics
            auxiliary_metrics += [accuracy_score, balanced_accuracy_score, matthews_corrcoef]
            if self.problem_type == BINARY:  # binary-specific metrics
                # def auc_score(y_true, y_pred): # TODO: this requires y_pred to be probability-scores
                #     fpr, tpr, _ = roc_curve(y_true, y_pred, pos_label)
                #   return auc(fpr, tpr)
                f1micro_score = lambda y_true, y_pred: f1_score(y_true, y_pred, average='micro')
                f1micro_score.__name__ = f1_score.__name__
                auxiliary_metrics += [f1micro_score]  # TODO: add auc?
            # elif self.problem_type == MULTICLASS:  # multiclass metrics
            #     auxiliary_metrics += []  # TODO: No multi-class specific metrics for now. Include top-5, top-10 accuracy here.

        performance_dict = OrderedDict({metric: performance})
        for metric_function in auxiliary_metrics:
            if isinstance(metric_function, tuple):
                metric_function, metric_kwargs = metric_function
            else:
                metric_kwargs = None
            metric_name = metric_function.__name__
            if metric_name not in performance_dict:
                try:  # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error)
                    if metric_kwargs:
                        performance_dict[metric_name] = metric_function(y_true, y_pred, **metric_kwargs)
                    else:
                        performance_dict[metric_name] = metric_function(y_true, y_pred)
                except ValueError:
                    pass

        if not silent:
            logger.log(20, "Evaluations on test data:")
            logger.log(20, json.dumps(performance_dict, indent=4))

        if detailed_report and (self.problem_type != REGRESSION):
            # Construct confusion matrix
            try:
                performance_dict['confusion_matrix'] = confusion_matrix(y_true, y_pred, labels=self.label_cleaner.ordered_class_labels, output_format='pandas_dataframe')
            except ValueError:
                pass
            # One final set of metrics to report
            cl_metric = lambda y_true, y_pred: classification_report(y_true, y_pred, output_dict=True)
            metric_name = 'classification_report'
            if metric_name not in performance_dict:
                try:  # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error)
                    performance_dict[metric_name] = cl_metric(y_true, y_pred)
                except ValueError:
                    pass
                if not silent and metric_name in performance_dict:
                    logger.log(20, "Detailed (per-class) classification report:")
                    logger.log(20, json.dumps(performance_dict[metric_name], indent=4))
        return performance_dict
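The comment in the binary branch above ("Use 1 and 0, otherwise f1 can crash due to unknown pos_label") refers to scikit-learn's default pos_label=1; a minimal standalone illustration:

from sklearn.metrics import f1_score

y_true = ["yes", "no", "yes"]
y_pred = ["yes", "yes", "yes"]
# f1_score(y_true, y_pred)       # ValueError: pos_label=1 is not a valid label
f1_score([1, 0, 1], [1, 1, 1])   # works once labels are mapped to {0, 1}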
Example #10
    def evaluate(self,
                 y_true,
                 y_pred,
                 silent=False,
                 auxiliary_metrics=False,
                 detailed_report=True,
                 high_always_good=False):
        """ Evaluate predictions.
            Args:
                silent (bool): If True, suppress printing of which metric is being used and the resulting performance.
                auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric?
                detailed_report (bool): Should we compute more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True).
                high_always_good (bool): If True, this means higher values of returned metric are ALWAYS superior (so metrics like MSE should be returned negated)

            Returns single performance-value if auxiliary_metrics=False.
            Otherwise returns dict where keys = metrics, values = performance along each metric.
        """
        assert isinstance(y_true, (np.ndarray, pd.Series))
        assert isinstance(
            y_pred,
            (np.ndarray, pd.Series))  # TODO: Enable DataFrame for y_pred_proba

        self._validate_class_labels(y_true)
        if not self.eval_metric.needs_pred:
            y_true = self.label_cleaner.transform(
                y_true)  # Get labels in numeric order
        performance = self.eval_metric(y_true, y_pred)

        metric = self.eval_metric.name

        if not high_always_good:
            performance = performance * self.eval_metric._sign  # flip negative once again back to positive (so higher is no longer necessarily better)

        if not silent:
            logger.log(20, f"Evaluation: {metric} on test data: {performance}")

        if not auxiliary_metrics:
            return performance

        # Otherwise compute auxiliary metrics:
        auxiliary_metrics = []
        if self.problem_type == REGRESSION:  # Adding regression metrics
            pearson_corr = lambda x, y: corrcoef(x, y)[0][1]
            pearson_corr.__name__ = 'pearson_correlation'
            auxiliary_metrics += [
                mean_absolute_error,
                explained_variance_score,
                r2_score,
                pearson_corr,
                mean_squared_error,
                median_absolute_error,
                # max_error
            ]
        else:  # Adding classification metrics
            auxiliary_metrics += [
                accuracy_score, balanced_accuracy_score, matthews_corrcoef
            ]
            if self.problem_type == BINARY:  # binary-specific metrics
                # def auc_score(y_true, y_pred): # TODO: this requires y_pred to be probability-scores
                #     fpr, tpr, _ = roc_curve(y_true, y_pred, pos_label)
                #   return auc(fpr, tpr)
                f1micro_score = lambda y_true, y_pred: f1_score(
                    y_true, y_pred, average='micro')
                f1micro_score.__name__ = f1_score.__name__
                auxiliary_metrics += [f1micro_score]  # TODO: add auc?
            # elif self.problem_type == MULTICLASS:  # multiclass metrics
            #     auxiliary_metrics += []  # TODO: No multi-class specific metrics for now. Include top-5, top-10 accuracy here.

        performance_dict = OrderedDict({metric: performance})
        for metric_function in auxiliary_metrics:
            if isinstance(metric_function, tuple):
                metric_function, metric_kwargs = metric_function
            else:
                metric_kwargs = None
            metric_name = metric_function.__name__
            if metric_name not in performance_dict:
                try:  # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error)
                    if metric_kwargs:
                        performance_dict[metric_name] = metric_function(
                            y_true, y_pred, **metric_kwargs)
                    else:
                        performance_dict[metric_name] = metric_function(
                            y_true, y_pred)
                except ValueError:
                    pass

        if not silent:
            logger.log(20, "Evaluations on test data:")
            logger.log(20, json.dumps(performance_dict, indent=4))

        if detailed_report and (self.problem_type != REGRESSION):
            # Construct confusion matrix
            try:
                performance_dict['confusion_matrix'] = confusion_matrix(
                    y_true,
                    y_pred,
                    labels=self.label_cleaner.ordered_class_labels,
                    output_format='pandas_dataframe')
            except ValueError:
                pass
            # One final set of metrics to report
            cl_metric = lambda y_true, y_pred: classification_report(
                y_true, y_pred, output_dict=True)
            metric_name = 'classification_report'
            if metric_name not in performance_dict:
                try:  # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error)
                    performance_dict[metric_name] = cl_metric(y_true, y_pred)
                except ValueError:
                    pass
                if not silent and metric_name in performance_dict:
                    logger.log(20,
                               "Detailed (per-class) classification report:")
                    logger.log(
                        20, json.dumps(performance_dict[metric_name],
                                       indent=4))
        return performance_dict
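The high_always_good flag and the `performance * self.eval_metric._sign` step above reflect a sign convention in which error metrics are stored negated internally so that higher is always better; a minimal plain-Python illustration of that convention (not the actual eval_metric object):

internal_score = -4.2        # e.g. negated MSE: higher (closer to 0) is better internally
_sign = -1                   # -1 for error metrics, +1 for metrics such as accuracy
reported_value = internal_score * _sign
print(reported_value)        # 4.2, the familiar positive error value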