def test_invalid_target_data_action_for_data_with_null(problem_type):
    y = pd.Series([None, None, None, 0, 0, 0, 0, 0, 0, 0])
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type))
    impute_strategy = "mean" if is_regression(problem_type) else "most_frequent"

    expected = {
        "warnings": [],
        "errors": [DataCheckError(message="3 row(s) (30.0%) of target values are null",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 3, "pct_null_rows": 30.0}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": impute_strategy}).to_dict()]
    }
    if is_binary(problem_type):
        expected["errors"].append(DataCheckError(message="Binary class targets require exactly two unique values.",
                                                 data_check_name=invalid_targets_data_check_name,
                                                 message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                                 details={"target_values": [0]}).to_dict())
    elif is_multiclass(problem_type):
        expected["errors"].append(DataCheckError(message=f"Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                                                 data_check_name=invalid_targets_data_check_name,
                                                 message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                                                 details={"num_classes": 1}).to_dict())
        expected["warnings"].append(DataCheckWarning(message=f"Target has a large number of unique values, could be regression type problem.",
                                                     data_check_name=invalid_targets_data_check_name,
                                                     message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                                                     details={"class_to_value_ratio": 0.1}).to_dict())

    messages = invalid_targets_check.validate(X, y)
    assert messages == expected
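
As a quick sanity check of the numbers asserted above, the null counts can be reproduced with plain pandas (this mirrors how the data check itself computes them; see the validate implementation later in this listing):

# Standalone check: 3 of the 10 target rows are null, i.e. 30.0%.
import pandas as pd

y = pd.Series([None, None, None, 0, 0, 0, 0, 0, 0, 0])
null_rows = y.isnull()
print(null_rows.sum())                    # 3
print(round(null_rows.mean() * 100, 1))   # 30.0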
Example 2
    def _find_best_pipeline(self):
        """Finds the best pipeline in the rankings
        If self._best_pipeline already exists, check to make sure it is different from the current best pipeline before training and thresholding"""
        if len(self.rankings) == 0:
            return
        best_pipeline = self.rankings.iloc[0]
        if not (self._best_pipeline and self._best_pipeline
                == self.get_pipeline(best_pipeline['id'])):
            self._best_pipeline = self.get_pipeline(best_pipeline['id'])
            if self._train_best_pipeline:
                X_threshold_tuning = None
                y_threshold_tuning = None
                X_train, y_train = self.X_train, self.y_train
                if is_binary(self.problem_type) and self.objective.is_defined_for_problem_type(self.problem_type) \
                   and self.optimize_thresholds and self.objective.can_optimize_threshold:
                    X_train, X_threshold_tuning, y_train, y_threshold_tuning = split_data(
                        X_train,
                        y_train,
                        self.problem_type,
                        test_size=0.2,
                        random_seed=self.random_seed)
                self._best_pipeline.fit(X_train, y_train)
                tune_binary_threshold(self._best_pipeline, self.objective,
                                      self.problem_type, X_threshold_tuning,
                                      y_threshold_tuning)
Example 3
def test_split_data(problem_type, data_type, X_y_binary, X_y_multi,
                    X_y_regression, make_data_type):
    if is_binary(problem_type):
        X, y = X_y_binary
    if is_multiclass(problem_type):
        X, y = X_y_multi
    if is_regression(problem_type):
        X, y = X_y_regression
    problem_configuration = None
    if is_time_series(problem_type):
        problem_configuration = {'gap': 1, 'max_delay': 7}

    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    test_pct = 0.25
    X_train, X_test, y_train, y_test = split_data(
        X,
        y,
        test_size=test_pct,
        problem_type=problem_type,
        problem_configuration=problem_configuration)
    test_size = len(X) * test_pct
    train_size = len(X) - test_size
    assert len(X_train) == train_size
    assert len(X_test) == test_size
    assert len(y_train) == train_size
    assert len(y_test) == test_size
    assert isinstance(X_train, ww.DataTable)
    assert isinstance(X_test, ww.DataTable)
    assert isinstance(y_train, ww.DataColumn)
    assert isinstance(y_test, ww.DataColumn)
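
split_data is EvalML's problem-type-aware wrapper; as a rough illustration of the 75/25 size arithmetic the test asserts (assuming the non-time-series path behaves like an ordinary shuffled split, which is not a statement about split_data's internals):

# Illustrative only: the same size arithmetic with scikit-learn's train_test_split.
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(200).reshape(100, 2)
y = np.arange(100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
assert len(X_train) == 100 * 0.75
assert len(X_test) == 100 * 0.25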
Example 4
def test_explain_predictions_best_worst_time_series(output_format,
                                                    pipeline_class, estimator,
                                                    ts_data):
    X, y = ts_data

    if is_binary(pipeline_class.problem_type):
        y = y % 2

    class TSPipeline(pipeline_class):
        component_graph = ["Delayed Feature Transformer", estimator]
        name = "time series pipeline"

    tspipeline = TSPipeline({"pipeline": {"gap": 1, "max_delay": 2}})

    tspipeline.fit(X, y)

    exp = explain_predictions_best_worst(pipeline=tspipeline,
                                         input_features=X,
                                         y_true=y,
                                         output_format=output_format)

    if output_format == "dict":
        # Check that the computed features to be explained aren't NaN.
        for exp_idx in range(len(exp["explanations"])):
            assert not np.isnan(
                np.array(exp["explanations"][exp_idx]["explanations"][0]
                         ["feature_values"])).any()
Example 5
def test_explain_predictions_stacked_ensemble(
        problem_type, dummy_stacked_ensemble_binary_estimator,
        dummy_stacked_ensemble_multiclass_estimator,
        dummy_stacked_ensemble_regressor_estimator, X_y_binary, X_y_multi,
        X_y_regression):
    if is_binary(problem_type):
        X, y = X_y_binary
        pipeline = dummy_stacked_ensemble_binary_estimator
    elif is_multiclass(problem_type):
        X, y = X_y_multi
        pipeline = dummy_stacked_ensemble_multiclass_estimator
    else:
        X, y = X_y_regression
        pipeline = dummy_stacked_ensemble_regressor_estimator

    with pytest.raises(
            ValueError,
            match="Cannot explain predictions for a stacked ensemble pipeline"
    ):
        explain_predictions(pipeline, X, y, indices_to_explain=[0])

    with pytest.raises(
            ValueError,
            match="Cannot explain predictions for a stacked ensemble pipeline"
    ):
        explain_predictions_best_worst(pipeline, X, y)
Example 6
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi,
                                                   X_y_regression, ts_data,
                                                   helper_functions):
    for estimator_class in _all_estimators_used_in_search():
        if estimator_class.__name__ == 'ARIMARegressor':
            continue
        supported_problem_types = [
            handle_problem_types(pt)
            for pt in estimator_class.supported_problem_types
        ]
        for problem_type in supported_problem_types:
            clf = helper_functions.safe_init_component_with_njobs_1(
                estimator_class)
            if is_binary(problem_type):
                X, y = X_y_binary
            elif is_multiclass(problem_type):
                X, y = X_y_multi
            elif is_regression(problem_type):
                X, y = X_y_regression

            X = get_random_state(clf.random_seed).random(
                (X.shape[0], len(string.printable)))
            col_names = [
                'column_{}'.format(ascii_char)
                for ascii_char in string.printable
            ]
            X = pd.DataFrame(X, columns=col_names)
            assert clf.input_feature_names is None
            clf.fit(X, y)
            assert len(clf.feature_importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()
            predictions = clf.predict(X).to_series()
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()
            assert (clf.input_feature_names == col_names)
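
For reference, the generated column names are simply 'column_<ch>' for every printable ASCII character, which is also why the random matrix above has len(string.printable) columns:

# Quick look at the column names the test builds (100 printable ASCII characters).
import string

col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable]
print(len(col_names), col_names[:3])   # 100 ['column_0', 'column_1', 'column_2']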
Example 7
    def can_tune_threshold_with_objective(self, objective):
        """Determine whether the threshold of a binary classification pipeline can be tuned.

       Arguments:
            pipeline (PipelineBase): Binary classification pipeline.
            objective (ObjectiveBase): Primary AutoMLSearch objective.

        Returns:
            bool: True if the pipeline threshold can be tuned.

        """
        return objective.is_defined_for_problem_type(self.problem_type) and \
            objective.can_optimize_threshold and is_binary(self.problem_type)
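
The check is a conjunction of three conditions: a binary problem type, an objective defined for that problem type, and an objective that can optimize a threshold. A minimal self-contained sketch of the same logic (the stand-in objective below is illustrative, not EvalML's API):

# Sketch: all three conditions must hold for threshold tuning to make sense.
class _FakeObjective:
    can_optimize_threshold = True

    def is_defined_for_problem_type(self, problem_type):
        return problem_type in {"binary", "time series binary"}

def can_tune_threshold(problem_type, objective):
    return (problem_type in {"binary", "time series binary"}
            and objective.is_defined_for_problem_type(problem_type)
            and objective.can_optimize_threshold)

assert can_tune_threshold("binary", _FakeObjective())
assert not can_tune_threshold("multiclass", _FakeObjective())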
Example 8
def tune_binary_threshold(pipeline, objective, problem_type,
                          X_threshold_tuning, y_threshold_tuning):
    """Tunes the threshold of a binary pipeline to the X and y thresholding data

    Arguments:
        pipeline (Pipeline): Pipeline instance to threshold
        X_threshold_tuning (ww.DataTable): Features to tune pipeline to
        y_threshold_tuning (ww.DataColumn): Target data to tune pipeline to
    """
    if is_binary(problem_type) and objective.is_defined_for_problem_type(
            problem_type) and objective.can_optimize_threshold:
        pipeline.threshold = 0.5
        if X_threshold_tuning:
            y_predict_proba = pipeline.predict_proba(X_threshold_tuning)
            y_predict_proba = y_predict_proba.iloc[:, 1]
            pipeline.threshold = objective.optimize_threshold(
                y_predict_proba, y_threshold_tuning, X=X_threshold_tuning)
Example 9
def test_type_checks(problem_type):
    assert is_regression(problem_type) == (problem_type in [
        ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION
    ])
    assert is_binary(problem_type) == (problem_type in [
        ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY
    ])
    assert is_multiclass(problem_type) == (problem_type in [
        ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS
    ])
    assert is_classification(problem_type) == (problem_type in [
        ProblemTypes.BINARY, ProblemTypes.MULTICLASS,
        ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS
    ])
    assert is_time_series(problem_type) == (problem_type in [
        ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS,
        ProblemTypes.TIME_SERIES_REGRESSION
    ])
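
For context, these helpers reduce to membership checks over the problem-type enum (the library versions also normalize string inputs via handle_problem_types). A self-contained sketch, not EvalML's source:

# Sketch of the membership checks the test exercises.
from enum import Enum

class ProblemTypes(Enum):
    BINARY = "binary"
    MULTICLASS = "multiclass"
    REGRESSION = "regression"
    TIME_SERIES_BINARY = "time series binary"
    TIME_SERIES_MULTICLASS = "time series multiclass"
    TIME_SERIES_REGRESSION = "time series regression"

def is_binary(problem_type):
    return problem_type in {ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY}

def is_classification(problem_type):
    return problem_type in {ProblemTypes.BINARY, ProblemTypes.MULTICLASS,
                            ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS}

def is_time_series(problem_type):
    return "time series" in problem_type.value

assert is_binary(ProblemTypes.TIME_SERIES_BINARY)
assert not is_classification(ProblemTypes.TIME_SERIES_REGRESSION)
assert is_time_series(ProblemTypes.TIME_SERIES_REGRESSION)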
Example 10
def tune_binary_threshold(pipeline, objective, problem_type,
                          X_threshold_tuning, y_threshold_tuning):
    """Tunes the threshold of a binary pipeline to the X and y thresholding data

    Arguments:
        pipeline (Pipeline): Pipeline instance to threshold.
        objective (ObjectiveBase): The objective we want to tune with. If not tuneable and best_pipeline is True, will use F1.
        problem_type (ProblemType): The problem type of the pipeline.
        X_threshold_tuning (ww.DataTable): Features to tune pipeline to.
        y_threshold_tuning (ww.DataColumn): Target data to tune pipeline to.
    """
    if is_binary(problem_type) and objective.is_defined_for_problem_type(
            problem_type) and objective.can_optimize_threshold:
        pipeline.threshold = 0.5
        if X_threshold_tuning:
            y_predict_proba = pipeline.predict_proba(X_threshold_tuning)
            y_predict_proba = y_predict_proba.iloc[:, 1]
            pipeline.optimize_threshold(X_threshold_tuning, y_threshold_tuning,
                                        y_predict_proba, objective)
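
optimize_threshold is what actually searches for the cutoff. Conceptually it scans candidate thresholds over the positive-class probabilities and keeps the one that maximizes the objective on the tuning data; a rough, framework-free sketch of that idea (grid search over F1, not EvalML's implementation):

# Rough sketch: pick the probability cutoff that maximizes the objective (F1 here).
import numpy as np
from sklearn.metrics import f1_score

def optimize_threshold(y_true, y_pred_proba, candidates=None):
    if candidates is None:
        candidates = np.linspace(0.01, 0.99, 99)
    scores = [f1_score(y_true, (y_pred_proba >= t).astype(int), zero_division=0)
              for t in candidates]
    return float(candidates[int(np.argmax(scores))])

y_true = np.array([0, 0, 1, 1, 1, 0])
y_pred_proba = np.array([0.10, 0.40, 0.35, 0.80, 0.65, 0.20])
print(optimize_threshold(y_true, y_pred_proba))   # F1-maximizing cutoff for this toy data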
Example 11
    def train_and_score_pipeline(pipeline, automl, full_X_train, full_y_train):
        """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores

        Arguments:
            pipeline (PipelineBase): The pipeline to score
            automl (AutoMLSearch): The AutoML search, used to access config and for the error callback
            full_X_train (ww.DataTable): Training features
            full_y_train (ww.DataColumn): Training target

        Returns:
            dict: A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details.
        """
        start = time.time()
        cv_data = []
        logger.info("\tStarting cross validation")
        X_pd = _convert_woodwork_types_wrapper(full_X_train.to_dataframe())
        y_pd = _convert_woodwork_types_wrapper(full_y_train.to_series())
        y_pd_encoded = y_pd
        # Encode target for classification problems so that we can support float targets. This is okay because we only use split to get the indices to split on
        if is_classification(automl.problem_type):
            y_mapping = {
                original_target: encoded_target
                for (encoded_target,
                     original_target) in enumerate(y_pd.value_counts().index)
            }
            y_pd_encoded = y_pd.map(y_mapping)
        for i, (train, valid) in enumerate(
                automl.data_splitter.split(X_pd, y_pd_encoded)):
            if pipeline.model_family == ModelFamily.ENSEMBLE and i > 0:
                # Stacked ensembles do CV internally, so we do not run CV here for performance reasons.
                logger.debug(
                    f"Skipping fold {i} because CV for stacked ensembles is not supported."
                )
                break
            logger.debug(f"\t\tTraining and scoring on fold {i}")
            X_train, X_valid = full_X_train.iloc[train], full_X_train.iloc[
                valid]
            y_train, y_valid = full_y_train.iloc[train], full_y_train.iloc[
                valid]
            if is_binary(automl.problem_type) or is_multiclass(
                    automl.problem_type):
                diff_train = set(
                    np.setdiff1d(full_y_train.to_series(),
                                 y_train.to_series()))
                diff_valid = set(
                    np.setdiff1d(full_y_train.to_series(),
                                 y_valid.to_series()))
                diff_string = f"Missing target values in the training set after data split: {diff_train}. " if diff_train else ""
                diff_string += f"Missing target values in the validation set after data split: {diff_valid}." if diff_valid else ""
                if diff_string:
                    raise Exception(diff_string)
            objectives_to_score = [automl.objective
                                   ] + automl.additional_objectives
            cv_pipeline = None
            try:
                logger.debug(f"\t\t\tFold {i}: starting training")
                cv_pipeline = EngineBase.train_pipeline(
                    pipeline, X_train, y_train, automl.optimize_thresholds,
                    automl.objective)
                logger.debug(f"\t\t\tFold {i}: finished training")
                if automl.optimize_thresholds and pipeline.can_tune_threshold_with_objective(
                        automl.objective
                ) and automl.objective.can_optimize_threshold:
                    logger.debug(
                        f"\t\t\tFold {i}: Optimal threshold found ({cv_pipeline.threshold:.3f})"
                    )
                logger.debug(f"\t\t\tFold {i}: Scoring trained pipeline")
                scores = cv_pipeline.score(X_valid,
                                           y_valid,
                                           objectives=objectives_to_score)
                logger.debug(
                    f"\t\t\tFold {i}: {automl.objective.name} score: {scores[automl.objective.name]:.3f}"
                )
                score = scores[automl.objective.name]
            except Exception as e:
                if automl.error_callback is not None:
                    automl.error_callback(exception=e,
                                          traceback=traceback.format_tb(
                                              sys.exc_info()[2]),
                                          automl=automl,
                                          fold_num=i,
                                          pipeline=pipeline)
                if isinstance(e, PipelineScoreError):
                    nan_scores = {
                        objective: np.nan
                        for objective in e.exceptions
                    }
                    scores = {**nan_scores, **e.scored_successfully}
                    scores = OrderedDict({
                        o.name: scores[o.name]
                        for o in [automl.objective] +
                        automl.additional_objectives
                    })
                    score = scores[automl.objective.name]
                else:
                    score = np.nan
                    scores = OrderedDict(
                        zip([n.name for n in automl.additional_objectives],
                            [np.nan] * len(automl.additional_objectives)))

            ordered_scores = OrderedDict()
            ordered_scores.update({automl.objective.name: score})
            ordered_scores.update(scores)
            ordered_scores.update({"# Training": y_train.shape[0]})
            ordered_scores.update({"# Validation": y_valid.shape[0]})

            evaluation_entry = {
                "all_objective_scores": ordered_scores,
                "score": score,
                'binary_classification_threshold': None
            }
            if is_binary(
                    automl.problem_type
            ) and cv_pipeline is not None and cv_pipeline.threshold is not None:
                evaluation_entry[
                    'binary_classification_threshold'] = cv_pipeline.threshold
            cv_data.append(evaluation_entry)
        training_time = time.time() - start
        cv_scores = pd.Series([fold['score'] for fold in cv_data])
        cv_score_mean = cv_scores.mean()
        logger.info(
            f"\tFinished cross validation - mean {automl.objective.name}: {cv_score_mean:.3f}"
        )
        return {
            'cv_data': cv_data,
            'training_time': training_time,
            'cv_scores': cv_scores,
            'cv_score_mean': cv_score_mean
        }
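
One detail worth calling out is the target-encoding block near the top: classification labels are mapped to integer codes only so the splitter can handle float targets, and the original (unencoded) target is still what gets indexed into the folds. A tiny standalone illustration of that mapping:

# Standalone illustration of the y_mapping step (codes ordered by label frequency).
import pandas as pd

y = pd.Series([3.5, 3.5, 2.0, 3.5, 2.0])
y_mapping = {original: code for code, original in enumerate(y.value_counts().index)}
print(y_mapping)                    # {3.5: 0, 2.0: 1}
print(y.map(y_mapping).tolist())    # [0, 0, 1, 0, 1]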
Example 12
    def validate(self, X, y):
        """Checks if the target data contains missing or invalid values.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored.
            y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values.

        Returns:
            dict (DataCheckError): Dictionary with DataCheckErrors if any invalid values are found in the target data.

        Example:
            >>> import pandas as pd
            >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
            >>> y = pd.Series([0, 1, None, None])
            >>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary')
            >>> assert target_check.validate(X, y) == {"errors": [{"message": "2 row(s) (50.0%) of target values are null",\
                                                                   "data_check_name": "InvalidTargetDataCheck",\
                                                                   "level": "error",\
                                                                   "code": "TARGET_HAS_NULL",\
                                                                   "details": {"num_null_rows": 2, "pct_null_rows": 50}}],\
                                                       "warnings": [],\
                                                       "actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        if y is None:
            results["errors"].append(
                DataCheckError(
                    message="Target is None",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_IS_NONE,
                    details={}).to_dict())
            return results

        y = infer_feature_types(y)
        is_supported_type = y.logical_type in numeric_and_boolean_ww + [
            ww.logical_types.Categorical
        ]
        if not is_supported_type:
            results["errors"].append(
                DataCheckError(
                    message=
                    "Target is unsupported {} type. Valid Woodwork logical types include: {}"
                    .format(
                        y.logical_type, ", ".join([
                            ltype.type_string
                            for ltype in numeric_and_boolean_ww
                        ])),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                    details={
                        "unsupported_type": y.logical_type.type_string
                    }).to_dict())
        y_df = _convert_woodwork_types_wrapper(y.to_series())
        null_rows = y_df.isnull()
        if null_rows.all():
            results["errors"].append(
                DataCheckError(message="Target is either empty or fully null.",
                               data_check_name=self.name,
                               message_code=DataCheckMessageCode.
                               TARGET_IS_EMPTY_OR_FULLY_NULL,
                               details={}).to_dict())
            return results
        elif null_rows.any():
            num_null_rows = null_rows.sum()
            pct_null_rows = null_rows.mean() * 100
            results["errors"].append(
                DataCheckError(
                    message="{} row(s) ({}%) of target values are null".format(
                        num_null_rows, pct_null_rows),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                    details={
                        "num_null_rows": num_null_rows,
                        "pct_null_rows": pct_null_rows
                    }).to_dict())
            impute_strategy = "mean" if is_regression(
                self.problem_type) else "most_frequent"
            results["actions"].append(
                DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                metadata={
                                    "column": None,
                                    "is_target": True,
                                    "impute_strategy": impute_strategy
                                }).to_dict())

        value_counts = y_df.value_counts()
        unique_values = value_counts.index.tolist()

        if is_binary(self.problem_type) and len(value_counts) != 2:
            if self.n_unique is None:
                details = {"target_values": unique_values}
            else:
                details = {
                    "target_values":
                    unique_values[:min(self.n_unique, len(unique_values))]
                }
            results["errors"].append(
                DataCheckError(
                    message=
                    "Binary class targets require exactly two unique values.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.
                    TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                    details=details).to_dict())

        if self.problem_type == ProblemTypes.REGRESSION and "numeric" not in y.semantic_tags:
            results["errors"].append(
                DataCheckError(
                    message=
                    "Target data type should be numeric for regression type problems.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                    details={}).to_dict())

        if is_multiclass(self.problem_type):
            if value_counts.min() <= 1:
                least_populated = value_counts[value_counts <= 1]
                details = {
                    "least_populated_class_labels":
                    least_populated.index.tolist()
                }
                results["errors"].append(
                    DataCheckError(
                        message=
                        "Target does not have at least two instances per class which is required for multiclass classification",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                        details=details).to_dict())
            if len(unique_values) <= 2:
                details = {"num_classes": len(unique_values)}
                results["errors"].append(
                    DataCheckError(
                        message=
                        "Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                        details=details).to_dict())

            num_class_to_num_value_ratio = len(unique_values) / len(y)
            if num_class_to_num_value_ratio >= self.multiclass_continuous_threshold:
                details = {
                    "class_to_value_ratio": num_class_to_num_value_ratio
                }
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Target has a large number of unique values, could be regression type problem.",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                        details=details).to_dict())

        any_neg = not (y_df > 0).all() if y.logical_type in [
            ww.logical_types.Integer, ww.logical_types.Double
        ] else None
        if any_neg and self.objective.positive_only:
            details = {
                "Count of offending values":
                sum(val <= 0 for val in y_df.values.flatten())
            }
            results["errors"].append(
                DataCheckError(
                    message=
                    f"Target has non-positive values which is not supported for {self.objective.name}",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.
                    TARGET_INCOMPATIBLE_OBJECTIVE,
                    details=details).to_dict())

        if X is not None:
            X = infer_feature_types(X)
            X_index = list(X.to_dataframe().index)
            y_index = list(y_df.index)
            X_length = len(X_index)
            y_length = len(y_index)
            if X_length != y_length:
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Input target and features have different lengths",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.MISMATCHED_LENGTHS,
                        details={
                            "features_length": X_length,
                            "target_length": y_length
                        }).to_dict())

            if X_index != y_index:
                if set(X_index) == set(y_index):
                    results["warnings"].append(
                        DataCheckWarning(
                            message=
                            "Input target and features have mismatched indices order",
                            data_check_name=self.name,
                            message_code=DataCheckMessageCode.
                            MISMATCHED_INDICES_ORDER,
                            details={}).to_dict())
                else:
                    index_diff_not_in_X = list(set(y_index) -
                                               set(X_index))[:10]
                    index_diff_not_in_y = list(set(X_index) -
                                               set(y_index))[:10]
                    results["warnings"].append(
                        DataCheckWarning(
                            message=
                            "Input target and features have mismatched indices",
                            data_check_name=self.name,
                            message_code=DataCheckMessageCode.
                            MISMATCHED_INDICES,
                            details={
                                "indices_not_in_features": index_diff_not_in_X,
                                "indices_not_in_target": index_diff_not_in_y
                            }).to_dict())

        return results
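
The actions list is machine-readable, so a caller can apply the suggested fix directly. A hedged sketch of honoring the IMPUTE_COL action with plain pandas (illustrative only, not EvalML's data-check-action tooling):

# Illustrative only: apply the impute_strategy from the action metadata to the target.
import pandas as pd

y = pd.Series([0, 1, None, None])
impute_strategy = "most_frequent"   # from the action metadata in the docstring example

if impute_strategy == "mean":       # suggested for regression targets
    y = y.fillna(y.mean())
else:                               # "most_frequent", suggested for classification targets
    y = y.fillna(y.mode().iloc[0])
print(y.tolist())                   # [0.0, 1.0, 0.0, 0.0]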
Example 13
def _compute_shap_values(pipeline, features, training_data=None):
    """Computes SHAP values for each feature.

    Arguments:
        pipeline (PipelineBase): Trained pipeline whose predictions we want to explain with SHAP.
        features (pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        training_data (pd.DataFrame): Training data the pipeline was fit on.
            For non-tree estimators, we need a sample of training data for the KernelSHAP algorithm.

    Returns:
        dict or list(dict): For regression problems, a dictionary mapping a feature name to a list of SHAP values.
            For classification problems, returns a list of dictionaries. One for each class.
    """
    estimator = pipeline.estimator
    if estimator.model_family == ModelFamily.BASELINE:
        raise ValueError(
            "You passed in a baseline pipeline. These are simple enough that SHAP values are not needed."
        )

    feature_names = features.columns

    # This is to make sure all dtypes are numeric - SHAP algorithms will complain otherwise.
    # Sklearn components do this under-the-hood so we're not changing the data the model was trained on.
    # Catboost can naturally handle string-encoded categorical features so we don't need to convert to numeric.
    if estimator.model_family != ModelFamily.CATBOOST:
        features = check_array(features.values)

    if estimator.model_family.is_tree_estimator():
        # Use tree_path_dependent to avoid linear runtime with dataset size
        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.TreeExplainer(
                estimator._component_obj,
                feature_perturbation="tree_path_dependent")
        if ws:
            logger.debug(
                f"_compute_shap_values TreeExplainer: {ws[0].message}")
        shap_values = explainer.shap_values(features, check_additivity=False)
        # shap only outputs values for positive class for Catboost/Xgboost binary estimators.
        # this modifies the output to match the output format of other binary estimators.
        # Ok to fill values of negative class with zeros since the negative class will get dropped
        # in the UI anyways.
        if estimator.model_family in {
                ModelFamily.CATBOOST, ModelFamily.XGBOOST
        } and is_binary(pipeline.problem_type):
            shap_values = [np.zeros(shap_values.shape), shap_values]
    else:
        if training_data is None:
            raise ValueError(
                "You must pass in a value for parameter 'training_data' when the pipeline "
                "does not have a tree-based estimator. "
                f"Current estimator model family is {estimator.model_family}.")

        # More than 100 datapoints can negatively impact runtime according to SHAP
        # https://github.com/slundberg/shap/blob/master/shap/explainers/kernel.py#L114
        sampled_training_data_features = shap.sample(training_data, 100)
        sampled_training_data_features = check_array(
            sampled_training_data_features)

        if is_regression(pipeline.problem_type):
            link_function = "identity"
            decision_function = estimator._component_obj.predict
        else:
            link_function = "logit"
            decision_function = estimator._component_obj.predict_proba
        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.KernelExplainer(decision_function,
                                             sampled_training_data_features,
                                             link_function)
            shap_values = explainer.shap_values(features)
        if ws:
            logger.debug(
                f"_compute_shap_values KernelExplainer: {ws[0].message}")

    # classification problem
    if isinstance(shap_values, list):
        mappings = []
        for class_shap_values in shap_values:
            mappings.append(
                _create_dictionary(class_shap_values, feature_names))
        return mappings
    # regression problem
    elif isinstance(shap_values, np.ndarray):
        return _create_dictionary(shap_values, feature_names)
    else:
        raise ValueError(
            f"Unknown shap_values datatype {str(type(shap_values))}!")
Example 14
def test_explain_predictions_best_worst_and_explain_predictions(
        mock_make_table, mock_default_metrics, problem_type, output_format,
        answer, explain_predictions_answer, custom_index):
    if output_format == "text":
        mock_make_table.return_value = "table goes here"
    elif output_format == "dataframe":
        shap_table = pd.DataFrame({
            "feature_names": [0],
            "feature_values": [0],
            "qualitative_explanation": [0],
            "quantitative_explanation": [0],
        })
        # Use side effect so that we always get a new copy of the dataframe
        mock_make_table.side_effect = lambda *args, **kwargs: shap_table.copy()
    else:
        mock_make_table.return_value = {
            "explanations": ["explanation_dictionary_goes_here"]
        }

    pipeline = MagicMock()
    pipeline.parameters = "Parameters go here"
    input_features = pd.DataFrame({"a": [3, 4]}, index=custom_index)
    pipeline.problem_type = problem_type
    pipeline.name = "Test Pipeline Name"
    pipeline.compute_estimator_features.return_value = ww.DataTable(
        input_features)

    def _add_custom_index(answer, index_best, index_worst, output_format):

        if output_format == "text":
            answer = answer.format(index_0=index_best, index_1=index_worst)
        elif output_format == "dataframe":
            col_name = "prefix" if "prefix" in answer.columns else "rank"
            n_repeats = answer[col_name].value_counts().tolist()[0]
            answer['index_id'] = [index_best] * n_repeats + [index_worst
                                                             ] * n_repeats
        else:
            answer["explanations"][0]["predicted_values"][
                "index_id"] = index_best
            answer["explanations"][1]["predicted_values"][
                "index_id"] = index_worst
        return answer

    if is_regression(problem_type):
        abs_error_mock = MagicMock(__name__="abs_error")
        abs_error_mock.return_value = pd.Series([4., 1.], dtype="float64")
        mock_default_metrics.__getitem__.return_value = abs_error_mock
        pipeline.predict.return_value = ww.DataColumn(pd.Series([2, 1]))
        y_true = pd.Series([3, 2], index=custom_index)
        answer = _add_custom_index(answer,
                                   index_best=custom_index[1],
                                   index_worst=custom_index[0],
                                   output_format=output_format)
    elif is_binary(problem_type):
        pipeline.classes_.return_value = ["benign", "malignant"]
        cross_entropy_mock = MagicMock(__name__="cross_entropy")
        mock_default_metrics.__getitem__.return_value = cross_entropy_mock
        cross_entropy_mock.return_value = pd.Series([0.2, 0.78])
        pipeline.predict_proba.return_value = ww.DataTable(
            pd.DataFrame({
                "benign": [0.05, 0.1],
                "malignant": [0.95, 0.9]
            }))
        pipeline.predict.return_value = ww.DataColumn(
            pd.Series(["malignant"] * 2))
        y_true = pd.Series(["malignant", "benign"], index=custom_index)
        answer = _add_custom_index(answer,
                                   index_best=custom_index[0],
                                   index_worst=custom_index[1],
                                   output_format=output_format)
    else:
        # Multiclass text output is formatted slightly different so need to account for that
        if output_format == "text":
            mock_make_table.return_value = multiclass_table
        pipeline.classes_.return_value = ["setosa", "versicolor", "virginica"]
        cross_entropy_mock = MagicMock(__name__="cross_entropy")
        mock_default_metrics.__getitem__.return_value = cross_entropy_mock
        cross_entropy_mock.return_value = pd.Series([0.15, 0.34])
        pipeline.predict_proba.return_value = ww.DataTable(
            pd.DataFrame({
                "setosa": [0.8, 0.2],
                "versicolor": [0.1, 0.75],
                "virginica": [0.1, 0.05]
            }))
        pipeline.predict.return_value = ww.DataColumn(
            pd.Series(["setosa", "versicolor"]))
        y_true = pd.Series(["setosa", "versicolor"], index=custom_index)
        answer = _add_custom_index(answer,
                                   index_best=custom_index[0],
                                   index_worst=custom_index[1],
                                   output_format=output_format)

    report = explain_predictions(pipeline,
                                 input_features,
                                 y=y_true,
                                 indices_to_explain=[0, 1],
                                 output_format=output_format)
    if output_format == "text":
        compare_two_tables(report.splitlines(),
                           explain_predictions_answer.splitlines())
    elif output_format == "dataframe":
        assert report.columns.tolist(
        ) == explain_predictions_answer.columns.tolist()
        pd.testing.assert_frame_equal(
            report, explain_predictions_answer[report.columns])
    else:
        assert report == explain_predictions_answer

    best_worst_report = explain_predictions_best_worst(
        pipeline,
        input_features,
        y_true=y_true,
        num_to_explain=1,
        output_format=output_format)
    if output_format == "text":
        compare_two_tables(best_worst_report.splitlines(), answer.splitlines())
    elif output_format == "dataframe":
        # Check dataframes equal without caring about column order
        assert sorted(best_worst_report.columns.tolist()) == sorted(
            answer.columns.tolist())
        pd.testing.assert_frame_equal(best_worst_report,
                                      answer[best_worst_report.columns])
    else:
        assert best_worst_report == answer