Example 1
def _make_single_prediction_shap_table(pipeline, input_features, top_k=3, training_data=None,
                                       include_shap_values=False, output_format="text"):
    """Creates table summarizing the top_k positive and top_k negative contributing features to the prediction of a single datapoint.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        top_k (int): How many of the highest/lowest features to include in the table.
        training_data (pd.DataFrame): Training data the pipeline was fit on.
            This is required for non-tree estimators because we need a sample of training data for the KernelSHAP algorithm.
        include_shap_values (bool): Whether the SHAP values should be included in an extra column in the output.
            Default is False.
        output_format (str): The desired format of the output. Can be "text" or "dict". Defaults to "text".

    Returns:
        str or dict: Table summarizing the top contributing features, in the requested format.
    """
    pipeline_features = pipeline.compute_estimator_features(input_features)

    shap_values = _compute_shap_values(pipeline, pipeline_features, training_data)
    normalized_shap_values = _normalize_shap_values(shap_values)

    class_names = None
    if hasattr(pipeline, "classes_"):
        class_names = pipeline.classes_

    table_makers = {ProblemTypes.REGRESSION: _RegressionSHAPTable(),
                    ProblemTypes.BINARY: _BinarySHAPTable(class_names),
                    ProblemTypes.MULTICLASS: _MultiClassSHAPTable(class_names)}

    table_maker_class = table_makers[pipeline.problem_type]

    table_maker = table_maker_class.make_text if output_format == "text" else table_maker_class.make_dict

    return table_maker(shap_values, normalized_shap_values, pipeline_features, top_k, include_shap_values)
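A minimal usage sketch for the helper above. The synthetic data and the pipeline construction are assumptions for illustration (the constructor mirrors Example 4), and the helper itself is assumed to be in scope from the module where it is defined.

import numpy as np
import pandas as pd
from evalml.pipelines import BinaryClassificationPipeline

# Hypothetical training data for a binary problem.
X = pd.DataFrame(np.random.random((25, 5)), columns=[f"feat_{i}" for i in range(5)])
y = pd.Series(np.random.randint(0, 2, size=25))

# Tree-based estimator, so no training_data sample is needed for the KernelSHAP fallback.
pipeline = BinaryClassificationPipeline(["Random Forest Classifier"])
pipeline.fit(X, y)

# Explain a single row; output_format="dict" returns a dict instead of formatted text.
explanation = _make_single_prediction_shap_table(pipeline,
                                                 input_features=X.iloc[[0]],
                                                 top_k=3,
                                                 include_shap_values=True,
                                                 output_format="dict")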
Example 2
def _make_single_prediction_shap_table(pipeline,
                                       input_features,
                                       y,
                                       index_to_explain,
                                       top_k=3,
                                       include_shap_values=False,
                                       output_format="text"):
    """Creates table summarizing the top_k_features positive and top_k_features negative contributing features to the prediction of a single datapoint.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        y (pd.Series): Labels for the input data.
        index_to_explain (int): Index of the row in input_features whose prediction should be explained.
        top_k (int): How many of the highest/lowest features to include in the table.
        include_shap_values (bool): Whether the SHAP values should be included in an extra column in the output.
            Default is False.
        output_format (str): The desired format of the output.  Can be "text", "dict", or "dataframe".

    Returns:
        str, dict, or pd.DataFrame: Table, in the format requested by output_format.

    Raises:
        ValueError: if requested index results in a NaN in the computed features.
    """
    pipeline_features = pipeline.compute_estimator_features(input_features,
                                                            y).to_dataframe()
    pipeline_features_row = pipeline_features.iloc[[index_to_explain]]
    if pipeline_features_row.isna().any(axis=None):
        raise ValueError(
            f"Requested index ({index_to_explain}) produces NaN in features.")
    shap_values = _compute_shap_values(
        pipeline,
        pipeline_features_row,
        training_data=pipeline_features.dropna(axis=0))
    normalized_shap_values = _normalize_shap_values(shap_values)

    class_names = None
    if hasattr(pipeline, "classes_"):
        class_names = pipeline.classes_

    table_makers = {
        ProblemTypes.REGRESSION: _RegressionSHAPTable(),
        ProblemTypes.BINARY: _BinarySHAPTable(class_names),
        ProblemTypes.MULTICLASS: _MultiClassSHAPTable(class_names),
        ProblemTypes.TIME_SERIES_REGRESSION: _RegressionSHAPTable(),
        ProblemTypes.TIME_SERIES_BINARY: _BinarySHAPTable(class_names),
        ProblemTypes.TIME_SERIES_MULTICLASS: _MultiClassSHAPTable(class_names)
    }

    table_maker_class = table_makers[pipeline.problem_type]
    table_maker = {
        "text": table_maker_class.make_text,
        "dict": table_maker_class.make_dict,
        "dataframe": table_maker_class.make_dataframe
    }[output_format]

    return table_maker(shap_values, normalized_shap_values,
                       pipeline_features_row, top_k, include_shap_values)
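For the newer signature above, which also takes the target y, an index_to_explain, and a "dataframe" output option, the call changes along these lines (reusing the hypothetical fitted pipeline, X, and y from the previous sketch):

# Explain row 3 of X; "dataframe" output returns the table as a pd.DataFrame.
# A row whose computed features contain NaN raises ValueError instead.
explanation = _make_single_prediction_shap_table(pipeline,
                                                 input_features=X,
                                                 y=y,
                                                 index_to_explain=3,
                                                 top_k=3,
                                                 output_format="dataframe")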
Example 3
def test_value_errors_raised(mock_tree_explainer, pipeline, exception, match):

    if "xgboost" in pipeline.name.lower():
        pytest.importorskip("xgboost", "Skipping test because xgboost is not installed.")
    if "catboost" in pipeline.name.lower():
        pytest.importorskip("catboost", "Skipping test because catboost is not installed.")

    with pytest.raises(exception, match=match):
        _ = _compute_shap_values(pipeline({"pipeline": {"gap": 1, "max_delay": 1}}), pd.DataFrame(np.random.random((2, 16))))
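The decorators for this test are not part of the excerpt. A plausible wiring, sketched with hypothetical patch target and parametrize entries, patches shap's TreeExplainer and supplies (pipeline class, expected exception, message pattern) triples:

from unittest.mock import patch

import pytest

@patch("shap.TreeExplainer")  # assumed patch target; supplies mock_tree_explainer
@pytest.mark.parametrize("pipeline,exception,match", [
    # (SomeTimeSeriesPipelineClass, ValueError, "expected message pattern"),  # illustrative only
])
def test_value_errors_raised(mock_tree_explainer, pipeline, exception, match):
    ...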
Example 4
def test_compute_shap_values_catches_shap_tree_warnings(mock_tree_explainer, mock_debug, X_y_binary, caplog):
    X, y = X_y_binary
    pipeline = BinaryClassificationPipeline(["Random Forest Classifier"])

    def raise_warning_from_shap(estimator, feature_perturbation):
        warnings.warn("Shap raised a warning!")
        mock = MagicMock()
        mock.shap_values.return_value = np.zeros(10)
        return mock

    mock_tree_explainer.side_effect = raise_warning_from_shap

    _ = _compute_shap_values(pipeline, pd.DataFrame(X))
    mock_debug.debug.assert_called_with("_compute_shap_values TreeExplainer: Shap raised a warning!")
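This test exercises a warnings-to-logger hand-off inside _compute_shap_values. A standalone sketch of that pattern (not the library's exact implementation) looks like this:

import logging
import warnings

logger = logging.getLogger(__name__)

def _call_with_logged_warnings(build_explainer):
    # Capture any warnings emitted while shap constructs its explainer and
    # forward them to the debug log instead of letting them reach the caller.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        explainer = build_explainer()
    for warning in caught:
        logger.debug(f"_compute_shap_values TreeExplainer: {warning.message}")
    return explainer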
Example 5
def calculate_shap_for_test(training_data, y, pipeline, n_points_to_explain):
    """Helper function to compute the SHAP values for n_points_to_explain for a given pipeline."""
    points_to_explain = training_data[:n_points_to_explain]
    pipeline.fit(training_data, y)
    return _compute_shap_values(pipeline, pd.DataFrame(points_to_explain), training_data)
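A hypothetical call to the helper above with synthetic data; the pipeline construction mirrors Example 4 and is an assumption for illustration.

import numpy as np
import pandas as pd
from evalml.pipelines import BinaryClassificationPipeline

# Synthetic binary classification data (illustration only).
X = pd.DataFrame(np.random.random((30, 4)), columns=[f"feat_{i}" for i in range(4)])
y = pd.Series(np.random.randint(0, 2, size=30))

pipeline = BinaryClassificationPipeline(["Random Forest Classifier"])

# Fits the pipeline on all 30 rows and computes SHAP values for the first 2.
shap_values = calculate_shap_for_test(X, y, pipeline, n_points_to_explain=2)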