def test_verify_linear_model_coefficient_explanation(self):
        # Validate our explainer against an explainable linear model
        X, y = shap.datasets.adult()
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=7)
        # Fit a logistic regression classifier
        model = create_sklearn_logistic_regressor(x_train, y_train)

        # Create tabular explainer
        exp = TabularExplainer(model,
                               x_train,
                               features=list(range(x_train.shape[1])))
        test_logger.info(
            "Running explain model for test_verify_linear_model_coefficient_explanation"
        )
        # Validate evaluation sampling
        policy = {
            ExplainParams.SAMPLING_POLICY:
            SamplingPolicy(allow_eval_sampling=True)
        }
        explanation = exp.explain_global(x_test, **policy)
        mean_train = np.mean(x_train.values, axis=0)
        # Retrieve the model coefficients
        coefficients = model.coef_[0]
        # Scale the coefficients by the feature means for a rough
        # ground truth of importance
        norm_coeff = mean_train * coefficients
        # Order features from most to least important
        norm_coeff_imp = np.abs(norm_coeff).argsort()[..., ::-1]
        # Calculate the correlation
        validate_correlation(explanation.global_importance_rank,
                             norm_coeff_imp, 0.76)
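
The validate_correlation helper used above is not shown in this listing. A minimal sketch of what such a check could do, assuming it computes the Spearman rank correlation between the two importance orderings and asserts it clears the given threshold (the body below is an assumption, not the actual test utility):

import numpy as np
from scipy.stats import spearmanr

def validate_correlation(rank_a, rank_b, threshold):
    # Hypothetical sketch: rank_a and rank_b are arrays of feature
    # indices ordered from most to least important; require their
    # Spearman rank correlation to exceed the threshold.
    correlation, _ = spearmanr(np.asarray(rank_a), np.asarray(rank_b))
    assert correlation > threshold, (
        "correlation %.3f below threshold %.3f" % (correlation, threshold))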
Example #2
    def test_get_local_raw_explanations_sparse_binary_classification(
            self, mimic_explainer):
        x_train, x_test, y_train, _, classes, _ = \
            create_binary_sparse_newsgroups_data()
        # Fit a logistic regression classifier on the sparse data
        model = create_sklearn_logistic_regressor(x_train, y_train)

        explainer = mimic_explainer(
            model,
            x_train,
            LinearExplainableModel,
            explainable_model_args={'sparse_data': True},
            classes=classes)
        global_explanation = explainer.explain_global(x_test)
        assert global_explanation.method == LINEAR_METHOD

        num_engineered_feats = x_train.shape[1]
        # Identity feature map: the first five engineered features map
        # one-to-one onto five raw features (rows index raw features,
        # columns index engineered features)
        feature_map = np.eye(5, num_engineered_feats)
        feature_names = [str(i) for i in range(feature_map.shape[0])]
        raw_names = feature_names[:feature_map.shape[0]]
        global_raw_explanation = global_explanation.get_raw_explanation(
            [feature_map], raw_feature_names=raw_names)
        self.validate_global_raw_explanation_classification(
            global_explanation,
            global_raw_explanation,
            feature_map,
            classes,
            feature_names,
            is_sparse=True)
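
In get_raw_explanation, the feature map ties engineered features back to raw ones: entry [i, j] is the weight with which engineered feature j contributes to raw feature i. Conceptually, importances are folded back through this map as in the following sketch (fold_to_raw is an illustrative name, not part of the explanation API):

import numpy as np

def fold_to_raw(engineered_importances, feature_map):
    # Project importances of shape [n_samples, num_engineered] onto
    # raw features using a map of shape [num_raw, num_engineered].
    return engineered_importances @ feature_map.T

# With the identity map above, the raw importances are simply the
# first five engineered importances.
engineered = np.random.rand(3, 20)
raw = fold_to_raw(engineered, np.eye(5, 20))
assert raw.shape == (3, 5)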
    def test_verify_pipeline_model_coefficient_explanation(self):
        # Validate our explainer against an explainable linear model
        X, y = shap.datasets.adult()
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=7)
        # Note: in the pipeline case the explainer falls back to KernelExplainer;
        # in the plain linear case it uses LinearExplainer, which is much faster
        pipeline = [True, False]
        threshold = [0.85, 0.76]
        for idx, is_pipeline in enumerate(pipeline):
            # Fit a logistic regression classifier
            model = create_sklearn_logistic_regressor(x_train,
                                                      y_train,
                                                      pipeline=is_pipeline)

            # Create tabular explainer
            exp = TabularExplainer(model,
                                   x_train,
                                   features=list(range(x_train.shape[1])))
            test_logger.info(
                "Running explain model for test_verify_pipeline_model_coefficient_explanation"
            )
            # Validate evaluation sampling
            policy = {
                ExplainParams.SAMPLING_POLICY:
                SamplingPolicy(allow_eval_sampling=True)
            }
            explanation = exp.explain_global(x_test, **policy)
            mean_train = np.mean(x_train.values, axis=0)
            # Retrieve the model coefficients, unwrapping the pipeline if needed
            if isinstance(model, Pipeline):
                model = model.steps[0][1]
            coefficients = model.coef_[0]
            # Scale the coefficients by the feature means for a rough
            # ground truth of importance
            norm_coeff = mean_train * coefficients
            # Order features from most to least important
            norm_coeff_imp = np.abs(norm_coeff).argsort()[..., ::-1]
            # Calculate the correlation
            validate_correlation(explanation.global_importance_rank,
                                 norm_coeff_imp, threshold[idx])
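
The pipeline unwrapping above (model.steps[0][1]) assumes the test helper places the logistic regressor as the first pipeline step. The helper is defined in the shared test utilities; a plausible minimal sketch of it, shown here only as an assumption:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def create_sklearn_logistic_regressor(X, y, pipeline=False):
    # Hypothetical sketch of the test helper: fit a logistic
    # regression, optionally wrapped in a one-step Pipeline so the
    # explainer must treat it as an opaque model.
    lin = LogisticRegression(solver='liblinear')
    if pipeline:
        lin = Pipeline([('logistic', lin)])
    return lin.fit(X, y)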
Example #5
    def test_validate_against_shap(self):
        # Validate our explainer against shap library directly
        X, y = shap.datasets.adult()
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.02,
                                                            random_state=7)
        # Fit several classifiers
        tree_classifiers = [
            create_sklearn_random_forest_classifier(x_train, y_train)
        ]
        non_tree_classifiers = [
            create_sklearn_logistic_regressor(x_train, y_train)
        ]
        tree_regressors = [
            create_sklearn_random_forest_regressor(x_train, y_train)
        ]
        non_tree_regressors = [
            create_sklearn_linear_regressor(x_train, y_train)
        ]
        # For each model, validate we get the same results as calling shap directly
        test_logger.info(
            "Running tree classifiers in test_validate_against_shap")
        for model in tree_classifiers:
            # Run shap directly for comparison
            exp = shap.TreeExplainer(model)
            explanation = exp.shap_values(x_test)
            shap_overall_imp = get_shap_imp_classification(explanation)
            overall_imp = tabular_explainer_imp(model, x_train, x_test)
            validate_correlation(overall_imp, shap_overall_imp, 0.95)

        test_logger.info(
            "Running non tree classifiers in test_validate_against_shap")
        for model in non_tree_classifiers:
            # Run shap directly for comparison
            clustered = shap.kmeans(x_train, 10)
            exp = shap.KernelExplainer(model.predict_proba, clustered)
            explanation = exp.shap_values(x_test)
            shap_overall_imp = get_shap_imp_classification(explanation)
            overall_imp = tabular_explainer_imp(model, x_train, x_test)
            validate_correlation(overall_imp, shap_overall_imp, 0.95)

        test_logger.info(
            "Running tree regressors in test_validate_against_shap")
        for model in tree_regressors:
            # Run shap directly for comparison
            exp = shap.TreeExplainer(model)
            explanation = exp.shap_values(x_test)
            shap_overall_imp = get_shap_imp_regression(explanation)
            overall_imp = tabular_explainer_imp(model, x_train, x_test)
            validate_correlation(overall_imp, shap_overall_imp, 0.95)

        test_logger.info(
            "Running non tree regressors in test_validate_against_shap")
        for model in non_tree_regressors:
            # Run shap directly for comparison
            clustered = shap.kmeans(x_train, 10)
            exp = shap.KernelExplainer(model.predict, clustered)
            explanation = exp.shap_values(x_test)
            shap_overall_imp = get_shap_imp_regression(explanation)
            overall_imp = tabular_explainer_imp(model, x_train, x_test)
            validate_correlation(overall_imp, shap_overall_imp, 0.95)

        if not rapids_installed:
            pytest.skip("cuML not installed; skipping GPU explainer test")
        test_logger.info(
            "Running GPU non tree classifiers in test_validate_against_shap")
        x_train, x_test, y_train, y_validation, _, _ = create_cancer_data()
        gpu_non_tree_classifiers = [
            create_cuml_svm_classifier(x_train.astype(np.float32),
                                       y_train.astype(np.float32))
        ]
        for model in gpu_non_tree_classifiers:
            # Run the GPU KernelExplainer directly for comparison
            exp = KernelExplainer(model=model.predict_proba,
                                  data=x_train.astype(np.float32))
            explanation = exp.shap_values(x_test.astype(np.float32))
            shap_overall_imp = get_shap_imp_classification(explanation)
            overall_imp = tabular_explainer_imp(model,
                                                x_train.astype(np.float32),
                                                x_test.astype(np.float32),
                                                use_gpu=True)
            validate_correlation(overall_imp, shap_overall_imp, 0.95)
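
The rapids_installed flag guards the GPU branch so the test skips cleanly on machines without RAPIDS. It is typically set at module import time with a pattern like this sketch (the exact imports in the real test module may differ):

try:
    import cuml  # noqa: F401
    from cuml.explainer import KernelExplainer  # GPU SHAP implementation
    rapids_installed = True
except ImportError:
    rapids_installed = False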