def test_verify_linear_model_coefficient_explanation(self):
    """Check the explainer's global ranking against logistic-regression coefficients.

    A logistic regression is fitted on a train split of shap's adult
    dataset, explained with ``TabularExplainer`` on the held-out split, and
    the resulting global importance ranking is compared with a ranking
    derived from the model's own coefficients.
    """
    features, labels = shap.datasets.adult()
    x_train, x_test, y_train, _ = train_test_split(
        features, labels, test_size=0.2, random_state=7)

    classifier = create_sklearn_logistic_regressor(x_train, y_train)

    # Features are identified by their column position.
    feature_indices = list(range(x_train.shape[1]))
    explainer = TabularExplainer(classifier, x_train, features=feature_indices)
    test_logger.info(
        "Running explain model for test_verify_linear_model_coefficient_explanation"
    )

    # Explain the held-out split with evaluation sampling allowed.
    explain_kwargs = {
        ExplainParams.SAMPLING_POLICY: SamplingPolicy(allow_eval_sampling=True)
    }
    explanation = explainer.explain_global(x_test, **explain_kwargs)

    # Rough ground truth: weight each coefficient by the training mean of
    # its feature, then rank features by descending absolute value.
    weighted_coefficients = np.mean(x_train.values, axis=0) * classifier.coef_[0]
    expected_rank = np.argsort(np.abs(weighted_coefficients))[..., ::-1]

    validate_correlation(explanation.global_importance_rank, expected_rank, 0.76)
def test_get_local_raw_explanations_sparse_binary_classification(
        self, mimic_explainer):
    """Validate the raw global explanation of a sparse binary classifier.

    A mimic explainer backed by ``LinearExplainableModel`` explains a
    logistic-regression model, and the engineered explanation is mapped to
    raw features through a single feature map before being validated.
    """
    x_train, x_test, y_train, _, classes, _ = (
        create_binary_sparse_newsgroups_data())

    # Train the model to be explained (built by the logistic-regressor helper).
    model = create_sklearn_logistic_regressor(x_train, y_train)

    surrogate_args = {'sparse_data': True}
    explainer = mimic_explainer(
        model,
        x_train,
        LinearExplainableModel,
        explainable_model_args=surrogate_args,
        classes=classes)
    engineered_explanation = explainer.explain_global(x_test)
    assert engineered_explanation.method == LINEAR_METHOD

    # 5-row map with ones on the main diagonal and one column per
    # engineered feature; the raw names are simply the row indices.
    engineered_count = x_train.shape[1]
    feature_map = np.eye(5, engineered_count)
    raw_count = feature_map.shape[0]
    feature_names = list(map(str, range(raw_count)))
    raw_names = list(feature_names)

    raw_explanation = engineered_explanation.get_raw_explanation(
        [feature_map], raw_feature_names=raw_names)
    self.validate_global_raw_explanation_classification(
        engineered_explanation,
        raw_explanation,
        feature_map,
        classes,
        feature_names,
        is_sparse=True)
def test_verify_pipeline_model_coefficient_explanation(self):
    """Compare explainer rankings with coefficient-based rankings.

    The check runs twice: once with the logistic regression wrapped in a
    scikit-learn ``Pipeline`` and once with the bare estimator. Each variant
    must reach its own correlation threshold between the explainer's global
    importance ranking and a ranking derived from the model coefficients.
    """
    X, y = shap.datasets.adult()
    x_train, x_test, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=7)

    # Note: in pipeline case, we use KernelExplainer;
    # in linear case we use LinearExplainer which is much faster
    # Each case pairs the pipeline flag with its required correlation.
    cases = ((True, 0.85), (False, 0.76))

    # The training data is identical for every case, so the per-feature
    # means used for normalization are computed once.
    mean_train = np.mean(x_train.values, axis=0)

    for is_pipeline, min_correlation in cases:
        # Fit a logistic regression classifier
        model = create_sklearn_logistic_regressor(
            x_train, y_train, pipeline=is_pipeline)

        # Create tabular explainer
        exp = TabularExplainer(
            model, x_train, features=list(range(x_train.shape[1])))
        # Log under this test's own name (previously logged the name of the
        # linear-model test, which made log output misleading).
        test_logger.info(
            "Running explain model for test_verify_pipeline_model_coefficient_explanation"
        )

        # Validate evaluation sampling
        policy = {
            ExplainParams.SAMPLING_POLICY: SamplingPolicy(allow_eval_sampling=True)
        }
        explanation = exp.explain_global(x_test, **policy)

        # Retrieve the model coefficients; for a pipeline they live on the
        # estimator stored in its first step.
        estimator = model.steps[0][1] if isinstance(model, Pipeline) else model
        coefficients = estimator.coef_[0]

        # Normalize the coefficients by mean for a rough ground-truth of importance
        norm_coeff = mean_train * coefficients
        # Order coefficients by importance (descending absolute value)
        norm_coeff_imp = np.abs(norm_coeff).argsort()[..., ::-1]

        # Calculate the correlation
        validate_correlation(
            explanation.global_importance_rank, norm_coeff_imp, min_correlation)
def test_validate_against_shap(self):
    """Compare the tabular explainer's importances with shap's own output.

    One tree and one non-tree model are fitted for both classification and
    regression. For each, the importance obtained by calling shap directly
    must correlate (threshold 0.95) with the importance returned by
    ``tabular_explainer_imp``.
    """
    features, labels = shap.datasets.adult()
    x_train, x_test, y_train, _ = train_test_split(
        features, labels, test_size=0.02, random_state=7)

    # Fit every model up front, in a fixed order, before any comparison runs.
    forest_classifier = create_sklearn_random_forest_classifier(x_train, y_train)
    logistic_classifier = create_sklearn_logistic_regressor(x_train, y_train)
    forest_regressor = create_sklearn_random_forest_regressor(x_train, y_train)
    linear_regressor = create_sklearn_linear_regressor(x_train, y_train)

    def check_against_shap(model, shap_importance):
        """Correlate the tabular explainer's importance with shap's."""
        ours = tabular_explainer_imp(model, x_train, x_test)
        validate_correlation(ours, shap_importance, 0.95)

    test_logger.info(
        "Running tree classifiers in test_validate_against_shap")
    tree_values = shap.TreeExplainer(forest_classifier).shap_values(x_test)
    check_against_shap(
        forest_classifier, get_shap_imp_classification(tree_values))

    test_logger.info(
        "Running non tree classifiers in test_validate_against_shap")
    # The kernel explainer uses a 10-cluster k-means summary as background.
    background = shap.kmeans(x_train, 10)
    kernel = shap.KernelExplainer(logistic_classifier.predict_proba, background)
    kernel_values = kernel.shap_values(x_test)
    check_against_shap(
        logistic_classifier, get_shap_imp_classification(kernel_values))

    test_logger.info(
        "Running tree regressors in test_validate_against_shap")
    tree_values = shap.TreeExplainer(forest_regressor).shap_values(x_test)
    check_against_shap(
        forest_regressor, get_shap_imp_regression(tree_values))

    test_logger.info(
        "Running non tree regressors in test_validate_against_shap")
    background = shap.kmeans(x_train, 10)
    kernel = shap.KernelExplainer(linear_regressor.predict, background)
    kernel_values = kernel.shap_values(x_test)
    check_against_shap(
        linear_regressor, get_shap_imp_regression(kernel_values))
def test_validate_against_shap(self):
    """Compare the tabular explainer's importances with shap's own output.

    Runs the CPU comparisons for tree and non-tree classifiers and
    regressors first. The test is then skipped if ``rapids_installed`` is
    falsy; otherwise the same comparison is repeated for a cuML SVM
    classifier with ``use_gpu=True``.
    """
    features, labels = shap.datasets.adult()
    x_train, x_test, y_train, _ = train_test_split(
        features, labels, test_size=0.02, random_state=7)

    # Fit every CPU model up front, in a fixed order, before comparing.
    forest_classifier = create_sklearn_random_forest_classifier(x_train, y_train)
    logistic_classifier = create_sklearn_logistic_regressor(x_train, y_train)
    forest_regressor = create_sklearn_random_forest_regressor(x_train, y_train)
    linear_regressor = create_sklearn_linear_regressor(x_train, y_train)

    def check_against_shap(model, shap_importance):
        """Correlate the tabular explainer's importance with shap's."""
        ours = tabular_explainer_imp(model, x_train, x_test)
        validate_correlation(ours, shap_importance, 0.95)

    test_logger.info(
        "Running tree classifiers in test_validate_against_shap")
    tree_values = shap.TreeExplainer(forest_classifier).shap_values(x_test)
    check_against_shap(
        forest_classifier, get_shap_imp_classification(tree_values))

    test_logger.info(
        "Running non tree classifiers in test_validate_against_shap")
    # The kernel explainer uses a 10-cluster k-means summary as background.
    background = shap.kmeans(x_train, 10)
    kernel = shap.KernelExplainer(logistic_classifier.predict_proba, background)
    kernel_values = kernel.shap_values(x_test)
    check_against_shap(
        logistic_classifier, get_shap_imp_classification(kernel_values))

    test_logger.info(
        "Running tree regressors in test_validate_against_shap")
    tree_values = shap.TreeExplainer(forest_regressor).shap_values(x_test)
    check_against_shap(
        forest_regressor, get_shap_imp_regression(tree_values))

    test_logger.info(
        "Running non tree regressors in test_validate_against_shap")
    background = shap.kmeans(x_train, 10)
    kernel = shap.KernelExplainer(linear_regressor.predict, background)
    kernel_values = kernel.shap_values(x_test)
    check_against_shap(
        linear_regressor, get_shap_imp_regression(kernel_values))

    # Guard clause: pytest.skip raises, so nothing below runs without cuML.
    if not rapids_installed:
        pytest.skip("cuML not installed; will skip testing GPU Explainer")

    test_logger.info(
        "Running GPU non tree classifiers in test_validate_against_shap"
    )
    # Separate names keep the CPU helper's x_train/x_test untouched.
    gpu_train, gpu_test, gpu_labels, _, _, _ = create_cancer_data()
    svm_classifier = create_cuml_svm_classifier(
        gpu_train.astype(np.float32), gpu_labels.astype(np.float32))

    # All GPU inputs are cast to float32 at the point of use.
    gpu_kernel = KernelExplainer(
        model=svm_classifier.predict_proba,
        data=gpu_train.astype(np.float32))
    gpu_values = gpu_kernel.shap_values(gpu_test.astype(np.float32))
    gpu_shap_importance = get_shap_imp_classification(gpu_values)
    gpu_importance = tabular_explainer_imp(
        svm_classifier,
        gpu_train.astype(np.float32),
        gpu_test.astype(np.float32),
        use_gpu=True)
    validate_correlation(gpu_importance, gpu_shap_importance, 0.95)