def test_verify_linear_model_coefficient_explanation(self):
    # Validate our explainer against an explainable linear model
    X, y = shap.datasets.adult()
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=7)
    # Fit a logistic regression classifier
    model = create_sklearn_logistic_regressor(x_train, y_train)
    # Create tabular explainer
    exp = TabularExplainer(model, x_train, features=list(range(x_train.shape[1])))
    test_logger.info('Running explain model for '
                     'test_verify_linear_model_coefficient_explanation')
    # Validate evaluation sampling
    policy = {ExplainParams.SAMPLING_POLICY: SamplingPolicy(allow_eval_sampling=True)}
    explanation = exp.explain_global(x_test, **policy)
    mean_train = np.mean(x_train.values, axis=0)
    # Retrieve the model coefficients
    coefficients = model.coef_[0]
    # Normalize the coefficients by the feature means for a rough
    # ground truth of importance
    norm_coeff = mean_train * coefficients
    # Order the coefficients by importance
    norm_coeff_imp = np.abs(norm_coeff).argsort()[..., ::-1]
    # Validate the correlation between the explainer's global importance
    # ranking and the coefficient-based ranking
    validate_correlation(explanation.global_importance_rank, norm_coeff_imp, 0.76)
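# The validate_correlation helper is defined elsewhere in the test utilities.
# A minimal sketch of the check it is assumed to perform, for reference: turn
# each importance ordering into per-feature rank positions and assert that the
# Spearman rank correlation clears the threshold. The name
# validate_correlation_sketch is illustrative; the repo's actual
# implementation may differ.
def validate_correlation_sketch(computed_rank, expected_rank, threshold):
    import numpy as np
    from scipy.stats import spearmanr
    # argsort of an 'indices ordered by importance' array yields the rank
    # position of each feature, i.e. the inverse permutation
    computed_positions = np.argsort(computed_rank)
    expected_positions = np.argsort(expected_rank)
    correlation, _ = spearmanr(computed_positions, expected_positions)
    assert correlation > threshold, \
        'correlation {} did not exceed threshold {}'.format(correlation, threshold)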
def tabular_explainer_imp(model, x_train, x_test, allow_eval_sampling=True):
    # Create tabular explainer
    exp = TabularExplainer(model, x_train, features=list(range(x_train.shape[1])))
    # Validate evaluation sampling
    policy = {ExplainParams.SAMPLING_POLICY:
              SamplingPolicy(allow_eval_sampling=allow_eval_sampling)}
    explanation = exp.explain_global(x_test, **policy)
    return explanation.global_importance_rank
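# Hypothetical usage of the helper above: compare the global importance rank
# computed with and without evaluation sampling on the same model. The dataset
# and model helpers mirror the other tests in this file; the 0.9 threshold is
# an illustrative choice, not a value taken from the real suite.
def test_tabular_explainer_imp_usage_sketch(self):
    X, y = shap.datasets.adult()
    x_train, x_test, y_train, _ = train_test_split(X, y, test_size=0.2,
                                                   random_state=7)
    model = create_sklearn_logistic_regressor(x_train, y_train)
    sampled_rank = tabular_explainer_imp(model, x_train, x_test)
    full_rank = tabular_explainer_imp(model, x_train, x_test,
                                      allow_eval_sampling=False)
    # Sampling should not drastically reorder the most important features
    validate_correlation(sampled_rank, full_rank, 0.9)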
def test_explain_model_sparse_tree(self, tabular_explainer):
    X, y = retrieve_dataset('a1a.svmlight')
    x_train, x_test, y_train, _ = train_test_split(X, y, test_size=0.002,
                                                   random_state=7)
    # Fit a random forest regression model
    model = create_sklearn_random_forest_regressor(x_train, y_train)
    _, cols = x_train.shape
    shape = 1, cols
    # Use a single all-zeros sparse row as the background dataset
    background = csr_matrix(shape, dtype=x_train.dtype)
    # Create tabular explainer
    exp = tabular_explainer(model, background)
    test_logger.info('Running explain global for test_explain_model_sparse_tree')
    policy = SamplingPolicy(allow_eval_sampling=True)
    exp.explain_global(x_test, sampling_policy=policy)
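# retrieve_dataset is a test utility defined outside this section. A minimal
# sketch of what it could do for svmlight-format files, assuming the file is
# already available locally; the real helper may download and cache the
# dataset first.
def retrieve_dataset_sketch(filename):
    from sklearn.datasets import load_svmlight_file
    # Returns a sparse CSR feature matrix and a dense label vector
    return load_svmlight_file(filename)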
def test_verify_pipeline_model_coefficient_explanation(self):
    # Validate our explainer against an explainable linear model
    X, y = shap.datasets.adult()
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=7)
    # Note: in the pipeline case we use KernelExplainer,
    # while in the linear case we use LinearExplainer, which is much faster
    pipeline = [True, False]
    threshold = [0.85, 0.76]
    for idx, is_pipeline in enumerate(pipeline):
        # Fit a logistic regression classifier
        model = create_sklearn_logistic_regressor(x_train, y_train,
                                                  pipeline=is_pipeline)
        # Create tabular explainer
        exp = TabularExplainer(model, x_train, features=list(range(x_train.shape[1])))
        test_logger.info('Running explain model for '
                         'test_verify_pipeline_model_coefficient_explanation')
        # Validate evaluation sampling
        policy = {ExplainParams.SAMPLING_POLICY: SamplingPolicy(allow_eval_sampling=True)}
        explanation = exp.explain_global(x_test, **policy)
        mean_train = np.mean(x_train.values, axis=0)
        # Retrieve the model coefficients, unwrapping the pipeline if needed
        if isinstance(model, Pipeline):
            model = model.steps[0][1]
        coefficients = model.coef_[0]
        # Normalize the coefficients by the feature means for a rough
        # ground truth of importance
        norm_coeff = mean_train * coefficients
        # Order the coefficients by importance
        norm_coeff_imp = np.abs(norm_coeff).argsort()[..., ::-1]
        # Validate the correlation against the per-case threshold
        validate_correlation(explanation.global_importance_rank, norm_coeff_imp,
                             threshold[idx])
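# A plausible sketch of the create_sklearn_logistic_regressor utility used
# above, assuming the pipeline flag wraps the estimator in a one-step sklearn
# Pipeline so that TabularExplainer falls back to KernelExplainer instead of
# the LinearExplainer fast path. The repo's helper may configure the model
# differently.
def create_sklearn_logistic_regressor_sketch(x_train, y_train, pipeline=False):
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    lin = LogisticRegression(solver='liblinear')
    if pipeline:
        # The wrapped model is opaque to the explainer; model.steps[0][1]
        # recovers the fitted regressor, as in the test above
        return Pipeline([('logistic', lin)]).fit(x_train, y_train)
    return lin.fit(x_train, y_train)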
def _explain_model_dnn_common(self, tabular_explainer, model, x_train, x_test,
                              y_train, features):
    # Create local tabular explainer without run history
    exp = tabular_explainer(model, x_train.values, features=features)
    policy = SamplingPolicy(allow_eval_sampling=True)
    exp.explain_global(x_test.values, sampling_policy=policy)
def _explain_model_dnn_common(self, tabular_explainer, model, x_train, x_test,
                              y_train, features):
    # Create tabular explainer with an explicit classification model task
    exp = tabular_explainer(model, x_train.values, features=features,
                            model_task=ModelTask.Classification)
    policy = SamplingPolicy(allow_eval_sampling=True)
    exp.explain_global(x_test.values, sampling_policy=policy)
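# Hypothetical caller for the _explain_model_dnn_common helper above, using a
# scikit-learn MLP as a stand-in for the DNN; the real tests presumably build
# keras or pytorch models through their own utilities.
def test_explain_model_dnn_sketch(self, tabular_explainer):
    X, y = shap.datasets.adult()
    x_train, x_test, y_train, _ = train_test_split(X, y, test_size=0.2,
                                                   random_state=7)
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier(max_iter=200).fit(x_train.values, y_train)
    self._explain_model_dnn_common(tabular_explainer, model, x_train, x_test,
                                   y_train, list(x_train.columns))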