def test_explain_raw_feats_classification(self, iris, tabular_explainer):
    # verify that no errors get thrown when calling get_raw_feature_importances
    x_train = iris[DatasetConstants.X_TRAIN]
    x_test = iris[DatasetConstants.X_TEST]
    y_train = iris[DatasetConstants.Y_TRAIN]

    model = create_sklearn_random_forest_classifier(x_train, y_train)

    explainer = tabular_explainer(model, x_train)
    global_explanation = explainer.explain_global(x_test)
    local_explanation = explainer.explain_local(x_test)
    raw_feat_indices = [[1, 3], [0, 2]]
    num_generated_cols = x_train.shape[1]
    # Create a feature map for only two raw features
    feature_map = _get_feature_map_from_indices_list(
        raw_feat_indices, num_raw_cols=2,
        num_generated_cols=num_generated_cols)
    global_raw_importances = global_explanation.get_raw_feature_importances(
        [feature_map])
    assert len(global_raw_importances) == len(raw_feat_indices), \
        'length of global importances does not match number of features'
    local_raw_importances = local_explanation.get_raw_feature_importances(
        [feature_map])
    assert len(local_raw_importances) == len(iris[DatasetConstants.CLASSES]), \
        'length of local importances does not match number of classes'
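# Illustrative sketch (not part of the original suite): for iris,
# x_train.shape[1] == 4, so the feature map built above is a 2 x 4 binary
# matrix in which entry [i, j] == 1 marks generated feature j as belonging to
# raw feature i. Assuming _get_feature_map_from_indices_list sets exactly the
# listed indices, raw_feat_indices = [[1, 3], [0, 2]] would yield:
#
#     [[0., 1., 0., 1.],
#      [1., 0., 1., 0.]]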
def test_explain_model_random_forest_classification(self, tabular_explainer):
    X, y = shap.datasets.adult()
    x_train, x_test, y_train, _ = train_test_split(X, y, test_size=0.2,
                                                   random_state=7)
    # Fit a tree model
    model = create_sklearn_random_forest_classifier(x_train, y_train)

    # Create local tabular explainer without run history
    exp = tabular_explainer(model, x_train, features=X.columns.values)
    test_logger.info('Running explain global for '
                     'test_explain_model_random_forest_classification')
    explanation = exp.explain_global(x_test)
    self.verify_adult_overall_features(
        explanation.get_ranked_global_names(),
        explanation.get_ranked_global_values())
    self.verify_adult_per_class_features(
        explanation.get_ranked_per_class_names(),
        explanation.get_ranked_per_class_values())
    self.verify_top_rows_local_features_with_and_without_top_k(
        explanation,
        self.adult_local_features_first_three_rf,
        is_classification=True,
        top_rows=3)
def test_explain_model_classification_with_predict_only(self, tabular_explainer):
    X, y = shap.datasets.adult()
    x_train, x_test, y_train, _ = train_test_split(X, y, test_size=0.003,
                                                   random_state=7)
    # Fit a tree model
    model = create_sklearn_random_forest_classifier(x_train, y_train)
    # Wrap the model in a predict-only API
    wrapped_model = wrap_classifier_without_proba(model)

    # Create tabular explainer
    exp = tabular_explainer(wrapped_model, x_train, features=X.columns.values,
                            model_task=ModelTask.Classification)
    test_logger.info('Running explain global for '
                     'test_explain_model_classification_with_predict_only')
    explanation = exp.explain_global(x_test)
    # Validate predicted y values are boolean
    assert np.all(np.isin(explanation.eval_y_predicted, [0, 1]))
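# A minimal sketch of a predict-only wrapper like the one assumed above;
# wrap_classifier_without_proba is a test helper and its real implementation
# may differ. The point is that the wrapped object exposes predict but not
# predict_proba, so the explainer must work from hard class predictions
# (hence the explicit model_task=ModelTask.Classification hint above).
class _PredictOnlyClassifierSketch:
    """Hypothetical wrapper that hides predict_proba from the explainer."""

    def __init__(self, model):
        self._model = model

    def predict(self, X):
        # Delegate to the underlying model; no probability API is exposed
        return self._model.predict(X)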
def test_basic_upload(self, iris, tabular_explainer):
    x_train = iris[DatasetConstants.X_TRAIN]
    x_test = iris[DatasetConstants.X_TEST]
    y_train = iris[DatasetConstants.Y_TRAIN]

    model = create_sklearn_random_forest_classifier(x_train, y_train)

    explainer = tabular_explainer(model, x_train)
    global_explanation = explainer.explain_global(x_test)
    mlflow.set_experiment(TEST_EXPERIMENT)
    client = mlflow.tracking.MlflowClient()
    with mlflow.start_run() as run:
        _log_explanation(TEST_EXPLANATION, global_explanation)
        os.mkdir(TEST_EXPLANATION)
        download_path = client.download_artifacts(run.info.run_id, '',
                                                  dst_path=TEST_EXPLANATION)
    downloaded_explanation = load_explanation(download_path)
    _assert_explanation_equivalence(global_explanation,
                                    downloaded_explanation)
def test_upload_as_model(self, iris, tabular_explainer, tracking_uri):
    mlflow.set_tracking_uri(tracking_uri)
    x_train = iris[DatasetConstants.X_TRAIN]
    x_test = iris[DatasetConstants.X_TEST]
    y_train = iris[DatasetConstants.Y_TRAIN]

    model = create_sklearn_random_forest_classifier(x_train, y_train)

    explainer = tabular_explainer(model, x_train)
    global_explanation = explainer.explain_global(x_test)
    mlflow.set_experiment(TEST_EXPERIMENT)
    with mlflow.start_run() as run:
        log_explanation(TEST_EXPLANATION, global_explanation)
        os.makedirs(TEST_DOWNLOAD, exist_ok=True)
        run_id = run.info.run_id
    downloaded_explanation_mlflow = get_explanation(run_id, TEST_EXPLANATION)
    _assert_explanation_equivalence(global_explanation,
                                    downloaded_explanation_mlflow)
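# Note on the two upload tests above: test_basic_upload round-trips through a
# raw artifact download plus load_explanation, while test_upload_as_model uses
# the log_explanation / get_explanation pair; both paths are expected to
# reconstruct an explanation equivalent to the original, as checked
# field-by-field by _assert_explanation_equivalence.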
def test_validate_against_shap(self):
    # Validate our explainer against the shap library directly
    X, y = shap.datasets.adult()
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.02,
                                                        random_state=7)
    # Fit several classifiers and regressors
    tree_classifiers = [
        create_sklearn_random_forest_classifier(x_train, y_train)]
    non_tree_classifiers = [
        create_sklearn_logistic_regressor(x_train, y_train)]
    tree_regressors = [
        create_sklearn_random_forest_regressor(x_train, y_train)]
    non_tree_regressors = [
        create_sklearn_linear_regressor(x_train, y_train)]
    # For each model, validate we get the same results as calling shap directly
    test_logger.info("Running tree classifiers in test_validate_against_shap")
    for model in tree_classifiers:
        # Run shap directly for comparison
        exp = shap.TreeExplainer(model)
        explanation = exp.shap_values(x_test)
        shap_overall_imp = get_shap_imp_classification(explanation)
        overall_imp = tabular_explainer_imp(model, x_train, x_test)
        validate_correlation(overall_imp, shap_overall_imp, 0.95)

    test_logger.info("Running non tree classifiers in test_validate_against_shap")
    for model in non_tree_classifiers:
        # Run shap directly for comparison
        clustered = shap.kmeans(x_train, 10)
        exp = shap.KernelExplainer(model.predict_proba, clustered)
        explanation = exp.shap_values(x_test)
        shap_overall_imp = get_shap_imp_classification(explanation)
        overall_imp = tabular_explainer_imp(model, x_train, x_test)
        validate_correlation(overall_imp, shap_overall_imp, 0.95)

    test_logger.info("Running tree regressors in test_validate_against_shap")
    for model in tree_regressors:
        # Run shap directly for comparison
        exp = shap.TreeExplainer(model)
        explanation = exp.shap_values(x_test)
        shap_overall_imp = get_shap_imp_regression(explanation)
        overall_imp = tabular_explainer_imp(model, x_train, x_test)
        validate_correlation(overall_imp, shap_overall_imp, 0.95)

    test_logger.info("Running non tree regressors in test_validate_against_shap")
    for model in non_tree_regressors:
        # Run shap directly for comparison
        clustered = shap.kmeans(x_train, 10)
        exp = shap.KernelExplainer(model.predict, clustered)
        explanation = exp.shap_values(x_test)
        shap_overall_imp = get_shap_imp_regression(explanation)
        overall_imp = tabular_explainer_imp(model, x_train, x_test)
        validate_correlation(overall_imp, shap_overall_imp, 0.95)

    if not rapids_installed:
        pytest.skip("cuML not installed; will skip testing GPU Explainer")
    else:
        test_logger.info(
            "Running GPU non tree classifiers in test_validate_against_shap")
        x_train, x_test, y_train, y_validation, _, _ = create_cancer_data()
        gpu_non_tree_classifiers = [
            create_cuml_svm_classifier(x_train.astype(np.float32),
                                       y_train.astype(np.float32))]
        for model in gpu_non_tree_classifiers:
            exp = KernelExplainer(model=model.predict_proba,
                                  data=x_train.astype(np.float32))
            explanation = exp.shap_values(x_test.astype(np.float32))
            shap_overall_imp = get_shap_imp_classification(explanation)
            overall_imp = tabular_explainer_imp(model,
                                                x_train.astype(np.float32),
                                                x_test.astype(np.float32),
                                                use_gpu=True)
            validate_correlation(overall_imp, shap_overall_imp, 0.95)
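# A minimal sketch of what a correlation check such as validate_correlation
# above could do (hypothetical; the real helper may compare importances
# differently), assuming both arguments are numeric importance vectors
# aligned feature-by-feature:
def _sketch_validate_correlation(overall_imp, shap_overall_imp, threshold):
    # Pearson correlation between the two importance vectors
    corr = np.corrcoef(np.asarray(overall_imp, dtype=float),
                       np.asarray(shap_overall_imp, dtype=float))[0, 1]
    assert corr >= threshold, \
        'importance correlation {} below threshold {}'.format(corr, threshold)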