def shapley_kernel_wrapper(model, trainx, testx, config):
    """Called by the prediction algorithms (ml_functions) to compute Shapley values.

    Note that tree-based models such as random forests provide faster, exact
    (non-approximated) Shapley values via the TreeExplainer."""
    if config.exp_shap_background >= trainx.shape[0]:
        background = trainx
    else:
        background = shap.kmeans(trainx, config.exp_shap_background)
        # random sample of background values
        # ixx = np.random.choice(trainx.shape[0], config.exp_shap_background, replace=False)
        # background = trainx[ixx, :]
        # print(background.shape)

    explainer = shap.KernelExplainer(model.predict_proba, background)
    if isinstance(model, LogisticRegression):
        # one background instance is enough if we use a linear model
        background = shap.kmeans(trainx, 1)
        backup_base = explainer.fnull
        explainer = shap.KernelExplainer(model.predict_proba, background)
        explainer.fnull = backup_base
    fnull_save = explainer.fnull[1]
    out = [
        explainer.shap_values(testx[i, :], l1_reg=0.0)[1]
        for i in np.arange(len(testx))
    ]
    return np.vstack(out)
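# Illustrative sketch (not part of the original module): the docstring above notes that
# tree-based models get exact, fast Shapley values from TreeExplainer instead of the
# KernelExplainer wrapper. A minimal, self-contained example assuming only shap, numpy
# and scikit-learn are installed; all names below are hypothetical.
import numpy as np
import shap
from sklearn.ensemble import RandomForestClassifier

_X = np.random.rand(200, 5)
_y = (_X[:, 0] > 0.5).astype(int)
_rf = RandomForestClassifier(n_estimators=50).fit(_X, _y)

_tree_explainer = shap.TreeExplainer(_rf)      # no background summary needed
_tree_shap = _tree_explainer.shap_values(_X)   # exact values, one array per class in classic shap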
def get_shap_kernel(estimator: object, X_train):
    """Compute SHAP value importances for a non-tree-based model.

    Args:
        estimator: a non-tree-based sklearn estimator
        X_train ((pd.DataFrame, np.ndarray)): X training data

    Returns:
        the explainer, the shap values, and the sampled rows they were computed on
    """
    warnings.filterwarnings("ignore")
    # The kernel explainer is extremely slow for non-tree-based models,
    # so we use kmeans to summarize X_train and speed up the calculation.
    if X_train.shape[0] > 3:
        x_train_summary = shap.kmeans(X_train, 3)
    else:
        x_train_summary = shap.kmeans(X_train, X_train.shape[0])
    explainer = shap.KernelExplainer(estimator.predict, x_train_summary)

    size = len(X_train)
    if size < 50:
        size = size
    elif size * 0.2 > 50:
        size = 50
    else:
        size = int(size * 0.2)
    sample_values = shap.sample(X_train, size)
    shap_values = explainer.shap_values(sample_values, l1_reg='num_features(10)')
    return explainer, shap_values, sample_values
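# Illustrative usage of the helper above (hypothetical estimator/data names): the k-means
# summary keeps the KernelExplainer call tractable, and the returned sample can be fed
# straight into shap's summary plot.
explainer, shap_values, sample_values = get_shap_kernel(svr_model, X_train)
shap.summary_plot(shap_values, sample_values)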
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d, columns=shap_dataset.columns))).batch(1000)

        return np.array(
            [p['probabilities'][-1] for p in estimator.predict(input_fn)])

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")

    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type))
def init_shap(self):
    """
    Initialize shap. Calculate shap values. This operation is time-consuming.

    :return: void (sets the value of the shap_values variable)
    """
    import shap
    if not self.shap_values:
        log.info("Initializing Shap - calculating shap values."
                 " This operation is time-consuming so please be patient.")
        logger = log.getLogger('shap')
        logger.setLevel(log.WARN)
        shap_kernel_explainer = shap.KernelExplainer(
            self.model[1].predict_proba, shap.kmeans(self.X_test_ohe, 1))
        shap_values = shap_kernel_explainer.shap_values(self.X_test_ohe)
        # from util.commons import RANDOM_NUMBER
        # shap_values = shap_kernel_explainer.shap_values(self.X_test_ohe.sample(66, random_state=RANDOM_NUMBER))
        self.shap_kernel_explainer = shap_kernel_explainer
        self.shap_values = shap_values
    else:
        log.info("Shap is already initialized.")
def _summarize_data(X, k=10, to_round_values=True):
    """Summarize a dataset.

    For a dense dataset, use k mean samples weighted by the number of data points
    they each represent.
    For a sparse dataset, use a sparse row for the background with the calculated
    median for dense columns.

    :param X: Matrix of data samples to summarize (# samples x # features).
    :type X: numpy.array or pandas.DataFrame or scipy.sparse.csr_matrix
    :param k: Number of cluster centroids to use for approximation.
    :type k: int
    :param to_round_values: When using kmeans, round each element of every cluster
        centroid to match the nearest value from X in the corresponding dimension.
        This ensures discrete features always get a valid value.
        Ignored for sparse data samples.
    :type to_round_values: bool
    :return: DenseData or SparseData object.
    :rtype: iml.datatypes.DenseData or iml.datatypes.SparseData
    """
    is_sparse = issparse(X)
    if not isinstance(X, DenseData):
        if is_sparse:
            module_logger.debug('Creating sparse data summary as csr matrix')
            # calculate median of sparse background data
            median_dense = csc_median_axis_0(X.tocsc())
            return csr_matrix(median_dense)
        elif len(X) > 10 * k:
            module_logger.debug('Create dense data summary with k-means')
            # use kmeans to summarize the examples for initialization
            # if there are more than 10 x k of them
            return shap.kmeans(X, k, to_round_values)
    return X
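# Illustrative call pattern for the helper above (hypothetical data; assumes the helper's
# own imports are in scope): a dense matrix is collapsed to k weighted centroids only when
# it has more than 10 * k rows, while a sparse matrix is reduced to a single row of column
# medians.
import numpy as np
from scipy.sparse import csr_matrix

dense_background = _summarize_data(np.random.rand(500, 8), k=10)    # DenseData with 10 centroids
sparse_background = _summarize_data(csr_matrix(np.eye(200)), k=10)  # 1 x 200 csr_matrix of medians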
def get_shap_values():
    global data
    model = []
    model_params = []
    xx = []
    y = []
    y_train = []
    x_train = []
    for d in data:
        model.append(d['model'])
        model_params.append(d['model_params'])
        xx.append(d['x'])
        y.append(d['y'])
        y_train.append(d['y_train'])
        x_train.append(d['x_train'])

    all_shap = []
    shap_names = get_shap_names()
    for l, g, g_params, x, bd in zip(y, model, model_params, xx, x_train):
        r_model = reg_model(g, g_params)
        k_meaned = shap.kmeans(bd, 20)
        shap_values = shap.KernelExplainer(r_model, k_meaned).shap_values(x)
        all_shap.extend(shap_values)

    all_shap = np.array(all_shap)
    all_x = np.concatenate(xx)
    shap.summary_plot(all_shap, features=all_x, feature_names=shap_names,
                      max_display=8)
    print('nice')
def test_log_explanation_with_small_features():
    """
    Verifies that `log_explanation` does not fail even when `features` has fewer
    records than `_MAXIMUM_BACKGROUND_DATA_SIZE`.
    """
    num_rows = 50
    assert num_rows < mlflow.shap._MAXIMUM_BACKGROUND_DATA_SIZE

    X, y = get_boston()
    X, y = X.iloc[:num_rows], y[:num_rows]
    model = RandomForestRegressor()
    model.fit(X, y)

    with mlflow.start_run() as run:
        explanation_uri = mlflow.shap.log_explanation(model.predict, X)

    artifact_path = "model_explanations_shap"
    artifacts = set(yield_artifacts(run.info.run_id))

    assert explanation_uri == os.path.join(run.info.artifact_uri, artifact_path)
    assert artifacts == {
        os.path.join(artifact_path, "base_values.npy"),
        os.path.join(artifact_path, "shap_values.npy"),
        os.path.join(artifact_path, "summary_bar_plot.png"),
    }

    explainer = shap.KernelExplainer(model.predict, shap.kmeans(X, num_rows))
    shap_values_expected = explainer.shap_values(X)

    base_values = np.load(os.path.join(explanation_uri, "base_values.npy"))
    shap_values = np.load(os.path.join(explanation_uri, "shap_values.npy"))
    np.testing.assert_array_equal(base_values, explainer.expected_value)
    np.testing.assert_array_equal(shap_values, shap_values_expected)
def explain_model(model, nlp_args, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    # we use the first 100 training examples as our background dataset to integrate over
    if model.named_steps['clf'].__class__.__name__ == "LogisticRegression":
        explainer = shap.LinearExplainer(
            model.named_steps['clf'],
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_train)))
        shap_values = explainer.shap_values(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_test)).toarray())
    elif model.named_steps['clf'].__class__.__name__ == "RandomForestClassifier":
        explainer = shap.TreeExplainer(model.named_steps['clf'])
        shap_values = explainer.shap_values(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_test)).toarray())
    else:
        X_train_summary = shap.kmeans(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_train)).toarray(),
            10)
        explainer = shap.KernelExplainer(
            model.named_steps['clf'].predict_proba, X_train_summary)
        shap_values = explainer.shap_values(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_test)).toarray())
    return explainer, shap_values
def script(num_models, k, scoring, num_clust):
    X = in_data.X
    y = in_data.Y
    best_params = {'hidden_layer_sizes': (28, 28), 'solver': 'adam'}
    print(in_data.X.shape)
    print(best_params)

    results = {}
    scores = {}
    models = {}
    for m in range(num_models):
        model = MLPRegressor(**best_params)
        kfold = model_selection.KFold(n_splits=k)
        cv_results = model_selection.cross_validate(model, X, y, cv=kfold,
                                                    scoring=scoring)
        print(cv_results)
        score = sum(cv_results['test_r2']) / len(cv_results['test_r2'])
        results[m] = cv_results
        scores[m] = score
        models[m] = model
        print(m, max(scores.values()), score)

    print(max(scores, key=scores.get), scores,
          results[max(scores, key=scores.get)])
    best_model = models[max(scores, key=scores.get)]

    feat_names = [x.name for x in in_data.domain.attributes]
    # new_feat_names = [x.split('=')[-1] for x in feat_names]
    print(feat_names)

    shap.initjs()
    best_model.fit(X, y)
    Xsum = shap.kmeans(X, num_clust) if num_clust is not None else X
    explainer = shap.KernelExplainer(best_model.predict, Xsum)
    shap_values = explainer.shap_values(X)
    # feat_names = ['{0}: {1}'.format(x[0], x[1]) for x in list(zip(feat_names, np.around(np.mean(np.absolute(shap_values), axis=0), 3)))]
    # id = len(Window.get_all_windows())
    # summary_plot(shap_values, features=X, feature_names=feat_names, class_names=class_names, plot_type='bar', color='black', id=id)
    # summary_plot(shap_values, features=X, feature_names=feat_names, class_names=class_names, plot_type='dot', title=title, id=id)

    d = Orange.data.Domain(
        [Orange.data.ContinuousVariable(f) for f in feat_names] +
        [Orange.data.ContinuousVariable('S_' + f) for f in feat_names],
        class_vars=in_data.domain.class_vars,
        metas=in_data.domain.metas)
    xs = np.concatenate((in_data.X, shap_values), axis=1)
    ys = in_data.Y
    metas = in_data.metas
    out_data = Orange.data.Table.from_numpy(d, xs, Y=ys, metas=metas)
    return out_data, None, None, shap_values
def run_callback(self):
    self.model_idx = self.model_names.index(self.model_select.value)
    self.kfold = int(self.kfold_select.value)

    # set all the parameters of the new kfold
    self.geos_train, self.X_train_df, self.y_train, self.train_probas, \
        self.geos_test, self.X_test_df, self.y_test, self.test_probas, \
        self.models, self.model_names, self.auc_scores = self._extract_model_results(
            self.full_results, self.model_idx, self.kfold)
    self.num_kfold = len(self.full_results[0]['train_idx'])
    self.features_df = pd.concat([self.X_train_df, self.X_test_df])
    self.features_kmeans = shap.kmeans(self.features_df, 10)
    self.all_probas = np.concatenate([self.train_probas[self.model_idx],
                                      self.test_probas[self.model_idx]])
    self.all_probas_df = pd.Series(data=self.all_probas, index=self.features_df.index)

    # create new precision recall curve
    test_probas_df = pd.DataFrame(
        {model_name: test_proba
         for model_name, test_proba in zip(self.model_names, self.test_probas)})
    pr_curve = bokeh_multiple_pr_curves(test_probas_df, self.y_test.values)
    self.left_column.children[0] = pr_curve

    # create new folium map
    print(f"Choose model: {self.model_select.value}\t kfold: {self.kfold}")
    new_folium = bokeh_scored_polygons_folium([self.all_probas_df], [True],
                                              train_geos=self.geos_train,
                                              test_geos=self.geos_test,
                                              start_zoom=self.folium_zoom,
                                              start_location=self.start_location,
                                              file_name=self.folium_name,
                                              width=700,
                                              lonlat_text_inputs=self.lonlat_text_inputs)
    self.folium_column.children[-2] = new_folium
    self.sample_feature_importance_update()
def explain(self):
    shap.initjs()
    X_train_summary = shap.kmeans(self.X_train, self.n_cluster)
    rf_explainer = shap.KernelExplainer(self.clf.predict, X_train_summary)
    shap_values_RF_test = rf_explainer.shap_values(self.X_test)
    self.shap = shap_values_RF_test
    return shap_values_RF_test
def SHAP_values(model, X_train):
    k_sample = shap.kmeans(X_train, 5)
    explainer = shap.KernelExplainer(model.predict, k_sample)
    shap_values = explainer.shap_values(X_train)
    shap.summary_plot(shap_values, X_train, plot_type='bar')
    plt.show()
def calculate_deepshap_values(model, training_data, test_data, K=None):
    """Calculate DeepSHAP values for the given model.

    Args:
        model: trained deep model
        training_data: data of shape (num_samples, num_features) on which the model was trained
        test_data: data of shape (num_samples, num_features) on which the model was tested
        K: number of means to use for k-means summarization of training_data
            (used as the background for the SHAP value calculation); if None, all
            the training data are used for this calculation (potentially expensive)

    Returns:
        A tuple consisting of
        1. the explainer's expected values and
        2. the SHAP values.
    """
    if K:
        kmeans_summary = shap.kmeans(training_data, K)
        print("Finished k-means summarization!")
        explainer = shap.DeepExplainer(model, kmeans_summary, link="logit")
    else:
        explainer = shap.DeepExplainer(model, training_data, link="logit")
    shap_values = explainer.shap_values(test_data)
    return explainer.expected_value, shap_values
def __init__(self, task: TaskHandler, results_dir=BUILDING_RESULTS_DIR):
    self.task = task
    self.main_panel = column()
    self.folium_name = task.__str__()

    # unpack building experiment results
    self.kfold = 0
    self.model_idx = 0
    # self.full_results = load_experiment_results(results_dir)['model_results']
    self.full_results = load_separate_model_results(results_dir)['model_results']
    self.model_results = self.full_results[self.model_idx]
    self.geos_train, self.X_train_df, self.y_train, self.train_probas, \
        self.geos_test, self.X_test_df, self.y_test, self.test_probas, \
        self.models, self.model_names, self.auc_scores = self._extract_model_results(
            self.full_results, self.model_idx, self.kfold)
    self.num_kfold = len(self.full_results[0]['train_idx'])
    self.features_df = pd.concat([self.X_train_df, self.X_test_df])
    self.features_kmeans = shap.kmeans(self.features_df, 10)
    self.all_probas = np.concatenate([self.train_probas[self.model_idx],
                                      self.test_probas[self.model_idx]])
    self.all_probas_df = pd.Series(data=self.all_probas, index=self.features_df.index)

    self.mean_auc = self.mean_auc_panel(self.model_names, self.auc_scores)
    plot = self.bokeh_plot()
    self.main_panel.children.append(plot)
def run(self, X_train, X_test, nsamples=1000):
    X_train_summary = shap.kmeans(X_train, 10)
    self.sensor_names = X_train.columns
    self.explainer = shap.KernelExplainer(self.predict_LR, X_train_summary)
    self.shap_values = np.array(
        self.explainer.shap_values(X_test, nsamples=nsamples, l1_reg="auto"))  # [0]
def get_shap_mean_values(model, df, features):
    explainer = shap.KernelExplainer(model=model.predict,
                                     data=shap.kmeans(df[features], 10))
    shap_values = explainer.shap_values(df[features].sample(10),
                                        l1_reg='aic', silent=True)
    mean_shap_values = shap_values.mean(axis=0)
    return mean_shap_values
def get_shap_feature_importance(self, base_model, X_test, X_train):
    try:
        import shap
    except ImportError:
        raise ImportError('You must have shap installed to use shap')

    if self.flags['tree'] or self.flags['linear']:
        if self.flags['linear']:
            fp = self.shap_params.linear_feature_perturbation
            n = self.shap_params.linear_nsamples
            explainer = shap.LinearExplainer(base_model, X_train,
                                             nsamples=n,
                                             feature_perturbation=fp)
            shap_values = explainer.shap_values(X_test)
        elif self.flags['tree']:
            tmo = self.shap_params.tree_model_output
            tfp = self.shap_params.tree_feature_perturbation
            explainer = shap.TreeExplainer(base_model, X_train,
                                           model_output=tmo,
                                           feature_perturbation=tfp)
            ttl = self.shap_params.tree_tree_limit
            shap_values = explainer.shap_values(X_test, tree_limit=ttl)

    # Kernel
    else:
        nkmean = self.shap_params.kernel_nkmean
        if nkmean is not None:
            X_train_summary = shap.kmeans(X_train, nkmean)
        else:
            X_train_summary = X_train

        explainer = self.get_kernel_explainer(base_model, X_train_summary,
                                              self.shap_params.kernel_link)

        klr = self.shap_params.kernel_l1_reg
        kns = self.shap_params.kernel_nsamples
        shap_values = explainer.shap_values(np.array(X_test),
                                            l1_reg=klr, nsamples=kns)

    return self.proc_shap_vals(shap_values)
def regressor():
    X, y = get_boston()
    model = RandomForestRegressor()
    model.fit(X, y)
    explainer = shap.KernelExplainer(model.predict, shap.kmeans(X, 100))
    shap_values = explainer.shap_values(X)
    return ModelWithExplanation(model, X, shap_values, explainer.expected_value)
def classifier():
    X, y = get_iris()
    model = RandomForestClassifier()
    model.fit(X, y)
    explainer = shap.KernelExplainer(model.predict_proba, shap.kmeans(X, 100))
    shap_values = explainer.shap_values(X)
    return ModelWithExplanation(model, X, shap_values, explainer.expected_value)
def main(toolName, datasetName):
    tool = getToolObject(toolName)

    # load the dataset [used as data in shapley plot]
    sequences = getDataset(datasetName, toolName)
    print("Loaded the dataset " + datasetName + " of " + str(len(sequences)) +
          " data points.")

    # calculate the features for the dataset
    print("Calculating features for the dataset. [Might take a while for wu-crispr]")
    feature_set = []
    cnt = 1
    for seq in sequences:
        features = tool.getFeatures(seq)
        feature_set.append(features)
        if toolName == 'wu-crispr' and cnt % 100 == 0:
            # inform on progress [wu-crispr takes a while]
            print("-- Calculated features for " + str(cnt))
        cnt = cnt + 1
    print("Calculated the features for this dataset.")

    # get feature names [for printing in the shapley plot]
    feature_names = tool.loadFeatureNames()
    print("Loaded the names of all " + str(len(feature_names)) + " features.")

    # put together features with names in one dataframe
    dataset_df = pd.DataFrame(np.array(feature_set), columns=feature_names)

    # training set (must be loaded before model)
    train_df = tool.loadTrainingSet()
    print("Loaded the training set used for tool " + toolName + ", size: " +
          str(train_df.shape) + ".")

    # load model of the tool
    model = tool.loadModel()
    print("Loaded the model for tool " + toolName + ".")

    # summarize training set and subsample test set
    summary_train_df = shap.kmeans(train_df, 2)
    dataset_sub_df = dataset_df  # optionally use dataset_df.sample(400) to speed things up

    # compute and plot shapley values
    shap_explainer = shap.KernelExplainer(model.predict, summary_train_df)
    print(dataset_sub_df)
    shap_values = shap_explainer.shap_values(dataset_sub_df)

    # save the values
    with open("../results/SHAP-" + toolName + "-" + datasetName, 'wb') as file:
        pickle.dump(shap_values, file)  # the SHAP values
        pickle.dump(dataset_sub_df, file)  # the data used
    print("Computed and saved SHAP values.")

    # plotting
    shap.summary_plot(shap_values, dataset_sub_df)
def _kmeans(self, means: int) -> DenseData:
    """
    Wrapper that caches kmeans results for repeated runs.

    :param means: number of centers for kmeans
    :return: k-means-summarized background data for shap (either from the cache or newly computed)
    """
    if means in self.sampled_background.keys():
        return self.sampled_background[means]
    self.sampled_background[means] = kmeans(self.X_train, means)
    return self.sampled_background[means]
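# Illustrative behaviour of the caching wrapper above (assumes an instance `exp` of the
# surrounding class with X_train set and an empty `sampled_background` dict): a second
# call with the same number of centers returns the cached summary instead of re-running
# kmeans.
bg_first = exp._kmeans(10)   # computes kmeans(X_train, 10) and stores it
bg_again = exp._kmeans(10)   # served straight from the cache
assert bg_first is bg_again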
def build_shap_explainer(model, dataset):
    # --- Summarizing the dataset ---
    # Can be fine tuned further (Kmeans, etc.)
    # summ = np.median(dataset, axis=0).reshape((1, dataset.shape[1]))
    summ = shap.kmeans(dataset, 100)
    print("Summary data for SHAP:", summ)
    explainer = shap.KernelExplainer(model.run_model_data_prob, summ,
                                     link="identity")
    print("Expected value for SHAP:", explainer.expected_value[1])
    return explainer
def run_shap(features_csv, labels_csv, output_file):
    # import dataframes
    labels = pd.read_csv(os.path.join(os.getcwd(), labels_csv))
    imp_feat = pd.read_csv(os.path.join(os.getcwd(), features_csv))

    # set label index
    labels.set_index('respondent_id', inplace=True)

    # imputed features
    imp_feat.set_index('Unnamed: 0', inplace=True)
    imp_feat.sort_index(inplace=True)

    # merge_df options
    merged_df = imp_feat.join(labels)
    # merged_df = imp_feat_small.join(labels)
    df_h1n1 = merged_df.reset_index(drop=True).drop(['seasonal_vaccine'], axis=1)
    print(df_h1n1.shape)

    X = df_h1n1.iloc[:, :-1]
    y = df_h1n1.iloc[:, -1]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1,
                                                      stratify=y, random_state=42)

    # get feature names
    feature_names = list(X_train)

    # check shape
    print(X.shape)
    print(X_train.shape)

    # scale the imputed features
    X_train = StandardScaler().fit_transform(X_train)
    print(X_train.shape)
    X_val = StandardScaler().fit_transform(X_val)

    clf = SVC(kernel='rbf', probability=True).fit(X_train, y_train)

    X_train_summary = shap.kmeans(X_train, 10)
    explainer = shap.KernelExplainer(clf.predict_proba, X_train_summary)
    shap_values_train = explainer.shap_values(X_train)
    shap_values_test = explainer.shap_values(X_val)

    df_SVC = pd.DataFrame(shap_values_train[0].mean(0), index=X.columns,
                          columns=['SVC']).sort_values('SVC', ascending=False)
    df_SVC.to_csv(output_file)
def run_shap(results, X_test, new_path, hp):
    for key, item in results.items():
        model = item[0]
        # X_test = X_test.iloc[:10, :]
        k_X = shap.kmeans(X_test, 5)
        if key == 'MLP':
            explainer = shap.KernelExplainer(model.model.predict_classes, k_X)
        else:
            explainer = shap.KernelExplainer(model.model.predict, k_X)
        shap_values = explainer.shap_values(X_test)
        f = plt.figure()
        shap.summary_plot(shap_values, X_test, show=False, plot_type='bar')
        f.savefig(new_path + "/" + hp + '-' + key + "-summary_plot.pdf",
                  bbox_inches='tight', dpi=600)
def _get_kernel_explainer(predict_func, bkgrd_data, kmeans_size=10):
    if predict_func is None:
        raise ValueError(
            "No target to compute shap values. Expected either model or predict_func"
        )
    # rather than use the whole training set to estimate expected values,
    # summarize with a set of weighted kmeans, each weighted by
    # the number of points they represent
    if kmeans_size is None:
        x_bkgrd_summary = bkgrd_data
    else:
        x_bkgrd_summary = shap.kmeans(bkgrd_data, kmeans_size)
    return shap.KernelExplainer(predict_func, x_bkgrd_summary)
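# Illustrative usage of the helper above (hypothetical model/data names): the background
# is condensed to 10 weighted k-means centroids before the explainer is built, and only a
# small slice of the test set is explained to keep the kernel estimation cheap.
kernel_explainer = _get_kernel_explainer(clf.predict_proba, X_train, kmeans_size=10)
kernel_shap_values = kernel_explainer.shap_values(X_test[:50])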
def calculate_shap_values(model, train_data, shap_data, model_type="kernel"):
    # use the faster tree explainer for tree-based models
    if model_type == "tree":
        explainer = shap.TreeExplainer(model)
    else:
        if len(train_data) > 100:
            explainer = shap.KernelExplainer(model, shap.kmeans(train_data, 100))
        else:
            explainer = shap.KernelExplainer(model, train_data)
    shap_values = explainer.shap_values(shap_data)
    return explainer, shap_values
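# Illustrative usage of the helper above (hypothetical names): tree models take the fast
# TreeExplainer path, while any other model is wrapped in a KernelExplainer whose
# background is capped at 100 k-means points. Note that on the kernel path the first
# argument must be a callable, e.g. a predict function.
explainer, shap_values = calculate_shap_values(rf_model, X_train, X_test, model_type="tree")
explainer, shap_values = calculate_shap_values(svm_model.predict, X_train, X_test)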
def kernel(self, print_result=False):
    result = {}
    result['name'] = self.name
    result['shap_method'] = 'kernel'

    X_train_summary = shap.kmeans(self.X_train, 50)
    explainer = shap.KernelExplainer(model=self.model.predict_proba,
                                     data=X_train_summary)
    shap_values = explainer.shap_values(self.X_test)
    if isinstance(shap_values, list):
        shap_values = shap_values[1]
    return self._run(shap_values, result, print_result)
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass, oss_dest, oss_ak, oss_sk,
                 oss_endpoint, oss_bucket_name):
    def predict(d):
        if len(d) == 1:
            # This is to make sure the progress bar of SHAP displays properly:
            # 1. The newline makes the progress bar string captured in the pipe
            # 2. The ASCII control code moves the cursor up twice for alignment
            print("\033[A" * 2)

        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d, columns=shap_dataset.columns))).batch(1000)

        if plot_type == 'bar':
            predictions = [
                p['logits'] if 'logits' in p else p['predictions']
                for p in estimator.predict(input_fn)
            ]
        else:
            predictions = [
                p['logits'][-1] if 'logits' in p else p['predictions'][-1]
                for p in estimator.predict(input_fn)
            ]
        return np.array(predictions)

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")

    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)

    explainer.plot_and_save(
        lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type),
        is_pai, oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
def get_model_explanations(model, train_data, test_data, background_samples=10,
                           nsamples='auto', num_features=50):
    """Runs KernelSHAP on a model and returns subject-wise model explanations.

    model : trained sklearn estimator
    train_data : n x p array of training data used in model training
    test_data : n x p array of test data
    background_samples : number of kmeans clusters used to summarise the training data; fewer = faster
    nsamples : number of times to re-evaluate the model to estimate Shapley values; more = lower variance
    num_features : number of features to include in the local model
    """
    explainer = shap.KernelExplainer(model.predict,
                                     shap.kmeans(train_data, background_samples),
                                     link='identity')
    explanations = explainer.shap_values(
        test_data,
        nsamples=nsamples,
        l1_reg='num_features({:})'.format(num_features))
    return explanations
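# Illustrative call (hypothetical names): fewer background clusters and a smaller nsamples
# run faster at the cost of noisier Shapley estimates, while num_features bounds how many
# features the local surrogate model keeps.
explanations = get_model_explanations(fitted_svr, train_array, test_array,
                                      background_samples=10, nsamples=200,
                                      num_features=20)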
def show_federate_shap_on_each_client(self):
    logging.info("################load data for shap")
    client = self.client_list[0]
    fed_pos = 0  # starting index of the federated features
    for client_idx in range(self.args.client_num_in_total):
        if self.test_data_local_dict[client_idx] is None:
            continue
        client.update_local_dataset(
            0, self.train_data_local_dict[client_idx],
            self.test_data_local_dict[client_idx],
            self.train_data_local_num_dict[client_idx])
        train_X, train_y, test_X = client.get_all_X()

        # federated shapley
        train_X_all_pd = pd.DataFrame(train_X.numpy())
        f_knn = lambda x: self.model_trainer.model.forward(x)
        med = train_X_all_pd.median().values.reshape(
            (1, train_X_all_pd.shape[1]))
        feature_num = len(self.feature_name) - 1  # number of features
        fs = FederateShap()

        # aggregated and averaged federated shap
        data = shap.kmeans(train_X_all_pd, 20)
        step = 3
        shap_values_whole = []
        cols_federated = self.feature_name[:-1]
        cols_federated[fed_pos] = 'Federated'
        del cols_federated[fed_pos + 1:fed_pos + step]
        for x in data.data:
            phi = fs.kernel_shap_federated_with_step(
                f_knn, x, med, feature_num, fed_pos, step)
            base_value = phi[-1]
            shap_values = phi[:-1]
            shap_values_whole.append(list(shap_values))
        shap_values_whole = np.array(shap_values_whole)
        shap_values_whole_mean = np.mean(shap_values_whole, axis=0).transpose()

        # plot the Shapley values of the federated features
        shap.summary_plot(shap_values_whole_mean,
                          feature_names=cols_federated,
                          sort=False)
        shap.summary_plot(shap_values_whole_mean,
                          feature_names=cols_federated,
                          sort=False,
                          plot_type="bar")
        fed_pos += step