def test_null_model_small():
    """A model that always outputs zero must get (numerically) zero SHAP values on 4 features."""
    background = np.ones((2, 4))
    instance = np.ones((1, 4))

    def zero_model(x):
        # One zero output per input row, regardless of the feature values.
        return np.zeros(x.shape[0])

    explainer = shap.SamplingExplainer(zero_model, background, nsamples=100)
    attributions = explainer.shap_values(instance)

    # Summed absolute attribution must vanish up to floating-point noise.
    assert np.abs(attributions).sum() < 1e-8
def test_null_model():
    """A constant-zero model must receive zero SHAP attributions on a 10-feature input."""
    import numpy as np
    import shap

    n_features = 10

    def zero_model(x):
        # One zero output per input row.
        return np.zeros(x.shape[0])

    explainer = shap.SamplingExplainer(
        zero_model, np.ones((2, n_features)), nsamples=100
    )
    attributions = explainer.shap_values(np.ones((1, n_features)))

    total_magnitude = np.abs(attributions).sum()
    assert total_magnitude < 1e-8
def __init__(self, learn:TabularLearner, test_data=None, l1_reg='auto', n_samples=128, max_train_samples=100000, **kwargs):
    "Build a `shap.SamplingExplainer` for `learn` and compute SHAP values for `test_data`; extra **kwargs go to `shap.SamplingExplainer`"
    dls = learn.dls
    self.model = learn.model
    self.dls = dls
    # `vocab` only exists on the DataLoaders for classification problems
    self.class_names = dls.vocab if hasattr(dls, 'vocab') else None

    # Background data comes from the training DataLoader; `max_train_samples`
    # is forwarded to `_prepare_data`.
    self.train_data = _prepare_data(dls.train, max_train_samples)
    # Rows to explain; `test_data` and `n_samples` are forwarded to `_prepare_test_data`.
    self.test_data = _prepare_test_data(learn, test_data, n_samples)

    # Bind the learner so the explainer sees a single-argument prediction function.
    predict_with_learner = partial(_predict, learn)
    self.explainer = shap.SamplingExplainer(predict_with_learner, self.train_data, **kwargs)
    self.shap_vals = self.explainer.shap_values(self.test_data, l1_reg=l1_reg)

    # A list result is treated as "one set of SHAP values per model output".
    self.is_multi_output = isinstance(self.shap_vals, list)
def test_front_page_model_agnostic():
    """Smoke test: explain an RBF-kernel SVM trained on iris with `shap.SamplingExplainer`."""
    import sklearn
    import shap
    from sklearn.model_selection import train_test_split

    # Hold out 20% of iris; the fixed seed keeps the split reproducible.
    iris = shap.datasets.iris()
    split = train_test_split(*iris, test_size=0.2, random_state=0)
    X_train, X_test, Y_train, Y_test = split

    # Train an SVM classifier that exposes class probabilities.
    classifier = sklearn.svm.SVC(kernel='rbf', probability=True)
    classifier.fit(X_train, Y_train)

    # Explain the held-out predictions; the test passes if nothing raises.
    explainer = shap.SamplingExplainer(classifier.predict_proba, X_train, nsamples=100)
    shap_values = explainer.shap_values(X_test)
def save_attributions(model, samples, labels, background, input_shape, subdir='train'):
    """Compute primal effects, sampling SHAP values and their difference, and save all three.

    Output goes to ``data/<FLAGS.dataset>/<subdir>/`` as three ``.npy`` files
    whose names end in ``FLAGS.background``.

    Parameters
    ----------
    model : callable
        Called on float32 arrays reshaped to ``(batch, *input_shape)``; its
        result must provide ``.numpy()``.
    samples : array-like
        Inputs to explain.
    labels : array-like
        Output index selected for each sample.
    background : array-like
        Reference data handed to both explainers.
    input_shape : tuple
        Per-sample shape expected by ``model``.
    subdir : str
        Sub-directory under the dataset folder.
    """
    os.makedirs('data/{}/{}/'.format(FLAGS.dataset, subdir), exist_ok=True)

    # Primal (main) effects use the full background set.
    primal_explainer = MarginalExplainer(model, background,
                                         nsamples=200,
                                         representation='mobius')
    primal_effects = primal_explainer.explain(samples,
                                              verbose=True,
                                              index_outputs=True,
                                              labels=labels)

    def model_func(x):
        # SHAP works on flattened rows; restore the model's input shape first.
        batch = np.reshape(x, (x.shape[0], *input_shape)).astype(np.float32)
        return model(batch).numpy()

    if FLAGS.background == 'train_dist':
        # Use a random 200-row subset of the background for the SHAP explainer.
        shap_indices = np.random.choice(background.shape[0], size=200, replace=False)
        background = background[shap_indices]

    flat_background = np.reshape(background, (background.shape[0], -1))
    flat_samples = np.reshape(samples, (FLAGS.num_shap_samples, -1))

    sample_explainer = shap.SamplingExplainer(model_func, flat_background)
    per_output_values = sample_explainer.shap_values(flat_samples)

    # Stack the per-output results, then keep each sample's row for its own label.
    stacked = np.stack(per_output_values, axis=0)
    sample_positions = np.arange(stacked.shape[1])
    shap_values = stacked[labels, sample_positions, :]
    # (A GradientExplainer with nsamples=200, ranked_outputs=1 was the
    # previously tried alternative to the sampling explainer.)
    shap_values = np.reshape(shap_values, (FLAGS.num_shap_samples, *input_shape))

    interaction_effects = shap_values - primal_effects

    outputs = (
        ('data/{}/{}/primal_effects_{}.npy', primal_effects),
        ('data/{}/{}/shap_values_{}.npy', shap_values),
        ('data/{}/{}/interaction_effects_{}.npy', interaction_effects),
    )
    for path_template, array in outputs:
        np.save(path_template.format(FLAGS.dataset, subdir, FLAGS.background), array)
def test_front_page_model_agnostic():
    """Smoke test of the model-agnostic example; skipped when scikit-learn is missing."""
    sklearn = pytest.importorskip('sklearn')
    model_selection = pytest.importorskip('sklearn.model_selection')
    train_test_split = model_selection.train_test_split

    # print the JS visualization code to the notebook
    shap.initjs()

    # Hold out 20% of iris with a fixed seed; the test labels are not needed.
    iris = shap.datasets.iris()
    X_train, X_test, Y_train, _ = train_test_split(*iris, test_size=0.2, random_state=0)

    # Train an SVM classifier that exposes class probabilities.
    classifier = sklearn.svm.SVC(kernel='rbf', probability=True)
    classifier.fit(X_train, Y_train)

    # Explain the held-out predictions; the test passes if nothing raises.
    explainer = shap.SamplingExplainer(classifier.predict_proba, X_train, nsamples=100)
    explainer.shap_values(X_test)
def test_front_page_model_agnostic():
    """Smoke test: explain an iris SVM with `shap.SamplingExplainer` and build a force plot."""
    import sklearn
    from sklearn.model_selection import train_test_split
    import numpy as np
    import shap

    # print the JS visualization code to the notebook
    shap.initjs()

    # Hold out 20% of iris; the fixed seed keeps the split reproducible.
    iris = shap.datasets.iris()
    split = train_test_split(*iris, test_size=0.2, random_state=0)
    X_train, X_test, Y_train, Y_test = split

    # Train an SVM classifier that exposes class probabilities.
    classifier = sklearn.svm.SVC(kernel='rbf', probability=True)
    classifier.fit(X_train, Y_train)

    # Explain the held-out predictions.
    explainer = shap.SamplingExplainer(classifier.predict_proba, X_train, nsamples=100)
    shap_values = explainer.shap_values(X_test)

    # Force plot for output 0 (labelled "Setosa" in the original comment) of the first instance.
    first_instance = X_test.iloc[0, :]
    base_value = explainer.expected_value[0]
    shap.force_plot(base_value, shap_values[0][0, :], first_instance)
def __init__(self, learn: TabularLearner, test_data=None, link='identity', l1_reg='auto', n_samples=128, **kwargs):
    "Build a `shap.SamplingExplainer` for `learn` and compute SHAP values for `test_data`; `link` is accepted but not used here, **kwargs go to the explainer"
    dls = learn.dls
    self.model = learn.model
    self.dls = dls
    # NOTE(review): this reads `learn.dl` (singular), not `learn.dls` — confirm that is intended.
    self.class_names = learn.dl.vocab

    # Background data: categorical and continuous columns joined on their index.
    self.train_data = pd.merge(dls.cats, dls.conts, left_index=True, right_index=True)
    # Rows to explain; `test_data` and `n_samples` are forwarded to `_prepare_data`.
    self.test_data = _prepare_data(learn, test_data, n_samples)

    # Bind the learner so the explainer sees a single-argument prediction function.
    predict_with_learner = partial(_predict, learn)
    self.explainer = shap.SamplingExplainer(predict_with_learner, self.train_data, **kwargs)
    self.shap_vals = self.explainer.shap_values(self.test_data, l1_reg=l1_reg)

    # A list result is treated as "one set of SHAP values per model output".
    self.is_multi_output = isinstance(self.shap_vals, list)
def __init__(self, learn:TabularLearner, test_data:pd.DataFrame=None, link="identity", nsamples="auto", l1_reg="auto", **kwargs):
    """
    Uses SHAP values to interpret the output of a learner for some test data.

    test_data : None or a pandas dataframe
        The data for which the SHAP values will be computed.
        By default, up to 100 random rows of the train data will be used.

    link : "identity" or "logit"
        A generalized linear model link to connect the feature importance values to the model
        output. Since the feature importance values, phi, sum up to the model output, it often makes
        sense to connect them to the output with a link function where link(output) = sum(phi).
        If the model output is a probability then the LogitLink link function makes the feature
        importance values have log-odds units.
        Note: this argument is accepted but is not forwarded to the explainer in this method.

    nsamples : "auto" or int
        Number of times to re-evaluate the model when explaining each prediction.
        More samples lead to lower variance estimates of the SHAP values.

    l1_reg : "num_features(int)", "auto", "aic", "bic", or float
        The l1 regularization to use for feature selection (the estimation procedure is based on
        a debiased lasso). The auto option currently uses "aic" when less than 20% of the possible
        sample space is enumerated, otherwise it uses no regularization.

    **kwargs
        Extra keyword arguments passed to `shap.SamplingExplainer`.
    """
    # extracts model and data from the learner
    self.model = learn.model
    self.dls = learn.dls
    # NOTE(review): this reads `learn.dl` (singular), not `learn.dls` — confirm that is intended.
    self.class_names = learn.dl.vocab

    # create an explainer for the model
    train_data = learn.dls.all_cols
    predict_function = partial(_predict, model=learn.model, dls=learn.dls)
    # shap.KernelExplainer takes the same arguments but should only be used
    # with a small dataset or a sample of it.
    self.explainer = shap.SamplingExplainer(predict_function, train_data, **kwargs)

    # computes shap values for the test data
    if test_data is None:
        # No test data supplied: explain a random sample of at most 100 training rows.
        self.test_data = train_data.sample(n=min(100, len(train_data)), replace=False)
    else:
        self.test_data = learn.dls.test_dl(test_data).all_cols
    self.shap_values = self.explainer.shap_values(self.test_data, nsamples=nsamples, l1_reg=l1_reg)

    # flag used to ensure the proper working of the library:
    # a list result means one array of SHAP values per model output.
    # isinstance (rather than an exact type comparison) also accepts list subclasses.
    self.is_multi_output = isinstance(self.shap_values, list)
def shap_values_svm(path_shap):
    """Fit a linear SVM per patient, pickle its SHAP values and save a bar summary plot.

    For every patient index returned by ``load_dataset()`` the same ``SVC``
    instance is refitted on that patient's training data, explained with
    ``shap.SamplingExplainer`` (background: the first 100 training rows), and:

    * the SHAP values are pickled to
      ``../resources/shap_SVM/shap_svm_patient_<i>_SVM.pkl``;
    * a bar-type summary plot is written to ``<path_shap>patient<i>_svm.png``.

    Parameters
    ----------
    path_shap : str
        Prefix for the plot file names. It is concatenated as-is, so it should
        end with a path separator when it names a directory.

    Returns
    -------
    None
    """
    X_train, y_train, X_test, _ = load_dataset()
    num_patient = len(X_train)
    model = SVC(kernel='linear',
                C=1,
                class_weight="balanced",
                gamma='auto',
                probability=True)

    for patient in range(num_patient):
        model.fit(X_train[patient], y_train[patient])
        explainer = shap.SamplingExplainer(model.predict_proba,
                                           X_train[patient].iloc[0:100, :])
        shap_values = explainer.shap_values(X_test[patient])

        pickle_path = '../resources/shap_SVM/shap_svm_patient_{}_SVM.pkl'.format(
            patient)
        with open(pickle_path, 'wb') as f:
            pickle.dump(shap_values, f)

        path_img_bar = path_shap + "patient{}_svm.png".format(patient)
        fig = plt.figure()
        try:
            shap.summary_plot(shap_values,
                              X_test[patient],
                              plot_type="bar",
                              show=False)
            plt.savefig(path_img_bar)
        finally:
            # Close the figure on every iteration; otherwise one open figure
            # per patient accumulates for the lifetime of the process.
            plt.close(fig)
        # Per-class "dot" and "bar" summary plots (one file per entry of
        # shap_values) were previously sketched here but are disabled.
# Split off the target column and convert both frames to plain arrays.
ytest = data_test.pop("y")
xtrain = data_train.values
xtest = data_test.values

# ---------------------------------------------------------------------------
# Naive Bayes
# ---------------------------------------------------------------------------
print('---------------------')
print('Beginning with Naive Bayes')
print('---------------------')

# Train naive bayes on train data
naive_bayes = GaussianNB()
naive_bayes.fit(X=xtrain, y=ytrain)

# We take for true Shapley values the ones we get with sampling with low error
# and probability of error
bayes_explainer = shap.SamplingExplainer(naive_bayes.predict, xtrain)
real_values_bayes = bayes_explainer.shap_values(xtest,
                                                nsamples="variance",
                                                alpha=0.99,
                                                expected_error=0.001)

# Save the classifier and shapley values
np.save("../Data/IME/German/bayes_values", real_values_bayes)
filename = "../Data/IME/German/bayes_model.sav"
# Use a context manager so the file handle is flushed and closed right away.
with open(filename, "wb") as model_file:
    pickle.dump(naive_bayes, model_file)

# ---------------------------------------------------------------------------
# Linear SVM
# ---------------------------------------------------------------------------
print('---------------------')
print('Beginning with Linear SVM')
print('---------------------')
def experiment_main():
    """
    Run through experiments for IME on German.

    * This may take some time given that we iterate through every point in the test set
    * We print out the rate at which features occur in the top three features

    Three adversarial models are trained ("Perturbation", "DropoutVAE",
    "ForestFill"). Each one is then explained with three sampling explainers
    of the same names, and every (adversarial, explainer) summary is printed
    and written to ``../Results/GermanIme/`` as a CSV file.
    """
    generator_specs = {
        "original_dim": original_dim,
        "intermediate_dim": 8,
        "latent_dim": latent_dim,
        "epochs": 100,
        "dropout": 0.2,
        "experiment": "German",
        "feature_names": features,
    }
    variants = ["Perturbation", "DropoutVAE", "ForestFill"]

    print('---------------------')
    print('Training adversarial models....')
    print('---------------------')

    # Adversarial models
    adv_models = dict()
    adv_models["Perturbation"] = Adversarial_IME_Model(
        racist_model_f(), innocuous_model_psi()
    ).train(xtrain, ytrain, feature_names=features, perturbation_multiplier=1)
    adv_models["DropoutVAE"] = Adversarial_IME_Model(
        racist_model_f(), innocuous_model_psi(),
        generator="DropoutVAE", generator_specs=generator_specs
    ).train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs,
            integer_idcs=integer_attributes, perturbation_multiplier=1)
    adv_models["ForestFill"] = Adversarial_IME_Model(
        racist_model_f(), innocuous_model_psi(),
        generator="Forest", generator_specs=generator_specs
    ).train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs,
            integer_idcs=integer_attributes, perturbation_multiplier=1)

    for adversarial in variants:
        adv_model = adv_models[adversarial]

        print('---------------------')
        print(f'Training explainers with adversarial {adversarial}....')
        print('---------------------')

        # Explainers (rebuilt per adversarial model because they wrap its predict)
        adv_kernel_explainers = dict()
        adv_kernel_explainers["Perturbation"] = shap.SamplingExplainer(
            adv_model.predict, xtrain)
        adv_kernel_explainers["DropoutVAE"] = shap.SamplingExplainer(
            adv_model.predict, xtrain,
            generator="DropoutVAE", generator_specs=generator_specs,
            dummy_idcs=dummy_idcs, integer_idcs=integer_attributes,
            instance_multiplier=1000)
        adv_kernel_explainers["ForestFill"] = shap.SamplingExplainer(
            adv_model.predict, xtrain,
            generator="Forest", generator_specs=generator_specs,
            dummy_idcs=dummy_idcs, integer_idcs=integer_attributes)

        for explainer in variants:
            adv_kernel_explainer = adv_kernel_explainers[explainer]
            # The backslash is escaped explicitly: "\D" is not a valid escape
            # sequence and newer Python versions warn about it. The runtime
            # value of the path is unchanged.
            # NOTE(review): the leading "..." (three dots) looks unusual — confirm the path.
            explanations = adv_kernel_explainer.shap_values(
                xtest,
                fill_data=True,
                data_location="...\\Data/german_forest_ime.csv",
                distribution_size=1000)

            # format for display: pair every attribution with its feature name
            formatted_explanations = [
                [(features[i], value) for i, value in enumerate(exp)]
                for exp in explanations
            ]

            print(
                f"IME Ranks and Pct Occurances one unrelated feature, adversarial: {adversarial}, explainer: {explainer}:"
            )
            summary = experiment_summary(formatted_explanations, features)
            print(summary)
            print("Fidelity:", round(adv_model.fidelity(xtest), 2))

            file_name = f"../Results/GermanIme/germanImeSummary_adversarial_{adversarial}_explainer_{explainer}.csv"
            # newline="" is what the csv module requires of files it writes to;
            # without it, extra blank rows appear on platforms that translate "\n".
            with open(file_name, "w", newline="") as output:
                w = csv.writer(output)
                for key, val in summary.items():
                    w.writerow([key, *val])
original_dim = xtrain.shape[1]

# ---------------------------------------------------------------------------
# Naive Bayes
# ---------------------------------------------------------------------------
print('---------------------')
print('Beginning with Naive Bayes')
print('---------------------')

# Load classifier and true Shapley values.
# The file is opened in a context manager so the handle is closed after loading.
# NOTE(review): pickle.load executes arbitrary code from the file; this assumes
# the model file is a trusted, locally produced artifact — confirm.
with open("../Data/IME/CC/bayes_model.sav", 'rb') as model_file:
    bayes = pickle.load(model_file)
shapley_values = np.load("../Data/IME/CC/bayes_values.npy")

generator_specs = {
    "original_dim": original_dim,
    "intermediate_dim": 8,
    "latent_dim": latent_dim,
    "epochs": 100,
    "dropout": 0.3,
    "experiment": "CC",
    "feature_names": features,
}

# One explainer per data generator: plain perturbation, DropoutVAE and Forest.
perturbation_explainer = shap.SamplingExplainer(bayes.predict, xtrain)
dvae_explainer = shap.SamplingExplainer(bayes.predict, xtrain,
                                        generator="DropoutVAE",
                                        generator_specs=generator_specs,
                                        dummy_idcs=dummy_indcs,
                                        integer_idcs=integer_attributes,
                                        instance_multiplier=100)
forest_explainer = shap.SamplingExplainer(bayes.predict, xtrain,
                                          generator="Forest",
                                          generator_specs=generator_specs,
                                          dummy_idcs=dummy_indcs,
                                          integer_idcs=integer_attributes,
                                          instance_multiplier=100)

# Setup experiment
perturbation_explainer.create_experiment_table(shapley_values)
dvae_explainer.create_experiment_table(shapley_values)
forest_explainer.create_experiment_table(shapley_values)

# Experiment for perturbations
perturbation_explainer.shap_values(xtest, nsamples="variance", is_experiment=True)
perturbation_data = perturbation_explainer.get_experiment_dataframe()
# Attribution methods to compare on the same three rows.
explainers = ["kernel", "sampling", "lime", "numeric"]
lime_models = [lime_x, lime_v]
background = shap.sample(X_test, 3)
# Three distinct random positions, kept in ascending order.
# NOTE(review): positions are drawn from X_test's length but used to index X — confirm intended.
choice = X.iloc[np.sort(np.random.choice(X_test.shape[0], 3, replace =False))]

# Collect one frame per explainer and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
attribution_frames = []
for explainer in explainers:
    print(explainer)
    if explainer == "kernel":
        temp_explainer = shap.KernelExplainer(model, background)
        temp_vals = temp_explainer.shap_values(choice)
    elif explainer == "sampling":
        temp_explainer = shap.SamplingExplainer(model, background)
        temp_vals = temp_explainer.shap_values(choice)
    elif explainer == "lime":
        temp_explainer = MyLime(lime_models, choice, mode='regression')
        temp_vals = temp_explainer.attributions(choice)
    elif explainer == "numeric":
        temp_explainer = NumericExplainer(model, duffing.features, duffing.labels, h = 0.001)
        temp_vals = temp_explainer.feature_att(choice)
    else:
        print("not a valid explainer type")
        # Skip: otherwise the previous explainer's values (or an undefined
        # name) would be recorded under this explainer's label.
        continue
    attribution_frames.append(
        duffing.vals_to_df(temp_vals, choice, explainer = explainer, suffix = suffix))

big_df = pd.concat(attribution_frames) if attribution_frames else pd.DataFrame()
def get_instance_explanations(X, Y, subset = 1000, classifier_index = "gradient_boosting", explanation_method = "shap", shap_explainer = "kernel", text = False):
    """
    A set of calls for obtaining aggregates of explanations.

    Parameters
    ----------
    X : pandas.DataFrame, or an iterable of documents when ``text`` is True
        Feature table. Text input is first turned into a TF-IDF table.
    Y : array-like
        Class label per row of ``X``; it is indexed with integer index arrays.
    subset : int
        Number of top features (by mutual information) to keep.
    classifier_index : str
        One of "gradient_boosting", "random_forest", "svm".
    explanation_method : str
        "shap" (cross-validated SHAP aggregates) or "class-ranking"
        (per-class feature variances).
    shap_explainer : str
        One of "kernel", "tree", "gradient", "deep", "sampling", "partition".
        Only used when ``explanation_method == "shap"``.
    text : bool
        Whether ``X`` holds raw text that must be vectorized first.

    Returns
    -------
    tuple
        ``(final_explanations, attribute_vector)``: a dict mapping each class to
        its aggregated explanation, and the names of the pre-selected features.

    Raises
    ------
    ValueError
        If ``explanation_method`` or ``shap_explainer`` is not a supported value.
    """
    # Name of the shap class behind each option; resolved lazily with getattr
    # so that only the requested explainer class has to exist in the installed shap.
    shap_explainer_classes = {
        "kernel": "KernelExplainer",
        "tree": "TreeExplainer",
        "gradient": "GradientExplainer",
        "deep": "DeepExplainer",
        "sampling": "SamplingExplainer",
        "partition": "PartitionExplainer",
    }

    # Fail early and clearly instead of hitting an undefined name after the
    # expensive computation has already run.
    if explanation_method not in ("shap", "class-ranking"):
        raise ValueError("Unknown explanation_method: {!r}".format(explanation_method))
    if explanation_method == "shap" and shap_explainer not in shap_explainer_classes:
        raise ValueError("Unknown shap_explainer: {!r}".format(shap_explainer))

    # No label encoding is applied; the labels are used as given.
    training_scores_encoded = Y

    if text:
        vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
        X_vectorized = vectorizer.fit_transform(X)
        X_vectorized = X_vectorized.todense()
        X = pd.DataFrame(X_vectorized)
        # NOTE(review): newer scikit-learn releases expose get_feature_names_out()
        # instead — confirm against the pinned version.
        X.columns = vectorizer.get_feature_names()

    logging.info("Feature pre-selection via Mutual Information ({}).".format(subset))
    minf = mutual_info_classif(X.values, training_scores_encoded)
    top_k = np.argsort(minf)[::-1][0:subset]
    attribute_vector = X.columns[top_k]
    X = X.astype(float).values[:, top_k]

    skf = StratifiedKFold(n_splits=10)
    performances = []
    enx = 0
    t_start = time.time()
    logging.info("Starting importance estimation .. shape: {}".format(X.shape))

    per_class_explanations = defaultdict(list)
    classifier_mapping = ["gradient_boosting", "random_forest", "svm"]
    classifiers = [GradientBoostingClassifier(),
                   RandomForestClassifier(n_estimators=10),
                   svm.SVC(probability=True)]
    model_dict = dict(zip(classifier_mapping, classifiers))

    if explanation_method == "shap":
        logging.info("Shapley-based explanations.")
        # For the correctly predicted instances, remember shap values and
        # compute the expected value at the end.
        for train_index, test_index in skf.split(X, Y):
            enx += 1
            clf = model_dict[classifier_index]
            x_train = X[train_index]
            x_test = X[test_index]
            y_train = Y[train_index]
            y_test = Y[test_index]

            # perform simple feature ranking
            # NOTE(review): this re-orders the columns separately in every fold,
            # so fold results may no longer line up with `attribute_vector` — confirm.
            minf = mutual_info_classif(x_train, y_train)
            top_k = np.argsort(minf)[::-1][0:subset]
            x_train = x_train[:, top_k]
            x_test = x_test[:, top_k]

            x_train = x_train.astype('float')
            y_train = y_train.astype('float')
            x_test = x_test.astype('float')
            y_test = y_test.astype('float')

            model = clf.fit(x_train, y_train)
            preds = model.predict(x_test)

            # Always defined, so scoring cannot fail on an unset name when a
            # fold happens to contain a single class.
            average = "micro"
            perf = f1_score(preds, y_test, average = average)
            performances.append(perf)
            logging.info("Performance in fold {}, {} (F1)".format(enx, perf))

            # different shap explainers
            explainer_class = getattr(shap, shap_explainer_classes[shap_explainer])
            explainer = explainer_class(model.predict_proba, x_train)

            for unique_class in set(preds):
                # Positions of test rows that were predicted correctly as this class.
                cors_neg = np.array([
                    position
                    for position, (predicted, actual) in enumerate(zip(preds, y_test))
                    if predicted == actual and predicted == unique_class
                ])
                if cors_neg.size != 0:
                    shap_values = explainer.shap_values(x_test[cors_neg], nsamples = 10, verbose = False)
                    stack = np.mean(np.vstack(shap_values), axis = 0)
                    per_class_explanations[unique_class].append(stack)

        # Average the per-fold vectors of every class.
        final_explanations = {}
        for class_name, explanation_set in per_class_explanations.items():
            final_explanations[class_name] = np.mean(np.matrix(explanation_set), axis = 0)
        average_perf = (np.mean(performances), np.std(performances))
        logging.info("Final performance: {}".format(average_perf))

    elif explanation_method == "class-ranking":
        logging.info("Ranking-based explanations.")
        unique_scores = np.unique(training_scores_encoded)
        final_explanations = {}
        for label in unique_scores:
            inx = np.where(training_scores_encoded == label)
            tx = VarianceThreshold().fit(X[inx]).variances_
            final_explanations[str(label)] = tx

    t_end = time.time() - t_start
    logging.info("Time spent on explanation estimation {}s.".format(t_end))
    return (final_explanations, attribute_vector)
def train(argv=None):
    """Train (or restore) the PCA + Keras model, evaluate it, and optionally compute attributions.

    A saved ``model.h5`` / ``pca.model`` pair is reused when it can be loaded.
    Otherwise the learning rate with the best validation AUC is selected, the
    model is retrained on the full training set, and both artifacts are saved.
    Unless ``FLAGS.train_only`` is set, sampling SHAP values and primal effects
    for ten test rows (chosen by ``FLAGS.index``) are loaded from, or computed
    and cached in, ``.npy`` files.

    ``argv`` is not used.
    """
    print('Reading data...')
    (X_train_total, y_train_total,
     X_train, y_train,
     X_vald, y_vald,
     X_test, y_test) = data.load_data()

    learning_rates = [
        0.000001, 0.000005, 0.00001, 0.00005, 0.0001, 0.0005, 0.001
    ]
    num_epochs = 500
    num_components = 500
    batch_size = 128

    try:
        model = tf.keras.models.load_model('model.h5')
        pca_model = joblib.load('pca.model')
        X_train_total_pca = pca_model.transform(X_train_total.values)
        X_test_pca = pca_model.transform(X_test.values)
        print('Restored model from saved checkpoint')
    except OSError:
        # FileNotFoundError is a subclass of OSError, so this covers both.
        print('No saved model found. Training from scratch...')
        print('Finding optimal learning rate...')

        vald_aucs = []
        for learning_rate in learning_rates:
            model = build_model(learning_rate, num_components)
            pca_model = PCA(n_components=num_components)
            X_train_pca = pca_model.fit_transform(X_train.values)
            X_vald_pca = pca_model.transform(X_vald.values)

            model.fit(X_train_pca, y_train,
                      epochs=num_epochs,
                      batch_size=batch_size,
                      verbose=0)

            train_score = model.evaluate(X_train_pca, y_train,
                                         batch_size=batch_size,
                                         verbose=0)
            print(
                'Learning rate: {}, Train Loss: {:.4f}, Train Accuracy: {:.4f}, Train AUC: {:.4f}'
                .format(learning_rate, train_score[0], train_score[1], train_score[2]))

            vald_score = model.evaluate(X_vald_pca, y_vald,
                                        batch_size=batch_size,
                                        verbose=0)
            print('Vald Loss: {:.4f}, Vald Accuracy: {:.4f}, Vald AUC: {:.4f}'.
                  format(vald_score[0], vald_score[1], vald_score[2]))
            vald_aucs.append(vald_score[2])

        print('Training Model...')
        best_learning_rate = learning_rates[np.argmax(vald_aucs)]
        print('Best learning rate was: {}'.format(best_learning_rate))

        # Retrain on the full training set with the selected learning rate.
        model = build_model(best_learning_rate, num_components)
        pca_model = PCA(n_components=num_components)
        X_train_total_pca = pca_model.fit_transform(X_train_total.values)
        X_test_pca = pca_model.transform(X_test.values)
        model.fit(X_train_total_pca, y_train_total,
                  epochs=num_epochs,
                  batch_size=batch_size,
                  verbose=0)
        model.save('model.h5')
        joblib.dump(pca_model, 'pca.model')

    test_score = model.evaluate(X_test_pca, y_test,
                                batch_size=batch_size,
                                verbose=0)
    print('Test Loss: {:.4f}, Test Accuracy: {:.4f}, Test AUC: {:.4f}'.format(
        test_score[0], test_score[1], test_score[2]))

    if not FLAGS.train_only:
        # Each run explains a window of ten consecutive test rows.
        lower_bound = FLAGS.index * 10
        upper_bound = lower_bound + 10
        test_window = X_test_pca[lower_bound:upper_bound]

        print('Getting shap values...')
        shap_path = 'sample_shap{}.npy'.format(FLAGS.index)
        try:
            sample_shap = np.load(shap_path)
        except FileNotFoundError:
            def model_func(x):
                # The explainer needs plain arrays rather than tensors.
                return model(x).numpy()

            sample_explainer = shap.SamplingExplainer(model_func,
                                                      X_train_total_pca)
            sample_shap = sample_explainer.shap_values(test_window)
            np.save(shap_path, sample_shap)

        print('Getting primal effects...')
        primal_path = 'primal_effects{}.npy'.format(FLAGS.index)
        try:
            primal_effects = np.load(primal_path)
        except FileNotFoundError:
            primal_explainer = MarginalExplainer(model, X_train_total_pca,
                                                 X_train_total_pca.shape[0])
            primal_effects = primal_explainer.explain(test_window,
                                                      batch_size=batch_size,
                                                      verbose=True)
            np.save(primal_path, primal_effects)

    print('Done!')
def samplingSHAP(
    model,
    X_train,
    X_test,
    background=None,
    use_probabilities=False,
    nsamples="auto",
    l1_reg=0.0,
    k_means=0,
):
    """
    Perform SamplingSHAP to explain a model.

    Alternative to KernelShap. From the shap documentation:
    "This is an extension of the Shapley sampling values explanation method
    (aka. IME). SamplingExplainer computes SHAP values under the assumption of
    feature independence and is an extension of the algorithm proposed in
    "An Efficient Explanation of Individual Classifications using Game Theory",
    Erik Strumbelj, Igor Kononenko, JMLR 2010. It is a good alternative to
    KernelExplainer when you want to use a large background set (as opposed to
    a single reference value for example)."

    It is important to note that this approximation method of Shapley values
    requires the assumption of feature independence; furthermore, kernelSHAP
    is allegedly more computationally efficient.

    - Lundberg & Lee "A unified approach to interpreting model predictions"
      NIPS (2017)

    See ``shap.SamplingExplainer`` for more details.

    Parameters
    ----------
    model : BaseEstimator
        A fitted sklearn (or other supported) model, with a predict() and/or
        predict_proba() method implemented.
    X_train : pandas.DataFrame or ndarray
        Data set model was trained on. It serves as the background data when
        ``background`` is None.
    X_test : pandas.DataFrame or ndarray
        The rows SHAP values are computed for. You could provide X_train
        again here if you wanted to compute values for that set.
    background : pandas.DataFrame or ndarray
        From the shap documentation:
        ``The background dataset to use for integrating out features. To
        determine the impact of a feature, that feature is set to "missing"
        and the change in the model output is observed. Since most models
        aren't designed to handle arbitrary missing data at test time, we
        simulate "missing" by replacing the feature with the values it takes
        in the background dataset. So if the background dataset is a simple
        sample of all zeros, then we would approximate a feature being missing
        by setting it to zero. Unlike the KernelExplainer this data can be the
        whole training set, even if that is a large set. This is because
        SamplingExplainer only samples from this background dataset.''
        If set to None (default), X_train (optionally summarized, see
        ``k_means``) is used as the background.
    use_probabilities : bool
        Use predict_proba() for model - this should only be used for
        classification tasks.
    nsamples : int or str
        Number of samples to use when computing shap values. See
        ``shap.KernelExplainer.shap_values``.
    l1_reg : float
        Strength of l1 regularization to use computing shap values. See
        ``shap.KernelExplainer.shap_values``. The default of 0 does no
        regularization, since it is unclear whether regularized estimates are
        valid Shapley values.
    k_means : int
        If > 0, use KMeans to summarize X_train, which can greatly accelerate
        the calculation at the cost of accuracy. This summarizes the data set
        with k_means samples weighted by the number of data points they each
        represent. It only has an effect when ``background`` is None, because
        an explicit background is used as given.

    Returns
    -------
    explainer : shap.SamplingExplainer
        The explainer built on the background data.
    shap_values
        The result of ``explainer.shap_values(X_test, ...)``.
    """
    import shap

    if background is None:
        # Summarize only when the result is actually used as the background;
        # with an explicit background the k-means run would be discarded.
        background = shap.kmeans(X_train, k_means) if k_means > 0 else X_train

    predict_fn = model.predict_proba if use_probabilities else model.predict
    explainer = shap.SamplingExplainer(model=predict_fn, data=background)
    shap_values = explainer.shap_values(X_test, nsamples=nsamples, l1_reg=l1_reg)

    return explainer, shap_values
###Change Data Type to Integer X_train=X_train.apply(lambda x: x.astype(int)) X_test=X_test.apply(lambda x: x.astype(int)) # In[ ]: ###SHAP Summary Plot #SHAP Value (impact on model output) for different levels of key variables shap_value=shap.TreeExplainer(clf_best).shap_values(X_train) shap_value=np.array(shap_value[1]) shap.summary_plot(shap_value, X_train) # In[ ]: ###SHAP Explainer, Individual Observation #For each individual observation, how each variable contributed to the final predicted probability shap.initjs() explainer=shap.SamplingExplainer(lambda x: clf_best.predict_proba(x)[:,1], data=X_train) #Apply Explainer to Observation 100 shap_values=explainer.shap_values(X_train.loc[100,:]) #Plot of SHAP Values for Individual Prediction shap.force_plot(explainer.expected_value, shap_values, features=X_train.loc[100,:], feature_names=X_train.columns.tolist())
index=x_train.columns) feat_importances.nlargest(10).sort_values().plot(kind='barh') print( metrics.classification_report(y_train['activityID'], rf_model.predict(x_train))) print( metrics.classification_report(y_test['activityID'], rf_model.predict(x_test))) # pip install shap import shap rf_shap_explainer = shap.SamplingExplainer(rf_model.predict_proba, x_train) rf_shap_vals_train = rf_shap_explainer.shap_values(shap.sample(x_train, 200), approximate=True, nsamples=200) rf_shap_vals_test = rf_shap_explainer.shap_values(shap.sample(x_test, 200), approximate=True, nsamples=200) shap.initjs() shap.summary_plot(rf_shap_vals_train[0], shap.sample(x_train, 200)) shap.summary_plot(rf_shap_vals_test[0], shap.sample(x_test, 200))