Code example #1
File: _sampling.py  Project: zhaijj/shap
def test_null_model_small():
    import numpy as np
    import shap

    explainer = shap.SamplingExplainer(lambda x: np.zeros(x.shape[0]),
                                       np.ones((2, 4)),
                                       nsamples=100)
    shap_values = explainer.shap_values(np.ones((1, 4)))
    assert np.sum(np.abs(shap_values)) < 1e-8
Code example #2
def test_null_model():
    import numpy as np
    import shap

    explainer = shap.SamplingExplainer(lambda x: np.zeros(x.shape[0]), np.ones((2, 10)), nsamples=100)
    shap_values = explainer.shap_values(np.ones((1, 10)))
    assert np.sum(np.abs(shap_values)) < 1e-8
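
Both null-model tests lean on SHAP's additivity (local accuracy) property: the attributions plus the expected value must reconstruct the model output, so a constant model forces every attribution to (numerically) zero. A minimal sketch of that check, assuming the older single-output shap API where shap_values returns an ndarray and expected_value is a scalar:

import numpy as np
import shap

# a constant model: the output never depends on any feature
model = lambda x: np.zeros(x.shape[0])
explainer = shap.SamplingExplainer(model, np.ones((2, 4)), nsamples=100)
x = np.ones((1, 4))
shap_values = explainer.shap_values(x)

# additivity: expected_value + sum(attributions) ~= model(x);
# with a constant model both sides are zero, so the attributions vanish
assert abs(explainer.expected_value + shap_values.sum() - model(x)[0]) < 1e-6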
Code example #3
File: interp.py  Project: polyrand/fastinference
 def __init__(self, learn:TabularLearner, test_data=None, l1_reg='auto', n_samples=128, max_train_samples=100000, **kwargs):
     "Initialize `ShapInterpretation` with a Learner, test_data, `n_samples`, `l1_reg`, and optional **kwargs passed to `shap.SamplingExplainer`"
     self.model = learn.model
     self.dls = learn.dls
     self.class_names = learn.dls.vocab if hasattr(learn.dls, 'vocab') else None # only defined for classification problems
     self.train_data = _prepare_data(learn.dls.train, max_train_samples)
     self.test_data = _prepare_test_data(learn, test_data, n_samples)
     pred_func = partial(_predict, learn)
     self.explainer = shap.SamplingExplainer(pred_func, self.train_data, **kwargs)
     self.shap_vals = self.explainer.shap_values(self.test_data, l1_reg=l1_reg)
     self.is_multi_output = isinstance(self.shap_vals, list)
Code example #4
def test_front_page_model_agnostic():
    import sklearn.svm
    import shap
    from sklearn.model_selection import train_test_split

    # train a SVM classifier
    X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(), test_size=0.2, random_state=0)
    svm = sklearn.svm.SVC(kernel='rbf', probability=True)
    svm.fit(X_train, Y_train)

    # use Sampling SHAP to explain test set predictions
    explainer = shap.SamplingExplainer(svm.predict_proba, X_train, nsamples=100)
    shap_values = explainer.shap_values(X_test)
Code example #5
File: run_image.py  Project: yucunlu/path_explain
def save_attributions(model,
                      samples,
                      labels,
                      background,
                      input_shape,
                      subdir='train'):
    os.makedirs('data/{}/{}/'.format(FLAGS.dataset, subdir), exist_ok=True)

    # main (non-interaction) effects from path_explain's MarginalExplainer
    primal_explainer = MarginalExplainer(model,
                                         background,
                                         nsamples=200,
                                         representation='mobius')
    primal_effects = primal_explainer.explain(samples,
                                              verbose=True,
                                              index_outputs=True,
                                              labels=labels)

    # wrap the model so shap can call it on flattened 2-d inputs
    model_func = lambda x: model(
        np.reshape(x, (x.shape[0], *input_shape)).astype(np.float32)).numpy()

    if FLAGS.background == 'train_dist':
        shap_indices = np.random.choice(background.shape[0],
                                        size=200,
                                        replace=False)
        background = background[shap_indices]

    sample_explainer = shap.SamplingExplainer(
        model_func, np.reshape(background, (background.shape[0], -1)))
    shap_values = sample_explainer.shap_values(
        np.reshape(samples, (FLAGS.num_shap_samples, -1)))
    shap_values = np.stack(shap_values, axis=0)  # (num_classes, num_samples, num_features)
    # for each sample, keep the attribution row of its own label
    shap_values = shap_values[labels, np.arange(shap_values.shape[1]), :]

    #     grad_explainer = shap.GradientExplainer(model, background)
    #     shap_values = grad_explainer.shap_values(samples, nsamples=200, ranked_outputs=1)
    shap_values = np.reshape(shap_values,
                             (FLAGS.num_shap_samples, *input_shape))

    # interaction component = total SHAP attribution minus the main effects
    interaction_effects = shap_values - primal_effects

    np.save(
        'data/{}/{}/primal_effects_{}.npy'.format(FLAGS.dataset, subdir,
                                                  FLAGS.background),
        primal_effects)
    np.save(
        'data/{}/{}/shap_values_{}.npy'.format(FLAGS.dataset, subdir,
                                               FLAGS.background), shap_values)
    np.save(
        'data/{}/{}/interaction_effects_{}.npy'.format(FLAGS.dataset, subdir,
                                                       FLAGS.background),
        interaction_effects)
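
The label-indexing step above (shap_values[labels, np.arange(shap_values.shape[1]), :]) is easy to misread: after np.stack the array is (num_classes, num_samples, num_features), and the fancy indexing keeps, for each sample i, the attribution row of class labels[i]. A self-contained numpy illustration of the same pattern:

import numpy as np

num_classes, num_samples, num_features = 3, 4, 2
stacked = np.arange(num_classes * num_samples * num_features).reshape(
    num_classes, num_samples, num_features)
labels = np.array([2, 0, 1, 2])  # the class assigned to each sample

# for sample i, keep the attribution row belonging to class labels[i]
per_label = stacked[labels, np.arange(num_samples), :]
assert per_label.shape == (num_samples, num_features)
assert (per_label[1] == stacked[0, 1]).all()  # sample 1 uses class 0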
Code example #6
def test_front_page_model_agnostic():
    sklearn = pytest.importorskip('sklearn')
    pytest.importorskip('sklearn.svm')  # load the submodule so sklearn.svm resolves below
    train_test_split = pytest.importorskip('sklearn.model_selection').train_test_split

    # print the JS visualization code to the notebook
    shap.initjs()

    # train a SVM classifier
    X_train, X_test, Y_train, _ = train_test_split(*shap.datasets.iris(), test_size=0.2, random_state=0)
    svm = sklearn.svm.SVC(kernel='rbf', probability=True)
    svm.fit(X_train, Y_train)

    # use Sampling SHAP to explain test set predictions
    explainer = shap.SamplingExplainer(svm.predict_proba, X_train, nsamples=100)
    explainer.shap_values(X_test)
Code example #7
def test_front_page_model_agnostic():
    import sklearn.svm
    from sklearn.model_selection import train_test_split
    import numpy as np
    import shap

    # print the JS visualization code to the notebook
    shap.initjs()

    # train a SVM classifier
    X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(), test_size=0.2, random_state=0)
    svm = sklearn.svm.SVC(kernel='rbf', probability=True)
    svm.fit(X_train, Y_train)

    # use Sampling SHAP to explain test set predictions
    explainer = shap.SamplingExplainer(svm.predict_proba, X_train, nsamples=100)
    shap_values = explainer.shap_values(X_test)

    # plot the SHAP values for the Setosa output of the first instance
    shap.force_plot(explainer.expected_value[0], shap_values[0][0, :], X_test.iloc[0, :])
Code example #8
File: interp.py  Project: harirajeev/fastshap
 def __init__(self,
              learn: TabularLearner,
              test_data=None,
              link='identity',
              l1_reg='auto',
              n_samples=128,
              **kwargs):
     "Initialize `ShapInterpretation` with a Learner, test_data, link, `n_samples`, `l1_reg`, and optional **kwargs"
     self.model = learn.model
     self.dls = learn.dls
     self.class_names = learn.dl.vocab
     self.train_data = pd.merge(learn.dls.cats,
                                learn.dls.conts,
                                left_index=True,
                                right_index=True)
     self.test_data = _prepare_data(learn, test_data, n_samples)
     pred_func = partial(_predict, learn)
     self.explainer = shap.SamplingExplainer(pred_func, self.train_data,
                                             **kwargs)
     self.shap_vals = self.explainer.shap_values(self.test_data,
                                                 l1_reg=l1_reg)
     self.is_multi_output = isinstance(self.shap_vals, list)
Code example #9
 def __init__(self, learn:TabularLearner, test_data:pd.DataFrame=None, link="identity", nsamples="auto", l1_reg="auto", **kwargs):
     """
     Uses SHAP values to interpret the output of a learner for some test data
     
     test_data : None or a pandas dataframe
         The data for which the shap values will be computed.
         By default, 100 random rows of the train data will be used.
     
     link : "identity" or "logit"
         A generalized linear model link to connect the feature importance values to the model
         output. Since the feature importance values, phi, sum up to the model output, it often makes
         sense to connect them to the output with a link function where link(output) = sum(phi).
         If the model output is a probability then the LogitLink link function makes the feature
         importance values have log-odds units.
     
     nsamples : "auto" or int
         Number of times to re-evaluate the model when explaining each prediction.
         More samples lead to lower variance estimates of the SHAP values.
         
     l1_reg : "num_features(int)", "auto", "aic", "bic", or float
         The l1 regularization to use for feature selection (the estimation procedure is based on
         a debiased lasso). The auto option currently uses "aic" when less than 20% of the possible sample
         space is enumerated, otherwise it uses no regularization.
     """        
     # extracts model and data from the learner
     self.model = learn.model
     self.dls = learn.dls
     self.class_names = learn.dl.vocab
     # create an explainer for the model
     train_data = learn.dls.all_cols
     predict_function = partial(_predict, model=learn.model, dls=learn.dls)
     self.explainer = shap.SamplingExplainer(predict_function, train_data, **kwargs)
     #self.explainer = shap.KernelExplainer(predict_function, train_data, **kwargs) # use only for small dataset or sample
     # computes shap values for the test data
     self.test_data = train_data.sample(n=min(100, len(train_data)), replace=False) if test_data is None else learn.dls.test_dl(test_data).all_cols
     self.shap_values = self.explainer.shap_values(self.test_data, nsamples=nsamples, l1_reg=l1_reg)
     # flag used to ensure the proper working of the library
     self.is_multi_output = isinstance(self.shap_values, list)
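
The nsamples note in the docstring above (more samples, lower-variance estimates) can be checked empirically. A minimal sketch, assuming a small sklearn classifier and the standard shap API, that repeats the estimate at two sampling budgets and compares the spread:

import numpy as np
import shap
from sklearn.linear_model import LogisticRegression

X, y = shap.datasets.iris()
model = LogisticRegression(max_iter=1000).fit(X, y)

explainer = shap.SamplingExplainer(model.predict_proba, X)
x = X.iloc[:1]

# the larger sampling budget should produce less run-to-run variation
for nsamples in (100, 2000):
    runs = [np.asarray(explainer.shap_values(x, nsamples=nsamples)) for _ in range(5)]
    spread = np.std(runs, axis=0).mean()
    print("nsamples={}: mean std across repeats = {:.5f}".format(nsamples, spread))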
Code example #10
import pickle

import matplotlib.pyplot as plt
import shap
from sklearn.svm import SVC


def shap_values_svm(path_shap):
    X_train, y_train, X_test, y_test = load_dataset()
    num_patient = len(X_train)
    model = SVC(kernel='linear',
                C=1,
                class_weight="balanced",
                gamma='auto',
                probability=True)
    for patient in range(num_patient):
        model.fit(X_train[patient], y_train[patient])
        explainer = shap.SamplingExplainer(model.predict_proba,
                                           X_train[patient].iloc[0:100, :])
        shap_values = explainer.shap_values(X_test[patient])

        with open(
                '../resources/shap_SVM/shap_svm_patient_{}_SVM.pkl'.format(
                    patient), 'wb') as f:
            pickle.dump(shap_values, f)
        path_img_bar = path_shap + "patient{}_svm.png".format(patient)
        plt.figure()
        shap.summary_plot(shap_values,
                          X_test[patient],
                          plot_type="bar",
                          show=False)
        plt.savefig(path_img_bar)

        # for value in range(len(shap_values)):
        #     path_img_dot = path_shap + "dot_patient{}_class{}.png".format(patient, value)
        #     plt.figure()
        #     shap.summary_plot(shap_values[value], X_test[patient], plot_type="dot", show=False)
        #     plt.savefig(path_img_dot)
        #     plt.close()
        #     plt.figure()
        #     path_img_bar = path_shap + "bar_patient{}_class{}.png".format(patient, value)
        #     shap.summary_plot(shap_values[value], X_test[patient], plot_type="bar", show=False)
        #     plt.savefig(path_img_bar)
        #     plt.close()
    return
Code example #11
ytest = data_test.pop("y")
xtrain = data_train.values
xtest = data_test.values
"""
Naive Bayes
"""
print('---------------------')
print('Beginning with Naive Bayes')
print('---------------------')

# Train naive bayes on train data
naive_bayes = GaussianNB()
naive_bayes.fit(X=xtrain, y=ytrain)

# Treat the sampling-based Shapley values (low expected error, high confidence) as the ground truth
bayes_explainer = shap.SamplingExplainer(naive_bayes.predict, xtrain)
real_values_bayes = bayes_explainer.shap_values(xtest,
                                                nsamples="variance",
                                                alpha=0.99,
                                                expected_error=0.001)

# Save the classifier and shapley values
np.save("../Data/IME/German/bayes_values", real_values_bayes)
filename = "../Data/IME/German/bayes_model.sav"
pickle.dump(naive_bayes, open(filename, "wb"))
"""
Linear SVM
"""
print('---------------------')
print('Beginning with Linear SVM')
print('---------------------')
Code example #12
def experiment_main():
    """
	Run through experiments for IME on German,
	* This may take some time given that we iterate through every point in the test set
	* We print out the rate at which features occur in the top three features
	"""

    generator_specs = {"original_dim": original_dim, "intermediate_dim": 8, "latent_dim": latent_dim, "epochs": 100, "dropout": 0.2,\
         "experiment": "German", "feature_names": features}

    print('---------------------')
    print('Training adversarial models....')
    print('---------------------')

    # Adversarial models
    adv_models = dict()
    adv_models["Perturbation"] = Adversarial_IME_Model(racist_model_f(), innocuous_model_psi()).train(xtrain, ytrain,\
                 feature_names=features, perturbation_multiplier=1)
    adv_models["DropoutVAE"] = Adversarial_IME_Model(racist_model_f(), innocuous_model_psi(), generator = "DropoutVAE", generator_specs = generator_specs).\
                   train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs, integer_idcs=integer_attributes, perturbation_multiplier=1)
    adv_models["ForestFill"] = Adversarial_IME_Model(racist_model_f(), innocuous_model_psi(), generator = "Forest", generator_specs = generator_specs).\
                   train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs, integer_idcs=integer_attributes, perturbation_multiplier=1)

    for adversarial in ["Perturbation", "DropoutVAE", "ForestFill"]:
        adv_model = adv_models[adversarial]

        print('---------------------')
        print(f'Training explainers with adversarial {adversarial}....')
        print('---------------------')

        # Explainers
        adv_kernel_explainers = dict()
        adv_kernel_explainers["Perturbation"] = shap.SamplingExplainer(
            adv_model.predict, xtrain)
        adv_kernel_explainers["DropoutVAE"] = shap.SamplingExplainer(adv_model.predict, xtrain, generator="DropoutVAE", generator_specs=generator_specs,\
              dummy_idcs=dummy_idcs, integer_idcs=integer_attributes, instance_multiplier = 1000)
        adv_kernel_explainers["ForestFill"] = shap.SamplingExplainer(adv_model.predict, xtrain, generator="Forest", generator_specs=generator_specs,\
              dummy_idcs=dummy_idcs, integer_idcs=integer_attributes)

        for explainer in ["Perturbation", "DropoutVAE", "ForestFill"]:
            adv_kernel_explainer = adv_kernel_explainers[explainer]
            explanations = adv_kernel_explainer.shap_values(
                xtest,
                fill_data=True,
                data_location="...\Data/german_forest_ime.csv",
                distribution_size=1000)

            # format for display
            formatted_explanations = []
            for exp in explanations:
                formatted_explanations.append([(features[i], exp[i])
                                               for i in range(len(exp))])

            print(
                f"IME Ranks and Pct Occurances one unrelated feature, adversarial: {adversarial}, explainer: {explainer}:"
            )
            summary = experiment_summary(formatted_explanations, features)
            print(summary)
            print("Fidelity:", round(adv_model.fidelity(xtest), 2))

            file_name = f"../Results/GermanIme/germanImeSummary_adversarial_{adversarial}_explainer_{explainer}.csv"
            with open(file_name, "w") as output:
                w = csv.writer(output)
                for key, val in summary.items():
                    w.writerow([key] + [pair for pair in val])
Code example #13
original_dim = xtrain.shape[1]
"""
Naive Bayes
"""
print('---------------------')
print('Beginning with Naive Bayes')
print('---------------------')

# Load classifier and true Shapley values
bayes = pickle.load(open("../Data/IME/CC/bayes_model.sav", 'rb'))
shapley_values = np.load("../Data/IME/CC/bayes_values.npy")

generator_specs = {"original_dim": original_dim, "intermediate_dim": 8, "latent_dim": latent_dim, "epochs": 100, "dropout": 0.3,\
                    "experiment": "CC", "feature_names": features}

perturbation_explainer = shap.SamplingExplainer(bayes.predict, xtrain)
dvae_explainer = shap.SamplingExplainer(bayes.predict, xtrain, generator="DropoutVAE", generator_specs=generator_specs,\
                            dummy_idcs=dummy_indcs, integer_idcs=integer_attributes, instance_multiplier=100)
forest_explainer = shap.SamplingExplainer(bayes.predict, xtrain, generator="Forest", generator_specs=generator_specs,\
                            dummy_idcs=dummy_indcs, integer_idcs=integer_attributes, instance_multiplier=100)

# Setup experiment
perturbation_explainer.create_experiment_table(shapley_values)
dvae_explainer.create_experiment_table(shapley_values)
forest_explainer.create_experiment_table(shapley_values)

# Experiment for perturbations
perturbation_explainer.shap_values(xtest,
                                   nsamples="variance",
                                   is_experiment=True)
perturbation_data = perturbation_explainer.get_experiment_dataframe()
Code example #14
    explainers = ["kernel", "sampling", "lime", "numeric"]
    lime_models = [lime_x, lime_v]

    background = shap.sample(X_test, 3)
    choice = X.iloc[np.sort(np.random.choice(X_test.shape[0], 3, replace=False))]


    big_df = pd.DataFrame()
    for explainer in explainers:
        print(explainer)
        if explainer == "kernel":
            temp_explainer = shap.KernelExplainer(model, background)
            temp_vals = temp_explainer.shap_values(choice)
        elif explainer == "sampling":
            temp_explainer = shap.SamplingExplainer(model, background)
            temp_vals = temp_explainer.shap_values(choice)
        elif explainer == "lime":
            temp_explainer = MyLime(lime_models, choice, mode='regression')
            temp_vals = temp_explainer.attributions(choice)
        elif explainer == "numeric":
            temp_explainer = NumericExplainer(model, duffing.features, duffing.labels, h = 0.001)
            temp_vals = temp_explainer.feature_att(choice)
        else:
            print("not a valid explainer type")
            continue
        big_df = big_df.append(duffing.vals_to_df(temp_vals, choice, explainer=explainer, suffix=suffix))
Code example #15
def get_instance_explanations(X, Y, subset = 1000, classifier_index = "gradient_boosting", explanation_method = "shap", shap_explainer = "kernel", text = False):
    
    """
    A set of calls for obtaining aggregates of explanations.
    """
    ## label encoding
    #lab_enc = preprocessing.LabelEncoder()
    #training_scores_encoded = lab_enc.fit_transform(Y)
    # TODO: why is a label encoder needed here?

    training_scores_encoded = Y
    if text:
        vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
        X_vectorized = vectorizer.fit_transform(X).todense()
        X = pd.DataFrame(X_vectorized)
        X.columns = vectorizer.get_feature_names()
    logging.info("Feature pre-selection via Mutual Information ({}).".format(subset))
    #X = X.iloc[:,1:100]
    minf = mutual_info_classif(X.values, training_scores_encoded)
    top_k = np.argsort(minf)[::-1][0:subset]
    attribute_vector = X.columns[top_k]
    X = X.astype(float).values[:,top_k]
    skf = StratifiedKFold(n_splits=10)
    performances = []
    enx = 0
    t_start = time.time()
    logging.info("Starting importance estimation ..  shape: {}".format(X.shape))

    per_class_explanations = defaultdict(list)
    classifier_mapping = ["gradient_boosting", "random_forest", "svm"]
    classifiers = [GradientBoostingClassifier(), RandomForestClassifier(n_estimators=10), svm.SVC(probability=True)] ## spyct.Model()

    model_dict = dict(zip(classifier_mapping, classifiers))
    
    if explanation_method == "shap":
        logging.info("Shapley-based explanations.")
        ## for the correctly predicted instances, remember shap values and compute the expected value at the end.
        for train_index, test_index in skf.split(X, Y):
            enx+=1
            clf = model_dict[classifier_index]
            x_train = X[train_index]
            x_test = X[test_index]
            
            y_train = Y[train_index]
            y_test = Y[test_index]

            ## perform simple feature ranking
            minf = mutual_info_classif(x_train, y_train)
            top_k = np.argsort(minf)[::-1][0:subset]
            x_train = x_train[:,top_k]
            x_test = x_test[:,top_k]

            x_train = x_train.astype('float')
            y_train = y_train.astype('float')
            x_test = x_test.astype('float')
            y_test = y_test.astype('float')

            model = clf.fit(x_train, y_train)
            preds = model.predict(x_test)
            # micro-average for multiclass; fall back to binary otherwise
            average = "micro" if len(np.unique(y_train)) > 1 else "binary"
            perf = f1_score(y_test, preds, average=average)
            performances.append(perf)
            logging.info("Performance in fold {}, {} (F1)".format(enx, perf))
            ## different shap explainers
            if shap_explainer == "kernel":
                explainer = shap.KernelExplainer(model.predict_proba, x_train)
            if shap_explainer == "tree":
                explainer = shap.TreeExplainer(model.predict_proba, x_train)
            if shap_explainer == "gradient":
                explainer = shap.GradientExplainer(model.predict_proba, x_train)
            if shap_explainer == "deep":
                explainer = shap.DeepExplainer(model.predict_proba, x_train)
            if shap_explainer == "sampling":
                explainer = shap.SamplingExplainer(model.predict_proba, x_train)
            if shap_explainer == "partition":
                explainer = shap.PartitionExplainer(model.predict_proba, x_train)

            for unique_class in set(preds):
                cors_neg = np.array([enx for enx, pred_tuple in enumerate(zip(preds, y_test)) if pred_tuple[0] == pred_tuple[1] and pred_tuple[0] == unique_class])
                if cors_neg.size != 0:
                    shap_values = explainer.shap_values(x_test[cors_neg], nsamples = 10, verbose = False)
                    stack = np.mean(np.vstack(shap_values),axis = 0)
                    per_class_explanations[unique_class].append(stack)

        final_explanations = {}
        for class_name, explanation_set in per_class_explanations.items():
            final_explanations[class_name] = np.mean(np.vstack(explanation_set), axis=0)
        average_perf = (np.mean(performances), np.std(performances))
        logging.info("Final performance: {}".format(average_perf))

    elif explanation_method == "class-ranking":
        logging.info("Ranking-based explanations.")
        unique_scores = np.unique(training_scores_encoded)
        final_explanations = {}
        for label in unique_scores:
            inx = np.where(training_scores_encoded == label)
            tx = VarianceThreshold().fit(X[inx]).variances_
            final_explanations[str(label)] = tx

    t_end = time.time() - t_start
    logging.info("Time spent on explanation estimation {}s.".format(t_end))


    return (final_explanations, attribute_vector)
Code example #16
File: train.py  Project: yucunlu/path_explain
def train(argv=None):
    print('Reading data...')
    X_train_total, y_train_total, \
    X_train, y_train, \
    X_vald,  y_vald, \
    X_test,  y_test = data.load_data()

    learning_rates = [
        0.000001, 0.000005, 0.00001, 0.00005, 0.0001, 0.0005, 0.001
    ]
    num_epochs = 500
    num_components = 500
    batch_size = 128

    try:
        model = tf.keras.models.load_model('model.h5')
        pca_model = joblib.load('pca.model')

        X_train_total_pca = pca_model.transform(X_train_total.values)
        X_test_pca = pca_model.transform(X_test.values)

        print('Restored model from saved checkpoint')
    except (OSError, FileNotFoundError):
        vald_aucs = []
        print('No saved model found. Training from scratch...')
        print('Finding optimal learning rate...')
        for learning_rate in learning_rates:
            model = build_model(learning_rate, num_components)
            pca_model = PCA(n_components=num_components)
            X_train_pca = pca_model.fit_transform(X_train.values)
            X_vald_pca = pca_model.transform(X_vald.values)

            model.fit(X_train_pca,
                      y_train,
                      epochs=num_epochs,
                      batch_size=128,
                      verbose=0)
            score = model.evaluate(X_train_pca,
                                   y_train,
                                   batch_size=128,
                                   verbose=0)
            print(
                'Learning rate: {}, Train Loss: {:.4f}, Train Accuracy: {:.4f}, Train AUC: {:.4f}'
                .format(learning_rate, score[0], score[1], score[2]))
            score = model.evaluate(X_vald_pca,
                                   y_vald,
                                   batch_size=128,
                                   verbose=0)
            print('Vald Loss: {:.4f}, Vald Accuracy: {:.4f}, Vald AUC: {:.4f}'.
                  format(score[0], score[1], score[2]))
            vald_aucs.append(score[2])

        print('Training Model...')
        best_auc_index = np.argmax(vald_aucs)
        print('Best learning rate was: {}'.format(
            learning_rates[best_auc_index]))
        model = build_model(learning_rates[best_auc_index], num_components)
        pca_model = PCA(n_components=num_components)
        X_train_total_pca = pca_model.fit_transform(X_train_total.values)
        X_test_pca = pca_model.transform(X_test.values)

        model.fit(X_train_total_pca,
                  y_train_total,
                  epochs=num_epochs,
                  batch_size=128,
                  verbose=0)
        model.save('model.h5')
        joblib.dump(pca_model, 'pca.model')

    score = model.evaluate(X_test_pca, y_test, batch_size=128, verbose=0)
    print('Test Loss: {:.4f}, Test Accuracy: {:.4f}, Test AUC: {:.4f}'.format(
        score[0], score[1], score[2]))

    if not FLAGS.train_only:
        lower_bound = FLAGS.index * 10
        upper_bound = lower_bound + 10
        print('Getting shap values...')
        try:
            sample_shap = np.load('sample_shap{}.npy'.format(FLAGS.index))
        except FileNotFoundError:
            model_func = lambda x: model(x).numpy()
            sample_explainer = shap.SamplingExplainer(model_func,
                                                      X_train_total_pca)
            sample_shap = sample_explainer.shap_values(
                X_test_pca[lower_bound:upper_bound])
            np.save('sample_shap{}.npy'.format(FLAGS.index), sample_shap)

        print('Getting primal effects...')
        try:
            primal_effects = np.load('primal_effects{}.npy'.format(
                FLAGS.index))
        except FileNotFoundError:
            primal_explainer = MarginalExplainer(model, X_train_total_pca,
                                                 X_train_total_pca.shape[0])
            primal_effects = primal_explainer.explain(
                X_test_pca[lower_bound:upper_bound],
                batch_size=128,
                verbose=True)
            np.save('primal_effects{}.npy'.format(FLAGS.index), primal_effects)
        print('Done!')
Code example #17
File: model.py  Project: mahynski/ml_inspector
    def samplingSHAP(
        model,
        X_train,
        X_test,
        background=None,
        use_probabilities=False,
        nsamples="auto",
        l1_reg=0.0,
        k_means=0,
    ):
        """
        Perform SamplingSHAP to explain a model.

        Alternative to KernelShap.  From shap documentation: "This is an
        extension of the Shapley sampling values explanation method (aka. IME).
        SamplingExplainer computes SHAP values under the assumption of feature
        independence and is an extension of the algorithm proposed in "An
        Efficient Explanation of Individual Classifications using Game Theory",
        Erik Strumbelj, Igor Kononenko, JMLR 2010. It is a good alternative to
        KernelExplainer when you want to use a large background set (as
        opposed to a single reference value for example)."

        It is important to note that this approximation method of Shapley
        values requires the assumption of feature independence; furthermore,
        kernelSHAP is allegedly more computationally efficient.

        - Lundberg & Lee "A unified approach to interpreting model predictions"
          NIPS (2017)

        See ``shap.SamplingExplainer`` for more details.

        Parameters
        ----------
        model : BaseEstimator
            A fitted sklearn (or other supported) model, with a predict()
            and/or predict_proba() method implemented.
        X_train : pandas.DataFrame or ndarray
            Data set model was trained on.  The explainer is fit using this.
        X_test : pandas.DataFrame or ndarray
            The explainer predicts SHAP values for these results.  In reality,
            you could provide X_train again here if you wanted to compute
            values for that set.
        background : pandas.DataFrame or ndarray
            From shap documentation: ``The background dataset to use for
            integrating out features. To determine the impact of a feature,
            that feature is set to "missing" and the change in the model output
            is observed. Since most models aren't designed to handle arbitrary
            missing data at test time, we simulate "missing" by replacing the
            feature with the values it takes in the background dataset. So if
            the background dataset is a simple sample of all zeros, then we
            would approximate a feature being missing by setting it to zero.
            Unlike the KernelExplainer this data can be the whole training set,
            even if that is a large set. This is because SamplingExplainer only
            samples from this background dataset.'' If set to None (default)
            this uses X as the background also.
        use_probabilities : bool
            Use predict_proba() for model - this should only be used for
            classification tasks.
        nsamples : int or str
            Number of samples to use when computing shap values.  See
            ``shap.KernelExplainer.shap_values``.
        l1_reg : float
            Strength of l1 regularization to use computing shap values. See
            ``shap.KernelExplainer.shap_values``. Default of 0 does not do
            regularization since I'm not sure this computes valid Shapley
            values.
        k_means : int
            If > 0, use KMeans to summarize the dataset which can greatly
            accelerate the calculation at the cost of accuracy.  This
            summarizes a dataset with k_means samples weighted by the number of
            data points they each represent.
        """
        import shap

        if k_means > 0:
            X_train = shap.kmeans(X_train, k_means)

        if background is None:
            background = X_train

        explainer = shap.SamplingExplainer(
            model=(model.predict_proba
                   if use_probabilities else model.predict),
            data=background,
        )
        shap_values = explainer.shap_values(X_test,
                                            nsamples=nsamples,
                                            l1_reg=l1_reg)

        return explainer, shap_values
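
A hedged usage sketch for the wrapper above. The enclosing class from model.py is not shown in this snippet, so the sketch (hypothetically) calls samplingSHAP as if it were exposed as a plain function or static method; the model and data are stand-ins:

import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = shap.datasets.iris()
X_train, X_test, y_train, _ = train_test_split(X, y, random_state=0)
model = RandomForestClassifier(n_estimators=50).fit(X_train, y_train)

# explain class probabilities for a handful of test rows,
# using the full training set as the background distribution
explainer, shap_values = samplingSHAP(
    model,
    X_train,
    X_test.iloc[:5],
    use_probabilities=True,
    nsamples=200,
)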
Code example #18
###Change Data Type to Integer
X_train=X_train.apply(lambda x: x.astype(int))
X_test=X_test.apply(lambda x: x.astype(int))




###SHAP Summary Plot
#SHAP Value (impact on model output) for different levels of key variables
shap_value=shap.TreeExplainer(clf_best).shap_values(X_train)
shap_value=np.array(shap_value[1])
shap.summary_plot(shap_value, X_train)




###SHAP Explainer, Individual Observation
#For each individual observation, how each variable contributed to the final predicted probability
shap.initjs()
explainer=shap.SamplingExplainer(lambda x: clf_best.predict_proba(x)[:,1], data=X_train)

#Apply Explainer to Observation 100
shap_values=explainer.shap_values(X_train.loc[100,:])

#Plot of SHAP Values for Individual Prediction
shap.force_plot(explainer.expected_value, shap_values, features=X_train.loc[100,:],
               feature_names=X_train.columns.tolist())

Code example #19
feat_importances = pd.Series(rf_model.feature_importances_,
                             index=x_train.columns)
feat_importances.nlargest(10).sort_values().plot(kind='barh')

print(
    metrics.classification_report(y_train['activityID'],
                                  rf_model.predict(x_train)))

print(
    metrics.classification_report(y_test['activityID'],
                                  rf_model.predict(x_test)))

# pip install shap

import shap

rf_shap_explainer = shap.SamplingExplainer(rf_model.predict_proba, x_train)

# sample once and reuse, so the SHAP values line up with the rows plotted below
x_train_sample = shap.sample(x_train, 200)
x_test_sample = shap.sample(x_test, 200)

rf_shap_vals_train = rf_shap_explainer.shap_values(x_train_sample, nsamples=200)

rf_shap_vals_test = rf_shap_explainer.shap_values(x_test_sample, nsamples=200)

shap.initjs()

shap.summary_plot(rf_shap_vals_train[0], x_train_sample)

shap.summary_plot(rf_shap_vals_test[0], x_test_sample)