Example no. 1
# Summarizing the data with k-means is a trick to speed up processing
 
"""
Rather than use the whole training set to estimate expected values, we summarize it
with a set of weighted k-means centroids, each weighted by the number of points it
represents. Running without k-means took 1 hr 6 min 7 sec; running with k-means took
2 min 47 sec, and Boston Housing is a small dataset. On larger data, running SHAP on
models that require the Kernel method becomes prohibitive without such a summary.
"""
 
# build the kmeans summary
X_train_summary = shap.kmeans(X_train, 10)
 
# using the kmeans summary
t0 = time.time()
explainerKNN = shap.KernelExplainer(knn.predict, X_train_summary)
shap_values_KNN_test = explainerKNN.shap_values(X_test)
t1 = time.time()
elapsed = t1 - t0  # renamed from `timeit`, which shadows the stdlib module
elapsed
 
# without kmeans: a test run took 3967.6232330799103 seconds
"""
t0 = time.time()
explainerKNN = shap.KernelExplainer(knn.predict, X_train)
shap_values_KNN_test = explainerKNN.shap_values(X_test)
t1 = time.time()
elapsed = t1 - t0
elapsed
"""

j = 52
# now we can plot the SHAP explainer
shap.force_plot(explainerKNN.expected_value, shap_values_KNN_test[j], X_test.iloc[[j]])
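
# If weighted centroids feel too coarse, shap.sample is another stock way to shrink the
# background set. A minimal self-contained sketch (synthetic data; the sizes here are
# arbitrary choices, not values from the snippet above):
import numpy as np
import shap

rng = np.random.default_rng(0)
X_big = rng.normal(size=(5000, 10))     # stand-in for a real training set

X_bg_sample = shap.sample(X_big, 100)   # keeps 100 raw rows, randomly chosen
X_bg_kmeans = shap.kmeans(X_big, 10)    # 10 weighted k-means centroids, as used above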
Example no. 2
def calculate_predictions(img_orig,
                          original_class,
                          explainer_name,
                          num_features,
                          strategy,
                          sigma,
                          verbose=0):
    # segment the image so we don't have to explain every pixel
    segments_slic = slic(img_orig, n_segments=49, compactness=1000, sigma=3)

    def f(z):
        return model.predict(mask_image(z, segments_slic, img_orig, 255))
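    # f maps a binary mask z over the 49 superpixels to the model's prediction on the
    # correspondingly masked image; this set function is what KernelExplainer perturbs,
    # so each superpixel acts as one "feature"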

    new_class_lime = None
    new_prediction_lime = None
    global prev_shap_values
    global prev_img_orig
    global prev_explanation
    global prev_num_features
    if (explainer_name in ('shap', 'random', 'grad')
            and (img_orig == prev_img_orig).all()):
        if verbose:
            print("Hitting cache, returning prev values")
        shap_values = prev_shap_values
    elif explainer_name == 'shap':
        # use Kernel SHAP to explain the network's predictions
        explainer = shap.KernelExplainer(f, np.zeros((1, 49)))
        shap_values = explainer.shap_values(np.ones(
            (1, 49)), nsamples=1000)  # runs model 1000 times
    elif explainer_name == 'grad':
        # build a Grad-CAM heatmap and reuse it in place of SHAP values
        last_conv_layer_name = "conv2d"
        classifier_layer_names = [
            "global_average_pooling2d",
            "dense_1",
        ]

        heatmap = grad.make_gradcam_heatmap(img_orig.reshape(1, 250, 250, 3),
                                            model, last_conv_layer_name,
                                            classifier_layer_names)

        if verbose:
            grad.test_drive_grad(img_orig)
            plt.imshow(heatmap)
            plt.show()

        shap_values = []
        for i in range(5):
            shap_values.append(
                [[item for sublist in heatmap for item in sublist]])
        shap_values = np.array(shap_values)
        shap_values = shap_values / np.max(shap_values)

    elif explainer_name == 'random':
        shap_values = [
            np.asarray([[random.uniform(0, 1) for _ in range(50)]])
            for _ in range(5)
        ]
    elif explainer_name == 'lime':
        if ((img_orig == prev_img_orig).all()):
            explanation = prev_explanation
            if verbose:
                print("Hitting LIME cache, returning prev values")
        else:
            explainer = lime_image.LimeImageExplainer()
            explanation = explainer.explain_instance(img_orig.astype("double"),
                                                     model.predict,
                                                     num_samples=1000)
            prev_explanation = explanation
            prev_img_orig = img_orig

        if (num_features == prev_num_features):
            new_class_lime = None
            new_prediction_lime = None
        else:
            if strategy == "top":
                lime_img, _ = explanation.get_image_and_mask(
                    explanation.top_labels[original_class],
                    positive_only=True,
                    negative_only=False,
                    hide_rest=True,
                    num_features=num_features,
                    min_weight=0)
            else:
                lime_img, _ = explanation.get_image_and_mask(
                    explanation.top_labels[original_class],
                    positive_only=False,
                    negative_only=True,
                    num_features=1000,
                    hide_rest=True)

            if verbose:
                plt.matshow(lime_img)
                plt.show()

            new_class_lime = model.predict_classes(
                lime_img.reshape(1, 250, 250, 3))[0]
            new_prediction_lime = model.predict(
                lime_img.reshape(1, 250, 250, 3))[0][original_class]
        prev_num_features = num_features

        _, mask = explanation.get_image_and_mask(
            explanation.top_labels[original_class],
            positive_only=True,
            negative_only=False,
            hide_rest=True,
            num_features=num_features,
            min_weight=0)
        shap_values = convert_to_shap_values(mask, verbose)

    prev_shap_values = shap_values.copy()
    prev_img_orig = img_orig.copy()
    # get the top predictions from the model
    preds = model.predict(np.expand_dims(img_orig.copy(), axis=0))
    top_preds = np.argsort(-preds)
    inds = top_preds[0]

    #     show_cut_image(shap_values, img_orig, num_features)

    shap_values = [np.where(a < 0, 0, a) for a in shap_values]
    shap_values = extract_top_ten(shap_values, num_features)

    if strategy == 'rest':
        shap_values = (shap_values - 1) * -1

    masked_image = mask_image_with_noise(shap_values[inds[0]], segments_slic,
                                         img_orig, sigma)

    if verbose:
        plt.imshow(masked_image[0])
        plt.show()

    new_class = model.predict_classes(masked_image)[0]
    new_prediction = model.predict(masked_image)[0][original_class]
    return new_class, new_prediction, new_class_lime, new_prediction_lime
Example no. 3
def test_null_model_small():
    explainer = shap.KernelExplainer(lambda x: np.zeros(x.shape[0]), np.ones((2, 4)), nsamples=100)
    e = explainer.explain(np.ones((1, 4)))
    assert np.sum(np.abs(e.effects)) < 1e-8
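# Why the assertion holds: local accuracy forces the attributions to sum to
# f(x) - E[f(X)] = 0 - 0, and with no signal anywhere the kernel solution places
# every individual attribution at (numerically) zero.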
Example no. 4
# For context, we'll look at the raw predictions before looking at the SHAP values
my_model.predict_proba(data_for_prediction_array)

# Create object that can calculate shap values
explainer = shap.TreeExplainer(my_model)
# Calculate SHAP values
shap_values = explainer.shap_values(data_for_prediction)

# It's cumbersome to review raw arrays, but the shap package has a nice way to visualize the results.
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)

# Here is an example using KernelExplainer to get similar results.
# The results aren't identical because KernelExplainer gives an approximate result.
# use Kernel SHAP to explain test set predictions
k_explainer = shap.KernelExplainer(my_model.predict_proba, train_X)
# Calculate SHAP values
k_shap_values = k_explainer.shap_values(data_for_prediction)
shap.force_plot(k_explainer.expected_value[1], k_shap_values[1], data_for_prediction)
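
# A quick way to quantify the approximation gap mentioned above (a sketch, assuming
# shap_values and k_shap_values are the arrays computed in this snippet):
import numpy as np

gap = np.abs(np.asarray(shap_values[1]) - np.asarray(k_shap_values[1])).max()
print("max |TreeExplainer - KernelExplainer| difference =", gap)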




# Alternatively, to calculate and show SHAP values for one prediction:
#
#     import shap  # package used to calculate SHAP values
#
#     data_for_prediction = val_X.iloc[0, :]  # use 1 row of data here; could use multiple rows if desired
Example no. 5
import sklearn.linear_model  # provides LogisticRegression used below
from sklearn.model_selection import train_test_split
import numpy as np
import time
import shap
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# In[2]:

X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(),
                                                    test_size=0.2,
                                                    random_state=0)


def print_accuracy(f):
    print("Accuracy = {0}%".format(100 * np.sum(f(X_test) == Y_test) /
                                   len(Y_test)))
    time.sleep(0.5)


shap.initjs()

# In[9]:

linear_lr = sklearn.linear_model.LogisticRegression()
linear_lr.fit(X_train, Y_train)
print_accuracy(linear_lr.predict)
explainer = shap.KernelExplainer(linear_lr.predict_proba, X_train)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)
Example no. 6
# In[23]:

joblib.dump(regression, os.path.join(MODEL_FOLDER, 'regression_model'))

# ## Model explanation

# In[24]:

import shap

# In[25]:

# create kernel explainer object
explainer = shap.KernelExplainer(model=regression.predict,
                                 data=test[X_FEATURES][:40],
                                 link="identity")

# In[28]:

idx = 22

shap_value_single = explainer.shap_values(
    X=test[X_FEATURES].iloc[idx],
    nsamples=100,
)
shap.initjs()
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_value_single[0],
    features=test[X_FEATURES].iloc[idx],
)
Example no. 7
pdp.pdp_interact_plot(pdp_data, ['x4_plot', 'x5_plot'], plot_type='contour',
                      x_quantile=False, plot_pdp=False,
                      which_classes=None, ncols=2,
                      plot_params=None)

#%%
#y = x1**2 + x2**3 + x3**4 + 2*x4**2 - x5**3
i = 1.3
print(i, 2*i, 3*i, 4*i, -5*i)
print(i + 2*i + 3*i + 4*i - 5*i)
import shap
data_for_prediction = np.array([[i, i, i, i, i]])
print(model.predict(data_for_prediction))

k_explainer = shap.KernelExplainer(model.predict, X)
k_shap_values = k_explainer.shap_values(data_for_prediction)
# for a single-output regressor, expected_value is a scalar and shap_values has one row
shap.force_plot(k_explainer.expected_value, k_shap_values[0], data_for_prediction,
                matplotlib=True)

#explainer = shap.TreeExplainer(model)
l_explainer = shap.LinearExplainer(model, X_train)

#shap_values = explainer.shap_values(data_for_prediction)
l_shap_values = l_explainer.shap_values(data_for_prediction)

#shap.initjs()
shap.force_plot(l_explainer.expected_value, l_shap_values[0], data_for_prediction,
                matplotlib=True)

#%%
Example no. 8
        rounded=True,
        out_file=None)
    graph = graphviz.Source(dot_data)
    graph.render("iris-tree", format="png")

if method_flag == 7:
    # importance of the explanatory variables
    x_importances = pd.DataFrame(clf.feature_importances_,
                                 index=pd.DataFrame(train_x).columns,
                                 columns=['importance'])
    x_importances.to_csv(
        'rf_x_importances.csv')  # save to CSV; an existing file with the same name will be overwritten

# SHAP visualization
if method_flag in (3, 4, 5, 9):
    explainer = shap.KernelExplainer(clf.predict, train_x)
    shap_values = explainer.shap_values(train_x.loc[[0]])
    shap.force_plot(explainer.expected_value,
                    shap_values[0],
                    train_x.loc[[0]],
                    matplotlib=True)
"""
    shap_values = explainer.shap_values(train_x)
    shap.summary_plot(shap_values, features = train_x,
                #plot_type = 'bar'
                )

    shap.dependence_plot(ind="RM", shap_values=shap_values, features = train_x,
                    interaction_index = 'LSTAT',
                    )
"""
Example no. 9
plt.figure(figsize=(10, 4))
# plt.set_title('Training & Validation Loss')
no_of_features = list(range(0, len(avg)))
plt.plot(no_of_features,
         avg,
         color='navy',
         marker='o',
         linestyle='-',
         label='Confidence level change vs no of features')
plt.legend(loc='best')

# In[ ]:

import shap
# use Kernel SHAP to explain test set predictions
explainer = shap.KernelExplainer(model.predict_proba, X_train, link="logit")
shap.initjs()


# # Section 3: Evaluating Model Performance.
#
# **TASK: Plot out the validation loss versus the training loss.**
Example no. 10
    def interpret(self, raw_input):
        """
        Runs the interpretation command for the machine learning model. Handles both the
        "default" out-of-the-box interpretation for a certain set of UI component types
        and the custom interpretation case.
        :param raw_input: a list of raw inputs to apply the interpretation(s) on.
        """
        if isinstance(self.interpretation, list):  # Either "default" or "shap"
            processed_input = [
                input_component.preprocess(raw_input[i])
                for i, input_component in enumerate(self.input_components)
            ]
            original_output = self.run_prediction(processed_input)
            scores, alternative_outputs = [], []
            for i, (x, interp) in enumerate(zip(raw_input,
                                                self.interpretation)):
                if interp == "default":
                    input_component = self.input_components[i]
                    neighbor_raw_input = list(raw_input)
                    if input_component.interpret_by_tokens:
                        tokens, neighbor_values, masks = input_component.tokenize(
                            x)
                        interface_scores = []
                        alternative_output = []
                        for neighbor_input in neighbor_values:
                            neighbor_raw_input[i] = neighbor_input
                            processed_neighbor_input = [
                                input_component.preprocess(
                                    neighbor_raw_input[i])
                                for i, input_component in enumerate(
                                    self.input_components)
                            ]
                            neighbor_output = self.run_prediction(
                                processed_neighbor_input)
                            processed_neighbor_output = [
                                output_component.postprocess(
                                    neighbor_output[i]) for i, output_component
                                in enumerate(self.output_components)
                            ]

                            alternative_output.append(
                                processed_neighbor_output)
                            interface_scores.append(
                                quantify_difference_in_label(
                                    self, original_output, neighbor_output))
                        alternative_outputs.append(alternative_output)
                        scores.append(
                            input_component.get_interpretation_scores(
                                raw_input[i],
                                neighbor_values,
                                interface_scores,
                                masks=masks,
                                tokens=tokens))
                    else:
                        neighbor_values, interpret_kwargs = input_component.get_interpretation_neighbors(
                            x)
                        interface_scores = []
                        alternative_output = []
                        for neighbor_input in neighbor_values:
                            neighbor_raw_input[i] = neighbor_input
                            processed_neighbor_input = [
                                input_component.preprocess(
                                    neighbor_raw_input[i])
                                for i, input_component in enumerate(
                                    self.input_components)
                            ]
                            neighbor_output = self.run_prediction(
                                processed_neighbor_input)
                            processed_neighbor_output = [
                                output_component.postprocess(
                                    neighbor_output[i]) for i, output_component
                                in enumerate(self.output_components)
                            ]

                            alternative_output.append(
                                processed_neighbor_output)
                            interface_scores.append(
                                quantify_difference_in_label(
                                    self, original_output, neighbor_output))
                        alternative_outputs.append(alternative_output)
                        interface_scores = [
                            -score for score in interface_scores
                        ]
                        scores.append(
                            input_component.get_interpretation_scores(
                                raw_input[i], neighbor_values,
                                interface_scores, **interpret_kwargs))
                elif interp == "shap" or interp == "shapley":
                    try:
                        import shap
                    except (ImportError, ModuleNotFoundError):
                        raise ValueError(
                            "The package `shap` is required for this interpretation method. Try: `pip install shap`"
                        )
                    input_component = self.input_components[i]
                    if not (input_component.interpret_by_tokens):
                        raise ValueError(
                            "Input component {} does not support `shap` interpretation"
                            .format(input_component))

                    tokens, _, masks = input_component.tokenize(x)

                    # construct a masked version of the input
                    def get_masked_prediction(binary_mask):
                        masked_xs = input_component.get_masked_inputs(
                            tokens, binary_mask)
                        preds = []
                        for masked_x in masked_xs:
                            processed_masked_input = copy.deepcopy(
                                processed_input)
                            processed_masked_input[
                                i] = input_component.preprocess(masked_x)
                            new_output = self.run_prediction(
                                processed_masked_input)
                            pred = get_regression_or_classification_value(
                                self, original_output, new_output)
                            preds.append(pred)
                        return np.array(preds)
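                    # KernelExplainer treats each token as a feature: it will call
                    # get_masked_prediction with binary masks over the tokens, where
                    # all-ones is the original input and all-zeros the fully masked baseline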

                    num_total_segments = len(tokens)
                    explainer = shap.KernelExplainer(
                        get_masked_prediction, np.zeros(
                            (1, num_total_segments)))
                    shap_values = explainer.shap_values(
                        np.ones((1, num_total_segments)),
                        nsamples=int(self.num_shap * num_total_segments),
                        silent=True)
                    scores.append(
                        input_component.get_interpretation_scores(
                            raw_input[i],
                            None,
                            shap_values[0],
                            masks=masks,
                            tokens=tokens))
                    alternative_outputs.append([])
                elif interp is None:
                    scores.append(None)
                    alternative_outputs.append([])
                else:
                    raise ValueError(
                        "Unknown interpretation method: {}".format(interp))
            return scores, alternative_outputs
        else:  # custom interpretation function
            processed_input = [
                input_component.preprocess(raw_input[i])
                for i, input_component in enumerate(self.input_components)
            ]
            interpreter = self.interpretation

            if self.capture_session and self.session is not None:
                graph, sess = self.session
                with graph.as_default(), sess.as_default():
                    interpretation = interpreter(*processed_input)
            else:
                try:
                    interpretation = interpreter(*processed_input)
                except ValueError as exception:
                    if str(exception).endswith(
                            "is not an element of this graph."):
                        raise ValueError(strings.en["TF1_ERROR"])
                    else:
                        raise exception
            if len(raw_input) == 1:
                interpretation = [interpretation]
            return interpretation, []
Example no. 11
def calculate_shap_values(data,max_lag_steps,input_columns,output_columns,
                  include_output_column  = default_include_output_column,
                  include_t0             = default_include_t0,
                  model_complexity       = default_model_complexity,
                  batch_size             = default_batch_size,
                  num_epochs             = default_num_epochs,
                  learning_rate          = default_learning_rate,
                  num_dense_layers       = default_dense_layers,
                  dense_layer_activation = default_dense_activation,
                  output_layer_activation= default_output_activation,
                  input_scaling          = default_input_scaling,
                  num_shap_samples       = default_shap_samples,
                  nan_percent_cutoff     = default_nan_percent_cutoff,
                  verbose                = False):

    assert(not (include_output_column and include_t0))
    assert(len(output_columns)== 1)
    assert(max_lag_steps>=1)

    time_series_length               = max_lag_steps if include_t0 else max_lag_steps + 1


    samples                          = reshape_and_pad(data, time_series_length,
                                                       input_columns+output_columns,
                                                       verbose=verbose)

    train_X, train_Y, test_X, test_Y = filter_samples(samples,
                                                include_output_column  = include_output_column,
                                                include_t0             = include_t0,
                                                input_scaling          = input_scaling,
                                                nan_percent_cutoff     = nan_percent_cutoff,
                                                verbose                = verbose )


    model, history                   = build_and_run_lstm(train_X, train_Y, test_X, test_Y,
                                                model_complexity        = model_complexity,
                                                batch_size              = batch_size,
                                                num_epochs              = num_epochs,
                                                learning_rate           = learning_rate,
                                                num_dense_layers        = num_dense_layers,
                                                dense_layer_activation  = dense_layer_activation,
                                                output_layer_activation = output_layer_activation,
                                                verbose                 = verbose)

    if verbose:
        print("Validation losses:", history.history['val_loss'][-1],
              " / Training losses:", history.history['loss'][-1],
              " Test Y std_dev:", np.std(test_Y) )


    input_reshape = lambda x : np.reshape(x,(1, time_series_length - 1, len(input_columns)))
    f             = lambda X : np.array([model.predict(input_reshape(x)) for x in X])

    flattened_X   = np.reshape(train_X[:num_shap_samples], (-1,train_X.shape[1]*train_X.shape[2]))

    kernel_explainer   = shap.KernelExplainer(f, flattened_X)
    lookup_elem        = lambda elem_c : data[data.elementcode == elem_c].element.tolist()[0]
    lookup_item        = lambda item_c : data[data.itemcode == item_c].item.tolist()[0]

    feature_names = [lookup_elem(element_c)+'/'+ lookup_item(item_c)+" : t-"+            \
                     str(i-1 if include_t0 else i) for i in range(max_lag_steps-1,0,-1)  \
                                                   for element_c, item_c in input_columns ]



    return lambda : shap.summary_plot(kernel_explainer.shap_values(flattened_X),
                                        flattened_X,feature_names)
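
# Hypothetical usage sketch (the dataframe and column lists below are assumptions,
# not values from this snippet):
#
#   plot_fn = calculate_shap_values(df, max_lag_steps=4,
#                                   input_columns=input_cols, output_columns=[target_col])
#   plot_fn()  # draws the SHAP summary plot over the flattened lag features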
Example no. 12
classificator.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
# Train model
history = classificator.fit(train_data,
                            train_label,
                            batch_size=batch_size_classification,
                            epochs=num_epochs_classification,
                            verbose=0)
loss, accuracy = classificator.evaluate(test_data,
                                        test_label,
                                        verbose=1)
# SHAP
elements = np.random.choice(len(train_data),
                            int(0.3 * len(train_data)), False)
explainer = shap.KernelExplainer(classificator.predict,
                                 train_data[elements])
# Apply aggregation function to test if necessary
if num_epochs_detection == 0:
    matrix_metadata = metadata_to_matrix(TMP_TEST, "json")
    names = matrix_metadata[:, -1]
    test_data = np.zeros((len(names), num_archi_features))
    test_label = np.zeros(len(names))
    if data == "MonumenAI":
        for i in range(len(names)):
            im_name = names[i][2:-4]
            idx = test_loader.images_loc['path'].str.contains(im_name)
            test_data[idx] = matrix_metadata[i, :num_archi_features]
            test_label[idx] = matrix_metadata[i, num_archi_features]
    if data == "PascalPart":
        for i in range(len(names)):
            im_name = os.path.join(
"""
Notebook code for shapley values.
"""
import sklearn
import shap
from sklearn.model_selection import train_test_split

# print the JS visualization code to the notebook
shap.initjs()

# train an SVM classifier
X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(),
                                                    test_size=0.2,
                                                    random_state=0)
svm = sklearn.svm.SVC(kernel='rbf', probability=True)
svm.fit(X_train, Y_train)

# use Kernel SHAP to explain test set predictions
explainer = shap.KernelExplainer(svm.predict_proba, X_train, link="logit")
shap_values = explainer.shap_values(X_test, nsamples=100)

# plot the SHAP values for the Setosa output of the first instance
shap.force_plot(explainer.expected_value[0],
                shap_values[0][0, :],
                X_test.iloc[0, :],
                link="logit")
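
# Note: link="logit" makes the explainer additive in log-odds space; passing
# link="logit" to force_plot as well maps those log-odds contributions back through
# the logistic function so the plot reads in probability space.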
Example no. 14

                    st.text('ICE Plot')
                    features = [0, 1]
                    fig, ax = plt.subplots(figsize=(7, 6))
                    plot_partial_dependence(gscv, X_valid, features, kind='both', target=0, ax=ax)
                    plt.tight_layout()
                    st.pyplot(fig)

                    st.text('Prediction on Test file')
                    df_test['Predicted'] = gscv.predict(df_test)
                    st.write(df_test)

                    st.text('Shapley Explainer')
#                     X_test = df_test.drop('Predicted', axis = 1)
                    explainer = shap.KernelExplainer(gscv.predict_proba, X_valid)
                    shap_values = explainer.shap_values(X_valid.iloc[2, :])
                    st.pyplot(shap.force_plot(explainer.expected_value[0], shap_values[0],
                                              X_valid.iloc[2, :], matplotlib=True,
                                              text_rotation=8))


                    st.text('Shapley Explainer WaterFall Plot')
                    f = lambda x: gscv.predict_proba(x)[:, 1]
                    med = X_train.median().values.reshape((1, X_train.shape[1]))

                    explainer = shap.Explainer(f, med)
                    shap_values = explainer(X_train.iloc[0:100,:])
                    st.pyplot(shap.plots.waterfall(shap_values[2], max_display=7))

                    st.text('Partial Dependence Plot from pdp_box')
                    pdp_ = pdp.pdp_isolate(model=gscv, dataset=X_valid,
                                                model_features=X_valid.columns,
Example no. 15
def test_single_tree_nonlinear_transformations():
    """ Make sure Independent Tree SHAP single trees with non-linear
    transformations.
    """

    # Supported non-linear transforms
    def sigmoid(x):
        return (1 / (1 + np.exp(-x)))

    def log_loss(yt, yp):
        return (-(yt * np.log(yp) + (1 - yt) * np.log(1 - yp)))

    def mse(yt, yp):
        return (np.square(yt - yp))

    try:
        import xgboost
    except ImportError:
        print("Skipping test_single_tree_nonlinear_transformations!")
        return

    np.random.seed(10)

    n = 1000
    X = np.random.normal(size=(n, 7))
    b = np.array([-2, 1, 3, 5, 2, 20, -5])
    y = np.matmul(X, b)
    y = y + abs(min(y))
    y = np.random.binomial(n=1, p=y / max(y))
    max_depth = 6

    # train a model with single tree
    Xd = xgboost.DMatrix(X, label=y)
    model = xgboost.train(
        {
            'eta': 1,
            'max_depth': max_depth,
            'base_score': y.mean(),
            "lambda": 0,
            "objective": "binary:logistic"
        }, Xd, 1)
    pred = model.predict(Xd, output_margin=True)  # In margin space (log odds)
    trans_pred = model.predict(Xd)  # In probability space

    expl = shap.TreeExplainer(model, X, feature_perturbation="interventional")
    f = lambda inp: model.predict(xgboost.DMatrix(inp), output_margin=True)
    expl_kern = shap.KernelExplainer(f, X)

    x_ind = 0
    x = X[x_ind:x_ind + 1, :]
    itshap = expl.shap_values(x)
    kshap = expl_kern.shap_values(x, nsamples=300)
    assert np.allclose(itshap.sum() + expl.expected_value, pred[x_ind]), \
    "SHAP values don't sum to model output on explaining margin!"
    assert np.allclose(itshap, kshap), \
    "Independent Tree SHAP doesn't match Kernel SHAP on explaining margin!"

    model.set_attr(objective="binary:logistic")
    expl = shap.TreeExplainer(model,
                              X,
                              feature_perturbation="interventional",
                              model_output="probability")
    itshap = expl.shap_values(x)
    assert np.allclose(itshap.sum() + expl.expected_value, trans_pred[x_ind]), \
    "SHAP values don't sum to model output on explaining logistic!"
Example no. 16
            model.fit(dataframe_train, dataframe_label.astype('float32'))
            dataframe_train = dataframe_train.to_pandas()
            dataframe_label = dataframe_label.to_pandas()
        else:
            model.fit(dataframe_train.values, dataframe_label.values.ravel())

        # -------------------------------------------------------------
        # Check if cv score should be calculated for the AutoML workflow
        #
        if alg.automl:
            if alg.type == 'classification':
                scores = cross_val_score(model, dataframe_train.values, dataframe_label.values.ravel(),
                                         cv=int(variables.get("N_SPLITS")), scoring=alg.scoring)
                loss = 1 - np.mean(scores)
                if (not alg.name.startswith("TPOT") and not alg.name.startswith("AutoSklearn")):
                    model_explainer = shap.KernelExplainer(model.predict_proba, dataframe_train)  # feature importance
            if alg.type == 'anomaly':
                scores = cross_val_score(model, dataframe_train.values, dataframe_label.values.ravel(),
                                         cv=int(variables.get("N_SPLITS")), scoring=alg.scoring)
                loss = 1 - np.mean(scores)
                model_explainer = shap.KernelExplainer(model.predict, dataframe_train)  # feature importance
            if alg.type == 'regression':
                scores = cross_val_score(model, dataframe_train.values, dataframe_label.values.ravel(),
                                         cv=int(variables.get("N_SPLITS")), scoring=alg.scoring)
                loss = np.abs(np.mean(scores))
                if alg.name == 'BayesianRidgeRegression' or alg.name == 'LinearRegression':
                    model_explainer = shap.LinearExplainer(model, dataframe_train)
                else:
                    if (not alg.name.startswith("TPOT") and not alg.name.startswith("AutoSklearn")):
                        model_explainer = shap.KernelExplainer(model.predict, dataframe_train)
        # -------------------------------------------------------------
Example no. 17
def log_explanation(predict_function, features, artifact_path=None):
    r"""
    Given a ``predict_function`` capable of computing ML model output on the provided ``features``,
    computes and logs explanations of an ML model's output. Explanations are logged as a directory
    of artifacts containing the following items generated by `SHAP`_ (SHapley Additive
    exPlanations).

        - Base values
        - SHAP values (computed using `shap.KernelExplainer`_)
        - Summary bar plot (shows the average impact of each feature on model output)

    :param predict_function:
        A function to compute the output of a model (e.g. ``predict_proba`` method of
        scikit-learn classifiers). Must have the following signature:

        .. code-block:: python

            def predict_function(X) -> pred:
                ...

        - ``X``: An array-like object whose shape should be (# samples, # features).
        - ``pred``: An array-like object whose shape should be (# samples) for
          a regressor or (# classes, # samples) for a classifier. For a classifier,
          the values in ``pred`` should correspond to the predicted probability of each class.

        Acceptable array-like object types:

            - ``numpy.array``
            - ``pandas.DataFrame``
            - ``shap.common.DenseData``
            - ``scipy.sparse matrix``

    :param features:
        A matrix of features to compute SHAP values with. The provided features should
        have shape (# samples, # features), and can be either of the array-like object
        types listed above.

        .. note::
            Background data for `shap.KernelExplainer`_ is generated by subsampling ``features``
            with `shap.kmeans`_. The background data size is limited to 100 rows for performance
            reasons.

    :param artifact_path:
        The run-relative artifact path to which the explanation is saved.
        If unspecified, defaults to "model_explanations_shap".

    :return: Artifact URI of the logged explanations.

    .. _SHAP: https://github.com/slundberg/shap

    .. _shap.KernelExplainer: https://shap.readthedocs.io/en/latest/generated
        /shap.KernelExplainer.html#shap.KernelExplainer

    .. _shap.kmeans: https://github.com/slundberg/shap/blob/v0.36.0/shap/utils/_legacy.py#L9

    .. code-block:: python
        :caption: Example

        import os

        import numpy as np
        import pandas as pd
        from sklearn.datasets import load_boston
        from sklearn.linear_model import LinearRegression

        import mlflow

        # prepare training data
        dataset = load_boston()
        X = pd.DataFrame(dataset.data[:50, :8], columns=dataset.feature_names[:8])
        y = dataset.target[:50]

        # train a model
        model = LinearRegression()
        model.fit(X, y)

        # log an explanation
        with mlflow.start_run() as run:
            mlflow.shap.log_explanation(model.predict, X)

        # list artifacts
        client = mlflow.tracking.MlflowClient()
        artifact_path = "model_explanations_shap"
        artifacts = [x.path for x in client.list_artifacts(run.info.run_id, artifact_path)]
        print("# artifacts:")
        print(artifacts)

        # load back the logged explanation
        dst_path = client.download_artifacts(run.info.run_id, artifact_path)
        base_values = np.load(os.path.join(dst_path, "base_values.npy"))
        shap_values = np.load(os.path.join(dst_path, "shap_values.npy"))

        print("\n# base_values:")
        print(base_values)
        print("\n# shap_values:")
        print(shap_values[:3])

    .. code-block:: text
        :caption: Output

        # artifacts:
        ['model_explanations_shap/base_values.npy',
         'model_explanations_shap/shap_values.npy',
         'model_explanations_shap/summary_bar_plot.png']

        # base_values:
        20.502000000000002

        # shap_values:
        [[ 2.09975523  0.4746513   7.63759026  0.        ]
         [ 2.00883109 -0.18816665 -0.14419184  0.        ]
         [ 2.00891772 -0.18816665 -0.14419184  0.        ]]

    .. figure:: ../_static/images/shap-ui-screenshot.png

        Logged artifacts
    """
    import matplotlib.pyplot as plt
    import shap

    artifact_path = _DEFAULT_ARTIFACT_PATH if artifact_path is None else artifact_path
    background_data = shap.kmeans(
        features, min(_MAXIMUM_BACKGROUND_DATA_SIZE, len(features)))
    explainer = shap.KernelExplainer(predict_function, background_data)
    shap_values = explainer.shap_values(features)

    _log_numpy(explainer.expected_value, _BASE_VALUES_FILE_NAME, artifact_path)
    _log_numpy(shap_values, _SHAP_VALUES_FILE_NAME, artifact_path)

    shap.summary_plot(shap_values, features, plot_type="bar", show=False)
    fig = plt.gcf()
    fig.tight_layout()
    _log_matplotlib_figure(fig, _SUMMARY_BAR_PLOT_FILE_NAME, artifact_path)
    plt.close(fig)

    return append_to_uri_path(mlflow.active_run().info.artifact_uri,
                              artifact_path)
Example no. 18
def experiment_main():
    """
    Run through experiments for SHAP on CC using both one and two unrelated features.
    * This may take some time given that we iterate through every point in the test set
    * We print out the rate at which features occur in the top three features
    """

    # Setup SHAP

    # Choose the optimal number of clusters
    candidates = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 32, 64, 100]
    s_score(xtrain, candidates)

    n_clusters = int(input("Please enter the optimal number of clusters: "))
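
    # assumption: s_score presumably reports a silhouette-style score for each candidate
    # cluster count, so the prompt above lets the user pick the best-scoring k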

    ##############################################
    # One unrelated (innocuous_model_psi is used)
    ##############################################
    background_distribution = shap.kmeans(xtrain, n_clusters)
    generator_specs = {"original_dim": original_dim, "intermediate_dim": 8, "latent_dim": latent_dim, "epochs": 100, "dropout": 0.2,\
         "experiment": "CC", "feature_names": features}

    # Adversarial models
    adv_models = dict()
    adv_models["Perturbation"] = Adversarial_Kernel_SHAP_Model(
        racist_model_f(), innocuous_model_psi()).train(xtrain,
                                                       ytrain,
                                                       feature_names=features)
    adv_models["DropoutVAE"] = Adversarial_Kernel_SHAP_Model(racist_model_f(), innocuous_model_psi(), generator = "DropoutVAE", generator_specs = generator_specs).\
                   train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs, integer_idcs=integer_attributes, n_samples=10*xtrain.shape[0])
    adv_models["RBF"] = Adversarial_Kernel_SHAP_Model(racist_model_f(), innocuous_model_psi(), generator = "RBF", generator_specs = generator_specs).\
                   train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs, integer_idcs=integer_attributes)
    adv_models["Forest"] = Adversarial_Kernel_SHAP_Model(racist_model_f(), innocuous_model_psi(), generator = "Forest", generator_specs = generator_specs).\
                   train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs, integer_idcs=integer_attributes)

    for adversarial in ["Perturbation", "DropoutVAE", "RBF", "Forest"]:
        adv_shap = adv_models[adversarial]

        # Explainers
        adv_kernel_explainers = dict()
        adv_kernel_explainers["Perturbation"] = shap.KernelExplainer(
            adv_shap.predict, background_distribution)
        adv_kernel_explainers["DropoutVAE"] = shap.KernelExplainer(adv_shap.predict, xtrain, generator="DropoutVAE", generator_specs=generator_specs,\
              dummy_idcs=dummy_idcs, integer_idcs=integer_attributes, instance_multiplier=100)
        adv_kernel_explainers["RBF"] = shap.KernelExplainer(adv_shap.predict, xtrain, generator="RBF", generator_specs=generator_specs,\
              dummy_idcs=dummy_idcs)
        adv_kernel_explainers["Forest"] = shap.KernelExplainer(adv_shap.predict, xtrain, generator="Forest", generator_specs=generator_specs,\
              dummy_idcs=dummy_idcs)
        adv_kernel_explainers["ForestFill"] = shap.KernelExplainer(adv_shap.predict, xtrain, generator="Forest", generator_specs=generator_specs,\
              dummy_idcs=dummy_idcs)

        for explainer in [
                "Perturbation", "DropoutVAE", "RBF", "Forest", "ForestFill"
        ]:
            adv_kernel_explainer = adv_kernel_explainers[explainer]

            if explainer == "ForestFill":
                explanations = adv_kernel_explainer.shap_values(
                    xtest,
                    fill_data=True,
                    data_location="../Data/cc_forest_shap.csv")

            else:
                explanations = adv_kernel_explainer.shap_values(xtest)

            # format for display
            formatted_explanations = []
            for exp in explanations:
                if explainer == "Perturbation":
                    formatted_explanations.append([(features[i], exp[i])
                                                   for i in range(len(exp))])
                else:
                    formatted_explanations.append([(original_names[i], exp[i])
                                                   for i in range(len(exp))])

            print(
                f"SHAP Ranks and Pct Occurances one unrelated feature, adversarial: {adversarial}, explainer: {explainer}:"
            )
            if explainer == "Perturbation":
                summary = experiment_summary(formatted_explanations, features)
            else:
                summary = experiment_summary(formatted_explanations,
                                             original_names)
            print(summary)
            print("Fidelity:", round(adv_shap.fidelity(xtest), 2))

            file_name = f"../Results/CCShap/ccShapSummary_adversarial_{adversarial}_explainer_{explainer}.csv"
            with open(file_name, "w") as output:
                w = csv.writer(output)
                for key, val in summary.items():
                    w.writerow([key] + [pair for pair in val])

    ##################################################
    # Two unrelated (innocuous_model_psi_two is used)
    ##################################################
    background_distribution = shap.kmeans(xtrain, n_clusters)
    generator_specs = {"original_dim": original_dim, "intermediate_dim": 8, "latent_dim": latent_dim, "epochs": 100, "dropout": 0.2,\
         "experiment": "CC", "feature_names": features}

    # Adversarial models
    adv_models = dict()
    adv_models["Perturbation"] = Adversarial_Kernel_SHAP_Model(
        racist_model_f(),
        innocuous_model_psi_two()).train(xtrain,
                                         ytrain,
                                         feature_names=features)
    adv_models["DropoutVAE"] = Adversarial_Kernel_SHAP_Model(racist_model_f(), innocuous_model_psi_two(), generator = "DropoutVAE", generator_specs = generator_specs).\
                   train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs, integer_idcs=integer_attributes, n_samples=10*xtrain.shape[0])
    adv_models["RBF"] = Adversarial_Kernel_SHAP_Model(racist_model_f(), innocuous_model_psi_two(), generator = "RBF", generator_specs = generator_specs).\
                   train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs, integer_idcs=integer_attributes)
    adv_models["Forest"] = Adversarial_Kernel_SHAP_Model(racist_model_f(), innocuous_model_psi_two(), generator = "Forest", generator_specs = generator_specs).\
                   train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs, integer_idcs=integer_attributes)

    for adversarial in ["Perturbation", "DropoutVAE", "RBF", "Forest"]:
        adv_shap = adv_models[adversarial]

        # Explainers
        adv_kernel_explainers = dict()
        adv_kernel_explainers["Perturbation"] = shap.KernelExplainer(
            adv_shap.predict, background_distribution)
        adv_kernel_explainers["DropoutVAE"] = shap.KernelExplainer(adv_shap.predict, xtrain, generator="DropoutVAE", generator_specs=generator_specs,\
              dummy_idcs=dummy_idcs, integer_idcs=integer_attributes, instance_multiplier=100)
        adv_kernel_explainers["RBF"] = shap.KernelExplainer(adv_shap.predict, xtrain, generator="RBF", generator_specs=generator_specs,\
              dummy_idcs=dummy_idcs)
        adv_kernel_explainers["Forest"] = shap.KernelExplainer(adv_shap.predict, xtrain, generator="Forest", generator_specs=generator_specs,\
              dummy_idcs=dummy_idcs)
        adv_kernel_explainers["ForestFill"] = shap.KernelExplainer(adv_shap.predict, xtrain, generator="Forest", generator_specs=generator_specs,\
              dummy_idcs=dummy_idcs)
        for explainer in [
                "Perturbation", "DropoutVAE", "RBF", "Forest", "ForestFill"
        ]:
            adv_kernel_explainer = adv_kernel_explainers[explainer]

            if explainer == "ForestFill":
                explanations = adv_kernel_explainer.shap_values(
                    xtest,
                    fill_data=True,
                    data_location="../Data/cc_forest_shap.csv")

            else:
                explanations = adv_kernel_explainer.shap_values(xtest)

            # format for display
            formatted_explanations = []
            for exp in explanations:
                if explainer == "Perturbation":
                    formatted_explanations.append([(features[i], exp[i])
                                                   for i in range(len(exp))])
                else:
                    formatted_explanations.append([(original_names[i], exp[i])
                                                   for i in range(len(exp))])

            print(
                f"SHAP Ranks and Pct Occurances two unrelated features, adversarial: {adversarial}, explainer: {explainer}:"
            )
            if explainer == "Perturbation":
                summary = experiment_summary(formatted_explanations, features)
            else:
                summary = experiment_summary(formatted_explanations,
                                             original_names)
            print(summary)
            print("Fidelity:", round(adv_shap.fidelity(xtest), 2))

            file_name = f"../Results/CCShap/ccShapSummary2_adversarial_{adversarial}_explainer_{explainer}.csv"
            with open(file_name, "w") as output:
                w = csv.writer(output)
                for key, val in summary.items():
                    w.writerow([key] + [pair for pair in val])
    print('---------------------')
Example no. 19
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
import shap
import time

X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(),
                                                    test_size=0.2,
                                                    random_state=0)

# rather than use the whole training set to estimate expected values, we could summarize with
# a set of weighted kmeans, each weighted by the number of points they represent. But this dataset
# is so small we don't worry about it
#X_train_summary = shap.kmeans(X_train, 50)


def print_accuracy(f):
    print("Accuracy = {0}%".format(100 * np.sum(f(X_test) == Y_test) /
                                   len(Y_test)))
    time.sleep(0.5)  # to let the print get out before any progress bars


shap.initjs()
knn = sklearn.neighbors.KNeighborsClassifier()
knn.fit(X_train, Y_train)

print_accuracy(knn.predict)
explainer = shap.KernelExplainer(knn.predict_proba, X_train)
shap_values = explainer.shap_values(X_test.iloc[0, :])
shap.force_plot(explainer.expected_value[0], shap_values[0], X_test.iloc[0, :])
Example no. 20
def test_null_model():
    explainer = shap.KernelExplainer(lambda x: np.zeros(x.shape[0]),
                                     np.ones((2, 10)),
                                     nsamples=100)
    e = explainer.explain(np.ones((1, 10)))
    # as in test_null_model_small above: a null model should yield (near-)zero attributions
    assert np.sum(np.abs(e.effects)) < 1e-8
Example no. 21
            # Works for [svc]
            # If too many examples (pass aux to explainer).
            aux = shap.sample(X_train, 100)
            # Set generic kernel explainer
            explainer = shap.KernelExplainer(predict_proba, aux)
        """

        # Sample to speed up processing.
        sample = shap.sample(X_train, 100)

        if isinstance(clf, XGBClassifier):
            # Works for [llr, dtc, etc, xgb]
            explainer = shap.Explainer(clf, sample)
        else:
            # Works for all but [xgb]
            explainer = shap.KernelExplainer(predict_proba, sample)

        # Show explainer type
        print("Explainer type: %s" % type(explainer))

        # Get shap values
        #shap_values = explainer(X)
        shap_values = explainer.shap_values(X_train)

        # Show information
        print("base value: %s" % \
              explainer.expected_value)
        #print("shap_values: %s" % \
        #      str(shap_values.shape))

        # Summary plot
Example no. 22
# X_train = X_train.sample(X_train.shape[0]//160, random_state=2020)
# X_test = X_train.sample(X_test.shape[0]//150, random_state=2020)

X_train = np.array(X_train, dtype=float)
X_test = np.array(X_test, dtype=float)



with open('output/X_train.pkl', 'wb') as handle:
    pickle.dump(X_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('output/X_test.pkl', 'wb') as handle:
    pickle.dump(X_test, handle, protocol=pickle.HIGHEST_PROTOCOL)


explainer = shap.KernelExplainer(find_cluster_matrix, X_train)
shap_values = explainer.shap_values(X_test)
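
# NOTE: using the full X_train as the background means every KernelExplainer evaluation
# averages over all background rows; summarizing first with shap.kmeans(X_train, k) or
# shap.sample(X_train, n) (see Example no. 1) is the usual way to keep this tractable.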

with open('output/explainer.pkl', 'wb') as handle:
    pickle.dump(explainer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('output/shap_values.pkl', 'wb') as handle:
    pickle.dump(shap_values, handle, protocol=pickle.HIGHEST_PROTOCOL)

# shap_importance = pd.DataFrame(
#     {"feature_name": list(feature_cols), "shap_value": np.abs(shap_values).sum(axis=0)}).sort_values("shap_value",
#                                                                                                      ascending=False)
# shap_importance["shap_value"] = shap_importance["shap_value"] * 100 / max(shap_importance["shap_value"])
#
# shap_importance.to_csv("output/shap_variable_full.csv")
#
Example no. 23
def upload():
    print('form data:', request.form)
    dropdown_selection = str(request.form)
    dropdown_selection = dropdown_selection.split()

    print(dropdown_selection)
    model_type = dropdown_selection[3]
    dropdown_selection = dropdown_selection[1]

    print('model type:', model_type)

    print('dropdown selection:', dropdown_selection)

    global id_name

    target = 'images/'
    print('target folder:', target)

    if not os.path.isdir(target):
        os.mkdir(target)
    global ff
    ff = []
    for file in request.files.getlist("file"):
        print(file)
        filename = file.filename
        destination = "/".join([target, filename])
        print('des', destination)
        file.save(destination)
        ff.append(destination)

    mypath = os.getcwd()
    onlyfiles = [
        os.path.join(mypath, f) for f in os.listdir(mypath)
        if os.path.isfile(os.path.join(mypath, f))
    ]

    print('uploaded files:', ff)
    import warnings
    warnings.filterwarnings("ignore")

    with open(ff[0], 'rb') as file:
        model = pickle.load(file)

    with open(ff[1], 'rb') as file:
        X_data = pickle.load(file)

    with open(ff[2], 'rb') as file:
        y_data = pickle.load(file)

    if 'GL' in dropdown_selection:

        if 'RR' in model_type:

            PI = permutation_importance(model, X_data, y_data)

            row_to_show = 5

            data_for_prediction = X_data.iloc[row_to_show]

            explainer = shap.Explainer(model,
                                       X_data,
                                       feature_names=X_data.columns)
            shap_values = explainer.shap_values(X_data)

            shap.summary_plot(shap_values, X_data)

            import matplotlib.pyplot as pl
            pl.savefig('static/img/new_plot.png')
            pl.close()

            ICE = ind_cond_exp(model, X_data, y_data)

            # global surrogate model
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.tree import plot_tree

            predictions = model.predict(X_data)
            dt = DecisionTreeRegressor(random_state=100, max_depth=3)
            # We fit the shallow tree to the matrix X and the predictions of the random forest model
            dt.fit(X_data, predictions)
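            # quick fidelity check for the global surrogate (R^2 of the shallow tree
            # against the model's own predictions); a sketch, not in the original code:
            print('surrogate fidelity:', dt.score(X_data, predictions))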

            fig, ax = plt.subplots(figsize=(20, 10))

            plot_tree(dt,
                      feature_names=list(X_data.columns),
                      precision=3,
                      filled=True,
                      fontsize=12,
                      impurity=True)
            pl.savefig('static/img/new2_plot.png')
            pl.close()

            return render_template('model_explanation_result.html',
                                   PI=PI,
                                   ICE=ICE,
                                   SH="static/img/new_plot.png",
                                   SM="static/img/new2_plot.png")

        if 'RF' in model_type:
            PI = permutation_importance(model, X_data, y_data)

            explainer = shap.TreeExplainer(model,
                                           X_data,
                                           feature_names=X_data.columns)
            shap_values = explainer.shap_values(X_data)

            shap.summary_plot(shap_values, X_data)

            import matplotlib.pyplot as pl
            pl.savefig('static/img/new_plot.png')
            pl.close()

            ICE = ind_cond_exp(model, X_data, y_data)

            # global surrogate model
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.tree import plot_tree

            predictions = model.predict(X_data)
            dt = DecisionTreeRegressor(random_state=100, max_depth=3)
            # We fit the shallow tree to the matrix X and the predictions of the random forest model
            dt.fit(X_data, predictions)

            fig, ax = plt.subplots(figsize=(20, 10))

            plot_tree(dt,
                      feature_names=list(X_data.columns),
                      precision=3,
                      filled=True,
                      fontsize=12,
                      impurity=True)
            pl.savefig('static/img/new2_plot.png')
            pl.close()

            return render_template('model_explanation_result.html',
                                   PI=PI,
                                   ICE=ICE,
                                   SH="static/img/new_plot.png",
                                   SM="static/img/new2_plot.png")

        if 'CC' in model_type:
            PI = permutation_importance(model, X_data, y_data)

            explainer = shap.KernelExplainer(model.predict_proba, X_data)
            shap_values = explainer.shap_values(X_data)

            shap.summary_plot(shap_values, X_data)

            import matplotlib.pyplot as pl
            pl.savefig('static/img/new_plot.png')
            pl.close()

            #ICE = ind_cond_exp(model,X_data,y_data)

            # global surrogate model
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.tree import plot_tree

            predictions = model.predict(X_data)

            return render_template(
                'model_explanation_result_classification.html',
                PI=PI,
                SH="static/img/new_plot.png")

    if 'WI' in dropdown_selection:

        # print(res," resss")

        #
        import dash
        from dash.dependencies import Input, Output
        import dash_table
        import dash_core_components as dcc
        import dash_html_components as html

        app = dash.Dash(__name__)
        import pandas as pd
        #should be X data

        mean_list = []
        features = X_data.columns.tolist()
        for i in features:
            mean_list.append(round(X_data[i].mean()))

        explainer = shap.TreeExplainer(model)
        shap.initjs()

        params = features

        id_name_str = "my_graph" + str(id_name)
        print('---------------', id_name_str)
        id_name = id_name + 1

        what_plot.layout = html.Div([
            dash_table.DataTable(
                id='table-editing-simple',
                columns=([{
                    'id': 'Model',
                    'name': 'Model'
                }] + [{
                    'id': p,
                    'name': p
                } for p in params]),
                data=[
                    dict(zip(features, mean_list))
                    #dict(Model=i, **{param: mean_list[i] for param in params})
                    # for i in range(0, len(mean_list))
                ],
                editable=True),
            html.Div(id=id_name_str)
        ])

        @what_plot.callback(Output(id_name_str, "children"),
                            Input('table-editing-simple', 'data'),
                            Input('table-editing-simple', 'columns'))
        def update_graphs(rows, columns):
            df = pd.DataFrame(rows, columns=[c['name'] for c in columns])
            print(rows)

            # the editable table has a single row; unpack it
            rows = rows[0]
            col = []
            vvalue = []
            for key in rows:
                print(key, '->', int(rows[key]))
                col.append(key)
                vvalue.append([int(rows[key])])

            ik = dict(zip(col, vvalue))
            instance = pd.DataFrame.from_dict(ik)

            print('instance ', instance)

            from shap.plots._force_matplotlib import draw_additive_plot

            # explain the model's predictions using SHAP values (same syntax works for LightGBM, CatBoost, and scikit-learn models)
            #explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(instance)
            shap.initjs()

            #plt.style.use("_classic_test_patch")

            ytu = model.predict(instance)
            print('prediction ', ytu)

            koko = _force_plot_html2(explainer.expected_value, shap_values,
                                     instance)


            print('Done')

            return koko

        return render_template('local_explain_lime.html', LL=what_plot.index())

    if 'LL' in dropdown_selection:
        # table and plots ========================================================
        import dash
        from dash.dependencies import Input, Output
        import dash_table
        import dash_core_components as dcc
        import dash_html_components as html
        import pandas as pd

        id_name_str = "my_graph" + str(id_name)
        print('---------------', id_name_str)
        id_name = id_name + 1

        print('in LL')
        # make graph===============================================================
        table_plot.layout = html.Div([
            dash_table.DataTable(
                id='datatable-interactivity',
                columns=[{
                    "name": i,
                    "id": i,
                    "deletable": True,
                    "selectable": True
                } for i in X_data.columns],
                data=X_data.to_dict('records'),
                editable=True,
                filter_action="native",
                sort_action="native",
                sort_mode="multi",
                column_selectable="single",
                row_selectable="single",
                row_deletable=True,
                selected_columns=[],
                selected_rows=[],
                page_action="native",
                page_current=0,
                page_size=10,
            ),
            html.Div(id=id_name_str)
        ])

        print('mid LL')

        @table_plot.callback(Output(id_name_str, "children"),
                             Input('datatable-interactivity',
                                   "derived_virtual_data"),
                             Input('datatable-interactivity',
                                   "derived_virtual_selected_rows"))
        def update_graphs(rows, derived_virtual_selected_rows):
            # When the table is first rendered, `derived_virtual_data` and
            # `derived_virtual_selected_rows` will be `None`. This is due to an
            # idiosyncrasy in Dash (unsupplied properties are always None and Dash
            # calls the dependent callbacks when the component is first rendered).
            # So, if `rows` is `None`, then the component was just rendered
            # and its value will be the same as the component's dataframe.
            # Instead of setting `None` in here, you could also set
            # `derived_virtual_data=df.to_rows('dict')` when you initialize
            # the component.
            if derived_virtual_selected_rows is None:
                derived_virtual_selected_rows = []

            dff = X_data if rows is None else pd.DataFrame(rows)

            colors = [
                '#7FDBFF' if i in derived_virtual_selected_rows else '#0074D9'
                for i in range(len(dff))
            ]

            print('my value', derived_virtual_selected_rows)
            print('i am row ', X_data.iloc[derived_virtual_selected_rows])
            print(type(derived_virtual_selected_rows))

            from shap.plots._force_matplotlib import draw_additive_plot

            ttt = X_data.iloc[derived_virtual_selected_rows]
            # explain the model's predictions using SHAP values (same syntax works for LightGBM, CatBoost, and scikit-learn models)
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(ttt)
            shap.initjs()

            plt.style.use("_classic_test_patch")

            bubu = _force_plot_html(explainer.expected_value, shap_values, ttt)

            shap_values = explainer.shap_values(X_data)
            #shap.force_plot(explainer.expected_value, shap_values, X_data)
            explain_all = _force_plot_html(explainer.expected_value,
                                           shap_values, X_data)

            print('bubu ', bubu)

            return [bubu, explain_all]

        return render_template('local_explain_lime.html',
                               LL=table_plot.index())

    if 'BD' in dropdown_selection:
        pass

    #FI
    if 'DB' in dropdown_selection:

        #  if 'CC' in model_type:
        #   from explainerdashboard import ClassifierExplainer, ExplainerDashboard
        #  ExplainerDashboard(ClassifierExplainer(model, X_data, y_data)).run()

        if 'RF' in model_type:
            import threading
            import time

            def dashboard_exp(model, X_data, y_data):
                import dash_bootstrap_components as dbc

                from explainerdashboard import RegressionExplainer, ExplainerDashboard
                ExplainerDashboard(
                    RegressionExplainer(model, X_data, y_data),
                    bootstrap=dbc.themes.SANDSTONE,
                    importances=True,
                    model_summary=False,
                    contributions=True,
                    whatif=True,
                    shap_dependence=False,
                    shap_interaction=False,
                    decision_trees=False,
                    hide_whatifindexselector=True,
                    hide_whatifprediction=True,
                    hide_inputeditor=False,
                    hide_whatifcontributiongraph=False,
                    hide_whatifcontributiontable=True,
                    hide_whatifpdp=False,
                    hide_predindexselector=True,
                    hide_predictionsummary=True,
                    hide_contributiongraph=False,
                    hide_pdp=False,
                    hide_contributiontable=True,
                    hide_dropna=True,
                    hide_range=True,
                    hide_depth=True,
                    hide_sort=True,
                    hide_sample=True,  # hide sample size input on pdp component
                    hide_gridlines=True,  # hide gridlines on pdp component
                    hide_gridpoints=True,
                    hide_cats_sort=True,  # hide the sorting option for categorical features
                    hide_cutoff=True,  # hide cutoff selector on classification components
                    hide_percentage=True,  # hide percentage toggle on classification components
                    hide_log_x=True,  # hide x-axis log toggle on regression plots
                    hide_log_y=True,  # hide y-axis log toggle on regression plots
                    hide_ratio=True,  # hide the residuals type dropdown
                    hide_points=True,  # hide the violin scatter markers toggle
                    hide_winsor=True,  # hide the winsorize input
                    hide_wizard=True,  # hide the wizard toggle in lift curve component
                    hide_star_explanation=True,
                ).run()

            t1 = threading.Thread(target=dashboard_exp,
                                  args=(model, X_data, y_data))

            t1.start()

            return '''<H2>
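
The shallow DecisionTreeRegressor used above acts as a global surrogate: it is fit to the black-box model's predictions rather than to y_data, so it is only trustworthy if it tracks the model closely. A minimal sketch of that fidelity check, assuming the names dt, model and X_data from the snippet above:

# surrogate fidelity: R² of the shallow tree against the model's own predictions
from sklearn.metrics import r2_score

surrogate_preds = dt.predict(X_data)
model_preds = model.predict(X_data)
# values close to 1 mean the shallow tree is a faithful summary of the model
print('surrogate fidelity (R²):', r2_score(model_preds, surrogate_preds))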
Esempio n. 24
0
from sklearn.ensemble import RandomForestRegressor
# The target variable is 'quality'.
Y = df['quality']
X = df[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar','chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density','pH', 'sulphates', 'alcohol']]
# Split the data into train and test data:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
# Build the model with the random forest regression algorithm:
model = RandomForestRegressor(max_depth=6, random_state=0, n_estimators=10)
model.fit(X_train, Y_train)


rf = RandomForestRegressor(max_depth=6, random_state=0, n_estimators=10)
rf.fit(X_train, Y_train)
print(rf.feature_importances_)
importances = rf.feature_importances_
indices = np.argsort(importances)
features = X_train.columns
# plt.title('Feature Importances')
# plt.barh(range(len(indices)), importances[indices], color='b', align='center')
# plt.yticks(range(len(indices)), [features[i] for i in indices])
# plt.xlabel('Relative Importance')
# plt.show()

import shap
rf_explainer = shap.KernelExplainer(rf.predict, X_test)
rf_shap_values = rf_explainer.shap_values(X_test)

# shap.summary_plot(rf_shap_values, X_test)
shap.dependence_plot("alcohol", rf_shap_values, X_test)
# plot the SHAP values for the 10th observation
# shap.force_plot(rf_explainer.expected_value, rf_shap_values[10,:], X_test.iloc[10,:])
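
KernelExplainer treats the forest as a black box and is slow; for tree ensembles, shap.TreeExplainer computes exact SHAP values in a fraction of the time. A sketch using the same rf and X_test as above:

# exact, fast SHAP values for tree models (rf and X_test from the snippet above)
rf_tree_explainer = shap.TreeExplainer(rf)
rf_tree_shap_values = rf_tree_explainer.shap_values(X_test)
shap.dependence_plot("alcohol", rf_tree_shap_values, X_test)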
Esempio n. 25
0
import numpy as np

# the head of this helper was truncated; reconstructed from its tail and from
# how it is called below: it hides de-selected superpixels behind a background value
def mask_image(zs, segmentation, image, background=None):
    if background is None:
        background = image.mean((0, 1))
    out = np.zeros((zs.shape[0], image.shape[0], image.shape[1], image.shape[2]))
    for i in range(zs.shape[0]):
        out[i, :, :, :] = image
        for j in range(zs.shape[1]):
            if zs[i, j] == 0:
                out[i][segmentation == j, :] = background
    return out


def f(z):
    # z is a binary mask over the superpixels; hidden segments are filled with
    # the background value (250) before prediction
    return model.predict(mask_image(z, segments_slic, img_orig, 250))



# use Kernel SHAP to explain the network's predictions
explainer = shap.KernelExplainer(f, np.zeros((1, 50)))
shap_values = explainer.shap_values(np.ones((1, 50)),
                                    nsamples=1000)  # runs VGG16 1000 times


# class label order used by the generator
print(list(test_image_gen.class_indices.keys()))


# get the top predictions from the model
preds = model.predict(np.expand_dims(img_orig.copy(), axis=0))
top_preds = np.argsort(-preds)

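
The shap_values above are per-superpixel, not per-pixel, so visualizing them means painting each segment's value back onto the image grid. A hedged sketch (fill_segmentation is an assumed helper, not part of the snippet; shap_values, segments_slic and top_preds come from above):

import matplotlib.pyplot as plt

# paint each superpixel's SHAP value onto the pixels of that segment
def fill_segmentation(values, segmentation):
    out = np.zeros(segmentation.shape)
    for i in range(len(values)):
        out[segmentation == i] = values[i]
    return out

# heatmap for the top predicted class
heat = fill_segmentation(shap_values[top_preds[0, 0]][0], segments_slic)
plt.imshow(heat)
plt.show()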
Esempio n. 26
0
best_xgb_model = xgboost.XGBRegressor(colsample_bytree=0.4,
                                      gamma=0,
                                      learning_rate=0.07,
                                      max_depth=3,
                                      min_child_weight=1.5,
                                      n_estimators=10000,
                                      reg_alpha=0.75,
                                      reg_lambda=0.45,
                                      subsample=0.6,
                                      seed=42)

best_xgb_model.fit(trainX, trainY)

print("\n xgboost: \n")
print(
    'MAE: \n',
    mean_absolute_error(valY,
                        best_xgb_model.predict(valX),
                        multioutput='raw_values'))
# The mean squared error
print('MSE: %.5f' % mean_squared_error(valY, best_xgb_model.predict(valX)))
# The coefficient of determination: 1 is perfect prediction
print('R-Sqrd: %.2f' % r2_score(valY, best_xgb_model.predict(valX)))

import shap
shap.initjs()
explainer = shap.KernelExplainer(best_xgb_model.predict, trainX)
shap_values = explainer.shap_values(valX, nsamples=5)
shap.summary_plot(shap_values, valX, plot_type="bar")
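
nsamples=5 keeps the run fast but yields very coarse SHAP estimates, and using all of trainX as the background slows every evaluation. A sketch of a more stable setup, assuming the same best_xgb_model, trainX and valX:

# smaller background, more coalition samples per explained row
background = shap.sample(trainX, 100)
explainer = shap.KernelExplainer(best_xgb_model.predict, background)
shap_values = explainer.shap_values(valX, nsamples=200)
shap.summary_plot(shap_values, valX, plot_type="bar")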
Esempio n. 27
0
print(f"Relevant columns are: {', '.join(relevant_columns)}")
print('Classification summary:')
print(classification_report(y, est.predict(X)))

# The base classifier only uses age, avg_glucose_level and bmi.
# Look for explanations in which variables other than these three appear.
# Use X and y as the data (the column names are in X_features); est is the final classifier.

from sklearn.metrics import classification_report
import numpy as np
#import dalex as dx
import shap

i = 99
class_names = ['healthy', 'stroke']
#print(X[i].mean)
X = np.array(X)
X_features = np.array(X_features)
#SHAP EXPLAINER
#explainer = shap.LinearExplainer(est, X)
explainer = shap.KernelExplainer(est.predict, X)
shap_values = explainer.shap_values(X)
import pickle
with open('stroke-shap.pkl', 'wb') as fd:
    pickle.dump([explainer, shap_values], fd)

with open('stroke-shap.pkl', 'rb') as fd:
    [explainer, shap_values] = pickle.load(fd)
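
With the SHAP values computed (and pickled for reuse), checking whether variables beyond age, avg_glucose_level and bmi carry any explanation weight reduces to ranking features by mean absolute SHAP value. A minimal sketch, assuming shap_values and X_features from above:

# rank features by global SHAP importance
mean_abs_shap = np.abs(shap_values).mean(axis=0)
for name, value in sorted(zip(X_features, mean_abs_shap), key=lambda t: -t[1]):
    print(f'{name}: {value:.4f}')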
Esempio n. 28
0
def _compute_shap_values(pipeline, features, training_data=None):
    """Computes SHAP values for each feature.

    Arguments:
        pipeline (PipelineBase): Trained pipeline whose predictions we want to explain with SHAP.
        features (pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        training_data (pd.DataFrame): Training data the pipeline was fit on.
            For non-tree estimators, we need a sample of training data for the KernelSHAP algorithm.

    Returns:
        dict or list(dict): For regression problems, a dictionary mapping a feature name to a list of SHAP values.
            For classification problems, returns a list of dictionaries. One for each class.
    """
    estimator = pipeline.estimator
    if estimator.model_family == ModelFamily.BASELINE:
        raise ValueError(
            "You passed in a baseline pipeline. These are simple enough that SHAP values are not needed."
        )

    feature_names = features.columns

    # This is to make sure all dtypes are numeric - SHAP algorithms will complain otherwise.
    # Sklearn components do this under-the-hood so we're not changing the data the model was trained on.
    # Catboost can naturally handle string-encoded categorical features so we don't need to convert to numeric.
    if estimator.model_family != ModelFamily.CATBOOST:
        features = check_array(features.values)

    if estimator.model_family.is_tree_estimator():
        # Because of this issue: https://github.com/slundberg/shap/issues/1215
        if estimator.model_family == ModelFamily.XGBOOST:
            raise NotImplementedError(
                "SHAP values cannot currently be computed for xgboost models.")
        if estimator.model_family == ModelFamily.CATBOOST and pipeline.problem_type == ProblemTypes.MULTICLASS:
            # Will randomly segfault
            raise NotImplementedError(
                "SHAP values cannot currently be computed for catboost models for multiclass problems."
            )
        # Use tree_path_dependent to avoid linear runtime with dataset size
        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.TreeExplainer(
                estimator._component_obj,
                feature_perturbation="tree_path_dependent")
        if ws:
            logger.debug(
                f"_compute_shap_values TreeExplainer: {ws[0].message}")
        shap_values = explainer.shap_values(features, check_additivity=False)
        # shap only outputs values for positive class for Catboost binary estimators.
        # this modifies the output to match the output format of other binary estimators.
        # Ok to fill values of negative class with zeros since the negative class will get dropped
        # in the UI anyways.
        if estimator.model_family == ModelFamily.CATBOOST and pipeline.problem_type == ProblemTypes.BINARY:
            shap_values = [np.zeros(shap_values.shape), shap_values]
    else:
        if training_data is None:
            raise ValueError(
                "You must pass in a value for parameter 'training_data' when the pipeline "
                "does not have a tree-based estimator. "
                f"Current estimator model family is {estimator.model_family}.")

        # More than 100 datapoints can negatively impact runtime according to SHAP
        # https://github.com/slundberg/shap/blob/master/shap/explainers/kernel.py#L114
        sampled_training_data_features = pipeline.compute_estimator_features(
            shap.sample(training_data, 100)).to_dataframe()
        sampled_training_data_features = check_array(
            sampled_training_data_features)

        if pipeline.problem_type == ProblemTypes.REGRESSION:
            link_function = "identity"
            decision_function = estimator._component_obj.predict
        else:
            link_function = "logit"
            decision_function = estimator._component_obj.predict_proba
        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.KernelExplainer(decision_function,
                                             sampled_training_data_features,
                                             link_function)
            shap_values = explainer.shap_values(features)
        if ws:
            logger.debug(
                f"_compute_shap_values KernelExplainer: {ws[0].message}")

    # classification problem
    if isinstance(shap_values, list):
        mappings = []
        for class_shap_values in shap_values:
            mappings.append(
                _create_dictionary(class_shap_values, feature_names))
        return mappings
    # regression problem
    elif isinstance(shap_values, np.ndarray):
        return _create_dictionary(shap_values, feature_names)
    else:
        raise ValueError(
            f"Unknown shap_values datatype {str(type(shap_values))}!")
Esempio n. 29
0
import numpy as np
import pandas as pd
import shap

# assumed class-column layout: predict_proba returns [negative, positive]
NEGATIVE_CLASS_PROB_INDEX = 0
POSITIVE_CLASS_PROB_INDEX = 1

# the head of this snippet was truncated; a plausible reconstruction of the
# helper that wraps a single-output model into two-class probabilities
def predict_proba(X):
    predictions = model.predict(X)
    class_probabilities = pd.DataFrame(
        np.zeros(shape=(predictions.shape[0], 2)))
    class_probabilities.iloc[:, POSITIVE_CLASS_PROB_INDEX] = predictions
    class_probabilities.iloc[:, NEGATIVE_CLASS_PROB_INDEX] = pd.DataFrame(predictions).apply(lambda x: 1 - x)
    return class_probabilities

# use SHAP KernelExplainer to explain test set predictions
USE_MATPLOTLIB = True # False requires IPython and likely additional setup
TRAIN_SUMMARY_N_SAMPLES = 100
MULTISAMPLE_PLOTS_N_TEST_SAMPLES = 100

# provided for IPython / Jupyter consistency
if not USE_MATPLOTLIB:
    shap.initjs()

X_train_samples = shap.sample(X_train, nsamples=TRAIN_SUMMARY_N_SAMPLES) # summarize the X_train with a total of 100 samples
explainer = shap.KernelExplainer(predict_proba, X_train_samples, link="logit")

# find first true positive and true negative samples
Y_test_proba = predict_proba(X_test)
tp_index = -1
tn_index = -1
for i in range(len(Y_test_proba)):
    if Y_test_proba.iloc[i, POSITIVE_CLASS_PROB_INDEX] >= 0.5 and Y_test.iloc[i] == "yes":
        tp_index = i
    if Y_test_proba.iloc[i, POSITIVE_CLASS_PROB_INDEX] < 0.5 and Y_test.iloc[i] == "no":
        tn_index = i
    if tp_index != -1 and tn_index != -1:
        break

# force_plot for single TP sample, if it exists
if tp_index != -1:
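    # the snippet is truncated here; a hedged sketch of the missing body,
    # reusing the names defined above (nsamples=100 is an arbitrary choice)
    sv_tp = explainer.shap_values(X_test.iloc[tp_index, :], nsamples=100)
    shap.force_plot(explainer.expected_value[POSITIVE_CLASS_PROB_INDEX],
                    sv_tp[POSITIVE_CLASS_PROB_INDEX],
                    X_test.iloc[tp_index, :],
                    link="logit",
                    matplotlib=USE_MATPLOTLIB)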
Esempio n. 30
0
    def explain(self,
                test_df,
                row_index=None,
                row_num=None,
                class_id=None,
                background_size=50,
                nsamples=500):
        """
        Explain the prediction of an example using SHAP.
        Args:
          df(pd.DataFrame): a pd.DataFrame of test data is same format as original training data DataFrame
                            The DataFrame does NOT need to contain all the original label columns
                            (e.g., the Survived column in Kaggle's Titatnic dataset) but  MUST contain
                            all the original predictor columns (e.g., un-normalized numerical variables, categorical
                            variables as strings).
          row_index(int): index of row in DataFrame to explain (e.g., PassengerID in Titanic dataset).
                          mutually-exclusive with row_id
          row_num(int): raw row number in DataFrame to explain (i.e., 0=first row, 1=second rows, etc.)
                         mutually-exclusive with row_index
          class_id(int): Only required for classification
          background_size(int): size of background data (SHAP parameter)
          nsamples(int): number of samples (SHAP parameter)
        """
        try:
            import shap
        except ImportError:
            msg = 'TabularPredictor.explain requires shap library. Please install with: pip install shap. '+\
                    'Conda users should use this command instead: conda install -c conda-forge shap'
            warnings.warn(msg)
            return

        classification, multilabel = U.is_classifier(self.model)
        if classification and class_id is None:
            raise ValueError('For classification models, please supply the class_id of the class you would like to explain. ' + \
                             'It should be an index into the list returned by predictor.get_classes().')

        f = self._predict_shap

        # prune dataframe
        df_display = test_df.copy()
        df_display = df_display[self.preproc.pc]

        # add synthetic labels
        for lab in self.preproc.lc:
            df_display[lab] = np.zeros(df_display.shape[0], dtype=int)

        # convert DataFrame to TabularDataset with processed/normalized independent variables
        tabseq = self.preproc.preprocess_test(df_display, verbose=0)
        tabseq.batch_size = df_display.shape[0]
        df = pd.DataFrame(data=np.concatenate(tabseq[0][0], axis=1),
                          columns=tabseq.cat_columns + tabseq.cont_columns,
                          index=df_display.index)

        # add new auto-engineered feature columns
        for col in [self.preproc.na_names + self.preproc.date_names]:
            df_display[col] = df[col]

        # sort display df correctly
        df_display = df_display[tabseq.cat_columns + tabseq.cont_columns]

        # select row
        if row_num is not None and row_index is not None:
            raise ValueError(
                'row_num and row_index are mutually exclusive with each other.')

        if row_index is not None:
            df_row = df[df.index.isin([row_index])].iloc[0, :]
            df_display_row = df_display[df_display.index.isin([row_index
                                                               ])].iloc[0, :]
            r_key = 'row_index' if df.index.name is None else df.index.name
            r_val = row_index
        elif row_num is not None:
            df_row = df.iloc[row_num, :]
            df_display_row = df_display.iloc[row_num, :]
            r_key = 'row_num'
            r_val = row_num
        #print(df_row)
        #print(df_display_row)

        # shap
        explainer = shap.KernelExplainer(f, df.iloc[:background_size, :])
        shap_values = explainer.shap_values(df_row,
                                            nsamples=nsamples,
                                            l1_reg='aic')
        expected_value = explainer.expected_value

        if not np.issubdtype(type(explainer.expected_value), np.floating):
            expected_value = explainer.expected_value[
                0 if class_id is None else class_id]
        if isinstance(shap_values, list):
            shap_values = shap_values[0 if class_id is None else class_id]

        if classification:
            print('Explanation for class = %s (%s=%s): ' %
                  (self.get_classes()[class_id], r_key, r_val))
        plt.show(
            shap.force_plot(expected_value,
                            shap_values,
                            df_display_row,
                            matplotlib=True))
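
A hypothetical usage sketch of this method (predictor and test_df are assumed, not defined above):

# explain the first row's prediction for class index 1 of a trained ktrain predictor
predictor.explain(test_df, row_num=0, class_id=1,
                  background_size=50, nsamples=500)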