Code Example #1
def plot_shap_summary(
    shap_values,
    features,
    plot_type=None,
    figsize=None,
    color=None,
    max_display=None,
    feature_names=None,
    title=None,
    show=True,
    sort=True,
    color_bar=True,
    layered_violin_max_num_bins=None,
    class_names=None,
    class_inds=None,
    color_bar_label=None,
):
    """Function to plot shap summary plot.
    This helper function plots the SHAP summary plot for all types of SHAP
    explainers, including tree, linear, and dnn.
    Parameters
    ----------
    shap_values: Numpy array or Pandas DataFrame
        Calculated SHAP values. For single output explanations like
        binary classification, this is a matrix of SHAP values (n_samples, n_features).
        For multi-output explanations this is a list of such matrices of SHAP values.
    features: Numpy array or Pandas DataFrame
        The feature matrix that was used to calculate the SHAP values. For the case
        of a Numpy array it is recommended to pass the feature_names list as well
    plot_type: str, optional (single-output default="dot", multi-output default="bar")
        The type of summary plot. Options are "bar", "dot", "violin", and "compact_dot"
        which is recommended for SHAP interactions
    figsize: tuple, optional, (default="auto")
        Figure size
    color: str, optional, (default="#D0AAF3")
        Color of the horizontal lines when plot_type="bar"
    max_display: int, optional, (default=20)
        Limit to show the number of features in the plot
    feature_names: list, optional, (default=None)
        List of feature names to pass. It should follow the order
        of features
    title: str, optional, (default=None)
        Title of the plot
    show: bool, optional, (default=True)
        Flag to show the plot in an interactive environment
    sort: bool, optional, (default=True)
        Flag to plot sorted SHAP values in descending order
    color_bar: bool, optional, (default=True)
        Flag to show color_bar when plot_type is "dot" or "violin"
    layered_violin_max_num_bins: int, optional, (default=20)
        The number of bins for calculating the violin plots ranges
        and outliers
    class_names: list, optional, (default=None)
        List of class names for multi-output problems
    class_inds: list, optional, (default=None)
        List of class indices for multi-output problems
    color_bar_label: str, optional, (default="Feature Value")
        Label for color bar
    """

    # initializing figsize
    if figsize is None:
        figsize = "auto"
    elif isinstance(figsize, list) or isinstance(figsize, tuple):
        figsize = figsize
    else:
        raise TypeError("Only tuple and list types are allowed for figsize.")

    # initializing color
    if color is None:
        color = "#D0AAF3"
    elif isinstance(color, str):
        color = color
    else:
        raise TypeError("Only str type is allowed for color.")

    # initializing layered_violin_max_num_bins
    if layered_violin_max_num_bins is None:
        layered_violin_max_num_bins = 20
    elif isinstance(layered_violin_max_num_bins, int):
        layered_violin_max_num_bins = layered_violin_max_num_bins
    else:
        raise TypeError(
            "Only int type is allowed for layered_violin_max_num_bins.")

    # initializing color_bar_label
    if color_bar_label is None:
        color_bar_label = "Feature Value"
    elif isinstance(color_bar_label, str):
        color_bar_label = color_bar_label
    else:
        raise TypeError("Only str type is allowed for color_bar_label.")

    shap.summary_plot(
        shap_values,
        features,
        plot_type=plot_type,
        plot_size=figsize,
        color=color,
        max_display=max_display,
        feature_names=feature_names,
        title=title,
        show=show,
        sort=sort,
        color_bar=color_bar,
        layered_violin_max_num_bins=layered_violin_max_num_bins,
        class_names=class_names,
        class_inds=class_inds,
        color_bar_label=color_bar_label,
    )
    plt.show()
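
A minimal usage sketch for the helper above (not part of the original code). The dataset, model, and variable names below are illustrative assumptions; any fitted tree-based model and a matching feature frame would do.

import shap
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# Hypothetical single-output example: one (n_samples, n_features) SHAP matrix.
X, y = load_diabetes(return_X_y=True, as_frame=True)
model = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
shap_values = shap.TreeExplainer(model).shap_values(X)

plot_shap_summary(
    shap_values,
    X,
    plot_type="dot",
    max_display=10,
    title="SHAP Summary",
)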
Code Example #2
File: dashboardSubhi.py  Project: petrosFr/OCRprojet7
    if st.sidebar.checkbox("Afficher les informations du client?"):
        st.write("Statut famille :**",
                 selected_id["NAME_FAMILY_STATUS"].iloc[0], "**")
        st.write("Nombre d'enfant(s) :**", selected_id["CNT_CHILDREN"].iloc[0],
                 "**")
        st.write("Age client :**", int(selected_id["DAYS_BIRTH"].values / 365),
                 "**", "ans.")
        st.write("DAYS_LAST_PHONE_CHANGE",
                 selected_id['DAYS_LAST_PHONE_CHANGE'].iloc[0])
        st.write("AMT CREDIT", selected_id['AMT_CREDIT'].iloc[0])
        st.write("AMT INCOME TOTAL", selected_id['AMT_INCOME_TOTAL'].iloc[0])
        st.write("AMT ANNUITY", selected_id['AMT_ANNUITY'].iloc[0])

fig, axs = plt.subplots(nrows=1, ncols=1)
shap.summary_plot(shap_values[0], X, plot_type='bar')
st.sidebar.pyplot(fig)

fig1, ax = plt.subplots(nrows=1, ncols=1)
shap.summary_plot(shap_values[0], X)
st.sidebar.pyplot(fig1)

vals = np.abs(shap_values[0])
feature_importance = pd.DataFrame(
    list(zip(X.columns, sum(vals))),
    columns=['col_name', 'feature_importance_vals'])
feature_importance.sort_values(by=['feature_importance_vals'],
                               ascending=False,
                               inplace=True)
val = feature_importance['col_name'].head(6)
Code Example #3
plt.title('Number of cases above a failure prediction level')

#%% FURTHER EXPLORATION: train a SHAP explainer, explore predictions and how decisions are made
import shap
explainer = shap.TreeExplainer(brf_model)
for mode in [df_features_ec_season, df_features_ec_season_permuted]:
    shap_values = explainer.shap_values(mode,
                                        approximate=True,
                                        check_additivity=True)

    # dependence plots
    for name in mode.columns:
        shap.dependence_plot(name, shap_values[1], mode)

    # Summary plots
    shap.summary_plot(shap_values, mode, plot_type="bar")
    shap.summary_plot(shap_values[1], mode, plot_type="bar")
    shap.summary_plot(shap_values[1], mode)  # Failure

    # Decision plots explaining decisions to classify
    shap.decision_plot(explainer.expected_value[1], shap_values[1], mode)
    shap.decision_plot(explainer.expected_value[1], shap_values[1][1],
                       mode.iloc[1])  #2012 year

    # Calculate force plot for a given value 2012
    shap.initjs()
    shap_values_2012 = explainer.shap_values(mode.iloc[[4]])
    shap_display = shap.force_plot(explainer.expected_value[1],
                                   shap_values_2012[1],
                                   mode.iloc[[4]],
                                   matplotlib=True)
Code Example #4
import joblib
import shap
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from pylab import rcParams
rcParams['figure.figsize'] = 8, 16


X_test = pd.read_csv('models/X_test.csv', index_col = 0)
Y_test = pd.read_csv('models/Y_test.csv', index_col = 0)

shap.initjs()

xg_reg = joblib.load('models/XG_boost.model')


explainer = shap.TreeExplainer(xg_reg)
shap_values = explainer.shap_values(X_test)

# visualize the first prediction's explanation

shap.summary_plot(shap_values, X_test, show=False)
plt.tight_layout()
plt.savefig("plots/shap_summary_plot.png")
Code Example #5
fig = px.bar(fi_dt,
             x='Importance',
             y='Feature',
             orientation='h',
             color='Importance')
st.plotly_chart(fig)
###

###
st.title("Shap Value")

import shap

shap_values = shap.TreeExplainer(model).shap_values(X)
st.pyplot(shap.summary_plot(shap_values, X, plot_type="bar"))
###

###
st.title("Shap Summary Plot")

f = plt.figure()
st.pyplot(shap.summary_plot(shap_values, X))
###

###
import shap
import streamlit as st
import streamlit.components.v1 as components

Code Example #6
def main(args):
    """
    Runs evaluation for the data set
        1. Loads model from tar.gz
        2. Reads in test features
        3. Runs an accuracy report
        4. Generates feature importance with SHAP

    Args:
        model-name (str): Name of the trained model, default xgboost
        test-features (str): preprocessed test features for
            evaluation, default test_features.csv
        train-features (str): preprocessed train features for SHAP,
            default train_features.csv
        test-features (str): preprocessed test features for SHAP,
            default test_features.csv
        report-name (str): Name of the evaluation output,
            default evaluation.json
        shap-name (str): Name of the SHAP feature importance
            output file, default shap.csv
        threshold (float): Threshold to cut probabilities at,
            default 0.5
        tau (int): time range for the c-index will be from 0 to tau,
            default 100

    (A hypothetical argparse sketch matching these options follows this
    function.)
    """

    model_path = os.path.join("/opt/ml/processing/model", "model.tar.gz")

    logger.info(f"Extracting model from path: {model_path}")

    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")
    logger.info("Loading model")
    with open(args.model_name, "rb") as f:
        model = pickle.load(f)

    logger.info("Loading train and test data")

    test_features_data = os.path.join("/opt/ml/processing/test",
                                      args.test_features)
    train_features_data = os.path.join("/opt/ml/processing/train",
                                       args.train_features)

    X_test = pd.read_csv(test_features_data, header=0)
    X_train = pd.read_csv(train_features_data, header=0)

    y_test = X_test.iloc[:, 0]
    y_train = X_train.iloc[:, 0]

    # Reverse transform to event and duration columns
    y_test_df = pd.DataFrame(
        np.vstack((np.where(y_test > 0, 1, 0), np.abs(y_test))).T,
        columns=["event", "duration"],
    )

    y_train_df = pd.DataFrame(
        np.vstack((np.where(y_train > 0, 1, 0), np.abs(y_train))).T,
        columns=["event", "duration"],
    )

    X_test.drop(X_test.columns[0], axis=1, inplace=True)
    X_train.drop(X_train.columns[0], axis=1, inplace=True)

    logger.info("Running inference")

    predictions = model.predict(xgboost.DMatrix(X_test.values[:, 1:]),
                                output_margin=False)

    logger.info("Creating evaluation report")

    # NOTE: technically, the evaluation here is not really that of a classifier
    # TODO: Normalize to 0 to 1 scale
    report_dict = classification_report(y_test_df["event"],
                                        predictions > args.threshold,
                                        output_dict=True)
    report_dict["accuracy"] = accuracy_score(y_test_df["event"],
                                             predictions > args.threshold)

    _, y_train_tuple = get_x_y(y_train_df, ["event", "duration"],
                               pos_label=True)
    _, y_test_tuple = get_x_y(y_test_df, ["event", "duration"], pos_label=True)

    concordance_index = concordance_index_ipcw(
        y_train_tuple,
        y_test_tuple,
        predictions,
        tau=args.tau,  # default within 100 days
    )

    report_dict["concordance_index"] = {
        "cindex": float(concordance_index[0]),
        "concordant": int(concordance_index[1]),
        "discordant": int(concordance_index[2]),
        "tied_risk": int(concordance_index[3]),
        "tied_time": int(concordance_index[4]),
    }

    times, score = brier_score(y_train_tuple, y_test_tuple, predictions,
                               y_test_df["duration"].max() - 1)

    report_dict["brier_score"] = {
        "times": times.astype(np.int32).tolist(),
        "score": score.astype(np.float32).tolist(),
    }

    logger.info(f"Classification report:\n{report_dict}")

    evaluation_output_path = os.path.join("/opt/ml/processing/evaluation",
                                          args.report_name)
    logger.info(f"Saving classification report to {evaluation_output_path}")

    logger.debug(report_dict)

    with open(evaluation_output_path, "w") as f:
        f.write(json.dumps(report_dict))

    # SHAP
    latest_job_debugger_artifacts_path = "/opt/ml/processing/debug/debug-output"
    trial = create_trial(latest_job_debugger_artifacts_path)

    shap_values = trial.tensor("full_shap/f0").value(trial.last_complete_step)

    pd.DataFrame(shap_values).to_csv(
        os.path.join("/opt/ml/processing/evaluation", args.shap_name))

    shap_no_base = shap_values[1:, :-1]
    feature_names = X_train.columns
    os.makedirs("/opt/ml/processing/plot/", exist_ok=True)
    logger.info(f"{shap_values.shape}, {shap_no_base.shape}, {X_train.shape}")
    shap.summary_plot(shap_no_base,
                      features=X_train,
                      feature_names=feature_names,
                      show=False)
    plt.savefig("/opt/ml/processing/plot/feature_importance.png",
                bbox_inches="tight")
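
The docstring above documents the command-line options this script expects. As a rough sketch (not taken from the original project), a matching argparse parser might look like the following; the flag names and defaults mirror the docstring, everything else is assumed.

import argparse


def parse_args():
    # Hypothetical parser reconstructed from the documented defaults above.
    parser = argparse.ArgumentParser(
        description="Evaluate the model and compute SHAP feature importance")
    parser.add_argument("--model-name", type=str, default="xgboost")
    parser.add_argument("--train-features", type=str, default="train_features.csv")
    parser.add_argument("--test-features", type=str, default="test_features.csv")
    parser.add_argument("--report-name", type=str, default="evaluation.json")
    parser.add_argument("--shap-name", type=str, default="shap.csv")
    parser.add_argument("--threshold", type=float, default=0.5)
    parser.add_argument("--tau", type=int, default=100)
    # argparse exposes these as args.model_name, args.test_features, etc.,
    # which is how main() reads them above.
    return parser.parse_args()


if __name__ == "__main__":
    main(parse_args())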
Code Example #7
# Just like with the permutation method, we might also want to understand model output in aggregate. Shapley values allow us to do this as well. Run the next cell to initialize the shapley values for each example in the test set (this may also take a few minutes). 

# In[41]:


shap_values = shap.TreeExplainer(rf).shap_values(X_test)[1]


# You can ignore the `setting feature_perturbation` message.

# Run the next cell to see a summary plot of the shapley values for each feature on each of the test examples. The colors indicate the value of the feature. The features are listed in terms of decreasing absolute average shapley value over all the individuals in the dataset.

# In[42]:


shap.summary_plot(shap_values, X_test)


# In the above plot, you might notice a high concentration of points in specific SHAP value ranges. This means that a high proportion of our test set lies in those ranges.
# 
# As with the permutation method, age, sex, poverty index, and diastolic BP seem to be the most important features. Being older has a negative impact on mortality, and being a woman (sex=2.0) has a positive effect. 

# <a name="2-2-3"></a>
# #### 2.2.3 Visualizing Interactions between Features

# The `shap` library also lets you visualize interactions between features using dependence plots. These plot the Shapley value for a given feature for each data point, and color the points in using the value for another feature. This lets us begin to explain the variation in shapley value for a single value of the main feature.

# Run the next cell to see the interaction between Age and Sex. 

# In[43]:
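
# The contents of this cell are not shown in the excerpt. As a sketch, assuming
# the feature columns are literally named "Age" and "Sex" (as the narration
# above implies), the interaction could be drawn with a dependence plot whose
# points are colored by Sex:


shap.dependence_plot("Age", shap_values, X_test, interaction_index="Sex")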
Code Example #8
File: pipeline.py  Project: aliizadi/PyLearner
clf = RandomForestClassifier(n_estimators=100, max_depth=30, n_jobs=-1)

outlier_detector = IsolationForest(contamination=0.15)
without_outliers_classifier = WithoutOutliersClassifier(outlier_detector, clf)
cross_validate_test(X,
                    y,
                    without_outliers_classifier,
                    metric=accuracy_score,
                    outlier_detection=True)

# %%
import shap

tf = transformation(raw_data).drop_columns(columns=[])
X, y = tf.create_X_y()

pipe = make_pipeline(KNNImputer(n_neighbors=7), StandardScaler())
pipe.fit(X)

X_train = pipe.transform(X)

clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

clf.fit(X_train, y)
explainer = shap.TreeExplainer(clf)

shap_values = explainer.shap_values(X_train)[1]

shap.summary_plot(shap_values, X_train, X.columns.tolist())
# shap.summary_plot(shap_values, X_train,  X.columns.tolist(), plot_type="bar")
Code Example #9
# Build Regression Model
model = RandomForestRegressor()
model.fit(X, Y)

# Apply Model to Make Prediction
# Unpickle our model RF so we can use it!
if os.path.isfile("./model.pkl"):
  mod = pickle.load(open("./model.pkl", "rb"))
else:
  raise FileNotFoundError

prediction_RF = mod.predict(df)

st.write("""**Median Predicted value** of owner-occupied homes in $1000s""")
st.write(prediction_RF)
st.write('---')

# Explaining the model's predictions using SHAP values
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

st.header('Feature Importance')
plt.title('Feature importance based on SHAP values')
shap.summary_plot(shap_values, X)
st.pyplot(bbox_inches='tight')
st.write('---')

plt.title('Feature importance based on SHAP values (Bar)')
shap.summary_plot(shap_values, X, plot_type="bar")
st.pyplot(bbox_inches='tight')
Code Example #10
Original file is located at
    https://colab.research.google.com/drive/1qbgfxSs_mTGTnBbBaP87YqOEuOYKxOu-

### Opening the black box

### eli5
"""

import eli5

eli5.show_weights(random_forest, feature_names=features)
"""### SHAP"""

import shap

shap_values = shap.TreeExplainer(random_forest).shap_values(X_rus)
shap.summary_plot(shap_values, X_rus, plot_type="bar", feature_names=features)
"""### LIME"""

import lime

predict_fn_xgb = lambda x: random_forest.predict_proba(x).astype(float)

explainer = lime.lime_tabular.LimeTabularExplainer(X_rus,
                                                   feature_names=features,
                                                   kernel_width=3)
observation_1 = 2
exp = explainer.explain_instance(X_rus[observation_1],
                                 predict_fn_xgb,
                                 num_features=6)
exp.show_in_notebook(show_all=False)
Code Example #11
def _bar_ranking_plot(mean_shap_values, X, folder, max_feats, ext=".png"):
    """Function for customizing and saving SHAP summary bar plot."""
    shap.summary_plot(mean_shap_values, X, plot_type="bar", max_display=max_feats, show=False)
    plt.title("Feature Rankings-All Classes")
    plt.savefig(os.path.join(folder, "shap_bar_rank" + ext), dpi=200, bbox_inches="tight")
Code Example #12
def explain(x,
            model,
            task,
            path="outputs/plots/mlflow_artifacts/shap",
            n_features=5):
    '''Explain a model's decisions based on SHAP value approximation.
    The SHAP algorithm is quadratic in the depth of the trees.
    -> Be careful not to go over 12 for max_depth.

    Args:
        x (DataFrame): Input data
        model: Model to explain
        task (str): Task to perform. Available: regression, classification.
        path (str, optional): Output directory for the plots.
            Defaults to "outputs/plots/mlflow_artifacts/shap".
        n_features (int, optional): Number of most important features for which
            to generate partial dependence plots.

    (A hypothetical call example follows this function.)
    '''

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(x, check_additivity=False)
    features_names = x.columns

    if os.path.exists(path):
        shutil.rmtree(path)

    #Compute top features
    vals = np.abs(shap_values).mean(0)
    feature_importance = pd.DataFrame(
        list(zip(x.columns, sum(vals))),
        columns=['feature', 'feature_importance_vals'])
    feature_importance.sort_values(by=['feature_importance_vals'],
                                   ascending=False,
                                   inplace=True)
    top_features = feature_importance['feature'].values[:n_features]
    os.makedirs("{}/summary_plots".format(path))
    os.makedirs("{}/dependance_plots".format(path))
    # os.makedirs("{}/interaction_plots".format(path))

    plt.rcParams.update({'figure.max_open_warning': 0})

    if task == 'classification':
        ## Summary plots
        shap.summary_plot(shap_values,
                          x,
                          class_names=model.classes_,
                          show=False)
        plt.savefig("{}/summary_plots/main_summary_plot.png".format(path),
                    dpi=150,
                    bbox_inches='tight')
        plt.clf()

        for i, klass in enumerate(model.classes_):
            shap.summary_plot(shap_values[i],
                              x,
                              class_names=model.classes_,
                              show=False)
            plt.savefig("{}/summary_plots/{}_summary_plot.png".format(
                path, klass),
                        dpi=150,
                        bbox_inches='tight')
            plt.clf()
        ## Dependance plots
        for feature in top_features:
            for i, klass in enumerate(model.classes_):
                shap.dependence_plot(
                    feature,
                    shap_values[i],
                    x,
                    title='Impact of the {} variable on the prediction of {}'.
                    format(feature, klass),
                    show=False)
                plt.savefig(
                    "{}/dependance_plots/{}_{}_dependance_plot.png".format(
                        path, klass, feature),
                    dpi=150,
                    bbox_inches='tight')
                plt.clf()

        ## Interaction plots
        #  TODO: take the 5 most important features and check for interactions
    #     plt.clf()
    #     explainer.shap_interaction_values(x)
    #     plt.savefig("plots/shap/interaction_plots/interaction_plot.png",dpi=150, bbox_inches='tight')

    elif task == 'regression':
        ## Summary plots
        shap.summary_plot(shap_values, x, show=False)
        plt.savefig("{}/summary_plots/main_summary_plot.png".format(path),
                    dpi=150,
                    bbox_inches='tight')
        plt.clf()

        ## Dependance plots
        for feature in features_names:

            shap.dependence_plot(
                feature,
                shap_values,
                x,
                title='Impact of the {} variable'.format(feature),
                show=False)
            plt.savefig("{}/dependance_plots/{}_dependance_plot.png".format(
                path, feature),
                        dpi=150,
                        bbox_inches='tight')
            plt.clf()
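
A hypothetical call sketch for the function above (the model and data names are assumptions, not from the original code): `clf` is a fitted tree-based classifier exposing `classes_`, and `X` is the feature DataFrame it was trained on.

explain(X, clf, task="classification",
        path="outputs/plots/mlflow_artifacts/shap", n_features=5)

The same call with task="regression" skips the per-class plots and instead writes a dependence plot for every feature.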
Code Example #13
model(interpreter.test_data[idx, :, 2:].unsqueeze(0))

interpreter.test_data[idx]

interpreter.explainer.subject_ids

interpreter.feat_names

interpreter.feat_scores.reshape(-1, model.n_inputs+1).shape

val_features[:, :4, 2:].numpy().reshape(-1, model.n_inputs+1).shape

# Summarize the effects of all the features
shap.summary_plot(interpreter.feat_scores.reshape(-1, model.n_inputs+1), 
                  features=interpreter.test_data[:, :4, 2:].numpy().reshape(-1, model.n_inputs+1), 
                  feature_names=interpreter.feat_names, plot_type='bar')

# +
# [TODO] Do the same bar plot as above but in plotly
# -

np.abs(interpreter.feat_scores).reshape(-1, interpreter.feat_scores.shape[-1]).shape

mean_abs_shap = np.mean(np.abs(interpreter.feat_scores).reshape(-1, interpreter.feat_scores.shape[-1]), axis=0)
mean_abs_shap

sorted_idx = np.argsort(mean_abs_shap)
sorted_idx

interpreter.feat_names
Code Example #14
File: model.py  Project: monperrus/bugbug
    def train(self, importance_cutoff=0.15, limit=None):
        classes, self.class_names = self.get_labels()
        self.class_names = sort_class_names(self.class_names)

        # Get items and labels, filtering out those for which we have no labels.
        X_gen, y = split_tuple_generator(lambda: self.items_gen(classes))

        # Extract features from the items.
        X = self.extraction_pipeline.fit_transform(X_gen)

        # Calculate labels.
        y = np.array(y)

        if limit:
            X = X[:limit]
            y = y[:limit]

        print(f"X: {X.shape}, y: {y.shape}")

        is_multilabel = isinstance(y[0], np.ndarray)
        is_binary = len(self.class_names) == 2

        # Split dataset in training and test.
        X_train, X_test, y_train, y_test = self.train_test_split(X, y)
        if self.sampler is not None:
            pipeline = make_pipeline(self.sampler, self.clf)
        else:
            pipeline = self.clf

        tracking_metrics = {}

        # Use k-fold cross validation to evaluate results.
        if self.cross_validation_enabled:
            scorings = ["accuracy"]
            if len(self.class_names) == 2:
                scorings += ["precision", "recall"]

            scores = cross_validate(pipeline,
                                    X_train,
                                    y_train,
                                    scoring=scorings,
                                    cv=5)

            print("Cross Validation scores:")
            for scoring in scorings:
                score = scores[f"test_{scoring}"]
                tracking_metrics[f"test_{scoring}"] = {
                    "mean": score.mean(),
                    "std": score.std() * 2,
                }
                print(
                    f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})"
                )

        print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

        # Training on the resampled dataset if sampler is provided.
        if self.sampler is not None:
            X_train, y_train = self.sampler.fit_resample(X_train, y_train)

            print(
                f"resampled X_train: {X_train.shape}, y_train: {y_train.shape}"
            )

        print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

        self.clf.fit(X_train, y_train)

        print("Model trained")

        feature_names = self.get_human_readable_feature_names()
        if self.calculate_importance and len(feature_names):
            explainer = shap.TreeExplainer(self.clf)
            shap_values = explainer.shap_values(X_train)

            # In the binary case, sometimes shap returns a single shap values matrix.
            if is_binary and not isinstance(shap_values, list):
                shap_values = [-shap_values, shap_values]
                summary_plot_value = shap_values[1]
                summary_plot_type = "layered_violin"
            else:
                summary_plot_value = shap_values
                summary_plot_type = None

            shap.summary_plot(
                summary_plot_value,
                to_array(X_train),
                feature_names=feature_names,
                class_names=self.class_names,
                plot_type=summary_plot_type,
                show=False,
            )

            matplotlib.pyplot.savefig("feature_importance.png",
                                      bbox_inches="tight")
            matplotlib.pyplot.xlabel("Impact on model output")
            matplotlib.pyplot.clf()

            important_features = self.get_important_features(
                importance_cutoff, shap_values)

            self.print_feature_importances(important_features)

            # Save the important features in the metric report too
            feature_report = self.save_feature_importances(
                important_features, feature_names)

            tracking_metrics["feature_report"] = feature_report

        print("Training Set scores:")
        y_pred = self.clf.predict(X_train)
        if not is_multilabel:
            print(
                classification_report_imbalanced(y_train,
                                                 y_pred,
                                                 labels=self.class_names))

        print("Test Set scores:")
        # Evaluate results on the test set.
        y_pred = self.clf.predict(X_test)

        if is_multilabel:
            assert isinstance(
                y_pred[0], np.ndarray), "The predictions should be multilabel"

        print(f"No confidence threshold - {len(y_test)} classified")
        if is_multilabel:
            confusion_matrix = metrics.multilabel_confusion_matrix(
                y_test, y_pred)
        else:
            confusion_matrix = metrics.confusion_matrix(
                y_test, y_pred, labels=self.class_names)

            print(
                classification_report_imbalanced(y_test,
                                                 y_pred,
                                                 labels=self.class_names))
            report = classification_report_imbalanced_values(
                y_test, y_pred, labels=self.class_names)

            tracking_metrics["report"] = report

        print_labeled_confusion_matrix(confusion_matrix,
                                       self.class_names,
                                       is_multilabel=is_multilabel)

        tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

        confidence_thresholds = [0.6, 0.7, 0.8, 0.9]

        if is_binary:
            confidence_thresholds = [0.1, 0.2, 0.3, 0.4
                                     ] + confidence_thresholds

        # Evaluate results on the test set for some confidence thresholds.
        for confidence_threshold in confidence_thresholds:
            y_pred_probas = self.clf.predict_proba(X_test)
            confidence_class_names = self.class_names + ["__NOT_CLASSIFIED__"]

            y_pred_filter = []
            classified_indices = []
            for i in range(0, len(y_test)):
                if not is_binary:
                    argmax = np.argmax(y_pred_probas[i])
                else:
                    argmax = 1 if y_pred_probas[i][
                        1] > confidence_threshold else 0

                if y_pred_probas[i][argmax] < confidence_threshold:
                    if not is_multilabel:
                        y_pred_filter.append("__NOT_CLASSIFIED__")
                    continue

                classified_indices.append(i)
                if is_multilabel:
                    y_pred_filter.append(y_pred[i])
                else:
                    y_pred_filter.append(argmax)

            if not is_multilabel:
                y_pred_filter = np.array(y_pred_filter)
                y_pred_filter[classified_indices] = self.le.inverse_transform(
                    np.array(y_pred_filter[classified_indices], dtype=int))

            classified_num = sum(1 for v in y_pred_filter
                                 if v != "__NOT_CLASSIFIED__")

            print(
                f"\nConfidence threshold > {confidence_threshold} - {classified_num} classified"
            )
            if is_multilabel:
                confusion_matrix = metrics.multilabel_confusion_matrix(
                    y_test[classified_indices], np.asarray(y_pred_filter))
            else:
                confusion_matrix = metrics.confusion_matrix(
                    y_test.astype(str),
                    y_pred_filter.astype(str),
                    labels=confidence_class_names,
                )
                print(
                    classification_report_imbalanced(
                        y_test.astype(str),
                        y_pred_filter.astype(str),
                        labels=confidence_class_names,
                    ))
            print_labeled_confusion_matrix(confusion_matrix,
                                           confidence_class_names,
                                           is_multilabel=is_multilabel)

        self.evaluation()

        if self.entire_dataset_training:
            print("Retraining on the entire dataset...")

            if self.sampler is not None:
                X_train, y_train = self.sampler.fit_resample(X, y)
            else:
                X_train = X
                y_train = y

            print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

            self.clf.fit(X_train, y_train)

        joblib.dump(self, self.__class__.__name__.lower())
        if self.store_dataset:
            joblib.dump(X, f"{self.__class__.__name__.lower()}_data_X")
            joblib.dump(y, f"{self.__class__.__name__.lower()}_data_y")

        return tracking_metrics
Code Example #15
File: regression_task.py  Project: kasungayan/MSOM
    dict(enumerate(purchastingpower_categories)))

gender_categories = ['U', 'F', 'M']
channel_categories = ['app', 'wechat', 'pc', 'mobile', 'others']
marital_categories = ['U', 'M', 'S']

X_test_disp['gender'] = X_test_disp['gender'].map(
    dict(enumerate(gender_categories)))
X_test_disp['channel'] = X_test_disp['channel'].map(
    dict(enumerate(channel_categories)))
X_test_disp['marital_status'] = X_test_disp['marital_status'].map(
    dict(enumerate(marital_categories)))

shap_values = explainer.shap_values(X_test)

shap.summary_plot(shap_values, X_test, plot_type="bar", max_display=21)

shap.summary_plot(shap_values, X_test, max_display=21)

predictions = model.predict(X_test)
predictions[predictions < 0] = 0

actual = y_test

print("RMSE Error LightGBM:{}".format(
    np.sqrt(metrics.mean_squared_error(actual, predictions))))

X_test.rename(
    columns={'ordertable-original_unit_price': 'original_unit_price'},
    inplace=True)
Code Example #16
    model = cls.get_xgb_model()
    class_order = model.classes_
    explainer = shap.TreeExplainer(model)
    test = np.where(mutants == mutant)[0]
    mutant_tag = np.unique(y[test])
    assert len(mutant_tag) == 1
    mutant_tag = mutant_tag[0]
    assert len(test) > 1
    x_test, y_test = x[test], y[test]
    dim_names = BulkSignatureGenerator.dim_names()
    shap_values = explainer.shap_values(x_test, approximate=True)
    x_test = pd.DataFrame(x_test, columns=dim_names)
    if plot_type == 'bar':
        shap.summary_plot(shap_values,
                          x_test,
                          max_display=top_display,
                          plot_type='bar',
                          class_names=['deficient', 'basal', 'enhanced'],
                          color=lambda i: list(["#b09c8599","#dc000099","#8491b4ff",])[i])
    else:
        for i, data_in in enumerate(shap_values):
            if class_order[i] == mutant_tag:
                plt.subplots_adjust(left=0.35, right=0.98)
                plt.title("Mutant: %s (%d)" % (mutant, mutant_tag))
                shap.summary_plot(data_in,
                                  x_test,
                                  max_display=top_display,
                                  plot_type='violin',
                                  class_names=['deficient', 'basal', 'enhanced'])

Code Example #17
            'infections_value': infections_value,
            'accumulated': accumulated/100}
    features = pd.DataFrame(data, index=[0])
    return features

s = user_input_features()

print(shap.__version__)

def st_shap(plot, height=None):
    shap_html = f"<head>{shap.getjs()}</head><body>{plot.html()}</body>"
    components.html(shap_html, height=height)

pickle_file = '../models/model_lgbm_reg'
if st.sidebar.button('Calculate Estimated Reproduction Rate'):
	st.subheader('Specified Input parameters')
	st.write(s)
	model = pickle.load(open(pickle_file,'rb'))
	model_predict = (model.predict(s)).astype(str)
	st.markdown('**Estimated Reproduction Rate:**')
	st.write(model_predict[0])

	st.markdown('**Feature importance based on SHAP values**')
	explainerModel = shap.TreeExplainer(model)
	shap_values_Model = explainerModel.shap_values(s)
	st_shap(shap.force_plot(explainerModel.expected_value, shap_values_Model[0], s.iloc[[0]]), 125)
	shap.summary_plot(shap_values_Model, s, plot_type="bar")
	st.pyplot(bbox_inches='tight')


Code Example #18
model_white.load_model("../Models/Optimal_XGB_survival_white_model.m")
print(model_white)

# +
#SHAP model to get importance for AA
shap_values = shap.TreeExplainer(model_black).shap_values(X_black_survival)

#Top pathways for AA people to determine survival
black_imp_df = pd.read_csv("../Results/black.csv", header='infer', sep=",")
flist = black_imp_df["Feature"].tolist()
#        "IPA:Myc_Mediated_Apoptosis_Signaling","IPA:EGF_Signaling","HM:Oxidative_phosphorylation",
#        "TPW:Immunogenic_Cell_Death_(ICD)","ICRscore","IPA:UVB_Induced_MAPK_Signaling","IPA:UVA_Induced_MAPK_Signaling"]

fids = [X_black_survival.columns.get_loc(c) for c in flist]
shap.summary_plot(shap_values[:, fids],
                  X_black_survival.iloc[:, fids],
                  sort=False)

# +
#SHAP model to get importance for white
shap_values = shap.TreeExplainer(model_white).shap_values(X_white_survival)

#Top pathways for white people to determine survival
white_imp_df = pd.read_csv("../Results/white.csv", header='infer', sep=",")
flist = white_imp_df["Features"][0:20]

#flist = ["IPA:Telomere_Extension_by_Telomerase","HM:PI3K_Akt_mTOR_signaling","LM:Proliferation",
#        "HM:Wnt_beta_catenin_signaling","TBI:Barrier_genes","IPA:AMPK_Signaling","IPA:PI3K_AKT_Signaling",
#        "HM:Angiogenesis","IPA:ErbB_Signaling","IPA:ERK5_Signaling","HM:G2M_checkpoint",
#        "HM:p53_pathway","HM:UV_response_down","IPA:UVC_Induced_MAPK_Signaling","IPA:HER_2_Signaling_in_Breast_Cancer",
#        "HM:Reactive_oxigen_species_pathway","IPA:VEGF_Signaling","IPA:Estrogen_Dependent_Breast_Cancer_Signaling",
Code Example #19
X_train_final = np.concatenate((X_train_A_enc, X_train_B), axis=1)
X_test_final = np.concatenate((X_test_A_enc, X_test_B), axis=1)

#%%
xgb_classifier = xgb.XGBClassifier(n_estimators=90,
                                   max_depth=4,
                                   learning_rate=0.075,
                                   colsample_bytree=0.7,
                                   subsample=0.8,
                                   reg_lambda=16,
                                   gamma=1,
                                   min_child_weight=1.5,
                                   objective='binary:logistic',
                                   scale_pos_weight=20)

xgb_classifier.fit(X_train_final, y_train)
y_score = xgb_classifier.predict_proba(X_train_final)[:, 1:]
print(f'ROC AUC: {roc_auc_score(y_train, y_score):0.3f}')
print(f'AUPRC: {auprc(y_train, y_score):0.3f}')

#%% SHAP
import shap
import matplotlib.pyplot as plt
shap.initjs()
explainer = shap.TreeExplainer(xgb_classifier)
shap_values = explainer.shap_values(X_train_final)
plt.figure()
shap.summary_plot(shap_values, X_train_final, plot_type='bar')
plt.show()
Code Example #20
    def train(self, importance_cutoff=0.15):
        classes, self.class_names = self.get_labels()
        self.class_names = sort_class_names(self.class_names)

        # Get items and labels, filtering out those for which we have no labels.
        X_iter, y_iter = split_tuple_iterator(self.items_gen(classes))

        # Extract features from the items.
        X = self.extraction_pipeline.fit_transform([item for item in X_iter])

        # Calculate labels.
        y = np.array(y_iter)

        print(f"X: {X.shape}, y: {y.shape}")

        is_multilabel = isinstance(y[0], np.ndarray)

        # Split dataset in training and test.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=0
        )
        if self.sampler is not None:
            pipeline = make_pipeline(self.sampler, self.clf)
        else:
            pipeline = self.clf

        tracking_metrics = {}

        # Use k-fold cross validation to evaluate results.
        if self.cross_validation_enabled:
            scorings = ["accuracy"]
            if len(self.class_names) == 2:
                scorings += ["precision", "recall"]

            scores = cross_validate(pipeline, X_train, y_train, scoring=scorings, cv=5)

            print("Cross Validation scores:")
            for scoring in scorings:
                score = scores[f"test_{scoring}"]
                tracking_metrics[f"test_{scoring}"] = {
                    "mean": score.mean(),
                    "std": score.std() * 2,
                }
                print(
                    f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})"
                )

        # Training on the resampled dataset if sampler is provided.
        if self.sampler is not None:
            X_train, y_train = self.sampler.fit_resample(X_train, y_train)

        print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
        print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

        self.clf.fit(X_train, y_train)

        feature_names = self.get_human_readable_feature_names()
        if self.calculate_importance and len(feature_names):
            explainer = shap.TreeExplainer(self.clf)
            shap_values = explainer.shap_values(X_train)

            shap.summary_plot(
                shap_values,
                X_train.toarray(),
                feature_names=feature_names,
                class_names=self.class_names,
                plot_type="layered_violin"
                if not isinstance(shap_values, list)
                else None,
                show=False,
            )

            matplotlib.pyplot.savefig("feature_importance.png", bbox_inches="tight")

            important_features = self.get_important_features(
                importance_cutoff, shap_values
            )

            self.print_feature_importances(important_features, feature_names)

        print("Test Set scores:")
        # Evaluate results on the test set.
        y_pred = self.clf.predict(X_test)

        if is_multilabel:
            assert isinstance(
                y_pred[0], np.ndarray
            ), "The predictions should be multilabel"

        print(f"No confidence threshold - {len(y_test)} classified")
        if is_multilabel:
            confusion_matrix = metrics.multilabel_confusion_matrix(y_test, y_pred)
        else:
            confusion_matrix = metrics.confusion_matrix(
                y_test, y_pred, labels=self.class_names
            )

            print(
                classification_report_imbalanced(
                    y_test, y_pred, labels=self.class_names
                )
            )
            report = classification_report_imbalanced_values(
                y_test, y_pred, labels=self.class_names
            )

            tracking_metrics["report"] = report

        print_labeled_confusion_matrix(
            confusion_matrix, self.class_names, is_multilabel=is_multilabel
        )

        tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

        # Evaluate results on the test set for some confidence thresholds.
        for confidence_threshold in [0.6, 0.7, 0.8, 0.9]:
            y_pred_probas = self.clf.predict_proba(X_test)

            y_test_filter = []
            y_pred_filter = []
            for i in range(0, len(y_test)):
                argmax = np.argmax(y_pred_probas[i])
                if y_pred_probas[i][argmax] < confidence_threshold:
                    continue

                y_test_filter.append(y_test[i])
                if is_multilabel:
                    y_pred_filter.append(y_pred[i])
                else:
                    y_pred_filter.append(argmax)

            if not is_multilabel:
                y_pred_filter = self.le.inverse_transform(y_pred_filter)

            print(
                f"\nConfidence threshold > {confidence_threshold} - {len(y_test_filter)} classified"
            )
            if len(y_test_filter) != 0:
                if is_multilabel:
                    confusion_matrix = metrics.multilabel_confusion_matrix(
                        np.asarray(y_test_filter), np.asarray(y_pred_filter)
                    )
                else:
                    confusion_matrix = metrics.confusion_matrix(
                        np.asarray(y_test_filter),
                        np.asarray(y_pred_filter),
                        labels=self.class_names,
                    )
                    print(
                        classification_report_imbalanced(
                            y_test_filter, y_pred_filter, labels=self.class_names
                        )
                    )
                print_labeled_confusion_matrix(
                    confusion_matrix, self.class_names, is_multilabel=is_multilabel
                )

        joblib.dump(self, self.__class__.__name__.lower())

        return tracking_metrics
Code Example #21
    'number_inpatient', 'num_medications', 'number_diagnoses',
    'num_lab_procedures', 'num_procedures', 'time_in_hospital',
    'number_outpatient', 'number_emergency', 'gender_Female', 'payer_code_?',
    'medical_specialty_?', 'diag_1_428', 'diag_1_414', 'diabetesMed_Yes',
    'A1Cresult_None'
]

# Some versions of shap package error when mixing bools and numerics
X = data[base_features].astype(float)

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# For speed, we will calculate shap values on smaller subset of the validation data
small_val_X = val_X.iloc[:150]
my_model = RandomForestClassifier(n_estimators=30,
                                  random_state=1).fit(train_X, train_y)
data.describe()

explainer = shap.TreeExplainer(my_model)
shap_values = explainer.shap_values(small_val_X)
shap.summary_plot(shap_values[1], small_val_X)

feature_with_bigger_range_of_effects = 'diag_1_428'
shap.summary_plot(shap_values[1], small_val_X)

bigger_effect_when_changed = "diag_1_428"
shap.summary_plot(shap_values[1], small_val_X)

shap.dependence_plot('num_lab_procedures', shap_values[1], small_val_X)
shap.dependence_plot('num_medications', shap_values[1], small_val_X)
Code Example #22
    def generate_feature_importance_data(self, probs, importance):
        X_shap_values = shap.TreeExplainer(self.model.clf).shap_values(self.X)

        pred_class = self.model.le.inverse_transform([probs[0].argmax()])[0]

        features = []
        for i, (val, feature_index, is_positive) in enumerate(
                importance["importances"]["classes"][pred_class][0]):
            name = importance["feature_legend"][str(i + 1)]
            value = importance["importances"]["values"][0, int(feature_index)]

            shap.summary_plot(
                X_shap_values[:,
                              int(feature_index)].reshape(self.X.shape[0], 1),
                self.X[:, int(feature_index)].reshape(self.X.shape[0], 1),
                feature_names=[""],
                plot_type="layered_violin",
                show=False,
            )
            matplotlib.pyplot.xlabel("Impact on model output")
            img = io.BytesIO()
            matplotlib.pyplot.savefig(img, bbox_inches="tight")
            matplotlib.pyplot.clf()
            img.seek(0)
            base64_img = base64.b64encode(img.read()).decode("ascii")

            X = self.X[:, int(feature_index)]
            y = self.y[X != 0]
            X = X[X != 0]
            spearman = spearmanr(X, y)

            buggy_X = X[y == 1]
            clean_X = X[y == 0]
            median = np.median(X)
            median_clean = np.median(clean_X)
            median_buggy = np.median(buggy_X)

            perc_buggy_values_higher_than_median = (
                buggy_X >= median).sum() / buggy_X.shape[0]
            perc_buggy_values_lower_than_median = (
                buggy_X < median).sum() / buggy_X.shape[0]
            perc_clean_values_higher_than_median = (
                clean_X > median).sum() / clean_X.shape[0]
            perc_clean_values_lower_than_median = (
                clean_X <= median).sum() / clean_X.shape[0]

            logger.info("Feature: {}".format(name))
            logger.info("Shap value: {}{}".format(
                "+" if (is_positive) else "-", val))
            logger.info(f"spearman:  {spearman}")
            logger.info(f"value: {value}")
            logger.info(f"overall mean: {np.mean(X)}")
            logger.info(f"overall median: {np.median(X)}")
            logger.info(f"mean for y == 0: {np.mean(clean_X)}")
            logger.info(f"mean for y == 1: {np.mean(buggy_X)}")
            logger.info(f"median for y == 0: {np.median(clean_X)}")
            logger.info(f"median for y == 1: {np.median(buggy_X)}")
            logger.info(
                f"perc_buggy_values_higher_than_median: {perc_buggy_values_higher_than_median}"
            )
            logger.info(
                f"perc_buggy_values_lower_than_median: {perc_buggy_values_lower_than_median}"
            )
            logger.info(
                f"perc_clean_values_higher_than_median: {perc_clean_values_higher_than_median}"
            )
            logger.info(
                f"perc_clean_values_lower_than_median: {perc_clean_values_lower_than_median}"
            )

            features.append({
                "index":
                i + 1,
                "name":
                name,
                "shap":
                float(f'{"+" if (is_positive) else "-"}{val}'),
                "value":
                importance["importances"]["values"][0, int(feature_index)],
                "spearman":
                spearman,
                "median":
                median,
                "median_bug_introducing":
                median_buggy,
                "median_clean":
                median_clean,
                "perc_buggy_values_higher_than_median":
                perc_buggy_values_higher_than_median,
                "perc_buggy_values_lower_than_median":
                perc_buggy_values_lower_than_median,
                "perc_clean_values_higher_than_median":
                perc_clean_values_higher_than_median,
                "perc_clean_values_lower_than_median":
                perc_clean_values_lower_than_median,
                "plot":
                base64_img,
            })

        # Group together features that are very similar to each other, so we can simplify the explanation
        # to users.
        attributes = ["Total", "Maximum", "Minimum", "Average"]
        already_added = set()
        feature_groups = []
        for i1, f1 in enumerate(features):
            if i1 in already_added:
                continue

            feature_groups.append([f1])

            for j, f2 in enumerate(features[i1 + 1:]):
                i2 = j + i1 + 1

                f1_name = f1["name"]
                for attribute in attributes:
                    if f1_name.startswith(attribute):
                        f1_name = f1_name[len(attribute) + 1:]
                        break

                f2_name = f2["name"]
                for attribute in attributes:
                    if f2_name.startswith(attribute):
                        f2_name = f2_name[len(attribute) + 1:]
                        break

                if f1_name != f2_name:
                    continue

                already_added.add(i2)
                feature_groups[-1].append(f2)

        # Pick a representative example from each group.
        features = []
        for feature_group in feature_groups:
            shap_sum = sum(f["shap"] for f in feature_group)

            # Only select easily explainable features from the group.
            selected = [
                f for f in feature_group
                if (f["shap"] > 0 and abs(f["value"] -
                                          f["median_bug_introducing"]) <
                    abs(f["value"] - f["median_clean"])) or (
                        f["shap"] < 0 and abs(f["value"] - f["median_clean"]) <
                        abs(f["value"] - f["median_bug_introducing"]))
            ]

            # If there are no easily explainable features in the group, select all features of the group.
            if len(selected) == 0:
                selected = feature_group

            def feature_sort_key(f):
                if f["shap"] > 0 and f["spearman"][0] > 0:
                    return f["perc_buggy_values_higher_than_median"]
                elif f["shap"] > 0 and f["spearman"][0] < 0:
                    return f["perc_buggy_values_lower_than_median"]
                elif f["shap"] < 0 and f["spearman"][0] > 0:
                    return f["perc_clean_values_lower_than_median"]
                elif f["shap"] < 0 and f["spearman"][0] < 0:
                    return f["perc_clean_values_higher_than_median"]

            feature = max(selected, key=feature_sort_key)
            feature["shap"] = shap_sum

            for attribute in attributes:
                if feature["name"].startswith(attribute):
                    feature["name"] = feature["name"][len(attribute) +
                                                      1:].capitalize()
                    break

            features.append(feature)

        with open("importances.json", "w") as f:
            json.dump(features, f)
Code Example #23
def persist_shap(model, X_train):
    shap_values = shap.TreeExplainer(model).shap_values(X_train)
    shap.summary_plot(shap_values, X_train, show=False)
    plt.savefig('/dbfs/mnt/documents/images/shap.png')
Code Example #24
# Baseline with LGB
import lightgbm as lgb
from math import sqrt
from sklearn.metrics import mean_squared_error
lgb_dtrain = lgb.Dataset(data=train_x, label=train_y)
lgb_param = {
    'max_depth': 10,
    'learning_rate': 0.01,
    'n_estimators': 1000,
    'objective': 'regression'
}
lgb_model = lgb.train(params=lgb_param, train_set=lgb_dtrain)
lgb_model_predict = lgb_model.predict(test_x)
print("RMSE: {}".format(sqrt(mean_squared_error(lgb_model_predict, test_y))))

# !pip install shap
# import skimage -> skimage.__version__ (skimage version)
# skimage version upgrade -> !pip install --upgrade scikit-image
import shap
explainer = shap.TreeExplainer(lgb_model)
shap_values = explainer.shap_values(test_x)

# Sample
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0, :], test_x.iloc[0, :])
# Higher effect with Red Color, Lower effect with Blue Color

shap.force_plot(explainer.expected_value, shap_values, test_x)
shap.summary_plot(shap_values, test_x)

shap.summary_plot(shap_values, test_x, plot_type="bar")
Code Example #25
    categorical_binary = np.array([i in categorical for i in Xcols])
    cab.run(X_train, y_train, X_test, y_test, None, categorical_binary)

    bst = cab.bst

    with open(join('..', 'data', 'ml_data', model_name), 'wb') as f:
        pickle.dump(bst, f)
else:
    # shap on a subset of values
    with open(join('..', 'data', 'ml_data', model_name), 'rb') as f:
        bst = pickle.load(f)

# get a subset of the test set and get shap values
# or just load it from the save file
if compute_shap:
    rand_inds = np.random.choice(np.arange(len(X_test)), n_subset_for_shap)
    X_test_sub = X_test[rand_inds, :]
    y_test_sub = y_test[rand_inds]
    shap_values = shap.TreeExplainer(bst).shap_values(X_test_sub)
    shap.summary_plot(shap_values, X_test_sub, feature_names=Xcols)

    with open(join('..', 'data', 'ml_data', 'lightgbm_' + str(n_estimators) + '_shap.pickle'), 'wb') as f:
        pickle.dump((X_test_sub, y_test_sub, shap_values), f)

else:
    with open(join('..', 'data', 'ml_data', 'lightgbm_' + str(n_estimators) + '_shap.pickle'), 'rb') as f:
        X_test_sub, y_test_sub, shap_values = pickle.load(f)

    shap.summary_plot(shap_values, X_test_sub, feature_names=Xcols)
Code Example #26
    pdp_feat = pdp.pdp_isolate(model=lgb_clf,
                               dataset=test_X,
                               model_features=feature_names,
                               feature=feature)
    pdp.pdp_plot(pdp_feat, feature)
    plt.show()


pdp_plotter('service_to_uza_area', lgb_clf)

# SHAP

# Re-fit the model and extract the SHAP tree explainer features
# to determine which features are ranked as important most often

top_feats = shuffle_SHAP(X, y, lgb_clf, n_shuffles=100)
# top_feats.to_csv(DATA_PATH + 'top_features.csv')

explainer = shap.TreeExplainer(lgb_clf)
shap_values = explainer.shap_values(test_X)
shap.summary_plot(shap_values, test_X)

# SHAP Dependence PLot
shap.dependence_plot("Unlinked_Passenger_Trips_FY", shap_values, test_X)

# Denver RTD [1 - Ridership is Stable / Increasing]
original.loc[original.HQ_City.str.contains('Denver')]
data.loc[data['5_digit_NTD_ID'] == 80006]
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[630,:])
Code Example #27
def run_explanations(csv_path, csv_columns, target_column, zero_value):
    # Read the dataset from the provided CSV and print out information about it.
    df = pd.read_csv(csv_path,
                     names=csv_columns,
                     skipinitialspace=True,
                     skiprows=1)
    #df = df.drop('Target',axis=1)
    input_features = [name for name in csv_columns if name != target_column]
    #data, labels = shap.datasets.adult(display=True)
    if target_column not in csv_columns:
        print("target column error")
        return ("target column error")
    elif zero_value not in df[target_column].tolist():
        if str.isdecimal(zero_value) and (
                np.int64(zero_value) in df[target_column].tolist()
                or np.float64(zero_value) in df[target_column].tolist()):
            print("happy")
            zero_value = np.int64(zero_value)
        else:
            print(zero_value, df[target_column].tolist(),
                  df[target_column].dtype)
            return ("zero value error")

    labels = df[target_column].tolist()
    #labels = np.array([int(label) for label in labels])
    labels2 = []
    for label in labels:
        if label == zero_value:
            labels2.append(0)
        else:
            labels2.append(1)
    labels = np.array(labels2)

    data = df[input_features]

    for feature in input_features:
        if data[feature].dtype is not np.dtype(
                np.int64) and data[feature].dtype is not np.dtype(
                    np.float64) and data[feature].dtype is not np.dtype(
                        np.float32):
            data[feature] = data[feature].astype('category')

    cat_cols = data.select_dtypes(['category']).columns
    data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)

    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42)

    data_disp, labels_disp = shap.datasets.adult(display=True)
    X_train_disp, X_test_disp, y_train_disp, y_test_disp = train_test_split(
        data_disp, labels_disp, test_size=0.3, random_state=42)

    xgc = xgb.XGBClassifier(n_estimators=500,
                            max_depth=5,
                            base_score=0.5,
                            objective='binary:logistic',
                            random_state=42)
    xgc.fit(X_train, y_train)
    predictions = xgc.predict(X_test)

    fig = plt.figure(figsize=(16, 12))
    title = fig.suptitle("Default Feature Importances from XGBoost",
                         fontsize=14)

    ax1 = fig.add_subplot(2, 2, 1)
    xgb.plot_importance(xgc, importance_type='weight', ax=ax1)
    t = ax1.set_title("Feature Importance - Feature Weight")

    ax2 = fig.add_subplot(2, 2, 2)
    xgb.plot_importance(xgc, importance_type='gain', ax=ax2)
    t = ax2.set_title("Feature Importance - Split Mean Gain")

    ax3 = fig.add_subplot(2, 2, 3)
    xgb.plot_importance(xgc, importance_type='cover', ax=ax3)
    t = ax3.set_title("Feature Importance - Sample Coverage")

    #plt.savefig('static/explanations.png')

    explanation = eli5.explain_weights(xgc.get_booster())
    explanation_html = eli5.formatters.html.format_as_html(explanation)
    print(explanation_html)

    with open("templates/explanation.html", "a+") as file:
        file.write(explanation_html)

    doc_num = 0
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    #eli5.show_prediction(xgc.get_booster(), X_test.iloc[doc_num],
    #                     feature_names=list(data.columns) ,show_feature_values=True)
    explanation2 = eli5.explain_prediction(xgc.get_booster(),
                                           X_test.iloc[doc_num],
                                           feature_names=list(data.columns))
    explanation_html2 = eli5.formatters.html.format_as_html(explanation2)
    with open("templates/explanation.html", "a") as file:
        file.write(explanation_html2)

    doc_num = 2
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    #eli5.show_prediction(xgc.get_booster(), X_test.iloc[doc_num], feature_names=list(data.columns) ,show_feature_values=True)
    explanation3 = eli5.explain_prediction(xgc.get_booster(),
                                           X_test.iloc[doc_num],
                                           feature_names=list(data.columns))
    explanation_html3 = eli5.formatters.html.format_as_html(explanation3)
    with open("templates/explanation.html", "a") as file:
        file.write(explanation_html3)

    #target_names = ['$50K or less', 'More than $50K']
    interpreter = Interpretation(training_data=X_test,
                                 training_labels=y_test,
                                 feature_names=list(data.columns))
    im_model = InMemoryModel(xgc.predict_proba, examples=X_train)

    plots = interpreter.feature_importance.plot_feature_importance(
        im_model, ascending=True, n_samples=23000)

    plots[0].savefig('skater.png')

    features_pdp = input_features

    xgc_np = xgb.XGBClassifier(n_estimators=500,
                               max_depth=5,
                               base_score=0.5,
                               objective='binary:logistic',
                               random_state=42)
    xgc_np.fit(X_train.values, y_train)

    # In[ ]:

    from skater.core.local_interpretation.lime.lime_tabular import LimeTabularExplainer

    exp = LimeTabularExplainer(X_test.values,
                               feature_names=list(data.columns),
                               discretize_continuous=True)

    doc_num = 0
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    instance = exp.explain_instance(X_test.iloc[doc_num].values,
                                    xgc_np.predict_proba)
    instance.save_to_file('templates/lime.html', show_all=False)

    doc_num = 2
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    instance2 = exp.explain_instance(X_test.iloc[doc_num].values,
                                     xgc_np.predict_proba)
    instance2.save_to_file('templates/lime2.html', show_all=False)

    explainer = shap.TreeExplainer(xgc)
    shap_values = explainer.shap_values(X_test)
    pd.DataFrame(shap_values).head()

    #shap.force_plot(explainer.expected_value, shap_values[:,], X_test_disp.iloc[:,],show=False,matplotlib=True)
    #plt.savefig("static/force_plot.png")

    shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
    plt.savefig("static/summary_plot.png")
    plt.clf()  # clear the figure so the next summary plot does not draw on top of it

    shap.summary_plot(shap_values, X_test, show=False)
    plt.savefig("static/summary_plot2.png")

    return "Everyone Happy"
Code Example #28
def calculate_average_shap(file_name, treatment, outcome, algorithm, top_features, plot_file=''):
    
    if file_name == '':
        print("Invalid treatment/outcome combination (" + str(treatment) + ", " + str(outcome) + ")")
        return None
    else:
        with open(file_name, 'rb') as file:
            model_file = pickle.load(file)

    model = model_file['model_original']
    data = model_file['train']
    data_test = model_file['test']

    X = data.drop(["COMORB_DEATH"], axis=1, inplace = False)
    y = data["COMORB_DEATH"]
    X_test = data_test.drop(["COMORB_DEATH"], axis=1, inplace = False)
    y_test = data_test["COMORB_DEATH"]

    ## Calculate SHAP values (for each observation x feature)
    if algorithm in ['rf','cart','xgboost']:
        explainer = shap.TreeExplainer(model,
                                       data=X_test,
                                       model_output="probability")
        shap_values = explainer.shap_values(X_test)
        
        ## only save plot for tree models
        if plot_file != '':
            plt.close()
            if isinstance(shap_values, list):
                shap.summary_plot(shap_values[1], X_test, show=False,
                          max_display=10,
                          plot_size=(10, 5),
                          plot_type="violin")  
            else:             
                shap.summary_plot(shap_values, X_test, show=False,
                      max_display=10,
                      plot_size=(10, 5),
                      plot_type="violin")  
                
            f = plt.gcf()
            ax = plt.gca()
            
            plt.xlabel('SHAP value (impact on model output)')   
        
            f.savefig(plot_file, bbox_inches='tight')
            
            plt.close()
        
    else:
        X_train_summary = shap.kmeans(X, 50)
        explainer = shap.KernelExplainer(model.predict_proba,
                                         data=X_train_summary,
                                         model_output="logit")
        shap_values = explainer.shap_values(X_test)
        
        if plot_file != '':
            print('Cannot plot summary plot for non-tree models')


    df = pd.DataFrame(columns = ['Risk Factor', 'Mean Absolute SHAP Value']) 
    
    for i in range(0,len(X.columns)):
        if isinstance(shap_values, list):
            df = df.append({'Risk Factor' : X.columns[i], 'Mean Absolute SHAP Value' : pd.Series(shap_values[1][:,i]).abs().mean()},  
                ignore_index = True) 
        else:
            df = df.append({'Risk Factor' : X.columns[i], 'Mean Absolute SHAP Value' : pd.Series(shap_values[:,i]).abs().mean()},  
                ignore_index = True)
    
    df = df.sort_values(by='Mean Absolute SHAP Value', ascending=False)    
    df = df.head(top_features)
    
    return df
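
The per-feature loop above builds the ranking one appended row at a time; the same mean absolute SHAP value ranking can be computed directly with numpy. A minimal sketch (the helper name mean_abs_shap is illustrative; it mirrors the list-vs-matrix handling of the function above):

import numpy as np
import pandas as pd


def mean_abs_shap(shap_values, feature_names, top_features=10):
    # For classifiers, TreeExplainer may return one matrix per class; keep the positive class.
    vals = shap_values[1] if isinstance(shap_values, list) else shap_values
    ranking = pd.DataFrame({
        'Risk Factor': list(feature_names),
        'Mean Absolute SHAP Value': np.abs(vals).mean(axis=0),
    })
    return ranking.sort_values('Mean Absolute SHAP Value', ascending=False).head(top_features)

For example, mean_abs_shap(shap_values, X.columns, top_features=10) would reproduce the ranking returned above.
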
Code Example #29
def main():
    st.title("Feature Interpretation using SHAP")
    st.subheader("Sayantan Ghosh")
    
    @st.cache
    # Loading the Boston data ---------------------------------------
    def load_data():
        boston = load_boston()
        return boston
    
    # Loading the dataset
    data_load_state = st.text("Loading Data")   
    boston = load_data()
    data_load_state = st.text("Data Loaded")
    
    
    #-----------------------------------------------------------------
    @st.cache
    def load_dataframe():
        Boston = pd.DataFrame(boston.data, columns=boston.feature_names)
        Boston['MEDV'] = boston.target
        return Boston
    
    Boston = load_dataframe()
    #Showing the snapshot of the data
    st.write(Boston.head(5))

    user_input = st.text_input("Give a value for RAD", "1")

    record = Boston.loc[Boston['RAD'] == int(user_input)]

    st.write(record)

    #Just pass the record into the Model.predict(record)
    
    #--------------------------------------------------------------------------------------------------------
    #Defining X and Y
    x = Boston.loc[:, Boston.columns != 'MEDV'].values
    y = Boston.loc[:, Boston.columns == 'MEDV'].values
    x_train, x_test, y_train, y_test = train_test_split (Boston[boston.feature_names],y, test_size = 0.25, random_state=34)
    
    # Building the dashboard on XGBOOST model:
    st.title('Model the Boston Housing Dataset using XGBOOST')
    
    # creating DMatrices for XGBOOST application
    #dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=boston.feature_names)
    #dtest  = xgb.DMatrix(x_test, label=y_test, feature_names=boston.feature_names)
   
    # Loading the cross-validated tuned XGBOOST model
    #loaded_model = pickle.load(open("xgboost_cv_best_pickle.dat", "rb"))
    #loaded_predictions = loaded_model.predict(dtest)
    
    
    
    loaded_model = xgb.XGBRegressor(
        n_estimators=150,
        reg_lambda=1,
        gamma=0,
        max_depth=8
    )
    
    loaded_model.fit(x_train,y_train)
    loaded_predictions = loaded_model.predict(x_test)
    st.write('RMSE of the XGBoost model on test set:', round(np.sqrt(metrics.mean_squared_error(y_test, loaded_predictions)),2))
    
    #feature importance-------------------------------------------------------------------------------------------
    try:
        
        st.write('The standard XGBoost importance plot shows that the ranking of the most important feature is not stable; select'
                 ' different importance types using the selectbox below')
        importance_type = st.selectbox('Select the desired importance type', ('weight','gain','cover'),index=0)
        importance_plot = xgb.plot_importance(loaded_model,importance_type=importance_type)
        pl.title ('xgboost.plot_importance(best XGBoost model) importance type = '+ str(importance_type))
        st.pyplot(bbox_inches='tight')
        pl.clf()
    except:
        pass
    
    
    #Feature Importance------------------------------------------------------------------------------------------
    st.write('To handle this inconsistency, SHAP values provide a more robust measure, one application of which is feature importance')
    explainer = shap.TreeExplainer(loaded_model)
    shap_values = explainer.shap_values(x_train)
    pl.title('Assessing feature importance based on Shap values')
    shap.summary_plot(shap_values,x_train,plot_type="bar",show=False)
    st.pyplot(bbox_inches='tight')
    pl.clf()
    
    
    #--------------------------------------------------------------------------------------------------------------
    st.write('SHAP values can also be used to show, for each feature, the distribution of the training set\'s'
             ' SHAP values in relation to the target value, in this case the median house value (MEDV)')
    pl.title('Total distribution of observations based on Shap values, colored by Target value')
    shap.summary_plot(shap_values,x_train,show=False)
    st.pyplot(bbox_inches='tight')
    pl.clf()
    
    
    #----------------------------------------------
    st.write('Another use of SHAP values is GDPR compliance: one should be able to give detailed information as to'
             ' why a specific prediction was made.')
    expectation = explainer.expected_value
    
    individual = st.number_input('Select the desired record from the training set for detailed explanation.'
                                           ,min_value=1
                                           ,max_value=1000)
    predicted_values = loaded_model.predict(x_train)
    real_value = y_train[individual]
    st.write('The real median house value for this individual record is: '+str(real_value))
    st.write('The predicted median house value for this individual record is: '+str(predicted_values[individual]))
    st.write('This prediction is calculated as follows: '
              'The average median house value: ('+str(expectation)+')'+
               ' + the sum of the SHAP values. ')
    st.write('For this individual record the sum of the SHAP values is: '+str(sum(shap_values[individual,:])))
    st.write('This yields a predicted median house value of: '+str(expectation)+' + '+str(sum(shap_values[individual,:]))+
             ' = '+str(expectation+(sum(shap_values[individual,:]))))
    st.write('Which features caused this specific prediction? Features in red increased the prediction, features in blue decreased it.')
    shap.force_plot(explainer.expected_value, shap_values[individual,:],x_train.iloc[individual,:],matplotlib=True,show=False
                    ,figsize=(16,5))
    st.pyplot(bbox_inches='tight',dpi=300,pad_inches=0)
    pl.clf()
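
The decomposition described above can also be checked programmatically: SHAP values are additive, so the base value plus the row sum of the SHAP values should reproduce the model's prediction for that row. A minimal sketch, reusing loaded_model, explainer, shap_values, and x_train from above (the row index and tolerance are illustrative):

import numpy as np

row = 0  # illustrative row index
base_value = float(np.ravel(explainer.expected_value)[0])
reconstructed = base_value + shap_values[row, :].sum()
predicted = float(loaded_model.predict(x_train.iloc[[row]])[0])
# The two numbers should agree up to numerical tolerance.
assert np.isclose(reconstructed, predicted, atol=1e-2)
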
Code Example #30
                fts = ""
                for ft in imp:
                    fts = fts + ft[0] + " " + str(it) + " " + str(
                        vol) + " " + str(no) + " " + str(ft[2]) + " " + str(
                            round(ft[1], 4)) + "\n"

                print(fts)
                imp_str = imp_str + fts

                if get_shap:
                    shap_vals = shap.TreeExplainer(xg_reg).shap_values(
                        train[features])

                    shap.summary_plot(shap_vals,
                                      train[features],
                                      plot_type="bar")
                    shap_comb = shap_vals.transpose()

                    shap_mean = []
                    num_f = len(shap_comb)
                    for fi in range(len(shap_comb)):
                        vabs = abs(shap_comb[fi])
                        v_mean = stat.mean(vabs)
                        shap_mean.append(v_mean)

                    shapl = list(zip(features, shap_mean, range(1, num_f + 1)))
                    shapl.sort(key=lambda tup: tup[1], reverse=True)

                    shps = ""
                    for ft in shapl: