Example #1
def plot_shap_summary(model, X_train, model_name='XGB'):
    shap_values = shap.TreeExplainer(model).shap_values(X_train)
    shap_values_df = pd.DataFrame(shap_values, columns=X_train.columns)
    plot_shap_summary_bar(shap_values_df, model_name)
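plot_shap_summary_bar is not included in this snippet. A minimal sketch of such a helper, assuming it ranks features by mean absolute SHAP value and draws a horizontal bar chart (the name and signature follow the call above; the body is a guess):

import matplotlib.pyplot as plt


def plot_shap_summary_bar(shap_values_df, model_name):
    # Global importance: mean |SHAP value| per feature, largest at the top.
    importance = shap_values_df.abs().mean().sort_values()
    importance.plot(kind="barh")
    plt.xlabel("mean(|SHAP value|)")
    plt.title(f"{model_name} feature importance")
    plt.tight_layout()
    plt.show()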
Example #2
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": {"l2", "l1"},
    "num_leaves": 4,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": 0,
    "min_data_in_bin":1,"min_data":1,"min_hess":0
}

gbm = lgb.train(lgb_params, train_data, num_boost_round=20, verbose_eval=1)

shap.initjs()

explainer = shap.TreeExplainer(gbm)
shap_values = explainer.shap_values(x)

# visualize a single prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[1,:], x.iloc[1,:])

shap.force_plot(explainer.expected_value, shap_values, x)
shap.dependence_plot("dist_num", shap_values, x)


# IMPORTANCES OF ALL FEATURES
shap.summary_plot(shap_values, x, plot_type="bar")
shap.summary_plot(shap_values, x)

shap.summary_plot(explainer.shap_interaction_values(x), x)
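As the comment above notes, force_plot renders with Javascript by default; passing matplotlib=True produces a static figure instead. This only works for single-row force plots, so a sketch using the same explainer and shap_values as above:

# Static (matplotlib) force plot for a single prediction; multi-row force
# plots still require the Javascript renderer.
shap.force_plot(explainer.expected_value, shap_values[1, :], x.iloc[1, :],
                matplotlib=True)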
Example #3
    def generate_feature_importance_data(self, probs, importance):
        X_shap_values = shap.TreeExplainer(self.model.clf).shap_values(self.X)

        pred_class = self.model.le.inverse_transform([probs[0].argmax()])[0]

        features = []
        for i, (val, feature_index, is_positive) in enumerate(
                importance["importances"]["classes"][pred_class][0]):
            name = importance["feature_legend"][str(i + 1)]
            value = importance["importances"]["values"][0, int(feature_index)]

            shap.summary_plot(
                X_shap_values[:,
                              int(feature_index)].reshape(self.X.shape[0], 1),
                self.X[:, int(feature_index)].reshape(self.X.shape[0], 1),
                feature_names=[""],
                plot_type="layered_violin",
                show=False,
            )
            matplotlib.pyplot.xlabel("Impact on model output")
            img = io.BytesIO()
            matplotlib.pyplot.savefig(img, bbox_inches="tight")
            matplotlib.pyplot.clf()
            img.seek(0)
            base64_img = base64.b64encode(img.read()).decode("ascii")

            X = self.X[:, int(feature_index)]
            y = self.y[X != 0]
            X = X[X != 0]
            spearman = spearmanr(X, y)

            buggy_X = X[y == 1]
            clean_X = X[y == 0]
            median = np.median(X)
            median_clean = np.median(clean_X)
            median_buggy = np.median(buggy_X)

            perc_buggy_values_higher_than_median = (
                buggy_X >= median).sum() / buggy_X.shape[0]
            perc_buggy_values_lower_than_median = (
                buggy_X < median).sum() / buggy_X.shape[0]
            perc_clean_values_higher_than_median = (
                clean_X > median).sum() / clean_X.shape[0]
            perc_clean_values_lower_than_median = (
                clean_X <= median).sum() / clean_X.shape[0]

            logger.info("Feature: {}".format(name))
            logger.info("Shap value: {}{}".format(
                "+" if (is_positive) else "-", val))
            logger.info(f"spearman:  {spearman}")
            logger.info(f"value: {value}")
            logger.info(f"overall mean: {np.mean(X)}")
            logger.info(f"overall median: {np.median(X)}")
            logger.info(f"mean for y == 0: {np.mean(clean_X)}")
            logger.info(f"mean for y == 1: {np.mean(buggy_X)}")
            logger.info(f"median for y == 0: {np.median(clean_X)}")
            logger.info(f"median for y == 1: {np.median(buggy_X)}")
            logger.info(
                f"perc_buggy_values_higher_than_median: {perc_buggy_values_higher_than_median}"
            )
            logger.info(
                f"perc_buggy_values_lower_than_median: {perc_buggy_values_lower_than_median}"
            )
            logger.info(
                f"perc_clean_values_higher_than_median: {perc_clean_values_higher_than_median}"
            )
            logger.info(
                f"perc_clean_values_lower_than_median: {perc_clean_values_lower_than_median}"
            )

            features.append({
                "index": i + 1,
                "name": name,
                "shap": float(f'{"+" if is_positive else "-"}{val}'),
                "value": importance["importances"]["values"][0, int(feature_index)],
                "spearman": spearman,
                "median": median,
                "median_bug_introducing": median_buggy,
                "median_clean": median_clean,
                "perc_buggy_values_higher_than_median": perc_buggy_values_higher_than_median,
                "perc_buggy_values_lower_than_median": perc_buggy_values_lower_than_median,
                "perc_clean_values_higher_than_median": perc_clean_values_higher_than_median,
                "perc_clean_values_lower_than_median": perc_clean_values_lower_than_median,
                "plot": base64_img,
            })

        # Group together features that are very similar to each other, so we can simplify the explanation
        # to users.
        attributes = ["Total", "Maximum", "Minimum", "Average"]
        already_added = set()
        feature_groups = []
        for i1, f1 in enumerate(features):
            if i1 in already_added:
                continue

            feature_groups.append([f1])

            for j, f2 in enumerate(features[i1 + 1:]):
                i2 = j + i1 + 1

                f1_name = f1["name"]
                for attribute in attributes:
                    if f1_name.startswith(attribute):
                        f1_name = f1_name[len(attribute) + 1:]
                        break

                f2_name = f2["name"]
                for attribute in attributes:
                    if f2_name.startswith(attribute):
                        f2_name = f2_name[len(attribute) + 1:]
                        break

                if f1_name != f2_name:
                    continue

                already_added.add(i2)
                feature_groups[-1].append(f2)

        # Pick a representative example from each group.
        features = []
        for feature_group in feature_groups:
            shap_sum = sum(f["shap"] for f in feature_group)

            # Only select easily explainable features from the group.
            selected = [
                f for f in feature_group
                if (f["shap"] > 0
                    and abs(f["value"] - f["median_bug_introducing"])
                    < abs(f["value"] - f["median_clean"]))
                or (f["shap"] < 0
                    and abs(f["value"] - f["median_clean"])
                    < abs(f["value"] - f["median_bug_introducing"]))
            ]

            # If there are no easily explainable features in the group, select all features of the group.
            if len(selected) == 0:
                selected = feature_group

            def feature_sort_key(f):
                if f["shap"] > 0 and f["spearman"][0] > 0:
                    return f["perc_buggy_values_higher_than_median"]
                elif f["shap"] > 0 and f["spearman"][0] < 0:
                    return f["perc_buggy_values_lower_than_median"]
                elif f["shap"] < 0 and f["spearman"][0] > 0:
                    return f["perc_clean_values_lower_than_median"]
                elif f["shap"] < 0 and f["spearman"][0] < 0:
                    return f["perc_clean_values_higher_than_median"]
                # Fall back when shap or the correlation is exactly zero, so
                # max() below never compares against None.
                return 0

            feature = max(selected, key=feature_sort_key)
            feature["shap"] = shap_sum

            for attribute in attributes:
                if feature["name"].startswith(attribute):
                    feature["name"] = feature["name"][len(attribute) +
                                                      1:].capitalize()
                    break

            features.append(feature)

        with open("importances.json", "w") as f:
            json.dump(features, f)
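The per-feature summaries end up in importances.json; a short sketch of reading them back (the keys follow the dict built above):

import json

with open("importances.json") as f:
    features = json.load(f)

for feature in features:
    # "shap" holds the summed SHAP contribution for the feature group.
    print(feature["index"], feature["name"], feature["shap"])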
for fname in os.listdir(path_to_features):
    print(fname[:-4])
    start = time.time()
    with open(os.path.join(path_to_features, fname), "rb") as f:
        [x_train, x_test, y_train, y_test] = pickle.load(f)

    lgbm.fit(x_train, y_train)

    df = pd.DataFrame(columns=['feat_impo_lgbm', 'shan_tree', 'feat_impo_xgb'],
                      index=x_test.columns)
    try:
        for col in df.columns:
            if col == 'shan_tree':
                if fname not in ('opensmile_avec2013_long.pkl', 'opensmile_IS13_ComParE_long.pkl'):
                    try:
                        zz = shap.TreeExplainer(lgbm, x_test)
                        # shap_values = np.mean(abs(zz.shap_values(x_test)), axis=0)
                        shap_values = zz.shap_values(x_test)

                        temp_df = pd.DataFrame(columns=['col_idx', 'mean'],
                                               index=range(x_test.shape[1]))
                        temp_df['col_idx'] = x_test.columns
                        # df['mean'] = shap_values

                        means = []
                        # Iterate by position: shap_values columns line up with
                        # x_test columns, while df is indexed by column name.
                        for idx in range(x_test.shape[1]):
                            means.append(np.mean(abs(shap_values[:, idx])))

                        temp_df['mean'] = means

                        temp_df.sort_values(by='mean',
Example #5
X, y = shap.datasets.diabetes()

# Shape
X.shape, y.shape

# Distribution of target variable
pd.Series(y).plot(kind='hist')


# Train using XGBoost Regressor model
XGB_model = xgboost.XGBRegressor()
XGB_model.fit(X, y)


# Create Tree explainer
explainer = shap.TreeExplainer(XGB_model)

# Extract SHAP values to explain the model predictions
shap_values = explainer.shap_values(X)


# Plot Feature Importance
shap.summary_plot(shap_values, X, plot_type="bar")


# Plot Feature Importance - 'Dot' type
shap.summary_plot(shap_values, X, plot_type='dot')


# Visualize the explanation of first prediction
shap.force_plot(explainer.expected_value, shap_values[0, :], X.iloc[0, :])
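The bar summary plot above shows the mean absolute SHAP value per feature; a sketch computing the same ranking numerically, using the shap_values and X from this example (pandas assumed imported as pd, as above):

import numpy as np

# Mean |SHAP value| per feature == the bar lengths in plot_type="bar".
global_importance = pd.Series(np.abs(shap_values).mean(axis=0), index=X.columns)
print(global_importance.sort_values(ascending=False).head(10))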
Example #6
def shap_scatterplot(sklearn_model: BaseEstimator,
                     X_explain: pd.DataFrame,
                     feature_labels: dict,
                     feature: str = "bac_guess",
                     moderator: Sequence[str] = ("episode",),
                     output_folder: str = "/mnt/data/figures/shap") -> None:
    """Partial Dependence Plot for SHAP

    Args:
        sklearn_model (BaseEstimator): e.g., lightgbm
        X_explain (pd.DataFrame): feature set
        feature_labels (dict): maps feature_names onto feature_labels for plotting
        feature (str): The main feature to scatterplot
        moderator (Sequence[str]): features to color the dots by; one plot each
        output_folder (str): prefix for storing plots & parquets
    """
    # Exclude missing data, which distorts visualization
    mask = X_explain[feature] > -999
    X_explain = X_explain.loc[mask]

    # Compute SHAP values
    shap_values = shap.TreeExplainer(sklearn_model).shap_values(X_explain)

    if isinstance(shap_values, list):
        # Some output a list for each class
        shap_values = shap_values[1]

    columns = X_explain.columns.tolist()
    if feature not in columns:
        raise ValueError(f"{feature} is not a column in the given feature df.")

    formatter_params = {'xtick.labelsize': 8, 'ytick.labelsize': 8}
    plt.rcParams.update(formatter_params)

    for mod in moderator:
        shap.dependence_plot(feature,
                             shap_values,
                             X_explain,
                             interaction_index=mod,
                             dot_size=2,
                             show=False)  # keep the figure open for the styling below
        if (feature == "bac_guess") or (feature == "bac_cumulative_avg"):
            plt.axvspan(.06, .10, alpha=.10, color='grey')
            plt.axvspan(.04, .12, alpha=.10, color='grey')
        plt.gcf().set_size_inches(6, 3)

        flabel = feature_labels[feature]["label"]
        plt.xlabel(flabel, fontsize=10)
        plt.ylabel(f"SHAP Value for {flabel}", fontsize=10)

        # cbarlabel = feature_labels[mod]["label"]
        # cbar = plt.colorbar()
        # cbar.ax.tick_params(labelsize=7)
        #plt.colorbar().set_label(label=cbarlabel, fontsize=10)

        # # Hack to change fontsize on the legend/colorbar
        # cax = plt.gcf().axes[-1]
        # cax.tick_params(labelsize=8)
        # # Hack to change fontsize of the legend label
        # plt.gcf().figure.axes[-1].yaxis.label.set_size(10)# size of legend label

        plt.tight_layout()
        plt.savefig(f"{output_folder}/shap_scatterplot_{feature}_by_{mod}.pdf",
                    bbox_inches="tight")
        plt.close()
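A sketch of invoking the helper (the fitted model, feature frame, and label map are placeholders; note that moderator must be a sequence, since the function iterates over it):

shap_scatterplot(
    sklearn_model=fitted_lgbm,  # placeholder: any fitted tree-based model
    X_explain=X_test,           # placeholder feature DataFrame
    feature_labels={"bac_guess": {"label": "BAC guess"}},
    feature="bac_guess",
    moderator=("episode",),     # one dependence plot per moderator
)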
Example #7
    def train(self, importance_cutoff=0.15):
        classes, self.class_names = self.get_labels()
        self.class_names = sort_class_names(self.class_names)

        # Get items and labels, filtering out those for which we have no labels.
        X_iter, y_iter = split_tuple_iterator(self.items_gen(classes))

        # Extract features from the items.
        X = self.extraction_pipeline.fit_transform([item for item in X_iter])

        # Calculate labels.
        y = np.array(y_iter)

        print(f"X: {X.shape}, y: {y.shape}")

        is_multilabel = isinstance(y[0], np.ndarray)

        # Split dataset in training and test.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=0
        )
        if self.sampler is not None:
            pipeline = make_pipeline(self.sampler, self.clf)
        else:
            pipeline = self.clf

        tracking_metrics = {}

        # Use k-fold cross validation to evaluate results.
        if self.cross_validation_enabled:
            scorings = ["accuracy"]
            if len(self.class_names) == 2:
                scorings += ["precision", "recall"]

            scores = cross_validate(pipeline, X_train, y_train, scoring=scorings, cv=5)

            print("Cross Validation scores:")
            for scoring in scorings:
                score = scores[f"test_{scoring}"]
                tracking_metrics[f"test_{scoring}"] = {
                    "mean": score.mean(),
                    "std": score.std() * 2,
                }
                print(
                    f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})"
                )

        # Training on the resampled dataset if sampler is provided.
        if self.sampler is not None:
            X_train, y_train = self.sampler.fit_resample(X_train, y_train)

        print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
        print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

        self.clf.fit(X_train, y_train)

        feature_names = self.get_human_readable_feature_names()
        if self.calculate_importance and len(feature_names):
            explainer = shap.TreeExplainer(self.clf)
            shap_values = explainer.shap_values(X_train)

            shap.summary_plot(
                shap_values,
                X_train.toarray(),
                feature_names=feature_names,
                class_names=self.class_names,
                plot_type="layered_violin"
                if not isinstance(shap_values, list)
                else None,
                show=False,
            )

            matplotlib.pyplot.savefig("feature_importance.png", bbox_inches="tight")

            important_features = self.get_important_features(
                importance_cutoff, shap_values
            )

            self.print_feature_importances(important_features, feature_names)

        print("Test Set scores:")
        # Evaluate results on the test set.
        y_pred = self.clf.predict(X_test)

        if is_multilabel:
            assert isinstance(
                y_pred[0], np.ndarray
            ), "The predictions should be multilabel"

        print(f"No confidence threshold - {len(y_test)} classified")
        if is_multilabel:
            confusion_matrix = metrics.multilabel_confusion_matrix(y_test, y_pred)
        else:
            confusion_matrix = metrics.confusion_matrix(
                y_test, y_pred, labels=self.class_names
            )

            print(
                classification_report_imbalanced(
                    y_test, y_pred, labels=self.class_names
                )
            )
            report = classification_report_imbalanced_values(
                y_test, y_pred, labels=self.class_names
            )

            tracking_metrics["report"] = report

        print_labeled_confusion_matrix(
            confusion_matrix, self.class_names, is_multilabel=is_multilabel
        )

        tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

        # Evaluate results on the test set for some confidence thresholds.
        for confidence_threshold in [0.6, 0.7, 0.8, 0.9]:
            y_pred_probas = self.clf.predict_proba(X_test)

            y_test_filter = []
            y_pred_filter = []
            for i in range(0, len(y_test)):
                argmax = np.argmax(y_pred_probas[i])
                if y_pred_probas[i][argmax] < confidence_threshold:
                    continue

                y_test_filter.append(y_test[i])
                if is_multilabel:
                    y_pred_filter.append(y_pred[i])
                else:
                    y_pred_filter.append(argmax)

            if not is_multilabel:
                y_pred_filter = self.le.inverse_transform(y_pred_filter)

            print(
                f"\nConfidence threshold > {confidence_threshold} - {len(y_test_filter)} classified"
            )
            if len(y_test_filter) != 0:
                if is_multilabel:
                    confusion_matrix = metrics.multilabel_confusion_matrix(
                        np.asarray(y_test_filter), np.asarray(y_pred_filter)
                    )
                else:
                    confusion_matrix = metrics.confusion_matrix(
                        np.asarray(y_test_filter),
                        np.asarray(y_pred_filter),
                        labels=self.class_names,
                    )
                    print(
                        classification_report_imbalanced(
                            y_test_filter, y_pred_filter, labels=self.class_names
                        )
                    )
                print_labeled_confusion_matrix(
                    confusion_matrix, self.class_names, is_multilabel=is_multilabel
                )

        joblib.dump(self, self.__class__.__name__.lower())

        return tracking_metrics
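train() serializes the whole model object with joblib under the lowercased class name; a sketch of loading it back (the file name here assumes a subclass called RegressionModel):

import joblib

# joblib.dump(self, self.__class__.__name__.lower()) wrote e.g. "regressionmodel".
model = joblib.load("regressionmodel")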
Example #8
    'reg_lambda': 10,
    'reg_alpha': 0.1,
    'learning_rate': 0.01,
    'gamma': 0.1
}

xgb = XGBClassifier(**basicparameter)
# xgb=XGBClassifier() # apply the default parameters
xgb.fit(Xtrain, Ytrain)

#  score the model
print('============================= XGBoost =============================')
score(xgb, Xtrain, Ytrain, Xtest, Ytest)

print('============================== SHAP ===============================')
explainer = shap.TreeExplainer(xgb)  # define the explainer
shap_values = explainer.shap_values(X)  # use all data for analysis


def gen_data(inputs, X):
    """ creates a data Frame with inputs and X for statistics with shap """
    df1 = pd.DataFrame()
    for i, name in enumerate(inputs):
        df1[name] = X[:, i]
    return df1


df1 = gen_data(inputs, X)

shap.summary_plot(shap_values, df1)
# shap.summary_plot(shap_values, df1, plot_type="bar")
#Data Splitting
X_train, X_test, y_train, y_test = train_test_split(dfX,
                                                    dfY,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=123)

rf = RandomForestRegressor(n_estimators=400,
                           min_samples_split=2,
                           min_samples_leaf=1,
                           max_features="auto",
                           max_depth=65)
rf.fit(X_train, y_train)

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_train)

f = plt.figure()
shap.summary_plot(shap_values, X_train, plot_type="bar")
f.savefig("varImp.pdf", bbox_inches='tight', dpi=600)

plt.close('all')
f = plt.figure()
shap.summary_plot(shap_values, X_train)
f.savefig("varImp2.pdf", bbox_inches='tight', dpi=600)

plt.close('all')
shap.dependence_plot("median_step_width", shap_values, X_train)
plt.savefig("sw-dep.pdf", bbox_inches='tight', dpi=600)
# # Best model from grid search (best.best_estimator_)
best_model = DecisionTreeClassifier(min_samples_leaf=0.005,
                                    min_samples_split=0.015).fit(X_train, y_train)
plot_roc_curve(best_model, X_test, y_test)

# --- Ensemble classifiers --------------------------------------------------- #
rf = RandomForestClassifier().fit(X_train, y_train)
ada = AdaBoostClassifier().fit(X_train, y_train)
gbc = GradientBoostingClassifier().fit(X_train, y_train)

# Plot AUC for ensemble classifiers
fig, ax = plt.subplots()
plot_roc_curve(rf, X_test, y_test, ax=ax)
plot_roc_curve(ada, X_test, y_test, ax=ax)
plot_roc_curve(gbc, X_test, y_test, ax=ax)
plt.savefig("ensemble.png", dpi=500)

# Create SHAP explainers for best model
model = RandomForestClassifier().fit(X_train, y_train)
shap_values = shap.TreeExplainer(model).shap_values(X_train)

shap.summary_plot(shap_values[0],
                  X_train,
                  feature_names=X.columns,
                  plot_type="violin")
shap.summary_plot(shap_values[0],
                  X_train,
                  feature_names=X.columns,
                  plot_type="bar")
Example #11
def reg_top10_lightGBM(merge_data, outname, memo):
    from sklearn.model_selection import StratifiedShuffleSplit

    # Separate the target variable
    X = merge_data.drop("target", axis=1).values
    y = merge_data["target"].values
    columns_name = merge_data.drop("target", axis=1).columns

    # Define the function that performs the stratified split
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)

    def data_split(X, y):
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

        X_train = pd.DataFrame(X_train, columns=columns_name)
        X_test = pd.DataFrame(X_test, columns=columns_name)

        return X_train, y_train, X_test, y_test

    # Split into train, test, and validation sets
    X_train, y_train, X_test, y_test = data_split(X, y)
    X_train, y_train, X_val, y_val = data_split(X_train.values, y_train)

    # Check shapes
    print("train shape", X_train.shape)
    print("test shape", X_test.shape)
    print("validation shape", X_val.shape)
    # Check shapes
    print("y_train shape", y_train.shape)
    print("y_test shape", y_test.shape)
    print("y_validation shape", y_val.shape)
    y_test_df = pd.DataFrame(y_test)
    print("y_test describe", y_test_df.describe())
    print("not_ y_test describe", (~y_test_df.duplicated()).sum())
    #y_test_df.value_counts().plot(kind="bar")
    print("y_test_df.duplicated().sum()", y_test_df.duplicated().sum())
    #print(y_test_df[y_test_df.duplicated()])
    # Check the class distribution
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 3, 1)
    plt.hist(y_train)

    plt.subplot(1, 3, 2)
    plt.hist(y_test)

    plt.subplot(1, 3, 3)
    plt.hist(y_val)
    #shap
    import shap
    shap.initjs()

    import lightgbm as lgb

    # Create the LightGBM datasets
    train = lgb.Dataset(X_train, label=y_train)
    valid = lgb.Dataset(X_val, label=y_val)

    # Set the model parameters
    params = {
        'task': 'train',  # training (as opposed to predict)
        'boosting_type': 'gbdt',  # gradient boosting
        'objective': 'regression',  # objective function: regression
        'metric': 'rmse',  # metric used to evaluate the regression model
        'learning_rate': 0.1  # learning rate (default 0.1)
    }
    # Train the model
    model = lgb.train(params,
                      train,
                      valid_sets=valid,
                      num_boost_round=3000,
                      early_stopping_rounds=100)

    # Predict
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    #shap
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    shap.summary_plot(shap_values, X_test, plot_type="bar")
    # Plot Feature Importance - 'Dot' type
    shap.summary_plot(shap_values, X_test, plot_type='dot')
    #shap.summary_plot(shap_values, X_test, plot_type='dot')

    from sklearn.metrics import mean_squared_error  # for model evaluation (mean squared error)
    from sklearn.metrics import r2_score  # for model evaluation (coefficient of determination)
    # Show true vs. predicted values
    df_pred = pd.DataFrame({
        'regression_y_test': y_test,
        'regression_y_pred': y_pred
    })
    display(df_pred)

    # Scatter plot (true vs. predicted values)
    plt.plot(y_test, y_pred, color='red',
             label='x=y')  # the line y = x (points fall on it when prediction matches truth)
    plt.scatter(y_test, y_pred)  # scatter the predictions
    plt.xlabel('y_test')  # x-axis label
    plt.ylabel('y_pred')  # y-axis label
    plt.title('y vs y_pred')  # plot title

    # Model evaluation
    # RMSE: root mean squared error
    mse = mean_squared_error(y_test, y_pred)  # compute MSE (mean squared error)
    rmse = np.sqrt(mse)  # RMSE = sqrt(MSE)
    print('RMSE :', rmse)
    # R2: coefficient of determination
    r2 = r2_score(y_test, y_pred)
    print('R2 :', r2)
    df_Df = pd.DataFrame({
        'regression_y_test_' + memo: y_test,
        'regression_y_pred_' + memo: y_pred,
        'RMSE_' + memo: rmse,
        'R2_' + memo: r2
    })
    df_Df.to_csv(r"" + "./20210201_output/" + 'DPC_g/20210415/' + outname +
                 memo + '.csv',
                 encoding='shift-jis')

    importance = pd.DataFrame(model.feature_importance(),
                              columns=['importance'])
    display(importance)
    C_you = merge_data.drop(["target"], axis=1)
    importance["columns"] = list(C_you.columns)
    #importance.to_csv(r""+"./20210201_output/"+'DPC_g/20210415/'+outnameimp+memo+'.csv', encoding = 'shift-jis')

    return importance, shap_values
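A sketch of calling the function (merge_data is assumed to be a DataFrame with a numeric "target" column; outname and memo are free-form strings used in the output file name):

importance, shap_values = reg_top10_lightGBM(merge_data, outname="model_a", memo="run1")
print(importance.sort_values("importance", ascending=False).head(10))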
        # RandomOverSampler: over-sample the minority class(es) by picking samples at random with replacement.
        #clf = GridSearchCV(estimator=createPipeline(model, "RandomOverSampler", feature_selector=feature_selector), param_grid=model_hyperparams, cv=inner_cv, scoring="roc_auc", refit=True)
        clf = RandomizedSearchCV(estimator=createPipeline(
            model, "RandomOverSampler", feature_selector=feature_selector),
                                 param_distributions=model_hyperparams,
                                 cv=inner_cv,
                                 scoring="roc_auc",
                                 refit=True,
                                 random_state=0,
                                 n_iter=3)
    clf.fit(train_x, train_y.values.ravel())

    # plot: interpret our model (the following SHAP related code only works for tree-based models)
    feature_names = train_x.columns[
        clf.best_estimator_.steps[1][1].get_support(indices=True)]
    explainer = shap.TreeExplainer(clf.best_estimator_.steps[2][1])
    test_current_fold = test_x[feature_names]
    shap_values = explainer.shap_values(test_current_fold)

    shap_current_fold = pd.DataFrame(data=shap_values[1],
                                     columns=feature_names)
    shap_all_folds = pd.concat([shap_all_folds, shap_current_fold],
                               axis=0,
                               sort=False)
    test_all_folds = pd.concat([test_all_folds, test_current_fold],
                               axis=0,
                               sort=False)

    # Collect results and parameters
    best_params = best_params + [clf.best_params_] * test_x.shape[0]
    cur_fold_pred = clf.predict(test_x).tolist()
Example #13

if st.sidebar.button('Validate/See results'):

    # Preprocessing
    scaling(df)
    onehot(df)

    # Reordering columns into the same order our model expects
    order(df)

    # Prediction
    prediction = XGB.predict(df)
    prob = XGB.predict_proba(df)
    # Create Tree Explainer object that can calculate shap values
    explainer = shap.TreeExplainer(XGB)

    # Calculate SHAP values
    chosen_instance = df.loc[[0]]
    shap_values = explainer.shap_values(chosen_instance)

    # Printing results
    if prediction == 0:
        st.write('ACCEPTED', prob)
        st.write(
            "Under 0 you have the probability you won't be in default payment")
        st.write(
            "Under 1 you have the probability you will be in default payment")
        st.write('Comprehension of acceptance:')
        # visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
        st_shap(
Example #14
def shap_call(xgb, sample=None, feats='all', nb_features_in_exp=None):
    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
            resource.getrusage(resource.RUSAGE_SELF).ru_utime

    f2imap = {}
    for i, f in enumerate(xgb.feature_names):
        f2imap[f.strip()] = i

    if (sample is not None):
        if (nb_features_in_exp is None):
            nb_features_in_exp = len(sample)

        try:
            feat_sample = np.asarray(sample, dtype=np.float32)
        except ValueError:
            print("Cannot parse input sample:", sample)
            exit()
        print("\n\n Starting SHAP explainer... \n Considering a sample with features:", feat_sample)
        if len(feat_sample) != len(xgb.X_train[0]):
            print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(xgb.X_train[0])))
            exit()

        # compute boost predictions
        feat_sample_exp = np.expand_dims(feat_sample, axis=0)
        feat_sample_exp = xgb.transform(feat_sample_exp)
        y_pred = xgb.model.predict(feat_sample_exp)[0]
        y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0]

        # No need to pass the dataset as it is recorded in the model
        # https://shap.readthedocs.io/en/latest/

        explainer = shap.TreeExplainer(xgb.model)
        shap_values = explainer.shap_values(feat_sample_exp)

        shap_values_sample = shap_values[-1]
        transformed_sample = feat_sample_exp[-1]

        # we need to sum values per feature
        # https://github.com/slundberg/shap/issues/397
        sum_values = []
        if (xgb.use_categorical):
            p = 0
            for f in xgb.categorical_features:
                nb_values = len(xgb.categorical_names[f])
                sum_v = 0
                for i in range(nb_values):
                    sum_v = sum_v + shap_values_sample[p+i]
                p = p + nb_values
                sum_values.append(sum_v)
        else:
            sum_values = shap_values_sample
        expl = []

        # choose which features in the explanation to focus on
        if feats in ('p', 'pos', '+'):
            feats = 1
        elif feats in ('n', 'neg', '-'):
            feats = -1
        else:
            feats = 0

        print("\t \t Explanations for the winner class", y_pred, " (xgboost confidence = ", y_pred_prob[int(y_pred)], ")")
        print("base_value = {}, predicted_value = {}".format(explainer.expected_value, np.sum(sum_values) + explainer.expected_value))

        abs_sum_values = np.abs(sum_values)
        sorted_by_abs_sum_values = np.argsort(-abs_sum_values)

        for k in sorted_by_abs_sum_values:
            v = sum_values[k]

            if (feats == 1 and v < 0) or (feats == -1 and v >= 0):
                continue

            expl.append(f2imap[xgb.feature_names[k]])
            print("id = {}, name = {}, score = {}".format(f2imap[xgb.feature_names[k]], xgb.feature_names[k], v))

            if len(expl) == nb_features_in_exp:
                break

        timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
                resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
        print('  time: {0:.2f}'.format(timer))

        return sorted(expl[:nb_features_in_exp])
Example #15
    xg_model = clf.named_steps['xg_classifier']
    xg_model = xg_model.get_booster()
    standard_scaler = clf.named_steps['scalar6']

    # Workaround for a SHAP/XGBoost incompatibility: strip the 4-byte header
    # from the raw booster dump and monkey-patch save_raw so TreeExplainer
    # can parse the model.
    model_bytearray = xg_model.save_raw()[4:]

    def myfun(self=None):
        return model_bytearray

    xg_model.save_raw = myfun

    data_shap = standard_scaler.transform(data_shap)
    bears = standard_scaler.transform(bears)

    if feature_importance == 1:
        shap_explainer = shap.TreeExplainer(xg_model, data_shap)
        shap_values = shap_explainer.shap_values(bears)

        bears_ids = bears_ids.reset_index(drop=True)
        result = pd.concat(
            [bears_ids,
             pd.DataFrame(shap_values, columns=data.columns)],
            axis=1,
            sort=False)
        result.to_csv('bears_shap.csv', index=False)

####Cross-Val Classifiers####
if cross_val == 1:

    classifier_cross_val = pd.DataFrame(data=None,
                                        columns=[
Example #16

#Compare normalized gini and Mean Squared Error
#Use mean as a benchmark
y_test['ClaimAmountAvg'] = y_test.mean().values[0]

print(normalized_gini(y_test['ClaimAmount'], y_pred))  #0.679
print(sqrt(mean_squared_error(y_test['ClaimAmount'], y_pred)))  #706

print(normalized_gini(y_test['ClaimAmount'], y_test['ClaimAmountAvg']))  #0.075
print(sqrt(mean_squared_error(y_test['ClaimAmount'],
                              y_test['ClaimAmountAvg'])))  #711
#Mean Squared Error not much improved, but normalized gini much improved

#Use shap to plot LGBM Model
explainer = shap.TreeExplainer(regressor)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar")
#Past values most important for prediction

############
##Real Run
############
#Create Train/Test Splits
#Test on current year
date = pd.Timestamp(2019, 7, 1)
X_train = X.loc[X['MinDate'] < date]
y_train = y.loc[y['MinDate'] < date]

X_test = X.loc[X['MinDate'] == date]
y_test = y.loc[y['MinDate'] == date]
Example #17
    def get_shap_values(self):
        self.explainer = shap.TreeExplainer(self.model)
        self.shap_values = self.explainer.shap_values(self.X_valid)
        shap.summary_plot(self.shap_values, self.X_valid)
        return self.explainer, self.shap_values
#cf4 = confusion_matrix(y_true, y_predrf3)
#print('tn, fp, fn, tp', cf4)
#cf_normalize3 = (cf4-np.min(cf4))/np.ptp(cf4)
#tnrf3, fprf3, fnrf3, tprf3 = cf4.ravel()
#print(cf4.ravel())
#print("Accuracy: {}".format((tprf3+tnrf3)/len(y_test)))
#print("Recall: {}".format(tprf3/(tprf3+fnrf3)))
#print("Precision: {}".format(tprf3/(tprf3+fprf3)))
#%%
# ---------SHAP---------
to_train2 = np.delete(to_train, [xpos, ypos], axis=0)
# DF, based on which importance is checked
X_importance = X_test
#%%
# Explain model predictions using shap library:
explainer = shap.TreeExplainer(xgb1)
shap_values = explainer.shap_values(X_importance)
#%%
plt.figure()
shap.summary_plot(shap_values,
                  X_importance,
                  max_display=20,
                  feature_names=to_train,
                  show=False)
plt.tight_layout()
plt.show()
#plt.savefig('summaryplot.png', dpi=200)
#%%
shap.dependence_plot('ewdeep10_kurt',
                     shap_values,
                     X_importance,
Example #19
]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=42)

# min_samples_split, min_samples_leaf, and cv are scikit-learn parameters,
# not XGBClassifier ones, so they are left out.
model = XGBClassifier(eta=0.05,
                      n_estimators=121,
                      max_depth=7)

model.fit(X_train, y_train)
explainer = shap.TreeExplainer(model)

shap_values = explainer.shap_values(X_train)
#print ("a")
#print(shap_values)
#shap_l=shap_values.tolist()
#print(shap_values)
i = 0
#while i<3:
#print(explainer.expected_value[i])
#print(shap_values[i])
#shap.dependence_plot(0, shap_values[0], X,X_name)

#  shap.force_plot(explainer.expected_value[0],shap_values[0],X_train)
#    i=i+1
shap.summary_plot(shap_values, X, X_name)
Example #20
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
my_model = RandomForestClassifier(random_state=0).fit(train_X, train_y)



# We will look at SHAP values for a single row of the dataset
# (we arbitrarily chose row 5). 
row_to_show = 5
data_for_prediction = val_X.iloc[row_to_show]  # use 1 row of data here. Could use multiple rows if desired
data_for_prediction_array = data_for_prediction.values.reshape(1, -1)  # 2-D array with a single row
#For context, we'll look at the raw predictions before looking at the SHAP values
my_model.predict_proba(data_for_prediction_array)

# Create object that can calculate shap values
explainer = shap.TreeExplainer(my_model)
### Calculate Shap values
shap_values = explainer.shap_values(data_for_prediction)

# It's cumbersome to review raw arrays, but the shap package has a nice way to visualize the results.
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)

#Here is an example using KernelExplainer to get similar results.
#The results aren't identical because KernelExplainer gives an approximate result.
# use Kernel SHAP to explain test set predictions
k_explainer = shap.KernelExplainer(my_model.predict_proba, train_X)
### Calculate Shap values
k_shap_values = k_explainer.shap_values(data_for_prediction)
shap.force_plot(k_explainer.expected_value[1], k_shap_values[1], data_for_prediction)
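The comments above note that KernelExplainer is approximate while TreeExplainer is exact for tree models; a quick numeric check of how far apart the two attributions are for this row, using the objects computed above:

import numpy as np

# Maximum absolute disagreement between exact and sampled SHAP values
# for the positive class of the single explained row.
print(np.max(np.abs(np.asarray(shap_values[1]) - np.asarray(k_shap_values[1]))))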
Example #21
    def classify(
        self, items, probabilities=False, importances=False, importance_cutoff=0.15
    ):
        assert items is not None
        assert (
            self.extraction_pipeline is not None and self.clf is not None
        ), "The module needs to be initialized first"

        if not isinstance(items, list):
            items = [items]

        assert isinstance(items[0], dict) or isinstance(items[0], tuple)

        X = self.extraction_pipeline.transform(items)
        if probabilities:
            classes = self.clf.predict_proba(X)
        else:
            classes = self.clf.predict(X)

        classes = self.overwrite_classes(items, classes, probabilities)

        if importances:
            explainer = shap.TreeExplainer(self.clf)
            shap_values = explainer.shap_values(X)

            important_features = self.get_important_features(
                importance_cutoff, shap_values
            )

            # Workaround: handle multi class case for force_plot to work correctly
            if len(classes[0]) > 2:
                pred_class_index = classes.argmax(axis=-1)[0]
                explainer.expected_value = explainer.expected_value[pred_class_index]
                shap_values = shap_values[pred_class_index]
            else:
                pred_class_index = 0

            pred_class = self.class_names[pred_class_index]
            top_indexes = [
                int(index)
                for importance, index, is_positive in important_features["classes"][
                    pred_class
                ][0]
            ]

            feature_names = self.get_human_readable_feature_names()

            feature_legend = {
                str(i + 1): feature_names[feature_i]
                for i, feature_i in enumerate(top_indexes)
            }

            with io.StringIO() as out:
                p = shap.force_plot(
                    explainer.expected_value,
                    shap_values[:, top_indexes],
                    X.toarray()[:, top_indexes],
                    feature_names=[str(i + 1) for i in range(len(top_indexes))],
                    matplotlib=False,
                    show=False,
                )

                # TODO: use full_html=False
                shap.save_html(out, p)

                html = out.getvalue()

            return (
                classes,
                {
                    "importances": important_features,
                    "html": html,
                    "feature_legend": feature_legend,
                },
            )

        return classes
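A sketch of consuming the return value when importances are requested (model and item are placeholders; the keys follow the dict returned above):

classes, explanation = model.classify(item, probabilities=True, importances=True)

# Persist the interactive force plot produced by shap.save_html above.
with open("force_plot.html", "w") as f:
    f.write(explanation["html"])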
Example #22
def upload():
    print('form data:', request.form)
    dropdown_selection = str(request.form)
    dropdown_selection = dropdown_selection.split()

    print(dropdown_selection)
    model_type = dropdown_selection[3]
    dropdown_selection = dropdown_selection[1]

    print('model type:', model_type)

    print('selection:', dropdown_selection)

    global id_name

    target = 'images/'
    print('target dir:', target)

    if not os.path.isdir(target):
        os.mkdir(target)
    global ff
    ff = []
    for file in request.files.getlist("file"):
        print(file)
        filename = file.filename
        destination = "/".join([target, filename])
        print('des', destination)
        file.save(destination)
        ff.append(destination)

    mypath = os.getcwd()
    onlyfiles = [
        os.path.join(mypath, f) for f in os.listdir(mypath)
        if os.path.isfile(os.path.join(mypath, f))
    ]

    print('uploaded files:', ff)
    import warnings
    warnings.filterwarnings("ignore")

    with open(ff[0], 'rb') as file:
        model = pickle.load(file)

    with open(ff[1], 'rb') as file:
        X_data = pickle.load(file)

    with open(ff[2], 'rb') as file:
        y_data = pickle.load(file)

    if 'GL' in dropdown_selection:

        if 'RR' in model_type:

            PI = permutation_importance(model, X_data, y_data)

            row_to_show = 5

            data_for_prediction = X_data.iloc[row_to_show]

            explainer = shap.Explainer(model,
                                       X_data,
                                       feature_names=X_data.columns)
            shap_values = explainer.shap_values(X_data)

            shap.summary_plot(shap_values, X_data)

            import matplotlib.pyplot as pl
            pl.savefig('static/img/new_plot.png')
            pl.close()

            ICE = ind_cond_exp(model, X_data, y_data)

            #global surgat
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.tree import plot_tree

            predictions = model.predict(X_data)
            dt = DecisionTreeRegressor(random_state=100, max_depth=3)
            # We fit the shallow tree to the matrix X and the predictions of the random forest model
            dt.fit(X_data, predictions)

            fig, ax = plt.subplots(figsize=(20, 10))

            plot_tree(dt,
                      feature_names=list(X_data.columns),
                      precision=3,
                      filled=True,
                      fontsize=12,
                      impurity=True)
            pl.savefig('static/img/new2_plot.png')
            pl.close()

            return render_template('model_explanation_result.html',
                                   PI=PI,
                                   ICE=ICE,
                                   SH="static/img/new_plot.png",
                                   SM="static/img/new2_plot.png")

        if 'RF' in model_type:
            PI = permutation_importance(model, X_data, y_data)

            explainer = shap.TreeExplainer(model,
                                           X_data,
                                           feature_names=X_data.columns)
            shap_values = explainer.shap_values(X_data)

            shap.summary_plot(shap_values, X_data)

            import matplotlib.pyplot as pl
            pl.savefig('static/img/new_plot.png')
            pl.close()

            ICE = ind_cond_exp(model, X_data, y_data)

            #global surgat
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.tree import plot_tree

            predictions = model.predict(X_data)
            dt = DecisionTreeRegressor(random_state=100, max_depth=3)
            # We fit the shallow tree to the matrix X and the predictions of the random forest model
            dt.fit(X_data, predictions)

            fig, ax = plt.subplots(figsize=(20, 10))

            plot_tree(dt,
                      feature_names=list(X_data.columns),
                      precision=3,
                      filled=True,
                      fontsize=12,
                      impurity=True)
            pl.savefig('static/img/new2_plot.png')
            pl.close()

            return render_template('model_explanation_result.html',
                                   PI=PI,
                                   ICE=ICE,
                                   SH="static/img/new_plot.png",
                                   SM="static/img/new2_plot.png")

        if 'CC' in model_type:
            PI = permutation_importance(model, X_data, y_data)

            explainer = shap.KernelExplainer(model.predict_proba, X_data)
            shap_values = explainer.shap_values(X_data)

            shap.summary_plot(shap_values, X_data)

            import matplotlib.pyplot as pl
            pl.savefig('static/img/new_plot.png')
            pl.close()

            #ICE = ind_cond_exp(model,X_data,y_data)

            #global surgat
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.tree import plot_tree

            predictions = model.predict(X_data)

            return render_template(
                'model_explanation_result_classification.html',
                PI=PI,
                SH="static/img/new_plot.png")

    if 'WI' in dropdown_selection:

        # print(res," resss")

        #
        import dash
        from dash.dependencies import Input, Output
        import dash_table
        import dash_core_components as dcc
        import dash_html_components as html

        app = dash.Dash(__name__)
        import pandas as pd
        #should be X data

        mean_list = []
        features = X_data.columns.tolist()
        for i in features:
            mean_list.append(round(X_data[i].mean()))

        explainer = shap.TreeExplainer(model)
        shap.initjs()

        params = features

        id_name_str = "my_graph" + str(id_name)
        print('---------------', id_name_str)
        id_name = id_name + 1

        what_plot.layout = html.Div([
            dash_table.DataTable(
                id='table-editing-simple',
                columns=([{
                    'id': 'Model',
                    'name': 'Model'
                }] + [{
                    'id': p,
                    'name': p
                } for p in params]),
                data=[
                    dict(zip(features, mean_list))
                    #dict(Model=i, **{param: mean_list[i] for param in params})
                    # for i in range(0, len(mean_list))
                ],
                editable=True),
            html.Div(id=id_name_str)
        ])

        @what_plot.callback(Output(id_name_str, "children"),
                            Input('table-editing-simple', 'data'),
                            Input('table-editing-simple', 'columns'))
        def update_graphs(rows, columns):
            df = pd.DataFrame(rows, columns=[c['name'] for c in columns])
            print(rows)

            #
            rows = rows[0]
            col = []
            vvalue = []
            for key in rows:
                print(key, '->', int(rows[key]))
                col.append(key)
                vvalue.append([int(rows[key])])

            ik = dict(zip(col, vvalue))
            instance = pd.DataFrame.from_dict(ik)

            print('instance:', instance)

            from shap.plots._force_matplotlib import draw_additive_plot

            # explain the model's predictions using SHAP values(same syntax works for LightGBM, CatBoost, and scikit-learn models)
            #explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(instance)
            shap.initjs()

            #plt.style.use("_classic_test_patch")

            ytu = model.predict(instance)
            print('prediction:', ytu)

            koko = _force_plot_html2(explainer.expected_value, shap_values,
                                     instance)

            #print('kkkk ',koko)

            print('Done')

            return koko
    #

        return render_template('local_explain_lime.html', LL=what_plot.index())

    if 'LL' in dropdown_selection:
        # table and plots ========================================================
        import dash
        from dash.dependencies import Input, Output
        import dash_table
        import dash_core_components as dcc
        import dash_html_components as html
        import pandas as pd

        id_name_str = "my_graph" + str(id_name)
        print('---------------', id_name_str)
        id_name = id_name + 1

        print('in LL')
        # make graph===============================================================
        table_plot.layout = html.Div([
            dash_table.DataTable(
                id='datatable-interactivity',
                columns=[{
                    "name": i,
                    "id": i,
                    "deletable": True,
                    "selectable": True
                } for i in X_data.columns],
                data=X_data.to_dict('records'),
                editable=True,
                filter_action="native",
                sort_action="native",
                sort_mode="multi",
                column_selectable="single",
                row_selectable="single",
                row_deletable=True,
                selected_columns=[],
                selected_rows=[],
                page_action="native",
                page_current=0,
                page_size=10,
            ),
            html.Div(id=id_name_str)
        ])

        print('miod LL')

        @table_plot.callback(Output(id_name_str, "children"),
                             Input('datatable-interactivity',
                                   "derived_virtual_data"),
                             Input('datatable-interactivity',
                                   "derived_virtual_selected_rows"))
        def update_graphs(rows, derived_virtual_selected_rows):
            # When the table is first rendered, `derived_virtual_data` and
            # `derived_virtual_selected_rows` will be `None`. This is due to an
            # idiosyncrasy in Dash (unsupplied properties are always None and Dash
            # calls the dependent callbacks when the component is first rendered).
            # So, if `rows` is `None`, then the component was just rendered
            # and its value will be the same as the component's dataframe.
            # Instead of setting `None` in here, you could also set
            # `derived_virtual_data=df.to_rows('dict')` when you initialize
            # the component.
            if derived_virtual_selected_rows is None:
                derived_virtual_selected_rows = []

            dff = X_data if rows is None else pd.DataFrame(rows)

            colors = [
                '#7FDBFF' if i in derived_virtual_selected_rows else '#0074D9'
                for i in range(len(dff))
            ]

            print('my value', derived_virtual_selected_rows)
            print('i am row ', X_data.iloc[derived_virtual_selected_rows])
            print(type(derived_virtual_selected_rows))

            from shap.plots._force_matplotlib import draw_additive_plot

            ttt = X_data.loc[derived_virtual_selected_rows]
            # explain the model's predictions using SHAP values(same syntax works for LightGBM, CatBoost, and scikit-learn models)
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(ttt)
            shap.initjs()

            plt.style.use("_classic_test_patch")

            bubu = _force_plot_html(explainer.expected_value, shap_values, ttt)

            shap_values = explainer.shap_values(X_data)
            #shap.force_plot(explainer.expected_value, shap_values, X_data)
            explain_all = _force_plot_html(explainer.expected_value,
                                           shap_values, X_data)

            print('bubu ', bubu)

            return bubu, explain_all

        return render_template('local_explain_lime.html',
                               LL=table_plot.index())

    if 'BD' in dropdown_selection:
        pass

    #FI
    if 'DB' in dropdown_selection:

        #  if 'CC' in model_type:
        #   from explainerdashboard import ClassifierExplainer, ExplainerDashboard
        #  ExplainerDashboard(ClassifierExplainer(model, X_data, y_data)).run()

        if 'RF' in model_type:
            import threading
            import time

            def dashboard_exp(model, X_data, y_data):
                import dash_bootstrap_components as dbc

                from explainerdashboard import RegressionExplainer, ExplainerDashboard
                ExplainerDashboard(
                    RegressionExplainer(model, X_data, y_data),
                    bootstrap=dbc.themes.SANDSTONE,
                    importances=True,
                    model_summary=False,
                    contributions=True,
                    whatif=True,
                    shap_dependence=False,
                    shap_interaction=False,
                    decision_trees=False,
                    hide_whatifindexselector=True,
                    hide_whatifprediction=True,
                    hide_inputeditor=False,
                    hide_whatifcontributiongraph=False,
                    hide_whatifcontributiontable=True,
                    hide_whatifpdp=False,
                    hide_predindexselector=True,
                    hide_predictionsummary=True,
                    hide_contributiongraph=False,
                    hide_pdp=False,
                    hide_contributiontable=True,
                    hide_dropna=True,
                    hide_range=True,
                    hide_depth=True,
                    hide_sort=True,
                    hide_sample=True,  # hide sample size input on pdp component
                    hide_gridlines=True,  # hide gridlines on pdp component
                    hide_gridpoints=True,
                    hide_cats_sort=True,  # hide the sorting option for categorical features
                    hide_cutoff=True,  # hide cutoff selector on classification components
                    hide_percentage=True,  # hide percentage toggle on classification components
                    hide_log_x=True,  # hide x-axis log toggle on regression plots
                    hide_log_y=True,  # hide y-axis log toggle on regression plots
                    hide_ratio=True,  # hide the residuals type dropdown
                    hide_points=True,  # hide the violin scatter markers toggle
                    hide_winsor=True,  # hide the winsorize input
                    hide_wizard=True,  # hide the wizard toggle in lift curve component
                    hide_star_explanation=True,
                ).run()

            t1 = threading.Thread(target=dashboard_exp,
                                  args=(model, X_data, y_data))

            t1.start()

            return '''<H2>
def ensemble2_resampling_weight(n_resampling,
                                n_estimators,
                                featurenum,
                                seed,
                                x_train=None,
                                y_train=None):
    np.random.seed(seed=seed)  # set the random seed
    model = RandomForestClassifier()  # the model is swappable (used to score the weights)
    selected_cols_dict = {}
    for x in range(n_resampling):
        sample = np.random.choice(x_train.shape[0], 200, replace=False)
        x_train_sample = x_train.iloc[sample, :]
        y_train_sample = y_train.iloc[sample]
        n = x_train_sample.shape[1]
        # Everything in train outside the 200 resampled rows becomes an in-train validation set
        not_sample = [i for i in range(x_train.shape[0]) if i not in sample]
        x_train_val = x_train.iloc[not_sample, :]
        y_train_val = y_train.iloc[not_sample]

        # Ensemble of 8 fs_methods in total (RFE excluded because it takes too long)
        fs_model_tree = [
            RandomForestClassifier(random_state=seed,
                                   n_estimators=n_estimators),
            ExtraTreesClassifier(random_state=seed, n_estimators=n_estimators)
        ]

        fs_model_kb = [
            SelectKBest(chi2, k='all'),
            SelectKBest(f_classif, k='all')
        ]

        fs_model_shap = [
            RandomForestClassifier(random_state=seed,
                                   n_estimators=n_estimators)
        ]

        # tree importance
        for j, fs in enumerate(fs_model_tree):
            fs_model = fs
            fs_model.fit(x_train_sample, y_train_sample)
            importances = fs_model.feature_importances_
            importances_series = pd.Series(importances)

            # fit once on the 10 highest-importance features (smallest rank values)
            rank = importances_series.rank(ascending=False, method='min')
            rank_high_idx = rank[rank <= 10].index
            x_train_sample_high = x_train_sample.iloc[:, rank_high_idx]
            x_train_val_high = x_train_val.iloc[:, rank_high_idx]
            model = model.fit(x_train_sample_high, y_train_sample)

            # compute accuracy on the validation rows
            y_pred_val = model.predict(x_train_val_high)
            accuracy_val = accuracy_score(
                y_train_val, y_pred_val)  # weight (performance of the first-pass feature selection)

            # select features for this feature-selection method
            rank_high_index = rank[rank <= 400].index
            selected_columns = x_train.columns[rank_high_index]

            # selected_columns with the weight (first-pass selection accuracy) appended
            selected_columns_acc = []
            for col in selected_columns:
                selected_columns_acc.append(col + f'_{accuracy_val**2}')

            selected_cols_dict[
                f'tree_importances_{j}_{x}'] = selected_columns_acc

        for j, fs in enumerate(fs_model_kb):
            fs_model = fs
            fs_model.fit(x_train_sample, y_train_sample)
            importances = fs_model.scores_
            importances2 = np.nan_to_num(importances)
            importances_series = pd.Series(importances2)

            # fit once on the 10 highest-importance features (smallest rank values)
            rank = importances_series.rank(ascending=False, method='min')
            rank_high_idx = rank[rank <= 10].index
            x_train_sample_high = x_train_sample.iloc[:, rank_high_idx]
            x_train_val_high = x_train_val.iloc[:, rank_high_idx]
            model = model.fit(x_train_sample_high, y_train_sample)

            # compute accuracy on the validation rows
            y_pred_val = model.predict(x_train_val_high)
            accuracy_val = accuracy_score(
                y_train_val, y_pred_val)  # weight (performance of the first-pass feature selection)

            # select features for this feature-selection method
            rank_high_index = rank[rank <= 400].index
            selected_columns = x_train.columns[rank_high_index]

            # selected_columns with the weight (first-pass selection accuracy) appended
            selected_columns_acc = []
            for col in selected_columns:
                selected_columns_acc.append(col + f'_{accuracy_val**2}')

            selected_cols_dict[f'selectkbest_{j}_{x}'] = selected_columns_acc

        for j, fs in enumerate(fs_model_shap):
            fs_model = fs
            fs_model.fit(x_train_sample, y_train_sample)
            explainer = shap.TreeExplainer(fs_model)
            shap_values = explainer.shap_values(x_train_sample)
            shap_values_mat = np.abs(shap_values[1])  # positive-class SHAP values (older shap returns a per-class list)
            shap_mean = np.mean(shap_values_mat, axis=0)
            importances_series = pd.Series(shap_mean)

            # fit once on the 10 highest-importance features (smallest rank values)
            rank = importances_series.rank(ascending=False, method='min')
            rank_high_idx = rank[rank <= 10].index
            x_train_sample_high = x_train_sample.iloc[:, rank_high_idx]
            x_train_val_high = x_train_val.iloc[:, rank_high_idx]
            model = model.fit(x_train_sample_high, y_train_sample)

            # compute accuracy on the validation rows
            y_pred_val = model.predict(x_train_val_high)
            accuracy_val = accuracy_score(
                y_train_val, y_pred_val)  # weight (performance of the first-pass feature selection)

            # select features for this feature-selection method
            rank_high_index = rank[rank <= 400].index
            selected_columns = x_train.columns[rank_high_index]

            # selected_columns with the weight (first-pass selection accuracy) appended
            selected_columns_acc = []
            for col in selected_columns:
                selected_columns_acc.append(col + f'_{accuracy_val**2}')

            selected_cols_dict[f'shap_{j}_{x}'] = selected_columns_acc

    df_cols = pd.DataFrame(selected_cols_dict)

    columns = []
    for i in df_cols.values:
        for j in i:
            columns.append(j)

    # sum up the weights of all the selected features
    counts = dict()
    for col in columns:
        # the weight was appended after the last underscore; rsplit keeps
        # feature names that themselves contain underscores intact
        key, value = col.rsplit('_', 1)
        if key not in counts:
            counts[key] = float(value)  # np.float was removed in recent numpy
        else:
            counts[key] += float(value)

    sorted_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    selected_columns_lst = []
    for i in range(featurenum):  # take the top featurenum columns by accumulated weight
        selected_columns_lst.append(sorted_counts[i][0])

    return selected_columns_lst
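A usage sketch (not from the original source; the synthetic data and every
argument value below are assumptions for illustration):

import numpy as np
import pandas as pd

# synthetic binary-classification data: 500 rows, 50 non-negative features
# (SelectKBest with chi2 requires non-negative inputs)
rng = np.random.RandomState(0)
x_demo = pd.DataFrame(rng.rand(500, 50),
                      columns=[f'feat{i}' for i in range(50)])
y_demo = pd.Series(rng.randint(0, 2, size=500))

top_features = ensemble2_resampling_weight(n_resampling=5, n_estimators=50,
                                           featurenum=10, seed=42,
                                           x_train=x_demo, y_train=y_demo)
print(top_features)  # the 10 columns with the highest accumulated weights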
Exemple #24
0
def upload2():
    from werkzeug.datastructures import ImmutableMultiDict

    with open(ff[0], 'rb') as file:
        model = pickle.load(file)

    with open(ff[1], 'rb') as file:
        X_data = pickle.load(file)

    with open(ff[2], 'rb') as file:
        y_data = pickle.load(file)

    print('start')
    print(request.form)
    hh = request.form
    hh = hh.to_dict(flat=False)
    print('hh ', hh)
    for file in request.files.getlist("gg"):
        print(file)
    print(list(X_data.columns))

    series = pd.Series(hh)

    import shap
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_data)

    # load JS visualization code to notebook
    shap.initjs()

    # Abandoned experiments kept from the original: rendering the force plot
    # with matplotlib instead of Javascript, e.g.
    # shap.force_plot(explainer.expected_value, shap_values[10, :],
    #                 series, feature_names=X_data.columns,
    #                 matplotlib=True, show=False)
    # plt.savefig("gg.png", dpi=150, bbox_inches='tight')
    # flatten the form values (each field arrives as a list of strings)
    series = series.tolist()
    print('raw form values ', series)
    values = []
    for field in series:
        for v in field:
            values.append(float(v))

    series = np.array(values)
    print('parsed form values ', series)

    #lime
    import lime
    from lime.lime_tabular import LimeTabularExplainer
    explainer = LimeTabularExplainer(X_data,
                                     mode='regression',
                                     feature_names=list(X_data.columns),
                                     random_state=42,
                                     discretize_continuous=False,
                                     kernel_width=0.2)

    exp = explainer.explain_instance(series, model.predict)

    print(exp.local_pred)

    # in regression mode the explanation carries a single default label, so no
    # label argument is needed (the original passed column names, which is not
    # what as_pyplot_figure/as_html expect)
    fig = exp.as_pyplot_figure()

    fig_2 = exp.as_html()
    #print('dddd ',fig_2)

    plt.tight_layout()
    html_str = mpld3.fig_to_html(fig)
    with open("templates/lime.html", "w") as html_file:
        html_file.write(html_str)

    return render_template('local_result.html',
                           LIME=html_str,
                           SH=fig_2,
                           gh=html_str)
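A hedged alternative to the abandoned matplotlib experiments above: the force
plot can be embedded as interactive HTML and handed to the template the same
way fig_2 is (a sketch; it reuses the explainer, shap_values and X_data
objects from upload2):

# sketch: build an embeddable HTML force plot for the first row
force = shap.force_plot(explainer.expected_value, shap_values[0, :],
                        X_data.iloc[0, :])
shap_html = f"<head>{shap.getjs()}</head><body>{force._repr_html_()}</body>"
# shap_html could then be passed to render_template() alongside the LIME output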
Exemple #25
0
def run_edu_score_prediction_app():

    st.header('■Score prediction Demo')
    st.write(
        'Predict the expected score (e.g., for students who were absent from the test).'
    )

    st.sidebar.subheader('Data Upload')

    df_edu = pd.read_csv("data/eng_sample_data_score_prediction.csv")

    def download_link(object_to_download, download_filename,
                      download_link_text):
        if isinstance(object_to_download, pd.DataFrame):
            object_to_download = object_to_download.to_csv(
                index=False, encoding='utf_8_sig')
            b64 = base64.b64encode(object_to_download.encode()).decode()
            return f'<a href="data:file/txt;base64,{b64}" download="{download_filename}">{download_link_text}</a>'

    tmp_download_link = download_link(df_edu, 'sample_score_pred.csv',
                                      'Download sample csv file.')
    st.sidebar.markdown(tmp_download_link, unsafe_allow_html=True)

    #     st.sidebar.info("""
    #     [Download the sample csv file](https://github.com/59er/eng_learning_analytics_web/blob/master/sample_data/eng_sample_data_score_prediction_for_WEB.csv)
    #         """)
    try:

        uploaded_file = st.sidebar.file_uploader(
            "File upload (Drag and drop or use [Browse files] button to import csv file. Only utf-8 format is available.)",
            type=["csv"])

        if uploaded_file is not None:
            df_edu = pd.read_csv(uploaded_file)
            uploaded_file.seek(0)
            display_data = st.sidebar.checkbox(label='Show uploaded data')

            if display_data:
                st.dataframe(df_edu)

            df = df_edu.drop(['ID', 'Teacher'], axis=1)
            target = 'Score'
            encode = ['Class', 'Subject']

            for col in encode:
                dummy = pd.get_dummies(df[col], prefix=col)
                df = pd.concat([df, dummy], axis=1)
                del df[col]
            X = df.drop(['Score'], axis=1)
            Y = df['Score']
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                Y,
                                                                test_size=0.2,
                                                                random_state=0)

            clf = RandomForestRegressor()
            clf.fit(X, Y)
            df = df.drop(['Score'], axis=1)
            prediction = clf.predict(df)
            st.subheader('Score prediction result')
            id = df_edu['ID']
            id = pd.DataFrame(id)
            result = pd.DataFrame(prediction)
            pred_result = pd.concat([id, result], axis=1)
            pred_result = pred_result.rename(columns={0: 'Result'})
            st.dataframe(pred_result)

            score = clf.score(X_test, y_test)
            st.set_option('deprecation.showPyplotGlobalUse', False)

            st.subheader('Prediction accuracy')
            st.write(score)

            fig = plt.figure(figsize=(5, 5))
            explainer = shap.TreeExplainer(clf, X)
            shap_values = explainer.shap_values(X)

            st.subheader(
                'Impact of explanatory variables (each item score) on the objective variable (final score)'
            )
            # summary_plot draws on the current matplotlib figure and returns
            # None; st.pyplot() then picks up the global figure
            fig = shap.summary_plot(shap_values, X, plot_type='bar')
            st.pyplot(fig)

            st.subheader(
                'Correlation of explanatory variables with the objective variable (final score)'
            )
            fig1 = shap.summary_plot(shap_values, X)
            st.pyplot(fig1)

            def st_shap(plot, height=None):
                shap_html = f"<head>{shap.getjs()}</head><body>{plot.html()}</body>"
                components.html(shap_html, height=height)

        else:

            def user_input_features():
                class_name = st.sidebar.selectbox(
                    'Class', ('A', 'B', 'C', 'D', 'E', 'F', 'G'))
                subject = st.sidebar.selectbox(
                    'Subject', ('Literature', 'Math', 'Reading'))
                subject_A = st.sidebar.slider('item1', 0, 100, 50)
                subject_B = st.sidebar.slider('item2', 0, 100, 50)
                subject_C = st.sidebar.slider('item3', 0, 100, 50)
                subject_D = st.sidebar.slider('item4', 0, 100, 50)
                subject_E = st.sidebar.slider('item5', 0, 100, 50)
                data = {
                    'Class': class_name,
                    'Subject': subject,
                    'item1': subject_A,
                    'item2': subject_B,
                    'item3': subject_C,
                    'item4': subject_D,
                    'item5': subject_E
                }
                features = pd.DataFrame(data, index=[0])
                return features

            input_df = user_input_features()

        sample_data = pd.read_csv('data/eng_sample_data_score_prediction.csv')
        sample = sample_data.drop(columns=['Score', 'ID', 'Teacher'])
        #df = sample.copy()
        df = pd.concat([input_df, sample], axis=0)
        # st.dataframe(df[:1])

        encode = ['Class', 'Subject']

        for col in encode:
            dummy = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, dummy], axis=1)
            del df[col]
        df1 = df[:1]

        if uploaded_file is not None:
            st.write(df1)
        else:
            st.write(
                'The following is default sample data. Select class and subject then use the sliders for each item in the sidebar to get an idea of score prediction.'
            )
            st.write(df1)

        load_clf = pickle.load(open('data/subject_score_prediction.pkl', 'rb'))
        prediction = load_clf.predict(df1)
        st.subheader('Score prediction result')
        st.write(prediction[0])

        df1 = sample_data.copy()
        encode = ['Class', 'Subject']
        for col in encode:
            dummy1 = pd.get_dummies(df1[col], prefix=col)
            df1 = pd.concat([df1, dummy1], axis=1)
            del df1[col]

        X = df1.drop(['Score', 'Teacher', 'ID'], axis=1)
        Y = df1['Score']
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.2,
                                                            random_state=0)

        score = load_clf.score(X_test, y_test)
        st.set_option('deprecation.showPyplotGlobalUse', False)

        st.subheader('Prediction accuracy')
        st.write(score)

        fig = plt.figure(figsize=(5, 5))
        explainer = shap.TreeExplainer(load_clf, X)
        shap_values = explainer.shap_values(X)

        st.subheader(
            'Impact of explanatory variables (each item score, class and subject) on the objective variable (final score)'
        )
        fig = shap.summary_plot(shap_values, X, plot_type='bar')
        st.pyplot(fig)

        st.subheader(
            'Correlation of explanatory variables (each item score, class and subject) with the objective variable (final score)'
        )
        fig1 = shap.summary_plot(shap_values, X)
        st.pyplot(fig1)

        def st_shap(plot, height=None):
            shap_html = f"<head>{shap.getjs()}</head><body>{plot.html()}</body>"
            components.html(shap_html, height=height)

        #force_plot for some IDs
        # i = np.arange(1,5,1)
        # for i in i:
        #     st.write('Example of', 'ID', i)
        #     st_shap(shap.force_plot(explainer.expected_value, shap_values[i,:],X.iloc[i,:]),400)

        # st_shap(shap.force_plot(explainer.expected_value, shap_values, X),400)
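        # A hedged sketch (not in the original): the st_shap helper above could
        # render a single-row force plot like so, using the regression
        # explainer defined earlier:
        # st_shap(shap.force_plot(explainer.expected_value, shap_values[0, :],
        #                         X.iloc[0, :]), height=300)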

    except Exception as e:
        print(e)
Exemple #26
0
missing_columns = (set(base_table.columns) -
                   set(df.columns)) - NON_FEATURE_COLUMNS
for col in missing_columns:
    df[col] = float('nan')

columns_to_drop = list(set(df.columns) - set(base_table.columns))
df = df.drop(columns_to_drop, axis=1)

#Rearrange columns to input into model
df = df[base_table.columns]

#Now, create the predictions
predicted_risk = churn_model.predict_proba(df)
predicted_risk_df = pd.DataFrame(predicted_risk)

explainer = shap.TreeExplainer(churn_model)
shap_values = explainer.shap_values(df)
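# Note (assumption): the next lines expect shap_values to be a single 2-D
# array, as TreeExplainer returns for XGBoost/LightGBM binary models; sklearn
# forests would instead return a list with one array per class.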
shap_df = pd.DataFrame(shap_values)
shap_df.columns = df.columns
shap_df["churn_prob"] = predicted_risk_df[1]
shap_df["key"] = keys

shap_right = shap_df

shap_df[['Customer', 'Product']] = shap_df['key'].str.split('_',
                                                            n=1,
                                                            expand=True)

shap_df = pd.merge(shap_df, value_df, on='key', how='left')

get_cols = ['Customer', 'Product']  # plus further columns; the list is truncated in the original source
Exemple #27
0
user_df = pd.DataFrame([user_input], columns=feature_names)

clf = joblib.load("penguin_clf.joblib")
prediction = clf.predict(user_df)[0]
class_prediction = class_names[prediction]

st.sidebar.write(f"## Prediction: {class_prediction}")
proba = clf.predict_proba(user_df)
proba_df = pd.DataFrame(proba, columns=class_names)
st.sidebar.write(proba_df)

st.write(f"## Explanation for Predicting: **{class_prediction}**")
st.subheader("SHAP values")

user_encoded = clf[:-1].transform(user_df)
explainer = shap.TreeExplainer(clf[-1])
shap_values = explainer.shap_values(user_encoded[[0], :],
                                    check_additivity=False)

shap_plot_reprs = []

for i in range(3):
    shap_plot = shap.force_plot(explainer.expected_value[i],
                                shap_values[i],
                                user_encoded,
                                feature_names=feature_names,
                                out_names=class_names[i])
    shap_plot_reprs.append(shap_plot._repr_html_())

shap_html_repr = "".join(shap_plot_reprs)
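The concatenated HTML above still needs SHAP's Javascript bundle to render.
A sketch (an assumption, mirroring the st_shap helper used in the other
Streamlit examples here):

import streamlit.components.v1 as components

# prepend shap.getjs() so the force plots can render inside the iframe
components.html(f"<head>{shap.getjs()}</head><body>{shap_html_repr}</body>",
                height=420)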
Exemple #28
0
def return_weights_from_xgboost(
    geodataframe,
    raster_path,
    pop_string,
    codes=[21, 22, 23, 24],
    n_pixels_option_values=256,
    tuned_xgb=False,
    gbm_hyperparam_grid={
        "learning_rate": [0.001, 0.01, 0.1],
        "n_estimators": [200],
        "subsample": [0.3, 0.5],
        "max_depth": [4, 5, 6],
        "num_boosting_rounds": [10, 20],
    },
    force_crs_match=True,
    na_value=255,
    ReLU=True,
):
    """Function that returns the weights of each land type according to NLCD types/codes given by Extreme Gradient Boost model (XGBoost)
    
    Parameters
    ----------
    
    geodataframe           : a geopandas geoDataFrame used to build regression
    
    raster_path            : the path to the associated raster image.
    
    pop_string             : the name of the variable on geodataframe that the regression shall be conducted
    
    codes                  : an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD).
                             The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
                             The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
                             
    n_pixels_option_values : number of options of the pixel values of rasterior. Default is 256.
    
    tuned_xgb              : bool. Default is False.
                             If True the XGBoost model will be tuned making a grid search using gbm_hyperparam_grid dictionary a picking the best model in terms of mean squared error with some pre-defined number of cross-validation.
                             Otherwise, the XGBoost model is fitted with default values of xgboost.train function from xgboost Python library.
                             
    gbm_hyperparam_grid    : a dictionary that represent the grid for the grid search of XGBoost.
    
    force_crs_match        : bool. Default is True.
                             Wheter the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file. 
                             It is recommended to let this argument as True.
    
    na_value               : int. Default is 255.
                             The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.
                             
    ReLU                   : bool. Default is True.
                             Wheter the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types.
    
    Notes
    -----
    1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.
    2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256.
    3) The returning weights represent the average of the Shapley's values from each feature.
    
    """
    raster_path = fetch_quilt_path(raster_path)

    try:
        import xgboost as xgb
        import shap
    except ImportError as e:
        raise ImportError("xgboost and shap are required to perform this.")

    _check_presence_of_crs(geodataframe)

    if na_value in codes:
        raise ValueError("codes should not assume the na_value value.")

    print("Appending profile...")
    profiled_df = fast_append_profile_in_gdf(
        geodataframe[["geometry", pop_string]], raster_path, force_crs_match
    )  # Use only two columns to build the weights (this avoids error, if the original dataset has already types appended on it).
    print("Append profile: Done.")

    # If the list is unsorted, the codes will be sorted to guarantee that the position of the weights will match
    codes.sort()

    str_codes = [str(i) for i in codes]
    feature_names = ["Type_" + s for s in str_codes]

    y = profiled_df[pop_string]
    X = profiled_df[feature_names]

    print("Starting to fit XGBoost...")
    if not tuned_xgb:

        # Create the DMatrix
        xgb_dmatrix = xgb.DMatrix(X, y)

        # Create the parameter dictionary ("reg:linear" is a deprecated alias
        # of "reg:squarederror" in recent xgboost releases)
        params = {"objective": "reg:linear"}

        # Train the model
        xg_reg = xgb.train(params=params, dtrain=xgb_dmatrix)

    else:

        try:
            from sklearn.model_selection import GridSearchCV
        except ImportError as e:
            raise ImportError("sklearn is required to perform this.")

        gbm = xgb.XGBRegressor()
        grid_mse = GridSearchCV(
            estimator=gbm,
            param_grid=gbm_hyperparam_grid,
            scoring="neg_mean_squared_error",
            cv=4,  # 4-fold cross-validation
            verbose=3,  # prints the grid search progress
            n_jobs=-1,
        )  # run the grid search in parallel on all available cores

        # Fit the grid to the data
        grid_mse.fit(X, y)

        best_params = grid_mse.best_params_
        best_params["objective"] = "reg:linear"

        # Create the DMatrix
        xgb_dmatrix = xgb.DMatrix(X, y)

        # Train the model from the best parameters of the grid search
        xg_reg = xgb.train(params=best_params, dtrain=xgb_dmatrix)

    # Build the explainer and compute the Shapley values (https://github.com/slundberg/shap)
    explainer = shap.TreeExplainer(xg_reg)
    shap_values = explainer.shap_values(X)
    weights_from_xgb = shap_values.mean(
        axis=0)  # This is already sorted by pixel Type

    weights = np.zeros(n_pixels_option_values)
    weights[codes] = list(weights_from_xgb)  # convert the array values to a list

    if ReLU:
        weights = np.where(weights < 0, 0, weights)

    return weights
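A usage sketch (not from the original source; the file name and population
column are assumptions for illustration):

import geopandas as gpd

# hypothetical inputs: census tracts with a 'population' column and an NLCD
# raster covering the same extent
tracts = gpd.read_file("tracts.shp")
weights = return_weights_from_xgboost(tracts, "nlcd_2016.tif", "population")
print(weights[[21, 22, 23, 24]])  # average Shapley value per developed code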
Exemple #29
0
    y=data['y'],
    model_scheme='LMP',
    cv=5,
    #grid_search=True,
    #grid_search_scoring='r2',
    #param_grid=parameter_grid,
    eval_metric='rmse',
    parameters=parameters,
    CT_Temp=CT_Temp,
    CT_RT=CT_RT,
    C=C)

catboost.run_model()
print(catboost.__dict__)
'''
catboost.parity_plot(data='train', quantity='LMP').savefig('parity_LMP_train.png')
catboost.parity_plot(data='test', quantity='LMP').savefig('parity_LMP_test.png')
catboost.parity_plot(data='train', quantity='CT_RT').savefig('parity_CT_RT_train.png')
catboost.parity_plot(data='test', quantity='CT_RT').savefig('parity_CT_RT_test.png')
np.save('catboost_dict.npy', catboost.__dict__)
plt.clf()
'''
explainer = shap.TreeExplainer(catboost.model[-1])
shap_values = explainer.shap_values(data['X'])

XX = scale.inverse_transform(data['X'])
X = pd.DataFrame(XX, columns=data['features'])
# summarize the effects of all the features
shap.summary_plot(shap_values, X, plot_type="bar", show=False)
plt.savefig('feature_importance.png', dpi=150, bbox_inches='tight')
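As a hedged aside (not in the original): CatBoost can also compute SHAP values
natively, which avoids building a TreeExplainer; cb_model below stands for a
fitted CatBoost model such as catboost.model[-1] above:

from catboost import Pool

pool = Pool(data['X'], label=data['y'])
native_shap = cb_model.get_feature_importance(data=pool, type='ShapValues')
# the last column holds the expected value; the rest align with the features
print(native_shap.shape)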
Exemple #30
0
    print(
        f"Model has {np.sum(xgboost_test_predictions==Y_test)/len(xgboost_test_predictions)*100:.3} % success rate"
    )

    # Make plots of different importance scores
    for score in ['weight', 'gain', 'cover']:
        xgb.plot_importance(bst,
                            importance_type=score,
                            show_values=False,
                            xlabel=score)
        plt.savefig(f'xkb_{score}.png')
    """ Analyze using Shapeley values """
    # This hack is needed for the current version of xgboost with SHAP
    model_bytearray = bst.save_raw()[4:]

    def myfun(self=None):
        return model_bytearray

    bst.save_raw = myfun

    # Get the explainer for the xgboost model and calculate the Shapley values
    explainer = shap.TreeExplainer(bst)
    shap_values = explainer.shap_values(X_test)

    # summarize the effects of all the features
    shap.summary_plot(shap_values, X_test)
    shap.dependence_plot("occupation",
                         shap_values,
                         X_test,
                         interaction_index=None,
                         show=False)