def plot_shap_summary(model, X_train, model_name='XGB'):
    """Compute tree SHAP values for ``X_train`` and pass them to the bar plotter.

    Args:
        model: Fitted model handed to ``shap.TreeExplainer``.
        X_train: Feature table; its ``columns`` label the SHAP value columns.
        model_name: Label forwarded unchanged to ``plot_shap_summary_bar``.
    """
    explainer = shap.TreeExplainer(model)
    # One column of SHAP values per feature, labelled like the input table.
    contributions = pd.DataFrame(
        explainer.shap_values(X_train),
        columns=X_train.columns,
    )
    plot_shap_summary_bar(contributions, model_name)
"boosting_type": "gbdt", "objective": "regression", "metric": {"l2", "l1"}, "num_leaves": 4, "learning_rate": 0.05, "feature_fraction": 0.9, "bagging_fraction": 0.8, "bagging_freq": 5, "verbose": 0, "min_data_in_bin":1,"min_data":1,"min_hess":0 } gbm = lgb.train(lgb_params, train_data, num_boost_round=20, verbose_eval=1) shap.initjs() explainer = shap.TreeExplainer(gbm) shap_values = explainer.shap_values(x) # visualize the first prediction's explanation (use matplotlib=True to avoid Javascript) shap.force_plot(explainer.expected_value, shap_values[1,:], x.iloc[1,:]) shap.force_plot(explainer.expected_value, shap_values, x) shap.dependence_plot("dist_num", shap_values, x) # IMPORTANCES OF ALL FEATURES shap.summary_plot(shap_values, x, plot_type="bar") shap.summary_plot(shap_values, x) shap.summary_plot(explainer.shap_interaction_values(x), x)
def generate_feature_importance_data(self, probs, importance):
    """Collect per-feature explanation data and write it to ``importances.json``.

    For each important feature of the predicted class this renders a SHAP
    violin plot (stored base64-encoded), computes the Spearman correlation
    between the non-zero feature values and ``self.y``, and records medians
    and above/below-median shares for the ``y == 1`` and ``y == 0`` rows.
    Features whose names differ only by a leading "Total"/"Maximum"/
    "Minimum"/"Average" word are then grouped, and one representative per
    group (carrying the group's summed SHAP value) is kept.

    Args:
        probs: Class probabilities; ``probs[0].argmax()`` selects the
            predicted class.
        importance: Mapping with ``"importances"`` (holding ``"classes"`` and
            ``"values"``) and ``"feature_legend"``.
    """
    attributes = ["Total", "Maximum", "Minimum", "Average"]

    def strip_attribute(name):
        # Drop a leading aggregation word plus the one character following it.
        for attribute in attributes:
            if name.startswith(attribute):
                return name[len(attribute) + 1:]
        return name

    def feature_sort_key(f):
        # Rank a feature by how cleanly its values separate the two classes
        # in the direction its SHAP value and correlation suggest.
        if f["shap"] > 0 and f["spearman"][0] > 0:
            return f["perc_buggy_values_higher_than_median"]
        elif f["shap"] > 0 and f["spearman"][0] < 0:
            return f["perc_buggy_values_lower_than_median"]
        elif f["shap"] < 0 and f["spearman"][0] > 0:
            return f["perc_clean_values_lower_than_median"]
        elif f["shap"] < 0 and f["spearman"][0] < 0:
            return f["perc_clean_values_higher_than_median"]
        # A zero or NaN SHAP value / correlation has no direction. Rank it
        # last instead of returning None, which would make max() raise
        # TypeError when compared against numeric keys.
        return float("-inf")

    def is_easily_explainable(f):
        # The observed value must sit closer to the median of the class the
        # SHAP value pushes towards than to the other class's median.
        dist_buggy = abs(f["value"] - f["median_bug_introducing"])
        dist_clean = abs(f["value"] - f["median_clean"])
        return (f["shap"] > 0 and dist_buggy < dist_clean) or (
            f["shap"] < 0 and dist_clean < dist_buggy
        )

    X_shap_values = shap.TreeExplainer(self.model.clf).shap_values(self.X)

    pred_class = self.model.le.inverse_transform([probs[0].argmax()])[0]

    n_rows = self.X.shape[0]

    features = []
    for i, (val, feature_index, is_positive) in enumerate(
        importance["importances"]["classes"][pred_class][0]
    ):
        column = int(feature_index)
        name = importance["feature_legend"][str(i + 1)]
        value = importance["importances"]["values"][0, column]

        # Single-feature violin plot, captured in memory instead of on disk.
        shap.summary_plot(
            X_shap_values[:, column].reshape(n_rows, 1),
            self.X[:, column].reshape(n_rows, 1),
            feature_names=[""],
            plot_type="layered_violin",
            show=False,
        )
        matplotlib.pyplot.xlabel("Impact on model output")
        img = io.BytesIO()
        matplotlib.pyplot.savefig(img, bbox_inches="tight")
        matplotlib.pyplot.clf()
        img.seek(0)
        base64_img = base64.b64encode(img.read()).decode("ascii")

        # Statistics are computed on non-zero feature values only.
        X = self.X[:, column]
        y = self.y[X != 0]
        X = X[X != 0]
        spearman = spearmanr(X, y)

        buggy_X = X[y == 1]
        clean_X = X[y == 0]
        median = np.median(X)
        median_clean = np.median(clean_X)
        median_buggy = np.median(buggy_X)

        perc_buggy_values_higher_than_median = (
            buggy_X >= median
        ).sum() / buggy_X.shape[0]
        perc_buggy_values_lower_than_median = (
            buggy_X < median
        ).sum() / buggy_X.shape[0]
        perc_clean_values_higher_than_median = (
            clean_X > median
        ).sum() / clean_X.shape[0]
        perc_clean_values_lower_than_median = (
            clean_X <= median
        ).sum() / clean_X.shape[0]

        logger.info("Feature: {}".format(name))
        logger.info("Shap value: {}{}".format("+" if (is_positive) else "-", val))
        logger.info(f"spearman:  {spearman}")
        logger.info(f"value: {value}")
        logger.info(f"overall mean: {np.mean(X)}")
        logger.info(f"overall median: {median}")
        logger.info(f"mean for y == 0: {np.mean(clean_X)}")
        logger.info(f"mean for y == 1: {np.mean(buggy_X)}")
        logger.info(f"median for y == 0: {median_clean}")
        logger.info(f"median for y == 1: {median_buggy}")
        logger.info(
            f"perc_buggy_values_higher_than_median: {perc_buggy_values_higher_than_median}"
        )
        logger.info(
            f"perc_buggy_values_lower_than_median: {perc_buggy_values_lower_than_median}"
        )
        logger.info(
            f"perc_clean_values_higher_than_median: {perc_clean_values_higher_than_median}"
        )
        logger.info(
            f"perc_clean_values_lower_than_median: {perc_clean_values_lower_than_median}"
        )

        features.append(
            {
                "index": i + 1,
                "name": name,
                "shap": float(f'{"+" if (is_positive) else "-"}{val}'),
                "value": value,
                "spearman": spearman,
                "median": median,
                "median_bug_introducing": median_buggy,
                "median_clean": median_clean,
                "perc_buggy_values_higher_than_median": perc_buggy_values_higher_than_median,
                "perc_buggy_values_lower_than_median": perc_buggy_values_lower_than_median,
                "perc_clean_values_higher_than_median": perc_clean_values_higher_than_median,
                "perc_clean_values_lower_than_median": perc_clean_values_lower_than_median,
                "plot": base64_img,
            }
        )

    # Group together features that are very similar to each other, so we can
    # simplify the explanation to users.
    already_added = set()
    feature_groups = []
    for i1, f1 in enumerate(features):
        if i1 in already_added:
            continue

        group = [f1]
        feature_groups.append(group)

        # The base name of f1 does not change while scanning the remainder.
        f1_name = strip_attribute(f1["name"])
        for i2, f2 in enumerate(features[i1 + 1:], start=i1 + 1):
            if strip_attribute(f2["name"]) != f1_name:
                continue

            already_added.add(i2)
            group.append(f2)

    # Pick a representative example from each group.
    features = []
    for feature_group in feature_groups:
        shap_sum = sum(f["shap"] for f in feature_group)

        # Only select easily explainable features from the group.
        selected = [f for f in feature_group if is_easily_explainable(f)]

        # If there are no easily explainable features in the group, select
        # all features of the group.
        if not selected:
            selected = feature_group

        feature = max(selected, key=feature_sort_key)

        # The representative carries the SHAP contribution of its whole group.
        feature["shap"] = shap_sum

        base_name = strip_attribute(feature["name"])
        if base_name != feature["name"]:
            feature["name"] = base_name.capitalize()

        features.append(feature)

    with open("importances.json", "w") as f:
        json.dump(features, f)
for xXx in os.listdir(path_to_features): print(xXx[:-4]) start = time.time() with open(os.path.join(path_to_features, xXx), "rb") as f: [x_train, x_test, y_train, y_test] = pickle.load(f) lgbm.fit(x_train, y_train) df = pd.DataFrame(columns=['feat_impo_lgbm', 'shan_tree', 'feat_impo_xgb'], index=x_test.columns) try: for col in df.columns: if col == 'shan_tree': if xXx != 'opensmile_avec2013_long.pkl' and xXx != 'opensmile_IS13_ComParE_long.pkl': try: zz = shap.TreeExplainer(lgbm, x_test) # shap_values = np.mean(abs(zz.shap_values(x_test)), axis=0) shap_values = zz.shap_values(x_test) temp_df = pd.DataFrame(columns=['col_idx', 'mean'], index=range(x_test.shape[1])) temp_df['col_idx'] = x_test.columns # df['mean'] = shap_values means = [] for idx, row in df.iterrows(): means.append(np.mean(abs(shap_values[:, idx]))) temp_df['mean'] = means temp_df.sort_values(by='mean',
# Load the diabetes dataset that ships with shap.
X, y = shap.datasets.diabetes()

# Shape (only echoed when run interactively, e.g. in a notebook cell)
X.shape, y.shape

# Distribution of target variable.
# `kind` has to be a keyword: Series.plot() rejects positional arguments.
pd.Series(y).plot(kind='hist')

# Train using XGBoost Regressor model
XGB_model = xgboost.XGBRegressor()
XGB_model.fit(X, y)

# Create Tree explainer
explainer = shap.TreeExplainer(XGB_model)

# Extract SHAP values to explain the model predictions
shap_values = explainer.shap_values(X)

# Plot Feature Importance
shap.summary_plot(shap_values, X, plot_type="bar")

# Plot Feature Importance - 'Dot' type
shap.summary_plot(shap_values, X, plot_type='dot')

# Visualize the explanation of first prediction
shap.force_plot(explainer.expected_value, shap_values[0, :], X.iloc[0, :])
def shap_scatterplot(sklearn_model: BaseEstimator,
                     X_explain: pd.DataFrame,
                     feature_labels: dict,
                     feature: str = "bac_guess",
                     moderator: Sequence[str] = "episode",
                     output_folder: str = "/mnt/data/figures/shap") -> None:
    """Save one SHAP dependence plot of ``feature`` per moderator as a PDF.

    Args:
        sklearn_model (BaseEstimator): tree model passed to
            ``shap.TreeExplainer``, e.g., lightgbm
        X_explain (pd.DataFrame): feature set
        feature_labels (dict): maps feature_names onto feature_labels for
            plotting; ``feature_labels[feature]["label"]`` is used for the axes
        feature (str): The main feature to scatterplot
        moderator (Sequence[str] | str): feature name(s) used as
            ``interaction_index``; a single string is treated as one name
        output_folder (str): prefix for storing plots

    Raises:
        ValueError: if ``feature`` is not a column of ``X_explain``.
    """
    # Validate before the column is indexed, otherwise pandas raises its own
    # KeyError first and this check can never fire.
    if feature not in X_explain.columns:
        raise ValueError(f"{feature} is not a column in the given feature df.")

    # A bare string is itself a sequence of characters; wrap it so the loop
    # below sees one moderator name instead of one letter at a time.
    moderators = [moderator] if isinstance(moderator, str) else list(moderator)

    # Exclude missing data, which distorts visualization
    mask = X_explain[feature] > -999
    X_explain = X_explain.loc[mask]

    # Compute SHAP values
    shap_values = shap.TreeExplainer(sklearn_model).shap_values(X_explain)
    if isinstance(shap_values, list):
        # Some output a list for each class
        shap_values = shap_values[1]

    formatter_params = {'xtick.labelsize': 8, 'ytick.labelsize': 8}
    plt.rcParams.update(formatter_params)

    for mod in moderators:
        shap.dependence_plot(feature,
                             shap_values,
                             X_explain,
                             interaction_index=mod,
                             dot_size=2)
        if feature in ("bac_guess", "bac_cumulative_avg"):
            # Two overlapping grey bands are shaded on the x-axis.
            plt.axvspan(.06, .10, alpha=.10, color='grey')
            plt.axvspan(.04, .12, alpha=.10, color='grey')
        plt.gcf().set_size_inches(6, 3)
        flabel = feature_labels[feature]["label"]
        plt.xlabel(flabel, fontsize=10)
        plt.ylabel(f"SHAP Value for {flabel}", fontsize=10)
        plt.tight_layout()
        plt.savefig(f"{output_folder}/shap_scatterplot_{feature}_by_{mod}.pdf",
                    bbox_inches="tight")
        plt.close()
def train(self, importance_cutoff=0.15):
    """Train ``self.clf``, report evaluation metrics and persist the model.

    The items are turned into features with ``self.extraction_pipeline``,
    split 90/10 into training and test sets, optionally cross-validated and
    resampled, then fitted. Results on the test set are printed, both
    overall and for several confidence thresholds. Finally ``self`` is dumped
    with joblib to a file named after the lower-cased class name.

    Args:
        importance_cutoff: Forwarded to ``self.get_important_features`` when
            feature importances are calculated.

    Returns:
        dict: Tracking metrics (cross-validation scores when enabled, the
        imbalanced classification report for non-multilabel targets, and the
        confusion matrix as nested lists).
    """
    classes, self.class_names = self.get_labels()
    self.class_names = sort_class_names(self.class_names)

    # Get items and labels, filtering out those for which we have no labels.
    X_iter, y_iter = split_tuple_iterator(self.items_gen(classes))

    # Extract features from the items.
    X = self.extraction_pipeline.fit_transform(list(X_iter))

    # Calculate labels.
    y = np.array(y_iter)

    print(f"X: {X.shape}, y: {y.shape}")

    is_multilabel = isinstance(y[0], np.ndarray)

    # Split dataset in training and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0
    )
    if self.sampler is not None:
        pipeline = make_pipeline(self.sampler, self.clf)
    else:
        pipeline = self.clf

    tracking_metrics = {}

    # Use k-fold cross validation to evaluate results.
    if self.cross_validation_enabled:
        scorings = ["accuracy"]
        if len(self.class_names) == 2:
            scorings += ["precision", "recall"]

        scores = cross_validate(pipeline, X_train, y_train, scoring=scorings, cv=5)

        print("Cross Validation scores:")
        for scoring in scorings:
            score = scores[f"test_{scoring}"]
            tracking_metrics[f"test_{scoring}"] = {
                "mean": score.mean(),
                "std": score.std() * 2,
            }
            # The original message printed a stray literal "f" before the mean.
            print(
                f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})"
            )

    # Training on the resampled dataset if sampler is provided.
    if self.sampler is not None:
        X_train, y_train = self.sampler.fit_resample(X_train, y_train)

    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

    self.clf.fit(X_train, y_train)

    feature_names = self.get_human_readable_feature_names()
    if self.calculate_importance and len(feature_names):
        explainer = shap.TreeExplainer(self.clf)
        shap_values = explainer.shap_values(X_train)

        shap.summary_plot(
            shap_values,
            X_train.toarray(),
            feature_names=feature_names,
            class_names=self.class_names,
            # A list of SHAP arrays (one per class) falls back to shap's
            # default plot type.
            plot_type="layered_violin"
            if not isinstance(shap_values, list)
            else None,
            show=False,
        )

        matplotlib.pyplot.savefig("feature_importance.png", bbox_inches="tight")

        important_features = self.get_important_features(
            importance_cutoff, shap_values
        )

        self.print_feature_importances(important_features, feature_names)

    print("Test Set scores:")
    # Evaluate results on the test set.
    y_pred = self.clf.predict(X_test)

    if is_multilabel:
        assert isinstance(
            y_pred[0], np.ndarray
        ), "The predictions should be multilabel"

    print(f"No confidence threshold - {len(y_test)} classified")
    if is_multilabel:
        confusion_matrix = metrics.multilabel_confusion_matrix(y_test, y_pred)
    else:
        confusion_matrix = metrics.confusion_matrix(
            y_test, y_pred, labels=self.class_names
        )

        print(
            classification_report_imbalanced(
                y_test, y_pred, labels=self.class_names
            )
        )
        report = classification_report_imbalanced_values(
            y_test, y_pred, labels=self.class_names
        )

        tracking_metrics["report"] = report

    print_labeled_confusion_matrix(
        confusion_matrix, self.class_names, is_multilabel=is_multilabel
    )

    tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

    # Evaluate results on the test set for some confidence thresholds.
    # The probabilities do not depend on the threshold, so compute them once.
    y_pred_probas = self.clf.predict_proba(X_test)
    for confidence_threshold in [0.6, 0.7, 0.8, 0.9]:
        y_test_filter = []
        y_pred_filter = []
        for i in range(len(y_test)):
            argmax = np.argmax(y_pred_probas[i])
            if y_pred_probas[i][argmax] < confidence_threshold:
                continue

            y_test_filter.append(y_test[i])
            if is_multilabel:
                y_pred_filter.append(y_pred[i])
            else:
                y_pred_filter.append(argmax)

        if not is_multilabel:
            # Map class indexes back to the original labels.
            y_pred_filter = self.le.inverse_transform(y_pred_filter)

        print(
            f"\nConfidence threshold > {confidence_threshold} - {len(y_test_filter)} classified"
        )
        if len(y_test_filter) != 0:
            if is_multilabel:
                confusion_matrix = metrics.multilabel_confusion_matrix(
                    np.asarray(y_test_filter), np.asarray(y_pred_filter)
                )
            else:
                confusion_matrix = metrics.confusion_matrix(
                    np.asarray(y_test_filter),
                    np.asarray(y_pred_filter),
                    labels=self.class_names,
                )
                print(
                    classification_report_imbalanced(
                        y_test_filter, y_pred_filter, labels=self.class_names
                    )
                )
            print_labeled_confusion_matrix(
                confusion_matrix, self.class_names, is_multilabel=is_multilabel
            )

    joblib.dump(self, self.__class__.__name__.lower())

    return tracking_metrics
'reg_lambda': 10, 'reg_alpha': 0.1, 'learning_rate': 0.01, 'gamma': 0.1 } xgb = XGBClassifier(**basicparameter) # xgb=XGBClassifier() # apply the default parameters xgb.fit(Xtrain, Ytrain) # score the model print('============================= XGBoost =============================') score(xgb, Xtrain, Ytrain, Xtest, Ytest) print('============================== SHAP ===============================') explainer = shap.TreeExplainer(xgb) # define the explainer shap_values = explainer.shap_values(X) # use all data for analysis def gen_data(inputs, X): """ creates a data Frame with inputs and X for statistics with shap """ df1 = pd.DataFrame() for i, name in enumerate(inputs): df1[name] = X[:, i] return df1 df1 = gen_data(inputs, X) shap.summary_plot(shap_values, df1) # shap.summary_plot(shap_values, df1, plot_type="bar")
# Data Splitting
X_train, X_test, y_train, y_test = train_test_split(dfX,
                                                    dfY,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=123)

# max_features=1.0 uses every feature at each split, which is what the removed
# "auto" setting meant for a RandomForestRegressor.
rf = RandomForestRegressor(n_estimators=400,
                           min_samples_split=2,
                           min_samples_leaf=1,
                           max_features=1.0,
                           max_depth=65)
rf.fit(X_train, y_train)

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_train)

# show=False keeps each figure alive so that savefig() writes the actual plot
# instead of a figure that was already shown and released.
f = plt.figure()
shap.summary_plot(shap_values, X_train, plot_type="bar", show=False)
f.savefig("varImp.pdf", bbox_inches='tight', dpi=600)
plt.close('all')

f = plt.figure()
shap.summary_plot(shap_values, X_train, show=False)
f.savefig("varImp2.pdf", bbox_inches='tight', dpi=600)
plt.close('all')

shap.dependence_plot("median_step_width", shap_values, X_train, show=False)
plt.savefig("sw-dep.pdf", bbox_inches='tight', dpi=600)
# Best model from grid search (best.best_estimator_), rebuilt from its
# parameters. It has to be fitted before its ROC curve can be plotted.
best_model = DecisionTreeClassifier(min_samples_leaf=0.005,
                                    min_samples_split=0.015).fit(
                                        X_train, y_train)
plot_roc_curve(best_model, X_test, y_test)

# --- Ensemble classifiers --------------------------------------------------- #
rf = RandomForestClassifier().fit(X_train, y_train)
ada = AdaBoostClassifier().fit(X_train, y_train)
gbc = GradientBoostingClassifier().fit(X_train, y_train)

# Plot AUC for ensemble classifiers (all three curves share one axes)
fig, ax = plt.subplots()
plot_roc_curve(rf, X_test, y_test, ax=ax)
plot_roc_curve(ada, X_test, y_test, ax=ax)
plot_roc_curve(gbc, X_test, y_test, ax=ax)
plt.savefig("ensemble.png", dpi=500)

# Create SHAP explainer for a freshly trained random forest
model = RandomForestClassifier().fit(X_train, y_train)
shap_values = shap.TreeExplainer(model).shap_values(X_train)

# Only the first entry of the SHAP values is plotted.
shap.summary_plot(shap_values[0],
                  X_train,
                  feature_names=X.columns,
                  plot_type="violin")
shap.summary_plot(shap_values[0],
                  X_train,
                  feature_names=X.columns,
                  plot_type="bar")
def reg_top10_lightGBM(merge_data, outname, memo):
    """Train a LightGBM regressor on ``merge_data`` and report SHAP/importances.

    The frame is split into train/validation/test sets, a regression model is
    trained with early stopping on the validation set, SHAP summary plots are
    drawn for the test set, RMSE and R2 are printed, and the test predictions
    together with both scores are written to a CSV file.

    Args:
        merge_data: DataFrame holding the features and a ``"target"`` column.
        outname: First part of the output CSV file name.
        memo: Suffix used in the CSV column names and the file name.

    Returns:
        tuple: ``(importance, shap_values)``, where ``importance`` is a
        DataFrame with ``importance`` and ``columns`` columns, and
        ``shap_values`` are the SHAP values computed on the test set.
    """
    from sklearn.metrics import mean_squared_error  # model evaluation (MSE)
    from sklearn.metrics import r2_score  # model evaluation (R2)
    from sklearn.model_selection import StratifiedShuffleSplit

    import lightgbm as lgb
    import shap

    # Separate the target variable from the features.
    X = merge_data.drop("target", axis=1).values
    y = merge_data["target"].values
    columns_name = merge_data.drop("target", axis=1).columns

    # Splitter shared by both splits below.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)

    def data_split(X, y):
        # n_splits=1, so the loop body runs exactly once.
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

        X_train = pd.DataFrame(X_train, columns=columns_name)
        X_test = pd.DataFrame(X_test, columns=columns_name)

        return X_train, y_train, X_test, y_test

    # Split into train, test and validation sets.
    X_train, y_train, X_test, y_test = data_split(X, y)
    X_train, y_train, X_val, y_val = data_split(X_train.values, y_train)

    # Check shapes.
    print("train shape", X_train.shape)
    print("test shape", X_test.shape)
    print("validation shape", X_val.shape)

    # Check shapes.
    print("y_train shape", y_train.shape)
    print("y_test shape", y_test.shape)
    print("y_validation shape", y_val.shape)

    y_test_df = pd.DataFrame(y_test)
    print("y_test describe", y_test_df.describe())
    print("not_ y_test describe", (~y_test_df.duplicated()).sum())
    print("y_test_df.duplicated().sum()", y_test_df.duplicated().sum())

    # Check the class proportions in each split.
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 3, 1)
    plt.hist(y_train)

    plt.subplot(1, 3, 2)
    plt.hist(y_test)

    plt.subplot(1, 3, 3)
    plt.hist(y_val)

    shap.initjs()

    # Build the datasets.
    train = lgb.Dataset(X_train, label=y_train)
    valid = lgb.Dataset(X_val, label=y_val)

    # Model parameters.
    params = {
        'task': 'train',  # training (as opposed to predict)
        'boosting_type': 'gbdt',  # gradient boosting
        'objective': 'regression',  # objective function: regression
        'metric': 'rmse',  # metric used to measure regression performance
        'learning_rate': 0.1  # learning rate
    }

    # Train the model.
    model = lgb.train(params,
                      train,
                      valid_sets=valid,
                      num_boost_round=3000,
                      early_stopping_rounds=100)

    # Predict.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # SHAP
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    shap.summary_plot(shap_values, X_test, plot_type="bar")

    # Plot Feature Importance - 'Dot' type
    shap.summary_plot(shap_values, X_test, plot_type='dot')

    # Show true values next to predictions.
    df_pred = pd.DataFrame({
        'regression_y_test': y_test,
        'regression_y_pred': y_pred
    })
    display(df_pred)

    # Scatter plot (true value vs. prediction).
    # Reference line y = x: points fall on it when prediction equals truth.
    # It is drawn from the range of the true values; plotting y_test against
    # y_pred as a line would only connect the scatter points.
    diagonal = [y_test.min(), y_test.max()]
    plt.plot(diagonal, diagonal, color='red', label='x=y')
    plt.scatter(y_test, y_pred)
    plt.xlabel('y_test')
    plt.ylabel('y_pred')
    plt.title('y vs y_pred')

    # Model evaluation.
    # rmse: square root of the mean squared error
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print('RMSE :', rmse)

    # r2: coefficient of determination
    r2 = r2_score(y_test, y_pred)
    print('R2 :', r2)

    # The scalar scores are repeated on every row of the output table.
    df_Df = pd.DataFrame({
        'regression_y_test_' + memo: y_test,
        'regression_y_pred_' + memo: y_pred,
        'RMSE_' + memo: rmse,
        'R2_' + memo: r2
    })
    df_Df.to_csv(r"" + "./20210201_output/" + 'DPC_g/20210415/' + outname +
                 memo + '.csv',
                 encoding='shift-jis')

    importance = pd.DataFrame(model.feature_importance(),
                              columns=['importance'])
    display(importance)

    # Label each importance with its feature name (same columns as above).
    importance["columns"] = list(columns_name)
    return importance, shap_values
# RandomOverSampler: over-sample the minority class(es) by picking samples at random with replacement. #clf = GridSearchCV(estimator=createPipeline(model, "RandomOverSampler", feature_selector=feature_selector), param_grid=model_hyperparams, cv=inner_cv, scoring="roc_auc", refit=True) clf = RandomizedSearchCV(estimator=createPipeline( model, "RandomOverSampler", feature_selector=feature_selector), param_distributions=model_hyperparams, cv=inner_cv, scoring="roc_auc", refit=True, random_state=0, n_iter=3) clf.fit(train_x, train_y.values.ravel()) # plot: interpret our model (the following SHAP related code only works for tree-based models) feature_names = train_x.columns[clf.best_estimator_.steps[1] [1].get_support(indices=True)] explainer = shap.TreeExplainer(clf.best_estimator_.steps[2][1]) test_current_fold = test_x[feature_names] shap_values = explainer.shap_values(test_current_fold) shap_current_fold = pd.DataFrame(data=shap_values[1], columns=feature_names) shap_all_folds = pd.concat([shap_all_folds, shap_current_fold], axis=0, sort=False) test_all_folds = pd.concat([test_all_folds, test_current_fold], axis=0, sort=False) # Collect results and parameters best_params = best_params + [clf.best_params_] * test_x.shape[0] cur_fold_pred = clf.predict(test_x).tolist()
if st.sidebar.button('Validate/See results'): # Preprocessing scaling(df) onehot(df) # Replacing columns int the same order as our model order(df) # Prediction prediction = XGB.predict(df) prob = XGB.predict_proba(df) # Create Tree Explainer object that can calculate shap values explainer = shap.TreeExplainer(XGB) # Calculate Shap values choosen_instance = df.loc[[0]] shap_values = explainer.shap_values(choosen_instance) # Printing results if prediction == 0: st.write('ACCEPTED', prob) st.write( "Under 0 you have the probability you won't be in defaut payment") st.write( "Under 1 you have the probability you will be in defaut payment") st.write('Comprehension of acceptance:') # visualize the first prediction's explanation (use matplotlib=True to avoid Javascript) st_shap(
def shap_call(xgb, sample = None, feats='all', nb_features_in_exp = None):
    """Explain one sample with SHAP and return the ids of its top features.

    Args:
        xgb: Wrapper exposing ``feature_names``, ``X_train``, ``transform``,
            ``model``, ``use_categorical``, ``categorical_features`` and
            ``categorical_names``.
        sample: Feature values of the instance to explain. When ``None`` no
            explanation is computed and ``None`` is returned.
        feats: ``'p'``/``'pos'``/``'+'`` keeps only non-negative
            contributions, ``'n'``/``'neg'``/``'-'`` keeps only negative
            ones; anything else keeps both.
        nb_features_in_exp: Maximum number of features to report; defaults to
            ``len(sample)``.

    Returns:
        list[int] | None: Sorted feature ids of the selected explanation.
    """
    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
            resource.getrusage(resource.RUSAGE_SELF).ru_utime

    # Map stripped feature names to their positions.
    f2imap = {}
    for i, f in enumerate(xgb.feature_names):
        f2imap[f.strip()] = i

    if sample is None:
        return None

    if nb_features_in_exp is None:
        nb_features_in_exp = len(sample)

    try:
        feat_sample = np.asarray(sample, dtype=np.float32)
    except (TypeError, ValueError):
        # Only conversion failures are expected here; anything else should
        # surface instead of being reported as a parse problem.
        print("Cannot parse input sample:", sample)
        exit()
    print("\n\n Starting SHAP explainer... \n Considering a sample with features:", feat_sample)
    if not (len(feat_sample) == len(xgb.X_train[0])):
        print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(xgb.X_train[0])))
        exit()

    # compute boost predictions
    feat_sample_exp = np.expand_dims(feat_sample, axis=0)
    feat_sample_exp = xgb.transform(feat_sample_exp)
    y_pred = xgb.model.predict(feat_sample_exp)[0]
    y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0]

    # No need to pass dataset as it is recorded in model
    # https://shap.readthedocs.io/en/latest/
    explainer = shap.TreeExplainer(xgb.model)
    shap_values = explainer.shap_values(feat_sample_exp)

    shap_values_sample = shap_values[-1]

    # we need to sum values per feature
    # https://github.com/slundberg/shap/issues/397
    if xgb.use_categorical:
        sum_values = []
        p = 0
        for f in xgb.categorical_features:
            nb_values = len(xgb.categorical_names[f])
            # Each feature owns nb_values consecutive transformed columns.
            sum_values.append(sum(shap_values_sample[p:p + nb_values]))
            p = p + nb_values
    else:
        sum_values = shap_values_sample

    # choose which features in the explanation to focus on
    if feats in ('p', 'pos', '+'):
        polarity = 1
    elif feats in ('n', 'neg', '-'):
        polarity = -1
    else:
        polarity = 0

    print("\t \t Explanations for the winner class", y_pred, " (xgboost confidence = ", y_pred_prob[int(y_pred)], ")")
    print("base_value = {}, predicted_value = {}".format(explainer.expected_value, np.sum(sum_values) + explainer.expected_value))

    expl = []
    # Visit features from the largest absolute contribution to the smallest.
    for k in np.argsort(-np.abs(sum_values)):
        v = sum_values[k]

        if (polarity == 1 and v < 0) or (polarity == -1 and v >= 0):
            continue

        name = xgb.feature_names[k]
        # f2imap is keyed by stripped names, so strip before the lookup.
        feature_id = f2imap[name.strip()]
        expl.append(feature_id)
        print("id = {}, name = {}, score = {}".format(feature_id, name, v))

        if len(expl) == nb_features_in_exp:
            break

    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
            resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
    print(' time: {0:.2f}'.format(timer))

    return sorted(expl[:nb_features_in_exp])
xg_model = clf.named_steps['xg_classifier'] xg_model = xg_model.get_booster() standard_scaler = clf.named_steps['scalar6'] model_bytearray = xg_model.save_raw()[4:] def myfun(self=None): return model_bytearray xg_model.save_raw = myfun data_shap = standard_scaler.transform(data_shap) bears = standard_scaler.transform(bears) if feature_importance == 1: shap_explainer = shap.TreeExplainer(xg_model, data_shap) shap_values = shap_explainer.shap_values(bears) bears_ids = bears_ids.reset_index(drop=True) result = pd.concat( [bears_ids, pd.DataFrame(shap_values, columns=data.columns)], axis=1, sort=False) result.to_csv('bears_shap.csv', index=False) ####Cross-Val Classifiers#### if cross_val == 1: classifier_cross_val = pd.DataFrame(data=None, columns=[
# Compare normalized gini and root mean squared error of the predictions
# against a benchmark that always predicts the mean.
y_test['ClaimAmountAvg'] = y_test.mean().values[0]
observed_claims = y_test['ClaimAmount']
benchmark_claims = y_test['ClaimAmountAvg']

print(normalized_gini(observed_claims, y_pred))  # 0.679
print(sqrt(mean_squared_error(observed_claims, y_pred)))  # 706
print(normalized_gini(observed_claims, benchmark_claims))  # 0.075
print(sqrt(mean_squared_error(observed_claims, benchmark_claims)))  # 711
# Mean Squared Error not much improved, but normalized gini much improved

# Use shap to plot LGBM Model
explainer = shap.TreeExplainer(regressor)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar")
# Past values most important for prediction

############
## Real Run
############

# Create Train/Test Splits: train on everything before the cut-off date and
# test on the current year (rows dated exactly at the cut-off).
date = pd.Timestamp(2019, 7, 1)
X_dates = X['MinDate']
y_dates = y['MinDate']

X_train = X.loc[X_dates < date]
y_train = y.loc[y_dates < date]
X_test = X.loc[X_dates == date]
y_test = y.loc[y_dates == date]
def get_shap_values(self):
    """Explain ``self.model`` on ``self.X_valid`` and draw a SHAP summary plot.

    Stores the explainer and the SHAP values on the instance.

    Returns:
        tuple: ``(self.explainer, self.shap_values)``.
    """
    validation_data = self.X_valid

    self.explainer = shap.TreeExplainer(self.model)
    self.shap_values = self.explainer.shap_values(validation_data)

    shap.summary_plot(self.shap_values, validation_data)

    return self.explainer, self.shap_values
#cf4 = confusion_matrix(y_true, y_predrf3) #print('tn, fp, fn, tp', cf4) #cf_normalize3 = (cf4-np.min(cf4))/np.ptp(cf4) #tnrf3, fprf3, fnrf3, tprf3 = cf4.ravel() #print(cf4.ravel()) #print("Accuracy: {}".format((tprf3+tnrf3)/len(y_test))) #print("Recall: {}".format(tprf3/(tprf3+fnrf3))) #print("Precision: {}".format(tprf3/(tprf3+fprf3))) #%% # ---------SHAP--------- to_train2 = np.delete(to_train, [xpos, ypos], axis=0) # DF, based on which importance is checked X_importance = X_test #%% # Explain model predictions using shap library: explainer = shap.TreeExplainer(xgb1) shap_values = explainer.shap_values(X_importance) #%% plt.figure() shap.summary_plot(shap_values, X_importance, max_display=20, feature_names=to_train, show=False) plt.tight_layout() plt.show() #plt.savefig('summaryplot.png', dpi=200) #%% shap.dependence_plot('ewdeep10_kurt', shap_values, X_importance,
] X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42) model = XGBClassifier(eta=0.05, n_estimators=121, max_depth=7, min_samples_split=50, min_samples_leaf=5, cv=5) model.fit(X_train, y_train) explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(X_train) #print ("a") #print(shap_values) #shap_l=shap_values.tolist() #print(shap_values) i = 0 #while i<3: #print(explainer.expected_value[i]) #print(shap_values[i]) #shap.dependence_plot(0, shap_values[0], X,X_name) # shap.force_plot(explainer.expected_value[0],shap_values[0],X_train) # i=i+1 shap.summary_plot(shap_values, X, X_name)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
my_model = RandomForestClassifier(random_state=0)
my_model = my_model.fit(train_X, train_y)

# SHAP values are inspected for one row of the validation data
# (row 5, picked arbitrarily). More rows could be used if desired.
row_to_show = 5
data_for_prediction = val_X.iloc[row_to_show]

# Reshape the single row into a (1, n_features) array for predict_proba.
data_for_prediction_array = data_for_prediction.values.reshape(1, -1)

# For context, look at the raw predictions before the SHAP values.
my_model.predict_proba(data_for_prediction_array)

# Explainer object that can calculate SHAP values for the tree model.
explainer = shap.TreeExplainer(my_model)

# Calculate SHAP values for the selected row.
shap_values = explainer.shap_values(data_for_prediction)

# Raw arrays are cumbersome to review; shap can visualize them instead.
shap.initjs()
shap.force_plot(
    explainer.expected_value[1],
    shap_values[1],
    data_for_prediction,
)

# The same explanation with KernelExplainer. The results are not identical
# because KernelExplainer gives an approximate result.
k_explainer = shap.KernelExplainer(my_model.predict_proba, train_X)

# Calculate SHAP values for the selected row with Kernel SHAP.
k_shap_values = k_explainer.shap_values(data_for_prediction)
shap.force_plot(
    k_explainer.expected_value[1],
    k_shap_values[1],
    data_for_prediction,
)
def classify(
    self, items, probabilities=False, importances=False, importance_cutoff=0.15
):
    """Classify one or more items, optionally explaining the prediction.

    Args:
        items: A single item or a list of items (dicts or tuples).
        probabilities: Use ``predict_proba`` instead of ``predict``.
        importances: Also compute SHAP-based feature importances and an HTML
            force plot.
        importance_cutoff: Forwarded to ``self.get_important_features``.

    Returns:
        The predicted classes, or ``(classes, details)`` when ``importances``
        is set, where ``details`` holds ``"importances"``, ``"html"`` and
        ``"feature_legend"``.
    """
    assert items is not None
    assert (
        self.extraction_pipeline is not None and self.clf is not None
    ), "The module needs to be initialized first"

    if not isinstance(items, list):
        items = [items]

    assert isinstance(items[0], dict) or isinstance(items[0], tuple)

    X = self.extraction_pipeline.transform(items)

    predict = self.clf.predict_proba if probabilities else self.clf.predict
    classes = self.overwrite_classes(items, predict(X), probabilities)

    if not importances:
        return classes

    explainer = shap.TreeExplainer(self.clf)
    shap_values = explainer.shap_values(X)

    important_features = self.get_important_features(
        importance_cutoff, shap_values
    )

    # Workaround: handle multi class case for force_plot to work correctly
    pred_class_index = 0
    if len(classes[0]) > 2:
        pred_class_index = classes.argmax(axis=-1)[0]
        explainer.expected_value = explainer.expected_value[pred_class_index]
        shap_values = shap_values[pred_class_index]

    pred_class = self.class_names[pred_class_index]

    # Column indexes of the important features of the predicted class.
    top_indexes = []
    for importance, index, is_positive in important_features["classes"][
        pred_class
    ][0]:
        top_indexes.append(int(index))

    feature_names = self.get_human_readable_feature_names()

    # The force plot labels features by 1-based rank; the legend maps each
    # rank back to a readable name.
    rank_labels = [str(rank + 1) for rank in range(len(top_indexes))]
    feature_legend = {
        label: feature_names[feature_i]
        for label, feature_i in zip(rank_labels, top_indexes)
    }

    with io.StringIO() as out:
        p = shap.force_plot(
            explainer.expected_value,
            shap_values[:, top_indexes],
            X.toarray()[:, top_indexes],
            feature_names=rank_labels,
            matplotlib=False,
            show=False,
        )

        # TODO: use full_html=False
        shap.save_html(out, p)

        html = out.getvalue()

    details = {
        "importances": important_features,
        "html": html,
        "feature_legend": feature_legend,
    }
    return classes, details
def upload(): print('eer 0', request.form) dropdown_selection = str(request.form) dropdown_selection = dropdown_selection.split() print(dropdown_selection) model_type = dropdown_selection[3] dropdown_selection = dropdown_selection[1] print('model type ji ', model_type) print(dropdown_selection, " nuna bhai") global id_name target = 'images/' print('tt', target) if not os.path.isdir(target): os.mkdir(target) global ff ff = [] for file in request.files.getlist("file"): print(file) filename = file.filename destination = "/".join([target, filename]) print('des', destination) file.save(destination) ff.append(destination) mypath = os.getcwd() onlyfiles = [ os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f)) ] print('raJA ', ff) import warnings warnings.filterwarnings("ignore") with open(ff[0], 'rb') as file: model = pickle.load(file) with open(ff[1], 'rb') as file: X_data = pickle.load(file) with open(ff[2], 'rb') as file: y_data = pickle.load(file) if 'GL' in dropdown_selection: if 'RR' in model_type: PI = permutation_importance(model, X_data, y_data) row_to_show = 5 data_for_prediction = X_data.iloc[row_to_show] explainer = shap.Explainer(model, X_data, feature_names=X_data.columns) shap_values = explainer.shap_values(X_data) shap.summary_plot(shap_values, X_data) import matplotlib.pyplot as pl pl.savefig('static/img/new_plot.png') pl.close() ICE = ind_cond_exp(model, X_data, y_data) #global surgat from sklearn.tree import DecisionTreeRegressor from sklearn.tree import plot_tree predictions = model.predict(X_data) dt = DecisionTreeRegressor(random_state=100, max_depth=3) # We fit the shallow tree to the matrix X and the predictions of the random forest model dt.fit(X_data, predictions) fig, ax = plt.subplots(figsize=(20, 10)) plot_tree(dt, feature_names=list(X_data.columns), precision=3, filled=True, fontsize=12, impurity=True) pl.savefig('static/img/new2_plot.png') pl.close() return 
render_template('model_explanation_result.html', PI=PI, ICE=ICE, SH="static/img/new_plot.png", SM="static/img/new2_plot.png") if 'RF' in model_type: PI = permutation_importance(model, X_data, y_data) explainer = shap.TreeExplainer(model, X_data, feature_names=X_data.columns) shap_values = explainer.shap_values(X_data) shap.summary_plot(shap_values, X_data) import matplotlib.pyplot as pl pl.savefig('static/img/new_plot.png') pl.close() ICE = ind_cond_exp(model, X_data, y_data) #global surgat from sklearn.tree import DecisionTreeRegressor from sklearn.tree import plot_tree predictions = model.predict(X_data) dt = DecisionTreeRegressor(random_state=100, max_depth=3) # We fit the shallow tree to the matrix X and the predictions of the random forest model dt.fit(X_data, predictions) fig, ax = plt.subplots(figsize=(20, 10)) plot_tree(dt, feature_names=list(X_data.columns), precision=3, filled=True, fontsize=12, impurity=True) pl.savefig('static/img/new2_plot.png') pl.close() return render_template('model_explanation_result.html', PI=PI, ICE=ICE, SH="static/img/new_plot.png", SM="static/img/new2_plot.png") if 'CC' in model_type: PI = permutation_importance(model, X_data, y_data) explainer = shap.KernelExplainer(model.predict_proba, X_data) shap_values = explainer.shap_values(X_data) shap.summary_plot(shap_values, X_data) import matplotlib.pyplot as pl pl.savefig('static/img/new_plot.png') pl.close() #ICE = ind_cond_exp(model,X_data,y_data) #global surgat from sklearn.tree import DecisionTreeRegressor from sklearn.tree import plot_tree predictions = model.predict(X_data) return render_template( 'model_explanation_result_classification.html', PI=PI, SH="static/img/new_plot.png") if 'WI' in dropdown_selection: # print(res," resss") # import dash from dash.dependencies import Input, Output import dash_table import dash_core_components as dcc import dash_html_components as html app = dash.Dash(__name__) import pandas as pd #should be X data mean_list = [] features = 
X_data.columns.tolist() for i in features: mean_list.append(round(X_data[i].mean())) explainer = shap.TreeExplainer(model) shap.initjs() params = features id_name_str = "my_graph" + str(id_name) print('---------------', id_name_str) id_name = id_name + 1 what_plot.layout = html.Div([ dash_table.DataTable( id='table-editing-simple', columns=([{ 'id': 'Model', 'name': 'Model' }] + [{ 'id': p, 'name': p } for p in params]), data=[ dict(zip(features, mean_list)) #dict(Model=i, **{param: mean_list[i] for param in params}) # for i in range(0, len(mean_list)) ], editable=True), html.Div(id=id_name_str) ]) @what_plot.callback(Output(id_name_str, "children"), Input('table-editing-simple', 'data'), Input('table-editing-simple', 'columns')) def update_graphs(rows, columns): df = pd.DataFrame(rows, columns=[c['name'] for c in columns]) print(rows) # rows = rows[0] col = [] vvalue = [] for key in rows: print(key, '->', int(rows[key])) col.append(key) vvalue.append([int(rows[key])]) ik = dict(zip(col, vvalue)) instance = pd.DataFrame.from_dict(ik) print('instancceee ', instance) from shap.plots._force_matplotlib import draw_additive_plot # explain the model's predictions using SHAP values(same syntax works for LightGBM, CatBoost, and scikit-learn models) #explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(instance) shap.initjs() #plt.style.use("_classic_test_patch") ytu = model.predict(instance) print('ress ', ytu) koko = _force_plot_html2(explainer.expected_value, shap_values, instance) #print('kkkk ',koko) print('Done') return koko # return render_template('local_explain_lime.html', LL=what_plot.index()) if 'LL' in dropdown_selection: None #table and plots ======================================================== import dash from dash.dependencies import Input, Output import dash_table import dash_core_components as dcc import dash_html_components as html import pandas as pd id_name_str = "my_graph" + str(id_name) print('---------------', id_name_str) 
id_name = id_name + 1 print('in LL') # make graph=============================================================== table_plot.layout = html.Div([ dash_table.DataTable( id='datatable-interactivity', columns=[{ "name": i, "id": i, "deletable": True, "selectable": True } for i in X_data.columns], data=X_data.to_dict('records'), editable=True, filter_action="native", sort_action="native", sort_mode="multi", column_selectable="single", row_selectable="single", row_deletable=True, selected_columns=[], selected_rows=[], page_action="native", page_current=0, page_size=10, ), html.Div(id=id_name_str) ]) print('miod LL') @table_plot.callback(Output(id_name_str, "children"), Input('datatable-interactivity', "derived_virtual_data"), Input('datatable-interactivity', "derived_virtual_selected_rows")) def update_graphs(rows, derived_virtual_selected_rows): # When the table is first rendered, `derived_virtual_data` and # `derived_virtual_selected_rows` will be `None`. This is due to an # idiosyncrasy in Dash (unsupplied properties are always None and Dash # calls the dependent callbacks when the component is first rendered). # So, if `rows` is `None`, then the component was just rendered # and its value will be the same as the component's dataframe. # Instead of setting `None` in here, you could also set # `derived_virtual_data=df.to_rows('dict')` when you initialize # the component. 
if derived_virtual_selected_rows is None: derived_virtual_selected_rows = [] dff = X_data if rows is None else pd.DataFrame(rows) colors = [ '#7FDBFF' if i in derived_virtual_selected_rows else '#0074D9' for i in range(len(dff)) ] print('my value', derived_virtual_selected_rows) print('i am row ', X_data.iloc[derived_virtual_selected_rows]) print(type(derived_virtual_selected_rows)) from shap.plots._force_matplotlib import draw_additive_plot ttt = X_data.loc[derived_virtual_selected_rows] # explain the model's predictions using SHAP values(same syntax works for LightGBM, CatBoost, and scikit-learn models) explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(ttt) shap.initjs() plt.style.use("_classic_test_patch") bubu = _force_plot_html(explainer.expected_value, shap_values, ttt) shap_values = explainer.shap_values(X_data) #shap.force_plot(explainer.expected_value, shap_values, X_data) explain_all = _force_plot_html(explainer.expected_value, shap_values, X_data) print('bubu ', bubu) return bubu, explain_all return render_template('local_explain_lime.html', LL=table_plot.index()) if 'BD' in dropdown_selection: None #FI if 'DB' in dropdown_selection: # if 'CC' in model_type: # from explainerdashboard import ClassifierExplainer, ExplainerDashboard # ExplainerDashboard(ClassifierExplainer(model, X_data, y_data)).run() if 'RF' in model_type: import threading import time def dashboard_exp(model, X_data, y_data): import dash_bootstrap_components as dbc from explainerdashboard import RegressionExplainer, ExplainerDashboard ExplainerDashboard( RegressionExplainer(model, X_data, y_data), bootstrap=dbc.themes.SANDSTONE, importances=True, model_summary=False, contributions=True, whatif=True, shap_dependence=False, shap_interaction=False, decision_trees=False, hide_whatifindexselector=True, hide_whatifprediction=True, hide_inputeditor=False, hide_whatifcontributiongraph=False, hide_whatifcontributiontable=True, hide_whatifpdp=False, 
hide_predindexselector=True, hide_predictionsummary=True, hide_contributiongraph=False, hide_pdp=False, hide_contributiontable=True, hide_dropna=True, hide_range=True, hide_depth=True, hide_sort=True, hide_sample=True, # hide sample size input on pdp component hide_gridlines=True, # hide gridlines on pdp component hide_gridpoints=True, hide_cats_sort= True, # hide the sorting option for categorical features hide_cutoff= True, # hide cutoff selector on classification components hide_percentage= True, # hide percentage toggle on classificaiton components hide_log_x= True, # hide x-axis logs toggle on regression plots hide_log_y= True, # hide y-axis logs toggle on regression plots hide_ratio=True, # hide the residuals type dropdown hide_points= True, # hide the show violin scatter markers toggle hide_winsor=True, # hide the winsorize input hide_wizard= True, # hide the wizard toggle in lift curve component hide_star_explanation=True, ).run() t1 = threading.Thread(target=dashboard_exp, args=(model, X_data, y_data)) t1.start() return '''<H2>
def _positive_class_shap(shap_values):
    """Return the SHAP matrix of class index 1 from a TreeExplainer result."""
    # Older SHAP versions return one (samples, features) array per class in a
    # list; newer ones return a single (samples, features, classes) array.
    if isinstance(shap_values, list):
        return shap_values[1]
    values = np.asarray(shap_values)
    if values.ndim == 3:
        return values[:, :, 1]
    return shap_values[1]


def _rank_weighted_columns(importances, model, x_fit, y_fit, x_val, y_val,
                           columns):
    """Turn one selector's importance scores into ``(column, weight)`` pairs.

    The weight is the squared validation accuracy of *model* when trained on
    the features ranked 10 or better; the returned columns are those ranked
    400 or better (ties share the smallest rank, so more may be returned).
    """
    rank = pd.Series(importances).rank(ascending=False, method='min')

    # First-pass quality of this selector: train once on its best features.
    top_idx = rank[rank <= 10].index
    model.fit(x_fit.iloc[:, top_idx], y_fit)
    accuracy_val = accuracy_score(y_val, model.predict(x_val.iloc[:, top_idx]))
    weight = float(accuracy_val**2)

    selected_columns = columns[rank[rank <= 400].index]
    return [(col, weight) for col in selected_columns]


def ensemble2_resampling_weight(n_resampling,
                                n_estimators,
                                featurenum,
                                seed,
                                x_train=None,
                                y_train=None,
                                sample_size=200):
    """Select features by an accuracy-weighted vote of several selectors.

    For each resampling round, ``sample_size`` rows are drawn without
    replacement; the remaining rows act as a validation set inside the
    training data. Five selectors are fitted on the sample (two tree
    importances, two ``SelectKBest`` scorers and one SHAP-based ranking; RFE
    is left out because it is too slow). Each selector's chosen columns are
    credited with that selector's squared validation accuracy, and the credits
    are summed per column over all selectors and rounds.

    Args:
        n_resampling: Number of resampling rounds.
        n_estimators: Number of trees for the tree-based selectors.
        featurenum: Number of top-weighted columns to return.
        seed: Seed for NumPy's global RNG and for the selectors.
        x_train: Feature DataFrame.
        y_train: Target Series aligned with ``x_train``.
        sample_size: Rows drawn per round (default 200, the original fixed
            value).

    Returns:
        Up to ``featurenum`` column labels, highest total weight first. Fewer
        are returned if fewer columns were ever selected.

    Raises:
        ValueError: From ``np.random.choice`` when ``sample_size`` exceeds the
            number of rows in ``x_train``.
    """
    from itertools import zip_longest

    np.random.seed(seed=seed)  # set the seed
    # Model used to score each selector; it can be swapped for another one.
    model = RandomForestClassifier()
    n_rows = x_train.shape[0]
    selections = []  # one list of (column, weight) pairs per selector run

    for _ in range(n_resampling):
        sample = np.random.choice(n_rows, sample_size, replace=False)
        x_train_sample = x_train.iloc[sample, :]
        y_train_sample = y_train.iloc[sample]

        # Everything outside the sample is the in-training validation set.
        # setdiff1d returns the remaining positions in ascending order.
        not_sample = np.setdiff1d(np.arange(n_rows), sample)
        x_train_val = x_train.iloc[not_sample, :]
        y_train_val = y_train.iloc[not_sample]

        fs_model_tree = [
            RandomForestClassifier(random_state=seed,
                                   n_estimators=n_estimators),
            ExtraTreesClassifier(random_state=seed, n_estimators=n_estimators)
        ]
        fs_model_kb = [
            SelectKBest(chi2, k='all'),
            SelectKBest(f_classif, k='all')
        ]
        fs_model_shap = [
            RandomForestClassifier(random_state=seed,
                                   n_estimators=n_estimators)
        ]
        split = (x_train_sample, y_train_sample, x_train_val, y_train_val,
                 x_train.columns)

        # The fit order (selector, then scoring model) is kept as-is because
        # the unseeded scoring model draws from the global RNG.

        # tree importance
        for fs_model in fs_model_tree:
            fs_model.fit(x_train_sample, y_train_sample)
            selections.append(
                _rank_weighted_columns(fs_model.feature_importances_, model,
                                       *split))

        # univariate scores; undefined scores count as zero importance
        for fs_model in fs_model_kb:
            fs_model.fit(x_train_sample, y_train_sample)
            selections.append(
                _rank_weighted_columns(np.nan_to_num(fs_model.scores_), model,
                                       *split))

        # mean absolute SHAP value per feature
        for fs_model in fs_model_shap:
            fs_model.fit(x_train_sample, y_train_sample)
            shap_values = shap.TreeExplainer(fs_model).shap_values(
                x_train_sample)
            shap_mean = np.mean(np.abs(_positive_class_shap(shap_values)),
                                axis=0)
            selections.append(
                _rank_weighted_columns(shap_mean, model, *split))

    # Sum the weights of every selected column. Walking the selections
    # rank-by-rank keeps the accumulation order of equal-length selections
    # stable, and zip_longest tolerates selectors that returned different
    # numbers of columns.
    counts = {}
    for same_rank_entries in zip_longest(*selections):
        for entry in same_rank_entries:
            if entry is None:
                continue
            col, weight = entry
            counts[col] = counts.get(col, 0.0) + weight

    sorted_counts = sorted(counts.items(),
                           key=lambda item: item[1],
                           reverse=True)
    # Keep the requested number of top-weighted columns.
    return [col for col, _ in sorted_counts[:max(featurenum, 0)]]
def upload2():
    """Render a LIME explanation for one instance submitted through the form.

    The model and data are unpickled from the paths stored in the module-level
    ``ff`` list. Every submitted form value is converted to ``float`` and the
    resulting vector is explained with a regression ``LimeTabularExplainer``.
    The matplotlib rendering is converted to HTML with ``mpld3``, written to
    ``templates/lime.html`` and passed to the ``local_result.html`` template
    together with LIME's own HTML output.

    Returns:
        The rendered ``local_result.html`` template.
    """
    # SECURITY NOTE(review): `ff` is presumably filled with paths of
    # user-uploaded files; unpickling untrusted data can execute arbitrary
    # code. Confirm the upload source is trusted or switch to a safe format.
    with open(ff[0], 'rb') as file:
        model = pickle.load(file)
    with open(ff[1], 'rb') as file:
        X_data = pickle.load(file)
    with open(ff[2], 'rb') as file:
        y_data = pickle.load(file)  # loaded as before; not used in this view

    print('start')
    print(request.form)
    hh = request.form.to_dict(flat=False)
    print('hh ', hh)

    for file in request.files.getlist("gg"):
        print(file)

    feature_columns = list(X_data.columns)
    print(feature_columns)

    # Each form field maps to a list of submitted strings; flatten them, in
    # field order, into one numeric vector.
    submitted = list(hh.values())
    print('im a he ', submitted)
    series = np.array([float(value) for values in submitted for value in values])
    print('im a she ', series)

    # lime
    import lime
    from lime.lime_tabular import LimeTabularExplainer
    explainer = LimeTabularExplainer(X_data,
                                     mode='regression',
                                     feature_names=feature_columns,
                                     random_state=42,
                                     discretize_continuous=False,
                                     kernel_width=0.2)

    exp = explainer.explain_instance(series, model.predict)
    print(exp.local_pred)

    fig = exp.as_pyplot_figure(label=feature_columns)
    fig_2 = exp.as_html(labels=feature_columns)
    plt.tight_layout()

    html_str = mpld3.fig_to_html(fig)
    # Close the figure once it is converted; otherwise pyplot keeps one more
    # open figure alive for every request.
    plt.close(fig)

    # Flask reads templates as UTF-8, so write the file the same way.
    with open("templates/lime.html", "w", encoding="utf-8") as html_file:
        html_file.write(html_str)

    return render_template('local_result.html',
                           LIME=html_str,
                           SH=fig_2,
                           gh=html_str)
def _one_hot_encode(frame, columns):
    """Replace each of *columns* by its ``pd.get_dummies`` indicator columns."""
    for col in columns:
        dummies = pd.get_dummies(frame[col], prefix=col)
        frame = pd.concat([frame, dummies], axis=1)
        del frame[col]
    return frame


def _show_shap_summaries(model, X, importance_title, correlation_title):
    """Render the SHAP bar summary and the default SHAP summary in Streamlit."""
    explainer = shap.TreeExplainer(model, X)
    shap_values = explainer.shap_values(X)

    plots = ((importance_title, {'plot_type': 'bar'}), (correlation_title, {}))
    for title, plot_kwargs in plots:
        st.subheader(title)
        plt.figure(figsize=(5, 5))
        # summary_plot returns None, so keep it from showing the figure and
        # hand the figure to Streamlit explicitly instead of relying on the
        # global pyplot figure.
        shap.summary_plot(shap_values, X, show=False, **plot_kwargs)
        fig = plt.gcf()
        st.pyplot(fig)
        plt.close(fig)


def run_edu_score_prediction_app():
    """Streamlit page that predicts scores and explains them with SHAP.

    With an uploaded CSV, a ``RandomForestRegressor`` is fitted on the
    uploaded data and the predicted score for every row is shown. Without an
    upload, the sidebar widgets define a single record that is scored by the
    pickled model in ``data/subject_score_prediction.pkl``. In both cases the
    R^2 score on a 20% split and two SHAP summary plots are displayed.

    Any exception raised while building the page is printed and shown as a
    Streamlit error instead of being propagated.
    """
    st.header('■Score prediction Demo')
    st.write(
        'To predict the expected score (e.g., for students who are absent from the test.)'
    )
    st.sidebar.subheader('Data Upload')

    df_edu = pd.read_csv("data/eng_sample_data_score_prediction.csv")

    def download_link(object_to_download, download_filename,
                      download_link_text):
        """Return an HTML anchor embedding the object as a base64 data URI."""
        if isinstance(object_to_download, pd.DataFrame):
            object_to_download = object_to_download.to_csv(
                index=False, encoding='utf_8_sig')
        b64 = base64.b64encode(object_to_download.encode()).decode()
        return f'<a href="data:file/txt;base64,{b64}" download="{download_filename}">{download_link_text}</a>'

    tmp_download_link = download_link(df_edu, 'sample_score_pred.csv',
                                      'Download sample csv file.')
    st.sidebar.markdown(tmp_download_link, unsafe_allow_html=True)

    encode = ['Class', 'Subject']

    try:
        uploaded_file = st.sidebar.file_uploader(
            "File upload (Drag and drop or use [Browse files] button to import csv file. Only utf-8 format is available.)",
            type=["csv"])

        if uploaded_file is not None:
            df_edu = pd.read_csv(uploaded_file)
            uploaded_file.seek(0)
            if st.sidebar.checkbox(label='Show uploaded data'):
                st.dataframe(df_edu)

            df = _one_hot_encode(df_edu.drop(['ID', 'Teacher'], axis=1),
                                 encode)
            X = df.drop(['Score'], axis=1)
            Y = df['Score']
            _, X_test, _, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

            clf = RandomForestRegressor()
            clf.fit(X, Y)
            prediction = clf.predict(X)

            st.subheader('Score prediction result')
            student_ids = pd.DataFrame(df_edu['ID'])
            pred_result = pd.concat(
                [student_ids, pd.DataFrame(prediction)], axis=1)
            pred_result = pred_result.rename(columns={0: 'Result'})
            st.dataframe(pred_result)

            # NOTE(review): the model is fitted on all rows, so X_test was
            # seen during training and this score is optimistic. Kept as in
            # the original; confirm whether a held-out score is wanted.
            score = clf.score(X_test, y_test)
            st.subheader('Prediction accuracy')
            st.write(score)

            _show_shap_summaries(
                clf, X,
                'Impact of explanatory variables (each item score) on the objective variable (final score)',
                'Correlation of explanatory variables with the objective variable (final score)'
            )
        else:

            def user_input_features():
                """Collect one record from the sidebar widgets as a one-row frame."""
                class_name = st.sidebar.selectbox(
                    'Class', ('A', 'B', 'C', 'D', 'E', 'F', 'G'))
                subject = st.sidebar.selectbox(
                    'Subject', ('Literature', 'Math', 'Reading'))
                subject_A = st.sidebar.slider('item1', 0, 100, 50)
                subject_B = st.sidebar.slider('item2', 0, 100, 50)
                subject_C = st.sidebar.slider('item3', 0, 100, 50)
                subject_D = st.sidebar.slider('item4', 0, 100, 50)
                subject_E = st.sidebar.slider('item5', 0, 100, 50)
                data = {
                    'Class': class_name,
                    'Subject': subject,
                    'item1': subject_A,
                    'item2': subject_B,
                    'item3': subject_C,
                    'item4': subject_D,
                    'item5': subject_E
                }
                return pd.DataFrame(data, index=[0])

            input_df = user_input_features()

            sample_data = pd.read_csv(
                'data/eng_sample_data_score_prediction.csv')
            sample = sample_data.drop(columns=['Score', 'ID', 'Teacher'])
            # The user's record is encoded together with the sample rows and
            # then taken back out as the first row (presumably so that every
            # class/subject indicator column exists - verify).
            df = _one_hot_encode(pd.concat([input_df, sample], axis=0),
                                 encode)
            user_row = df[:1]

            st.write(
                'The following is default sample data. Select class and subject then use the sliders for each item in the sidebar to get an idea of score prediction.'
            )
            st.write(user_row)

            with open('data/subject_score_prediction.pkl', 'rb') as model_file:
                load_clf = pickle.load(model_file)

            prediction = load_clf.predict(user_row)
            st.subheader('Score prediction result')
            st.write(prediction[0])

            encoded_sample = _one_hot_encode(sample_data.copy(), encode)
            X = encoded_sample.drop(['Score', 'Teacher', 'ID'], axis=1)
            Y = encoded_sample['Score']
            _, X_test, _, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)
            score = load_clf.score(X_test, y_test)
            st.subheader('Prediction accuracy')
            st.write(score)

            _show_shap_summaries(
                load_clf, X,
                'Impact of explanatory variables (each item score, class and subject) on the objective variable (final score)',
                'Correlation of explanatory variables (each item score, class and subject) with the objective variable (final score)'
            )

    except Exception as e:
        # UI boundary: keep the page alive, but make the failure visible in
        # the app as well as on stdout.
        print(e)
        st.error(f'Score prediction failed: {e}')
missing_columns = (set(base_table.columns) - set(df.columns)) - NON_FEATURE_COLUMNS for col in missing_columns: df[col] = float('nan') columns_to_drop = list(set(df.columns) - set(base_table.columns)) df = df.drop(columns_to_drop, axis=1) #Rearrange columns to input into model df = df[base_table.columns] #Now, create the predictions predicted_risk = churn_model.predict_proba(df) predicted_risk_df = pd.DataFrame(predicted_risk) explainer = shap.TreeExplainer(churn_model) shap_values = explainer.shap_values(df) shap_df = pd.DataFrame(shap_values) shap_df.columns = df.columns shap_df["churn_prob"] = predicted_risk_df[1] shap_df["key"] = keys shap_right = shap_df shap_df[['Customer', 'Product']] = shap_df['key'].str.split('_', n=1, expand=True) shap_df = pd.merge(shap_df, value_df, on='key', how='left') get_cols = ['Customer', 'Product'] + [
# Wrap the single user-supplied record in a one-row frame and load the model.
user_df = pd.DataFrame([user_input], columns=feature_names)
clf = joblib.load("penguin_clf.joblib")

# Predicted class, shown in the sidebar.
prediction = clf.predict(user_df)[0]
class_prediction = class_names[prediction]
st.sidebar.write(f"## Prediction: {class_prediction}")

# Per-class probabilities, also shown in the sidebar.
proba = clf.predict_proba(user_df)
proba_df = pd.DataFrame(proba, columns=class_names)
st.sidebar.write(proba_df)

st.write(f"## Explanation for Predicting: **{class_prediction}**")
st.subheader("SHAP values")

# Everything but the last step of `clf` transforms the input; the last step
# is the model handed to TreeExplainer.
user_encoded = clf[:-1].transform(user_df)
explainer = shap.TreeExplainer(clf[-1])
first_row = user_encoded[[0], :]
shap_values = explainer.shap_values(first_row, check_additivity=False)

# One force plot per output index 0..2, concatenated into a single HTML string.
shap_plot_reprs = []
for i in range(3):
    shap_plot = shap.force_plot(
        explainer.expected_value[i],
        shap_values[i],
        user_encoded,
        feature_names=feature_names,
        out_names=class_names[i],
    )
    shap_plot_reprs.append(shap_plot._repr_html_())

shap_html_repr = "".join(shap_plot_reprs)
def return_weights_from_xgboost(
    geodataframe,
    raster_path,
    pop_string,
    codes=[21, 22, 23, 24],
    n_pixels_option_values=256,
    tuned_xgb=False,
    gbm_hyperparam_grid={
        "learning_rate": [0.001, 0.01, 0.1],
        "n_estimators": [200],
        "subsample": [0.3, 0.5],
        "max_depth": [4, 5, 6],
        "num_boosting_rounds": [10, 20],
    },
    force_crs_match=True,
    na_value=255,
    ReLU=True,
):
    """Function that returns the weights of each land type according to NLCD types/codes given by Extreme Gradient Boost model (XGBoost)

    Parameters
    ----------

    geodataframe           : a geopandas geoDataFrame used to build regression

    raster_path            : the path to the associated raster image.

    pop_string             : the name of the variable on geodataframe that the regression shall be conducted

    codes                  : an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD).
                             The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
                             The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
                             The list is not modified; a sorted copy is used internally.

    n_pixels_option_values : number of options of the pixel values of the raster. Default is 256.

    tuned_xgb              : bool. Default is False.
                             If True the XGBoost model will be tuned making a grid search using gbm_hyperparam_grid dictionary and picking the best model in terms of mean squared error with some pre-defined number of cross-validation.
                             Otherwise, the XGBoost model is fitted with default values of xgboost.train function from xgboost Python library.

    gbm_hyperparam_grid    : a dictionary that represent the grid for the grid search of XGBoost.

    force_crs_match        : bool. Default is True.
                             Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                             It is recommended to let this argument as True.

    na_value               : int. Default is 255.
                             The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.

    ReLU                   : bool. Default is True.
                             Whether the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types.

    Returns
    -------

    weights : numpy array of length n_pixels_option_values, zero everywhere except at the positions given by codes.

    Raises
    ------

    ValueError  : if na_value is one of codes.

    ImportError : if xgboost or shap (or sklearn, when tuned_xgb is True) is not installed.

    Notes
    -----

    1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.
    2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256.
    3) The returning weights represent the average of the Shapley's values from each feature.

    """
    # Validate the cheap arguments before any raster I/O or heavy import.
    if na_value in codes:
        raise ValueError("codes should not assume the na_value value.")

    raster_path = fetch_quilt_path(raster_path)

    try:
        import xgboost as xgb
        import shap
    except ImportError as e:
        raise ImportError("xgboost and shap are required to perform this.") from e

    _check_presence_of_crs(geodataframe)

    print("Appending profile...")
    # Use only two columns to build the weights (this avoids errors if the
    # original dataset already has types appended to it).
    profiled_df = fast_append_profile_in_gdf(
        geodataframe[["geometry", pop_string]], raster_path, force_crs_match
    )
    print("Append profile: Done.")

    # Sort a copy so the position of the weights matches the codes without
    # mutating the caller's list (or the shared default list).
    codes = sorted(codes)
    feature_names = ["Type_" + str(code) for code in codes]

    y = profiled_df[pop_string]
    X = profiled_df[feature_names]

    print("Starting to fit XGBoost...")
    # "reg:squarederror" is the current name of the squared-error objective
    # formerly called "reg:linear".
    objective = "reg:squarederror"

    if tuned_xgb:
        try:
            from sklearn.model_selection import GridSearchCV
        except ImportError as e:
            raise ImportError("sklearn is required to perform this.") from e

        grid_mse = GridSearchCV(
            estimator=xgb.XGBRegressor(),
            param_grid=gbm_hyperparam_grid,
            scoring="neg_mean_squared_error",
            cv=4,  # 4-fold crossvalidation
            verbose=3,  # Prints the grid search profile
            n_jobs=-1,  # Process the grid search in parallel on all cores
        )

        # Fit the grid to the data
        grid_mse.fit(X, y)

        # Copy so the fitted search object's best_params_ is left untouched.
        params = dict(grid_mse.best_params_)
        params["objective"] = objective
    else:
        params = {"objective": objective}

    # Train the final model on the full data with the chosen parameters.
    xgb_dmatrix = xgb.DMatrix(X, y)
    xg_reg = xgb.train(params=params, dtrain=xgb_dmatrix)

    # Build explainer and fit Shapley's values (https://github.com/slundberg/shap)
    explainer = shap.TreeExplainer(xg_reg)
    shap_values = explainer.shap_values(X)
    # Mean SHAP value per feature; already ordered like the sorted codes.
    weights_from_xgb = shap_values.mean(axis=0)

    weights = np.zeros(n_pixels_option_values)
    weights[codes] = list(weights_from_xgb)

    if ReLU:
        weights = np.where(weights < 0, 0, weights)

    return weights
y=data['y'], model_scheme='LMP', cv=5, #grid_search=True, #grid_search_scoring='r2', #param_grid=parameter_grid, eval_metric='rmse', parameters=parameters, CT_Temp=CT_Temp, CT_RT=CT_RT, C=C) catboost.run_model() print(catboost.__dict__) ''' catboost.parity_plot(data='train', quantity='LMP').savefig('parity_LMP_train.png') catboost.parity_plot(data='test', quantity='LMP').savefig('parity_LMP_test.png') catboost.parity_plot(data='train', quantity='CT_RT').savefig('parity_CT_RT_train.png') catboost.parity_plot(data='test', quantity='CT_RT').savefig('parity_CT_RT_test.png') np.save('catboost_dict.npy', catboost.__dict__) plt.clf() ''' explainer = shap.TreeExplainer(catboost.model[-1]) shap_values = explainer.shap_values(data['X']) XX = scale.inverse_transform(data['X']) X = pd.DataFrame(XX, columns=data['features']) # summarize the effects of all the features shap.summary_plot(shap_values, X, plot_type="bar", show=False) plt.savefig('feature_importance.png', dpi=150, bbox_inches='tight')
# Share of test predictions that match the labels, as a percentage.
success_rate = (np.sum(xgboost_test_predictions == Y_test) /
                len(xgboost_test_predictions) * 100)
print(f"Model has {success_rate:.3} % success rate")

# Make plots of different importance scores, one image file per score type.
for score in ('weight', 'gain', 'cover'):
    xgb.plot_importance(bst,
                        importance_type=score,
                        show_values=False,
                        xlabel=score)
    plt.savefig(f'xkb_{score}.png')
""" Analyze using Shapeley values """

# This hack is needed for the current version of xgboost with SHAP: the
# booster's save_raw is replaced by a function returning the raw model
# without its first four bytes.
model_bytearray = bst.save_raw()[4:]


def myfun(self=None):
    return model_bytearray


bst.save_raw = myfun

# Get the explainer for the xgboost model and calculate the Shapley values.
explainer = shap.TreeExplainer(bst)
shap_values = explainer.shap_values(X_test)

# Summarize the effects of all the features.
shap.summary_plot(shap_values, X_test)

# Dependence plot for "occupation" with no interaction index; show=False
# leaves the figure undisplayed.
shap.dependence_plot("occupation",
                     shap_values,
                     X_test,
                     interaction_index=None,
                     show=False)