def training(train, test, validation_size, estimator, target_variable, drop_list,
             target_type, cv_folds, scoring_cv, cv=True, final=False, hypertuning=False):
    import matplotlib.pyplot as plt
    import pandas as pd
    import lightgbm as lgbm
    import training
    import os
    import sklearn
    import numpy as np
    import seaborn as sns
    import re
    import math
    from datetime import datetime
    import statsmodels.api as sm
    from scipy import stats
    from sklearn.feature_selection import SelectFromModel
    from sklearn.model_selection import (train_test_split, cross_val_score, validation_curve,
                                         RandomizedSearchCV, GridSearchCV, KFold)
    from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                                  GradientBoostingRegressor, GradientBoostingClassifier)
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn import ensemble
    from sklearn.linear_model import LogisticRegression, LinearRegression
    from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
    from sklearn.impute import SimpleImputer
    from sklearn.metrics import (confusion_matrix, accuracy_score, recall_score, roc_curve,
                                 roc_auc_score, plot_roc_curve, mean_squared_error,
                                 mean_squared_log_error, make_scorer)
    import xgboost
    import shap
    from catboost import CatBoostClassifier
    from catboost import CatBoostRegressor
    import optuna.integration.lightgbm as lgb
    from optuna.integration import _lightgbm_tuner as tuner
    from optuna.integration._lightgbm_tuner import LightGBMTuner
    from optuna.integration._lightgbm_tuner import LightGBMTunerCV

    # `score_func` is assumed to be defined at module level (e.g. an RMSLE implementation);
    # the resulting scorer is not used further down in this function.
    rmsle_scorer = make_scorer(score_func)

    train_y = train[target_variable]
    train_x = train.drop(columns=drop_list)
    test_y = test[target_variable]
    test_x = test.drop(columns=drop_list)
    column_names = list(train_x.columns)

    if final == True:
        # Refit on the full data set (train + test) for the final model.
        train_x = train_x.append(test_x)
        train_y = train_y.append(test_y)

    if target_type == "bin":
        if estimator == "log_sk":
            model = LogisticRegression(max_iter=1000)
            log_sk = model.fit(train_x, train_y)
            fitted_model = log_sk

        if estimator == "gb" and hypertuning == False:
            model = ensemble.GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=100)
            gb = model.fit(train_x, train_y)
            fitted_model = gb

        if estimator == "gb" and hypertuning == True:
            param_grid = {
                'n_estimators': [100, 200, 400],
                'max_depth': [3, 5, 7],
                'learning_rate': [0.1, 0.05, 0.025, 0.01, 0.001, 0.005],
                'random_state': [42]
            }
            gb = ensemble.GradientBoostingClassifier()
            gb_grid = GridSearchCV(gb, param_grid, cv=cv_folds, scoring=scoring_cv)
            gb_grid.fit(train_x, train_y)
            print('Optimal parameters for gradient boosting classifier = ', gb_grid.best_params_)
            gb = gb_grid.best_estimator_
            fitted_model = gb

        if estimator == "rf" and hypertuning == False:
            model = ensemble.RandomForestClassifier(max_depth=80, max_features=5, min_samples_leaf=3,
                                                    min_samples_split=12, n_estimators=100)
            rf = model.fit(train_x, train_y)
            fitted_model = rf

        if estimator == "rf" and hypertuning == True:
            param_grid = {
                'bootstrap': [True],
                'max_depth': [10, 20, 30],
                'max_features': [2, 3, 5],
                'min_samples_leaf': [3, 5, 10],
                'min_samples_split': [8, 12],
                'n_estimators': [100, 300, 500],
                'n_jobs': [3]
            }
            rf = RandomForestClassifier()
            rf_grid = GridSearchCV(rf, param_grid, cv=cv_folds, scoring=scoring_cv)
            rf_grid.fit(train_x, train_y)
            print('Optimal parameters for random forest classifier = ', rf_grid.best_params_)
            rf = rf_grid.best_estimator_
            fitted_model = rf

        if cv and hypertuning == False:
            cross_val_accuracy = cross_val_score(estimator=model, X=train_x, y=train_y,
                                                 cv=cv_folds, scoring=scoring_cv)
            print(f'The average cross validation accuracy of the model is {round(cross_val_accuracy.mean(), 2)}')
            print(cross_val_accuracy)

    if target_type == "con":
        if estimator == "lgbm" and hypertuning == False:
            train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y,
                                                                  test_size=validation_size,
                                                                  shuffle=True, random_state=42)
            train_data = lgb.Dataset(train_x, label=train_y)
            valid_data = lgb.Dataset(valid_x, label=valid_y)
            model = lgbm.LGBMRegressor(random_state=42, n_estimators=1000)
            lgbm_model = model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
                                   eval_metric=scoring_cv, verbose=-1)
            fitted_model = lgbm_model

        if estimator == "lin_reg" and hypertuning == False:
            model = LinearRegression()
            lin_reg = model.fit(train_x, train_y)
            fitted_model = lin_reg

        if estimator == "gb" and hypertuning == False:
            model = ensemble.GradientBoostingRegressor(learning_rate=0.001, max_depth=5, n_estimators=100)
            gb = model.fit(train_x, train_y)
            fitted_model = gb

        if estimator == "rf" and hypertuning == False:
            model = ensemble.RandomForestRegressor(max_depth=30, max_features=5, min_samples_leaf=3,
                                                   min_samples_split=8, n_estimators=500, n_jobs=-1)
            rf = model.fit(train_x, train_y)
            fitted_model = rf

        if estimator == "gb" and hypertuning == True:
            # {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'random_state': 42}
            param_grid = {
                'n_estimators': [100, 500, 1000],
                'max_features': ["auto", "sqrt", "log2", 0.6, 0.8],
                'min_samples_leaf': [30, 50, 70],
                'min_samples_split': [10, 20, 500, 100],
                'max_depth': [10, 15, 20, 25],
                'learning_rate': [0.1, 0.01, 0.001]
            }
            gb = ensemble.GradientBoostingRegressor()
            gb_grid = GridSearchCV(gb, param_grid, cv=cv_folds, scoring=scoring_cv)
            gb_grid.fit(train_x, train_y)
            print('Optimal parameters for gradient boosting regressor = ', gb_grid.best_params_)
            gb = gb_grid.best_estimator_
            fitted_model = gb

        if estimator == "lgbm" and hypertuning == True:
            if __name__ == "__main__":
                dtrain = lgb.Dataset(train_x, label=train_y)
                params = {
                    "objective": "regression",
                    "metric": "rmse",
                    "verbosity": -1,
                    "boosting_type": "gbdt",
                }
                tuner = lgb.LightGBMTunerCV(params, dtrain, verbose_eval=100,
                                            early_stopping_rounds=100, folds=KFold(n_splits=5))
                tuner.run()
                print("Best score:", tuner.best_score)
                best_params = tuner.best_params
                print("Best params:", best_params)
                print(" Params: ")
                for key, value in best_params.items():
                    print(" {}: {}".format(key, value))

        if estimator == "rf" and hypertuning == True:
            # {'bootstrap': True, 'max_depth': 80, 'max_features': 2, 'min_samples_leaf': 5, 'min_samples_split': 12, 'n_estimators': 100, 'n_jobs': 1}
            # max_depth= 80, max_features= 5, min_samples_leaf= 3, min_samples_split= 8, n_estimators= 300, n_jobs= 1
            # {'bootstrap': True, 'max_depth': 100, 'max_features': 5, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 500, 'n_jobs': 4}
            param_grid = {
                'max_depth': [10, 20, 30],
                'max_features': [2, 3, 5],
                'min_samples_leaf': [3, 5, 10],
                'min_samples_split': [8, 12],
                'n_estimators': [100, 300, 500],
                'n_jobs': [4]
            }
            rf = RandomForestRegressor()
            rf_grid = GridSearchCV(rf, param_grid, cv=cv_folds, scoring=scoring_cv)
            rf_grid.fit(train_x, train_y)
            print('Optimal parameters for random forest regressor = ', rf_grid.best_params_)
            rf = rf_grid.best_estimator_
            fitted_model = rf

        if cv and hypertuning == False:
            cross_val_rmse = cross_val_score(estimator=model, X=train_x, y=train_y,
                                             cv=cv_folds, scoring=scoring_cv)
            print(f'The average cross validation rmsle of the model is {-1 * round(cross_val_rmse.mean(), 2)}')
            print(cross_val_rmse)

    if estimator == "gb" or estimator == "rf" or estimator == "lgbm":
        list_all_Features = train_x.columns.tolist()
        # Feature importance
        fi_df = pd.DataFrame({"Feature": list_all_Features,
                              "Importance": fitted_model.feature_importances_}).sort_values(by="Importance", ascending=False)
        fi_selected = fi_df[:15]
        important_feature_list = fi_selected["Feature"].tolist()

        if estimator == "gb":
            fi_selected.to_excel(r'fi_selected.xlsx')
            fig = plt.figure(figsize=(20, 10))
            feat_importances = pd.Series(fitted_model.feature_importances_, index=list_all_Features)
            feat_importances.nlargest(30).plot(kind='barh', color="green")
            plt.title("Feature Importance from Gradient Boosting")
            plt.savefig('Feature Importance from Gradient Boosting.png', bbox_inches="tight")

        if estimator == "rf":
            fi_selected.to_excel(r'fi_selected.xlsx')
            fig = plt.figure(figsize=(20, 20))
            feat_importances = pd.Series(fitted_model.feature_importances_, index=list_all_Features)
            feat_importances.nlargest(30).plot(kind='barh', color="green")
            plt.title("Feature Importance from Random Forest")
            plt.savefig('Feature Importance from Random Forest.png', bbox_inches="tight")

        if estimator == "lgbm":
            fi_selected.to_excel(r'fi_selected.xlsx')
            feat_importances = pd.Series(fitted_model.feature_importances_, index=list_all_Features)
            explainer = shap.TreeExplainer(fitted_model)
            shap_values = explainer.shap_values(valid_x)
            shap.initjs()
            force_plot = shap.force_plot(explainer.expected_value, shap_values[0, :], valid_x.iloc[0, :])
            shap.save_html("index_force_plot.htm", force_plot)
            force_plot_all = shap.force_plot(explainer.expected_value, shap_values, valid_x)
            shap.save_html("index_force_plot_all.htm", force_plot_all)
            plt.figure(figsize=(10, 20))
            shap.summary_plot(shap_values, valid_x, show=False)
            plt.savefig('summary_plot.png', bbox_inches="tight")
            top_features = feat_importances.nlargest(10)
            top_features = top_features.reset_index()
            top_features = top_features['index'].to_list()
            for i in top_features:
                plt.figure(figsize=(20, 20))
                shap.dependence_plot(i, shap_values, valid_x, show=False)
                plt.savefig(f"dep_plot_{i}.png", bbox_inches="tight")

    if final == False and target_type == "con":
        yhat = fitted_model.predict(test_x).astype(float)
        y_pred = list(yhat.astype(float))
        y_true = list(test_y)
        print(np.sqrt(mean_squared_error(y_true, y_pred)))

    if final == False and target_type == "bin":
        yhat = fitted_model.predict(test_x)
        y_pred = list(map(round, yhat))
        cm = confusion_matrix(test_y, y_pred)
        print("Confusion Matrix : \n", cm)
        print('Test accuracy = ', accuracy_score(test_y, y_pred))
        print('Test recall = ', recall_score(test_y, y_pred))

    return fitted_model
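# --- Usage sketch (illustrative, not from the original source): one way the training()
# helper above might be called. The DataFrame, column names ("price", "id") and the CSV
# path are assumptions, and `score_func` (used by make_scorer inside training()) is
# assumed to be defined at module level.
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("data.csv")                      # hypothetical input file
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

fitted = training(
    train=train_df,
    test=test_df,
    validation_size=0.1,                          # only used by the "lgbm" branch
    estimator="rf",                               # "log_sk", "gb", "rf", "lgbm" or "lin_reg"
    target_variable="price",
    drop_list=["price", "id"],                    # columns removed from the feature matrix
    target_type="con",                            # "con" = regression, "bin" = classification
    cv_folds=5,
    scoring_cv="neg_root_mean_squared_error",
    cv=True,
    final=False,
    hypertuning=False,
)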
def mlflowtisation(train_x, train_y, test_x, test_y, modele=[ElasticNet],
                   params={"random_state": 44}, nombre_de_lignes="",
                   nombre_de_colonnes="", dataframe_non_qualifié=None):
    path = os.getcwd()
    os.chdir("./../")

    import logging
    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    def categ(x):
        if x <= 0.1:
            return 0
        else:
            return 1

    def eval_metrics(actual, pred):
        acc = accuracy_score(actual, pred)
        return acc

    with mlflow.start_run():
        mod = modele[0](**params)
        mod.fit(train_x, train_y)

        try:
            with open(type(mod).__name__ + '.html', 'w') as f:
                f.write(str(eli5.show_weights(mod).data))
        except Exception as e:
            print(e)

        try:
            shap.initjs()
            explainer = shap.TreeExplainer(mod)
            observations = mod.transform(train_x.sample(1000))
            shap_values = explainer.shap_values(observations)
            i = 0
            shap.force_plot(explainer.expected_value, shap_values[i], features=observations[i])
        except Exception as e:
            print(e)

        predicted_qualities = np.array(list(map(lambda x: categ(x), list(mod.predict(test_x)))))
        acc = eval_metrics(test_y, predicted_qualities)
        rapport_details = classification_report(test_y, predicted_qualities)
        f = open(type(mod).__name__ + '.txt', 'w')
        f.write(rapport_details)
        f.close()
        print("The accuracy of model {} is: {}%".format(str(mod), round(acc * 100, 2)))

        try:
            print("Scoring the unlabelled data...")
            res = mod.predict(dataframe_non_qualifié)
            pd.DataFrame.from_dict({"pred": list(res)}).to_csv(type(mod).__name__ + '.csv')
        except:
            pass

        mlflow.log_param("Modèle utilisé", type(mod).__name__)
        for param in params:
            mlflow.log_param(param, params[param])
        mlflow.log_param("nombre de lignes", nombre_de_lignes)
        mlflow.log_param("nombre de colonnes", nombre_de_colonnes)
        mlflow.log_param("rapport_details", rapport_details)
        mlflow.log_metric("acc", acc)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        # Model registry does not work with file store
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(mod, "model", registered_model_name=str(mod))
        else:
            mlflow.sklearn.log_model(mod, "model")

    os.chdir(str(path))
def deepshap_top_feat(data_all, sub_index, to_csv, FLAGS):
    """Get the top important features of patients.

    data_all: all data
    sub_index: indices of the patients whose top important features should be found
    to_csv: whether to save the resulting DataFrame

    Output: DataFrame of cancer patients with their top important features.
    """
    models = []
    # subset of similar patients
    sub = data_all[data_all.index.isin(sub_index)]

    # DeepSHAP to extract features
    for num_groups in FLAGS.num_groups:
        # import trained model
        model = import_keras_models(FLAGS, num_groups, 'train')
        # initialize js methods for visualization
        shap.initjs()
        clustering_part = Model(
            inputs=model.inputs,
            outputs=model.outputs[2],  # specifying a single output for shap usage
        )
        # create an instance of DeepSHAP, called DeepExplainer
        explainer_shap = shap.DeepExplainer(model=clustering_part,
                                            data=data_all.iloc[:, 0:FLAGS.dimension])
        # Fit the explainer on a subset of the data (you can try all, but then it gets slower)
        shap_values = explainer_shap.shap_values(X=sub.iloc[:, 0:FLAGS.dimension].values,
                                                 ranked_outputs=True)
        features = []
        # get the top percentile of features for each patient
        for i in range(sub.shape[0]):
            abso = np.absolute(shap_values[0][0][i])
            ind = abso.argsort()[-round(FLAGS.dimension * FLAGS.percent):][::-1]
            feat = sub.columns.values[ind]
            features.append(feat)
        models.append(features)
        gc.collect()

    inter_features = []
    # get the intersection of the top features across models
    for i in range(sub.shape[0]):
        intsec = list(functools.reduce(set.intersection, [set(item[i]) for item in models]))
        inter_features.append(intsec)

    shap_df = pd.DataFrame(list(dict(zip(sub.index.values, inter_features)).items()),
                           columns=['patient', 'shaps'])
    shap_df['label'] = sub['label'].values
    if to_csv == True:
        shap_df.to_csv('shaps_top_features.csv', index=True)
    return shap_df
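# --- Illustrative sketch (not part of the original source): the feature selection inside
# deepshap_top_feat() reduces to ranking absolute SHAP values and keeping the top fraction
# per patient. The toy array, feature names and `fraction` value below are assumptions.
import numpy as np

def top_fraction_features(shap_row, feature_names, fraction=0.01):
    """Return the feature names in the top `fraction` of absolute SHAP values."""
    abso = np.absolute(shap_row)                    # magnitude of each attribution
    k = max(1, round(len(shap_row) * fraction))     # number of features to keep
    top_idx = abso.argsort()[-k:][::-1]             # indices sorted by descending |SHAP value|
    return [feature_names[i] for i in top_idx]

toy_names = [f"gene_{i}" for i in range(200)]
toy_shap_row = np.random.randn(200)
print(top_fraction_features(toy_shap_row, toy_names, fraction=0.05))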
def main(): shap.initjs() ##1. read configuration file configs = json.load(open('Configuration.json', 'r')) if not os.path.exists(configs['model']['save_dir']): os.makedirs(configs['model']['save_dir']) ##2. read data clustered_timeseries_path = configs['paths']['clustered_timeseries_path'] time_series = pd.read_csv(clustered_timeseries_path + "TimeSeriesAggregatedClusteredDeltaTwoDays.csv") ##3. impute dynamic_features = configs['data']['dynamic_columns'] grouping = configs['data']['grouping'] time_series[dynamic_features] = impute(time_series, dynamic_features) ##4. generate new features based on delta from baseline outcome_columns = configs['data']['classification_outcome'] baseline_features = configs['data']['baseline_columns'] static_features = configs['data']['static_columns'] new_series = generate_trajectory_timeseries(time_series, baseline_features, static_features, dynamic_features, grouping, outcome_columns) ##5. scale normalized_timeseries = scale(new_series, dynamic_features) groups = np.array(time_series[grouping]) X = normalized_timeseries[dynamic_features] X_student = time_series[static_features] X_student[grouping] = time_series[grouping] print(" AFTER AGGREGATION, DIM OF X_STUDENT: ", X_student.shape) ##6. Training/Prediction for all outcomes. for outcome in configs['data']['classification_outcome']: outcome_df = pd.DataFrame() number_of_features = configs['data']['sequence_length'] batch_size = configs['training']['batch_size'] y = time_series[outcome] y = y.astype(int) teacher_model = LSTMModel(configs['model']['name'] + outcome) teacher_model.build_model(configs) student_model = XGBoostModel(configs['model']['name'] + outcome) for ffold_ind, (training_ind, testing_ind) in enumerate( stratified_group_k_fold(X, y, groups, k=5)): # CROSS-VALIDATION training_groups, testing_groups = groups[training_ind], groups[ testing_ind] this_X_train, this_X_val = X.iloc[training_ind], X.iloc[ testing_ind] this_y_train, this_y_val = y.iloc[training_ind], y.iloc[ testing_ind] print("testing groups!!!!!", len(testing_groups), len(set(testing_groups))) this_y_ids = groups[testing_ind] assert len(set(training_groups) & set(testing_groups)) == 0 #(NumberOfExamples, TimeSteps, FeaturesPerStep). reshaped_x = (this_X_train.values).reshape(-1, batch_size, number_of_features) reshaped_y = (this_y_train.values).reshape(-1, batch_size, 1) reshaped_x_val = (this_X_val.values).reshape( -1, batch_size, number_of_features) reshaped_y_val = (this_y_val.values).reshape(-1, batch_size, 1) print(" THE RESHAPED: ") print(" TRAINING X SHAPE: ", reshaped_x.shape) print(" TRAINING Y SHAPE: ", reshaped_y.shape) print(" VAL X SHAPE: ", reshaped_x_val.shape) print(" VAL Y SHAPE: ", reshaped_y_val.shape) teacher_model.train(reshaped_x, reshaped_y, reshaped_x_val, reshaped_y_val, epochs=configs['training']['epochs'], batch_size=batch_size, save_dir=configs['model']['save_dir']) this_y_val = pd.DataFrame(this_y_val) this_y_val[grouping] = testing_groups print(" before reshaping: ") print(" TRAINING X SHAPE: ", this_X_train.shape) print(" TRAINING Y SHAPE: ", this_y_train.shape) this_X_val.reset_index() y_pred_val_teacher = teacher_model.predict( (this_X_val.values).reshape(-1, batch_size, number_of_features)) print(" DIMENSIONS OF WHAT THE TEACHER PREDICTED: ", y_pred_val_teacher.shape) ##ZI MAKE SURE YS CORRESPOND TO THE XS. 
DON'T JUST USE Y IN THIS CALL ## ZI WORK ON THIS print(" DIM OF Y PRED BY TEACHER:", y_pred_val_teacher.shape) print(" DIM OF THIS Y VAL: ", this_y_val.shape) #training_groups, testing_groups = groups[training_ind], groups[testing_ind] #this_X_train, this_X_val = X.iloc[training_ind], X.iloc[testing_ind] #this_y_train, this_y_val = y.iloc[training_ind], y.iloc[testing_ind] print(" COLUMNS OF THIS Y VAL WHICH IS XGBOOST TRAINING: ") print(this_y_val.columns) xgboost_y_training = this_y_val print(" PRINTING HEAD") print(xgboost_y_training.head()) xgboost_y_training = xgboost_y_training.groupby(grouping).first() xgboost_y_training = xgboost_y_training.reset_index() lstm_output = pd.DataFrame( y_pred_val_teacher.reshape(len(xgboost_y_training), batch_size)) lstm_output = lstm_output.reset_index() print(" SHAPES: df SO FAR: ", xgboost_y_training.shape, " LSTM OUTPUT: ", lstm_output.shape, type(lstm_output)) xgboost_y_training = pd.merge(xgboost_y_training, lstm_output, left_index=True, right_index=True) #xgboost_y_training = pd.concat([xgboost_y_training, lstm_output], ignore_index=True, sort=False) #student_model.train(Xgboost_X, this_y_val, outcome, configs) static_df = time_series[static_features] static_df[grouping] = time_series[grouping] static_df = static_df.drop_duplicates(grouping) xgboost_y_training = xgboost_y_training.merge(static_df, how='left', on=grouping) xgboost_y_training.to_csv("StuentTrainig" + outcome + ".csv") student_model.train(xgboost_y_training.iloc[:, 3:], xgboost_y_training[outcome], outcome, configs)
def shap_js(model, value):
    shap.initjs()
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(value)
    return shap.force_plot(explainer.expected_value[0], shap_values[0], value)
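# --- Usage sketch (illustrative, not from the original source): calling shap_js() with a
# fitted tree classifier. The dataset and model are assumptions; indexing
# expected_value[0] / shap_values[0] inside shap_js() matches older shap releases that
# return one array per class for classifiers.
import shap
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

data = load_breast_cancer(as_frame=True)
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(data.data, data.target)

plot = shap_js(clf, data.data.iloc[[0]])          # explain the first row
shap.save_html("force_plot.html", plot)           # render the plot outside a notebook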
def test_init(self):
    shap.initjs()
def train(train_data, test_data=None): G = train_data[0] # G 是一个Networkx里的对象,这几个都是经过load_data()处理过的 features = train_data[1] id_map = train_data[2] class_map1 = train_data[4] class_map2 = train_data[5] class_map3 = train_data[6] dict_classmap = { 0: class_map1, 1: class_map2, 2: class_map3, 3: class_map3 } hierarchy = FLAGS.hierarchy features_shape1 = None a_class = construct_class_numpy(class_map1) b_class = construct_class_numpy(class_map2) c_class = construct_class_numpy(class_map3) a_class = tf.cast(a_class, tf.float32) b_class = tf.cast(b_class, tf.float32) c_class = tf.cast(c_class, tf.float32) num_class = [] # for key in class_map.keys(): # num_class = num_class.append(sum(class_map[key])) for hi_num in range(hierarchy): #tf.reset_default_graph() if hi_num == 0: class_map = class_map1 features = features features_shape1 = features.shape[1] if features is not None: # pad with dummy zero vector features = np.vstack( [features, np.zeros((features.shape[1], ))]) features = tf.cast(features, tf.float32) else: print("hierarchy %d finished" % (hi_num), end='\n\n') class_map = dict_classmap[hi_num] features = features2 features = tf.cast(features, tf.float32) features = tf.concat( [features, tf.zeros([1, features_shape1 + num_classes])], axis=0) features_shape1 = features.shape[1] if hi_num == 0: if isinstance(list(class_map.values())[0], list): num_classes = len(list(class_map.values())[0]) else: num_classes = len(set(class_map.values())) else: if isinstance(list(dict_classmap[hi_num].values())[0], list): num_classes = len(list(dict_classmap[hi_num].values())[0]) else: num_classes = len(set(dict_classmap[hi_num].values())) """"" if features is not None: # pad with dummy zero vector features = np.vstack([features, np.zeros((features.shape[1],))]) """ "" # features = tf.cast(features, tf.float32) # embeding_weight=tf.get_variable('emb_weights', [50, 128], initializer=tf.random_normal_initializer(),dtype=tf.float32) # features=tf.matmul(features,embeding_weight) context_pairs = train_data[3] if FLAGS.random_context else None placeholders = construct_placeholders(num_classes) minibatch = NodeMinibatchIterator(G, id_map, placeholders, class_map, num_classes, batch_size=FLAGS.batch_size, max_degree=FLAGS.max_degree, context_pairs=context_pairs) ########## with open('test_nodes.txt', 'w') as f: json.dump(minibatch.test_nodes, f) ########### if hi_num == 0: adj_info_ph = tf.placeholder(tf.int32, shape=minibatch.adj.shape, name='adj_info_ph') # 把adj_info设成Variable应该是因为在训练和测试时会改变adj_info的值,所以 # 用Varible然后用tf.assign()赋值。 adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info") shap.initjs() if FLAGS.model == 'graphsage_mean': # Create model sampler = UniformNeighborSampler(adj_info) if FLAGS.samples_3 != 0: layer_infos = [ SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2), SAGEInfo("node", sampler, FLAGS.samples_3, FLAGS.dim_2) ] elif FLAGS.samples_2 != 0: layer_infos = [ SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2) ] else: layer_infos = [ SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1) ] model = SupervisedGraphsage( num_classes, placeholders, features, adj_info, minibatch.deg, # 每一个的度 layer_infos, model_size=FLAGS.model_size, sigmoid_loss=FLAGS.sigmoid, identity_dim=FLAGS.identity_dim, logging=True, concat=True, ) elif FLAGS.model == 'gcn': # Create model sampler = UniformNeighborSampler(adj_info) layer_infos = [ SAGEInfo("node", sampler, 
FLAGS.samples_1, 2 * FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, 2 * FLAGS.dim_2) ] model = SupervisedGraphsage(num_classes, placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, aggregator_type="gcn", model_size=FLAGS.model_size, concat=False, sigmoid_loss=FLAGS.sigmoid, identity_dim=FLAGS.identity_dim, logging=True) elif FLAGS.model == 'graphsage_seq': sampler = UniformNeighborSampler(adj_info) layer_infos = [ SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2) ] model = SupervisedGraphsage(num_classes, placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, aggregator_type="seq", model_size=FLAGS.model_size, sigmoid_loss=FLAGS.sigmoid, identity_dim=FLAGS.identity_dim, logging=True, concat=True) elif FLAGS.model == 'graphsage_maxpool': sampler = UniformNeighborSampler(adj_info) layer_infos = [ SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2) ] model = SupervisedGraphsage(num_classes, placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, aggregator_type="maxpool", model_size=FLAGS.model_size, sigmoid_loss=FLAGS.sigmoid, identity_dim=FLAGS.identity_dim, logging=True, concat=True) elif FLAGS.model == 'graphsage_meanpool': sampler = UniformNeighborSampler(adj_info) layer_infos = [ SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2) ] model = SupervisedGraphsage(num_classes, placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, aggregator_type="meanpool", model_size=FLAGS.model_size, sigmoid_loss=FLAGS.sigmoid, identity_dim=FLAGS.identity_dim, logging=True, concat=True) elif FLAGS.model == 'gat': sampler = UniformNeighborSampler(adj_info) # 建立两层网络 采样邻居、邻居个数、输出维度 layer_infos = [ SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2) ] model = SupervisedGraphsage( num_classes, placeholders, features, adj_info, minibatch.deg, concat=True, layer_infos=layer_infos, aggregator_type="gat", model_size=FLAGS.model_size, sigmoid_loss=FLAGS.sigmoid, identity_dim=FLAGS.identity_dim, logging=True, ) else: raise Exception('Error: model name unrecognized.') config = tf.ConfigProto( log_device_placement=FLAGS.log_device_placement) config.gpu_options.allow_growth = True config.gpu_options.per_process_gpu_memory_fraction = GPU_MEM_FRACTION config.allow_soft_placement = True # Initialize session sess = tf.Session(config=config) # sess = tf_dbg.LocalCLIDebugWrapperSession(sess) #merged = tf.summary.merge_all() # 将所有东西保存到磁盘,可视化会用到 #summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) # 记录信息,可视化,可以用tensorboard查看 # Init variables sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj}) #sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph2: minibatch2.adj}) # Train model total_steps = 0 avg_time = 0.0 epoch_val_costs = [] epoch_val_costs2 = [] # 这里minibatch.adj和minibathc.test_adj的大小是一样的,只不过adj里面把不是train的值都变成一样 # val在这里是validation的意思,验证 train_adj_info = tf.assign( adj_info, minibatch.adj ) # tf.assign()是为一个tf.Variable赋值,返回值是一个Variable,是赋值后的值 val_adj_info = tf.assign( adj_info, minibatch.test_adj) # assign()是一个Opration,要用sess.run()才能执行 it = 0 train_loss = [] val_loss = [] train_f1_mics = [] val_f1_mics = [] loss_plt = [] loss_plt2 = [] trainf1mi = [] trainf1ma = [] valf1mi = [] valf1ma = [] iter_num = 0 if hi_num == 0: epochs = 
FLAGS.epochs elif hi_num == 1: epochs = FLAGS.epochs2 elif hi_num == 2: epochs = FLAGS.epochs3 else: epochs = FLAGS.epochs4 for epoch in range(epochs + 1): if epoch < epochs: minibatch.shuffle() iter = 0 print('Epoch: %04d' % (epoch + 1)) epoch_val_costs.append(0) while not minibatch.end(): # Construct feed dictionary # 通过改变feed_dict来改变每次minibatch的节点 feed_dict, labels = minibatch.next_minibatch_feed_dict( ) # feed_dict是mibatch修改过的placeholder feed_dict.update({placeholders['dropout']: FLAGS.dropout}) t = time.time() # Training step outs = sess.run([model.opt_op, model.loss, model.preds], feed_dict=feed_dict) train_cost = outs[1] iter_num = iter_num + 1 loss_plt.append(float(train_cost)) if iter % FLAGS.print_every == 0: # Validation 验证集 sess.run(val_adj_info.op ) # sess.run() fetch参数是一个Opration,代表执行这个操作。 if FLAGS.validate_batch_size == -1: val_cost, val_f1_mic, val_f1_mac, duration, otu_lazy, _ = incremental_evaluate( sess, model, minibatch, FLAGS.batch_size) else: val_cost, val_f1_mic, val_f1_mac, duration = evaluate( sess, model, minibatch, FLAGS.validate_batch_size) sess.run(train_adj_info.op ) # 每一个tensor都有op属性,代表产生这个张量的opration。 epoch_val_costs[-1] += val_cost #if iter % FLAGS.print_every == 0: #summary_writer.add_summary(outs[0], total_steps) # Print results avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1) loss_plt2.append(float(val_cost)) valf1mi.append(float(val_f1_mic)) valf1ma.append(float(val_f1_mac)) if iter % FLAGS.print_every == 0: train_f1_mic, train_f1_mac, train_f1_none = calc_f1( labels, outs[-1]) trainf1mi.append(float(train_f1_mic)) trainf1ma.append(float(train_f1_mac)) print( "Iter:", '%04d' % iter, # 训练集上的损失函数等信息 "train_loss=", "{:.5f}".format(train_cost), "train_f1_mic=", "{:.5f}".format(train_f1_mic), "train_f1_mac=", "{:.5f}".format(train_f1_mac), # 在测试集上的损失函数值等信息 "val_loss=", "{:.5f}".format(val_cost), "val_f1_mic=", "{:.5f}".format(val_f1_mic), "val_f1_mac=", "{:.5f}".format(val_f1_mac), "time=", "{:.5f}".format(avg_time)) train_loss.append(train_cost) val_loss.append(val_cost) train_f1_mics.append(train_f1_mic) val_f1_mics.append(val_f1_mic) iter += 1 total_steps += 1 if total_steps > FLAGS.max_total_steps: break if total_steps > FLAGS.max_total_steps: break # concat features elif hi_num == FLAGS.hierarchy - 1: print("the last outputs") else: iter = 0 minibatch.shuffle() while not minibatch.end(): print("Iter:", '%04d' % iter, "concat") feed_dict, labels = minibatch.next_minibatch_feed_dict( ) # feed_dict是mibatch修改过的placeholder feed_dict.update({placeholders['dropout']: FLAGS.dropout}) x = feed_dict[placeholders['batch']] outs = sess.run([ model.opt_op, model.loss, model.preds, model.node_preds ], feed_dict=feed_dict) features_tail = outs[3] features_tail = tf.cast(features_tail, tf.float32) """"" if hi_num == 0: features_tail = tf.nn.embedding_lookup(a_class, feed_dict[placeholders["batch"]]) elif hi_num == 1: features_tail = tf.nn.embedding_lookup(b_class, feed_dict[placeholders["batch"]]) else: features_tail = tf.nn.embedding_lookup(c_class, feed_dict[placeholders["batch"]]) """ "" hidden = tf.nn.embedding_lookup( features, feed_dict[placeholders["batch"]]) features_inter = tf.concat([hidden, features_tail], axis=1) if iter == 0: features2 = features_inter else: features2 = tf.concat([features2, features_inter], axis=0) iter += 1 # val features & test features iter_num = 0 finished = False while not finished: feed_dict_val, batch_labels, finished, _ = minibatch.incremental_node_val_feed_dict( FLAGS.batch_size, iter_num, test=False) 
node_outs_val = sess.run( [model.preds, model.loss, model.node_preds], feed_dict=feed_dict_val) tail_val = tf.cast(node_outs_val[2], tf.float32) hidden_val = tf.nn.embedding_lookup( features, feed_dict_val[placeholders["batch"]]) features_inter_val = tf.concat([hidden_val, tail_val], axis=1) iter_num += 1 features2 = tf.concat([features2, features_inter_val], axis=0) print("val features finished") iter_num = 0 finished = False while not finished: feed_dict_test, batch_labels, finished, _ = minibatch.incremental_node_val_feed_dict( FLAGS.batch_size, iter_num, test=True) node_outs_test = sess.run( [model.preds, model.loss, model.node_preds], feed_dict=feed_dict_test) tail_test = tf.cast(node_outs_test[2], tf.float32) hidden_test = tf.nn.embedding_lookup( features, feed_dict_test[placeholders["batch"]]) features_inter_test = tf.concat([hidden_test, tail_test], axis=1) iter_num += 1 features2 = tf.concat([features2, features_inter_test], axis=0) print("test features finished") print("finish features concat") #features2 = sess.run(features2) print("Optimization Finished!") sess.run(val_adj_info.op) val_cost, val_f1_mic, val_f1_mac, duration, otu_f1, ko_none = incremental_evaluate( sess, model, minibatch, FLAGS.batch_size, test=True) print("Full validation stats:", "loss=", "{:.5f}".format(val_cost), "f1_micro=", "{:.5f}".format(val_f1_mic), "f1_macro=", "{:.5f}".format(val_f1_mac), "time=", "{:.5f}".format(duration)) pred = y_ture_pre(sess, model, minibatch, FLAGS.batch_size) for i in range(pred.shape[0]): sum = 0 for l in range(pred.shape[1]): sum = sum + pred[i, l] for m in range(pred.shape[1]): pred[i, m] = pred[i, m] / sum id = json.load(open(FLAGS.train_prefix + "-id_map.json")) # x_train = np.empty([pred.shape[0], array.s) num = 0 session = tf.Session() array = session.run(features) x_test = np.empty([pred.shape[0], array.shape[1]]) x_train = np.empty([len(minibatch.train_nodes), array.shape[1]]) for node in minibatch.val_nodes: x_test[num] = array[id[node]] num = num + 1 num1 = 0 for node in minibatch.train_nodes: x_train[num1] = array[id[node]] num1 = num1 + 1 with open(log_dir() + "val_stats.txt", "w") as fp: fp.write( "loss={:.5f} f1_micro={:.5f} f1_macro={:.5f} time={:.5f}".format( val_cost, val_f1_mic, val_f1_mac, duration)) print("Writing test set stats to file (don't peak!)") val_cost, val_f1_mic, val_f1_mac, duration, otu_lazy, ko_none = incremental_evaluate( sess, model, minibatch, FLAGS.batch_size, test=True) with open(log_dir() + "test_stats.txt", "w") as fp: fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f}".format( val_cost, val_f1_mic, val_f1_mac)) incremental_evaluate_for_each(sess, model, minibatch, FLAGS.batch_size, test=True) ################################################################################################################## # plot loss plt.figure() plt.plot(loss_plt, label='train_loss') plt.plot(loss_plt2, label='val_loss') plt.legend(loc=0) plt.xlabel('Iteration') plt.ylabel('loss') plt.title('Loss plot') plt.grid(True) plt.axis('tight') #plt.savefig("./graph/HMC12_loss.png") # plt.show() # plot f1 score plt.figure() plt.subplot(211) plt.plot(trainf1mi, label='train_f1_micro') plt.plot(valf1mi, label='val_f1_micro') plt.legend(loc=0) plt.xlabel('Iterations') plt.ylabel('f1_micro') plt.title('train_val_f1_score') plt.grid(True) plt.axis('tight') plt.subplot(212) plt.plot(trainf1ma, label='train_f1_macro') plt.plot(valf1ma, label='val_f1_macro') plt.legend(loc=0) plt.xlabel('Iteration') plt.ylabel('f1_macro') plt.grid(True) plt.axis('tight') 
#plt.savefig("./graph/HMC123_f1.png") # plt.show() plt.figure() plt.plot(np.arange(len(train_loss)) + 1, train_loss, label='train') plt.plot(np.arange(len(val_loss)) + 1, val_loss, label='val') plt.legend() plt.savefig('loss.png') plt.figure() plt.plot(np.arange(len(train_f1_mics)) + 1, train_f1_mics, label='train') plt.plot(np.arange(len(val_f1_mics)) + 1, val_f1_mics, label='val') plt.legend() plt.savefig('f1.png') # OTU f1 plt.figure() plt.plot(otu_f1, label='otu_f1') plt.legend(loc=0) plt.xlabel('OTU') plt.ylabel('f1_score') plt.title('OTU f1 plot') plt.grid(True) plt.axis('tight') #plt.savefig("./graph/HMC123_otu_f1.png") # plt.show() #Ko f1 score plt.figure() plt.plot(ko_none, label='Ko f1 score') plt.legend(loc=0) plt.xlabel('Ko') plt.ylabel('f1_score') plt.grid(True) plt.axis('tight') #plt.savefig("./graph/HMC123_ko_f1.png") bad_ko = [] b02 = 0 b05 = 0 b07 = 0 for i in range(len(ko_none)): if ko_none[i] < 0.2: bad_ko.append(i) b02 += 1 bad_ko = np.array(bad_ko) elif ko_none[i] < 0.5: b05 += 1 elif ko_none[i] < 0.7: b07 += 1 print("ko f1 below 0.2:", b02) print("ko f1 below 0.5:", b05) print("ko f1 below 0.7:", b07)
def explainable_results(specific_prediction_sample_to_explain: int, X, Y, input_label_index_value, num_labels: int): # , anamoly_data """ Understand, interpret, and trust the results on the deep models at individual/samples level """ ''' from sklearn.ensemble import RandomForestRegressor import xgboost import shap import numpy as np shap.initjs() y = [max(i) for i in Y] my_model_1 = xgboost.XGBRegressor().fit(X, np.array(y)) # explain the model's predictions using SHAP # (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.) explainer_xgb = shap.Explainer(my_model_1) shap_values_xgb = explainer_xgb(anamoly_data) # visualize the first prediction's explanation shap.plots.waterfall(shap_values_xgb[specific_prediction_sample_to_explain]) my_model_2 = RandomForestRegressor(random_state=0).fit(X, np.array(y)) data_for_prediction = X[specific_prediction_sample_to_explain] # use 1 row of data here. Could use multiple rows if desired # Create object that can calculate shap values explainer_rf = shap.TreeExplainer(my_model_2) # Calculate Shap values shap_values = explainer_rf.shap_values(data_for_prediction) shap.force_plot(explainer_rf.expected_value[specific_prediction_sample_to_explain], shap_values[1], data_for_prediction) ''' # Quick Clean Hack Suggested by - Cory Randolph @coryroyce import shap import numpy as np import pandas as pd from keras.models import Sequential from keras.layers import Dense import ipywidgets as widgets # Get the number of inputs and outputs from the dataset n_inputs, n_outputs = X.shape[1], Y.shape[1] def get_model(n_inputs, n_outputs): model_nn = Sequential() model_nn.add( Dense(32, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu')) model_nn.add(Dense(n_outputs, kernel_initializer='he_uniform')) model_nn.compile(loss='mae', optimizer='adam') return model_nn model_nn = get_model(n_inputs, n_outputs) model_nn.fit(X.iloc[10:, :].values, Y, epochs=30) model_nn.evaluate(x=X.iloc[10:, :].values, y=Y) XpredictInputData = X.iloc[ specific_prediction_sample_to_explain, :] # X[specific_prediction_sample_to_explain,:] if (XpredictInputData.ndim == 1): XpredictInputData = np.array([XpredictInputData]) print(model_nn.predict(XpredictInputData)) # 0:1 ''' Here we take the Keras model trained above and explain why it makes different predictions on individual samples. Set the explainer using the Kernel Explainer (Model agnostic explainer method form SHAP). ''' explainer = shap.KernelExplainer(model=model_nn.predict, data=X.head(50), link="identity") # data = X[0:50] ''' Get the Shapley value for a single example. ''' # Set the index of the specific example to explain shap_value_single = explainer.shap_values( X=X.iloc[specific_prediction_sample_to_explain, :], nsamples=100) # X[specific_prediction_sample_to_explain,:] ''' Display the details of the single example ''' print(X.iloc[specific_prediction_sample_to_explain, :]) ''' Choose the label/output/target to run individual explanations on: Note: The dropdown menu can easily be replaced by manually setting the index on the label to explain. 
''' # Create the list of all labels for the drop down list #label_cols = ['window_diff_0', 'window_diff_1', 'window_diff_2', 'window_diff_3', 'window_diff_4', 'window_diff_5', 'window_diff_6'] label_cols = ['window_diff_' + str(i) for i in range(num_labels)] #print(label_cols) df_labels = pd.DataFrame(data=Y, columns=label_cols) df_labels.to_csv('y_labels.csv') list_of_labels = df_labels.columns.to_list() # Y.columns.to_list() # Create a list of tuples so that the index of the label is what is returned tuple_of_labels = list(zip(list_of_labels, range(len(list_of_labels)))) # Create a widget for the labels and then display the widget current_label = widgets.Dropdown(options=tuple_of_labels, value=input_label_index_value, description='Select Label:') # Display the dropdown list (Note: access index value with 'current_label.value') print(current_label) #Dropdown(description='Select Label:', options=(('labels_01', 0), ('labels_02', 1), ('labels_03', 2), etc ''' Plot the force plot for a single example and a single label/output/target ''' print(f'Current label Shown: {list_of_labels[current_label.value]}') # print the JS visualization code to the notebook shap.initjs() shap.force_plot( base_value=explainer.expected_value[current_label.value], shap_values=shap_value_single[current_label.value], features=X.iloc[ specific_prediction_sample_to_explain, :] # X_idx:X_idx+1 ) ''' Create the summary plot for a specific output/label/target. ''' # Note: We are limiting to the first 50 training examples since it takes time to calculate the full number of sampels shap_values = explainer.shap_values(X=X.iloc[0:50, :], nsamples=100) # X[0:50,:] print(f'Current Label Shown: {list_of_labels[current_label.value]}\n') # print the JS visualization code to the notebook shap.initjs() shap.summary_plot( shap_values=shap_values[current_label.value], features=X.iloc[0:50, :] # X[0:50,:] ) ''' Force Plot for the first 50 individual examples. ''' print(f'Current Label Shown: {list_of_labels[current_label.value]}\n') # print the JS visualization code to the notebook shap.initjs() shap.force_plot( base_value=explainer.expected_value[current_label.value], shap_values=shap_values[current_label.value], features=X.iloc[0:50, :] # X[0:50,:] )
def lightgbm(train_X, test_X, train_Y, args):
    train_X, valid_X, train_Y, valid_Y = train_test_split(train_X, train_Y,
                                                          test_size=0.1, random_state=4)
    # Build the datasets
    lgb_train = lgb.Dataset(train_X, train_Y)
    lgb_eval = lgb.Dataset(valid_X, valid_Y, reference=lgb_train)

    if args.optuna:
        # Use optuna
        print("Using optuna!!")
        import optuna.integration.lightgbm as lgb_optuna

        # LightGBM hyperparameters
        lgbm_params = {
            # regression
            'objective': 'regression',
            # evaluation metric (RMSE)
            'metric': 'rmse',
            # only print fatal messages
            'verbosity': -1,
            "feature_pre_filter": False
        }
        best_params, history = {}, []
        # Train the model with the parameters above
        model = lgb_optuna.train(
            lgbm_params,
            lgb_train,
            valid_sets=lgb_eval,
            verbose_eval=100,           # print training results every 100 iterations
            num_boost_round=1000,       # maximum number of iterations
            early_stopping_rounds=100,
            best_params=best_params,
            tuning_history=history,
        )
        print(f'best_params : {best_params}')
        with open('optuna.txt', 'w') as f:
            print(best_params, file=f)
    else:
        best_params = {
            'lambda_l1': 3.89081415861961e-06,
            'lambda_l2': 0.02666349731287391,
            'num_leaves': 6,
            'max_depth': -1,
            'feature_fraction': 0.8999999999999999,
            'bagging_fraction': 1.0,
            'bagging_freq': 0,
            'min_child_samples': 20,
            'objective': 'regression',
            'metric': 'rmse'
        }
        model = lgb.train(
            best_params,
            lgb_train,
            valid_sets=lgb_eval,
            verbose_eval=50,            # print training results every 50 iterations
            num_boost_round=1000,       # maximum number of iterations
            early_stopping_rounds=100)

    # Predict on the test data
    y_pred = model.predict(test_X, num_iteration=model.best_iteration)

    shap.initjs()
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(train_X)
    shap.summary_plot(shap_values, train_X)
    return y_pred, model
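# --- Usage sketch (illustrative, not from the original source): the lightgbm() helper
# above only needs an object with an `optuna` attribute plus feature/target frames; the
# random data below is a stand-in for the real dataset.
import argparse
import numpy as np
import pandas as pd

args = argparse.Namespace(optuna=False)
cols = [f"f{i}" for i in range(10)]
train_X = pd.DataFrame(np.random.rand(500, 10), columns=cols)
train_Y = pd.Series(np.random.rand(500))
test_X = pd.DataFrame(np.random.rand(100, 10), columns=cols)

y_pred, model = lightgbm(train_X, test_X, train_Y, args)
print(y_pred[:5])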
def make_shap_interpretation(model, training_set, column_names, ml_name, target, dataset, X, processor): """display shap's multi class values and force plots based on personal id selection""" # Summary plot explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(training_set) shap.summary_plot(shap_values, column_names, class_names=model.classes_, plot_type='bar', max_display=10, show=True, auto_size_plot=True) plt.title(f'SHAP Multi Class Values from {ml_name}', fontsize=12, fontweight='bold') plt.legend(loc='lower right') st.markdown("#### Shap Summary Plot") info_global = st.button("How it is calculated") if info_global: st.info(""" The shap summary plot explains how each features impact the output of the model to get the overall influence of each class using absolute values. The bigger the bar of the class is the more influence it has on that particular feature. The shap summary plot is only displaying the top 10 features. For more information, check out this free course at kaggle: [Link](https://www.kaggle.com/dansbecker/shap-values) To check out the shap values documentation, click the link: [Shap Values Documentation]( https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html ) """) st.pyplot() st.markdown("#### Shap Force Plot") info_local = st.button("How this works") if info_local: st.info(""" The shap force plot demonstrates how each individual feature influence the prediction outcome. Features in the red are the likely ones to be the predicted class whereas the features in blue reduces that probabily to be the predicted class. Is sort of like hot and cold. Heat rises and cold sinks. You can choose one of out the five prediction classes to see the effects of a selected feature. Please expand the force plot for better readability. 
For more information, check out this free course at kaggle: [Link](https://www.kaggle.com/dansbecker/shap-values) To check out the shap values documentation, click the link: [Shap Values Documentation]( https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html ) """) # Force plot slider_idx = st.selectbox('Personal ID of Guest', X.index) row_p = X.loc[[slider_idx]] row = processor.transform(row_p) explainer_force = shap.TreeExplainer(model) shap_values_force = explainer_force.shap_values(row) class_list = list(dataset['Target Exit Destination'].value_counts().index) target_value = st.selectbox("Choose the class to plot", class_list, index=1) shap.initjs() if target_value == 'Unknown/Other': shap.force_plot( base_value=explainer_force.expected_value[0], shap_values=shap_values_force[0], features=row, feature_names=column_names, link='logit', show=True, matplotlib=True, figsize=(30, 12), text_rotation=45, ) elif target_value == 'Permanent Exit': shap.force_plot(base_value=explainer_force.expected_value[1], shap_values=shap_values_force[1], features=row, feature_names=column_names, link='logit', show=True, matplotlib=True, figsize=(30, 12), text_rotation=45) elif target_value == 'Emergency Shelter': shap.force_plot(base_value=explainer_force.expected_value[2], shap_values=shap_values_force[2], features=row, feature_names=column_names, link='logit', show=True, matplotlib=True, figsize=(30, 12), text_rotation=45) elif target_value == 'Temporary Exit': shap.force_plot(base_value=explainer_force.expected_value[3], shap_values=shap_values_force[3], features=row, feature_names=column_names, link='logit', show=True, matplotlib=True, figsize=(30, 12), text_rotation=45) elif target_value == 'Transitional Housing': shap.force_plot(base_value=explainer_force.expected_value[4], shap_values=shap_values_force[4], features=row, feature_names=column_names, link='logit', show=False, matplotlib=True, figsize=(30, 12), text_rotation=45) """ Known bugs: 1. Posx and posy should be finite values. Text and fig scaling issues. 2. Shap - matplotlib = True is not yet supported for force plots with multiple samples! Example: Pick [Personal ID 53716] 3. Segmentation fault. It crashes. """ st.pyplot()
def shap_why_connector(self, target, *arg): #Input: Numpy. Output: Pandas df. Turns numbers into categories. def adapter(n): d = pd.DataFrame(data=n, columns=self.featureNames) categories = self.getCategoricalFeatures() for c in categories: d[c] = d[c].map(self.dictionary[c]["values"]) #d['Sex'] = d['Sex'].map({0:'Male', 1: 'Female'}) #d['Embarked'] = d['Embarked'].map({0: 'Southampton', 1: 'Cherbourg', 2: 'Queenstown'}) #d['Pclass'] = d['Pclass'].map({0: 'First', 1: 'Second', 2: 'Third'}) return d #Input: Pandas df. Output: Numpy. Turns categories into numbers. def reverse_adapter(p): d = p.copy() categories = self.getCategoricalFeatures() for c in categories: d[c] = d[c].map( {v: k for k, v in self.dictionary[c]["values"].items()}) #d['Sex'] = d['Sex'].map({'Male': 0, 'Female': 1}) #d['Embarked'] = d['Embarked'].map({'Southampton': 0, 'Cherbourg': 1, 'Queenstown': 2}) #d['Pclass'] = d['Pclass'].map({'First': 0, 'Second': 1, 'Third': 2}) return (d) #filter floats and convert to int query_instance = dict(s.split(':') for s in arg) for k, v in query_instance.items(): print(f"{k}: {v} ({type(v)})") try: x = float(v) x = int(x) query_instance[k] = f"{x}" except: pass sorted_query_instance = {} for f in self.featureNames: sorted_query_instance[f] = query_instance[f] original_instance = pd.DataFrame([sorted_query_instance]) print(original_instance.iloc[0, :]) shap_instance = reverse_adapter(pd.DataFrame([sorted_query_instance])) print(shap_instance.iloc[0, :]) shap_training = reverse_adapter(self.X_train) predict_fn = lambda x: self.model.predict_proba(adapter(x)) shap.initjs() explainer = shap.KernelExplainer(predict_fn, shap_training, link='logit') single_shap = explainer.shap_values( shap_instance.iloc[0, :].astype("int64"), nsamples=100) print(single_shap) fig = shap.force_plot( explainer.expected_value[0], single_shap[0], original_instance, out_names=[ "Chance of " + self.dictionary["class"]["values"][0], self.dictionary["class"]["values"][1] ], link="logit", matplotlib=True, show=False, text_rotation=90) fig.savefig('temp/shap.png', bbox_inches="tight") first_target = self.dictionary["class"]["values"][0] self.explanation = "The plot shows what feature values influenced the prediction to become <big>" + str( target ) + "</big>." + " Particularly, the plot shows the forces that affect the decision to predict " + first_target + ". Red forces increase the chance of " + first_target + ". Blue forces decrease the chance of " + first_target + ". The forces push the average chance of " + first_target + " (base value) up or down. The boundary where the prediction outcome switches is 0.5." self.certainty = "That is hard to tell. The computation is based on perturbation with " + str( self.X_train.shape[0]) + " data samples." plt.clf() return (str('temp/shap.png'))
def __init__(self, model):
    shap.initjs()
    self.explainer = shap.TreeExplainer(model)
    self.shap_values = None
    self.expected_values = None
    self.feature_importance = None
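# --- Usage sketch (illustrative, not from the original source): the constructor above
# belongs to an explainer wrapper whose class name is not shown, so `ShapExplainer` below
# is a hypothetical name, and populating shap_values/expected_values this way is just one
# plausible use of the attributes it initialises.
import xgboost
from sklearn.datasets import load_diabetes

X, y = load_diabetes(return_X_y=True, as_frame=True)
model = xgboost.XGBRegressor(n_estimators=50).fit(X, y)

wrapper = ShapExplainer(model)                    # hypothetical class name
wrapper.shap_values = wrapper.explainer.shap_values(X)
wrapper.expected_values = wrapper.explainer.expected_value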
def upload2(): from werkzeug.datastructures import ImmutableMultiDict with open(ff[0], 'rb') as file: model = pickle.load(file) with open(ff[1], 'rb') as file: X_data = pickle.load(file) with open(ff[2], 'rb') as file: y_data = pickle.load(file) print('start') print(request.form) hh = request.form hh = hh.to_dict(flat=False) print('hh ', hh) for file in request.files.getlist("gg"): print(file) print(list(X_data.columns)) series = pd.Series(hh) import shap explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(X_data) # load JS visualization code to notebook shap.initjs() #plt.style.use("_classic_test_patch") #plt.clf() # visualize the first prediction's explanation (use matplotlib=True to avoid Javascript) #shap.force_plot(explainer.expected_value, shap_values[1,:], series, matplotlib=True, figsize=(22, 4)) #shap.force_plot(explainer.expected_value, shap_values[10,:], \ # series,feature_names=X_data.columns,\ # matplotlib=True, show=False) # plt.savefig("gg.png",dpi=150, bbox_inches='tight') #yyy = shap.getjs() ''' oo = yyy.matplotlib p = yyy.html yyy_str = mpld3.fig_to_html(p) print('dfsdfsdf ',p) ''' series = series.tolist() print('im a he ', series) pp = [] for i in series: for j in i: j = float(j) pp.append(j) series = np.array(pp) print('im a she ', series) #lime import lime from lime.lime_tabular import LimeTabularExplainer explainer = LimeTabularExplainer(X_data, mode='regression', feature_names=list(X_data.columns), random_state=42, discretize_continuous=False, kernel_width=0.2) exp = explainer.explain_instance(series, model.predict) print(exp.local_pred) fig = exp.as_pyplot_figure(label=list(X_data.columns)) fig_2 = exp.as_html(labels=list(X_data.columns)) #print('dddd ',fig_2) plt.tight_layout() #fig = plt.figure(figsize = (18,8)) # plt.tight_layout() # #plt.boxplot(bank_data.transpose()) # # #Add titles to the chart and axes # plt.hist(bank_data.transpose(), bins = 50) # plt.title('Boxplot of Bank Stock Prices (5Y Lookback)') # plt.xlabel('Bank') # plt.ylabel('Stock Prices') # #mpld3.show(fig) # html_str = mpld3.fig_to_html(fig) Html_file = open("templates/lime.html", "w") Html_file.write(html_str) Html_file.close() # # plt.savefig('static/img/new34_plot.png') #plt.close() return render_template('local_result.html', LIME=html_str, SH=fig_2, gh=html_str)
def upload(): print('eer 0', request.form) dropdown_selection = str(request.form) dropdown_selection = dropdown_selection.split() print(dropdown_selection) model_type = dropdown_selection[3] dropdown_selection = dropdown_selection[1] print('model type ji ', model_type) print(dropdown_selection, " nuna bhai") global id_name target = 'images/' print('tt', target) if not os.path.isdir(target): os.mkdir(target) global ff ff = [] for file in request.files.getlist("file"): print(file) filename = file.filename destination = "/".join([target, filename]) print('des', destination) file.save(destination) ff.append(destination) mypath = os.getcwd() onlyfiles = [ os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f)) ] print('raJA ', ff) import warnings warnings.filterwarnings("ignore") with open(ff[0], 'rb') as file: model = pickle.load(file) with open(ff[1], 'rb') as file: X_data = pickle.load(file) with open(ff[2], 'rb') as file: y_data = pickle.load(file) if 'GL' in dropdown_selection: if 'RR' in model_type: PI = permutation_importance(model, X_data, y_data) row_to_show = 5 data_for_prediction = X_data.iloc[row_to_show] explainer = shap.Explainer(model, X_data, feature_names=X_data.columns) shap_values = explainer.shap_values(X_data) shap.summary_plot(shap_values, X_data) import matplotlib.pyplot as pl pl.savefig('static/img/new_plot.png') pl.close() ICE = ind_cond_exp(model, X_data, y_data) #global surgat from sklearn.tree import DecisionTreeRegressor from sklearn.tree import plot_tree predictions = model.predict(X_data) dt = DecisionTreeRegressor(random_state=100, max_depth=3) # We fit the shallow tree to the matrix X and the predictions of the random forest model dt.fit(X_data, predictions) fig, ax = plt.subplots(figsize=(20, 10)) plot_tree(dt, feature_names=list(X_data.columns), precision=3, filled=True, fontsize=12, impurity=True) pl.savefig('static/img/new2_plot.png') pl.close() return render_template('model_explanation_result.html', PI=PI, ICE=ICE, SH="static/img/new_plot.png", SM="static/img/new2_plot.png") if 'RF' in model_type: PI = permutation_importance(model, X_data, y_data) explainer = shap.TreeExplainer(model, X_data, feature_names=X_data.columns) shap_values = explainer.shap_values(X_data) shap.summary_plot(shap_values, X_data) import matplotlib.pyplot as pl pl.savefig('static/img/new_plot.png') pl.close() ICE = ind_cond_exp(model, X_data, y_data) #global surgat from sklearn.tree import DecisionTreeRegressor from sklearn.tree import plot_tree predictions = model.predict(X_data) dt = DecisionTreeRegressor(random_state=100, max_depth=3) # We fit the shallow tree to the matrix X and the predictions of the random forest model dt.fit(X_data, predictions) fig, ax = plt.subplots(figsize=(20, 10)) plot_tree(dt, feature_names=list(X_data.columns), precision=3, filled=True, fontsize=12, impurity=True) pl.savefig('static/img/new2_plot.png') pl.close() return render_template('model_explanation_result.html', PI=PI, ICE=ICE, SH="static/img/new_plot.png", SM="static/img/new2_plot.png") if 'CC' in model_type: PI = permutation_importance(model, X_data, y_data) explainer = shap.KernelExplainer(model.predict_proba, X_data) shap_values = explainer.shap_values(X_data) shap.summary_plot(shap_values, X_data) import matplotlib.pyplot as pl pl.savefig('static/img/new_plot.png') pl.close() #ICE = ind_cond_exp(model,X_data,y_data) #global surgat from sklearn.tree import DecisionTreeRegressor from sklearn.tree import plot_tree predictions = model.predict(X_data) return 
render_template( 'model_explanation_result_classification.html', PI=PI, SH="static/img/new_plot.png") if 'WI' in dropdown_selection: # print(res," resss") # import dash from dash.dependencies import Input, Output import dash_table import dash_core_components as dcc import dash_html_components as html app = dash.Dash(__name__) import pandas as pd #should be X data mean_list = [] features = X_data.columns.tolist() for i in features: mean_list.append(round(X_data[i].mean())) explainer = shap.TreeExplainer(model) shap.initjs() params = features id_name_str = "my_graph" + str(id_name) print('---------------', id_name_str) id_name = id_name + 1 what_plot.layout = html.Div([ dash_table.DataTable( id='table-editing-simple', columns=([{ 'id': 'Model', 'name': 'Model' }] + [{ 'id': p, 'name': p } for p in params]), data=[ dict(zip(features, mean_list)) #dict(Model=i, **{param: mean_list[i] for param in params}) # for i in range(0, len(mean_list)) ], editable=True), html.Div(id=id_name_str) ]) @what_plot.callback(Output(id_name_str, "children"), Input('table-editing-simple', 'data'), Input('table-editing-simple', 'columns')) def update_graphs(rows, columns): df = pd.DataFrame(rows, columns=[c['name'] for c in columns]) print(rows) # rows = rows[0] col = [] vvalue = [] for key in rows: print(key, '->', int(rows[key])) col.append(key) vvalue.append([int(rows[key])]) ik = dict(zip(col, vvalue)) instance = pd.DataFrame.from_dict(ik) print('instancceee ', instance) from shap.plots._force_matplotlib import draw_additive_plot # explain the model's predictions using SHAP values(same syntax works for LightGBM, CatBoost, and scikit-learn models) #explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(instance) shap.initjs() #plt.style.use("_classic_test_patch") ytu = model.predict(instance) print('ress ', ytu) koko = _force_plot_html2(explainer.expected_value, shap_values, instance) #print('kkkk ',koko) print('Done') return koko # return render_template('local_explain_lime.html', LL=what_plot.index()) if 'LL' in dropdown_selection: None #table and plots ======================================================== import dash from dash.dependencies import Input, Output import dash_table import dash_core_components as dcc import dash_html_components as html import pandas as pd id_name_str = "my_graph" + str(id_name) print('---------------', id_name_str) id_name = id_name + 1 print('in LL') # make graph=============================================================== table_plot.layout = html.Div([ dash_table.DataTable( id='datatable-interactivity', columns=[{ "name": i, "id": i, "deletable": True, "selectable": True } for i in X_data.columns], data=X_data.to_dict('records'), editable=True, filter_action="native", sort_action="native", sort_mode="multi", column_selectable="single", row_selectable="single", row_deletable=True, selected_columns=[], selected_rows=[], page_action="native", page_current=0, page_size=10, ), html.Div(id=id_name_str) ]) print('miod LL') @table_plot.callback(Output(id_name_str, "children"), Input('datatable-interactivity', "derived_virtual_data"), Input('datatable-interactivity', "derived_virtual_selected_rows")) def update_graphs(rows, derived_virtual_selected_rows): # When the table is first rendered, `derived_virtual_data` and # `derived_virtual_selected_rows` will be `None`. This is due to an # idiosyncrasy in Dash (unsupplied properties are always None and Dash # calls the dependent callbacks when the component is first rendered). 
# So, if `rows` is `None`, then the component was just rendered # and its value will be the same as the component's dataframe. # Instead of setting `None` in here, you could also set # `derived_virtual_data=df.to_rows('dict')` when you initialize # the component. if derived_virtual_selected_rows is None: derived_virtual_selected_rows = [] dff = X_data if rows is None else pd.DataFrame(rows) colors = [ '#7FDBFF' if i in derived_virtual_selected_rows else '#0074D9' for i in range(len(dff)) ] print('my value', derived_virtual_selected_rows) print('i am row ', X_data.iloc[derived_virtual_selected_rows]) print(type(derived_virtual_selected_rows)) from shap.plots._force_matplotlib import draw_additive_plot ttt = X_data.loc[derived_virtual_selected_rows] # explain the model's predictions using SHAP values(same syntax works for LightGBM, CatBoost, and scikit-learn models) explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(ttt) shap.initjs() plt.style.use("_classic_test_patch") bubu = _force_plot_html(explainer.expected_value, shap_values, ttt) shap_values = explainer.shap_values(X_data) #shap.force_plot(explainer.expected_value, shap_values, X_data) explain_all = _force_plot_html(explainer.expected_value, shap_values, X_data) print('bubu ', bubu) return bubu, explain_all return render_template('local_explain_lime.html', LL=table_plot.index()) if 'BD' in dropdown_selection: None #FI if 'DB' in dropdown_selection: # if 'CC' in model_type: # from explainerdashboard import ClassifierExplainer, ExplainerDashboard # ExplainerDashboard(ClassifierExplainer(model, X_data, y_data)).run() if 'RF' in model_type: import threading import time def dashboard_exp(model, X_data, y_data): import dash_bootstrap_components as dbc from explainerdashboard import RegressionExplainer, ExplainerDashboard ExplainerDashboard( RegressionExplainer(model, X_data, y_data), bootstrap=dbc.themes.SANDSTONE, importances=True, model_summary=False, contributions=True, whatif=True, shap_dependence=False, shap_interaction=False, decision_trees=False, hide_whatifindexselector=True, hide_whatifprediction=True, hide_inputeditor=False, hide_whatifcontributiongraph=False, hide_whatifcontributiontable=True, hide_whatifpdp=False, hide_predindexselector=True, hide_predictionsummary=True, hide_contributiongraph=False, hide_pdp=False, hide_contributiontable=True, hide_dropna=True, hide_range=True, hide_depth=True, hide_sort=True, hide_sample=True, # hide sample size input on pdp component hide_gridlines=True, # hide gridlines on pdp component hide_gridpoints=True, hide_cats_sort= True, # hide the sorting option for categorical features hide_cutoff= True, # hide cutoff selector on classification components hide_percentage= True, # hide percentage toggle on classificaiton components hide_log_x= True, # hide x-axis logs toggle on regression plots hide_log_y= True, # hide y-axis logs toggle on regression plots hide_ratio=True, # hide the residuals type dropdown hide_points= True, # hide the show violin scatter markers toggle hide_winsor=True, # hide the winsorize input hide_wizard= True, # hide the wizard toggle in lift curve component hide_star_explanation=True, ).run() t1 = threading.Thread(target=dashboard_exp, args=(model, X_data, y_data)) t1.start() return '''<H2>
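# The upload() route above returns Dash components built by _force_plot_html / _force_plot_html2,
# whose definitions are not part of this excerpt. The body below is a hypothetical sketch of what
# such a helper typically looks like (the Iframe sizing is an assumption, not the author's code):
import shap
import dash_html_components as html

def _force_plot_html(expected_value, shap_values, features):
    # Render a SHAP force plot to HTML and wrap it in an Iframe that a Dash callback can return.
    force_plot = shap.force_plot(expected_value, shap_values, features)
    shap_html = f"<head>{shap.getjs()}</head><body>{force_plot.html()}</body>"
    return html.Iframe(srcDoc=shap_html,
                       style={"width": "100%", "height": "400px", "border": 0})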
def saveAndGetSHAP(self, user_all_label, pred, new_row_raw, new_row_norm, initModel): start_time = datetime.datetime.now() model_results = [] # newer xgboost versions hit a model-encoding buffer problem # https://github.com/slundberg/shap/issues/1215 xgb_booster = initModel.get_booster() model_bytearray = xgb_booster.save_raw()[4:] def byte_error(self=None): return model_bytearray xgb_booster.save_raw = byte_error features = StressModel.feature_df_with_state['features'].values feature_state_df = StressModel.feature_df_with_state model_accuracy = 0 # evaluate model performance y_pred_proba = initModel.predict_proba(new_row_norm[features]) model_accuracy = y_pred_proba[0] print("model_accuracy: ", model_accuracy) # shap setting shap.initjs() try: explainer = shap.TreeExplainer(xgb_booster) except Exception as e: print("shap tree explainer error: ", e) # explainer.feature_perturbation = "tree_path_dependent" shap_values = explainer.shap_values(new_row_norm[features]) # print(shap_values) expected_value = explainer.expected_value # ## TODO : possible SHAP exception here ==> when SHAP gives no explanation for low-frequency labels / retrain the model once more just in case # try: # print("expected_value: ", type(expected_value)) # print("expected_value: ", expected_value.shape[0]) # if (expected_value.shape[0]) != len(user_all_label): # print("Shap if statement...") # with open('data_result/' + str(self.uid) + "_features.p", 'rb') as file: # preprocessed = pickle.load(file) # # norm_df = StressModel.normalizing(self, "default", preprocessed, None, None, None, None) # StressModel.initModel(self, norm_df) # # explainer = shap.TreeExplainer(initModel) # explainer.feature_perturbation = "tree_path_dependent" # # features = StressModel.feature_df_with_state['features'].values # feature_state_df = StressModel.feature_df_with_state # # ### evaluate model performance # y_pred_proba = initModel.predict_proba(new_row_norm[features]) # model_accuracy = y_pred_proba[0] # print("if model_accuracy: ", model_accuracy) # # shap_values = explainer.shap_values(new_row_norm[features]) # # print(shap_values) # expected_value = explainer.expected_value # # print("len expected_value: ", len(expected_value)) # except Exception as e: # print("SHAP label length error: ", e) # pass check_label = [0 for i in range(3)] # not_user_label_list = list(set(check_label) - set(user_all_label)) # compute the labels this user does not have try: for label in user_all_label: # for each stress label this user actually has feature_list = "" index = user_all_label.index(label) # shap_accuracy = expected_value[index] shap_list = shap_values[index] if len(shap_list.shape) == 1: ## EXCEPTION CASE..
shap_dict = dict(zip(features, shap_list)) else: shap_dict = dict(zip(features, shap_list[0])) shap_dict_sorted = sorted(shap_dict.items(), key=(lambda x: x[1]), reverse=True) # act_features = ['Duration WALKING', 'Duration RUNNING', 'Duration BICYCLE', 'Duration ON_FOOT', 'Duration VEHICLE'] app_features = ['Social & Communication','Entertainment & Music','Utilities','Shopping', 'Games & Comics', 'Health & Wellness', 'Education', 'Travel', 'Art & Design & Photo', 'News & Magazine', 'Food & Drink'] act_tmp = "" for feature_name, s_value in shap_dict_sorted: if s_value > 0: #check feature_id = feature_state_df[feature_state_df['features'] == feature_name]['feature_id'].values[0] feature_value = new_row_norm[feature_name].values[0] ## TODO : if every data value is 0, additional handling may be needed # currently, if feature_list stays empty, NO_FEATURES is returned if new_row_raw[feature_name].values[0] != 0: # ACT FEATURE # if feature_name in act_features: # if act_tmp == "": # act_tmp += feature_name # # if feature_value >= 0.5: # feature_list += str(feature_id) + '-high ' # else: # feature_list += str(feature_id) + '-low ' if feature_name in app_features: # Add package try: pkg_result = AppUsed.objects.get(uid=self.uid, day_num=self.dayNo, ema_order=self.emaNo) pkg_text = "" if feature_name == "Entertainment & Music": pkg_text = pkg_result.Entertainment_Music elif feature_name == "Utilities": pkg_text = pkg_result.Utilities elif feature_name == "Shopping": pkg_text = pkg_result.Shopping elif feature_name == "Games & Comics": pkg_text = pkg_result.Games_Comics elif feature_name == "Others": pkg_text = pkg_result.Others elif feature_name == "Health & Wellness": pkg_text = pkg_result.Health_Wellness elif feature_name == "Social & Communication": pkg_text = pkg_result.Social_Communication elif feature_name == "Education": pkg_text = pkg_result.Education elif feature_name == "Travel": pkg_text = pkg_result.Travel elif feature_name == "Art & Design & Photo": pkg_text = pkg_result.Art_Photo elif feature_name == "News & Magazine": pkg_text = pkg_result.News_Magazine elif feature_name == "Food & Drink": pkg_text = pkg_result.Food_Drink if pkg_text != "": if feature_value >= 0.5: feature_list += str(feature_id) + '-high&' + pkg_text + " " else: feature_list += str(feature_id) + '-low ' except Exception as e: print("Exception during making feature_list of app...get AppUsed db", e) else: if feature_value >= 0.5: feature_list += str(feature_id) + '-high ' else: feature_list += str(feature_id) + '-low ' if feature_list == "": feature_list = "NO_FEATURES" try: if label == pred: model_result = ModelResult.objects.create(uid=self.uid, timestamp=start_time, day_num=self.dayNo, ema_order=self.emaNo, prediction_result=label, accuracy=model_accuracy[label], feature_ids=feature_list, model_tag=True) else: model_result = ModelResult.objects.create(uid=self.uid, timestamp=start_time, day_num=self.dayNo, ema_order=self.emaNo, prediction_result=label, accuracy=model_accuracy[label], feature_ids=feature_list) except Exception as e: print("ModelResult.objects.create error: ", e) check_label[label] = 1 model_results.append(model_result) except Exception as e: print("Exception at saveAndGetSHAP error: ", e) pass try: ## after the for loop, add the stress levels missing from model_result & attach a generic message for i in range(3): if check_label[i] == 0: # random_acc = random.uniform(0.0, 1.0) # random_acc = round(random_acc, 2) try: if i == 0 : # LOW general message, mind the trailing space!
feature_list = '0-general_0 7-general_0 11-general_0 17-general_0 28-general_0 ' model_result = ModelResult.objects.create(uid=self.uid, timestamp=start_time, day_num=self.dayNo, ema_order=self.emaNo, prediction_result=i, accuracy=0, feature_ids=feature_list) else: #LITTLE HIGH, HIGH general message feature_list = '0-general_1 7-general_1 11-general_1 17-general_1 28-general_1 ' model_result = ModelResult.objects.create(uid=self.uid, timestamp=start_time, day_num=self.dayNo, ema_order=self.emaNo, prediction_result=i, accuracy=0, feature_ids=feature_list) except Exception as e: print("error adding stress lvl missing from model_result: ", e) model_results.append(model_result) except Exception as e: print("saveAndGetSHAP general statement error: ",e) # print("Total SaveAndGetSHAP Working... ", datetime.datetime.now() - start_time) # takes well under a second return model_results
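# saveAndGetSHAP() above strips the first four bytes of the raw booster and monkeypatches
# save_raw to work around the xgboost model-encoding buffer issue it links to
# (https://github.com/slundberg/shap/issues/1215). A self-contained sketch of that workaround on
# synthetic data; it targets the older shap/xgboost combinations the issue describes, and with
# recent releases shap.TreeExplainer(clf) usually works without the patch:
import numpy as np
import shap
import xgboost as xgb

X = np.random.rand(300, 6)
y = np.random.randint(0, 3, 300)                # three classes, like the stress labels above
clf = xgb.XGBClassifier(n_estimators=30, max_depth=3).fit(X, y)

booster = clf.get_booster()
raw_bytes = booster.save_raw()[4:]              # drop the prefix that confuses older shap builds
booster.save_raw = lambda *args, **kwargs: raw_bytes
explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X)          # per-class attributions (container shape depends on the shap release)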
def blockbox(self, model, patient): explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(patient) shap.initjs() return shap.force_plot(explainer.expected_value[1], shap_values[1], patient, matplotlib=True, show=False)
def training(self): data_train = pd.read_csv( "D:/Python_Project/Keywords_extraction/train_balance.csv") data_test = pd.read_csv( "D:/Python_Project/Keywords_extraction/test_balance.csv") acc = 0 # cols = [col for col in data_train.columns if col not in ['id', '关键词', '标签']] # cols = [col for col in data_train.columns if col in ['头词频','词频','词长','IDF','出现在标题','首次出现词位置','最后出现词位置','词方差','词平均','词偏度','词峰度','词差方差','最大词差','最小词差','最小句中位置','首次句位置','最后句位置','出现在第一句','出现在最后一句','句子出现频率','句平均','句偏度','包含英文','度中心性','接近中心性','s','f','v','d','k','x','i','l','un','包含数字']] ''' cols=['词频','词长','IDF','出现在标题','首次出现词位置','最后出现词位置','词方差','词偏度','最大句中位置','最小句中位置', '平均句中位置','平均句长','首次句位置','出现在最后一句','句子出现频率','句方差', '句平均','句差方差','最大句差','包含英文','接近中心性','n', 't', 'v', 'z', 'q', 'd', 'k', 'x', 'y', '包含数字'] ['词频', '词长', 'IDF', '出现在标题', '首次出现词位置', '词方差', '词平均', '最大词差', '最大句中位置', '平均句中位置', '首次句位置', '出现在第一句', '出现在最后一句', '句子出现频率', '句方差', '句差方差', '最大句差', '度中心性', 'n', 'v', 'a', 'z', 'd', 'h', 'k', 'x', 'g', 'j', 'y', 'un', '包含数字'] ''' cols = [ '词频', '词长', 'IDF', '出现在标题', '首次出现词位置', '词方差', '词平均', '最大词差', '最大句中位置', '平均句中位置', '首次句位置', '出现在第一句', '出现在最后一句', '句子出现频率', '句方差', '句差方差', '最大句差', '度中心性', 'n', 'v', 'a', 'z', 'd', 'h', 'k', 'x', 'g', 'j', 'y', 'un', '包含数字' ] # cols = [col for col in data_train.columns if col not in ['id', '关键词', '标签']] x_train = data_train.loc[:, cols] y_train = data_train.loc[:, '标签'] x_train = x_train.reset_index(drop=True) y_train = y_train.reset_index(drop=True) x_val = data_test.loc[:, cols] y_val = data_test.loc[:, '标签'] x_val = x_val.reset_index(drop=True) y_val = y_val.reset_index(drop=True) # 测试集为30%,训练集为70% # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0) lgb_train = lgb.Dataset(x_train, y_train) lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train) # print('开始训练......') params = { 'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': {'auc'}, 'learning_rate': 0.025, 'num_leaves': 100, 'min_data_in_leaf': 70, 'bagging_fraction': 0.85, 'is_unbalance': 'true', 'seed': 42 } gbm = lgb.train( params, lgb_train, num_boost_round=5000, valid_sets=lgb_eval, early_stopping_rounds=30, verbose_eval=False, ) y_pred = gbm.predict(x_val) y_pred = list(y_pred) Y_val = list(y_val) pos = 0 pos_acc = 0 pos_pre = 0 for i, j in zip(Y_val, y_pred): if (i >= 0.5): pos += 1 if (i >= 0.5 and j >= 0.5): pos_acc += 1 if (j >= 0.5): pos_pre += 1 pos_r = pos_acc / pos pos_a = pos_acc / pos_pre print((pos_a * pos_r) / (pos_a + pos_r) * 2) i = 0 count = 0 for item in y_pred: if item > 0.5: y_pred[i] = 1 else: y_pred[i] = 0 i = i + 1 # print(report(Y_val, y_pred,digits=4)) y_pred = gbm.predict(x_train) y_pred = list(y_pred) Y_train = list(y_train) i = 0 count = 0 for item in y_pred: if item > 0.5: y_pred[i] = 1 else: y_pred[i] = 0 i = i + 1 print(report(Y_train, y_pred, digits=4)) plt.rc('font', family='SimSun', size=13) # gbm.save_model('lgbmodel_allfeature.model') explainer = shap.TreeExplainer(gbm) shap_values = explainer.shap_values(x_train) # 基线值y_base就是训练集的目标变量的拟合值的均值。 y_base = explainer.expected_value shap.initjs() # shap.summary_plot(shap_values[0], x_train, sort=True, color_bar_label=("FEATURE_VALUE0"))#1 shap.summary_plot(shap_values[1], x_train, sort=True, color_bar_label=("FEATURE_VALUE1")) # 2
def importance_plot(model_type, model_name, base_path): shap.initjs() # For RNN if model_name == 'RNN': rnn = rnnForward().double() rnn.load_state_dict( torch.load(base_path + '/Result/' + model_name + '/' + model_type + '.shap')) rnn_df = pd.read_csv(base_path + '/Result/' + model_name + '/' + model_type + '.csv', index_col=0) feature_doc_df = pd.read_csv(base_path + '/Data/feature_documentation.csv') feature_dict = dict( zip(feature_doc_df['var_name'], feature_doc_df['short_name'])) rnn_df.rename(columns=lambda x: change_feature_name(x, feature_dict), inplace=True) shap_values = shap.DeepExplainer(rnn, torch.tensor( rnn_df.values)).shap_values(torch.tensor(rnn_df.values)) summaryplot = shap.summary_plot(shap_values, rnn_df, show=False) plt.savefig('temp.png', bbox_inches='tight') plt.close() df = cumulativeImportance(rnn_df, shap_values) # For NN elif model_name == 'NN': n_nn = 1000 #number of lines want to look nn = nNet().double() nn.load_state_dict( torch.load(base_path + '/Model/' + model_name + '/' + model_type)) nn_df = pd.read_csv(base_path + '/Data/Transformed/' + model_type + '_transformed_800.csv', index_col=0) cols = [ col for col in nn_df.columns if col not in ['fips', 'value', 'year'] ] nn_df = nn_df[cols] feature_doc_df = pd.read_csv(base_path + '/Data/feature_documentation.csv') feature_dict = dict( zip(feature_doc_df['var_name'], feature_doc_df['short_name'])) nn_df.rename(columns=lambda x: change_feature_name(x, feature_dict), inplace=True) shap_values = shap.DeepExplainer( nn, torch.tensor(nn_df.values[:n_nn])).shap_values( torch.tensor(nn_df.values[:n_nn])) summaryplot = shap.summary_plot(shap_values, nn_df[:n_nn], show=False) plt.savefig('temp.png', bbox_inches='tight') plt.close() df = cumulativeImportance(nn_df[:n_nn], shap_values) # For Random Forest elif model_name == 'RF': n_rf = 1000 rf_df = pd.read_csv(base_path + '/Data/Transformed/' + model_type + '_transformed_800.csv', index_col=0) cols = [ col for col in rf_df.columns if col not in ['fips', 'value', 'year'] ] rf_df = rf_df[cols] feature_doc_df = pd.read_csv(base_path + '/Data/feature_documentation.csv') feature_dict = dict( zip(feature_doc_df['var_name'], feature_doc_df['short_name'])) rf_df.rename(columns=lambda x: change_feature_name(x, feature_dict), inplace=True) rf = joblib.load(base_path + '/Model/' + model_name + '/' + model_type) shap_values = shap.TreeExplainer(rf, rf_df.values[:n_rf]).shap_values( rf_df.values[:n_rf], check_additivity=False) summaryplot = shap.summary_plot(shap_values, rf_df[:n_rf], show=False) plt.savefig('temp.png', bbox_inches='tight') plt.close() df = cumulativeImportance(rf_df[:n_rf], shap_values) # For KNN # Must use Kernel method on knn elif model_name == 'KNN': n_knn = 10 knn_df = pd.read_csv(base_path + '/Data/Transformed/' + model_type + '_transformed_800.csv', index_col=0) cols = [ col for col in knn_df.columns if col not in ['fips', 'value', 'year'] ] knn_df = knn_df[cols] feature_doc_df = pd.read_csv(base_path + '/Data/feature_documentation.csv') feature_dict = dict( zip(feature_doc_df['var_name'], feature_doc_df['short_name'])) knn_df.rename(columns=lambda x: change_feature_name(x, feature_dict), inplace=True) knn = joblib.load(base_path + '/Model/' + model_name + '/' + model_type) shap_values = shap.KernelExplainer(knn.predict, knn_df.values[:n_knn]).shap_values( knn_df.values[:n_knn]) summaryplot = shap.summary_plot(shap_values, knn_df[:n_knn], show=False) plt.savefig('temp.png', bbox_inches='tight') plt.close() df = cumulativeImportance(knn_df[:n_knn], 
shap_values) # For LR elif model_name == 'LR': n_lr = 1000 lr_df = pd.read_csv(base_path + '/Data/Transformed/' + model_type + '_transformed_800.csv', index_col=0) cols = [ col for col in lr_df.columns if col not in ['fips', 'value', 'year'] ] lr_df = lr_df[cols] feature_doc_df = pd.read_csv(base_path + '/Data/feature_documentation.csv') feature_dict = dict( zip(feature_doc_df['var_name'], feature_doc_df['short_name'])) lr_df.rename(columns=lambda x: change_feature_name(x, feature_dict), inplace=True) lr = joblib.load(base_path + '/Model/' + model_name + '/' + model_type) shap_values = shap.LinearExplainer( lr, lr_df.values[:n_lr]).shap_values(lr_df.values[:n_lr]) summaryplot = shap.summary_plot(shap_values, lr_df[:n_lr], show=False) plt.savefig('temp.png', bbox_inches='tight') plt.close() df = cumulativeImportance(lr_df[:n_lr], shap_values) encoded_image = base64.b64encode(open('temp.png', 'rb').read()).decode('ascii') return [ html.Div([ html.Img(src='data:image/png;base64,{}'.format(encoded_image), style={ 'width': '80%', 'height': '80%' }) ]) ], px.bar(df, x='importance', y='feature', color='Correlation', category_orders={'feature': list(df['feature'].iloc[::-1])})
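# importance_plot() above relies on a cumulativeImportance(df, shap_values) helper that is not
# shown; the px.bar call expects it to return a frame with 'feature', 'importance' and
# 'Correlation' columns. A hypothetical sketch of such a helper (the correlation labelling is an
# assumption about the intended behaviour):
import numpy as np
import pandas as pd

def cumulativeImportance(df, shap_values):
    vals = shap_values[0] if isinstance(shap_values, list) else shap_values  # multi-output -> first output
    importance = np.abs(vals).mean(axis=0)                                   # mean |SHAP| per feature
    corr = [np.corrcoef(df.iloc[:, j].values, vals[:, j])[0, 1] for j in range(df.shape[1])]
    out = pd.DataFrame({
        "feature": df.columns,
        "importance": importance,
        "Correlation": ["positive" if c >= 0 else "negative" for c in corr],
    })
    return out.sort_values("importance", ascending=False).reset_index(drop=True)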
def patient_analysis(model, patient): explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(patient) shap.initjs() return shap.force_plot(explainer.expected_value[1], shap_values[1], patient)
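# A hedged usage sketch for patient_analysis() above (the same pattern applies to the
# near-identical blockbox and heart_disease_risk_factors helpers). It assumes a shap release in
# which TreeExplainer.shap_values on a binary classifier returns one array per class, which is
# what the [1] indexing inside the helper expects; the dataset and model choice are illustrative.
import shap
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

data = load_breast_cancer(as_frame=True)
model = RandomForestClassifier(n_estimators=200, random_state=0).fit(data.data, data.target)
patient = data.data.iloc[[0]]                     # single-row DataFrame keeps feature names
force_plot = patient_analysis(model, patient)     # force plot for the positive class
shap.save_html("patient_0_force_plot.html", force_plot)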
def shap_js_bar_plot(model, values): shap.initjs() explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(values) return shap.summary_plot(shap_values, values, plot_type="bar")
def main(): "SHapley Additive exPlanations" datapath = os.path.join( os.path.dirname(__file__), "combine_data_since_2000_PROCESSED_2018-04-26.csv") data_df = pd.read_csv(datapath) # onyl get players that have been in the league for 3 years data_df2 = data_df.loc[data_df.Year <= 2015].copy() # calculate the player AV percentiles by position data_df2['AV_pctile'] = data_df2.groupby('Pos').AV.rank(pct=True, method='min', ascending=True) # Get the data for the position we want, in this case it's DE pos_df = data_df2.loc[data_df2.Pos == 'DE'].copy().reset_index(drop=True) # Combine measurables features = [ 'Forty', 'Wt', 'Ht', 'Vertical', 'BenchReps', 'BroadJump', 'Cone', 'Shuttle' ] # what we want to predict target = 'AV_pctile' # Split the data into train and test sets train_df = pos_df.loc[pos_df.Year <= 2011] test_df = pos_df.loc[pos_df.Year.isin([2012, 2013, 2014, 2015])] X = train_df[features].values y = train_df[target].values X_test = test_df[features].values y_test = test_df[target].values # best parameter set pipe = Pipeline([("imputer", Imputer(strategy='median')), ("estimator", RandomForestRegressor(max_features=6, min_samples_split=63, n_estimators=500, random_state=420))]) with warnings.catch_warnings(): warnings.filterwarnings('ignore') #search.fit(X, y) pipe.fit(X, y) estimator = pipe.named_steps['estimator'] imputer = pipe.named_steps['imputer'] # create our SHAP explainer shap_explainer = shap.TreeExplainer(estimator) test_X_imp = imputer.transform(X_test) # calculate the shapley values for our test set test_shap_vals = shap_explainer.shap_values(test_X_imp) # load JS in order to use some of the plotting functions from the shap # package in the notebook shap.initjs() test_X_imp = imputer.transform(X_test) test_X_imp_df = pd.DataFrame(test_X_imp, columns=features) # plot the explanation for a single prediction #shap.force_plot(test_shap_vals[0, :], test_X_imp_df.iloc[0, :]) #shap.force_plot(test_X_imp_df.iloc[0, :], test_shap_vals[0, :]) # visualize the first prediction's explanation shap.force_plot(shap_explainer.expected_value, test_shap_vals[0, :], test_X_imp_df.iloc[0, :])
def main(): st.sidebar.info('This app is created to predict CO2 Solubility in Brine') st.sidebar.success('https://www.pycaret.org') add_selectbox = st.sidebar.selectbox( "How would you like to predict?", ("Single value prediction", "Multiple value prediction")) st.title("CO2 Solubility in Brine Prediction App") st.subheader("Created by: Khoirrashif") image_CCS = Image.open('CCS.jpg') st.image(image_CCS, use_column_width=False) st.text("(Image source: Global CCS Institute)") st.set_option('deprecation.showPyplotGlobalUse', False) ##Single value Prediction if add_selectbox == 'Single value prediction': mNaCl = st.number_input('mNaCl (mol/kg) | min = 0.016 mol/kg, max = 6.14 mol/kg',value=3.25, min_value=0.016, max_value=6.14) #input mNaCl Pressure = st.number_input('Pressure (bar) | min = 0.98 bar, max = 1400.00 bar',value=500.00, min_value=0.98, max_value=1400.00) #input Pressure Temperature = st.number_input('Temperature (K) | min = 273.15 K, max = 723.15 K', value=425.00,min_value=273.15, max_value=723.15) #input Temperature output="" input_dict = {'mNaCl (mol/kg)': mNaCl, 'Pressure (bar)': Pressure, 'Temperature (K)': Temperature} input_df = pd.DataFrame([input_dict]) if st.button("Predict"): output = predict(model = model, input_df = input_df) output = str(output) + ' mol/kg' st.success('The CO2 solubility is {}'.format(output)) ##Multiple value Prediction if add_selectbox == 'Multiple value prediction': file_upload = st.file_uploader("Upload csv file for predictions", type=["csv"]) if file_upload is not None: data = pd.read_csv(file_upload) prediction = predict_model(estimator=model, data=data) st.write(prediction) shap.initjs() # train a CatBoost model on the data and predicted labels X,y = data, prediction['Label'] #mod = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=y), 100) mod = CatBoostRegressor(iterations=100, learning_rate=0.1, random_seed=123) mod.fit(X, y, verbose=False, plot=False) # explain the model's predictions using SHAP # (same syntax works for LightGBM, CatBoost, scikit-learn and spark models) explainer = shap.TreeExplainer(mod) shap_values = explainer.shap_values(X) st.title("Feature Importance and Prediction Explanation based on the SHAP values") st.write("For a complete explanation of SHAP (SHapley Additive exPlanations) values and their impact on machine learning model interpretability, please refer to Lundberg and Lee (2016) and their GitHub (https://github.com/slundberg/shap/blob/master/README.md)") st.header("Total distribution of observations based on the SHAP values, colored by Target Value") st.write("The plot below sorts features by the sum of SHAP value magnitudes over all samples, and uses SHAP values to show the distribution of the impacts each feature has on the model output. The colour represents the feature value (e.g. red indicates a high feature value, while blue indicates a low one).") #st.write("and uses SHAP values to show the distribution of the impacts each feature has on the model output.") #st.write("The colour represents the feature value (e.g: red shows high impact, while blue shows negative impact.") shap.summary_plot(shap_values, X) st.pyplot() plt.clf() st.header("Feature Importance according to the SHAP values (simplified plot)") st.write("The following plot is the simplified version of the plot above, built by taking the mean absolute value of the SHAP values for each feature. It also shows the feature importance in descending order and highlights the correlation in colours.") #st.write("for each feature to get a standard bar plot.") ABS_SHAP(shap_values, X) st.pyplot() plt.clf() st.header("Prediction explanation for a single observation") st.write("The following plots are the Individual Force Plots. Each of them shows how each feature pushes the model output away from the base value for a single prediction. Features pushing the prediction higher are shown in red, while those pushing the prediction lower are in blue. A set of examples is provided below for the 3rd, 7th, and 10th observations in the dataset.") st.subheader("Example on the 3rd observation") shap.force_plot(explainer.expected_value, shap_values[3,:], X.iloc[3,:], matplotlib=True, show=False, figsize=(16,5)) st.pyplot() plt.clf() st.subheader("Example on the 7th observation") shap.force_plot(explainer.expected_value, shap_values[7,:], X.iloc[7,:], matplotlib=True, show=False, figsize=(16,5)) st.pyplot() plt.clf() st.subheader("Example on the 10th observation") shap.force_plot(explainer.expected_value, shap_values[10,:], X.iloc[10,:], matplotlib=True, show=False, figsize=(16,5)) st.pyplot() plt.clf() #st.header("Prediction explanation for the entire dataset") #st.write("The plot below is the Collective Force Plot. It is built by rotating the individual force plots 90 degrees and stacking them horizontally for the entire dataset.") #st_shap(shap.force_plot(explainer.expected_value, shap_values, X), 400) st.header("Dependence plots for each feature") st.write("The following plots are partial dependence plots, each of which shows the marginal effect one or two features have on the predicted outcome of a machine learning model (J.H. Friedman, 2001). The partial dependence plot tells whether the relationship between the target and a feature is linear, monotonic or more complex.") st.subheader("Pressure") shap.dependence_plot("Pressure (bar)",shap_values,X,show=False) st.pyplot() plt.clf() st.subheader("Temperature") shap.dependence_plot("Temperature (K)",shap_values,X,show=False) st.pyplot() plt.clf() st.subheader("mNaCl") shap.dependence_plot("mNaCl (mol/kg)",shap_values,X,show=False) st.pyplot() plt.clf()
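# The Streamlit app above calls an ABS_SHAP(shap_values, X) helper whose definition is not shown.
# A hypothetical sketch of such a helper: a horizontal bar chart of mean |SHAP| per feature,
# coloured by the sign of the feature/SHAP correlation (red positive, blue negative).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def ABS_SHAP(df_shap, df):
    corr = [np.corrcoef(df_shap[:, i], df.iloc[:, i].values)[0, 1] for i in range(df.shape[1])]
    k = pd.DataFrame({
        "Variable": df.columns,
        "SHAP_abs": np.abs(df_shap).mean(axis=0),
        "Sign": ["red" if c >= 0 else "blue" for c in corr],
    }).sort_values("SHAP_abs")
    k.plot.barh(x="Variable", y="SHAP_abs", color=list(k["Sign"]), legend=False, figsize=(8, 6))
    plt.xlabel("mean |SHAP value|")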
def get_Kbest_feature_lgb(train_x, train_y, val_x, val_y, method="gain", span=(0, 1000, 1), sorted_feature_list=None, verbose=True): """get feature importances by lgb model, supported methods included: ["split","gain","shap"] "span" is a list with start, end and step index "sorted_feature_list" is a feature importance list if provided """ assert span[1] > span[0], "span should be a 3-gram tuple, span[1] > span[0]" span = list(span) span[1] = min((train_x.shape[1], span[1])) score_ls = [] num_feature_ls = [] eli_cols = [] import lightgbm as lgb import gc params = { # "max_bin": 512, "learning_rate": 0.01, "boosting_type": "gbdt", "objective": "binary", "metric": "auc", "num_leaves": 31, "max_depth": -1, "verbose": 200, "subsample": 0.8, "colsample_bytree": 0.9, "subsample_freq": 1, "reg_alpha": 0, "min_child_weight": 25, "random_state": 2018, "reg_lambda": 1, "n_jobs": -1, } if sorted_feature_list is None: d_train = lgb.Dataset(train_x, label=train_y) d_test = lgb.Dataset(val_x, label=val_y) print("begin to train ") clf_lgb = lgb.train(params, d_train, 4000, valid_sets=[d_train, d_test], early_stopping_rounds=100, verbose_eval=200) pre_score_val = clf_lgb.best_score["valid_1"]["auc"] pre_score_train = clf_lgb.best_score["training"]["auc"] score_ls.append(pre_score_val) num_feature_ls.append(span[1] + 1) if method == "gain": feature_importances_gain = sorted(zip( train_x.columns, clf_lgb.feature_importance(importance_type="gain")), key=lambda x: x[1], reverse=True) feature_importances = pd.DataFrame( [list(f) for f in feature_importances_gain], columns=["features", "importance"]) elif method == "split": feature_importances_split = sorted(zip( train_x.columns, clf_lgb.feature_importance(importance_type="split")), key=lambda x: x[1], reverse=True) feature_importances = pd.DataFrame( [list(f) for f in feature_importances_split], columns=["features", "importance"]) elif method == "shap": import shap import numpy as np shap.initjs() explainer = shap.TreeExplainer(clf_lgb) # shap_sample = val_x.sample(frac=1.0) shap_sample = train_x.sample(frac=0.6) shap_values = explainer.shap_values(shap_sample) shap.summary_plot(shap_values, shap_sample, plot_type="bar") feature_importances_shap = sorted(zip( train_x.columns, np.mean(np.abs(shap_values), axis=0)), key=lambda x: x[1], reverse=True) feature_importances = pd.DataFrame( [list(f) for f in feature_importances_shap], columns=["features", "importance"]) feature_importances.to_csv("../work/feature_importance_eli_cor.csv", header=True, index=False) del d_test, d_train, clf_lgb gc.collect() if verbose: print(feature_importances) print("feature {} to {}, score {}".format(0, span[1], score_ls[0])) num_turn = max((0, int((span[1] - span[0]) / span[2]))) feature_all = feature_importances["features"].unique().tolist() for i in range(num_turn): print("the {}th turn ".format(i)) num_feature = span[1] - span[2] * (i + 1) temp_features = feature_all[0:num_feature] d_train_temp = lgb.Dataset(train_x[temp_features], label=train_y) d_test_temp = lgb.Dataset(val_x[temp_features], label=val_y) print("begin to train ") clf_temp = lgb.train(params, d_train_temp, 4000, valid_sets=[d_train_temp, d_test_temp], early_stopping_rounds=100, verbose_eval=200) temp_score_val = clf_temp.best_score["valid_1"]["auc"] temp_score_train = clf_temp.best_score["training"]["auc"] if temp_score_val > pre_score_val and temp_score_train > pre_score_train: for f in feature_all[num_feature:num_feature + span[2]]: eli_cols.append(f) print("features do not help:", eli_cols) pre_score_train = 
temp_score_train pre_score_val = temp_score_val score_ls.append(temp_score_val) num_feature_ls.append(num_feature) del d_test_temp, d_train_temp, clf_temp best_score = max(score_ls) best_num_feature = num_feature_ls[score_ls.index(best_score)] if verbose: print("best score {}, best number of feature span {} to {}".format( best_score, 0, best_num_feature)) return feature_all[0:best_num_feature], eli_cols else: feature_importances = sorted_feature_list if verbose: print(feature_importances) num_turn = max((1, int((span[1] - span[0]) / span[2]))) feature_all = feature_importances["features"].unique().tolist() pre_score_val = 0 pre_score_train = 0 for i in range(num_turn): print("the {}th turn ".format(i)) num_feature = span[1] - span[2] * i temp_features = feature_all[0:num_feature] d_train_temp = lgb.Dataset(train_x[temp_features], label=train_y) d_test_temp = lgb.Dataset(val_x[temp_features], label=val_y) print("begin to train ") clf_temp = lgb.train(params, d_train_temp, 4000, valid_sets=[d_train_temp, d_test_temp], early_stopping_rounds=100, verbose_eval=100) temp_score_val = clf_temp.best_score["valid_1"]["auc"] temp_score_train = clf_temp.best_score["training"]["auc"] if i == 0: pre_score_val = temp_score_val pre_score_train = temp_score_train if temp_score_val > pre_score_val and temp_score_train > pre_score_train: for f in feature_all[num_feature:num_feature + span[2]]: eli_cols.append(f) print("features do not help:", eli_cols) pre_score_train = temp_score_train pre_score_val = temp_score_val score_ls.append(temp_score_val) num_feature_ls.append(num_feature) del d_test_temp, d_train_temp, clf_temp best_score = max(score_ls) best_num_feature = num_feature_ls[score_ls.index(best_score)] if verbose: print("best score {}, best number of feature span {} to {}".format( best_score, 0, best_num_feature)) return feature_all[0:best_num_feature], eli_cols
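# A hedged usage sketch for get_Kbest_feature_lgb() above, on a synthetic binary task. It assumes
# a LightGBM 3.x environment (lgb.train still accepting early_stopping_rounds / verbose_eval, as
# the helper does) and that the ../work/ directory the helper writes its importance CSV to exists;
# the column names, sizes and span are illustrative.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=50, n_informative=10, random_state=2018)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
train_x, val_x, train_y, val_y = train_test_split(X, pd.Series(y), test_size=0.2, random_state=2018)

kept, eliminated = get_Kbest_feature_lgb(train_x, train_y, val_x, val_y,
                                         method="shap", span=(0, 50, 5))
print(len(kept), "features kept;", len(eliminated), "flagged as unhelpful")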
print(_+1, "Model Evaluation Result:", "\n", classification_report(y_test, cat_predict)) # 전체적인 성능 평가 bagging_predict = [] # 빈 리스트 생성 for lst2_index in range(X_test.shape[0]): # 테스트 데이터 개수만큼 반복 temp_predict = [] # 반복문 내 임시 빈 리스트 생성 for lst_index in range(len(bagging_predict_result)): # Bagging 결과 리스트 개수 만큼 반복 temp_predict.append(bagging_predict_result[lst_index][lst2_index]) # 각 Bagging 결과 예측한 값 중 같은 인덱스를 리스트에 저장 if np.mean(temp_predict) >= 0.5: # 0, 1 이진분류이므로, 예측값의 평균이 0.5보다 크면 1, 아니면 0으로 예측 다수결) bagging_predict.append(1) elif np.mean(temp_predict) < 0.5: # 예측값의 평균이 0.5보다 낮으면 0으로 결과 저장 bagging_predict.append(0) print("Confusion_Matrix: \n", confusion_matrix(y_test, bagging_predict)) # 혼돈행렬 print('\n') print("Model Evaluation Result: \n", classification_report(y_test, bagging_predict)) # 전체적인 성능 평가 import shap cat_model = CatBoostClassifier(n_estimators = 50, # 50번 추정 max_depth = 10, # 트리 최대 깊이 10 random_state = 42, # 시드값 고정 verbose = True) # 학습 진행 과정 표시 cat_model.fit(X_train_res2, y_train_res2) # 학습 진행 explainer = shap.TreeExplainer(cat_model) # 트리 모델 Shap Value 계산 객체 지정 shap_values = explainer.shap_values(X_test) # Shap Values 계산 shap.initjs() # 자바스크립트 초기화 (그래프 초기화) shap.force_plot(explainer.expected_value, shap_values[0,:], X_test[0,:]) # 첫 번째 검증 데이터 인스턴스에 대해 Shap Value를 적용하여 시각화 # 빨간색이 영향도가 높으며, 파란색이 영향도가 낮음 shap.summary_plot(shap_values, X_test) shap.summary_plot(shap_values, X_test, plot_type = "bar") # 각 변수에 대한 Shap Values의 절대값으로 중요도 파악
def func(): # split data into train and test sets seed = 7 test_size = .25 X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=test_size, random_state=seed) original_col = X_train.columns imp = Imputer(missing_values='NaN', strategy='mean', axis=0) imp.fit(X_train) X_train = imp.transform(X_train) X_test = imp.transform(X_test) # Random Forest regr_rf = RandomForestRegressor(max_depth=30, random_state=2) regr_rf.fit(X_train, y_train) y_pred_train1 = regr_rf.predict(X_train) y_pred1 = regr_rf.predict(X_test) # random forest end # XGBoost xgdmat = xgb.DMatrix(X_train, y_train) our_params={'eta':.03,'seed':0,'subsample':0.8,\ 'colsample_bytree':0.8,'objective':'reg:linear',\ 'max_depth':7,'min_child_weight':.5} final_gb = xgb.train(our_params, xgdmat, num_boost_round=1500) testmat = xgb.DMatrix(X_test) trainmat = xgb.DMatrix(X_train) y_pred2 = final_gb.predict(testmat) y_pred_train2 = final_gb.predict(trainmat) # SVM clf = svm.SVR(kernel='rbf', degree=3, gamma='auto', coef0=0.0, tol=0.1, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1) clf.fit(X_train, y_train) y_pred_train3 = clf.predict(X_train) y_pred3 = clf.predict(X_test) ###### Evaluation ###### # Random Forest mae = mean_absolute_error(y_test.values, y_pred1) print("MAE: %.5f" % mae) rmse = np.sqrt(mean_squared_error(y_test.values, y_pred1)) print("RMSE: %.5f" % rmse) R = np.corrcoef(y_test.values, y_pred1) print("Correlation Coef: %.5f" % R[0, 1]) r2 = r2_score(y_test.values, y_pred1) print("r2 score: %.5f" % r2) # XGBoost mae = mean_absolute_error(y_test.values, y_pred2) print("MAE: %.5f" % mae) rmse = np.sqrt(mean_squared_error(y_test.values, y_pred2)) print("RMSE: %.5f" % rmse) R = np.corrcoef(y_test.values, y_pred2) print("Correlation Coef: %.5f" % R[0, 1]) r2 = r2_score(y_test.values, y_pred2) print("r2 score: %.5f" % r2) # SVM mae = mean_absolute_error(y_test.values, y_pred3) print("MAE: %.5f" % mae) rmse = np.sqrt(mean_squared_error(y_test.values, y_pred3)) print("RMSE: %.5f" % rmse) R = np.corrcoef(y_test.values, y_pred3) print("Correlation Coef: %.5f" % R[0, 1]) r2 = r2_score(y_test.values, y_pred3) print("r2 score: %.5f" % r2) ###### Visualization ###### # plot predict error plt.gcf().set_size_inches((10, 4)) plt.plot(((y_pred1 - y_test.values) / y_test.values)[::8], color='g', marker='*', label='random forest') plt.plot(((y_pred2 - y_test.values) / y_test.values)[::8], color='c', marker='s', markerfacecolor='none', label='XGBoost') plt.plot(((y_pred3 - y_test.values) / y_test.values)[::8], color='y', marker='o', markerfacecolor='none', label='SVM') # plt.gca().legend() plt.legend(loc='upper right') plt.savefig('junk.jpg') # plot training error plt.gcf().set_size_inches((10, 4)) plt.plot(((y_pred_train1 - y_train.values) / y_train.values)[::20], color='g', marker='*', label='random forest') plt.plot(((y_pred_train2 - y_train.values) / y_train.values)[::20], color='c', marker='s', markerfacecolor='none', label='XGBoost') plt.plot(((y_pred_train3 - y_train.values) / y_train.values)[::20], color='y', marker='o', markerfacecolor='none', label='SVM') # plt.gca().legend() plt.legend(loc='upper right') plt.savefig('junk.jpg') # plot predictions on test split plt.gcf().set_size_inches((10, 4)) plt.plot(y_test.values[::3], color='b', label='value') plt.plot(y_pred1[::3], color='g', marker='*', markerfacecolor='none', label='random forest', linestyle='None') plt.plot(y_pred2[::3], color='c', marker='s', markerfacecolor='none', label='XGBoost', linestyle='None') 
plt.plot(y_pred3[::3], color='y', marker='o', markerfacecolor='none', label='SVM', linestyle='None') # plt.gca().legend() plt.legend(loc='upper right') plt.savefig('junk.jpg') # plot predictions on training split plt.gcf().set_size_inches((10, 4)) plt.plot(y_train.values[::10], color='b', label='value') plt.plot(y_pred_train1[::10], color='g', marker='*', markerfacecolor='none', label='random forest', linestyle='None') plt.plot(y_pred_train2[::10], color='c', marker='s', markerfacecolor='none', label='XGBoost', linestyle='None') plt.plot(y_pred_train3[::10], color='y', marker='o', markerfacecolor='none', label='SVM', linestyle='None') # plt.gca().legend() plt.legend(loc='upper right') plt.savefig('junk2.jpg') # shap the value for better visualization shap.initjs() shap_values = shap.TreeExplainer(final_gb).shap_values(X_train) X_train = pd.DataFrame(data=X_train, columns=original_col) X_train = X_train.rename( columns={ "X2": "X7", "X3": "X6", "X4": "X14", "X5": "X4", "X6": "X8", "X7": "X9", "X8": "X10", "X9": "X12", "X10": "X11", "X11": "X13", "X12": "X5", "X13": "X1", "X14": "X2", "X15": "X3" }) shap.summary_plot(shap_values, X_train)
def multiclass_s_lightGBM(merge_data3, outnameimp, outname, dayname, taitol_1): # 目的変数を分離 X = merge_data3.drop("target", axis=1).values y = merge_data3["target"].values columns_name = merge_data3.drop("target", axis=1).columns # 分類するための関数を定義 0508日 ここをシンプルにしたい # 訓練用のデータと、テスト用のデータに分ける関数 ~2019.3 でtrain, valid.シャッフルせずクロスバリデーション予定 def Test_data_and_training_data_split(df, X, Y): from sklearn.model_selection import train_test_split N_train = int(len(df) * 0.86) N_test = len(df) - N_train X_train, X_test, y_train, y_test = \ train_test_split(X, Y, test_size=N_test,shuffle=False) return X_train, X_test, y_train, y_test # 訓練用のデータと、テスト用のデータに分ける関数実行 X_train, X_test, y_train, y_test = Test_data_and_training_data_split( merge_data3, X, y) X_trainpp = pd.DataFrame(X_train, columns=columns_name) #X_test = pd.DataFrame(X_test, columns=columns_name) #pd.DataFrame に戻して 縦に train val 結合していく y_trainp = pd.DataFrame(y_train) X_trainp = pd.DataFrame(X_trainpp) train = pd.concat([y_trainp, X_trainp], axis=1) print("train shape", train.shape) print("train", train) merge_data_p = train.rename(columns={0: "target"}) #train_dataをクロスバリデーション # 目的変数を分離 X = merge_data_p.drop("target", axis=1).values y = merge_data_p["target"].values columns_name = merge_data_p.drop("target", axis=1).columns from sklearn.model_selection import train_test_split # 分類するための関数を定義 シャッフル実施 def Test_data_and_training_data_split(df, X, Y): N_train = int(len(df) * 0.80) N_test = len(df) - N_train X_train, X_test, y_train, y_test = \ train_test_split(X, Y, test_size=N_test,random_state=42) return X_train, X_test, y_train, y_test # 訓練用のデータと、テスト用のデータに分ける関数実行 X_train, X_val, y_train, y_val = Test_data_and_training_data_split( merge_data_p, X, y) #X_train = pd.DataFrame(X_train, columns=columns_name) #X_val = pd.DataFrame(X_val, columns=columns_name) # shape 確認 print("train shape", X_train.shape) print("X_train", X_train) print("test shape", X_test.shape) print("validation shape", X_val.shape) # shape 確認 print("y_train shape", y_train.shape) print("y_test shape", y_test.shape) print("y_validation shape", y_val.shape) print("y_val", y_val) import lightgbm as lgb #shap import shap shap.initjs() # データセットを作成 train = lgb.Dataset(X_train, label=y_train) valid = lgb.Dataset(X_val, label=y_val) # モデルのパラメータを設定 # パラメータを設定 params = { 'reg_lambda': 0.2, 'objective': 'multiclass', 'metric': 'multi_logloss', 'num_class': 8, 'reg_alpha': 0.1, 'min_data_leaf': 100, 'learning_rate': 0.025, # 'feature_fraction': 0.8, # 'bagging_fraction': 0.8 } # モデルを訓練 model = lgb.train(params, train, valid_sets=valid, num_boost_round=5000, early_stopping_rounds=500) # 予測 y_pred = model.predict(X_test, num_iteration=model.best_iteration) y_pred = np.argmax(y_pred, axis=1) #--------------------------モデルの評価----------------------------------------------- from sklearn.metrics import confusion_matrix from sklearn.metrics import cohen_kappa_score #shap explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(X_test) shap.summary_plot(shap_values, X_test, plot_type="bar") # 混合行列を作成 result_matrix = pd.DataFrame(confusion_matrix(y_test, y_pred)) # クラス毎の正解率を計算 class_accuracy = [(result_matrix[i][i] / result_matrix[i].sum()) * 1 for i in range(len(result_matrix))] result_matrix[7] = class_accuracy # Accuracy を計算する accuracy = sum(y_test == y_pred) / len(y_test) print('accuracy:', accuracy) #kappa係数を計算 kappa = cohen_kappa_score(y_test, y_pred) print("kappa score:", kappa) result_matrix.to_csv(r"" + "./output/" + dayname + '/' + "result_matrix" + taitol_1 + outname + '.csv', 
encoding='shift-jis') from sklearn.metrics import accuracy_score accuracy_score1 = accuracy_score(y_test, y_pred) rezurt_1 = pd.DataFrame({ taitol_1 + 'accuracy_score': accuracy_score1, taitol_1 + "y_test": y_test, taitol_1 + "y_pred": y_pred }) #rezurt_1[taitol_1+"y_train"] =y_train #rezurt_1[taitol_1+"y_val"] =y_val rezurt_1.to_csv(r"" + "./output/" + dayname + '/' + "rezurt_1" + taitol_1 + outname + '.csv', encoding='shift-jis') importance = pd.DataFrame(model.feature_importance(), columns=[taitol_1 + 'importance']) display(importance) importance.to_csv(r"" + "./output/" + dayname + '/' + "importance" + taitol_1 + outname + '.csv', encoding='shift-jis')
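# With the legacy shap API used in multiclass_s_lightGBM() above, TreeExplainer.shap_values on a
# multiclass booster returns one (n_samples, n_features) array per class, so the bar summary can
# be broken down per class. A hedged fragment that could sit inside the function right after the
# existing summary_plot call (it reuses the function's shap_values, X_test and columns_name):
for k, class_shap in enumerate(shap_values):     # beeswarm for each class index k
    shap.summary_plot(class_shap, X_test, feature_names=columns_name)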
def compute_predictor_importance(): shap.initjs() explainer = shap.KernelExplainer(model.predict_proba, test_X[0:100, :]) shap_values = explainer.shap_values(test_X[0:100, :]) shap.summary_plot(shap_values, test_X[0:100, :], plot_type="bar")
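# compute_predictor_importance() above runs KernelExplainer against 100 raw background rows, which
# is the slow path. A hedged variant that summarises the background with shap.kmeans and caps the
# number of coalition samples; model and test_X are assumed to exist exactly as in the function.
import shap

background = shap.kmeans(test_X[0:100, :], 10)                 # 10 weighted background points
explainer = shap.KernelExplainer(model.predict_proba, background)
shap_values = explainer.shap_values(test_X[0:100, :], nsamples=200)
shap.summary_plot(shap_values, test_X[0:100, :], plot_type="bar")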
def train(train_data, test_data=None): G = train_data[0] # G 是一个Networkx里的对象,这几个都是经过load_data()处理过的 features = train_data[1] id_map = train_data[2] class_map = train_data[4] class_map2 = train_data[5] class_map3 = train_data[6] #class_map = class_map hierarchy = FLAGS.hierarchy degreelist = [] countnode = 0 sumedge = 0 for key in G.edge: if len(G.edge[key]) > 1: countnode += 1 sumedge = sumedge + len(G.edge[key]) degreelist.append(key) avg_edge = sumedge/countnode if features is not None: # pad with dummy zero vector features = np.vstack([features, np.zeros((features.shape[1],))]) features = tf.cast(features, tf.float32) for hi_num in range(hierarchy): if hi_num == 0: class_map = class_map if isinstance(list(class_map.values())[0], list): num_classes = len(list(class_map.values())[0]) else: num_classes = len(set(class_map.values())) class_map_ko_0 = construct_class_numpy(class_map) elif hi_num == 1: class_map = class_map2 if isinstance(list(class_map.values())[0], list): num_classes = len(list(class_map.values())[0]) else: num_classes = len(set(class_map.values())) class_map_ko_1 = construct_class_numpy(class_map) elif hi_num == 2: class_map = class_map3 if isinstance(list(class_map.values())[0], list): num_classes = len(list(class_map.values())[0]) else: num_classes = len(set(class_map.values())) class_map_ko_2 = construct_class_numpy(class_map) class_map_ko = construct_class_numpy(class_map) OTU_ko_num = class_map_ko.sum(axis=1) ko_samle_num = class_map_ko.sum(axis=0) count = 0 for num in OTU_ko_num: if num < 100: count += 1 ko_cb = construct_class_para(class_map_ko, 0, FLAGS.beta1) ko_cb = tf.cast(ko_cb, tf.float32) f1_par = construct_class_para(class_map_ko, 1, FLAGS.beta2) context_pairs = train_data[3] if FLAGS.random_context else None placeholders = construct_placeholders(num_classes) minibatch = NodeMinibatchIterator(G, id_map, placeholders, class_map, num_classes, batch_size=FLAGS.batch_size, max_degree=FLAGS.max_degree, context_pairs=context_pairs) ctrain = 0 cval =0 ctest = 0 for i in minibatch.train_nodes: if i in degreelist: ctrain += 1 for i in minibatch.val_nodes: if i in degreelist: cval += 1 for i in minibatch.test_nodes: if i in degreelist: ctest += 1 #pdb.set_trace() with open('test_nodes.txt', 'w') as f: json.dump(minibatch.test_nodes, f) ########### list_node = minibatch.nodes for otu in minibatch.train_nodes: if otu in list_node: list_node.remove(otu) for otu in minibatch.val_nodes: if otu in list_node: list_node.remove(otu) for otu in minibatch.test_nodes: if otu in list_node: list_node.remove(otu) ########### if hi_num == 0: adj_info_ph = tf.placeholder(tf.int32, shape=minibatch.adj.shape) # 把adj_info设成Variable应该是因为在训练和测试时会改变adj_info的值,所以 # 用Varible然后用tf.assign()赋值。 adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info") shap.initjs() if FLAGS.model == 'graphsage_mean': # Create model sampler = UniformNeighborSampler(adj_info) if FLAGS.samples_3 != 0: layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2), SAGEInfo("node", sampler, FLAGS.samples_3, FLAGS.dim_2)] elif FLAGS.samples_2 != 0: layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)] else: layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1)] model = SupervisedGraphsage(num_classes, placeholders, features, adj_info, minibatch.deg, # 每一个的度 layer_infos, ko_cb, hi_num, model_size=FLAGS.model_size, sigmoid_loss=FLAGS.sigmoid, 
identity_dim=FLAGS.identity_dim, logging=True, concat=False ) elif FLAGS.model == 'gcn': # Create model sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, 2 * FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, 2 * FLAGS.dim_2)] model = SupervisedGraphsage(num_classes, placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, aggregator_type="gcn", model_size=FLAGS.model_size, concat=False, sigmoid_loss=FLAGS.sigmoid, identity_dim=FLAGS.identity_dim, logging=True) elif FLAGS.model == 'graphsage_seq': sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)] model = SupervisedGraphsage(num_classes, placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, aggregator_type="seq", model_size=FLAGS.model_size, sigmoid_loss=FLAGS.sigmoid, identity_dim=FLAGS.identity_dim, logging=True, concat=True) elif FLAGS.model == 'graphsage_maxpool': sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)] model = SupervisedGraphsage(num_classes, placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, aggregator_type="maxpool", model_size=FLAGS.model_size, sigmoid_loss=FLAGS.sigmoid, identity_dim=FLAGS.identity_dim, logging=True, concat=True) elif FLAGS.model == 'graphsage_meanpool': sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)] model = SupervisedGraphsage(num_classes, placeholders, features, adj_info, minibatch.deg, layer_infos=layer_infos, aggregator_type="meanpool", model_size=FLAGS.model_size, sigmoid_loss=FLAGS.sigmoid, identity_dim=FLAGS.identity_dim, logging=True, concat=True) elif FLAGS.model == 'gat': sampler = UniformNeighborSampler(adj_info) # 建立两层网络 采样邻居、邻居个数、输出维度 layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1), SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)] model = SupervisedGraphsage(num_classes, placeholders, features, adj_info, minibatch.deg, concat=True, layer_infos=layer_infos, aggregator_type="gat", model_size=FLAGS.model_size, sigmoid_loss=FLAGS.sigmoid, identity_dim=FLAGS.identity_dim, logging=True, ) else: raise Exception('Error: model name unrecognized.') config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement) config.gpu_options.allow_growth = True config.gpu_options.per_process_gpu_memory_fraction = GPU_MEM_FRACTION config.allow_soft_placement = True # Initialize session sess = tf.Session(config=config) # sess = tf_dbg.LocalCLIDebugWrapperSession(sess) #merged = tf.summary.merge_all() # 将所有东西保存到磁盘,可视化会用到 #summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) # 记录信息,可视化,可以用tensorboard查看 # Init variables sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj}) #sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph2: minibatch2.adj}) # Train model total_steps = 0 avg_time = 0.0 epoch_val_costs = [] epoch_val_costs2 = [] # 这里minibatch.adj和minibathc.test_adj的大小是一样的,只不过adj里面把不是train的值都变成一样 # val在这里是validation的意思,验证 train_adj_info = tf.assign(adj_info, minibatch.adj) # tf.assign()是为一个tf.Variable赋值,返回值是一个Variable,是赋值后的值 val_adj_info = tf.assign(adj_info, minibatch.test_adj) # assign()是一个Opration,要用sess.run()才能执行 it = 0 train_loss = [] val_loss = 
[] train_f1_mics = [] val_f1_mics = [] loss_plt = [] loss_plt2 = [] trainf1mi = [] trainf1ma = [] valf1mi = [] valf1ma = [] iter_num = 0 for epoch in range(FLAGS.epochs*2): if epoch < FLAGS.epochs: minibatch.shuffle() iter = 0 print('Epoch: %04d' % (epoch + 1)) epoch_val_costs.append(0) while not minibatch.end(): # Construct feed dictionary # 通过改变feed_dict来改变每次minibatch的节点 feed_dict, labels = minibatch.next_minibatch_feed_dict() # feed_dict是mibatch修改过的placeholder feed_dict.update({placeholders['dropout']: FLAGS.dropout}) t = time.time() # Training step outs = sess.run([model.opt_op, model.loss, model.preds], feed_dict=feed_dict) train_cost = outs[1] iter_num = iter_num + 1 loss_plt.append(float(train_cost)) if iter % FLAGS.print_every == 0: # Validation 验证集 sess.run(val_adj_info.op) # sess.run() fetch参数是一个Opration,代表执行这个操作。 if FLAGS.validate_batch_size == -1: val_cost, val_f1_mic, val_f1_mac, duration, otu_lazy, _, val_preds, __, val_accuracy, val_mi_roc_auc = incremental_evaluate(sess, model, minibatch, f1_par, FLAGS.batch_size) else: val_cost, val_f1_mic, val_f1_mac, duration, val_accuracy, val_mi_roc_auc = evaluate(sess, model, minibatch, f1_par, FLAGS.validate_batch_size) sess.run(train_adj_info.op) # 每一个tensor都有op属性,代表产生这个张量的opration。 epoch_val_costs[-1] += val_cost #if iter % FLAGS.print_every == 0: #summary_writer.add_summary(outs[0], total_steps) # Print results avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1) loss_plt2.append(float(val_cost)) valf1mi.append(float(val_f1_mic)) valf1ma.append(float(val_f1_mac)) if iter % FLAGS.print_every == 0: train_f1_mic, train_f1_mac, train_f1_none, train_accuracy, train_mi_roc_auc = calc_f1(labels, outs[-1], f1_par) trainf1mi.append(float(train_f1_mic)) trainf1ma.append(float(train_f1_mac)) print("Iter:", '%04d' % iter, # 训练集上的损失函数等信息 "train_loss=", "{:.5f}".format(train_cost), "train_f1_mic=", "{:.5f}".format(train_f1_mic), "train_f1_mac=", "{:.5f}".format(train_f1_mac), "train_accuracy=", "{:.5f}".format(train_accuracy), "train_ra_mi=", "{:.5f}".format(train_mi_roc_auc), # 在测试集上的损失函数值等信息 "val_loss=", "{:.5f}".format(val_cost), "val_f1_mic=", "{:.5f}".format(val_f1_mic), "val_f1_mac=", "{:.5f}".format(val_f1_mac), "val_accuracy=", "{:.5f}".format(val_accuracy), "val_ra_mi=", "{:.5f}".format(val_mi_roc_auc), "time=", "{:.5f}".format(avg_time)) train_loss.append(train_cost) val_loss.append(val_cost) train_f1_mics.append(train_f1_mic) val_f1_mics.append(val_f1_mic) iter += 1 total_steps += 1 if total_steps > FLAGS.max_total_steps: break if total_steps > FLAGS.max_total_steps: break ################################################################################################################### # begin second degree training ################################################################################################################### print("Optimization Finished!") sess.run(val_adj_info.op) if hi_num == 1: last_preds = test_preds last_labels = test_labels val_cost, val_f1_mic, val_f1_mac, duration, otu_f1, ko_none, test_preds, test_labels, test_accuracy, test_mi_roc_auc = incremental_evaluate(sess, model, minibatch, f1_par, FLAGS.batch_size, test=True) print("Full validation stats:", "loss=", "{:.5f}".format(val_cost), "f1_micro=", "{:.5f}".format(val_f1_mic), "f1_macro=", "{:.5f}".format(val_f1_mac), "accuracy=", "{:.5f}".format(test_accuracy), "roc_auc_mi=", "{:.5f}".format(test_mi_roc_auc), "time=", "{:.5f}".format(duration),) pred = y_ture_pre(sess, model, minibatch, FLAGS.batch_size) for i in 
range(pred.shape[0]): sum = 0 for l in range(pred.shape[1]): sum = sum + pred[i, l] for m in range(pred.shape[1]): pred[i, m] = pred[i, m]/sum id = json.load(open(FLAGS.train_prefix + "-id_map.json")) # x_train = np.empty([pred.shape[0], array.s) num = 0 session = tf.Session() array = session.run(features) x_test = np.empty([pred.shape[0], array.shape[1]]) x_train = np.empty([len(minibatch.train_nodes), array.shape[1]]) for node in minibatch.val_nodes: x_test[num] = array[id[node]] num = num + 1 num1 = 0 for node in minibatch.train_nodes: x_train[num1] = array[id[node]] num1 = num1 + 1 with open(log_dir() + "val_stats.txt", "w") as fp: fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f} time={:.5f}". format(val_cost, val_f1_mic, val_f1_mac, duration)) print("Writing test set stats to file (don't peak!)") val_cost, val_f1_mic, val_f1_mac, duration, otu_lazy, ko_none, _, __, test_accuracy, test_mi_roc_auc = incremental_evaluate(sess, model, minibatch, f1_par, FLAGS.batch_size, test=True) with open(log_dir() + "test_stats.txt", "w") as fp: fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f}". format(val_cost, val_f1_mic, val_f1_mac)) incremental_evaluate_for_each(sess, model, minibatch, FLAGS.batch_size, test=True) ################################################################################################################## # plot loss plt.figure() plt.plot(loss_plt, label='train_loss') plt.plot(loss_plt2, label='val_loss') plt.legend(loc=0) plt.xlabel('Iteration') plt.ylabel('loss') plt.title('Loss plot') plt.grid(True) plt.axis('tight') #plt.savefig("./graph/HMC_SAGE_CB_loss.png") # plt.show() # plot f1 score plt.figure() plt.subplot(211) plt.plot(trainf1mi, label='train_f1_micro') plt.plot(valf1mi, label='val_f1_micro') plt.legend(loc=0) plt.xlabel('Iterations') plt.ylabel('f1_micro') plt.title('train_val_f1_score') plt.grid(True) plt.axis('tight') plt.subplot(212) plt.plot(trainf1ma, label='train_f1_macro') plt.plot(valf1ma, label='val_f1_macro') plt.legend(loc=0) plt.xlabel('Iteration') plt.ylabel('f1_macro') plt.grid(True) plt.axis('tight') # plt.savefig("./graph/HMC_SAGE_CB_f1.png") # plt.show() plt.figure() plt.plot(np.arange(len(train_loss)) + 1, train_loss, label='train') plt.plot(np.arange(len(val_loss)) + 1, val_loss, label='val') plt.legend() plt.savefig('loss.png') plt.figure() plt.plot(np.arange(len(train_f1_mics)) + 1, train_f1_mics, label='train') plt.plot(np.arange(len(val_f1_mics)) + 1, val_f1_mics, label='val') plt.legend() #plt.savefig('f1.png') # OTU f1 plt.figure() plt.plot(otu_f1, label='otu_f1') plt.legend(loc=0) plt.xlabel('OTU') plt.ylabel('f1_score') plt.title('OTU f1 plot') plt.grid(True) plt.axis('tight') #plt.savefig("./graph/below_1500_CECB15_otu_f1.png") # plt.show() # Ko f1 score plt.figure() plt.plot(ko_none, label='Ko f1 score') plt.legend(loc=0) plt.xlabel('Ko') plt.ylabel('f1_score') plt.grid(True) plt.axis('tight') #plt.savefig("./graph/below1500_CECB15_ko_f1.png") bad_ko = [] b02 = 0 b05 = 0 b07 = 0 for i in range(len(ko_none)): if ko_none[i] < 0.2: bad_ko.append(i) b02 += 1 elif ko_none[i] < 0.5: b05 += 1 elif ko_none[i] < 0.7: b07 += 1 print("ko f1 below 0.2:", b02) print("ko f1 below 0.5:", b05) print("ko f1 below 0.7:", b07) print("ko f1 over 0.7:", num_classes-b02-b05-b07) bad_ko = np.array(bad_ko) with open('./new_data_badko/graph7 ko below zero point two .txt', 'w') as f: np.savetxt(f, bad_ko, fmt='%d', delimiter=",") workbook = xlwt.Workbook() sheet = workbook.add_sheet("sample_performance") for row in range(num_classes): 
sheet.write(row, 0, str(ko_samle_num[row])) sheet.write(row, 1, str(train_f1_none[row])) workbook.save('./graph/sample_performance11.xls')
def heart_disease_risk_factors(model, patient):
    """Render a SHAP force plot of the individual risk factors for one patient."""
    import shap

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(patient)
    shap.initjs()
    # Index [1] selects the SHAP values / expected value of the positive class.
    return shap.force_plot(explainer.expected_value[1], shap_values[1], patient)
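# Hedged usage sketch (not in the original source): how the helper above might be
# called. `heart_X` / `heart_y` are hypothetical stand-ins for the heart-disease
# feature matrix and labels; the [1] indexing in the helper assumes a per-class SHAP
# output, e.g. a fitted RandomForestClassifier with an older shap release.
#
#   from sklearn.ensemble import RandomForestClassifier
#   rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(heart_X, heart_y)
#   heart_disease_risk_factors(rf, heart_X.iloc[[0]])  # force plot for the first patient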
import numpy as np
import shap
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

np.set_printoptions(precision=2, suppress=True)

# Fit a CatBoost regressor on the Boston housing data and explain it with SHAP.
X, y = shap.datasets.boston()
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.8, random_state=12)

model = CatBoostRegressor(iterations=700,
                          learning_rate=0.001,
                          eval_metric='RMSE',
                          random_seed=12,
                          silent=True)
model.fit(X_train, y_train, eval_set=(X_validation, y_validation), plot=True)

shap.initjs()
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Force plot for a single observation, then for the whole dataset.
shap.force_plot(explainer.expected_value, shap_values[0, :], X.iloc[0, :], matplotlib=True)
shap.force_plot(explainer.expected_value, shap_values, X)
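# Note (not from the original source): shap.datasets.boston() has been removed from
# recent shap releases, since the Boston housing dataset was withdrawn upstream. If
# the snippet above fails with a newer shap, a near drop-in replacement is the
# California housing dataset -- a hedged sketch, assuming shap.datasets.california()
# is available in the installed version:
#
#   X, y = shap.datasets.california()
#   X_train, X_validation, y_train, y_validation = train_test_split(
#       X, y, train_size=0.8, random_state=12)
#   # ...the CatBoostRegressor / TreeExplainer code above then works unchanged.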
def Proof_of_concept(self, clf, scaler, explainer, imputer, imputer_raw,
                     X, ids_events, ts, t, plot=True):
    """Make dynamic plots for local feature importance / predictions."""
    import numpy as np
    import pandas as pd
    import shap
    import matplotlib.pyplot as plt

    shap.initjs()

    # all available indexes (one id per patient/event):
    idxs = np.unique(X[:, -1])

    for i in range(len(idxs)):
        print('\n Patient', i)
        # try:
        idx = idxs[i]  # id of the current patient
        patient = X[np.where(X[:, -1] == idx)][:, :-1]
        print(patient.shape)

        total_features = make_total_features(self.features, self.specs)
        print(total_features.shape)
        patient = pd.DataFrame(patient, columns=total_features)

        if label == 'pos':
            print('to ICU:', t_event)
        else:
            print('discharge:', t_event)

        # Calculate model risks, plot SHAP force plots.
        predictions = predict(clf, X)

        # Find the time steps with the largest jumps in predicted risk.
        diff = []
        if len(predictions) > 2:
            for p in range(len(predictions) - 1):
                diff.append(np.abs(predictions[p + 1] - predictions[p]))
        diff = np.asarray(diff)
        n = len(predictions)
        if label == 'pos':
            diff_idx = diff.argsort()[-(n - 1):]
        else:
            diff_idx = diff.argsort()[-3:]

        # Append the measurement unit to every feature name.
        feature_inc_units = []
        for feature in features_tot:
            feature_inc_units.append(feature + ' ' + self.dict_unit[feature])
        feature_inc_units = np.asarray(feature_inc_units)

        count = 1
        if plot:
            for idx in diff_idx:
                # new_base_value = np.log(t / (1 - t))  # the logit function
                shap_display = shap.force_plot(
                    explainer.expected_value[1],
                    # new_base_value,
                    # link='logit',
                    explainer.shap_values(X[idx + 1, :])[1],
                    features=np.round(X_raw.iloc[idx + 1, :], 2),
                    feature_names=feature_inc_units,
                    text_rotation=30,
                    matplotlib=True, show=False,
                    # plot_cmap=["#FF5733", "#335BFF"]
                )
                plt.savefig('results/POC_plot_FORCE_' + str(i) + '_' + str(count) + '.png',
                            bbox_inches='tight', dpi=300)
                count += 1

        # Calculate feature impacts (per-sample SHAP values of the positive class).
        feature_impacts = list()
        for j in range(X.shape[0]):
            feature_impacts.append(explainer.shap_values(X[j, :])[1])
        feature_impacts = np.array([np.array(x) for x in feature_impacts])
        feature_impacts = pd.DataFrame(feature_impacts)
        feature_impacts.columns = features_tot

        # Calculate the NEWS score for every imputed raw observation.
        news = []
        for v in range(X_raw_imputed.shape[0]):
            a = NEWS(X_raw_imputed.loc[v, 'SpO2'],
                     X_raw_imputed.loc[v, 'HR'],
                     X_raw_imputed.loc[v, 'BP'],
                     X_raw_imputed.loc[v, 'RR'],
                     X_raw_imputed.loc[v, 'Temp'])
            news.append(a)

        # 'Global' SHAPs for this specific patient: keep the 8 most impactful features,
        # excluding feature indices 0 and 1.
        shap_values = explainer.shap_values(X)
        shap_mean = np.mean(np.abs(shap_values[1]), axis=0)
        sorted_idx = shap_mean.argsort()
        sorted_idx = list(sorted_idx)
        sorted_idx.remove(0)
        sorted_idx.remove(1)
        sorted_idx = np.asarray(sorted_idx)
        features_to_plot = features_tot[sorted_idx][-8:]

        if plot:
            plt = subplot(X_raw, ts, predictions, news, features_to_plot, i, t_event,
                          feature_impacts, label, t, self.dict_unit, self.specs)

        if i == 0:
            X_overall = X
        else:
            X_overall = np.concatenate([X_overall, X], axis=0)
        # except:
        #     print('patient', i, ' too short')

    return X_overall