def run_exp_02_logistic_regression_pd(model_path, retrain_model=False):
    """Run the logistic-regression experiment: train, test, then explain.

    Parameters
    ----------
    model_path :
        Location the trained model is saved to and later loaded from.
    retrain_model : bool
        Forwarded to ``train_logistic_regression`` as ``recompute``.
    """
    # --- training split: one-hot encoded, missing values zero-filled in place
    train_features, train_labels = dataloader.get_train_ohe(dask=False)
    train_features.fillna(0, inplace=True)

    # --- fit the model on the training data
    from project.experiments.training_scripts.train_02_logistic_regression import train_logistic_regression
    train_logistic_regression(train_features, train_labels,
                              save_to=model_path, recompute=retrain_model)

    # --- evaluate on the held-out split, preprocessed the same way
    test_features, test_labels = dataloader.get_test_ohe(dask=False)
    test_features.fillna(0, inplace=True)

    from project.experiments.testing_scripts.test_classification import test_classification
    test_classification(test_features, test_labels, model_path)

    # --- interpretation: plain model weights, sorted ascending by weight
    import eli5
    import joblib
    fitted_model = joblib.load(model_path)
    weights_table = eli5.explain_weights_df(
        fitted_model, feature_names=test_features.columns.values)
    weights_table.sort_values('weight', inplace=True)
    print(weights_table.head())
def main():
    """Train a LightGBM classifier and dump eli5 permutation importances to CSV.

    The importances are computed on a stratified 20% validation split and
    written to ``0304_square_feature.csv`` inside the
    ``data.permutation_importance`` package directory. The elapsed wall-clock
    time is printed at the end.
    """
    began = time.time()

    train_frame = read_train_data(nrows=None)
    test_frame = read_test_data()
    train_frame, test_frame = process_data(train_frame, test_frame)

    features = train_frame.drop(['ID_code', 'target'], axis=1)
    labels = train_frame['target']
    # Computed as in the original although nothing below reads it.
    test_features = test_frame.drop(['ID_code'], axis=1)

    booster = lgb.LGBMClassifier(**params, n_estimators=20000, n_jobs=10)
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        features, labels, test_size=0.2, stratify=labels)
    booster.fit(fit_X, fit_y,
                eval_set=[(fit_X, fit_y), (holdout_X, holdout_y)],
                verbose=1000,
                early_stopping_rounds=200)

    permuter = PermutationImportance(booster, random_state=1).fit(holdout_X, holdout_y)
    importance_table = eli5.explain_weights_df(
        permuter,
        feature_names=features.columns.tolist(),
        top=len(features.columns))

    target_csv = os.path.join(data.permutation_importance.__path__[0],
                              '0304_square_feature.csv')
    importance_table.to_csv(target_csv)

    print(time.time() - began)
def permutation_importance(model, x_all, targets_all, config):
    """Write one permutation-importance CSV per regression scoring metric.

    Raises ``AttributeError`` when ``config.algorithm`` is not a key of
    ``transformed_modelmaps``. Models that expose ``predict_proba`` are
    treated as classifiers and produce no output.
    """
    _logger.info("Computing permutation importance!!")
    if config.algorithm not in transformed_modelmaps.keys():
        raise AttributeError("Only the following can be used for permutation "
                             "importance {}".format(
                                 list(transformed_modelmaps.keys())))

    y = targets_all.observations

    # Guard clause: nothing is computed for classifiers.
    if hasattr(model, 'predict_proba'):
        return

    regression_scores = ('explained_variance', 'r2',
                         'neg_mean_absolute_error', 'neg_mean_squared_error')
    for score in regression_scores:
        importance = PermutationImportance(model, scoring=score, cv='prefit',
                                           n_iter=10, refit=False)
        fitted_importance = apply_multiple_masked(importance.fit, data=(x_all, y))

        names = geoio.feature_names(config)
        table = eli5.explain_weights_df(fitted_importance,
                                        feature_names=names, top=100)

        file_name = config.name + "_permutation_importance_{}.csv".format(score)
        destination = Path(config.output_dir).joinpath(file_name).as_posix()
        table.to_csv(destination, index=False)
def permutation_importance(my_model, val_X, val_y, ret_df=False):
    """Fit eli5 permutation importance on validation data and report it.

    Returns the weights as a DataFrame when ``ret_df`` is truthy, otherwise
    whatever ``eli5.show_weights`` returns.
    """
    import eli5
    from eli5.sklearn import PermutationImportance

    fitted = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
    # Pick the reporting function first, then call it once.
    report = eli5.explain_weights_df if ret_df else eli5.show_weights
    return report(fitted, feature_names=val_X.columns.tolist())
def show_global_interpretation_eli5(X_train, y_train, features, clf, dim_model):
    """Render the five most important features as a Streamlit bar chart.

    For ``dim_model == "XGBoost"`` the classifier itself is explained; any
    other model is wrapped in a two-iteration ``PermutationImportance`` fitted
    on the training data first.
    """
    if dim_model == "XGBoost":
        explainable = clf
    else:
        explainable = PermutationImportance(clf, n_iter=2,
                                            random_state=1).fit(X_train, y_train)

    df_global_explain = eli5.explain_weights_df(
        explainable, feature_names=features.values, top=5).round(2)

    chart = alt.Chart(df_global_explain)
    chart = chart.mark_bar(color="red", opacity=0.6, size=16)
    chart = chart.encode(x="weight",
                         y=alt.Y("feature", sort="-x"),
                         tooltip=["weight"])
    chart = chart.properties(height=160)
    st.write(chart)
def calculate_pfi(rf, X, y):
    """Return the permutation feature importance of ``rf`` on ``(X, y)``.

    Note that ``rf.n_jobs`` is overwritten with ``get_ncpus()`` as a side
    effect.
    """
    rf.n_jobs = get_ncpus()
    permuter = eli5.sklearn.PermutationImportance(rf, random_state=1)
    fitted = permuter.fit(X, y)
    column_names = list(X.columns)
    return eli5.explain_weights_df(fitted, feature_names=column_names)
def get_feature_importance(model):
    """Return eli5 weights of ``model['chosen_model']`` as a list of records.

    An empty list is returned when eli5 yields ``None``.
    """
    weights = eli5.explain_weights_df(model['chosen_model'])
    return list() if weights is None else weights.to_dict(orient='records')
def test_explain_weights(boston_train):
    """All DataFrame exports of linear-regression weights match the explanation."""
    X, y, feature_names = boston_train
    regressor = LinearRegression()
    regressor.fit(X, y)
    explanation = explain_weights(regressor)

    # Generic formatter applied to an existing explanation.
    formatted = format_as_dataframe(explanation)
    check_targets_dataframe(formatted, explanation)

    # One-step convenience wrapper.
    check_targets_dataframe(explain_weights_df(regressor), explanation)

    # Dict-of-DataFrames variant exposes exactly one table, 'targets'.
    frames = explain_weights_dfs(regressor)
    assert set(frames.keys()) == {'targets'}
    check_targets_dataframe(frames['targets'], explanation)
def analyze_fi_pi(self):
    """Feature Importance - Permutation Importance.

    Imputes ``self.X``, fits a 50-iteration ``PermutationImportance`` around
    ``self.estimator`` scored with ``self.spearman_scorer``, and draws a
    boxplot of ``results_`` with one box per feature.
    """
    # Permutation importance needs imputed data.
    imputed_X = self.imputer.transform(self.X)

    # Meta-estimator that computes the importances on the training data.
    permuter = PermutationImportance(self.estimator,
                                     scoring=self.spearman_scorer,
                                     n_iter=50,
                                     random_state=RANDOM_STATE)
    permuter.fit(imputed_X, self.y)

    # The returned table is not used here; the call is kept as in the original.
    eli5.explain_weights_df(permuter, feature_names=self.features)

    # Distribution of the recorded importances, one column per feature.
    distribution = pd.DataFrame(data=permuter.results_, columns=self.features)
    axes = sns.boxplot(data=distribution)
    axes.set(title='Permutation Importance Distributions (training data)',
             ylabel='Importance')
def global_feautures_b(self, data):
    """Collect the eli5 weights of every model in ``self._models`` in one table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column names are the feature names passed to eli5.

    Returns
    -------
    pandas.DataFrame
        One ``'feature'`` column (all columns of ``data`` plus ``'<BIAS>'``)
        and one weight column per model, named after the model key. Features
        eli5 does not report for a model are ``NaN`` in that model's column.
    """
    final = pd.DataFrame.from_dict(
        {'feature': list(data.columns) + ['<BIAS>']})
    for model in self._models:
        final_features = eli5.explain_weights_df(self._models[str(model)],
                                                 feature_names=list(data))
        final_features = final_features[['feature', 'weight']].set_index('feature')
        final_features = final_features.rename(columns={'weight': model})
        # Align on the feature NAME. The previous code dropped the feature
        # index and joined by row position, but eli5 does not return rows in
        # ``data.columns`` order, so weights were attached to wrong features.
        # NOTE(review): a multi-target explanation repeats each feature once
        # per target, which would duplicate rows here - confirm that the
        # models are single-target.
        final = final.join(final_features, on='feature')
    return final
def main():
    """Fit a 3-nearest-neighbour classifier and print its permutation importances.

    The first CSV column is used as the target and the remaining columns as
    features; importances are computed on the 30% test split.
    """
    frame = pd.read_csv('/users/zcb/desktop/num_test/train.csv')
    print(frame.columns)

    targets = frame.iloc[:, 0]
    features = frame.iloc[:, 1:]
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, train_size=0.7, random_state=0)

    knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
    permuter = PermutationImportance(knn, random_state=1).fit(x_test, y_test)

    print(eli5.explain_weights_df(permuter))
def analyze_fi_mdi(self):
    """Feature Importance - Mean Decrease Impurity.

    Draws a boxplot of ``feature_importances_`` taken from every member of
    ``self.estimator``, one box per feature.
    """
    # The returned table is not used here; the call is kept as in the original.
    eli5.explain_weights_df(self.estimator, feature_names=self.features)

    # One row of importances per tree in the ensemble.
    per_tree_rows = []
    for member in self.estimator:
        per_tree_rows.append(member.feature_importances_)
    per_tree = pd.DataFrame(data=per_tree_rows, columns=self.features)

    axes = sns.boxplot(data=per_tree)
    axes.set(title='Feature Importance Distributions', ylabel='Importance')
def perm_import(
        model,
        X_val,
        y_val,
        score,
        return_importances=False,
):
    """Compute eli5 permutation importances for a pickled model.

    Parameters
    ----------
    model :
        Path of the pickle file holding the fitted estimator.
    X_val, y_val :
        Validation features (must expose ``.columns``) and targets.
    score :
        Passed to ``PermutationImportance`` as ``scoring``.
    return_importances : bool
        When truthy the importance DataFrame is returned, otherwise ``None``.
    """
    # NOTE(security): pickle.load runs arbitrary code from the file; only use
    # this with model files from a trusted source.
    # The context manager closes the handle that the bare ``open`` call leaked.
    with open(model, 'rb') as model_file:
        ml_model = pickle.load(model_file)

    perm = PermutationImportance(ml_model, scoring=score,
                                 random_state=1).fit(X_val, y_val)
    feat_name = X_val.columns.tolist()
    importances = eli5.explain_weights_df(perm, feature_names=feat_name)

    if return_importances:
        return importances
    return None
def _get_permutation_importances(self, pipeline, X_train, Y_train):
    """Return permutation importances of the pipeline's last step, or ``None``.

    ``None`` is returned when permutation is not enabled in
    ``self.configs['fit']``, when ``X_train`` has more than two dimensions,
    when there are more than 50 feature columns, or when the final estimator
    has no ``score`` method.
    """
    permutation_enabled = self.configs['fit'].get('permutation')
    if not permutation_enabled or X_train.ndim > 2:
        return None

    if len(self.feature_columns) > 50:
        logger.warning('COLUMNS IS TOO LARGE, THEN NO PERMUTATION')
        return None

    final_estimator = pipeline.steps[-1][1]
    if not hasattr(final_estimator, 'score'):
        logger.warning('NO SCORE METHOD, THEN NO PERMUTATION')
        return None

    permuter = PermutationImportance(final_estimator, random_state=42)
    fitted = permuter.fit(self.toarray_like(X_train), Y_train)
    return eli5.explain_weights_df(fitted, feature_names=self.feature_columns)
def get_feature_importance(model, features):
    """Return eli5 feature weights of the chosen model as a list of records.

    Parameters
    ----------
    model : dict
        Holds the estimator under the key ``'chosen_model'``.
    features :
        Object whose ``.columns`` supply the feature names.

    Returns
    -------
    list of dicts
        The rows of the eli5 weights table in ``orient='records'`` form;
        an empty list when eli5 yields ``None``.
    """
    column_names = list(features.columns)
    weights = eli5.explain_weights_df(model['chosen_model'],
                                      feature_names=column_names)
    return list() if weights is None else weights.to_dict(orient='records')
def get_permutation_imp(m, X, y, feats, random_state=random_state, scoring='roc_auc'):
    """Fit permutation importance, plot the first 15 features and return the table.

    The boxplot columns follow the feature order of the eli5 weights table;
    the first 15 rows of that table are also displayed.
    """
    permuter = PermutationImportance(m, random_state=random_state, scoring=scoring)
    permuter.fit(X, y)

    all_feat_imp_df = eli5.explain_weights_df(permuter, feature_names=feats)

    # Recorded importances, re-ordered to match the eli5 table.
    recorded = pd.DataFrame(data=permuter.results_, columns=feats)
    recorded = recorded[list(all_feat_imp_df.feature)]

    axes = recorded.iloc[:, :15].boxplot(figsize=(9, 7))
    axes.set(title='Permutation Importance Distributions (training data)',
             ylabel='Importance')
    plt.xticks(rotation=90)
    plt.show()

    display(all_feat_imp_df[:15])
    return all_feat_imp_df
def analyze_feature_importance(pipelines: List[Pipeline], serialization_dir: Path, logger: Logger):
    """Export per-run and aggregated eli5 feature importances of several pipelines.

    For every pipeline the classifier step is located and explained with
    eli5. Runs whose classifier cannot be explained are skipped with a
    warning. If at least one run succeeded, two files are written into
    ``serialization_dir``: ``feature_importances.csv`` (raw rows with a
    ``run`` column) and ``feature_importances_aggregated.txt`` (per-feature
    ``describe`` statistics, sorted by mean weight, descending).

    Raises
    ------
    AssertionError
        If a pipeline has no classifier step or the step is not exactly a
        ``PredictOnTransformClassifierWrapper``.
    """
    feature_importances = []

    for i, p in enumerate(pipelines):
        # sklearn offers neither an ordered feature-name accessor nor a way to
        # find the classifier step, hence the two project helpers.
        feature_names = get_feature_names_from_pipeline(p)
        classifier = get_named_component_of_pipeline(p, CLASSIFIER_PIPELINE_STEP_NAME)
        assert classifier is not None, "Pipeline broken? Could not find classifier."
        assert type(classifier) is PredictOnTransformClassifierWrapper, "Unexpected pipeline step type."

        try:
            feature_importance = eli5.explain_weights_df(classifier.classifier_, feature_names=feature_names)
            if feature_importance is None:
                raise ValueError
        except Exception as e:
            # Deliberately best-effort: one unexplainable model must not abort
            # the whole analysis. The exception now has its own placeholder;
            # it used to be passed as a stray argument, which made the logging
            # module report a formatting error instead of this warning.
            logger.warning("Could not determine feature importance for %r: %r",
                           type(classifier.classifier_), e)
            continue

        feature_importance["run"] = i
        feature_importances.append(feature_importance)

    if not feature_importances:
        logger.warning("No feature importances found.")
        return

    feature_importances = pd.concat(feature_importances)

    # write raw data to file
    all_importances_file = serialization_dir / "feature_importances.csv"
    feature_importances.to_csv(all_importances_file)

    # aggregate weights per feature across runs and write them to file
    importances_aggregated = feature_importances.groupby("feature")["weight"].describe(percentiles=[])
    importances_aggregated.sort_values("mean", ascending=False, inplace=True)
    aggregated_importances_file = serialization_dir / "feature_importances_aggregated.txt"
    with aggregated_importances_file.open("w") as f:
        f.write(tabulate(importances_aggregated, headers="keys"))
def _plot_top_weights(weights_df, x, y, title, xlabel, ylabel, rotation):
    """Draw and show one seaborn bar chart of feature weights."""
    sns.set_context("talk")
    sns.barplot(x=x, y=y, data=weights_df, palette="Set3")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=rotation)
    plt.show()


def explain_classifiers(X, y, df, feature_cols, stop_words, n=20):
    """Plot the top weights of three classifiers fitted on the inspection data.

    Three charts are shown, in this order: logistic-regression coefficients
    on the categorical features, random-forest importances on the same
    features, and eli5 weights of a logistic regression on TF-IDF review text.

    Parameters
    ----------
    X :
        Iterable of candidate column names; only those also present in
        ``feature_cols`` are used.
    y :
        Target labels.
    df : pandas.DataFrame
        Source data; must contain ``'relevant_reviews'`` and the features.
    feature_cols :
        Collection of column names allowed as categorical features.
    stop_words :
        Passed to ``TfidfVectorizer``.
    n : int
        Number of top entries shown in every chart. This argument used to be
        ignored in favour of a hard-coded 20; the default keeps that output.
    """
    lr_txt = LogisticRegression(
        random_state=42,
        warm_start=True,
        C=10,
        class_weight='balanced',
        solver="newton-cg",
        penalty="l2",
    )
    rf = RandomForestClassifier(
        random_state=42,
        max_features="sqrt",
        max_depth=10,
        n_estimators=1000,
    )
    lr_cat = LogisticRegression(class_weight='balanced', C=0.06,
                                warm_start=True, random_state=42)

    X_text = df['relevant_reviews']
    vect = TfidfVectorizer(stop_words=stop_words, norm="l2", max_df=0.6,
                           max_features=1000)
    X_vect = vect.fit_transform(X_text)

    X_feat = df[[col for col in X if col in feature_cols]]

    # LR cat Feature Importance
    rcParams.update({'figure.autolayout': True})
    lr_cat.fit(X_feat, y)
    weights = sorted(zip(lr_cat.coef_[0], X_feat.columns), reverse=True)
    weights_df = pd.DataFrame(weights[:n], columns=['weight', 'feature'])
    _plot_top_weights(
        weights_df, x="weight", y='feature',
        title="Yelp Resteraunt Features Associated With Resteraunt Inspection Failures \n (Positive weights imply increased risk of failing.)",
        xlabel="Logistic Regression Weights",
        ylabel="Feature",
        rotation=90)

    # RF Feature Importance
    rcParams.update({'figure.autolayout': True})
    rf.fit(X_feat, y)
    weights = sorted(zip(rf.feature_importances_, X_feat.columns), reverse=True)
    weights_df = pd.DataFrame(weights[:n], columns=['weight', 'feature'])
    _plot_top_weights(
        weights_df, x="weight", y='feature',
        title="Yelp Resteraunt Features Associated With Resteraunt Inspection Failures \n (Larger weights imply increased importance)",
        xlabel="Random Forest Weights",
        ylabel="Feature",
        rotation=90)

    # LR text Feature Importance
    rcParams.update({'figure.autolayout': False})
    lr_txt.fit(X_vect, y)
    # NOTE(review): target_names receives the full label vector here; confirm
    # that this is what eli5 is meant to get.
    weights_df = eli5.explain_weights_df(lr_txt, vec=vect, top=n, target_names=y)
    _plot_top_weights(
        weights_df, x="feature", y='weight',
        title="Yelp Resteraunt Features Associated With Resteraunt Inspection Failures \n (Positive weights imply increased risk of failing.)",
        xlabel='Word',
        ylabel="Logistic Regression Weights",
        rotation=45)
# Targets come from the first column and are shifted down by one; feature
# and target names are taken from the same frame.
y_train = df.iloc[:, 0].values - 1
f_names = df.columns[1:].values
t_names = df.iloc[:, 0].unique()

# Per-class sample counts (based on the target column)
print('\nTraining dataset shape: ', X_train.shape, ' Number of features: ', X_train.shape[1])
num_categories = np.unique(y_train).size
sum_y = np.asarray(np.unique(y_train.astype(int), return_counts=True))
df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
print('\n', df_sum_y)

# Initialise the classifier and fit it on the training set
clf = RandomForestClassifier(verbose=1, n_jobs=-1, random_state=args.randomseed, n_estimators=100).fit(X_train, y_train)
print('\nClassifier parameters:\n')
print(clf.get_params())

# Export the feature importance scores
df_import = eli5.explain_weights_df(clf, target_names=t_names, feature_names=f_names)
df_import.to_csv('f_weight_output.csv', index=None)
print(
    "\nThe importance features have been saved to 'f_weight_output.csv'.")

end_time = time.time()  # program end time
print('\n[Finished in: {0:.6f} mins = {1:.6f} seconds]\n'.format(
    ((end_time - start_time) / 60), (end_time - start_time)))
def _print_frequent_contexts(feature, domain, observed, expected, context_column, threshold):
    """Print the contexts of *feature* in *domain* whose share exceeds *threshold*."""
    shares = context_column.value_counts(normalize=True)
    # value_counts sorts descending, so the first entry is the most common
    # context. ``iloc`` makes that positional read explicit, and the emptiness
    # check avoids an IndexError when no context was found.
    if shares.empty or not shares.iloc[0] > threshold:
        return
    print(feature.upper())
    print(
        domain,
        "- Observed: {:.3f}, Expected: {:.3f}, Difference: {:.3f}"
        .format(observed, expected, observed / expected))
    print(shares[shares > threshold])
    print('\n')


def examine_top_weights(df, clf, vec, CONTEXT_THRESH, top_n_domains, n_features):
    """Print contexts of top-weighted terms that are over-represented in a domain.

    For each of the ``n_features`` terms with the largest absolute weight,
    the share of each top domain among articles containing the term is
    compared with the domain's overall share. Where the observed share is
    more than twice the expected one, the forward and backward context
    windows exceeding ``CONTEXT_THRESH`` are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``'domain'`` and ``'preprocessed_text'``.
    clf, vec :
        Classifier and vectorizer handed to ``eli5.explain_weights_df``.
    CONTEXT_THRESH : float
        Minimum share a context must exceed to be printed.
    top_n_domains : int
        Number of most frequent domains considered.
    n_features : int
        Number of top terms examined; capped at the number of rows eli5
        returns so that a larger value no longer raises ``IndexError``.
    """
    # Get top feature weights
    weights = eli5.explain_weights_df(clf, vec=vec)
    # Rank by magnitude so strong negative predictors count as much as
    # strong positive ones
    weights['weight'] = np.absolute(weights['weight'])
    weights.sort_values('weight', inplace=True, ascending=False)

    # Share of all articles contributed by each of the top n domains
    base_props = df['domain'].value_counts(normalize=True)[0:top_n_domains]
    # Does not depend on the feature, so compute the mask once
    in_top_domains = df['domain'].isin(base_props.keys())

    for i in range(min(n_features, len(weights))):
        feature = weights.iloc[i]['feature']

        # Share of each top domain among the articles containing the term
        has_feature = df['preprocessed_text'].str.contains(
            " {} ".format(feature), regex=False)
        props = df[has_feature & in_top_domains]['domain'].value_counts(
            normalize=True)

        for key in props.keys():
            # Only report domains where the term shows up more than expected
            if props[key] > 2 * base_props[key]:
                context = find_common_context_windows(feature, key, df,
                                                      'preprocessed_text', 1)
                _print_frequent_contexts(feature, key, props[key],
                                         base_props[key], context[0],
                                         CONTEXT_THRESH)
                _print_frequent_contexts(feature, key, props[key],
                                         base_props[key], context[2],
                                         CONTEXT_THRESH)
                print('\n')
plt.clim(0.003, 0.010)
plt.colorbar()
# The call parentheses were missing, so the figure was never shown.
plt.show()

# permutation feature weights
import eli5
from eli5 import format_as_image
from eli5.sklearn import PermutationImportance
from sklearn.neural_network import MLPClassifier

# Small MLP fitted on the extracted features; it is the model being explained.
NNMLP_clf = MLPClassifier(random_state=48, max_iter=50)
NNMLP_clf.fit(new_last_conv1, y_test1[:])

# Importance over all samples
perm_all = PermutationImportance(NNMLP_clf).fit(new_last_conv1, y_test1)
print('CNN results')
exp = eli5.explain_weights_df(perm_all, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

# Importance over the correctly classified samples only. The targets are
# indexed with the plain index list, exactly like the misclassified case
# below; the extra pair of brackets used before made it a nested list.
perm_corr = PermutationImportance(NNMLP_clf).fit(new_last_conv1[correct_cnn[:]], y_test1[correct_cnn[:]])
print('CNN Correct results')
exp_corr = eli5.explain_weights_df(perm_corr, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

# Importance over the misclassified samples only
perm_mis = PermutationImportance(NNMLP_clf).fit(new_last_conv1[misclass_cnn[:]], y_test1[misclass_cnn[:]])
print('CNN Misclass results')
exp_mis = eli5.explain_weights_df(perm_mis, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

from sklearn.preprocessing import normalize
n0= normalize(final_last_conv1[correct_cnn[:]])
n1= normalize(final_last_conv1[misclass_cnn[:]])
n2= normalize(X_test1[:,:,0,0])
# 7.11.3 Conclude: Get feature weights """ # If you are using jupyter notebook, use: eli5.show_weights( perm, feature_names = colnames # X_test.columns.tolist() ) """ fw = eli5.explain_weights_df( perm, feature_names = colnames # X_test.columns.tolist() ) # 7.11.4 Print importance fw ##################### EE. Randomized Search ################# # Tune parameters using randomized search # 8. Hyperparameters to tune and their ranges parameters = {'xg__learning_rate': uniform(0, 1), 'xg__n_estimators': range(50,300), 'xg__max_depth': range(3,10), 'pca__n_components' : range(20,30)}
def permutation_importance(model, val_X, val_y, path):
    """Write the eli5 permutation importances of ``model`` to the CSV ``path``.

    The importances are computed on ``(val_X, val_y)``; a short confirmation
    message is printed afterwards.
    """
    permuter = PermutationImportance(model, random_state=1)
    fitted = permuter.fit(val_X, val_y)
    column_names = val_X.columns.tolist()
    eli5.explain_weights_df(fitted, feature_names=column_names).to_csv(path)
    print('generate ' + path)
# Row positions where the gradient-boosting prediction differs from / equals
# the true label, as plain Python lists.
misclass_gbc = np.where(y_pred_gbc!=y_test)
misclass_gbc = misclass_gbc[0].tolist()
print(misclass_gbc)
correct_gbc = np.where(y_pred_gbc==y_test)
correct_gbc = correct_gbc[0].tolist()
print(correct_gbc)

import eli5
from eli5.sklearn import PermutationImportance
from IPython.display import display

# Permutation importance over all test samples
perm_gbc = PermutationImportance(gbc_clf).fit(X_test, y_test)
print('GBC Results')
exp_gbc = eli5.explain_weights_df(perm_gbc, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

# ... over the correctly classified samples only (perm_gbc is rebound)
perm_gbc = PermutationImportance(gbc_clf).fit(X_test[correct_gbc[:]], y_test[correct_gbc[:]])
print('GBC Correct Results')
exp_gbc_corr = eli5.explain_weights_df(perm_gbc, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

# ... over the misclassified samples only
perm_gbc = PermutationImportance(gbc_clf).fit(X_test[misclass_gbc[:]], y_test[misclass_gbc[:]])
print('GBC Misclass Results')
exp_gbc_mis = eli5.explain_weights_df(perm_gbc, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

# Same split into misclassified / correct positions for the AdaBoost predictions
misclass_ada = np.where(y_pred_ada!=y_test)
misclass_ada = misclass_ada[0].tolist()
print(misclass_ada)
correct_ada = np.where(y_pred_ada==y_test)
correct_ada = correct_ada[0].tolist()
# importance in decreasing order imp_ord = np.argsort(perm.feature_importances_) plt.figure(figsize=(12, 20)) yaxis = np.arange(len(perm.feature_importances_)) * 1.2 plt.barh(y=yaxis, width=perm.feature_importances_[imp_ord]) plt.yticks(yaxis, feature_names[imp_ord]) plt.ylabel('Feature') plt.xlabel('Importance') plt.show() # ## Select the top 100 important terms to run the model # In[15]: WeightDF = eli5.explain_weights_df(perm, feature_names=feature_names) WeightDF.head(100) # In[16]: imp_100 = WeightDF["feature"][0:100].tolist() impX_100_DF = tfidfDF[imp_100].copy() impX_100_DF_Testing = tfidfTestingDF[imp_100].copy() impX_100_DF.head() # #### 1.2 MLP-NN Model with the top 100 most important variables(Terms) # # # In[17]:
def make_eli5_interpretation(training_set, target, model, features, X, ml_name):
    """Show permutation importances of ``model`` in eli5 and sklearn form.

    Renders, in this order, on the Streamlit page: a heading, an info button,
    an Altair bar chart of the ten top eli5 weights, a second heading, a
    second info button, and a matplotlib bar chart of sklearn's
    ``permutation_importance`` means for all features.

    Parameters
    ----------
    training_set, target :
        Data both importance computations are fitted on.
    model :
        Fitted estimator to explain.
    features :
        Feature names passed to eli5.
    X :
        Object whose ``.columns`` label the sklearn importances.
    ml_name : str
        Model name used in the chart title and as the x-axis label.
    """
    # Permutation importances by eli5 (a single shuffle per feature)
    perm = PermutationImportance(model, n_iter=1, random_state=0).fit(training_set, target)
    df_explain = explain_weights_df(perm, feature_names=features, top=10).round(3)
    bar = (alt.Chart(df_explain, title=f'ELI5 Weights Explained from {ml_name}').mark_bar(
        color="red", opacity=0.6, size=14).encode(x="weight", y=alt.Y("feature", sort="-x"),
                                                   tooltip=["weight" ]).properties(height=300, width=675))
    st.markdown("#### ELI5 Weights Explained")
    # The explanation text is rendered only when the button reports a click.
    info_global = st.button("How it is calculated")
    if info_global:
        st.info(""" Each feature importance is obtained from permutation importances. Also, all the features are randomly shuffled and it shows how much impact the model perfomance used decreases. The eli5 plot is only displaying the top 10 features. For more information, check out this free course at kaggle: [Link](https://www.kaggle.com/dansbecker/permutation-importance) To check out the eli5 documentation, click the link: [ELI5 Documentation]( https://eli5.readthedocs.io/en/latest/overview.html ) """)
    st.write(bar)
    st.markdown("#### Permutation Importances")
    info_local = st.button("Information")
    if info_local:
        st.info(""" The sklearn plot is displaying all the features in the dataset. It shows which are the least important to the most important features. For more information, check out this free course at kaggle: [Link](https://www.kaggle.com/dansbecker/permutation-importance) To check out the sklearn documentation, click the link: [Sklearn Documentation]( https://scikit-learn.org/stable/modules/permutation_importance.html ) """)
    # Permutation importances by sklearn
    imp = permutation_importance(model, training_set, target, random_state=0)
    data = {
        'importances_mean': imp['importances_mean'],
        'importances_std': imp['importances_std']
    }
    # One row per column of X, highest mean importance first.
    imp = pd.DataFrame(data, index=X.columns)
    imp.sort_values('importances_mean', ascending=False, inplace=True)
    fig, ax = plt.subplots(figsize=(12, 16))
    imp.importances_mean.plot(kind='barh', ax=ax)
    plt.title('Sklearn Permutation Importances', fontsize=14, fontweight='bold')
    plt.xlabel(ml_name, fontsize=12)
    plt.tight_layout()
    st.write(fig)
'Dataset': ['Raw data', 'Dataset 1', 'Dataset 2'],
    'R^2 score': [scoreR, score1, score2],
    'Best params': [None, None, None]
}
pd.DataFrame(lrDict)

# In[ ]:

lr = LinearRegression(n_jobs=njobs, normalize=True)
lr.fit(X_trnR, Y_trnR)

# Extracting weights of features (not normalized)

# In[ ]:

weights = eli5.explain_weights_df(
    lr)  # weights of LinearRegression model for RawData
# Drop the first character of each generated feature name and read the rest
# as the integer column position.
# NOTE(review): skipping the first row assumes it is the bias term; confirm
# that the bias always comes first in the returned table.
rank = [int(i[1:]) for i in weights['feature'].values[1:]]
labels = ['BIAS'] + [X_trnR.columns[i] for i in rank]
weights['feature'] = labels
weights

# #### KNeighbors <a name='knn'></a>
# The KNeighbors regressor has more parameters than linear regression, so
# tuning the hyperparameters with GridSearchCV seems to be a good idea.

# In[ ]:

tuned_parameters = {
    'n_neighbors': list(range(1, 21)),
    'weights': ['uniform', 'distance']
}
plt.ylabel('features', fontsize = 20) # More balanced results - its use is not extremely spread. ### Permutation feature importance - (or Mean Decrease Accuracy) # Import special library, designed for interpretation tasks import eli5 from eli5.sklearn import PermutationImportance #Fit and see permutation importance on our training data perm_train = PermutationImportance(classifier) perm_train.fit(X_train, y_train) eli5.explain_weights_df(perm_train, feature_names=features) #Fit and see permutation importance on our test data perm_test = PermutationImportance(classifier) perm_test.fit(X_test, y_test) eli5.explain_weights_df(perm_test, feature_names=features) # For this method, it is not clear on what set it should be applied. # In both cases, we can observe a table where features are ranked according to their importance. # The output takes the form of a weight (along with a standard deviation measure) # Results vary according to the method used. Take into account limitations / biases of each method # Combinining them allows to get a more objective view of true feature importance, which is a great explanation factor. # Cross compare with deductions made during the data visualisation phase, where each feature's impact on churn was more or less assessed.
print(mapk(actual, pred)) # ----------------------------------------------------------------------------- # MODEL INTERPRETABILITY import seaborn as sns import matplotlib.pyplot as plt import lime import shap from eli5.sklearn import PermutationImportance from eli5 import explain_weights_df, explain_prediction_df from lime.lime_tabular import LimeTabularExplainer # eli5 feat_imp_df = explain_weights_df(model, feature_names=all_cols) feat_imp_df.head(10) X_train = train.values exp_pred_df = explain_prediction_df(estimator=model, doc=X_train[0], feature_names=all_cols) # lime explainer = LimeTabularExplainer(X_train, mode='regression', feature_names=all_cols, categorical_features=cat_cols, random_state=1981, discretize_continuous=True) exp = explainer.explain_instance(X_valid[10], model.predict, num_features=20)
def Run_RF(allLabels, allFeatures, valLabels, valFeatures, outputPrefix,
           repetitions, group, header, outputDir, version, data_id, gini, perm):
    """Train a random forest repeatedly and write predictions and importances.

    For each of ``repetitions`` runs a 100-tree ``RandomForestClassifier`` is
    fitted on the training data and evaluated on the validation data.
    Depending on the switches, per-repetition importance files are written.
    After all runs, the mean OOB score and mean accuracy are printed and two
    averaged files are written.

    Parameters
    ----------
    allLabels, allFeatures :
        Training labels and feature rows. Labels are converted to strings
        before fitting.
    valLabels, valFeatures :
        Validation labels and feature rows.
    outputPrefix, group, version :
        Components of the output file names.
    repetitions : int
        Number of forests trained.
    header :
        Feature names, one per feature column.
    outputDir : str
        Directory prefix of every output file.
    data_id :
        Forwarded unchanged to ``calculate_gini_impurity``.
    gini : str
        ``"gini"`` enables the per-repetition ``FI_gini_rep*`` file.
    perm : str
        ``"perm"`` enables the per-repetition eli5 ``FI_perm_rep*`` file.

    Output files (all inside ``outputDir``)
    ---------------------------------------
    ``Predictions_*``: validation label and the second ``predict_proba``
    column averaged over repetitions. ``FI_default_*``: the forest's
    ``feature_importances_`` averaged over repetitions.
    """
    defaultFI = [0] * len(valFeatures[0])
    oob = []
    acc = []
    o = [0] * len(valLabels)

    # Labels as a string Series
    allLabels_pridf = DataFrame(allLabels)
    allLabels_pridf.columns = ["label"]
    allLabels_df = allLabels_pridf['label'].apply(str)
    valLabels_pridf = DataFrame(valLabels)
    valLabels_pridf.columns = ["label"]
    valLabels_df = valLabels_pridf['label'].apply(str)

    # Features as float frames with named columns
    allFeatures_df = DataFrame(allFeatures, dtype=float)
    valFeatures_df = DataFrame(valFeatures, dtype=float)
    allFeatures_df.columns = header
    valFeatures_df.columns = header
    valFeatures_arr = np.array(valFeatures_df, dtype="float32")

    for rep in range(0, repetitions):
        print("@@@@ Repetition " + str(rep) + " is started " +
              time.asctime(time.localtime(time.time())) + "@@@@ ")

        print("@@@@ Start fitting model. " +
              time.asctime(time.localtime(time.time())) + " @@@@ ")
        rfc = RandomForestClassifier(n_estimators=100, oob_score=True)
        rfc.fit(allFeatures_df, allLabels_df)
        print("@@@@ Fitting model is done! " +
              time.asctime(time.localtime(time.time())) + " @@@@")
        print(rfc.oob_score_)
        oob.append(rfc.oob_score_)
        predictions = rfc.predict(valFeatures)

        # NOTE(review): the forest is trained on string labels, so the
        # predictions are strings as well; this comparison can only succeed
        # if valLabels holds strings too - confirm against the caller.
        correct = 0
        for i, predicted in enumerate(predictions):
            if predicted == valLabels[i]:
                correct += 1

        M = rfc.predict_proba(valFeatures)
        acc.append(correct / float(len(predictions)))

        # Accumulate the probability in the second predict_proba column
        for k, entry in enumerate(M):
            o[k] += entry[1]

        ## default ##
        defaultFI = [
            x + y for x, y in zip(defaultFI, rfc.feature_importances_)
        ]

        # perm
        if perm == "perm":
            print("@@@@ perm FI is started. " +
                  time.asctime(time.localtime(time.time())) + " @@@@")
            ## eli5 ##
            eli5permFIOutput = outputDir + "/FI_perm_rep" + str(
                rep) + "_" + outputPrefix + "_" + str(
                    group) + "." + version + ".txt"
            permFI = PermutationImportance(rfc, random_state=1).fit(
                valFeatures_df, valLabels_df)
            eli5_permFI_df = eli5.explain_weights_df(
                permFI, feature_names=valFeatures_df.columns.tolist())
            eli5_permFI_df.to_csv(eli5permFIOutput, sep='\t', index=False)
            print("@@@@ perm FI is done! " +
                  time.asctime(time.localtime(time.time())) + " @@@@")

        # gini
        if gini == "gini":
            print("@@@@ gini FI is started. " +
                  time.asctime(time.localtime(time.time())) + " @@@@")
            valginifiOutput = outputDir + "/FI_gini_rep" + str(
                rep) + "_" + outputPrefix + "_" + str(
                    group) + "." + version + ".txt"

            ## positions of positive and negative validation samples ##
            pos_row = [
                i for i in range(len(valLabels_df)) if valLabels_df[i] == "1"
            ]
            neg_row = [
                i for i in range(len(valLabels_df)) if valLabels_df[i] == "0"
            ]

            ensemble_importances = dict(zip(header, [0] * len(header)))
            all_tree_importance = []
            pool = multiprocessing.Pool(processes=1)
            for tree_id in range(len(rfc.estimators_)):
                all_tree_importance.append(
                    pool.apply_async(calculate_gini_impurity,
                                     args=(
                                         rfc,
                                         tree_id,
                                         pos_row,
                                         neg_row,
                                         valLabels_df,
                                         valFeatures_arr,
                                         data_id,
                                     )))
            pool.close()
            pool.join()

            # Average the per-tree importances; each result is fetched once
            # per tree rather than once per (tree, feature) pair.
            for pending in all_tree_importance:
                tree_importance = pending.get()
                for j in header:
                    ensemble_importances[j] += tree_importance[j]
            for j in header:
                ensemble_importances[j] /= len(rfc.estimators_)

            ## gini FI ##
            # Opened only once the values exist, and closed even if a write
            # fails.
            with open(valginifiOutput, "w") as output4:
                for element, importance in ensemble_importances.items():
                    output4.write(element + "\t" + str(importance) + "\n")
            print("@@@@ gini FI is done! " +
                  time.asctime(time.localtime(time.time())) + " @@@@")

        print("@@@@ Repetition " + str(rep) + " is done! " +
              time.asctime(time.localtime(time.time())) + " @@@@")
        print("\n")

    print(numpy.mean(oob), "\t", numpy.mean(acc))

    valOutput = outputDir + "/Predictions_" + outputPrefix + "_" + str(
        group) + "." + version + ".txt"
    defaultFIOutput = outputDir + "/FI_default_" + outputPrefix + "_" + str(
        group) + "." + version + ".txt"

    # Both files are closed on every exit path.
    with open(valOutput, "w") as output1, open(defaultFIOutput, "w") as output2:
        ## validation ##
        for i, entry in enumerate(o):
            output1.write(
                str(valLabels[i]) + "\t" + str(entry / float(repetitions)) +
                "\n")
        ## defaultFI ##
        for j, element in enumerate(defaultFI):
            output2.write(header[j] + "\t" +
                          str(element / float(repetitions)) + "\n")