def selec_by_features_random_weight(self, model=None):
    """Rank features by cross-validated permutation importance and keep
    the useful ones.

    Splits the stored training data, wraps ``model`` (a plain
    LinearRegression when none is given) in eli5's PermutationImportance,
    and keeps every feature whose mean importance exceeds zero.

    Returns:
        tuple: (importance table sorted descending, selected-feature
        train matrix, selected-feature validation matrix).
    """
    train_X, val_X, train_y, val_y = train_test_split(
        self.train_X, self.train_y, random_state=1)
    # Fall back to a plain linear model when no estimator is supplied.
    estimator = model if model else LinearRegression()
    from eli5.sklearn import PermutationImportance
    perm = PermutationImportance(estimator, n_iter=5, random_state=1024, cv=5)
    perm.fit(train_X.values, train_y.values)
    importance_table = pd.DataFrame(
        {'var': train_X.columns.values,
         'feature_importances_': perm.feature_importances_,
         'feature_importances_std_': perm.feature_importances_std_},
        columns=['var', 'feature_importances_', 'feature_importances_std_'],
    ).sort_values('feature_importances_', ascending=False)
    # Keep only features whose permutation importance is above 0.0.
    selector = SelectFromModel(perm, threshold=0.00, prefit=True)
    return (importance_table,
            selector.transform(train_X),
            selector.transform(val_X))
def compute_imp_score(pipe, clf_name, training_features, training_classes, random_state, perm):
    """Return a per-feature importance vector for a fitted estimator.

    Anything exposing ``coef_`` (or the 'ScaleLR' pipeline's inner 'lr'
    step) contributes normalized absolute coefficients; tree-style models
    contribute their native ``feature_importances_``. When neither is
    available, or when ``perm`` is truthy, permutation importance is
    computed instead (without refitting the estimator).
    """
    clf = pipe
    if hasattr(clf, 'coef_'):
        raw = np.abs(clf.coef_.flatten())
        importances = raw / np.sum(raw)
    elif clf_name == 'ScaleLR':
        # Reach inside the scaling pipeline for the linear step's weights.
        raw = np.abs(clf.named_steps['lr'].coef_.flatten())
        importances = raw / np.sum(raw)
    else:
        importances = getattr(clf, 'feature_importances_', None)
    if importances is None or perm:
        # No usable attribute, or permutation explicitly requested.
        permuter = PermutationImportance(estimator=clf, n_iter=5,
                                         random_state=random_state,
                                         refit=False)
        permuter.fit(training_features, training_classes)
        importances = permuter.feature_importances_
    return importances
def load_and_feature_analysis():
    """Load pre-extracted audio features, evaluate a saved XGBoost genre
    classifier, and report feature importances two ways: XGBoost's own
    F-score plot and eli5 permutation importance (written to an HTML file).
    """
    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
    import xgboost
    ################### eli5
    import eli5  # pip install eli5
    from eli5.sklearn import PermutationImportance
    #featrues_filename = 'features_3_sec.csv'  # Test Accuracy: 0.90224
    #featrues_filename = 'data_adv_3_sec_no_var_hccho.csv'  # accidentally built without mfcc_var. Test Accuracy: 0.96663
    featrues_filename = 'data_adv_3_sec_hccho.csv'  # Test Accuracy : 0.95762
    data = pd.read_csv(f'{general_path}/{featrues_filename}')
    data = data.iloc[0:, 1:]  # the first column is the file name, so drop it
    print(data.shape, data.head(5))
    y = data['label']  # genre variable.
    X = data.loc[:, data.columns != 'label']  # select all columns but not the labels
    #### NORMALIZE X ####
    # Normalize so everything is on the same scale.
    cols = X.columns
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(X)  # return numpy array (9990,58)
    # new data frame with the new scaled data.
    X = pd.DataFrame(np_scaled, columns=cols)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)  # Frame in Frame out
    ########## model load ##############
    xgb_classifier = pickle.load(open("my_xgb_model.pkl", "rb"))
    preds = xgb_classifier.predict(
        X_test)  # array(['hiphop', 'jazz', 'blues', ....],dtype=object)
    print('Accuracy', ':', round(accuracy_score(y_test, preds), 5), '\n')
    # Feature F-score. Because training used a pandas DataFrame, feature names
    # are displayed; had the data been a plain numpy array, no names would show.
    xgboost.plot_importance(xgb_classifier)
    plt.show()
    ####### eli5 PermutationImportance #################
    perm = PermutationImportance(estimator=xgb_classifier, random_state=1)
    perm.fit(X_test, y_test)  # returned values are accuracy drops; shuffling repeats, hence the +/- spread
    weights = eli5.show_weights(estimator=perm,
                                feature_names=X_test.columns.tolist())
    #### weights.data is a string whose content is HTML.
    with open('Permutation_Importance.htm', 'wb') as f:
        f.write(weights.data.encode("UTF-8"))
def test_allow_nans(iris_train):
    """PermutationImportance must accept feature matrices containing NaN
    as long as the wrapped estimator (here, XGBoost) handles them."""
    xgboost = pytest.importorskip('xgboost')
    X, y, feature_names, target_names = iris_train
    X = X.copy()  # avoid mutating the shared fixture
    X[0, 0] = np.nan
    perm = PermutationImportance(xgboost.XGBClassifier(), cv=5)
    # No error should be thrown during fitting of the model
    perm.fit(X, y)
def feature_extraction_method(self, method=Names.ELI5_PERMUTATION):
    """Compute feature importances for the base run's fitted model and
    store them in ``self.feature_importances`` as a DataFrame with
    'Feature'/'Importance' columns ('Importance_Std' too for eli5).

    method: one of the Names.* extraction strategies; the legacy value
        ``True`` is coerced to Names.ELI5_PERMUTATION. The BBA and SHAP
        audits additionally initialize self.bba_plots_dict /
        self.shap_plots_dict.
    """
    print("Starting Feature Extraction...")
    start_time = time.time()
    # Legacy callers passed method=True; treat that as the default strategy.
    if method == True:
        method = Names.ELI5_PERMUTATION
    if method == Names.ELI5_PERMUTATION:
        # eli5 permutation importance of the trained model on the testing data.
        pi_object = PermutationImportance(self.base_run_instance.test_harness_model.model)
        pi_object.fit(self.base_run_instance.testing_data[self.base_run_instance.feature_cols_to_use],
                      self.base_run_instance.testing_data[self.base_run_instance.col_to_predict]
                      )
        feature_importances_df = pd.DataFrame()
        feature_importances_df["Feature"] = self.base_run_instance.feature_cols_to_use
        feature_importances_df["Importance"] = pi_object.feature_importances_
        feature_importances_df["Importance_Std"] = pi_object.feature_importances_std_
        feature_importances_df.sort_values(by='Importance', inplace=True, ascending=False)
        self.feature_importances = feature_importances_df.copy()
    elif method == Names.RFPIMP_PERMUTATION:
        # rfpimp returns a frame indexed by feature name; flatten that index
        # into a regular 'Feature' column so the output schema is uniform.
        pis = rfpimp.importances(self.base_run_instance.test_harness_model.model,
                                 self.base_run_instance.testing_data[self.base_run_instance.feature_cols_to_use],
                                 self.base_run_instance.testing_data[self.base_run_instance.col_to_predict])
        pis['Feature'] = pis.index
        pis.reset_index(inplace=True, drop=True)
        pis = pis[['Feature', 'Importance']]
        pis.sort_values(by='Importance', inplace=True, ascending=False)
        self.feature_importances = pis.copy()
    elif method == "sklearn_rf_default":
        pass  # TODO
    elif method == Names.BBA_AUDIT:
        self.bba_plots_dict = {}
        data = self.perform_bba_audit(training_data=self.base_run_instance.training_data.copy(),
                                      testing_data=self.base_run_instance.testing_data.copy(),
                                      features=self.base_run_instance.feature_cols_to_use,
                                      classifier=self.base_run_instance.test_harness_model.model,
                                      col_to_predict=self.base_run_instance.col_to_predict)
        feature_importances_df = pd.DataFrame(data, columns=["Feature", "Importance"])
        self.feature_importances = feature_importances_df.copy()
    elif method == Names.SHAP_AUDIT:
        self.shap_plots_dict = {}
        data = self.perform_shap_audit()
        feature_importances_df = pd.DataFrame(data, columns=["Feature", "Importance"])
        self.feature_importances = feature_importances_df.copy()
    print(("Feature Extraction time with method {0} was: {1:.2f} seconds".format(method, time.time() - start_time)))
def perm_importance(self):
    """
    Calculates feature importances for each treatment group, based on
    the permutation method.
    """
    importances = {}
    for group, idx in self.classes.items():
        # Without fitted R-learners, permute against the shared tau model
        # with 3-fold CV; otherwise reuse the group's pre-fitted learner.
        if self.r_learners is None:
            estimator, folds = self.model_tau, 3
        else:
            estimator, folds = self.r_learners[group], 'prefit'
        fitter = PermutationImportance(estimator, cv=folds)
        fitter.fit(self.X, self.tau[:, idx])
        importances[group] = fitter.feature_importances_
    return importances
def compute_imp_score(model, model_name, training_features, training_classes, random_state):
    """Return the normalized importance vector for the ``model_name`` step
    of a fitted pipeline.

    Uses absolute coefficients when the step is linear, its native
    ``feature_importances_`` when tree-like, and falls back to permutation
    importance (no refit) over the whole pipeline otherwise. The result is
    scaled so its entries sum to one.
    """
    step = model.named_steps[model_name]
    if hasattr(step, 'coef_'):
        scores = np.abs(step.coef_.flatten())
    else:
        scores = getattr(step, 'feature_importances_', None)
    if scores is None:
        # The step exposes neither coefficients nor importances: permute.
        permuter = PermutationImportance(estimator=model, n_iter=5,
                                         random_state=random_state,
                                         refit=False)
        permuter.fit(training_features, training_classes)
        scores = permuter.feature_importances_
    return scores / np.sum(scores)
def evaluate(model, X_tr, y_tr, X_vl, y_vl, metric=False, imp=False):
    """Fit ``model`` on the training split; optionally display train vs.
    validation MAE/R2 (``metric=True``) and eli5 permutation importances
    computed on the validation split (``imp=True``).
    """
    model.fit(X_tr, y_tr)
    pred_tr = model.predict(X_tr)
    pred_vl = model.predict(X_vl)
    if metric:
        # Side-by-side train / validation error table.
        table = pd.DataFrame(
            {
                'tr': [mean_absolute_error(y_tr, pred_tr), r2_score(y_tr, pred_tr)],
                'vl': [mean_absolute_error(y_vl, pred_vl), r2_score(y_vl, pred_vl)],
            },
            index=['mae', 'r2'])
        display(table)
    if imp:
        pimp = PermutationImportance(model, random_state=42, n_iter=20)
        pimp.fit(X_vl, y_vl)
        display(eli5.show_weights(pimp))
def analyze_fi_pi(self):
    """Feature Importance - Permutation Importance

    Imputes the training data, fits an eli5 PermutationImportance
    meta-estimator on it, box-plots the per-iteration importance
    distributions, and returns the importance table.

    Returns:
        pandas.DataFrame: eli5 ``explain_weights_df`` output (one row per
        feature with its permutation importance).
    """
    # we need to impute the data first before calculating permutation importance
    train_X_imp = self.imputer.transform(self.X)
    # set up the meta-estimator to calculate permutation importance on our
    # training data
    perm_train = PermutationImportance(self.estimator,
                                       scoring=self.spearman_scorer,
                                       n_iter=50,
                                       random_state=RANDOM_STATE)
    # fit and see the permutation importances
    perm_train.fit(train_X_imp, self.y)
    # FIX: this table was previously computed and silently discarded;
    # capture it so the importances can actually be inspected by callers.
    weights_df = eli5.explain_weights_df(perm_train, feature_names=self.features)
    # plot the distributions of the raw per-iteration importances
    perm_train_feat_imp_df = pd.DataFrame(data=perm_train.results_,
                                          columns=self.features)
    sns.boxplot(data=perm_train_feat_imp_df).set(
        title='Permutation Importance Distributions (training data)',
        ylabel='Importance')
    return weights_df
def permutationImports(model, X_val, y_val):
    '''
    Get and display permutation importances
    '''
    # Importance is measured as the drop in validation accuracy.
    permuter = PermutationImportance(model, scoring='accuracy', random_state=42)
    permuter.fit(X_val, y_val)
    print('Permutation Importances\n')
    score_series = pd.Series(permuter.feature_importances_, X_val.columns)
    display(score_series.sort_values(ascending=False))
    print('\n')
    # Horizontal bar chart sized to the number of features.
    plt.figure(figsize=(10, len(X_val.columns) / 2))
    score_series.sort_values().plot.barh()
    plt.show()
def permuter(model, X, y, **kwargs):
    """
    Uses eli5 package to plot permutation importance

    Scoring parameter keyword argument takes string arguments available here:
    https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

    If no arguments are passed, defaults are:
    scoring = "accuracy", cv = "prefit", n_iter = 5
    """
    # dict.get collapses each former if/else default chain into one lookup.
    scoring = kwargs.get('scoring', 'accuracy')
    cv = kwargs.get('cv', 'prefit')
    n_iter = kwargs.get('n_iter', 5)
    perm = PermutationImportance(model, scoring=scoring, cv=cv,
                                 n_iter=n_iter, random_state=42)
    # fit
    perm.fit(X, y)
    # show weights based on feature names
    feature_names = X.columns.tolist()
    display(show_weights(perm, top=None, feature_names=feature_names))
def get_permutation_imp(m, X, y, feats, random_state=random_state, scoring='roc_auc'):
    """Fit eli5 permutation importance for estimator ``m``, box-plot the
    per-iteration importance distributions of the top 15 features, display
    the top-15 table, and return the full importance DataFrame."""
    permuter = PermutationImportance(m, random_state=random_state, scoring=scoring)
    permuter.fit(X, y)
    all_feat_imp_df = eli5.explain_weights_df(permuter, feature_names=feats)
    # Raw per-iteration results, reordered to match the ranked feature list.
    raw_results = pd.DataFrame(data=permuter.results_, columns=feats)
    raw_results = raw_results[list(all_feat_imp_df.feature)]
    ax = raw_results.iloc[:, :15].boxplot(figsize=(9, 7))
    ax.set(title='Permutation Importance Distributions (training data)',
           ylabel='Importance')
    plt.xticks(rotation=90)
    plt.show()
    display(all_feat_imp_df[:15])
    return all_feat_imp_df
plt.yticks(fontsize = 12)
plt.ylabel('features', fontsize = 20)
# More balanced results - its use is not extremely spread.

### Permutation feature importance - (or Mean Decrease Accuracy)
# Import special library, designed for interpretation tasks
import eli5
from eli5.sklearn import PermutationImportance

# Fit and see permutation importance on our training data
# NOTE(review): explain_weights_df returns a DataFrame that is not captured;
# presumably this runs in a notebook where the last expression is displayed.
perm_train = PermutationImportance(classifier)
perm_train.fit(X_train, y_train)
eli5.explain_weights_df(perm_train, feature_names=features)

# Fit and see permutation importance on our test data
perm_test = PermutationImportance(classifier)
perm_test.fit(X_test, y_test)
eli5.explain_weights_df(perm_test, feature_names=features)

# For this method, it is not clear on what set it should be applied.
# In both cases, we can observe a table where features are ranked according to their importance.
# The output takes the form of a weight (along with a standard deviation measure)

# Results vary according to the method used. Take into account limitations / biases of each method.
# Combining them allows us to get a more objective view of true feature importance, which is a great explanation factor.
# Training-set diagnostics for the fitted LightGBM model.
print(classification_report(y_train_svm, easy_lgbm.predict(X_train_svm)))
print(confusion_matrix(y_train_svm, easy_lgbm.predict(X_train_svm)))
print('Recall Score = ', recall_score(y_train_svm, easy_lgbm.predict(X_train_svm)))
print('Precision Score = ', precision_score(y_train_svm, easy_lgbm.predict(X_train_svm)))
print(f1_score(y_train_svm, easy_lgbm.predict(X_train_svm)))
# Held-out F1 for comparison with the training score above.
print(f1_score(y_test_svm, easy_lgbm.predict(X_test_svm)))

# Permutation importance on the test split, scored by F1.
eli5_permutation = PermutationImportance(estimator=easy_lgbm, scoring='f1',
                                         random_state=42, n_iter=5)
eli5_permutation.fit(X_test_svm, y_test_svm)
# NOTE(review): the reshape below is not assigned, so this line has no effect.
eli5_permutation.feature_importances_.T.reshape(-1, 1)
# Pair each feature name with its importance in a two-column frame.
feature_importance_with_eli5 = pd.DataFrame(np.hstack(
    (np.array([X.columns[0:]]).T,
     eli5_permutation.feature_importances_.T.reshape(-1, 1))),
    columns=['feature', 'importance'])
# hstack produced an object/string array; coerce importances back to numbers.
feature_importance_with_eli5['importance'] = pd.to_numeric(
    feature_importance_with_eli5['importance'])
# NOTE(review): sort_values is not in-place and its result is discarded here.
feature_importance_with_eli5.sort_values(by='importance', ascending=False)

fig = plt.figure(figsize=(15, 8))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# Call continues past this chunk boundary.
sns.barplot(x='importance', y='feature',
# Continuation of a plotting call started above this chunk.
          'test':'deepskyblue'})
plt.legend(loc=9)
plt.title('Distributions of Feature Contributions');

!pip install eli5
from eli5.sklearn import PermutationImportance
import eli5

# let's check the importance of each attribute on the test split
perm = PermutationImportance(model, random_state = 0).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

# Repeated (100-iteration) permutation importance on the training split.
perm_train = PermutationImportance(model, scoring='accuracy', n_iter=100, random_state=1)
# fit and see the permutation importances
perm_train.fit(X_train, y_train)
eli5.explain_weights_df(perm_train, feature_names=X_train.columns.tolist()).head()

# figure size in inches
from matplotlib import rcParams
rcParams['figure.figsize'] = 25,5
# NOTE(review): this boxplot uses `perm.results_` (the test-split permuter),
# while the title says "training data" — confirm which permuter was intended.
perm_train_df = pd.DataFrame(data=perm.results_, columns=X.columns)
(sns.boxplot(data=perm_train_df)
 .set(title='Permutation Importance Distributions (training data)',
      ylabel='Importance'));
plt.xticks(rotation=90)
plt.show()

!pip install pdpbox
import eli5
from eli5.sklearn import PermutationImportance

# Pull the individual steps out of the fitted pipeline so permutation
# importance can run on the already-encoded/imputed matrices.
encoder = GDB_pipeline.named_steps.ordinalencoder
X_train_encoded = encoder.fit_transform(X_train_cut)
X_val_encoded = encoder.transform(X_val_cut)
imputer = GDB_pipeline.named_steps.iterativeimputer
X_train_imputed = imputer.fit_transform(X_train_encoded)
# NOTE(review): fit_transform on the validation set re-fits the imputer;
# presumably transform(X_val_encoded) was intended — confirm.
X_val_imputed = imputer.fit_transform(X_val_encoded)
model = GDB_pipeline.named_steps.gradientboostingclassifier
# model.fit(X_train_imputed,y_train)

# Accuracy-based permutation importance on the imputed validation split.
permuter = PermutationImportance(model, scoring='accuracy', n_iter=2)
permuter.fit(X_val_imputed, y_val)
feature_names = X_val_encoded.columns.tolist()
eli5.show_weights(permuter, top=None, feature_names=feature_names)

# In[78]:

from pdpbox import pdp
plt.style.use('seaborn-dark-palette')
feature = 'down'
model = GDB_pipeline.named_steps['gradientboostingclassifier']
model_features = X_train_cut.columns
# Restore the column labels the imputer dropped (it returns a bare array).
X_train_imputed = pd.DataFrame(X_train_imputed)
X_train_imputed.columns = X_train_cut.columns
# Call continues past this chunk boundary.
pdp_dist = pdp.pdp_isolate(model=model, dataset=X_train_imputed,
#Random forest classifier rf_model=RandomForestClassifier(n_estimators=10,random_state=42) #fitting the model rf_model.fit(X_train,y_train) # In[41]: #Permutation importance from eli5.sklearn import PermutationImportance perm_imp=PermutationImportance(rf_model,random_state=42) #fitting the model perm_imp.fit(X_valid,y_valid) # In[44]: #Important features eli5.show_weights(perm_imp,feature_names=X_valid.columns.tolist(),top=200) # Findings: # * The variables in green rows have positive impact on our prediction # * The variables in white rows have no impact on our prediction # * The variables in red rows have negative impact on our prediction # ### Handling of imbalanced data
# Continuation of the model constructor started above this chunk.
                      learning_rate=.005, reg_lambda=.01, verbosity=1)
print('fitting...')
model.fit(X_train, y_train, eval_set=eval_set, eval_metric='auc', verbose=True)
y_pred_proba = model.predict_proba(X_val)[:, 1]
print(f'Validation ROC AUC score: {roc_auc_score(y_val, y_pred_proba)}')

print('permuting...')
# cv='prefit': reuse the already-fitted model instead of refitting per fold.
permuter = PermutationImportance(model, cv='prefit', n_iter=5,
                                 scoring='roc_auc', random_state=42)
permuter.fit(X_val, y_val)
features_of_import = pd.Series(permuter.feature_importances_,
                               val.columns).sort_values(ascending=True)
print('importance', features_of_import)

print('plotting...')
fig1 = go.Figure()
fig1.add_trace(go.Bar(x=features_of_import, y=val.columns))
py.iplot(fig1, filename='features1')

# Keep only features whose permutation importance is strictly positive.
mask = features_of_import > 0
# NOTE(review): features_of_import was sorted, so `mask` is label-indexed;
# confirm pandas aligns it with train.columns as intended here.
trimmed_columns = train.columns[mask]
train_trimmed = train[trimmed_columns]
val_trimmed = val[trimmed_columns]
test_trimmed = test[trimmed_columns]
# SHAP force plot for a single row's prediction.
shap.initjs()
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values,
    features=row,
    link='logit'  # For classification, this returns predicted probs
)

# Permutation importance of model1 on the processed validation split.
permuter = PermutationImportance(
    model1,
    scoring='accuracy',
    n_iter=5,
    random_state=42
)
permuter.fit(X_val_processed, y_val)
# NOTE(review): bare expression — presumably for notebook display only.
permuter.feature_importances_
eli5.show_weights(
    permuter,
    top=None,
    feature_names=X_val.columns.tolist()
)

"""## Logistic Regression Model for Classification: Models 3 and 4"""

# 3rd Model
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
def RFE_perm(model, X, y, feats, cv=5, scoring='neg_mean_absolute_error', timing=False):
    """Recursive feature elimination driven by eli5 permutation importance.

    At each step the `n` most important features from the previous step are
    kept (`feats` lists the successive feature counts) and the model is
    re-scored with cross-validated permutation importance.

    NOTE(review): the first entry of `feats` must equal the number of columns
    of `X`; otherwise `sortimport`/`Xcut` are read before assignment — confirm
    callers always start feats at n_features.

    Returns: ([nfeats, scores] plus train/predict times when timing=True),
    the best score, the original column indices of the best subset, and a
    0/1 matrix recording the feature set kept at each iteration.
    """
    #def RFE_perm(model,X,y,min_features=1,step=1,cv=5,scoring='neg_mean_absolute_error',timing=False):
    from eli5.sklearn import PermutationImportance
    from types import GeneratorType
    import time
    # if pandas data then convert to numpy arrays
    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
    if isinstance(y, pd.Series):
        y = y.to_numpy()
    # if cv is a generator convert to list so it doesn't disappear after first iter
    if isinstance(cv, GeneratorType):
        cv = list(cv)
    nfeat = np.shape(X)[1]
    index = np.arange(nfeat)  # original column ids of the surviving features
    bestscore = -99  # sentinel lower than any realistic CV score
    niter = len(feats)
    # niter = int(np.floor((nfeat - min_features)/step)+1)
    scores = np.empty(niter)
    nfeats = np.empty(niter)
    traintime = np.empty(niter)
    predtime = np.empty(niter)
    featsets = np.zeros([niter, nfeat])  # row i: which columns survived iter i
    # for i, n in enumerate(range(nfeat,min_features-1,-step)):
    for i, n in enumerate(feats):
        if n == nfeat:  # first iter
            newfeat = index
            Xcut = X
        else:
            newfeat = sortimport[: n]  # take n most important features from previous iter
            Xcut = Xcut[:, newfeat]
            index = index[newfeat]  # map survivors back to original column ids
        # Get train time and prediction time
        if timing:
            start = time.time()
            model.fit(Xcut, y)
            end = time.time()
            traintime[i] = end - start
            start = time.time()
            model.predict(Xcut)
            end = time.time()
            predtime[i] = end - start
        perm = PermutationImportance(model, random_state=42, scoring=scoring, cv=cv)
        perm.fit(Xcut, y)
        featimport = perm.feature_importances_
        sortimport = np.argsort(featimport)[::-1]  # descending importance order
        score = np.mean(perm.scores_)
        print('Number of features: %i, score: %.2f %%' % (n, 100 * np.abs(score)))
        scores[i] = score
        nfeats[i] = n
        featsets[i, index] = 1
        if (score >= bestscore
            ):  #> or = because if equal with smaller nfeat, then better!
            bestscore = score
            bestfeat = index
    if timing:
        return [nfeats, scores, traintime, predtime], bestscore, bestfeat, featsets
    else:
        return [nfeats, scores], bestscore, bestfeat, featsets
# Score the fitted pipeline on (X, Y): per-sample print-out, MSE/RMSE,
# Pearson correlation, and eli5 permutation importances.
prediction = pipeline.predict(X)
y_test = numpy.asanyarray(Y)
r, p_value = scipy.stats.pearsonr(y_test, prediction)
error = 0
tot_samples = len(prediction)
# Accumulate squared error sample by sample (printing each pair).
for j in range(0, tot_samples):
    print("%s - %s" % (y_test[j], prediction[j]))
    error = error + (prediction[j] - y_test[j])**2
mse = error / len(prediction)
rmse = numpy.math.sqrt(mse)
# Permutation importance of the whole pipeline on the same data.
perm = PermutationImportance(pipeline, random_state=1)
res = perm.fit(X, y_test)
ret = eli5.format_as_dict(
    eli5.explain_weights(res, top=180, feature_names=X.columns.tolist()))
print(ret)
print("---------------")
print("MEAN SQUARED ERROR:", mse)
print("ROOT MEAN SQUARED ERROR:", rmse)
print("PEARSON'S CORRELATION COEFFICIENT:", r, "p-value", p_value)
print("---------------")
# One line per feature-importance entry.
for i in ret['feature_importances']['importances']:
    print(i)
# Continuation of a pipeline constructor started above this chunk.
                  SimpleImputer(strategy='median'))

# fit the model
Rand_pipeline.fit(X_train, y_train)
# transform the validation split with the fitted pipeline
TT_val = Rand_pipeline.transform(X_val)

# Accuracy-based permutation importance on the transformed validation data.
model_permuter = PermutationImportance(
    model_predictor,
    scoring='accuracy',
    n_iter=7,
    random_state=42
)
model_permuter.fit(TT_val, y_val);

# eli5 graph with weight and feature with my 14 selected features
eli5.show_weights(
    model_permuter,
    top=None,
    feature_names=X_val.columns.tolist()
)

"""### Model Interpretation

### Isolated Partial Dependence Plots with 1 feature
"""

plt.rcParams['figure.dpi']=70
pdf_feature = 'lead_time'
# Confusion matrix for the current classifier's test predictions.
cm = confusion_matrix(y_test, y_pred)
# NOTE(review): with sklearn's row=true / column=predicted layout, cm[0, 0]
# counts the first label — confirm the TP/FP/TN/FN naming below is correct.
print("True positives: {}\nFalse positives: {}".format(cm[0, 0], cm[0, 1]))
print("True negatives: {}\nFalse negatives: {}".format(cm[1, 1], cm[1, 0]))

# visualize confusion matrix with seaborn heatmap
cm_matrix = pd.DataFrame(data=cm,
                         columns=['Actual Positive:1', 'Actual Negative:0'],
                         index=['Predict Positive:1', 'Predict Negative:0'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

X_list = X_test.columns.tolist()

# Fit an XGBoost classifier and compute permutation importances on test data.
clf = xgb.XGBClassifier(n_estimators=150, random_state=2020)
clf.fit(X_train, y_train)
perm = PermutationImportance(clf, random_state=2010)
perm.fit(X_test, y_test)

# Store feature weights in an object
html_obj = eli5.show_weights(perm, feature_names=X_list)

# Write html object to a file (adjust file path; Windows path is used here)
with open(
        r'C:\Users\lukem\Desktop\Github AI Projects\Higgs-Boson-machine-learning-challenge\boson-importance.htm',
        'wb') as f:
    f.write(html_obj.data.encode("UTF-8"))

# Logistic-regression baseline for comparison.
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
# NOTE(review): MAE on class labels is unusual for a classifier — confirm metric.
mae = mean_absolute_error(y_test, pred)
print(f"logistic regression, mae: {mae}")
# Free the per-part raw telemetry frames now that `train` is assembled.
del train0,train1,train2,train3,train4,train5,train6,train7,train8,train9,

label = pd.read_csv("../Documents/safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv")
# Collapse duplicate label rows: one summed label per bookingID.
label = pd.DataFrame(label.groupby('bookingID')['label'].sum())
label = label.reset_index()

train = preprocessing(train)
train_agg = all_features(train)
# Engineered columns to discard before modeling.
cols = ['Bearing_zero_crossing','gyro_tot_zero_crossing','Accuracy_zero_crossing',
        'tilt_angle_zero_crossing','acc_tot_zero_crossing','second_zero_crossing',
        'second_ssc','is_negative_speed_zero_crossing','roll_cpt5','tilt_angle_cpt5',
        'second_cpt5','Accuracy_cpt5','Bearing_cpt5','Speed_cpt5','pitch_cpt5','gyro_z_cpt5','gyro_y_cpt5']
train_agg.drop(cols,axis=1,inplace=True)
train_agg = pd.merge(train_agg,label,on='bookingID')

#Calculated permutation importance using XGBoost
X = train_agg.drop(['bookingID','label'],axis=1)
# NOTE(review): `y` is first assigned further below in this chunk — confirm it
# is defined (e.g. train_agg['label']) before this fit in the full script.
perm = PermutationImportance(xgb.XGBClassifier(), cv=skf)
perm.fit(X.values,y)

#put feature importances in dataframe
importances = pd.DataFrame()
importances['features'] = X.columns
importances['value'] = perm.feature_importances_
importances = importances.sort_values(by=['value'],ascending=False)
importances = importances.reset_index()

#Building weighted ensemble model from the top-150 features
scaler = StandardScaler()
X = train_agg.drop(['bookingID','label'],axis=1)[importances[:150]['features']]
y = train_agg['label']
X_scaling = scaler.fit_transform(X)
model_xgb = xgb.XGBClassifier(n_estimators = 100)
model_lgb = lgb.LGBMClassifier()
model_lr = LogisticRegression()
model_rf = RandomForestClassifier()
# Features are columns 3..193; the target is column 0 scaled by corpus size.
X = df.iloc[:, 3:194]
Y_tmp = df.iloc[:, 0]
Y = []
total_sents = len(Y_tmp)
# Normalize each target value by the total number of sentences.
for i in range(0,total_sents):
    Y.append(Y_tmp[i]/total_sents)

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

Y = numpy.asanyarray(Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Permutation importance of the (externally defined) pipeline on the test split.
perm = PermutationImportance(pipeline, random_state=1)
res = perm.fit(X_test,y_test)
#ret = eli5.format_as_text(eli5.explain_weights(perm))
ret = eli5.format_as_dict(eli5.explain_weights(res))
#ret = eli5.show_weights(perm, feature_names = X.columns.tolist())
print(ret)
# One line per feature-importance entry.
for i in ret['feature_importances']['importances']:
    print(i)
print('------')
print(perm.feature_importances_)
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel

# ... load data

# Cross-validated permutation importance of an SVC predicting telomere length.
perm = PermutationImportance(SVC(), cv=5)
perm.fit(TL.drop('Telo.Length',axis=1).loc[training_indices],
         TL.loc[training_indices,'Telo.Length'])

# perm.feature_importances_ attribute is now available, it can be used
# for feature selection - let's e.g. select features which increase
# accuracy by at least 0.05:
sel = SelectFromModel(perm, threshold=0.05, prefit=True)
X_trans = sel.transform(X)

# It is possible to combine SelectFromModel and
# PermutationImportance directly, without fitting
# PermutationImportance first:
sel = SelectFromModel(
    PermutationImportance(SVC(), cv=5),
    threshold=0.05,
).fit(X, y)
X_trans = sel.transform(X)
model1.fit(X_train_transformed, y_train) # Get permutation importances ! pip install eli5 from eli5.sklearn import PermutationImportance import eli5 permuter = PermutationImportance( model1, scoring='r2', n_iter=2, random_state=42 ) permuter.fit(X_val_transformed, y_val) feature_names = X_val.columns.tolist() eli5.show_weights( permuter, top=None, # show permutation importances for all features feature_names=feature_names ) from sklearn.metrics import mean_squared_error, r2_score # Coefficient of determination r2 for the training set pipeline_score = permuter.score(X_train_transformed,y_train) print("Coefficient of determination r2 for the training set.: ", pipeline_score) # Coefficient of determination r2 for the validation set
# Per-target training loop: one binary model per target column (5 targets).
for i in range(5):
    # Instantiate this target's booster from its tuned hyper-parameters.
    if key == 'xgboost':
        model = XGBClassifier(**params_XGboost[str(i)])
    elif key == 'catboost':
        model = CatBoostClassifier(**paramsCatBoost[str(i)])
    y = target.iloc[:, i]
    train_X, val_X, train_y, val_y = \
        train_test_split(X_train, y, random_state=SEED, shuffle=True)
    model.fit(train_X.values, train_y.values)
    # ROC-AUC permutation importance on the held-out split.
    perm = PermutationImportance(model, cv=5, scoring='roc_auc', random_state=SEED)
    perm.fit(val_X.values, val_y.values)
    # Keep only features above this model's importance threshold.
    sel = SelectFromModel(perm, threshold=value['threshold'], prefit=True)
    X_train_transformed = sel.transform(X_train)
    X_test_transformed = sel.transform(X_test)
    # Retrain on the reduced feature set and collect CV score + prediction.
    prediction, cv_scores_mean = train_and_predict(model, X_train_transformed,
                                                   y.values, X_test_transformed, cv)
    cv_scores.append(cv_scores_mean)
    predictions.append(prediction)
print(round(np.array(cv_scores).mean(), 5))
write_to_submission_file(predictions, ID, value['filename'])