def fsel(bcl, X, d, m, forward=True, floating=False, cv=0, show=0):
    if show > 0:
        print('Feature Selection - ' + bcl[0] + ': - number of features reducing from '
              + str(X.shape[1]) + ' to ' + str(m) + ' ...')
    if bcl[0] == 'Fisher':
        sel = sfsfisher(X, d, m)
    else:
        estimator = defineModel(bcl)
        sfs = SFS(estimator,
                  k_features=m,
                  forward=forward,      # use the arguments rather than hard-coded values
                  floating=floating,
                  verbose=show,
                  scoring='accuracy',
                  cv=cv)
        sfs = sfs.fit(X, d)
        sel = list(sfs.k_feature_idx_)
        if show > 0:
            print(' ')
        if show:
            plot_sfs(sfs.get_metric_dict(), kind='std_err')
            plt.title('Sequential Forward Selection')
            plt.grid()
            plt.show()
    return sel
def score_of_SFS(self, model=None):
    # Forward selection: start from an empty feature set and add the best
    # feature to the set one at a time.
    # Display: how the score changes as the number of features grows.
    # Up to K best features can be selected.
    selector = SFS(model,
                   k_features=self.K,
                   forward=True,
                   floating=False,
                   # scoring='neg_mean_squared_error',
                   scoring=self.score,
                   cv=0)
    selector.fit(self.train_X, self.train_y)

    features_idx = []
    for k, v in selector.get_metric_dict().items():
        for f in v['feature_idx']:
            if f not in features_idx:
                # collect features in the order they were selected (most important first)
                features_idx.append(f)

    sort_num = []
    for f in self.continuous_feature_names:
        i = features_idx.index(f) + 1
        sort_num.append(i)

    sc = [1 / x for x in sort_num]
    sum_sc = sum(sc)
    featureScore = [round(s / sum_sc, 4) for s in sc]
    model_name = str(model).split('(')[0]
    print(model_name + ' by SFS is finished')
    return featureScore
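# A small standalone sketch (not part of the original class) of the reciprocal-rank
# scoring that score_of_SFS applies above; the feature names and selection ranks
# below are made up purely for illustration.
ranks = {'A': 1, 'B': 2, 'C': 3}                          # selected in the order A, B, C

raw = {name: 1 / r for name, r in ranks.items()}          # A: 1.0, B: 0.5, C: 0.333...
total = sum(raw.values())                                 # 1.833...
scores = {name: round(v / total, 4) for name, v in raw.items()}

print(scores)  # {'A': 0.5455, 'B': 0.2727, 'C': 0.1818}; the scores sum to 1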
def FeatureSelection(pipeline_name, data_dev_mode, tag, train_filepath, test_filepath):
    logger.info('FEATURE SELECTION...')

    if bool(config.params.clean_experiment_directory_before_training) and os.path.isdir(config.params.experiment_dir):
        logger.info('Cleaning experiment directory...')
        shutil.rmtree(config.params.experiment_dir)

    data = _read_data(data_dev_mode, train_filepath, test_filepath)

    train_set = data['train']
    y = train_set[config.TARGET_COL].values.reshape(-1, )
    train_set = train_set.drop(columns=config.TARGET_COL)

    pipeline = PIPELINES[pipeline_name](so_config=config.SOLUTION_CONFIG, suffix=tag)

    sfs = SequentialFeatureSelector(estimator=pipeline,
                                    k_features=(10, len(train_set.columns)),
                                    forward=False,
                                    verbose=2,
                                    cv=5,
                                    scoring='roc_auc')
    sfs.fit(train_set.to_numpy(), y)

    fig = plot_sequential_feature_selection(sfs.get_metric_dict())
    plt.ylim([0.6, 1])
    plt.title('Sequential Feature Selection')
    plt.grid()
    plt.show()
class DFSequentialFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.selector = SequentialFeatureSelector(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def fit(self, X, y):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.selector.fit(X[self.transform_cols], y)

        self.stat_df = pd.DataFrame.from_dict(self.selector.get_metric_dict()).T
        self.stat_df.at[self.stat_df['avg_score'].astype(float).idxmax(), 'support'] = True
        self.stat_df['support'].fillna(False, inplace=True)

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                f"Call 'fit' with appropriate arguments before using this estimator."
            )

        features = list(self.stat_df[self.stat_df['support']]['feature_names'].values[0])
        new_X = X[features].copy()

        return new_X

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)
def get_best_logisitc(y):
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from sklearn.model_selection import StratifiedKFold, cross_val_score
    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    my_data = pd.read_csv('data/my_data_test.csv', encoding='utf-8')
    y = my_data.target
    my_data = my_data.drop('target', axis=1)

    # To have better CV
    skf = StratifiedKFold(n_splits=5, shuffle=False)

    C_params = [0.01, 1, 10, 50, 70, 100]
    solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag']
    my_result_list = []
    for C_param in C_params:
        for solver in solvers:
            print("Looking for C : %s and solver : %s" % (C_param, solver))
            model = LogisticRegression(class_weight='balanced',
                                       random_state=17,
                                       solver=solver,
                                       C=C_param)
            sfs = SFS(model,
                      k_features=len(my_data.columns),
                      forward=True,
                      floating=False,
                      scoring='roc_auc',
                      verbose=0,
                      cv=skf,
                      n_jobs=-1)
            sfs = sfs.fit(my_data.values, y.values)

            result_sfs = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
            result_sfs.sort_values('avg_score', ascending=False, inplace=True)
            # best-scoring subset: a tuple of column indices
            features_sfs = list(result_sfs.feature_idx.iloc[0])
            select_features_sfs = list(my_data.columns[features_sfs])

            scores = cross_val_score(model, my_data[select_features_sfs], y,
                                     cv=skf, scoring='roc_auc')
            my_result_list.append({'C': C_param,
                                   'solver': solver,
                                   'auc': scores.mean(),
                                   'std': scores.std(),
                                   'best_columns': select_features_sfs,
                                   'estimator': model})

    my_result = pd.DataFrame(my_result_list)
    my_result.sort_values('auc', ascending=False, inplace=True)

    best_features = my_result.best_columns.head(1).values[0]
    best_model = my_result.estimator.head(1).values[0]

    return best_features, best_model
def figs_of_SFS(self, model=None):
    selector = SFS(model,
                   k_features=self.K,
                   forward=True,
                   floating=False,
                   # scoring='neg_mean_squared_error',
                   cv=0)
    selector.fit(self.train_X, self.train_y)

    model_name = str(model).split('(')[0]
    fig1 = plot_sfs(selector.get_metric_dict(), kind='std_dev')
    plt.title('SFS of {}'.format(model_name))
    plt.grid()
    plt.show()
def test_randomholdoutsplit_in_sfs():
    h_iter = RandomHoldoutSplit(valid_size=0.3, random_seed=123)
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=h_iter)

    sfs1 = sfs1.fit(X, y)
    d = sfs1.get_metric_dict()
    assert d[1]['cv_scores'].shape[0] == 1
def plot_feed_forward_models():
    """
    Plots the performance for each iteration of the feedforward model.
    The numbers of features chosen are 15 and 20, since these showed the best results.
    """
    # create Linear Regression model
    regr = LinearRegression()
    sfs_model = SequentialFeatureSelector(regr,
                                          k_features=15,
                                          forward=True,
                                          floating=False,
                                          scoring='neg_mean_squared_error',
                                          cv=10)
    sfs_model = sfs_model.fit(X_train, y_train)

    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Linear Regression (w. StdErr)')
    plt.grid()
    plt.show()

    # Same for the Decision Tree, with some different settings
    clf = tree.DecisionTreeClassifier()
    sfs_model = SequentialFeatureSelector(clf,
                                          k_features=20,
                                          forward=True,
                                          floating=False,
                                          scoring='accuracy',
                                          cv=10)
    sfs_model = sfs_model.fit(X_train, y_train_binned)

    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Decision Tree (w. StdErr)')
    plt.grid()
    plt.show()
def test_predefinedholdoutsplit_in_sfs():
    h_iter = PredefinedHoldoutSplit(valid_indices=[0, 1, 99])
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=h_iter)

    sfs1 = sfs1.fit(X, y)
    d = sfs1.get_metric_dict()
    assert d[1]['cv_scores'].shape[0] == 1
def sequential_feature_selection(data_set, y_values, want_graph):
    lr = LinearRegression()
    sfs = SFS(lr,
              k_features=13,
              forward=True,
              floating=False,
              scoring='neg_mean_squared_error',
              cv=10)
    sfs = sfs.fit(data_set, y_values)

    if want_graph:
        fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection (w. StdErr)')
        plt.grid()
        plt.show()
    return sfs
def perform_sfs(curr_classifier, X_train, X_test, y_train, y_test):
    sfs1 = SFS(curr_classifier,
               k_features=100,
               verbose=0,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=5,
               n_jobs=8)
    sfs1 = sfs1.fit(X_train, y_train)

    df = pd.DataFrame.from_dict(sfs1.get_metric_dict(), orient='index')
    df[['accuracy', 'precision', 'recall', 'f1_score', 'confusion_matrix']] = (
        df['feature_idx']
        .apply(lambda x: get_test_score(X_train, X_test, y_train, y_test, x, curr_classifier))
        .apply(pd.Series))
    return df
def fse_sfs(bcl, X, d, m, cv=0, show=0):
    estimator = defineModel(bcl)
    sfs = SFS(estimator,
              k_features=m,
              forward=True,
              floating=False,
              verbose=2,
              scoring='accuracy',
              cv=cv)
    sfs = sfs.fit(X, d)
    sel = sfs.k_feature_idx_
    print(' ')
    if show:
        plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection (w. StdErr)')
        plt.grid()
        plt.show()
    return sel
def feature_selection(regr, train):
    x, y = train.drop(columns=['Id', 'SalePrice']), train['SalePrice']
    regr.fit(x, y)
    sfs = SFS(regr,
              k_features=x.shape[1] - 10,
              forward=False,
              verbose=2,
              scoring='neg_mean_squared_error',
              cv=4)
    sfs.fit(x, y)
    selected_features = (pd.DataFrame(sfs.get_metric_dict())
                         .T
                         .loc[:, ['feature_names', 'avg_score', 'std_dev', 'std_err']]
                         .sort_values(['avg_score', 'std_dev'], ascending=False)
                         .reset_index(drop=True))
    best_features = selected_features.at[0, 'feature_names']
    best_features = list(best_features)
    bad_features = [f for f in x if f not in best_features]
    return bad_features
def FSRCV(X_train, y_train, forward=True, cv=10):
    """
    Sequential Feature Selector from the mlxtend package, used to implement a
    brute-force forward/backward selection.
    https://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/.
    They also have a GridSearchCV function.
    Add/remove the feature that reports the best/worst MAE score in the model
    and run cross validation (default 10-fold).

    Args:
        X_train: numpy
        y_train: numpy
        forward: FSR, iteratively adding features
        cv: default=10
    Returns:
        sfs1: model
        cv_scores: list of (mean, std error) tuples, one per subset size
        X_train_sfs: the new subset based on the selected features
    """
    # FSR
    estimator = LinearRegression()
    sfs1 = SFS(estimator,
               k_features=X_train.shape[1],
               forward=forward,
               floating=False,
               verbose=2,
               scoring='neg_mean_absolute_error',
               cv=cv)
    sfs1 = sfs1.fit(X_train, y_train)
    X_train_sfs = sfs1.transform(X_train)  # selected best

    # get cv scores
    fsr_results = sfs1.get_metric_dict()
    fsr = pd.DataFrame.from_dict(fsr_results).T
    score = fsr.avg_score
    std = fsr.std_err
    cv_scores = [x for x in zip(score, std)]

    return sfs1, cv_scores, X_train_sfs
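# A hedged usage sketch for FSRCV above; the make_regression demo data and the
# parameter values are assumptions made purely for illustration.
import numpy as np
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=8, noise=0.5, random_state=0)

sfs_model, cv_scores, X_demo_sfs = FSRCV(X_demo, y_demo, forward=True, cv=5)

# One (mean score, standard error) pair per subset size; scores are negative MAE,
# so values closer to zero are better.
for size, (mean_score, std_err) in enumerate(cv_scores, start=1):
    print('%d feature(s): mean=%.3f, std_err=%.3f' % (size, mean_score, std_err))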
def select_by_SFS(self, model=None):
    # Forward selection: start from an empty feature set and add the best
    # feature to the set one at a time.
    # Display: how the score changes as the number of features grows.
    # Up to K best features can be selected.
    selector = SFS(model,
                   k_features=self.K,
                   forward=True,
                   floating=False,
                   # scoring='neg_mean_squared_error',
                   cv=0)
    selector.fit(self.train_X, self.train_y)
    k_feature = selector.k_feature_names_
    print('selected features:', k_feature)
    print('selected index:', selector.k_feature_idx_)

    if self.showFig:
        model_name = str(model).split('(')[0]
        plot_sfs(selector.get_metric_dict(), kind='std_dev')
        plt.title('SFS of {}'.format(model_name))
        plt.grid()
        plt.show()
x_scaled_np = StandardScaler().fit_transform(x_data)
x_scaled_np = PolynomialFeatures(degree=2).fit_transform(x_scaled_np)
print(y)
print(x_scaled_np)

cv = RepeatedKFold(n_splits=5, n_repeats=20)
bins = np.linspace(y.min(), y.max(), 5)
labels = ["1", "2", "3", "4"]
Y_groups = pd.cut(y, bins)

sfs = SFS(regr,
          floating=True,
          verbose=2,
          k_features=2,
          forward=False,
          n_jobs=2,
          scoring='neg_mean_absolute_error',
          cv=cv)
sfs.fit(x_scaled_np, y)

print("Optimal number of features : %d" % len(sfs.k_feature_idx_))
print('Best features :', sfs.k_feature_names_)
print('Best score :', sfs.k_score_)
print(sfs.get_params())
print(sfs)

fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev', figsize=(6, 4))
plt.show()
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=1)
sfs_ridge_forward = SFS(Ridge(alpha=0.1),
                        k_features=4,
                        forward=True,
                        floating=True,
                        scoring='neg_mean_squared_error',
                        verbose=2,
                        cv=cv)
sfs_ridge_forward.fit(X_norm, y)
sfs_ridge_forward.k_feature_names_

fig1 = plot_sfs(sfs_ridge_forward.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.ylabel('Performance')
plt.grid()
plt.savefig("forward_processing_Porperty_ridge_" + name + ".png", dpi=300)
plt.show()

X_selcted_columns = list(sfs_ridge_forward.k_feature_names_)
X_selected = X_norm[X_selcted_columns]

ridge = Ridge()
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=1)
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 100]}
ridge_regressor = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=cv)
result_ridge = ridge_regressor.fit(X_selected, y)
nval = test[data_type == 'validation'].shape[0]
split = trainN - nval
cv = args.cv or [(list(range(split)), list(range(split, trainN)))]

print('original train_err:\n',
      pd.concat((train.apply(lambda x: log_loss(train_y, x)),
                 pd.Series(range(train.shape[1]), name='#', index=train.columns)), axis=1))
print('simple av validation error:',
      log_loss(train_y.iloc[split:], train.iloc[split:].mean(axis=1)))

Cs = 10. ** np.array(np.linspace(-4, 5, 50))
k_features = (1, min(train.shape[1], args.max))
lr = LogisticRegressionCV(Cs=Cs, fit_intercept=True)
# lr.fit(train, train_y)

if args.fs == 'sfs':
    fs = SFS(lr, k_features=k_features, forward=True, floating=True,
             scoring='neg_log_loss', cv=cv, verbose=2)
else:
    fs = EFS(lr, min_features=1, max_features=min(train.shape[1], 8),
             scoring='neg_log_loss', cv=cv)
fs.fit(train.values, train_y.values)

print()
print(pd.DataFrame.from_dict(fs.get_metric_dict()).T)
if args.fs == 'sfs':
    print('SFS best score:', fs.k_score_)
    print(len(fs.k_feature_idx_), 'features:', fs.k_feature_idx_)
else:
    print('EFS best score:', fs.best_score_)
    print(len(fs.best_idx_), 'features:', fs.best_idx_)

lr.fit(fs.transform(train.iloc[:split].values), train_y.iloc[:split])
print()
print('Regularization C:', lr.C_)
print('validation error fitting on train:',
      log_loss(train_y.iloc[split:],
               lr.predict_proba(fs.transform(train.iloc[split:].values))[:, 1]))

lr.fit(fs.transform(train.iloc[split:].values), train_y.iloc[split:])
print('validation error fitting on val:',
      log_loss(train_y.iloc[split:],
               lr.predict_proba(fs.transform(train.iloc[split:].values))[:, 1]))

if args.out:
           forward=False, floating=False, scoring='r2', cv=0)
sbs.fit(x_train, y_train)
sbs.k_feature_names_

sfs1 = SFS(LinearRegression(), k_features=(1, 7), forward=True, floating=False, cv=0)
sfs1.fit(x_train, y_train)

from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

fig1 = plot_sfs(sfs1.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

dtree = DecisionTreeRegressor()
model = dtree.fit(x_train, y_train)  # train parameters: features and target
dtree_pred = dtree.predict(x_test)

# Visualize the minimal-error regression tree
# export_graphviz doesn't work in Jupyter Notebook - COMMENT OUT IF USING JUPYTER NOTEBOOK
from io import StringIO  # used to visualize the tree
from IPython.display import Image  # used to visualize the tree
import pydotplus  # used to visualize the tree

dot_data = StringIO()
export_graphviz(dtree, out_file=dot_data,
# **Best subset of Features selected after feature selection process.**

# In[199]:

f_selector.k_feature_names_

# **Plot of Number of Features v/s Performance of Regressor.**

# In[200]:

plot_sfs(f_selector.get_metric_dict(), kind='std_dev')

# Selecting the best subset of features and removing others from X_Train.

# In[201]:

feat_random_forest = list(f_selector.k_feature_names_)
X_train_rf = X_train_rf.loc[:, list(f_selector.k_feature_names_)]

# Using GridSearchCV for Hyperparameter Tuning.

# In[206]:
# the auc of random forest is: 0.8112633992853715
f_number = 50
sfs4 = SFS(clfRandomForest,
           k_features=f_number,
           forward=True,
           floating=False,
           scoring='roc_auc',
           cv=5)
result4 = sfs4.fit(X_train, y_train, custom_feature_names=feature_names)
# print(X)
result4.subsets_
result4.k_score_

selection_res = pd.DataFrame.from_dict(sfs4.get_metric_dict()).T
# print(selection_res)
selection_res.to_csv(
    "/Users/shuojiawang/Documents/ppdmodel/result1907/selection_log_withistoryrf.csv",
    sep='\t')

selected_feature_idx = result4.k_feature_idx_
# print(type(selected_feature_idx))
selected_feature = list(selected_feature_idx)
feature_name = []
for i in selected_feature:
    feature_name.append(feature_names[i])
print(feature_name)

fig = plot_sfs(sfs4.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
y_pred = model.predict(X_test_scoring)
predictions = [round(value) for value in y_pred]
IG_Test_accuracy = accuracy_score(y_test_scoring, predictions)
print('Info Gain Accuracy (Test, Hold-Out): %.2f%%' % (IG_Test_accuracy * 100.0))

# WRAPPER-BASED FORWARD SEQUENTIAL SEARCH
# The Forward Sequential Search will use the Gradient Boost classifier and look at all the
# features added sequentially. Then, re-evaluate using the least amount of features which
# give the best accuracy.

# It doesn't appear to add any value past ~7 features, so change k_features to 7 if this runs slowly
sfs_forward = SFS(model, k_features=44, forward=True, verbose=1,
                  scoring='accuracy', cv=10, n_jobs=-1)
sfs_forward = sfs_forward.fit(X_train, y_train)

# This will create a graphic that shows performance (accuracy) as a solid blue line for each
# feature added, and the faint blue is the standard error for that feature
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

fig1 = plot_sfs(sfs_forward.get_metric_dict(), kind='std_dev', figsize=(10, 5))
plt.ylim([0.5, 1])
plt.title('Sequential Forward Selection (Standard Error)')
plt.grid()
plt.show()

"""
DISCUSSION
From the graph above, it would appear that 7 features would give the best model; after that
the model performance plateaus again, as with Information Gain. We will re-run the model
using the 7 best features.
"""

# Rerun with 7 features
sfs_forward = SFS(model, k_features=7, forward=True, verbose=1,
                  scoring='accuracy', cv=10, n_jobs=-1)
sfs_forward = sfs_forward.fit(X_train, y_train)

# Get the 7 features used
        1, 64),  # (1, 64): SFS will return whichever feature combination between min and max scored highest in cross-validation
        forward=FLAGS.FORWARD,    # forward or backward
        floating=FLAGS.FLOATING,  # put features back?
        verbose=0,
        scoring='accuracy',       # 'neg_mean_squared_error'
        cv=5)
    sfs = sfs.fit(X, y)

    best_feature_index = sfs.k_feature_idx_
    best_feature_name = [feature_names[i] for i in best_feature_index]
    print("The number of best features is:", len(best_feature_index))
    print("The best features' index are:", best_feature_index)
    print("The best features are:", best_feature_name)

    fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
    config = {(True, True): ('FORWARD', 'FLOATING'),
              (True, False): ('FORWARD', 'NFLOATING'),
              (False, True): ('BACKWARD', 'FLOATING'),
              (False, False): ('BACKWARD', 'NFLOATING')}
    fig.savefig('feature_selection/SINGLE-' + config[(FLAGS.FORWARD, FLAGS.FLOATING)][0] +
                '-' + config[(FLAGS.FORWARD, FLAGS.FLOATING)][1])
else:
    config = {(True, True): ('FORWARD', 'FLOATING'),
              (True, False): ('FORWARD', 'NFLOATING'),
              (False, True): ('BACKWARD', 'FLOATING'),
              (False, False): ('BACKWARD', 'NFLOATING')}
    best_feature_index_array = config.copy()
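# A minimal standalone sketch of the (min, max) form of k_features used in the fragment
# above; the iris data and KNN estimator are illustrative assumptions, not part of the
# original script.
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

X_demo, y_demo = load_iris(return_X_y=True)
sfs_demo = SFS(KNeighborsClassifier(n_neighbors=3),
               k_features=(1, 4),   # consider every subset size from 1 to 4 features
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=5)
sfs_demo = sfs_demo.fit(X_demo, y_demo)

# The best-scoring size within the range is kept.
print(sfs_demo.k_feature_idx_, sfs_demo.k_score_)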
Y = pd.read_csv('original_data/y_train.csv', names=['target'], delimiter=';')

estimator = ExtraTreesClassifier(criterion="gini",
                                 max_features=0.4,
                                 min_samples_split=6,
                                 n_estimators=100)

sfs1 = SFS(estimator,
           k_features=(10, 40),
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5,
           n_jobs=4)

sfs1 = sfs1.fit(X[X.columns].to_numpy(), Y['target'].values)

results = pd.DataFrame.from_dict(sfs1.get_metric_dict()).T

fig1 = plot_sfs(sfs1.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

# (96, 97, 98, 131, 200, 138, 11, 76, 115, 83, 212, 182, 187, 156)
# 0.642879680873
# (96, 98, 131, 138, 11, 76, 43, 209, 115, 182, 29)
# 0.638581676053

print(sfs1.subsets_)
print(sfs1.k_feature_idx_)
print(sfs1.k_score_)
#print("Features selected in forward fit") #print(x.columns[b]) #%% FORWARD FIT - Sequential Search (SFS) sfs_f = SFS( lr, k_features=(1, predictors.shape[1]), forward=True, # Forward fit floating=False, scoring='neg_mean_squared_error', cv=5) # Fit this on the data sfs_f = sfs_f.fit(x.values, y.values) # Get all the details of the forward fits a = sfs_f.get_metric_dict() n = [] o = [] # Compute the mean cross validation scores for i in np.arange(1, predictors.shape[1]): n.append(-np.mean(a[i]['cv_scores'])) m = np.arange(1, predictors.shape[1]) # Plot the CV scores vs the number of features fig1 = plt.plot(m, n, label='SFS_f') plt.title('Mean CV Scores vs N# of features') plt.xlabel('N# features') plt.ylabel('MSE') # Forward steps with Cross-Validation
knn = KNeighborsClassifier(n_neighbors=4)
lr = LinearRegression()

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

lr = LinearRegression()
sfs = SFS(lr,
          k_features=13,
          forward=True,
          floating=False,
          scoring='neg_mean_squared_error',
          cv=10)
sfs = sfs.fit(X, Y)

fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()

# In[148]:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Funded_amt_inv
# Term
# Grade
# Subgrade
# Dti
# Delinq_2_yrs
def sequential_feature_selector(features, labels, classifier, k_features, kfold,
                                selection_type, plot=True, **kwargs):
    """Sequential feature selection to reduce the number of features.

    The function reduces a d-dimensional feature space to a k-dimensional
    feature space by sequential feature selection. The features are selected
    using ``mlxtend.feature_selection.SequentialFeatureSelector``, which
    essentially adds or removes one feature at a time until the preferred
    size is reached.

    The function will forward ``features`` on to the classifier's
    ``static_opts`` method.

    Args:
        features: The original d-dimensional feature space
        labels: corresponding labels
        classifier (str or object): The classifier which should be used for
            feature selection. This can be either a string (name of a
            classifier known to gumpy) or an instance of a classifier which
            adheres to the sklearn classifier interface.
        k_features (int): Number of features to select
        kfold (int): k-fold cross validation
        selection_type (str): One of ``SFS`` (Sequential Forward Selection),
            ``SBS`` (Sequential Backward Selection), ``SFFS`` (Sequential
            Forward Floating Selection), ``SBFS`` (Sequential Backward
            Floating Selection)
        plot (bool): Plot the results of the dimensionality reduction
        **kwargs: Additional keyword arguments that will be passed to the
            Classifier instantiation

    Returns:
        A 5-element tuple containing

        - **feature index**: Index of features in the remaining set
        - **cv_scores**: cross validation scores during classification
        - **algorithm**: Algorithm that was used for search
        - **sfs**: the fitted SequentialFeatureSelector
        - **clf**: the backend classifier that was used
    """

    # retrieve the appropriate classifier
    if isinstance(classifier, str):
        if not (classifier in available_classifiers):
            raise ClassifierError("Unknown classifier {c}".format(c=classifier.__repr__()))

        kwopts = kwargs.pop('opts', dict())
        # opts = dict()

        # retrieve the options that we need to forward to the classifier
        # TODO: should we forward all arguments to sequential_feature_selector ?
        opts = available_classifiers[classifier].static_opts('sequential_feature_selector',
                                                             features=features)
        opts.update(kwopts)

        # XXX: now merged into the static_opts invocation.
        # TODO: test
        # if classifier == 'SVM':
        #     opts['cross_validation'] = kwopts.pop('cross_validation', False)
        # elif classifier == 'RandomForest':
        #     opts['cross_validation'] = kwopts.pop('cross_validation', False)
        # elif classifier == 'MLP':
        #     # TODO: check if the dimensions are correct here
        #     opts['hidden_layer_sizes'] = (features.shape[1], features.shape[2])
        # get all additional entries for the options
        # opts.update(kwopts)

        # retrieve a classifier object
        classifier_obj = available_classifiers[classifier](**opts)

        # extract the backend classifier
        clf = classifier_obj.clf
    else:
        # if we received a classifier object we'll just use this one
        clf = classifier.clf

    if selection_type == 'SFS':
        algorithm = "Sequential Forward Selection (SFS)"
        sfs = SFS(clf, k_features, forward=True, floating=False,
                  verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SBS':
        algorithm = "Sequential Backward Selection (SBS)"
        sfs = SFS(clf, k_features, forward=False, floating=False,
                  verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SFFS':
        algorithm = "Sequential Forward Floating Selection (SFFS)"
        sfs = SFS(clf, k_features, forward=True, floating=True,
                  verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SBFS':
        algorithm = "Sequential Backward Floating Selection (SBFS)"
        sfs = SFS(clf, k_features, forward=False, floating=True,
                  verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    else:
        raise Exception("Unknown selection type '{}'".format(selection_type))

    pipe = make_pipeline(StandardScaler(), sfs)
    pipe.fit(features, labels)
    subsets = sfs.subsets_
    feature_idx = sfs.k_feature_idx_
    cv_scores = sfs.k_score_

    if plot:
        fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
        plt.ylim([0.5, 1])
        plt.title(algorithm)
        plt.grid()
        plt.show()

    return feature_idx, cv_scores, algorithm, sfs, clf
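# A hedged usage sketch for sequential_feature_selector above; the classifier name 'SVM'
# and the random demo data are assumptions for illustration only, since the registered
# entries in available_classifiers depend on the surrounding gumpy setup.
import numpy as np

features = np.random.rand(100, 12)          # 100 trials, 12 features (made-up data)
labels = np.random.randint(0, 2, size=100)  # binary labels

feature_idx, cv_scores, algorithm, sfs, clf = sequential_feature_selector(
    features, labels,
    classifier='SVM',       # assumed to be registered in available_classifiers
    k_features=5,
    kfold=5,
    selection_type='SFS',
    plot=False)

print(algorithm, feature_idx, cv_scores)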
logreg = linear_model.LogisticRegression()
sfs = SFS(logreg,
          k_features=30,
          forward=True,
          floating=False,
          scoring='roc_auc',
          cv=4)
sfs = sfs.fit(X, y)

print('\nSequential Forward Selection (k=30):')
print(sfs.k_feature_idx_)
print('CV Score:')
print(sfs.k_score_)

pd.DataFrame.from_dict(sfs.get_metric_dict()).T

plt.figure(figsize=(19, 10))
fig = plot_sfs(sfs.get_metric_dict(), kind=None)
plt.title('Sequential Forward Selection (roc_auc)')
plt.grid()
plt.show()

# In[7]:

idxs_selected = sfs.k_feature_idx_
featureindex = []
for i in idxs_selected:
    featureindex.append(i)

featuredataframe = df.iloc[:, 1:134]
        dic[i] = rfe.score()
    plt.xlabel('feature_num')
    plt.ylabel('score')
    plt.plot(list(dic.keys()), list(dic.values()))
    plt.show()
    return dic


if __name__ == "__main__":
    train_data = load_data(train_url)
    train_y = train_data['price']
    train_data.drop(['SaleID'], axis=1, inplace=True)
    train_data.drop(['price'], axis=1, inplace=True)

    col_name = [
        'name', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
        'kilometer', 'notRepairedDamage', 'regionCode', 'v_0', 'v_3', 'v_12',
        'usedTime'
    ]

    sfs = SFS(LinearRegression(),
              k_features=13,
              forward=True,
              floating=False,
              scoring='r2',
              cv=0)
    train_data = train_data.fillna(0)
    sfs.fit(train_data, train_y)
    print(sfs.k_feature_names_)
    print(pd.DataFrame.from_dict(sfs.get_metric_dict()).T)

    fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
    plt.grid()
    plt.show()
df_sim = comp.load_dataframe(datatype='sim', config=args.config)

# Extract training features and targets from simulation DataFrame
feature_list, feature_labels = comp.get_training_features()
X_train_sim, X_test_sim, y_train_sim, y_test_sim, le = comp.get_train_test_sets(
    df_sim, feature_list, comp_class=True)

# Load pipeline to use
pipeline = comp.get_pipeline(args.pipeline)

k_features = X_train_sim.shape[1] if args.method == 'forward' else 1

# Set up sequential feature selection algorithm
sfs = SFS(pipeline,
          k_features=k_features,
          forward=True if args.method == 'forward' else False,
          floating=args.floating,
          scoring=args.scoring,
          print_progress=True,
          cv=args.cv,
          n_jobs=args.n_jobs)

# Run algorithm
sfs = sfs.fit(X_train_sim, y_train_sim)

# Get DataFrame of sfs results
results_df = pd.DataFrame.from_dict(sfs.get_metric_dict()).T

# Save DataFrame to csv file
output_file = 'SFS-results/{}_{}_{}_{}_cv{}.csv'.format(
    args.pipeline, args.method,
    'floating' if args.floating else 'nofloat',
    args.scoring, args.cv)
results_df.to_csv(output_file)