def fsel(bcl, X, d, m, forward=True, floating=False, cv=0, show=0):
    if show > 0:
        print('Feature Selection - ' + bcl[0] + ': - number of features reducing from ' +
              str(X.shape[1]) + ' to ' + str(m) + ' ...')
    if bcl[0] == 'Fisher':
        sel = sfsfisher(X, d, m)
    else:
        estimator = defineModel(bcl)
        # pass the forward/floating arguments through instead of hardcoding them
        sfs = SFS(estimator, k_features=m, forward=forward, floating=floating,
                  verbose=show, scoring='accuracy', cv=cv)
        sfs = sfs.fit(X, d)
        sel = list(sfs.k_feature_idx_)
        if show > 0:
            print(' ')
        if show:
            plot_sfs(sfs.get_metric_dict(), kind='std_err')
            plt.title('Sequential Forward Selection')
            plt.grid()
            plt.show()
    return sel
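# Minimal, self-contained sketch of the SFS + plot_sfs pattern that fsel wraps.
# The iris data, KNeighborsClassifier and k_features=3 are illustrative
# assumptions, not part of the original code.
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

X_demo, y_demo = load_iris(return_X_y=True)
sfs_demo = SFS(KNeighborsClassifier(n_neighbors=3), k_features=3,
               forward=True, floating=False, scoring='accuracy', cv=5)
sfs_demo = sfs_demo.fit(X_demo, y_demo)
print('Selected feature indices:', sfs_demo.k_feature_idx_)

plot_sfs(sfs_demo.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection')
plt.grid()
plt.show()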
def sfs_eval(model_, xtest):
    # get the best features from the fitted 'selector' step of the best pipeline
    best_feats_idx = model_.best_estimator_['selector'].k_feature_idx_
    best_feats = xtest.columns[list(best_feats_idx)].tolist()
    print('\nBest features: \n{}'.format(best_feats))

    # plot feature selection characteristics
    plot_sfs(model_.best_estimator_['selector'].get_metric_dict(),
             kind='std_err', figsize=(12, 5))
    plt.title('Sequential Forward Selection (w. StdErr)')
    plt.grid(True, which='major', axis='both')
    plt.show()
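# Hedged sketch (not from the original source): builds the kind of fitted
# GridSearchCV-over-Pipeline object sfs_eval expects, with the SFS step
# explicitly named 'selector'. The dataset, estimator and parameter grid are
# assumptions; sfs_eval's own imports (plot_sfs, plt) are presumed in scope.
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

data = load_breast_cancer()
X_df = pd.DataFrame(data.data, columns=data.feature_names)
y_vec = data.target

pipe = Pipeline([
    ('selector', SFS(LogisticRegression(max_iter=5000), k_features=3,
                     forward=True, scoring='accuracy', cv=3)),
    ('clf', LogisticRegression(max_iter=5000)),
])
grid = GridSearchCV(pipe, {'clf__C': [0.1, 1.0]}, cv=3)
grid.fit(X_df, y_vec)

sfs_eval(grid, X_df)  # prints the selected column names and plots the SFS curve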
def make_plot(sfs, k, weights, is_forward):
    fig = plot_sfs(
        sfs.get_metric_dict(),
        kind='std_dev',
    )
    axes = fig.add_subplot(111)
    # keep only every third x-tick label to reduce clutter
    a = axes.get_xticks().tolist()
    for i in range(len(a)):
        if i % 3 != 0:
            a[i] = ''
    axes.set_xticklabels(a)
    axes.tick_params(axis='both', which='major', labelsize=40)
    axes.tick_params(axis='both', which='minor', labelsize=40)
    fig.set_size_inches(20, 12, forward=True)
    plt.ylim([0.70, 1.00])
    plt.xlabel('Number of features', fontsize=40)
    plt.ylabel(SCORING, fontsize=40)
    # plt.title('Sequential feature selection (k = {0}, weights = {1})'.format(
    #     k, weights
    # )
    # )
    # plt.grid(True)
    plt.savefig(
        '../../results/knn/features_selection/' +
        'knn_k={0}_weights={1}_forward={2}.svg'.format(k, weights, is_forward),
        format='svg', dpi=300)
def plot(self):
    if self.selector_name == 'rfe':
        # RFECV stores the step size in `step` (a fraction below 1 means a share of features)
        step = self.selector.step if self.selector.step > 1 else int(
            self.selector.step * self.dim)
        plt.figure(figsize=(12, 9))
        plt.xlabel(f'Number of features tested x {step}')
        plt.ylabel('Cross-validation score')
        plt.plot(range(1, len(self.selector.grid_scores_) + 1),
                 self.selector.grid_scores_)
        # plt.savefig('ELO-lgbmcv-02.png', dpi=150)
        plt.show()
    else:
        # sequential feature selector (mlxtend) exposes get_metric_dict()
        plot_sfs(self.selector.get_metric_dict(), kind='std_dev')
        plt.ylim([0.8, 1])
        plt.title('Sequential Forward Selection (w. StdDev)')
        plt.show()
def fse_sfs(bcl, X, d, m, cv=0, show=0):
    estimator = defineModel(bcl)
    sfs = SFS(estimator, k_features=m, forward=True, floating=False,
              verbose=2, scoring='accuracy', cv=cv)
    sfs = sfs.fit(X, d)
    sel = sfs.k_feature_idx_
    print(' ')
    if show:
        plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection (w. StdErr)')
        plt.grid()
        plt.show()
    return sel
def select_by_SFS(self, model=None):
    # Forward selection: start from an empty feature set and add the best
    # feature to the set one at a time.
    # Also plots how the score changes as the number of features grows.
    # Selects the K best features.
    selector = SFS(model,
                   k_features=self.K,
                   forward=True,
                   floating=False,
                   # scoring='neg_mean_squared_error',
                   cv=0)
    selector.fit(self.train_X, self.train_y)
    k_feature = selector.k_feature_names_
    print('selected features:', k_feature)
    print('selected index:', selector.k_feature_idx_)
    if self.showFig:
        model_name = str(model).split('(')[0]
        plot_sfs(selector.get_metric_dict(), kind='std_dev')
        plt.title('SFS of {}'.format(model_name))
        plt.grid()
        plt.show()
def figs_of_SFS(self, model=None):
    selector = SFS(model,
                   k_features=self.K,
                   forward=True,
                   floating=False,
                   # scoring='neg_mean_squared_error',
                   cv=0)
    selector.fit(self.train_X, self.train_y)
    model_name = str(model).split('(')[0]
    fig1 = plot_sfs(selector.get_metric_dict(), kind='std_dev')
    plt.title('SFS of {}'.format(model_name))
    plt.grid()
    plt.show()
def plot_feed_forward_models():
    """
    Plots the performance for each iteration of the feedforward model.
    The numbers of features chosen are 15 and 20, since these showed the best results.
    """
    # create Linear Regression model
    regr = LinearRegression()
    sfs_model = SequentialFeatureSelector(regr,
                                          k_features=15,
                                          forward=True,
                                          floating=False,
                                          scoring='neg_mean_squared_error',
                                          cv=10)
    sfs_model = sfs_model.fit(X_train, y_train)

    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Linear Regression (w. StdErr)')
    plt.grid()
    plt.show()

    # Same for the Decision Tree, with some different settings
    clf = tree.DecisionTreeClassifier()
    sfs_model = SequentialFeatureSelector(clf,
                                          k_features=20,
                                          forward=True,
                                          floating=False,
                                          scoring='accuracy',
                                          cv=10)
    sfs_model = sfs_model.fit(X_train, y_train_binned)

    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Decision Tree (w. StdErr)')
    plt.grid()
    plt.show()
def sequential_feature_selection(data_set, y_values, want_graph):
    lr = LinearRegression()
    sfs = SFS(lr,
              k_features=13,
              forward=True,
              floating=False,
              scoring='neg_mean_squared_error',
              cv=10)
    sfs = sfs.fit(data_set, y_values)
    if want_graph:
        fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection (w. StdErr)')
        plt.grid()
        plt.show()
    return sfs
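# Hedged usage sketch (not from the original source): exercises
# sequential_feature_selection on synthetic data. make_regression and the
# 20-feature setup are illustrative assumptions; the function needs at least
# 13 columns because it hardcodes k_features=13, and it assumes matplotlib
# (plt) is imported where it is defined.
import pandas as pd
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=20, noise=0.5,
                                 random_state=0)
X_demo = pd.DataFrame(X_demo, columns=[f'f{i}' for i in range(X_demo.shape[1])])

fitted_sfs = sequential_feature_selection(X_demo, y_demo, want_graph=True)
print('Selected columns:', fitted_sfs.k_feature_names_)
print('Best CV score (neg MSE):', fitted_sfs.k_score_)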
y_pred = model.predict(X_test_scoring)
predictions = [round(value) for value in y_pred]
IG_Test_accuracy = accuracy_score(y_test_scoring, predictions)
print('Info Gain Accuracy (Test, Hold-Out): %.2f%%' % (IG_Test_accuracy * 100.0))

# WRAPPER-BASED FORWARD SEQUENTIAL SEARCH
# The Forward Sequential Search will use the Gradient Boost classifier and look at all the
# features added sequentially, then re-evaluate using the smallest number of features that
# gives the best accuracy.
# It doesn't appear to add any value past ~7 features, so change k_features to 7 if this runs slowly
sfs_forward = SFS(model, k_features=44, forward=True, verbose=1,
                  scoring='accuracy', cv=10, n_jobs=-1)
sfs_forward = sfs_forward.fit(X_train, y_train)

# This will create a graphic that shows performance (accuracy) as a solid blue line for each
# feature added; the faint blue band is the standard deviation for that feature count.
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

fig1 = plot_sfs(sfs_forward.get_metric_dict(), kind='std_dev', figsize=(10, 5))
plt.ylim([0.5, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

"""
DISCUSSION
From the graph above, it appears that 7 features give the best model; beyond that, model
performance plateaus again, as with Information Gain. We will re-run the model using the
7 best features.
"""

# Rerun with 7 features
sfs_forward = SFS(model, k_features=7, forward=True, verbose=1,
                  scoring='accuracy', cv=10, n_jobs=-1)
sfs_forward = sfs_forward.fit(X_train, y_train)

# Get the 7 features used
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=1)
sfs_ridge_forward = SFS(Ridge(alpha=0.1),
                        k_features=4,
                        forward=True,
                        floating=True,
                        scoring='neg_mean_squared_error',
                        verbose=2,
                        cv=cv)
sfs_ridge_forward.fit(X_norm, y)
sfs_ridge_forward.k_feature_names_

fig1 = plot_sfs(sfs_ridge_forward.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.ylabel('Performance')
plt.grid()
plt.savefig("forward_processing_Porperty_ridge_" + name + ".png", dpi=300)
plt.show()

X_selected_columns = list(sfs_ridge_forward.k_feature_names_)
X_selected = X_norm[X_selected_columns]

ridge = Ridge()
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=1)
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35,
                        40, 45, 50, 55, 100]}
ridge_regressor = GridSearchCV(ridge, parameters,
                               scoring='neg_mean_squared_error', cv=cv)
result_ridge = ridge_regressor.fit(X_selected, y)
# build a class-balanced training subset with smpnum samples per class
y_btr = y[y == 1][:smpnum]
x_btr = x[y == 1][:smpnum]
for i in range(2, 6):
    x_btr = np.concatenate([x_btr, x[y == i][:smpnum]])
    y_btr = np.concatenate([y_btr, y[y == i][:smpnum]])

x_tr, x_te, y_tr, y_te = train_test_split(
    x_btr,
    y_btr,
    test_size=0.20,
)
best = do_sfs(x_tr, y_tr)

# examine the results
plot = plot_sfs(best.get_metric_dict())
plot[1].figure.savefig("SFS-" + str(n_features) + ".png")
for i in range(1, 11):
    print(i, best.get_metric_dict()[i]['avg_score'])

test_svm(x_all, y_all)

# make a more select dataset
# Filter the rest of the data
x_obs, y_obs, x_nuls = load_data()
keep = list(best.k_feature_idx_)
np.save('sfs_features', keep)
# keep = np.load('sfs_features.npy')
x_obs = x_obs[:, keep]
x_nuls = x_nuls[:, keep]
# **Best subset of Features selected after feature selection process.**

# In[199]:

f_selector.k_feature_names_

# **Plot of Number of Features vs. Performance of Regressor.**

# In[200]:

plot_sfs(f_selector.get_metric_dict(), kind='std_dev')

# Selecting the best subset of features and removing others from X_train_rf.

# In[201]:

feat_random_forest = list(f_selector.k_feature_names_)
X_train_rf = X_train_rf.loc[:, feat_random_forest]

# Using GridSearchCV for Hyperparameter Tuning.

# In[206]:
def ExecuteSFFS(x, y, featureNames, featureList, clusters, clusterNames, svc,
                kFolds, nbOfSplit, featMaxNbrSFFS, standardizationType,
                removedData, permutation_flag, nbPermutation, balance_flag,
                currentDateTime, resultDir, debug_flag, verbose):
    import scipy
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split as tts
    from sklearn.metrics import confusion_matrix
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
    from sklearn.model_selection import RandomizedSearchCV
    from slpClass_toolbox import BalanceClasses
    from slpClass_toolbox import Standardize
    from slpClass_toolbox import Permute
    from slpClass_toolbox import ComputePermutationAvgDA
    from slpClass_toolbox import PlotPermHist
    from slpClass_toolbox import ApplyStandardization
    from slpClass_toolbox import plot_confusion_matrix

    plt.rcParams.update({'figure.max_open_warning': 0})

    # Get feature values since SFFS works only with numpy arrays!
    bestFeaturesHist = np.zeros([len(featureNames)])
    CvResult = pd.DataFrame()
    permResults = pd.DataFrame()
    tmpBest = []
    DA = []
    avg_perm_DA = []
    skipFS = False            # flag to skip feature selection
    fitFeatOverTresh = False  # fit classifier with most frequent features in best sets

    #********************** TRAIN pre-processing ******************************
    for it in list(range(nbOfSplit)):
        print('\nSplit #{}'.format(str(it)))

        # Use all features or given ones only
        if len(featureList) == 0:
            xx = x
        elif isinstance(featureList[0], float):
            xx = x
            fitFeatOverTresh = True
        else:
            xx = x[featureList]
            skipFS = True

        # Balance the number of old women and old men or not
        if balance_flag:
            X, Y = BalanceClasses(xx, y)
        else:
            X, Y = xx, y

        # split dataset into train and test random subsets
        X_train, X_test, y_train, y_test = tts(X, Y['Cluster'], test_size=0.33,
                                               stratify=Y['Cluster'])

        # Data z-score standardisation
        xTrainSet, zPrm = Standardize(X_train, y_train, standardizationType,
                                      debug_flag)

        #**************************** SVM optimisation ************************
        params_dict = {
            'C': scipy.stats.expon(scale=100),
            'kernel': ['linear'],
            'class_weight': ['balanced', None]
        }
        n_iter_search = 20
        random_search = RandomizedSearchCV(svc, param_distributions=params_dict,
                                           n_iter=n_iter_search)
        random_search.fit(xTrainSet, y_train)
        optimClf = random_search.best_estimator_

        #*************************** TRAIN ************************************
        print('Fitting...')
        if skipFS:
            optimClf = optimClf.fit(xTrainSet.values, y_train)
            yPred = optimClf.predict(xTrainSet.values)
            # Compute the accuracy of the train prediction
            acc = float((y_train == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))
            fitRes = pd.DataFrame(data=[acc], columns=['CV_DA_' + str(it + 1)])
        else:
            # set k_features = (1, X.shape[1]) to test all possible combinations
            sffs = SFS(optimClf, k_features=(1, featMaxNbrSFFS), forward=True,
                       floating=False, scoring='accuracy', cv=kFolds, n_jobs=-1)
            sffs = sffs.fit(xTrainSet.values, y_train)
            print('Best combination for fit #%d (ACC: %.3f): %s' %
                  (it, sffs.k_score_, sffs.k_feature_idx_))

            # Fit the estimator using the new feature subset and make a
            # prediction on the test data
            X_train_sfs = sffs.transform(xTrainSet.values)
            optimClf.fit(X_train_sfs, y_train)

            fitRes = pd.DataFrame.from_dict(sffs.get_metric_dict()).T
            fitRes['avg_over_std'] = fitRes['avg_score'] / fitRes['std_dev']

            if featMaxNbrSFFS > 1:
                # plot feature selection process metrics
                fig1 = plot_sfs(sffs.get_metric_dict(), kind='std_err')
                savedPlotName = resultDir+'Decoding_accuracy_'+clusters+'_'+\
                                str(it)+'_'+str(nbOfSplit)+'.png'
                tmpBest.append(sffs.k_feature_idx_)
                bestFeaturesHist[list(tmpBest[-1])] += 1
                fig1.set_dpi(300)
                plt.tight_layout()
                plt.savefig(savedPlotName, bbox_inches='tight')
                plt.clf()
                plt.close(fig1)

                # plot mean / std
                plt.figure(dpi=300)
                plt.title('Mean over standard deviation')
                plt.xlabel('Number of features in combination')
                plt.xticks(range(featMaxNbrSFFS))
                plt.ylabel('Mean over standard deviation')
                plt.plot(list(range(1, featMaxNbrSFFS + 1)),
                         fitRes['avg_over_std'])
                figName = resultDir+'SFFS_'+clusters+'_bestSet_metric_'+\
                          str(it)+'_'+str(nbOfSplit)
                plt.savefig(figName, bbox_inches='tight')
                plt.clf()
                plt.close()

            # add metrics iteration identifier
            fitRes = fitRes.add_suffix('_' + str(it + 1))

        CvResult = pd.concat([CvResult, fitRes], axis=1)

        #***************************** TEST ***********************************
        print('Testing...')
        # standardize test set using train-set standardization parameters
        xTestSet = ApplyStandardization(X_test, zPrm)

        # prepare test data
        if skipFS:
            xTest = xTestSet
            savedPlotName = resultDir+clusters+'_ConfusionMatrix_'+str(it+1)+\
                            '_'+str(nbOfSplit)
        else:
            # Generate a new subset of data according to selected features
            xTest = sffs.transform(xTestSet.values)
            savedPlotName = resultDir+'SFFS_'+clusters+'_ConfusionMatrix_'+\
                            str(it+1)+'_'+str(nbOfSplit)

        # actually test classifier and compute decoding accuracy on predictions
        y_pred = optimClf.predict(xTest)
        acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
        print('Test set accuracy: %.2f %%' % (acc * 100))
        DA.append(acc)  # stack test DA for further use

        # plot confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        fig_CM = plt.figure(dpi=300)
        plot_confusion_matrix(cm, clusterNames, title=savedPlotName,
                              normalize=True, precision=2)
        plt.clf()
        plt.close(fig_CM)

        #**************** STATISTICAL ASSESSMENT (PERMUTATION) ****************
        if permutation_flag:
            permResults['permutation_DA_' + str(it)] = Permute(
                clusters, xTrainSet, xTestSet, y_train, y_test, nbPermutation,
                standardizationType, debug_flag=0)
            avg_perm_DA.append(
                np.mean(permResults['permutation_DA_' + str(it)]))

    dfDA = pd.DataFrame(data=DA, columns=['DA_test'])
    # CvResult = pd.concat([CvResult, dfDA[:]], axis=1)
    CvResult = pd.concat([
        CvResult,
        dfDA[:],
        pd.DataFrame(data=[np.mean(DA)], columns=['avg_DA'])
    ], axis=1)

    #***************** COMPUTE STATISTICAL ASSESSMENT RESULTS *****************
    if permutation_flag:
        # compute permutation DA average and keep results in a dataframe
        print('\nAverage permutation DA')
        for i in list(range(len(avg_perm_DA))):
            print('\t' + str(avg_perm_DA[i]))
        savedHistName = resultDir + 'Average_Permutation_hist_' + clusters + '.png'
        PlotPermHist(permResults, CvResult['avg_DA'].iloc[0], currentDateTime,
                     savedHistName)
        # formatting permutation results to save in an excel file
        permResults = pd.concat(
            [permResults, ComputePermutationAvgDA(avg_perm_DA)], axis=1)
        print('Mean permutation decoding accuracy : {}'.format(
            np.mean(permResults['Avg_Permutation_DA_per_epoch'])))
    else:
        # binomial law
        from scipy.stats import binom
        q = 0.001  # p value
        n = X.shape[0] + 1  # number of observations (subjects)
        p = 1 / len(clusterNames)  # probability of a correct trial by chance
        luckLvl = pd.DataFrame(data=[binom.isf(q, n, p) / n],
                               columns=['Chance_Level'])

    #****************************** Compute results ***************************
    if not skipFS:
        # Build structure of histogram data to save in excel
        hist = pd.DataFrame(data=featureNames, columns=['Features_Name'])
        hist['Occurence_Best'] = bestFeaturesHist

        # Search best set across every iteration's best set
        best_Combination = tmpBest[np.argmax(DA)]

        # Compute average size of best combination
        l = 0
        for n in list(range(len(tmpBest))):
            l += len(tmpBest[n])
        avgBestCombSize = pd.DataFrame(data=[np.ceil(l / len(tmpBest))],
                                       columns=['avgBestCombSize'])

        # subsetHist = GetSubsetOccurence(tmpBest)
        # PlotHist(subsetHist[1],'Subsets occurences',subsetHist[0],'Comb_Hist.png')

        # Get best set's feature names
        tmp = []
        tmp.append(np.max(DA))
        for i in best_Combination:
            tmp.append(featureNames[i])
            print('\t' + featureNames[i])
        bestFeatNames = pd.DataFrame(data=tmp, columns=['Best_Features_Set'])

        sffsRes = pd.concat([hist, bestFeatNames, avgBestCombSize], axis=1)

        # Plot best combination custom metric (mean / std_dev)
        from slpClass_toolbox import PlotBestCombinationMetrics
        filteredData = CvResult.filter(regex=r'avg_over_std_', axis=1)
        metrics = pd.DataFrame(data=filteredData)
        metrics.dropna(inplace=True)
        figName = resultDir + 'SFFS_' + clusters + '_bestSet_metric_aggreg.png'
        PlotBestCombinationMetrics(metrics, figName)

    # save training and permutation results in an excel file
    nbSubject = pd.DataFrame(data=[len(X)], columns=['Number_Of_Subjects'])

    #************************ Build results structure *************************
    excelResults = pd.concat([
        CvResult,
        permResults if permutation_flag else luckLvl,
        sffsRes if not skipFS else None,
        removedData,
        nbSubject
    ], axis=1)

    print('Mean Decoding accuracy :{}'.format(np.mean(DA)))

    # compute occurrence of every subset in best sets of every iteration
    # from slpClass_toolbox import GetSubsetOccurence
    # subsetHist = GetSubsetOccurence(tmpBest)
    # excelResults = pd.concat([excelResults, subsetHist], axis=1)
    # excelResults.to_excel(saveTo, sheet_name=xlSheetName)

    if fitFeatOverTresh:
        tresh = featureList[0] * nbOfSplit
        bestFeatColumns = hist.iloc[:, 0][hist.iloc[:, 1] > tresh]
        bestDataSet = xx[bestFeatColumns]
        classes = y
        DABestFeat = []
        print('Fitting with features occurring over %d times in best sets' % tresh)
        for i in list(range(nbOfSplit)):
            print('\rFit #{} of {}\n'.format(i + 1, nbOfSplit), end='\r',
                  flush=True)

            # Balance the number of old women and old men or not
            if balance_flag:
                XX, YY = BalanceClasses(bestDataSet, classes)
            else:
                XX, YY = bestDataSet, classes

            # split dataset into train and test random subsets
            XXtrain, XXtest, yytrain, yytest = tts(XX, YY['Cluster'],
                                                   test_size=0.33,
                                                   stratify=YY['Cluster'])
            # Data z-score standardisation
            xxTrainSet, zzPrm = Standardize(XXtrain, yytrain,
                                            standardizationType, debug_flag)

            # fit and predict on training data
            optimClf = optimClf.fit(xxTrainSet.values, yytrain)
            yPred = optimClf.predict(xxTrainSet.values)

            # Compute accuracy of prediction on training set
            acc = float((yytrain == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))
            fitRes = pd.DataFrame(data=[acc], columns=['CV_DA_' + str(it + 1)])

            # test classifier and compute decoding accuracy on predictions
            xxTestSet = ApplyStandardization(XXtest, zzPrm)
            yypred = optimClf.predict(xxTestSet)
            acc = float((yytest == yypred).sum()) / yypred.shape[0]
            print('Test set accuracy: %.2f %%' % (acc * 100))
            DABestFeat.append(acc)  # stack test DA for further use

            # plot confusion matrix
            cm = confusion_matrix(yytest, yypred)
            fig_CM = plt.figure(dpi=300)
            plot_confusion_matrix(cm, clusterNames, title=savedPlotName,
                                  normalize=True, precision=2)
            plt.clf()
            plt.close(fig_CM)

        df = pd.DataFrame(data=DABestFeat, columns=['optim DA'])
        df = pd.concat([
            df,
            pd.DataFrame(data=[np.mean(DABestFeat)], columns=['optim avg DA'])
        ], axis=1)
        print('Classifier trained with best features (occ > %d) only' % tresh)
        print(df)
        excelResults = pd.concat([excelResults, df], axis=1)

    return excelResults
selection_res = pd.DataFrame.from_dict(sfs4.get_metric_dict()).T
# print(selection_res)
selection_res.to_csv(
    "/Users/shuojiawang/Documents/ppdmodel/result1907/selection_log_withistoryrf.csv",
    sep='\t')

selected_feature_idx = result4.k_feature_idx_
# print(type(selected_feature_idx))
selected_feature = list(selected_feature_idx)
feature_name = []
for i in selected_feature:
    feature_name.append(feature_names[i])
print(feature_name)

fig = plot_sfs(sfs4.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.xlabel("Feature number")
plt.ylabel("AUC")
plt.grid()
# plt.savefig("Users/bu/Desktop/feature_selection.png", dpi=600)
plt.show()
# plt.clf()

from sklearn.model_selection import learning_curve

# Create CV training and test scores for various training set sizes
train_sizes, train_scores, test_scores = learning_curve(
    ensemble.RandomForestClassifier(),
    X,
    y,
    # Number of folds in cross-validation
def feature_selection_mlextend(dataframe, target_feature_name, scoring_metric,
                               n_jobs, cross_val, range_of_features):
    """
    This function will take a dataframe as input and perform the following:
    a) Run sequential forward selection with a LightGBM classifier over the
       given range of subset sizes
    b) Return the dataframe reduced to the selected features plus the target

    :param dataframe: dataframe with features to select from
    :param target_feature_name: name of the target feature
    :param scoring_metric: f1_weighted, accuracy, roc_auc etc.
    :param cross_val: # of cross validation folds
    :param n_jobs: # of cpu jobs
    :param range_of_features: tuple of the range of features to select from (low, high)
    :return: dataframe with features reduced
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise ValueError("Object passed is not a dataframe")
    if not isinstance(range_of_features, tuple):
        raise ValueError("range_of_features passed is not a tuple")

    import lightgbm as lgb

    classifier = lgb.LGBMClassifier(n_jobs=n_jobs, class_weight="balanced",
                                    max_depth=6, random_state=2019)

    # keep pandas objects so the selector reports real column names
    x = dataframe.drop(target_feature_name, axis=1)
    y = dataframe[target_feature_name]

    # forward selection
    sequential_forward_feature_selection = sfs(classifier,
                                               k_features=range_of_features,
                                               forward=True,
                                               n_jobs=n_jobs,
                                               floating=False,
                                               verbose=False,
                                               scoring=scoring_metric,
                                               cv=cross_val)
    sfs_algo = sequential_forward_feature_selection.fit(x, y)

    sfs_cross_val_score = round(sfs_algo.k_score_, 2)
    selected_features = list(sfs_algo.k_feature_names_)

    print("Number of features selected is: {}".format(
        len(sfs_algo.k_feature_names_)))
    print("Cross Validation Score for {}, is {}".format(
        scoring_metric, sfs_cross_val_score))

    plot_sfs(sfs_algo.get_metric_dict(), kind='std_err', figsize=(11, 7))

    df = x[selected_features]
    # merge back the target variable to the dataframe (df)
    df = df.merge(y, left_index=True, right_index=True)

    cat_features = []
    for col_name in df.columns:
        if df[col_name].dtype != 'float64':
            if col_name != 'loan_status':
                cat_features.append(col_name)
    print(cat_features)

    return df
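# Hedged usage sketch (not from the original source): calls
# feature_selection_mlextend on a small synthetic dataframe. The dataset,
# column names and parameter values are illustrative assumptions; lightgbm
# and the function's own imports (pd, sfs, plot_sfs) must be available.
import pandas as pd
from sklearn.datasets import make_classification

X_syn, y_syn = make_classification(n_samples=300, n_features=10, random_state=0)
frame = pd.DataFrame(X_syn, columns=[f'feat_{i}' for i in range(X_syn.shape[1])])
frame['target'] = y_syn

reduced = feature_selection_mlextend(frame,
                                     target_feature_name='target',
                                     scoring_metric='roc_auc',
                                     n_jobs=1,
                                     cross_val=3,
                                     range_of_features=(2, 5))
print(reduced.columns.tolist())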
cols = x.columns.tolist()
lr = LinearRegression()

import matplotlib.pyplot as plt
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

sfs = SFS(lr,
          k_features=10,
          forward=True,
          floating=False,
          scoring='neg_mean_squared_error',
          cv=20)

# without autoregression
sfs = sfs.fit(x, y)
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')

# autoregression with a one-hour lag
# note: sfs, sfs_1 and sfs_2 are the same SFS instance refit on each dataset,
# so only the results of the most recent fit are kept
sfs_1 = sfs.fit(X_t_1, Y_tplus1)
fig = plot_sfs(sfs_1.get_metric_dict(), kind='std_err')

# autoregression with a two-hour lag
sfs_2 = sfs.fit(X_t_2, Y_tplus2)
fig = plot_sfs(sfs_2.get_metric_dict(), kind='std_err')

plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()

print(sfs.k_feature_names_)
print(sfs.k_score_)
print(sfs.subsets_)

import pandas as pd
mask = selector.support_
print(f"Best features according to RFE {X_m.columns[mask].values}")
X_m1 = X_m.iloc[:, mask]

# We could have used train test split or cross validation strategies
# for scoring the model, but in order to compare with the stats model
# we will use the whole data
model1 = LinearRegression().fit(X_m1, y_m)
print(f"R2 Score: {model1.score(X_m1, y_m)}")

"""### Forward Selection"""

model = LinearRegression(fit_intercept=False)
sfs1 = sfs(model, k_features=20, forward=True, scoring='r2', cv=5)
sfs1.fit(X_m, y_m)

fig = plot_sfs(sfs1.get_metric_dict())
plt.title('Forward Selection')
plt.grid()
plt.show()

print(sfs1.k_features, sfs1.k_feature_names_, sep="\n")

index = list(sfs1.k_feature_idx_)
X_m1 = X_m.iloc[:, index]
model1 = LinearRegression().fit(X_m1, y_m)
print(f"R2 Score: {model1.score(X_m1, y_m)}")

"""## Regularization
1. Lasso
2. Ridge
3. ElasticNet
sfs1 = SFS(estimator=classifier_,
           k_features=(5, 30),
           forward=True,
           floating=False,
           scoring='accuracy',
           cv=3)

pipe = make_pipeline(StandardScaler(), sfs1)
pipe.fit(X_train, y_train)

print('best combination (ACC: %.3f): %s\n' % (sfs1.k_score_, sfs1.k_feature_idx_))
print('all subsets:\n', sfs1.subsets_)

from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

plot_sfs(sfs1.get_metric_dict(), kind='std_err');

selected_features1 = list(sfs1.k_feature_names_)

# save the model to disk
model = LogisticRegression()
model.fit(X_train, Y_train)
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)

#%%
# Feature Importance
def change(self, x_train, y_train, percetage, mnb, change_plan):
    number_change_requested = int(percetage / 100 * x_train.shape[0])
    print("{} percent is equal to {} changes \n".format(
        percetage, number_change_requested))

    # find the most important features
    sfs = SFS(mnb,
              k_features=len(x_train[0]),
              forward=True,
              floating=False,
              verbose=2,
              scoring='accuracy',
              cv=5)
    pipe = make_pipeline(StandardScaler(), sfs)
    pipe.fit(x_train, y_train)

    # ------------- plotting ------------------
    fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
    plt.show()

    # walk the feature subsets in SFS order and only change those features
    x_train_changed = np.copy(x_train)
    used_row = {}
    all_changed = 1
    for i in range(len(change_plan["key"])):
        occurred_change = 0
        indices = [
            t for t, x in enumerate(y_train)
            if x == change_plan["key"][i][0]
        ]
        print("{} rows have target {} \n".format(len(indices),
                                                 change_plan["key"][i][0]))
        for L in range(1, len(sfs.subsets_) + 1):  # number of features
            subset = list(sfs.subsets_[L]['feature_idx'])
            if (occurred_change == change_plan["number"][i]):
                break
            print("change feature index {} ----".format(subset))
            for p in range(len(indices)):
                x_train_changed[indices[p]][subset] = 0
                if y_train[indices[p]] == mnb.predict(
                        [x_train[indices[p]]]) and indices[p] not in used_row:
                    if (change_plan["key"][i][1] == mnb.predict(
                            [x_train_changed[indices[p]]])[0]):
                        print("with change features index {} row number {} has been changed"
                              .format(subset, indices[p]))
                        print(x_train[indices[p]],
                              mnb.predict([x_train[indices[p]]])[0])
                        print(x_train_changed[indices[p]],
                              mnb.predict([x_train_changed[indices[p]]])[0])
                        print(" \n change number {} \n".format(all_changed))
                        used_row.update({indices[p]: indices[p]})
                        occurred_change = occurred_change + 1
                        all_changed = all_changed + 1
                        if (occurred_change == change_plan["number"][i]):
                            print("part of your request has been done :)")
                            break
                    else:
                        x_train_changed[indices[p]] = np.copy(
                            x_train[indices[p]])
                else:
                    x_train_changed[indices[p]] = np.copy(
                        x_train[indices[p]])

            # check for the rest of the possible changes
            # for LL in range(0, len(x_train_changed[0]) + 1):
            print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
            for subsets in Change_Combination.combinations_index(
                    self, x_train_changed[0], L):
                if (subset != subsets):
                    if not subsets:
                        pass
                    else:
                        if (occurred_change == change_plan["number"][i]):
                            # print("part of your request has been done :))))")
                            break
                        print("change feature index {} ----".format(subsets))
                        for pp in range(len(indices)):
                            x_train_changed[indices[pp]][subsets] = 0
                            if y_train[indices[pp]] == mnb.predict(
                                    [x_train[indices[pp]]]
                            ) and indices[pp] not in used_row:
                                if (change_plan["key"][i][1] == mnb.predict(
                                        [x_train_changed[indices[pp]]])[0]):
                                    print("with change features index {} row number {} has been changed"
                                          .format(subsets, indices[pp]))
                                    print(x_train[indices[pp]],
                                          mnb.predict([x_train[indices[pp]]])[0])
                                    print(x_train_changed[indices[pp]],
                                          mnb.predict([x_train_changed[indices[pp]]])[0])
                                    print(" \n change number {} \n".format(all_changed))
                                    used_row.update({indices[pp]: indices[pp]})
                                    occurred_change = occurred_change + 1
                                    all_changed = all_changed + 1
                                    if (occurred_change == change_plan["number"][i]):
                                        print("part of your request has been done :)")
                                        break
                                else:
                                    x_train_changed[indices[pp]] = np.copy(
                                        x_train[indices[pp]])
                            else:
                                x_train_changed[indices[pp]] = np.copy(
                                    x_train[indices[pp]])
                else:
                    print("subsets are equal {}----------------------------------------------"
                          .format(subsets))

    if (all_changed <= number_change_requested):
        print("your request wasn't completed! please change your plan")
    else:
        print("your request is done :)")
    return np.copy(x_train_changed)
y = imp_2.transform(y)
y = y.reshape(-1)

# Feature scaling
#-----------------------------------------------------------------------------------------------------------------------
sc = preprocessing.StandardScaler()
sc.fit(X)
X = sc.transform(X)
#-----------------------------------------------------------------------------------------------------------------------

#rbf_svr = SVR(kernel='rbf', C=1e3)
RF = RandomForestRegressor(n_estimators=10, criterion='mse', random_state=14)

sfs = SFS(
    RF,
    k_features=10,
    forward=True,
    floating=False,
    scoring='r2',  # {'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error', 'median_absolute_error', 'r2'}
    cv=10)  # n_jobs=-1 means all CPUs

sfs = sfs.fit(X, y)

fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')  # {'std_dev', 'std_err', 'ci', None}
plt.title('Sequential Forward Selection (monsoon)')
plt.grid()
plt.show()
def sequential_feature_selector(features, labels, classifier, k_features, kfold,
                                selection_type, plot=True, **kwargs):
    """Sequential feature selection to reduce the number of features.

    The function reduces a d-dimensional feature space to a k-dimensional
    feature space by sequential feature selection. The features are selected
    using ``mlxtend.feature_selection.SequentialFeatureSelection`` which
    essentially selects or removes a feature from the d-dimensional input space
    until the preferred size is reached.

    The function will pass ``ftype='feature'`` and forward ``features`` on to a
    classifier's ``static_opts`` method.

    Args:
        features: The original d-dimensional feature space
        labels: corresponding labels
        classifier (str or object): The classifier which should be used for
            feature selection. This can be either a string (name of a classifier
            known to gumpy) or an instance of a classifier which adheres
            to the sklearn classifier interface.
        k_features (int): Number of features to select
        kfold (int): k-fold cross validation
        selection_type (str): One of ``SFS`` (Sequential Forward Selection),
            ``SBS`` (Sequential Backward Selection),
            ``SFFS`` (Sequential Forward Floating Selection),
            ``SBFS`` (Sequential Backward Floating Selection)
        plot (bool): Plot the results of the dimensionality reduction
        **kwargs: Additional keyword arguments that will be passed to the
            Classifier instantiation

    Returns:
        A 5-element tuple containing

        - **feature index**: Index of features in the remaining set
        - **cv_scores**: cross validation scores during classification
        - **algorithm**: Algorithm that was used for search
        - **sfs**: the fitted SequentialFeatureSelector instance
        - **clf**: the backend classifier that was used

    """
    # retrieve the appropriate classifier
    if isinstance(classifier, str):
        if not (classifier in available_classifiers):
            raise ClassifierError("Unknown classifier {c}".format(c=classifier.__repr__()))

        kwopts = kwargs.pop('opts', dict())
        # opts = dict()

        # retrieve the options that we need to forward to the classifier
        # TODO: should we forward all arguments to sequential_feature_selector?
        opts = available_classifiers[classifier].static_opts('sequential_feature_selector',
                                                             features=features)
        opts.update(kwopts)

        # XXX: now merged into the static_opts invocation. TODO: test
        # if classifier == 'SVM':
        #     opts['cross_validation'] = kwopts.pop('cross_validation', False)
        # elif classifier == 'RandomForest':
        #     opts['cross_validation'] = kwopts.pop('cross_validation', False)
        # elif classifier == 'MLP':
        #     # TODO: check if the dimensions are correct here
        #     opts['hidden_layer_sizes'] = (features.shape[1], features.shape[2])
        # get all additional entries for the options
        # opts.update(kwopts)

        # retrieve a classifier object
        classifier_obj = available_classifiers[classifier](**opts)

        # extract the backend classifier
        clf = classifier_obj.clf
    else:
        # if we received a classifier object we'll just use this one
        clf = classifier.clf

    if selection_type == 'SFS':
        algorithm = "Sequential Forward Selection (SFS)"
        sfs = SFS(clf, k_features, forward=True, floating=False,
                  verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SBS':
        algorithm = "Sequential Backward Selection (SBS)"
        sfs = SFS(clf, k_features, forward=False, floating=False,
                  verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SFFS':
        algorithm = "Sequential Forward Floating Selection (SFFS)"
        sfs = SFS(clf, k_features, forward=True, floating=True,
                  verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SBFS':
        algorithm = "Sequential Backward Floating Selection (SBFS)"
        sfs = SFS(clf, k_features, forward=False, floating=True,
                  verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    else:
        raise Exception("Unknown selection type '{}'".format(selection_type))

    pipe = make_pipeline(StandardScaler(), sfs)
    pipe.fit(features, labels)
    subsets = sfs.subsets_
    feature_idx = sfs.k_feature_idx_
    cv_scores = sfs.k_score_

    if plot:
        fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
        plt.ylim([0.5, 1])
        plt.title(algorithm)
        plt.grid()
        plt.show()

    return feature_idx, cv_scores, algorithm, sfs, clf
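# Hedged usage sketch (not from the original source): the function accepts any
# object exposing a `.clf` attribute, so a tiny wrapper around an sklearn
# classifier is enough for illustration. Assumes the module-level imports used
# above (SFS, make_pipeline, StandardScaler, plot_sfs, plt) are in scope.
from types import SimpleNamespace
from sklearn.datasets import load_iris
from sklearn.svm import SVC

X_demo, y_demo = load_iris(return_X_y=True)
wrapper = SimpleNamespace(clf=SVC(kernel='linear'))
feature_idx, cv_scores, algorithm, sfs_fitted, clf = sequential_feature_selector(
    X_demo, y_demo, wrapper, k_features=2, kfold=5,
    selection_type='SFS', plot=False)
print(algorithm, feature_idx, cv_scores)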
x_scaled_np = StandardScaler().fit_transform(x_data)
x_scaled_np = PolynomialFeatures(degree=2).fit_transform(x_scaled_np)
print(y)
print(x_scaled_np)

cv = RepeatedKFold(n_splits=5, n_repeats=20)
bins = np.linspace(y.min(), y.max(), 5)
labels = ["1", "2", "3", "4"]
Y_groups = pd.cut(y, bins)

sfs = SFS(regr,
          floating=True,
          verbose=2,
          k_features=2,
          forward=False,
          n_jobs=2,
          scoring='neg_mean_absolute_error',
          cv=cv)
sfs.fit(x_scaled_np, y)

print("Optimal number of features : %d" % sfs.k_features)
print('Best features :', sfs.k_feature_names_)
print('Best score :', sfs.k_score_)
print(sfs.get_params())
print(sfs)

fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev', figsize=(6, 4))
plt.show()
        dic[i] = rfe.score()
    plt.xlabel('feature_num')
    plt.ylabel('score')
    plt.plot(dic.keys(), dic.values())
    plt.show()
    return dic


if __name__ == "__main__":
    train_data = load_data(train_url)
    train_y = train_data['price']
    train_data.drop(['SaleID'], axis=1, inplace=True)
    train_data.drop(['price'], axis=1, inplace=True)
    col_name = [
        'name', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
        'kilometer', 'notRepairedDamage', 'regionCode', 'v_0', 'v_3', 'v_12',
        'usedTime'
    ]
    sfs = SFS(LinearRegression(),
              k_features=13,
              forward=True,
              floating=False,
              scoring='r2',
              cv=0)
    train_data = train_data.fillna(0)
    sfs.fit(train_data, train_y)
    print(sfs.k_feature_names_)
    print(pd.DataFrame.from_dict(sfs.get_metric_dict()).T)
    fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
    plt.grid()
    plt.show()
selector.k_feature_idx_
selector.k_feature_names_
selector.k_score_

pd.DataFrame.from_dict(selector.get_metric_dict()).T

# Backward Selection
select_back = SequentialFeatureSelector(knn_pipe,
                                        k_features=3,
                                        forward=False,
                                        floating=False,
                                        verbose=2,
                                        scoring='accuracy',
                                        cv=5,
                                        n_jobs=1)
select_back.fit(X=X, y=y)

# Plot results of Feature Selection (using `mlxtend`)
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

fig1 = plot_sfs(selector.get_metric_dict(), kind='std_dev')
plt.ylim([0.8, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show();
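# Hedged follow-up sketch (not from the original source): reduce the data to
# the selected columns using the fitted selector. Assumes X is a pandas
# DataFrame and `selector` was fitted on it, as above.
selected_cols = list(selector.k_feature_names_)
X_reduced = X[selected_cols]            # select by column name
# X_reduced = selector.transform(X)     # equivalent; returns a numpy array
print(X_reduced.shape, selected_cols)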