def forward_selection_regression(data, target, k_features=3):
    """Fit a linear regression on forward-selected features and print a report.

    The data is split 70/30 into a train and a test part. A sequential
    forward selector (5-fold CV, scored by r2) picks ``k_features`` columns
    using the training part only, and a linear regression is then fitted on
    the selected columns.

    :param data: pandas dataframe of input data
    :param target: pandas dataframe of input data's corresponding target
    :param k_features: number of features the selector should keep
    :return: None; coefficients, intercept and train/test mean squared error
        are printed
    """
    regressor = LinearRegression()
    selector = SFS(regressor, k_features, forward=True, floating=False,
                   verbose=0, scoring='r2', cv=5)

    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=.3)

    # The subset is learned on the training part and applied to both parts.
    selector = selector.fit(X_train, y_train)
    train_subset = selector.transform(X_train)
    test_subset = selector.transform(X_test)

    regressor = regressor.fit(train_subset, y_train)

    print('estimated coefficients for the linear regression:', regressor.coef_)
    print('interception coefficient b_0:', regressor.intercept_)

    train_mse = metrics.mean_squared_error(
        y_train, regressor.predict(train_subset))
    print('MSE_train:', train_mse)

    test_mse = metrics.mean_squared_error(
        y_test, regressor.predict(test_subset))
    print('MSE_test:', test_mse)
def vote(X_train, Y_train, X_test, Y_test, voting_type, feature_selection, k_features):
    """Run a soft voting / majority rule classification.

    Wrapper around `sklearn.ensemble.VotingClassifier` that builds the
    ensemble from every classifier registered in `available_classifiers`.

    Args:
        X_train: training data (values)
        Y_train: training data (labels)
        X_test: evaluation data (values)
        Y_test: evaluation data (labels)
        voting_type (str): either of 'soft' or 'hard'. See the
            sklearn.ensemble.VotingClassifier documentation for more details
        feature_selection: when truthy, a floating sequential forward
            selection (10-fold CV, scored by accuracy) is run on the
            training data and both data sets are reduced to the selected
            features before the ensemble is fitted
        k_features: number of features handed to the selector; only used
            when ``feature_selection`` is truthy

    Returns:
        2-element tuple containing

        - **ClassificationResult**: The result of the classification.
        - **Classifier**: The instance of `sklearn.ensemble.VotingClassifier`
          that was used during the classification.
    """
    n_folds = 10
    parallel_jobs = -1

    estimators = []
    for name in available_classifiers:
        classifier_cls = available_classifiers[name]
        # The 'vote' options initialize each classifier with proper default
        # settings; this avoids cross-validation, for instance.
        init_opts = classifier_cls.static_opts('vote', X_train=X_train)
        estimators.append((name, classifier_cls(**init_opts).clf))

    ensemble = VotingClassifier(estimators=estimators, voting=voting_type)

    if feature_selection:
        selector = SFS(ensemble,
                       k_features,
                       forward=True,
                       floating=True,
                       verbose=2,
                       scoring='accuracy',
                       cv=n_folds,
                       n_jobs=parallel_jobs)
        selector = selector.fit(X_train, Y_train)
        X_train = selector.transform(X_train)
        X_test = selector.transform(X_test)

    ensemble.fit(X_train, Y_train)
    predictions = ensemble.predict(X_test)
    return ClassificationResult(Y_test, predictions), ensemble
def step_feature_selection(keras_est, x_train, y_train, x_test, y_test,
                           features_lower_bound, features_upper_bound, *,
                           scoring='accuracy', cv=0, n_jobs=-1):
    """Reduce train and test data to the features chosen by forward selection.

    A sequential forward selector is fitted on the training data with
    ``k_features`` set to the range
    ``(features_lower_bound, features_upper_bound)``.

    Side effects: the module-level ``feature_names`` list is replaced by the
    names of the selected features, those names are written to
    ``RUNDIR + "feature_names_SFS_<average_runs>.csv"``, and
    ``k.clear_session()`` is called.

    ``y_test`` is accepted but not used.

    Returns:
        Tuple ``(x_train_sfs, x_test_sfs)`` holding only the selected
        features of ``x_train`` and ``x_test``.
    """
    global feature_names

    selector = SFS(keras_est,
                   k_features=(features_lower_bound, features_upper_bound),
                   forward=True,
                   floating=False,
                   verbose=2,
                   scoring=scoring,
                   cv=cv,
                   n_jobs=n_jobs).fit(x_train, y_train)

    # Keep only the chosen columns in both data sets.
    reduced_train = selector.transform(x_train)
    reduced_test = selector.transform(x_test)

    # Map the selected column indices back to names and persist them.
    feature_names = [feature_names[idx] for idx in selector.k_feature_idx_]
    names_frame = pd.DataFrame(feature_names)
    names_frame.to_csv(RUNDIR + "feature_names_SFS_{}.csv".format(average_runs),
                       index=False)

    k.clear_session()
    return reduced_train, reduced_test
def select_r2(df_in, ss_label, f_n, eps):
    """Select ``f_n`` columns of ``df_in`` by sequential forward selection.

    The selector wraps a random forest classifier with ``eps`` trees and is
    scored by 4-fold cross-validated accuracy. When ``df_in`` has ``f_n``
    columns or fewer, no selection is run and all columns are returned.

    Args:
        df_in: DataFrame of candidate features (copied, never modified).
        ss_label: labels; ``ss_label.values`` is passed to the selector.
        f_n: number of features to keep.
        eps: ``n_estimators`` of the random forest.

    Returns:
        Tuple ``(values, names, count)``: the retained feature values, the
        retained column names joined with commas, and how many were retained.
    """
    frame = df_in.copy()

    # Nothing to select when the frame is already small enough.
    if len(frame.columns) <= f_n:
        all_names = frame.columns.tolist()
        return frame.values, ','.join(all_names), len(all_names)

    forest = RandomForestClassifier(n_estimators=eps, random_state=1)
    selector = SFS(forest,
                   k_features=f_n,
                   forward=True,
                   floating=False,
                   scoring='accuracy',
                   cv=4,
                   n_jobs=3)
    selector.fit(frame.values, ss_label.values)

    chosen_idx = selector.k_feature_idx_
    reduced = selector.transform(frame.values)
    chosen_names = frame.columns[list(chosen_idx)]
    return reduced, ','.join(chosen_names), len(chosen_names)
def test_check_pandas_dataframe_transform():
    """Selector fitted on a DataFrame picks columns (1, 3) and transforms it.

    Uses the iris data wrapped in a DataFrame with named columns, and checks
    both the selected indices and the shape of the transformed output.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    column_names = ['sepal length', 'sepal width',
                    'petal length', 'petal width']
    frame = pd.DataFrame(features, columns=column_names)

    selector = SFS(SoftmaxRegression(random_seed=1),
                   k_features=2,
                   forward=True,
                   floating=False,
                   scoring='accuracy',
                   cv=0,
                   verbose=0,
                   n_jobs=1)
    selector = selector.fit(frame, labels)

    assert selector.k_feature_idx_ == (1, 3)
    assert (150, 2) == selector.transform(frame).shape
def fs_rfe(df_in, ss_label, f_n, tp=0):
    """Select ``f_n`` columns of ``df_in`` with a linear-model forward search.

    The sequential forward selector is scored by 4-fold cross-validated
    negative mean squared error. ``tp`` chooses the wrapped estimator:
    ``0`` is Ridge, ``1`` is Lasso (both with ``alpha=0.1``); any other value
    falls back to Ridge. When ``df_in`` has ``f_n`` columns or fewer, no
    selection is run.

    Args:
        df_in: DataFrame of candidate features (copied, never modified).
        ss_label: targets; ``ss_label.values`` is passed to the selector.
        f_n: number of features to keep.
        tp: estimator key, see above.

    Returns:
        Tuple ``(values, names)``. ``names`` is a pandas Index of the
        selected columns when selection ran, and a plain list of all column
        names otherwise.
    """
    frame = df_in.copy()

    # Nothing to select when the frame is already small enough.
    if len(frame.columns) <= f_n:
        return frame.values, frame.columns.tolist()

    ridge = Ridge(alpha=0.1)
    lasso = Lasso(alpha=0.1)
    estimators = {0: ridge, 1: lasso}

    selector = SFS(estimators.get(tp, ridge),
                   k_features=f_n,
                   forward=True,
                   floating=False,
                   verbose=1,
                   scoring='neg_mean_squared_error',
                   cv=4,
                   n_jobs=3)
    selector.fit(frame.values, ss_label.values)

    chosen_idx = selector.k_feature_idx_
    print(chosen_idx)
    reduced = selector.transform(frame.values)
    chosen_names = frame.columns[list(chosen_idx)]
    return reduced, chosen_names
def step_feature_selection(keras_est, x_train, y_train, x_test, y_test,
                           features_lower_bound, features_upper_bound, *,
                           scoring='accuracy', cv=0, n_jobs=-1):
    """Run forward feature selection and return the reduced data sets.

    The selector is fitted on ``x_train``/``y_train`` and may keep any number
    of features between ``features_lower_bound`` and
    ``features_upper_bound``. No model evaluation happens here; ``y_test`` is
    accepted but not used.

    Side effects: the module-level ``feature_names`` list is overwritten with
    the names of the selected features, which are also saved to
    ``DATADIR + "feature_names_SFS.csv"``; ``k.clear_session()`` is called
    before returning.

    Returns:
        Tuple ``(x_train_sfs, x_test_sfs)`` containing only the selected
        features.
    """
    global feature_names

    feature_range = (features_lower_bound, features_upper_bound)
    selector = SFS(keras_est, k_features=feature_range, forward=True,
                   floating=False, verbose=2, scoring=scoring, cv=cv,
                   n_jobs=n_jobs)
    selector = selector.fit(x_train, y_train)

    # Project both data sets onto the chosen features.
    x_train_selected = selector.transform(x_train)
    x_test_selected = selector.transform(x_test)

    # Translate selected indices into names, update the global, save to disk.
    kept_names = []
    for column_index in selector.k_feature_idx_:
        kept_names.append(feature_names[column_index])
    feature_names = kept_names
    pd.DataFrame(feature_names).to_csv(DATADIR+"feature_names_SFS.csv",
                                       index=False)

    k.clear_session()
    return x_train_selected, x_test_selected
random_state=10, stratify=df3["descCanalRadicacion"]) knn = KNeighborsClassifier(n_neighbors=50, weights='distance') sfs1 = SFS(knn, k_features=11, forward=True, floating=False, verbose=1, scoring=make_scorer(f1_score, average='weighted'), cv=5) sfs1 = sfs1.fit(X_train, y_train) X_train_sfs = sfs1.transform(X_train) X_test_sfs = sfs1.transform(X_test) clfKnn_sfs = knn.fit(X_train_sfs, y_train) #Predictions predictions = clfKnn_sfs.predict(X_test_sfs) #Score print(f1_score(y_test, predictions.astype('int64'), average='weighted')) #Caracteristicas seleccionadas print(sfs1.k_feature_names_) with open('./outputs/model.pkl', 'wb') as model_pkl: pickle.dump(clfKnn_sfs, model_pkl)
cv=5) sfs = sfs1.fit(X3_train_scaled, y_train) fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev') plt.ylim([0, 0.3]) plt.title('Sequential Forward Selection (w. StdDev)') plt.ylabel("Cross validation f1-score") plt.grid() plt.savefig("/Users/quan/Documents/Sinto_Project/report/selection.png") plt.show() print('Selected features:', sfs1.k_feature_idx_) print(sfs.k_score_) X_train_sfs = sfs1.transform(X3_train_scaled) X_test_sfs = sfs1.transform(X3_test_scaled) # Fit the estimator using the new feature subset # and make a prediction on the test data svc.fit(X_train_sfs, y_train) y_pred = svc.predict(X_test_sfs) y_probs=svc.predict_proba(X_test_sfs) # Compute the accuracy of the prediction recall = float(sum(y_test[np.where(y_test==1)] == y_pred[np.where(y_test==1)])) / len(np.where(y_test==1)[0]) print('Test set recall: %.2f %%' % (recall * 100)) precision = float(sum(y_test[np.where(y_pred==1)] == y_pred[np.where(y_pred==1)])) / len(np.where(y_pred==1)[0]) print('Test set precision: %.2f %%' % (precision * 100))
gamma=gamma_curr, C=c_curr, class_weight=curr_class_weight, decision_function_shape=dfs) sffs = SFS(classifier, k_features=20, forward=True, floating=floatingEnabled, scoring=scoreMethod, print_progress=False, cv=cv_folds) #Select the features sffs = sffs.fit(data_train, target_train) data_sffs_train = sffs.transform(data_train) data_sffs_val = sffs.transform(data_val) #Fit the classifier to the training data classifier.fit(data_sffs_train, target_train) #Print the features selected if printSelectedFeats: print('Selected features: ', end="") output.write('Selected features: ') for feat in sffs.k_feature_idx_: print(currentHeader[feat], end=",") output.write(currentHeader[feat] + ",") print() output.write("\n")
# Logistic regression whose regularization strength is chosen from Cs.
lr = LogisticRegressionCV(Cs=Cs, fit_intercept=True)

# Feature selection strategy requested on the command line: floating
# sequential forward selection, or an exhaustive search over subsets of at
# most 8 features.
if args.fs == 'sfs':
    fs = SFS(lr, k_features=k_features, forward=True, floating=True,
             scoring='neg_log_loss', cv=cv, verbose=2)
else:
    fs = EFS(lr, min_features=1, max_features=min(train.shape[1], 8),
             scoring='neg_log_loss', cv=cv)
fs.fit(train.values, train_y.values)

# All output below uses single-argument print(...) calls so that it renders
# identically whether or not print is a function in the running interpreter.
print('')
print(pd.DataFrame.from_dict(fs.get_metric_dict()).T)
if args.fs == 'sfs':
    print('SFS best score: {}'.format(fs.k_score_))
    print('{} features: {}'.format(len(fs.k_feature_idx_), fs.k_feature_idx_))
else:
    print('EFS best score: {}'.format(fs.best_score_))
    print('{} features: {}'.format(len(fs.best_idx_), fs.best_idx_))

# Rows [:split] are the training part and rows [split:] the validation part.
# The selector is not refitted below, so the reduced validation features are
# computed once and reused for every report.
val_features = fs.transform(train.iloc[split:].values)
val_labels = train_y.iloc[split:]

# 1) Fit on the training part, evaluate on the validation part.
lr.fit(fs.transform(train.iloc[:split].values), train_y.iloc[:split])
print('')
print('Regularization C: {}'.format(lr.C_))
print('validation error fitting on train: {}'.format(
    log_loss(val_labels, lr.predict_proba(val_features)[:, 1])))

# 2) Fit on the validation part itself (in-sample error on validation).
lr.fit(val_features, val_labels)
print('validation error fitting on val: {}'.format(
    log_loss(val_labels, lr.predict_proba(val_features)[:, 1])))
if args.out:
    test1['probability'] = lr.predict_proba(
        fs.transform(test[train.columns].values))[:, 1]
    test1.to_csv('~/val.csv', index=True, columns=['probability'],
                 float_format='%.6f')

# 3) Fit on training + validation, report the error on the validation part.
lr.fit(fs.transform(train.values), train_y)
print('validation error fitting on trainval: {}'.format(
    log_loss(val_labels, lr.predict_proba(val_features)[:, 1])))
if args.out:
    test1['probability'] = lr.predict_proba(
        fs.transform(test[train.columns].values))[:, 1]
    test1.to_csv('~/trainval.csv', index=True, columns=['probability'],
                 float_format='%.6f')
# Fit the scaler on the training data only and reuse the same statistics for
# the test data. Refitting on the test set would scale the two sets
# differently and leak test information into preprocessing.
train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

mlp = MLPRegressor(max_iter=200,
                   solver='lbfgs',
                   hidden_layer_sizes=(50, 50),
                   activation='identity')

# Forward selection of 3 features, scored by 10-fold cross-validated r2.
sfs = SFS(mlp,
          k_features=3,
          forward=True,
          floating=False,
          scoring='r2',
          cv=10)
sfs = sfs.fit(train_data_scaled, train_target)
print(sfs.k_feature_idx_)

train_sfs = sfs.transform(train_data_scaled)
test_sfs = sfs.transform(test_data_scaled)

# Refit the network 100 times on the selected features and accumulate its
# train/test scores. The totals are sums, not averages.
acc_train = 0
acc_test = 0
for _ in range(100):
    mlp.fit(train_sfs, train_target)
    acc_train = acc_train + mlp.score(train_sfs, train_target)
    acc_test = acc_test + mlp.score(test_sfs, test_target)
# NOTE(review): the totals are printed once after the loop; confirm that this
# matches the intended placement.
print(acc_test)
print(acc_train)

datanow = np.zeros(11)
test_data = []
def ExecuteSFFS(x, y, featureNames, featureList, clusters, clusterNames, svc,
                kFolds, nbOfSplit, featMaxNbrSFFS, standardizationType,
                removedData, permutation_flag, nbPermutation, balance_flag,
                currentDateTime, resultDir, debug_flag, verbose):
    """Repeated train/test evaluation of a classifier with forward selection.

    For each of ``nbOfSplit`` random stratified splits (33 % test) the
    function standardizes the training data, tunes ``svc`` with a randomized
    search, optionally runs sequential forward selection, tests on the
    held-out part and saves diagnostic plots under ``resultDir``. Results of
    all splits are gathered in one DataFrame.

    Args:
        x: feature table; indexed as ``x[featureList]`` when a feature list
            is given (presumably a DataFrame - confirm against caller).
        y: labels; must provide a ``'Cluster'`` column.
        featureNames: names of the features, indexable by the column indices
            returned by the selector.
        featureList: controls the mode. Empty: use all features and run
            feature selection. First element is a ``float``: same, plus a
            final refit using only features selected in more than
            ``featureList[0] * nbOfSplit`` splits. Otherwise: restrict ``x``
            to these columns and skip feature selection.
        clusters: string inserted in the names of the saved figures.
        clusterNames: class names; used for the confusion matrices and for
            the chance-level computation.
        svc: estimator handed to ``RandomizedSearchCV``.
        kFolds: ``cv`` argument of the feature selector.
        nbOfSplit: number of random splits.
        featMaxNbrSFFS: largest feature subset size the selector may return.
        standardizationType: forwarded to ``Standardize`` and ``Permute``.
        removedData: concatenated as-is into the returned results.
        permutation_flag: when truthy, run permutation tests on every split;
            otherwise a binomial chance level is computed instead.
        nbPermutation: number of permutations forwarded to ``Permute``.
        balance_flag: when truthy, ``BalanceClasses`` is applied before
            splitting.
        currentDateTime: forwarded to ``PlotPermHist``.
        resultDir: prefix of every output path; it is concatenated directly
            with the file name, so it must end with a path separator.
        debug_flag: forwarded to ``Standardize``.
        verbose: accepted but not used.

    Returns:
        DataFrame with the per-split metrics, test accuracies, permutation
        or chance-level results, feature-selection summary (unless selection
        was skipped), ``removedData`` and the number of subjects,
        concatenated column-wise.
    """
    # Importing the submodule guarantees ``scipy.stats`` is loaded.
    import scipy.stats
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split as tts
    from sklearn.metrics import confusion_matrix
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
    from sklearn.model_selection import RandomizedSearchCV
    from slpClass_toolbox import BalanceClasses
    from slpClass_toolbox import Standardize
    from slpClass_toolbox import Permute
    from slpClass_toolbox import ComputePermutationAvgDA
    from slpClass_toolbox import PlotPermHist
    from slpClass_toolbox import ApplyStandardization
    from slpClass_toolbox import plot_confusion_matrix

    plt.rcParams.update({'figure.max_open_warning': 0})

    # Get features values since SFFS works only with numpy array!
    bestFeaturesHist = np.zeros([len(featureNames)])
    CvResult = pd.DataFrame()
    permResults = pd.DataFrame()
    tmpBest = []
    DA = []
    avg_perm_DA = []
    skipFS = False  # flag to skip feature selection
    fitFeatOverTresh = False  # fit classifier with most frequent features in best set

    #********************** TRAIN pre-processing ******************************
    for it in range(nbOfSplit):
        print('\nSplit #{}'.format(str(it)))

        # Use all features or given ones only
        if len(featureList) == 0:
            xx = x
        elif isinstance(featureList[0], float):
            xx = x
            fitFeatOverTresh = True
        else:
            xx = x[featureList]
            skipFS = True

        # Optionally balance the classes before splitting
        if balance_flag:
            X, Y = BalanceClasses(xx, y)
        else:
            X, Y = xx, y

        # split dataset into train and test random subsets
        X_train, X_test, y_train, y_test = tts(X,
                                               Y['Cluster'],
                                               test_size=0.33,
                                               stratify=Y['Cluster'])

        # Data standardisation; zPrm is reused later on the test set
        xTrainSet, zPrm = Standardize(X_train, y_train, standardizationType,
                                      debug_flag)

        #**************************** SVM optimisation ************************
        params_dict = {
            'C': scipy.stats.expon(scale=100),
            'kernel': ['linear'],
            'class_weight': ['balanced', None]
        }
        n_iter_search = 20
        random_search = RandomizedSearchCV(svc,
                                           param_distributions=params_dict,
                                           n_iter=n_iter_search)
        random_search.fit(xTrainSet, y_train)
        optimClf = random_search.best_estimator_

        #*************************** TRAIN ************************************
        print('Fitting...')
        # ``.values`` replaces ``DataFrame.as_matrix()``, which no longer
        # exists in current pandas releases.
        if skipFS:
            optimClf = optimClf.fit(xTrainSet.values, y_train)
            yPred = optimClf.predict(xTrainSet.values)

            # Accuracy of the prediction on the training set
            acc = float((y_train == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))
            fitRes = pd.DataFrame(data=[acc],
                                  columns=['CV_DA_' + str(it + 1)])
        else:
            # set k_features = (1,X.shape[1]) to test all possible combinations
            sffs = SFS(optimClf,
                       k_features=(1, featMaxNbrSFFS),
                       forward=True,
                       floating=False,
                       scoring='accuracy',
                       cv=kFolds,
                       n_jobs=-1)
            sffs = sffs.fit(xTrainSet.values, y_train)

            print('Best combination for fit #%d (ACC: %.3f): %s' %
                  (it, sffs.k_score_, sffs.k_feature_idx_))

            # Fit the estimator using the new feature subset
            X_train_sfs = sffs.transform(xTrainSet.values)
            optimClf.fit(X_train_sfs, y_train)

            fitRes = pd.DataFrame.from_dict(sffs.get_metric_dict()).T
            fitRes['avg_over_std'] = fitRes['avg_score'] / fitRes['std_dev']

            # NOTE(review): everything down to the "mean / std" figure is
            # assumed to belong to this branch, including the bookkeeping of
            # the best subset - confirm the intended nesting.
            if featMaxNbrSFFS > 1:
                # plot feature selection process metrics
                fig1 = plot_sfs(sffs.get_metric_dict(), kind='std_err')
                savedPlotName = resultDir+'Decoding_accuracy_'+clusters+'_'+\
                                str(it)+'_'+str(nbOfSplit)+'.png'

                # remember this split's best subset and count its features
                tmpBest.append(sffs.k_feature_idx_)
                bestFeaturesHist[list(tmpBest[-1])] += 1

                fig1.set_dpi(300)
                plt.tight_layout()
                plt.savefig(savedPlotName, bbox_inches='tight')
                plt.clf()
                plt.close(fig1)

                # plot mean / std
                plt.figure(dpi=300)
                plt.title('Moyenne sur ecart-type')
                plt.xlabel("nb attributs dans combinaison")
                plt.xticks(range(featMaxNbrSFFS))
                plt.ylabel("Moyenne sur ecart-type")
                plt.plot(list(range(1, featMaxNbrSFFS + 1)),
                         fitRes['avg_over_std'])
                figName = resultDir+'SFFS_'+clusters+'_bestSet_metric_'+ \
                          str(it)+'_'+str(nbOfSplit)
                plt.savefig(figName, bbox_inches='tight')
                plt.clf()
                plt.close()

            # add metrics iteration identifier
            # NOTE(review): applied in the feature-selection branch only; the
            # other branch already embeds the split number in its column name.
            fitRes = fitRes.add_suffix('_' + str(it + 1))

        CvResult = pd.concat([CvResult, fitRes], axis=1)

        #***************************** TEST ***********************************
        print('Testing...')
        # standardize test set using trainset standardization parameters
        xTestSet = ApplyStandardization(X_test, zPrm)

        # prepare test data
        if skipFS:
            xTest = xTestSet
            savedPlotName = resultDir+clusters+'_ConfusionMatrix_'+str(it+1)+ \
                            '_'+str(nbOfSplit)
        else:
            # Generate a new subset of data according to selected features
            xTest = sffs.transform(xTestSet.values)
            savedPlotName = resultDir+'SFFS_'+clusters+'_ConfusionMatrix_'+ \
                            str(it+1)+'_'+str(nbOfSplit)

        # actually test classifier and compute decoding accuracy on predictions
        y_pred = optimClf.predict(xTest)
        acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
        print('Test set accuracy: %.2f %%' % (acc * 100))
        DA.append(acc)  # stack test DA for further use

        # plot confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        fig_CM = plt.figure(dpi=300)
        plot_confusion_matrix(cm,
                              clusterNames,
                              title=savedPlotName,
                              normalize=True,
                              precision=2)
        plt.clf()
        plt.close(fig_CM)

        #**************** STATISTICAL ASSESSMENT (PERMUTATION) ****************
        if permutation_flag:
            permResults['permutation_DA_' + str(it)] = Permute(
                clusters,
                xTrainSet,
                xTestSet,
                y_train,
                y_test,
                nbPermutation,
                standardizationType,
                debug_flag=0)
            avg_perm_DA.append(
                np.mean(permResults['permutation_DA_' + str(it)]))

    dfDA = pd.DataFrame(data=DA, columns=['DA_test'])
    CvResult = pd.concat([
        CvResult, dfDA[:],
        pd.DataFrame(data=[np.mean(DA)], columns=['avg_DA'])
    ], axis=1)

    #***************** COMPUTE STATISTICAL ASSESSMENT RESULTS *****************
    if permutation_flag:
        # compute permutation DA average and keep results in a dataframe
        print('\nAverage permutation DA')
        for splitAvg in avg_perm_DA:
            print('\t' + str(splitAvg))

        savedHistName = resultDir + 'Average_Permutation_hist_' + clusters + '.png'
        PlotPermHist(permResults, CvResult['avg_DA'].iloc[0],
                     currentDateTime, savedHistName)

        # formatting permutation results to save in excel file
        permResults = pd.concat(
            [permResults, ComputePermutationAvgDA(avg_perm_DA)], axis=1)
        print('Mean permutation decoding accuracy : {}'.format(
            np.mean(permResults['Avg_Permutation_DA_per_epoch'])))
    else:
        # binomial law
        from scipy.stats import binom
        q = 0.001  # p value
        n = X.shape[0] + 1  # number of observations (subjects)
        p = 1 / len(clusterNames)  # probability of a correct trial by chance
        luckLvl = pd.DataFrame(data=[binom.isf(q, n, p) / n],
                               columns=['Chance_Level'])

    #****************************** Compute results ***************************
    if not skipFS:
        # Build structure of histogram data to save in excel
        hist = pd.DataFrame(data=featureNames, columns=['Features_Name'])
        hist['Occurence_Best'] = bestFeaturesHist

        # Best subset = the one of the split with the highest test accuracy.
        # NOTE(review): this assumes tmpBest holds one entry per split.
        best_Combination = tmpBest[np.argmax(DA)]

        # Compute average size of best combination
        totalSize = sum(len(bestSet) for bestSet in tmpBest)
        avgBestCombSize = pd.DataFrame(
            data=[np.ceil(totalSize / len(tmpBest))],
            columns=['avgBestCombSize'])

        # Get best set's feature names (first entry is the best accuracy)
        tmp = [np.max(DA)]
        for featIdx in best_Combination:
            tmp.append(featureNames[featIdx])
            print('\t' + featureNames[featIdx])
        bestFeatNames = pd.DataFrame(data=tmp, columns=['Best_Features_Set'])

        sffsRes = pd.concat([hist, bestFeatNames, avgBestCombSize], axis=1)

        # Plot best combination custom metric (mean / std_dev)
        from slpClass_toolbox import PlotBestCombinationMetrics
        filteredData = CvResult.filter(regex=r'avg_over_std_', axis=1)
        metrics = pd.DataFrame(data=filteredData)
        metrics.dropna(inplace=True)
        figName = resultDir + 'SFFS_' + clusters + '_bestSet_metric_aggreg.png'
        PlotBestCombinationMetrics(metrics, figName)

    # save training and permutation results in an excel file
    nbSubject = pd.DataFrame(data=[len(X)], columns=['Number_Of_Subjects'])

    #************************ Build results structure *************************
    excelResults = pd.concat([
        CvResult,
        permResults if permutation_flag else luckLvl,
        sffsRes if not skipFS else None,
        removedData,
        nbSubject
    ], axis=1)
    print('Mean Decoding accuracy :{}'.format(np.mean(DA)))

    if fitFeatOverTresh:
        # Keep only features selected in more than `tresh` splits
        tresh = featureList[0] * nbOfSplit
        bestFeatColumns = hist.iloc[:, 0][hist.iloc[:, 1] > tresh]
        bestDataSet = xx[bestFeatColumns]
        classes = y
        DABestFeat = []

        print('Fitting with features occuring over %d times in best sets' %
              tresh)
        for i in range(nbOfSplit):
            print('\rFit #{} of {}\n'.format(i + 1, nbOfSplit),
                  end='\r', flush=True)

            # Optionally balance the classes before splitting
            if balance_flag:
                XX, YY = BalanceClasses(bestDataSet, classes)
            else:
                XX, YY = bestDataSet, classes

            # split dataset into train and test random subsets
            XXtrain, XXtest, yytrain, yytest = tts(XX,
                                                   YY['Cluster'],
                                                   test_size=0.33,
                                                   stratify=YY['Cluster'])

            # Data standardisation
            xxTrainSet, zzPrm = Standardize(XXtrain, yytrain,
                                            standardizationType, debug_flag)

            # fit and predict on training data
            optimClf = optimClf.fit(xxTrainSet.values, yytrain)
            yPred = optimClf.predict(xxTrainSet.values)

            # Compute accuracy of prediction on training set
            acc = float((yytrain == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))

            # test classifier and compute decoding accuracy on predictions
            xxTestSet = ApplyStandardization(XXtest, zzPrm)
            yypred = optimClf.predict(xxTestSet)
            acc = float((yytest == yypred).sum()) / yypred.shape[0]
            print('Test set accuracy: %.2f %%' % (acc * 100))
            DABestFeat.append(acc)  # stack test DA for further use

            # plot confusion matrix; savedPlotName still holds the title set
            # during the last split of the main loop
            cm = confusion_matrix(yytest, yypred)
            fig_CM = plt.figure(dpi=300)
            plot_confusion_matrix(cm,
                                  clusterNames,
                                  title=savedPlotName,
                                  normalize=True,
                                  precision=2)
            plt.clf()
            plt.close(fig_CM)

        df = pd.DataFrame(data=DABestFeat, columns=['optim DA'])
        df = pd.concat([
            df,
            pd.DataFrame(data=[np.mean(DABestFeat)], columns=['optim avg DA'])
        ], axis=1)
        print('Classifier trained with best features (occ > %d) only' % tresh)
        print(df)
        excelResults = pd.concat([excelResults, df], axis=1)

    return excelResults