def step_forward(X, y, name):
    """Run sequential forward feature selection (SFS) on (X, y), then refit
    a logistic-regression model on the selected features and report
    train/test accuracy, MAE, confusion matrices, AUC, and CV scores.

    Parameters:
        X:    feature DataFrame (column names are used for reporting).
        y:    target labels.
        name: label used in log output and saved-figure filenames.

    Returns:
        (fitted SequentialFeatureSelector, fitted LogisticRegression,
         DataFrame of the selector's per-step metric dict).
    """
    print(name)
    # Inspiration: https://www.kdnuggets.com/2018/06/step-forward-feature-selection-python.html
    # Set up standardized training/testing data.
    X_train_std, X_test_std, y_train, y_test = tts_std(X, y)
    # L1-regularized logistic regression used as the estimator inside SFS.
    # liblinear is recommended for high-dimension datasets, but once the data
    # is standardized the accuracy of all solvers is pretty much the same.
    # NOTE: max_iter=100 is the sklearn default; raise it if a no-convergence
    # warning appears.
    clf = skllm.LogisticRegression(penalty='l1', C=0.1, solver='liblinear',
                                   max_iter=100)
    # Step-forward selector: k_features=5 keeps the 5 best attributes,
    # scored by accuracy under 10-fold cross validation; verbose=0 silences
    # the selector's progress logging.
    sfs1 = mlx.SequentialFeatureSelector(clf, k_features=5, forward=True,
                                         floating=False, verbose=0,
                                         scoring='accuracy', cv=10)
    # Perform SFS on the standardized training data.
    sfs1 = sfs1.fit(X_train_std, y_train, custom_feature_names=X.columns)
    # Which features were kept?
    print('\t' + 'Top 5 features: ' + str(sfs1.k_feature_names_))
    feat_cols1 = list(sfs1.k_feature_idx_)
    # SFS itself has no predict(); refit logistic regression on JUST the
    # selected features to see how well they predict the classification
    # (single loaded, clear, straight, etc.).
    clf = skllm.LogisticRegression(penalty='l1', C=0.1, solver='liblinear',
                                   max_iter=100)
    clf.fit(X_train_std[:, feat_cols1], y_train)
    # 'kind' selects the error bar drawn from the CV scores:
    # one of {'std_dev', 'std_err', 'ci', None}.
    fig1 = mlxp.plot_sequential_feature_selection(sfs1.get_metric_dict(),
                                                  kind='std_dev')
    plt.title('Sequential Forward Feature Selection CV Scores: ' + name + ' (std dev)')
    plt.ylabel('Mean CV Score')
    plt.grid()
    plt.savefig('feature_selection/sfs_' + name + ".png")
    plt.close()
    # Accuracy and mean absolute error on the reduced feature set.
    y_train_pred = clf.predict(X_train_std[:, feat_cols1])
    print('\tTraining accuracy on selected features: %.3f' % sklm.accuracy_score(y_train, y_train_pred))
    print('\tTraining mean absolute error on selected features: %.3f' % mean_abs_error(y_train, y_train_pred))
    y_test_pred = clf.predict(X_test_std[:, feat_cols1])
    print('\tTesting accuracy on selected features: %.3f' % sklm.accuracy_score(y_test, y_test_pred))
    print('\tTesting mean_abs_error on selected features: %.3f' % mean_abs_error(y_test, y_test_pred))
    # Confusion matrix generation for train and test predictions.
    confusion_matrix(y_train, y_train_pred, name + "_sfs_Training_Data_")
    confusion_matrix(y_test, y_test_pred, name + "_sfs_Testing_Data_")
    my_auc(y_train, X_train_std[:, feat_cols1], name + '_sfs_training',
           sfs1.k_feature_names_)
    # Cross-validated scores of the refit model on the full, unreduced data.
    scores = sklms.cross_val_score(clf, X, y, cv=4)
    print('\t' + name + ' CVs: ' + str(scores))
    return sfs1, clf, pd.DataFrame.from_dict(sfs1.get_metric_dict()).T
n_repeats=10), scoring='r2', n_jobs=-1) selector3 = selector3.fit(dataFrame3[colsNotSalePrice2], dataFrame3["SalePrice"]) for col in colsNotSalePrice2[selector3.ranking_ == 1]: print("%s" % col, end=", ") #Seleção por forward SequentialFeatureSelector from mlxtend import feature_selection #SFS para random forests sfs3 = feature_selection.SequentialFeatureSelector( estimator3, k_features=79, forward=True, scoring="r2", cv=model_selection.RepeatedStratifiedKFold(3, 10), n_jobs=-1) sfs4 = sfs3.fit(dataFrame3[colsNotSalePrice2], dataFrame3["SalePrice"]) #Gera gráficos com os resultados # #DataFrame com resultados do SVM # svmPlot=pd.DataFrame({ # "Nº variáveis": range(5, len(selector3.grid_scores_) + 5), # "algorithm":"svm", # "Correlação": selector.grid_scores_}) #DataFrame com resultados do RF
def part_d(x, y):
    """Sequential *backward* feature elimination down to a single feature.

    Fits an mlxtend SequentialFeatureSelector (forward=False, no cross
    validation) around a plain linear regression, then prints the subsets
    evaluated at each elimination step.
    """
    model = sklm.LinearRegression()
    selector = mlfs.SequentialFeatureSelector(
        model, k_features=1, floating=False, forward=False, cv=0)
    fitted = selector.fit(x, y)
    print('\nPART D')
    print(fitted.subsets_)