Example #1
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.linear_model as skllm
import sklearn.metrics as sklm
import sklearn.model_selection as sklms
import mlxtend.feature_selection as mlx
import mlxtend.plotting as mlxp

# Imports inferred from the aliases used below; tts_std, confusion_matrix, and
# my_auc are project helpers defined elsewhere in the source.


def step_forward(X, y, name):
    print(name)
    # Inspiration: https://www.kdnuggets.com/2018/06/step-forward-feature-selection-python.html

    # Set up training/testing standardized data
    X_train_std, X_test_std, y_train, y_test = tts_std(X, y)

    # Build logistic regression classifier to use in feature selection: the liblinear solver is
    # recommended for high-dimensional datasets, but once the data is standardized the accuracy
    # of all solvers is pretty much the same. max_iter (the maximum number of iterations for the
    # solver to converge) is set higher than the default of 100 so that the model actually
    # converges (lower values can trigger a convergence warning).
    clf = skllm.LogisticRegression(penalty='l1',
                                   C=0.1,
                                   solver='liblinear',
                                   max_iter=1000)

    # Build step forward feature selection: cv=10 runs 10-fold cross-validation for each
    # candidate feature subset, k_features=5 means we are selecting the 5 best attributes to
    # describe our target, and verbose is just used for logging the progress of the selector
    sfs1 = mlx.SequentialFeatureSelector(clf,
                                         k_features=5,
                                         forward=True,
                                         floating=False,
                                         verbose=0,
                                         scoring='accuracy',
                                         cv=10)

    # Perform SFS
    sfs1 = sfs1.fit(X_train_std, y_train, custom_feature_names=X.columns)

    # Which features?
    print('\t' + 'Top 5 features: ' + str(sfs1.k_feature_names_))
    feat_cols1 = list(sfs1.k_feature_idx_)

    # Build full model with selected features: sfs has no predict function
    clf = skllm.LogisticRegression(penalty='l1',
                                   C=0.1,
                                   solver='liblinear',
                                   max_iter=1000)
    # Now that we have the relevant features according to SFS, we can use logistic regression
    # on JUST those features and see how accurately they can predict the classification of
    # single loaded, clear, straight, etc.
    clf.fit(X_train_std[:, feat_cols1], y_train)

    # 'kind' represents the kind of error bar you get in your plot {'std_dev', 'std_err', 'ci',
    # None}. This error bar is the error of the cv scores.
    fig1 = mlxp.plot_sequential_feature_selection(sfs1.get_metric_dict(),
                                                  kind='std_dev')
    plt.title('Sequential Forward Feature Selection CV Scores: ' + name +
              ' (std dev)')
    plt.ylabel('Mean CV Score')
    plt.grid()
    plt.savefig('feature_selection/sfs_' + name + '.png')
    plt.close()

    # Accuracy
    y_train_pred = clf.predict(X_train_std[:, feat_cols1])
    print('\tTraining accuracy on selected features: %.3f' %
          sklm.accuracy_score(y_train, y_train_pred))
    print('\tTraining mean absolute error on selected features: %.3f' %
          sklm.mean_absolute_error(y_train, y_train_pred))
    y_test_pred = clf.predict(X_test_std[:, feat_cols1])
    print('\tTesting accuracy on selected features: %.3f' %
          sklm.accuracy_score(y_test, y_test_pred))
    print('\tTesting mean absolute error on selected features: %.3f' %
          sklm.mean_absolute_error(y_test, y_test_pred))

    # Confusion matrix generation
    confusion_matrix(y_train, y_train_pred, name + "_sfs_Training_Data_")
    confusion_matrix(y_test, y_test_pred, name + "_sfs_Testing_Data_")
    my_auc(y_train, X_train_std[:, feat_cols1], name + '_sfs_training',
           sfs1.k_feature_names_)

    # CV scores: note this re-fits clf within each fold on the full, unstandardized feature
    # set, not just the 5 selected features
    scores = sklms.cross_val_score(clf, X, y, cv=4)
    print('\t' + name + ' CVs: ' + str(scores))

    return sfs1, clf, pd.DataFrame.from_dict(sfs1.get_metric_dict()).T
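
# tts_std, confusion_matrix, and my_auc above are project helpers not included
# in this excerpt. A minimal sketch of what tts_std presumably does, assuming
# it wraps train_test_split plus StandardScaler (the 70/30 split and
# random_state below are illustrative, not from the original):
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def tts_std(X, y, test_size=0.3, random_state=0):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)
    # Fit the scaler on the training split only, then apply it to both splits
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test), y_train, y_test
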
# The head of this snippet was truncated in the source; the RFECV call below is
# a reconstruction, and estimator3, min_features_to_select, and n_splits are
# assumptions inferred from the surviving tail and from the use of
# selector3.ranking_ further down:
from sklearn import model_selection
from sklearn.feature_selection import RFECV

selector3 = RFECV(estimator3,
                  min_features_to_select=5,
                  cv=model_selection.RepeatedKFold(n_splits=3, n_repeats=10),
                  scoring='r2',
                  n_jobs=-1)
selector3 = selector3.fit(dataFrame3[colsNotSalePrice2],
                          dataFrame3["SalePrice"])
for col in colsNotSalePrice2[selector3.ranking_ == 1]:
    print("%s" % col, end=", ")

#Selection via forward SequentialFeatureSelector
from mlxtend import feature_selection

#SFS for random forests
sfs3 = feature_selection.SequentialFeatureSelector(
    estimator3,
    k_features=79,
    forward=True,
    scoring="r2",
    # RepeatedStratifiedKFold only supports classification targets; SalePrice
    # is continuous, so RepeatedKFold is the applicable splitter here
    cv=model_selection.RepeatedKFold(n_splits=3, n_repeats=10),
    n_jobs=-1)

sfs4 = sfs3.fit(dataFrame3[colsNotSalePrice2], dataFrame3["SalePrice"])
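
# A sketch of visualizing the RF selector's CV score per subset size, reusing
# the mlxtend plotting helper from step_forward above (the title text is
# illustrative):
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

plot_sfs(sfs4.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection (RF, r2)')
plt.grid()
plt.show()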

#Generate plots with the results

# #DataFrame with SVM results
# svmPlot=pd.DataFrame({
#     "Nº variáveis": range(5, len(selector3.grid_scores_) + 5),
#     "algorithm":"svm",
#     "Correlação": selector.grid_scores_})

#DataFrame with RF results
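# The code for this DataFrame is missing from the excerpt; a sketch of an
# assumed shape, following the commented-out SVM pattern above and taking the
# scores from sfs4.get_metric_dict():
import pandas as pd

metric_dict = sfs4.get_metric_dict()
rfPlot = pd.DataFrame({
    "Nº variáveis": sorted(metric_dict),
    "algorithm": "rf",
    "Correlação": [metric_dict[k]["avg_score"] for k in sorted(metric_dict)]})
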
def part_d(x, y):
    # In this snippet sklm aliases sklearn.linear_model and mlfs aliases
    # mlxtend.feature_selection (imports assumed; not shown in the original)
    import sklearn.linear_model as sklm
    import mlxtend.feature_selection as mlfs

    linear = sklm.LinearRegression()
    # Backward elimination (forward=False) down to one feature, with no cross-validation (cv=0)
    sfs = mlfs.SequentialFeatureSelector(linear, k_features=1, floating=False, forward=False, cv=0)
    sfs_fit = sfs.fit(x, y)
    print('\nPART D')
    print(sfs_fit.subsets_)
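
# A minimal usage sketch for part_d on synthetic data (x_demo and y_demo are
# hypothetical, not from the original source); subsets_ maps each subset size
# to its feature indices, feature names, and scores:
import numpy as np

rng = np.random.default_rng(0)
x_demo = rng.normal(size=(50, 4))
y_demo = 2.0 * x_demo[:, 0] + rng.normal(size=50)
part_d(x_demo, y_demo)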