max_iter=1000,
                         shuffle=True)

    # xgb_model.fit(x,y)
    # y_pred = xgb_model.predict(x_test)
    # mse = mean_squared_error(y_test, y_pred)
    # print("MSE: ", mse)
    # rmse = np.sqrt(mse)
    # print("RMSE: ", rmse)
    # print('-'*100)

    ### Mlxtend Implementation of Sequential Feature Selection
    sfs = SFS(
        model,
        k_features=20,  #x.shape[1], 
        forward=True,
        floating=False,
        scoring='neg_mean_squared_error',
        cv=0)

    sfs = sfs.fit(x, y)

    # scores = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
    # fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
    # plt.grid()
    # plt.show()

    idx = sfs.k_feature_idx_
    print(type(idx))
    print('Selected features indexes:', sfs.k_feature_idx_)
    idx_list.append(idx)
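
    # A minimal follow-up sketch (not part of the original snippet): slice the training
    # data down to the selected columns; assumes x is either a pandas DataFrame or a
    # NumPy array, as fit above.
    if hasattr(x, 'iloc'):
        x_selected = x.iloc[:, list(idx)]  # DataFrame input
    else:
        x_selected = x[:, list(idx)]       # NumPy array input
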
Example #2
            loc.append(np.mean(np.array(ref_data)))

        x.append(loc)

        if 'F' == row['Type']:
            y.append(0)
        elif 'D' == row['Type']:
            y.append(1)
        elif 'G' == row['Type']:
            y.append(2)

    print(len(x), len(x[0]))
    print(len(y))

    # Create the sequential feature selector and pick the best features
    print('Find the right features...')
    knn = KNeighborsClassifier(n_neighbors=3)
    sfs1 = SFS(knn,
               k_features=7,
               forward=True,
               floating=True,
               verbose=2,
               scoring='accuracy',
               cv=0,
               n_jobs=-1)

    sfs1 = sfs1.fit(np.array(x),
                    np.array(y),
                    custom_feature_names=tuple(fields))
    print()
    pprint(sfs1.subsets_)
Example #3
#
#         0.0       0.99      0.77      0.87      3309
#         1.0       0.04      0.46      0.07        68
#
#    accuracy                           0.77      3377
#   macro avg       0.51      0.62      0.47      3377
#weighted avg       0.97      0.77      0.85      3377
#
#the auc of logistics is: 0.7029313992142641
#the brier score is 0.17659280049694137


f_number = 100
sfs = SFS(clfLogisticRegression,
           k_features=f_number,
           forward=True,
           floating=False,
           scoring='roc_auc',
           cv=5)
print('ok3')
result = sfs.fit(X_train, y_train, custom_feature_names=feature_names)
#print(X)
result.subsets_
result.k_score_

selection_res = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
# print(selection_res)
selection_res.to_csv("/Users/shuojiawang/Documents/ppdmodel/result1907/selection_log_withouthistorylg.csv", sep='\t')

selected_feature_idx = result.k_feature_idx_
#print(type(selected_feature_idx))
selected_feature = list(selected_feature_idx)
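
# A minimal follow-up sketch (not in the original snippet): refit the classifier on the
# selected columns and check test AUC. Assumes X_train/X_test are DataFrames and that
# X_test/y_test exist in the surrounding script, as the commented output above suggests.
from sklearn.metrics import roc_auc_score

X_train_sel = X_train.iloc[:, selected_feature]
X_test_sel = X_test.iloc[:, selected_feature]
clfLogisticRegression.fit(X_train_sel, y_train)
print('AUC on selected features:',
      roc_auc_score(y_test, clfLogisticRegression.predict_proba(X_test_sel)[:, 1]))
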
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from numpy import genfromtxt
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

if __name__ == "__main__":
    X = genfromtxt('../../../../features.csv',
                   delimiter=',',
                   usecols=range(1, 5))
    y = genfromtxt('../../../../features.csv',
                   delimiter=',',
                   usecols=range(5, 6))
    clf = SVC()
    sfs = SFS(clf,
              k_features=3,
              forward=True,
              floating=True,
              scoring='accuracy',
              cv=0)
    sfs = sfs.fit(X, y)
    feature_count = len(sfs.k_feature_idx_)
    count = 0
    text_file = open("../../../../selected_floating_features.txt", "w")
    for feature in sfs.k_feature_idx_:
        count = count + 1
        text_file.write("%s" % feature)
        if count < feature_count:
            text_file.write(",")
    text_file.close()
Example #5
y_pred_proba_cdt =clfXGboost.predict_proba(X_test)[:, 1]
confmat_test_c = confusion_matrix(y_true=y_test, y_pred=y_pred_c)
print('confmat_test:\n', confmat_test_c)
print('the acc is:', accuracy_score(y_test, y_pred_c))
print('the classification_report:', classification_report(y_test, y_pred_c))
print('the auc of XGboost is:', roc_auc_score(y_test, y_pred_proba_cdt))


feature_names = X.columns.values.tolist()
print(feature_names)
print('ok2')

f_number = 50
sfs = SFS(clfXGboost,
           k_features=f_number,
           forward=True,
           floating=False,
           scoring='roc_auc',
           cv=5)

print('ok')


result2 = sfs.fit(X_train, y_train, custom_feature_names=feature_names)
#print(X)
result2.subsets_
result2.k_score_

selection_res = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
# print(selection_res)
selection_res.to_csv("/Users/shuojiawang/Documents/ppdmodel/result1907/selection_log_withhistoryxgboost.csv", sep='\t')
    'Annual performance C', 'Age level', 'Marital Status',
    'Job tenure level A', 'Job tenure level B', 'Average working years',
    'Graduated School', 'Family numbers'
]]

# # Sequential Feature Selector

# In[28]:

X.shape

# In[29]:

sfsl = SFS(clf_pipeline,
           k_features=26,
           forward=True,
           scoring=make_scorer(fbeta_score, beta=1.5),
           cv=10)

sfsl.fit(X, Y)
sfsl.subsets_
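
# A minimal sketch (not part of the original notebook): rather than hardcoding the
# selected column names as in the cell below, they can be read off the fitted selector;
# k_feature_names_ holds the column labels when the selector was fit on a DataFrame.
selected_cols = list(sfsl.k_feature_names_)
print(selected_cols)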

# In[36]:

X = X[[
    'yyyy', 'Job classification', 'Work experience5', 'Special project',
    'Training hours B', 'Training hours C', 'leave this three mon. A',
    'leave this year A', 'leave this three mon. B', 'leave this year B',
    'Annual performance C', 'Job tenure level A', 'Job tenure level B',
    'Family numbers'
]]
Example #7
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Project      : tql-Python.
# @File         : featureSelector
# @Time         : 2019-07-26 13:39
# @Author       : yuanjie
# @Email        : [email protected]
# @Software     : PyCharm
# @Description  : 


from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_selection import GenericUnivariateSelect, \
    SelectPercentile, SelectKBest, f_classif, mutual_info_classif, RFE

from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt

sfs = SFS(LGBMClassifier(),
          k_features=10,
          forward=True,
          floating=False,
          verbose=2,
          scoring='roc_auc',
          cv=5,
          n_jobs=-1)

sfs.fit(X, y)
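
# plot_sfs is imported above but unused in this excerpt; a minimal sketch of how the
# selection trajectory is typically visualized (assumes an interactive matplotlib backend):
plot_sfs(sfs.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection (ROC AUC)')
plt.grid()
plt.show()
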
Example #8
from sklearn.neighbors import KNeighborsClassifier
import math
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

mnist_dataset = datasets.load_digits()
X = mnist_dataset.data
Y = mnist_dataset.target
target_names = mnist_dataset.target_names
train, test, train_targets, test_targets = model_selection.train_test_split(
    X, Y, train_size=0.5, test_size=0.5)

knn = KNeighborsClassifier(round(math.sqrt(train.shape[0] + test.shape[0])))

sfbs = SFS(knn,
           k_features=round(train.shape[1] * 0.05),
           forward=True,
           floating=True,
           scoring="accuracy",
           cv=0)
sfbs = sfbs.fit(train, train_targets)

best_k_features = round(train.shape[1] * 0.05)
best_score = sfbs.k_score_
features = 2
for i in range(1, 5):
    features = features * i
    sfbs = SFS(knn,
               k_features=features,
               forward=False,
               floating=True,
               scoring="accuracy",
               cv=0)
    # train/test split

    X_train, X_test, y_train, y_test = train_test_split(
        df4,
        df3["descCanalRadicacion"],
        test_size=0.2,
        random_state=10,
        stratify=df3["descCanalRadicacion"])

    #ML Model: Model Selection
    knn = KNeighborsClassifier(n_neighbors=50, weights='distance')

    sfs1 = SFS(knn,
               k_features=11,
               forward=True,
               floating=False,
               verbose=1,
               scoring=make_scorer(f1_score, average='weighted'),
               cv=5)

    sfs1 = sfs1.fit(X_train, y_train)

    X_train_sfs = sfs1.transform(X_train)
    X_test_sfs = sfs1.transform(X_test)

    clfKnn_sfs = knn.fit(X_train_sfs, y_train)

    # classes
    LE_name_mapping = dict(zip(LE.classes_, LE.transform(LE.classes_)))
    clases_Canal = (LE_name_mapping)
    predicted_clases = list(clases_Canal.keys())
Example #10
    'learning_rate': 0.013,
    'max_depth': 5,
    'nthread': 4,
    'silent': 1,
    'subsample': 0.463,
    'reg_lambda': 0.715,
    'gamma': 0.01,
    'min_child_weight': 30.4,
}

estimator = xgb.XGBClassifier(**params_est)

sfs1 = SFS(estimator,
           k_features=(1, 26),
           forward=True,
           floating=False,
           verbose=2,
           scoring='neg_log_loss',  # sklearn's scorer name ('log_loss' is no longer valid)
           cv=4,
           n_jobs=4)
sfs1 = sfs1.fit(train.values, Y)  # .values replaces the removed DataFrame.as_matrix()

results = pd.DataFrame.from_dict(sfs1.get_metric_dict()).T
fig1 = plot_sfs(sfs1.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

print(sfs1.subsets_)
print(sfs1.k_feature_idx_)
print(sfs1.k_score_)
"""
# In[9]:

X = X.values
y = y.values

# In[11]:

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
from sklearn import linear_model
import pandas as pd
lr = linear_model.LinearRegression()

sfs = SFS(lr, k_features=30, forward=True, floating=False, scoring='r2', cv=4)

sfs = sfs.fit(X, y)
print('\nSequential Forward Selection (k=30):')
print(sfs.k_feature_idx_)
print('CV Score:')
print(sfs.k_score_)

pd.DataFrame.from_dict(sfs.get_metric_dict()).T

plt.figure(figsize=(19, 10))
fig = plot_sfs(sfs.get_metric_dict(), kind=None)
plt.title('Sequential Forward Selection (R Square)')
plt.grid()
plt.show()
start_features = df.tail(15)
#print(these_choices)
#print(df)
start_features = list(start_features['feature'].values)[::-1]

#test_cols = df.tail(40)['feature'].values

for start_feature in start_features:
    for k_features in range(2, 20):
        print(start_feature)
        sfs = SFS(
            estimator=rfc,
            k_features=k_features,
            forward=True,
            floating=True,
            verbose=1,
            scoring='accuracy',
            n_jobs=15,
            fixed_features=[start_feature],
            cv=4,
        )

        start_time = time.time()
        try:
            sfs = sfs.fit(X_train[test_cols], y_train)
        except Exception:  # skip feature sets that fail to fit
            continue
        end_time = time.time()
        #print()
        #print(feature_choices, end_time - start_time)
        best_features = list(sfs.k_feature_names_)
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

metr=[accuracy_score, recall_score, precision_score, f1_score]

result=pd.DataFrame(columns=['N_features','Fold','Acc','Recall','Precision','F1'])

for s in ks:
    selvars=Fscore.index[:s]
    for pos, (train,valid) in enumerate(skfold.split(data[selvars], data['target'])):
        clf.fit(data.iloc[train][selvars], data.iloc[train]['target'])
        y_pred=clf.predict(data.iloc[valid][selvars])
        result.loc[len(result)]=[s,pos+1]+[m(data.iloc[valid]['target'],y_pred) for m in metr]
        
result.groupby(['N_features'])[['Acc', 'Recall', 'Precision', 'F1']].mean()

Fscore['Group']=[x[:3] for x in Fscore.index]

group_F=Fscore.groupby(['Group'])['F'].mean()
group_F=group_F.sort_values(ascending=False)

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import sklearn

clf=LogisticRegression(C=1, max_iter=300)


sfs=SFS(clf,k_features=10, forward=True, floating=False, scoring='f1', cv=5)
sfs.fit(data[cols],data['target'])

sfs.subsets_
Example #14
from sklearn import datasets
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
import math
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

mnist_dataset = datasets.load_digits()
X = mnist_dataset.data
Y = mnist_dataset.target
target_names = mnist_dataset.target_names
train, test, train_targets, test_targets = model_selection.train_test_split(X, Y, train_size=0.5,test_size=0.5)

knn = KNeighborsClassifier(round(math.sqrt(train.shape[0]+test.shape[0])))

best_k_features = 0
best_score = 0
features = 1
for i in range(1, 5):
    features = features * i
    sbs = SFS(knn, k_features=features, forward=False, floating=False, scoring="accuracy", cv=0)
    sbs = sbs.fit(train, train_targets)
    #print("For number of features: {0}, best features: {1}, prediction score: {2}".format(features, sbs.k_feature_idx_, sbs.k_score_))
    if best_score < sbs.k_score_:
        best_score = sbs.k_score_
        best_k_features = features

print("The best score: {0} for number of features: {1}".format(best_score, best_k_features))
#data visualization
data_numeric = data[['power', 'kilometer', 'brand_amount', 'brand_price_average', 
                     'brand_price_max', 'brand_price_median']]
correlation = data_numeric.corr()

f , ax = plt.subplots(figsize = (7, 7))
plt.title('Correlation of Numeric Features with Price',y=1,size=16)
sns.heatmap(correlation,square = True,  vmax=0.8)
plt.show()

# (2) wrapper
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
sfs = SFS(LinearRegression(),
           k_features=10,
           forward=True,
           floating=False,
           scoring = 'r2',
           cv = 0)
x = data.drop(['price'], axis=1)
x = x.fillna(0)
#use LabelEncoder to deal with string data
x.info()  # DataFrame.info() prints directly; no need to wrap it in print()
le = preprocessing.LabelEncoder()
for column in x.columns:
    if x[column].dtype == object:
        x[column] = le.fit_transform(x[column])

y = data['price']
y = y.fillna(0)

sfs.fit(x, y)
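
# A brief follow-up sketch (not in the original snippet): inspect which columns the
# wrapper kept and how well the subset scored (cv=0, so k_score_ is the training R^2).
print('Selected features:', sfs.k_feature_names_)
print('R^2 of the selected subset:', sfs.k_score_)
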
def step_feature_selection(keras_est,
                           x_train,
                           y_train,
                           x_test,
                           y_test,
                           features_lower_bound,
                           features_upper_bound,
                           *,
                           scoring='accuracy',
                           cv=0,
                           n_jobs=-1):
    # feature selection step forward/backward:
    sk_keras_est = SFS(keras_est,
                       k_features=(features_lower_bound, features_upper_bound),
                       forward=True,
                       floating=False,
                       verbose=2,
                       scoring=scoring,
                       cv=cv,
                       n_jobs=n_jobs)

    sk_keras_est = sk_keras_est.fit(x_train, y_train)

    # transforming data to only contain chosen features:
    x_train_sfs = sk_keras_est.transform(x_train)
    x_test_sfs = sk_keras_est.transform(x_test)

    # print(pd.DataFrame(x_train_sfs))
    # print(pd.DataFrame(x_test_sfs))

    global feature_names
    selected_features = []
    selected_features = [feature_names[i] for i in sk_keras_est.k_feature_idx_]
    feature_names = selected_features
    #print(feature_names)
    feature_names_SFS = pd.DataFrame(feature_names)
    feature_names_SFS.to_csv(RUNDIR + "feature_names_SFS.csv", index=False)
    k.clear_session()

    # # training model with chosen features
    # keras_est.fit(x_train_sfs, y_train)
    # y_pred = keras_est.predict(x_test_sfs)

    # # evaluating model with accuracy and false positive index
    # correct = 0
    # index_wrong=[]
    # false_positive=[]
    # y_test = y_test.flatten()
    # y_pred = y_pred.flatten()

    # # for i in range(len(y_pred)):
    # #   if y_test[i] == y_pred[i]:
    # #       correct += 1
    # #   else:
    # #       index_wrong.append(i)
    # #       if y_test[i] == 0:
    # #           false_positive.append(i)

    # for i in range(len(y_pred)):
    #   if y_test[i] != y_pred[i]:
    #       index_wrong.append(i)
    #       if y_test[i] == 0:
    #         false_positive.append(i)

    # # checking model accuracy:
    # percent_correct= accuracy_score(y_test, y_pred)
    # accuracy_result = pd.DataFrame.from_dict(sk_keras_est.get_metric_dict()).T
    # accuracy_result.to_csv(DATADIR+"accuracy_result.csv", index=False)

    # print('Selected features:', sk_keras_est.k_feature_idx_)
    # #percent_correct = (correct/len(df_y_test))
    # print("Model accurary is: {:.2f}%".format(percent_correct*100))
    # print("Wrong prediction index: ", index_wrong)
    # print("Index with False Positive: ", false_positive)

    return x_train_sfs, x_test_sfs  # return the SFS-transformed train/test feature arrays
Example #17
X_test = scaler.transform(X_test)

print(labels)
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)

# Use sequential feature selection to decide what features to use. Grid search to determine best hyperparameter values.

# In[3]:

knn = KNeighborsRegressor()
sfs1 = SFS(estimator=knn,
           k_features='best',
           forward=False,
           floating=True,
           cv=5)

pipe = Pipeline([('sfs', sfs1), ('knn', knn)])

param_grid = [{
    'sfs__estimator__n_neighbors': range(1, len(X_idx)),
    'sfs__estimator__weights': ['distance', 'uniform'],
    'sfs__estimator__metric': ['euclidean', 'manhattan', 'chebyshev']
}]

gs = GridSearchCV(estimator=pipe,
                  param_grid=param_grid,
                  n_jobs=-1,
                  cv=5,
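
# The GridSearchCV call above is cut off in this excerpt; a minimal sketch of how such a
# pipeline-based search is typically completed and run (parameter choices here are
# assumptions, not the original author's code):
gs = GridSearchCV(estimator=pipe,
                  param_grid=param_grid,
                  n_jobs=-1,
                  cv=5)
gs = gs.fit(X_train, y_train)
print('Best parameters:', gs.best_params_)
print('Best CV score:', gs.best_score_)
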
Example #18
def objective_function(args):
    n_components = args['n_components']
    quantiles = args['quantiles']
    if args['preprocessing'] == 'NoTransform':
        X, Y, scaler = transform(dataset)
    elif args['preprocessing'] == 'MinMaxScaler':
        X, Y, scaler = transform(dataset)
    elif args['preprocessing'] == 'StandardScaler':
        X, Y, scaler = standard_scaler(dataset)
    elif args['preprocessing'] == 'RobustScaler':
        X, Y, scaler = robust_scaler(dataset)
    elif args['preprocessing'] == 'QuantileTransformer':
        X, Y, scaler = quantile_transformer(dataset, quantiles)
    elif args['preprocessing'] == 'PowerTransformer':
        X, Y, scaler = power_transformer(dataset)
    elif args['preprocessing'] == 'PCA':
        X, Y, scaler = pca_transform(dataset, n_components)
    if args['preprocessing'] != 'PCA':
        k_features = args['k_features']
    else:
        k_features = X.shape[1]
    if args['model'] == RandomForestRegressor:
        n_estimators = args['params']['n_estimators']
        max_depth = args['params']['max_depth']
        min_samples_split = args['params']['min_samples_split']
        min_samples_leaf = args['params']['min_samples_leaf']
        min_weight_fraction_leaf = args['params']['min_weight_fraction_leaf']
        max_features = args['params']['max_features']
        max_leaf_nodes = args['params']['max_leaf_nodes']
        estimator = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth,
                  min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf,
                  max_leaf_nodes = max_leaf_nodes, min_weight_fraction_leaf = min_weight_fraction_leaf,
                  max_features = max_features, n_jobs = -1)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == AdaBoostRegressor:
        learning_rate = args['params']['learning_rate']
        n_estimators = args['params']['n_estimators']
        loss = args['params']['loss']
        max_depth = args['params']['base_estimator']['max_depth']
        min_samples_split = args['params']['base_estimator']['min_samples_split']
        min_samples_leaf = args['params']['base_estimator']['min_samples_leaf']
        min_weight_fraction_leaf = args['params']['base_estimator']['min_weight_fraction_leaf']
        max_features = args['params']['base_estimator']['max_features']
        estimator = AdaBoostRegressor(base_estimator = DecisionTreeRegressor(max_depth = max_depth, min_samples_split = min_samples_split,
                  min_samples_leaf = min_samples_leaf, min_weight_fraction_leaf = min_weight_fraction_leaf,
                  max_features = max_features), learning_rate = learning_rate, n_estimators = n_estimators, loss = loss)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == ExtraTreesRegressor:
        n_estimators = args['params']['n_estimators']
        max_depth = args['params']['max_depth']
        min_samples_split = args['params']['min_samples_split']
        max_features = args['params']['max_features']
        min_samples_leaf = args['params']['min_samples_leaf']
        min_weight_fraction_leaf = args['params']['min_weight_fraction_leaf']
        max_leaf_nodes = args['params']['max_leaf_nodes']
        estimator = ExtraTreesRegressor(n_estimators = n_estimators, max_depth = max_depth,
                  min_samples_split = min_samples_split, max_features = max_features,
                  max_leaf_nodes = max_leaf_nodes, min_weight_fraction_leaf = min_weight_fraction_leaf,
                  min_samples_leaf = min_samples_leaf, n_jobs = -1)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == GradientBoostingRegressor:
        loss = args['params']['loss']
        learning_rate = args['params']['learning_rate']
        n_estimators = args['params']['n_estimators']
        subsample = args['params']['subsample']
        min_samples_split = args['params']['min_samples_split']
        max_depth = args['params']['max_depth']
        tol = args['params']['tol']
        estimator = GradientBoostingRegressor(loss = loss, n_estimators = n_estimators,
                  subsample = subsample, min_samples_split = min_samples_split, learning_rate = learning_rate,
                  max_depth = max_depth, tol = tol)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == SGDRegressor:
        loss = args['params']['loss']
        penalty = args['params']['penalty']
        alpha = args['params']['alpha']
        l1_ratio = args['params']['l1_ratio']
        tol = args['params']['tol']
        learning_rate = args['params']['learning_rate']
        power_t = args['params']['power_t']
        estimator = SGDRegressor(loss = loss, penalty = penalty, alpha = alpha, max_iter = 13000,
                  l1_ratio = l1_ratio, tol = tol, learning_rate = learning_rate, power_t = power_t)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == ElasticNet:
        alpha = args['params']['alpha']
        l1_ratio = args['params']['l1_ratio']
        tol = args['params']['tol']
        estimator = ElasticNet(alpha = alpha, l1_ratio = l1_ratio, tol = tol)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == Ridge:
        alpha = args['params']['alpha']
        tol = args['params']['tol']
        solver = args['params']['solver']
        estimator = Ridge(alpha = alpha, tol = tol, solver = solver)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == KNeighborsRegressor:
        n_neighbors = args['params']['n_neighbors']
        weights = args['params']['weights']
        algorithm = args['params']['algorithm']
        leaf_size = args['params']['leaf_size']
        p = args['params']['p']
        estimator = KNeighborsRegressor(n_neighbors = n_neighbors, weights = weights,
                                        algorithm = algorithm, leaf_size = leaf_size, p = p, n_jobs = -1)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == GaussianProcessRegressor:
        alpha = args['params']['alpha']
        estimator = GaussianProcessRegressor(alpha = alpha)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == SVR:
        kernel = args['params']['kernel']
        if kernel == 'poly':
            degree = args['params']['degree']
        else:
            degree = 3
        if kernel in ('rbf', 'poly', 'sigmoid'):
            gamma = args['params']['gamma']
        else:
            gamma = 'auto'
        tol = args['params']['tol']
        C = args['params']['C']
        shrinking = args['params']['shrinking']
        estimator = SVR(kernel = kernel, degree = degree, gamma = gamma, tol = tol, C = C, shrinking = shrinking)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == xgb:
        booster = args['params']['booster']
        eta = args['params']['eta']
        gamma = args['params']['gamma']
        max_depth = args['params']['max_depth']
        n_estimators = args['params']['n_estimators']
        min_child_weight = args['params']['min_child_weight']
        subsample = args['params']['subsample']
        alpha = args['params']['alpha']
        random_state = args['params']['random_state']
        colsample_bytree = args['params']['colsample_bytree']
        colsample_bylevel = args['params']['colsample_bylevel']
        colsample_bynode = args['params']['colsample_bynode']
        reg_lambda = args['params']['reg_lambda']
        grow_policy = args['params']['grow_policy']
        if booster == 'dart':
            sample_type = args['params']['sample_type']
            normalize_type = args['params']['normalize_type']
            rate_drop = args['params']['rate_drop']
            skip_drop = args['params']['skip_drop']
        if args['preprocessing'] != 'PCA':
            k_features = args['k_features']
        else:
            k_features = sample(scope.int(hp.quniform('k_features', 1, X.shape[1], 1)))
        if booster == 'gbtree':
            estimator = xgb.XGBRegressor(booster = booster, eta = eta, gamma = gamma, max_depth = max_depth, n_estimators = n_estimators,
                              min_child_weight = min_child_weight, subsample = subsample, alpha = alpha, random_state = random_state,
                              colsample_bytree = colsample_bytree, colsample_bylevel = colsample_bylevel, grow_policy = grow_policy,
                              colsample_bynode = colsample_bynode, reg_lambda = reg_lambda, n_jobs = -1)
            reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False, scoring = metrics_names[eval_metric])
        elif booster == 'dart':
            num_round = 50
            estimator = xgb.XGBRegressor(booster = booster, eta = eta, gamma = gamma, max_depth = max_depth, n_estimators = n_estimators,
                              min_child_weight = min_child_weight, subsample = subsample, alpha = alpha, random_state = random_state,
                              colsample_bytree = colsample_bytree, sample_type = sample_type, normalize_type = normalize_type,
                              rate_drop = rate_drop, skip_drop = skip_drop, colsample_bylevel = colsample_bylevel, grow_policy = grow_policy,
                              colsample_bynode = colsample_bynode, reg_lambda = reg_lambda, n_jobs = -1)
            reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False, scoring = metrics_names[eval_metric])
    if eval_metric == 'mse':
        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 1 - percent_train, random_state = 1, shuffle = False)
        sfsl = reg.fit(X, Y)
        x_sfs = sfsl.transform(X)
        x_train_sfs = x_sfs[:length_train]
        x_test_sfs = x_sfs[length_train:]
        estimator.fit(x_train_sfs, y_train)
        if args['model'] == xgb:
            if booster == "gbtree":
                y_pred = estimator.predict(x_test_sfs)
            elif booster == "dart":
                y_pred = estimator.predict(x_test_sfs, ntree_limit = num_round)
        else:
            y_pred = estimator.predict(x_test_sfs)
        if args['preprocessing'] != 'NoTransform':
            predictions = y_pred.reshape(-1, 1)
            for i in range(predictions.shape[1]):
                if args['preprocessing'] != 'PCA':
                    tmp = np.zeros((predictions.shape[0], n_features))
                else:
                    tmp = np.zeros((predictions.shape[0], X.shape[1]))
                tmp[:, 0] = predictions[:, i]
                predictions[:, i] = scaler.inverse_transform(tmp)[:, 0]
            mse = mean_squared_error(dataset[target][length_train:], predictions)
            print('mse value: {}, model: {}'.format(mse, args['model']))
            return mse
        else:
            mse = mean_squared_error(dataset[target][length_train:], y_pred)
            print('mse value: {}, model: {}'.format(mse, args['model']))
            return mse
    else:
        reg.fit(X, Y)
        print('Model: {}, r2 value: {}, Selected variables {}'.format(args['model'], reg.k_score_, reg.k_feature_names_))
        loss_function = 1 - reg.k_score_
        return loss_function
Example #19
def main():
    # read the data using pandas
    bank_data = pd.read_csv("E:/Study/AI_Sem1/ML/bank.csv", delimiter=",")

    # Run pre-processing on data frame
    feature_train, class_label = performPreprocessing(bank_data)

    # ----------------- create baseline models ------------------------- #

    # split the data into test and train
    X_train, X_test, Y_train, Y_test = train_test_split(feature_train,
                                                        class_label,
                                                        test_size=0.2,
                                                        random_state=11)

    print(
        '-------------------------------- Baseline Models before Feature Selection Pre-processing --------------------------------------'
    )

    # call run model function without feature selection pre-processing
    model_df = runModels(X_train, Y_train)
    print(model_df)

    # call metrics function for one model without feature selection pre-processing
    report(X_train, Y_train)

    # ----------------- create models after applying feature selection technique in pre-processing ------------------------- #

    # call feature selection function before splitting the data
    featureSelection(feature_train, class_label)

    # remove features which have a lower importance ranking, i.e. the marital, default and loan columns
    feature_train1 = feature_train.drop(['marital', 'default', 'loan'], axis=1)

    # split the data into test and train
    X_train1, X_test1, Y_train1, Y_test1 = train_test_split(feature_train1,
                                                            class_label,
                                                            test_size=0.2,
                                                            random_state=10)

    print(
        '--------------------------------Models After Feature Selection --------------------------------------'
    )
    # call run model function with feature selection pre-processing
    model_df_feature = runModels(X_train1, Y_train1)
    print(model_df_feature)

    # call metrics function for one model after feature selection
    report(X_train1, Y_train1)

    print(
        '-------------------------------- Hyper Parameter optimization on top 3 models ----------------------------------------'
    )

    hyperParameter(X_train1, Y_train1)

    print(
        '-------------------------------- Research Topic - Feature Selection -----------------------------------'
    )

    # Research - Feature Selection

    print(
        '-------------------------------- 1. Recursive Feature Elimination -------------------------------------'
    )

    # 1. Recursive Feature Elimination
    # using Logistic Regression model to get the score of each feature
    model = LogisticRegression()
    # create the RFE model and select 10 attributes
    rfe = RFE(model, n_features_to_select=10)
    rfe = rfe.fit(X_train, Y_train)
    # summarize the selection of the attributes
    print(rfe.support_)
    print(rfe.ranking_)

    # plot the ranking
    plt.bar(range(len(rfe.ranking_)), rfe.ranking_)
    plt.show()

    # based on these rankings remove the columns
    feature_train2 = feature_train.drop(
        ['age', 'job', 'balance', 'day', 'duration', 'pdays'], axis=1)
    # split the data into test and train
    X_train2, X_test2, Y_train2, Y_test2 = train_test_split(feature_train2,
                                                            class_label,
                                                            test_size=0.2,
                                                            random_state=10)

    # call run model function for Recursive Feature Elimination
    model_df_RFE = runModels(X_train2, Y_train2)
    print(model_df_RFE)

    print(
        '-------------- Hyper Parameter optimization on top 3 models for Recursive Feature Elimination technique ------------------'
    )

    hyperParameter(X_train2, Y_train2)

    print(
        '-------------------------------- 2. Feature Importance -------------------------------------'
    )

    # 2. Feature Importance
    # fit an Extra Trees model to the data
    model = ExtraTreesClassifier()
    model.fit(X_train, Y_train)
    # display the relative importance of each feature
    print('Score values of each feature: ', model.feature_importances_)

    # plot the scores
    plt.bar(range(len(model.feature_importances_)), model.feature_importances_)
    plt.show()

    # based on these scores remove the columns
    feature_train3 = feature_train.drop([
        'marital', 'education', 'default', 'housing', 'loan', 'duration',
        'pdays', 'previous'
    ],
                                        axis=1)
    # split the data into test and train
    X_train3, X_test3, Y_train3, Y_test3 = train_test_split(feature_train3,
                                                            class_label,
                                                            test_size=0.2,
                                                            random_state=10)

    # call run model function for Feature Importance technique
    model_df_Feature_Importance = runModels(X_train3, Y_train3)
    print(model_df_Feature_Importance)

    print(
        '------------------- Hyper Parameter optimization on top 3 models for Feature Importance technique ----------------------'
    )

    hyperParameter(X_train3, Y_train3)

    print(
        '-------------------------------- 3. Sequential Feature Selector -------------------------------------'
    )

    sfs1 = SFS(KNeighborsClassifier(),
               k_features=10,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=0)
    sfs1 = sfs1.fit(X_train, Y_train)
    print('Indices of the 10 best features: ', sfs1.k_feature_idx_)
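
    # A minimal sketch (not part of the original script): map those indices back to
    # column names instead of reading them off manually; assumes X_train kept the
    # DataFrame columns of feature_train.
    selected_names = [X_train.columns[i] for i in sfs1.k_feature_idx_]
    print('Names of the 10 best features:', selected_names)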

    # based on these scores remove the columns
    feature_train4 = feature_train.drop(
        ['age', 'job', 'education', 'balance', 'day', 'campaign'], axis=1)
    # split the data into test and train
    X_train4, X_test4, Y_train4, Y_test4 = train_test_split(feature_train4,
                                                            class_label,
                                                            test_size=0.2,
                                                            random_state=10)

    # call run model function for Sequential Feature Selector
    model_df_SFS = runModels(X_train4, Y_train4)
    print(model_df_SFS)

    print(
        '------------------- Hyper Parameter optimization on top 3 models for Sequential Feature Selector ----------------------'
    )

    hyperParameter(X_train4, Y_train4)
Example #20
def main():
    options = ['5', 'mean', 'median']
    targets = [0, 1]
    selections = ['00', '01', '10', '11']
    filename = 'EEG.csv'
    train, label = load(filename)
    train = norm(train)

    ############### define classifier ##################
    clf = SVC(C=0.25, kernel='linear')

    for option in options:
        label, _ = transform(label, type=option)
        for selection in selections:
            for target in targets:
                forward = False
                floating = False
                if selection[0] == '1':
                    forward = True
                if selection[1] == '1':
                    floating = True

                print('')
                if forward:
                    print('forward ', end='')
                else:
                    print('backward ', end='')
                if floating:
                    print('floating ', end='')
                print('selection --- target:', end='')
                if target == 0:
                    print(' arousal')
                elif target == 1:
                    print(' valence')
                else:
                    print('target error ({})'.format(target))
                    sys.exit()
                print('')

                ############### target the label  ##################
                ###################
                # 0 : arousal     #
                # 1 : valence     #
                ###################
                train_y = label[:, target].reshape(-1)

                sfs = SFS(clf,
                          k_features='best',
                          forward=forward,
                          floating=floating,
                          scoring='accuracy',
                          cv=4,
                          n_jobs=-1,
                          verbose=1)

                sfs.fit(train, train_y)

                # save the selected feature indices
                pickle_on = open(
                    'sequence/eeg_{}_{}_{}'.format(option, selection, target),
                    "wb")
                pickle.dump(sfs.k_feature_idx_, pickle_on)
                pickle_on.close()
Example #21
def ExecuteSFFS(x, y, featureNames, featureList, clusters, clusterNames, svc,
                kFolds, nbOfSplit, featMaxNbrSFFS, standardizationType,
                removedData, permutation_flag, nbPermutation, balance_flag,
                currentDateTime, resultDir, debug_flag, verbose):
    import scipy
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split as tts
    from sklearn.metrics import confusion_matrix
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
    from sklearn.model_selection import RandomizedSearchCV

    from slpClass_toolbox import BalanceClasses
    from slpClass_toolbox import Standardize
    from slpClass_toolbox import Permute
    from slpClass_toolbox import ComputePermutationAvgDA
    from slpClass_toolbox import PlotPermHist
    from slpClass_toolbox import ApplyStandardization
    from slpClass_toolbox import plot_confusion_matrix

    plt.rcParams.update({'figure.max_open_warning': 0})

    # Get feature values since SFFS works only with numpy arrays!
    bestFeaturesHist = np.zeros([len(featureNames)])
    CvResult = pd.DataFrame()
    permResults = pd.DataFrame()
    tmpBest = []
    DA = []
    avg_perm_DA = []
    skipFS = False  # flag to skip feature selection
    fitFeatOverTresh = False  # fit classifier with most frequent features in best set

    #********************** TRAIN pre-processing ******************************
    for it in list(range(nbOfSplit)):
        print('\nSplit #{}'.format(str(it)))

        # Use all features or given ones only
        if len(featureList) == 0:
            xx = x
        elif isinstance(featureList[0], float):
            xx = x
            fitFeatOverTresh = True
        else:
            xx = x[featureList]
            skipFS = True

        # Balance the number of old women and old men or not
        if balance_flag:
            X, Y = BalanceClasses(xx, y)
        else:
            X, Y = xx, y

        # split dataset into random train and test subsets
        X_train, X_test, y_train, y_test = tts(X,
                                               Y['Cluster'],
                                               test_size=0.33,
                                               stratify=Y['Cluster'])
        # Data z-score standardisation
        xTrainSet, zPrm = Standardize(X_train, y_train, standardizationType,
                                      debug_flag)

        #**************************** SVM optimisation ************************
        params_dict = {
            'C': scipy.stats.expon(scale=100),
            'kernel': ['linear'],
            'class_weight': ['balanced', None]
        }

        n_iter_search = 20
        random_search = RandomizedSearchCV(svc,
                                           param_distributions=params_dict,
                                           n_iter=n_iter_search)

        random_search.fit(xTrainSet, y_train)
        optimClf = random_search.best_estimator_

        #*************************** TRAIN ************************************
        print('Fitting...')
        if skipFS:
            optimClf = optimClf.fit(xTrainSet.values, y_train)  # .values replaces the removed .as_matrix()

            yPred = optimClf.predict(xTrainSet.values)

            # Compute the accuracy of the test prediction
            acc = float((y_train == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))
            fitRes = pd.DataFrame(data=[acc], columns=['CV_DA_' + str(it + 1)])

        else:
            # set k_features = (1,X.shape[1]) to test all possible combinations
            sffs = SFS(optimClf,
                       k_features=(1, featMaxNbrSFFS),
                       forward=True,
                       floating=False,
                       scoring='accuracy',
                       cv=kFolds,
                       n_jobs=-1)
            sffs = sffs.fit(xTrainSet.values, y_train)

            print('Best combination for fit #%d (ACC: %.3f): %s' % \
                  (it,sffs.k_score_, sffs.k_feature_idx_))

            # Fit the estimator using the new feature subset and make a
            # prediction on the test data
            X_train_sfs = sffs.transform(xTrainSet.values)
            optimClf.fit(X_train_sfs, y_train)

            fitRes = pd.DataFrame.from_dict(sffs.get_metric_dict()).T
            fitRes['avg_over_std'] = fitRes['avg_score'] / fitRes['std_dev']

            if featMaxNbrSFFS > 1:
                # plot feature selection process metrics
                fig1 = plot_sfs(sffs.get_metric_dict(), kind='std_err')
                savedPlotName = resultDir+'Decoding_accuracy_'+clusters+'_'+\
                                str(it)+'_'+str(nbOfSplit)+'.png'

                tmpBest.append(sffs.k_feature_idx_)
                bestFeaturesHist[list(tmpBest[-1])] += 1

                fig1.set_dpi(300)
                plt.tight_layout()
                plt.savefig(savedPlotName, bbox_inches='tight')
                plt.clf()
                plt.close(fig1)

                # plot mean / std
                plt.figure(dpi=300)
                plt.title('Mean over standard deviation')
                plt.xlabel("number of attributes in combination")
                plt.xticks(range(featMaxNbrSFFS))
                plt.ylabel("Mean over standard deviation")
                plt.plot(list(range(1, featMaxNbrSFFS + 1)),
                         fitRes['avg_over_std'])
                figName = resultDir+'SFFS_'+clusters+'_bestSet_metric_'+ \
                          str(it)+'_'+str(nbOfSplit)
                plt.savefig(figName, bbox_inches='tight')
                plt.clf()
                plt.close()

        # add metrics iteration identifier
        fitRes = fitRes.add_suffix('_' + str(it + 1))

        CvResult = pd.concat([CvResult, fitRes], axis=1)

        #***************************** TEST ***********************************
        print('Testing...')
        # standardize test set using trainset standardization parameters
        xTestSet = ApplyStandardization(X_test, zPrm)

        # prepare test data
        if skipFS:
            xTest = xTestSet
            savedPlotName = resultDir+clusters+'_ConfusionMatrix_'+str(it+1)+ \
                            '_'+str(nbOfSplit)
        else:
            # Generate a new subset of data according to selected features
            xTest = sffs.transform(xTestSet.values)
            savedPlotName = resultDir+'SFFS_'+clusters+'_ConfusionMatrix_'+ \
                        str(it+1)+'_'+str(nbOfSplit)

        # actually test classifier and compute decoding accuracy on predictions
        y_pred = optimClf.predict(xTest)
        acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
        print('Test set accuracy: %.2f %%' % (acc * 100))
        DA.append(acc)  # stack test DA for further use

        # plot confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        fig_CM = plt.figure(dpi=300)
        plot_confusion_matrix(cm,
                              clusterNames,
                              title=savedPlotName,
                              normalize=True,
                              precision=2)
        plt.clf()
        plt.close(fig_CM)

        #**************** STATISTICAL ASSESSMENT (PERMUTATION) ****************
        if permutation_flag:
            permResults['permutation_DA_' + str(it)] = Permute(
                clusters,
                xTrainSet,
                xTestSet,
                y_train,
                y_test,
                nbPermutation,
                standardizationType,
                debug_flag=0)
            avg_perm_DA.append(
                np.mean(permResults['permutation_DA_' + str(it)]))

    dfDA = pd.DataFrame(data=DA, columns=['DA_test'])
    #    CvResult = pd.concat([CvResult, dfDA[:]], axis=1)
    CvResult = pd.concat([
        CvResult, dfDA[:],
        pd.DataFrame(data=[np.mean(DA)], columns=['avg_DA'])
    ],
                         axis=1)

    #***************** COMPUTE STATISTICAL ASSESSMENT RESULTS *****************
    if permutation_flag:
        # compute permutation DA average and keep results in a dataframe
        print('\nAverage permutation DA')
        for i in list(range(len(avg_perm_DA))):
            print('\t' + str(avg_perm_DA[i]))

        savedHistName = resultDir + 'Average_Permutation_hist_' + clusters + '.png'
        PlotPermHist(permResults, CvResult['avg_DA'].iloc[0], currentDateTime,
                     savedHistName)
        # formatting permutation results to save in excel file
        permResults = pd.concat(
            [permResults, ComputePermutationAvgDA(avg_perm_DA)], axis=1)
        print('Mean permutation decoding accuracy : {}'.format(
            np.mean(permResults['Avg_Permutation_DA_per_epoch'])))
    else:  # binomial law
        from scipy.stats import binom
        q = 0.001  # p value
        n = X.shape[0] + 1  # number of observations (subjects)
        p = 1 / len(clusterNames)  # probability of getting a trial correct by chance
        luckLvl = pd.DataFrame(data=[binom.isf(q, n, p) / n],
                               columns=['Chance_Level'])

#****************************** Compute results *******************************
    if not skipFS:
        # Build structure of histogram data to save in excel
        hist = pd.DataFrame(data=featureNames, columns=['Features_Name'])
        hist['Occurence_Best'] = bestFeaturesHist
        # Search best set across every iteration best set
        best_Combination = tmpBest[np.argmax(DA)]
        # Compute average size of best combination
        l = 0
        for n in list(range(len(tmpBest))):
            l += len(tmpBest[n])
        avgBestCombSize = pd.DataFrame(data=[np.ceil(l / len(tmpBest))],
                                       columns=['avgBestCombSize'])

        #    subsetHist = GetSubsetOccurence(tmpBest)
        #    PlotHist(subsetHist[1],'Subsets occurences',subsetHist[0],'Comb_Hist.png')

        # Get best set's feature names
        tmp = []
        tmp.append(np.max(DA))
        for i in best_Combination:
            tmp.append(featureNames[i])
            print('\t' + featureNames[i])
        bestFeatNames = pd.DataFrame(data=tmp, columns=['Best_Features_Set'])

        sffsRes = pd.concat([hist, bestFeatNames, avgBestCombSize], axis=1)

        # Plot best combination custom metric (mean / std_dev)
        from slpClass_toolbox import PlotBestCombinationMetrics
        filteredData = CvResult.filter(regex=r'avg_over_std_', axis=1)
        metrics = pd.DataFrame(data=filteredData)
        metrics.dropna(inplace=True)
        figName = resultDir + 'SFFS_' + clusters + '_bestSet_metric_aggreg.png'
        PlotBestCombinationMetrics(metrics, figName)

    #save training and permutation results in an excel file
    nbSubject = pd.DataFrame(data=[len(X)], columns=['Number_Of_Subjects'])

    #************************ Build results structure *************************
    excelResults = pd.concat([
        CvResult, permResults if permutation_flag else luckLvl,
        sffsRes if not skipFS else None, removedData, nbSubject
    ],
                             axis=1)

    print('Mean Decoding accuracy :{}'.format(np.mean(DA)))

    # compute occurence of every subset in bestsets of every iteration
    #    from slpClass_toolbox import GetSubsetOccurence
    #    subsetHist = GetSubsetOccurence(tmpBest)
    #    excelResults = pd.concat([excelResults, subsetHist], axis=1)
    #    excelResults.to_excel(saveTo, sheet_name=xlSheetName)

    if fitFeatOverTresh:
        tresh = featureList[0] * nbOfSplit
        bestFeatColumns = hist.iloc[:, 0][hist.iloc[:, 1] > tresh]
        bestDataSet = xx[bestFeatColumns]
        classes = y
        DABestFeat = []
        print('Fitting with features occurring over %d times in best sets' %
              tresh)
        for i in list(range(nbOfSplit)):
            print('\rFit #{} of {}\n'.format(i + 1, nbOfSplit),
                  end='\r',
                  flush=True)
            # Balance the number of old women and old men or not
            if balance_flag:
                XX, YY = BalanceClasses(bestDataSet, classes)
            else:
                XX, YY = bestDataSet, classes

            # split dataset into random train and test subsets
            XXtrain, XXtest, yytrain, yytest = tts(XX,
                                                   YY['Cluster'],
                                                   test_size=0.33,
                                                   stratify=YY['Cluster'])
            # Data z-score standardisation
            xxTrainSet, zzPrm = Standardize(XXtrain, yytrain,
                                            standardizationType, debug_flag)

            # fit and predict on training data
            optimClf = optimClf.fit(xxTrainSet.values, yytrain)
            yPred = optimClf.predict(xxTrainSet.values)
            # Compute accuracy of prediction on training set
            acc = float((yytrain == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))
            fitRes = pd.DataFrame(data=[acc], columns=['CV_DA_' + str(it + 1)])

            # test classifier and compute decoding accuracy on predictions
            xxTestSet = ApplyStandardization(XXtest, zzPrm)
            yypred = optimClf.predict(xxTestSet)
            acc = float((yytest == yypred).sum()) / yypred.shape[0]
            print('Test set accuracy: %.2f %%' % (acc * 100))
            DABestFeat.append(acc)  # stack test DA for further use
            # plot confusion matrix
            cm = confusion_matrix(yytest, yypred)
            fig_CM = plt.figure(dpi=300)
            plot_confusion_matrix(cm,
                                  clusterNames,
                                  title=savedPlotName,
                                  normalize=True,
                                  precision=2)
            plt.clf()
            plt.close(fig_CM)
        df = pd.DataFrame(data=DABestFeat, columns=['optim DA'])
        df = pd.concat([
            df,
            pd.DataFrame(data=[np.mean(DABestFeat)], columns=['optim avg DA'])
        ],
                       axis=1)
        print('Classifier trained with best features (occ > %d) only' % tresh)
        print(df)
        excelResults = pd.concat([excelResults, df], axis=1)

    return excelResults
Example #22
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr

corr_features=correlation(X_train,0.8)
print('correlated features:',len(set(corr_features)))

X_train.drop(labels=corr_features,axis=1,inplace=True)
X_test.drop(labels=corr_features,axis=1,inplace=True)

sfs1=SFS(RandomForestClassifier(n_jobs=4),
         k_features=10,
         forward=True,
         floating=False,
         verbose=2,
         scoring='roc_auc',
         cv=3
         )

sfs1=sfs1.fit(np.array(X_train.fillna(0)),y_train)
select_feat= X_train.columns[list(sfs1.k_feature_idx_)]
select_feat
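
# A minimal usage sketch (not in the original snippet): evaluate a random forest on the
# selected columns only, in the spirit of the helper defined below; the hyperparameters
# here mirror that helper and are otherwise assumptions.
rf_sel = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
rf_sel.fit(X_train[select_feat].fillna(0), y_train)
pred_sel = rf_sel.predict_proba(X_test[select_feat].fillna(0))
print('Random Forests roc_auc on selected features: {}'.format(
    roc_auc_score(y_test, pred_sel[:, 1])))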

def run_randomForests(X_train,X_test,y_train,y_test):
    rf=RandomForestClassifier(n_estimators=200,random_state=39,max_depth=4)
    rf.fit(X_train,y_train)
    print('Train set')
    pred=rf.predict_proba(X_train)
    print('Random Forests roc_auc :{}'.format(roc_auc_score(y_train,pred[:,1])))
    print('Test set')
import warnings
warnings.filterwarnings('ignore')

print(
    "\n\nWrapper-based Method (using K-Nearest Neighbor classifier as the underlying classification algorithm)\n"
)
X = pd.read_csv("glass_features.csv")
y = pd.read_csv("glass_target.csv")

#Let's try numbers of nearest neighbors in the range of the number of features, because the KNN comparisons depend on this value
for n in range(1, 10):
    print('When the number of nearest neighbors selected is', n)
    knn = KNeighborsClassifier(n_neighbors=n)
    # when the forward param is set to False, it performs sequential backward selection, i.e. recursive feature elimination
    # Also, since we passed the string "best" for k_features, as per the docstring it returns the subset with the best cross-validation score
    sbs = SFS(knn, k_features='best', forward=False, scoring='accuracy')
    sbs = sbs.fit(X, y)
    print(
        "Best feature subset selected for KNN when the number of neighbors is",
        n, ':', sbs.k_feature_idx_)
    Data2 = []
    for ig in sbs.k_feature_idx_:
        k = int(ig)
        print(k)
        Data2.append(X.columns[k])
    new_X = X[Data2]
    knn.fit(new_X, y)
    y_predict = knn.predict(new_X)
    #print ('The Corresponding R2 value of Nearest neighbor selected :',n,'is :',knn.score(new_X,y_predict))
    print('The corresponding RMSE value when the number of neighbors selected is', n,
          'is:', math.sqrt(mean_squared_error(y, y_predict)))