max_iter=1000, shuffle=True) # xgb_model.fit(x,y) # y_pred = xgb_model.predict(x_test) # mse = mean_squared_error(y_test, y_pred) # print("MSE: ", mse) # rmse = np.sqrt(mse) # print("RMSE: ", rmse) # print('-'*100) ### Mlxtend Implementation of Sequential Feature Selection sfs = SFS( model, k_features=20, #x.shape[1], forward=True, floating=False, scoring='neg_mean_squared_error', cv=0) sfs = sfs.fit(x, y) # scores = pd.DataFrame.from_dict(sfs.get_metric_dict()).T # fig = plot_sfs(sfs.get_metric_dict(), kind='std_err') # plt.grid() # plt.show() idx = sfs.k_feature_idx_ print(type(idx)) print('Selected features indexes:', sfs.k_feature_idx_) idx_list.append(idx)
loc.append(np.mean(np.array(ref_data))) x.append(loc) if 'F' == row['Type']: y.append(0) elif 'D' == row['Type']: y.append(1) elif 'G' == row['Type']: y.append(2) print(len(x), len(x[0])) print(len(y)) # Create the RFE object and rank each pixel print('Find the right features...') knn = KNeighborsClassifier(n_neighbors=3) sfs1 = SFS(knn, k_features=7, forward=True, floating=True, verbose=2, scoring='accuracy', cv=0, n_jobs=-1) sfs1 = sfs1.fit(np.array(x), np.array(y), custom_feature_names=tuple(fields)) print() pprint(sfs1.subsets_)
# # 0.0 0.99 0.77 0.87 3309 # 1.0 0.04 0.46 0.07 68 # # accuracy 0.77 3377 # macro avg 0.51 0.62 0.47 3377 #weighted avg 0.97 0.77 0.85 3377 # #the auc of logistics is: 0.7029313992142641 #the brier socre is 0.17659280049694137 f_number = 100 sfs = SFS(clfLogisticRegression, k_features=f_number, forward=True, floating=False, scoring='roc_auc', cv=5) print('ok3') result = sfs.fit(X_train, y_train, custom_feature_names=feature_names) #print(X) result.subsets_ result.k_score_ selection_res = pd.DataFrame.from_dict(sfs.get_metric_dict()).T # print(selection_res) selection_res.to_csv("/Users/shuojiawang/Documents/ppdmodel/result1907/selection_log_withouthistorylg.csv", sep='\t') selected_feature_idx = result.k_feature_idx_ #print(type(selected_feature_idx)) selected_feature = list(selected_feature_idx)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from numpy import genfromtxt
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Script: run floating forward feature selection with an SVC on features.csv
# and write the indices of the selected features, comma-separated, to
# selected_floating_features.txt.
if __name__ == "__main__":
    # CSV columns 1-4 are read as the candidate features, column 5 as the label.
    X = genfromtxt('../../../../features.csv', delimiter=',', usecols=range(1, 5))
    y = genfromtxt('../../../../features.csv', delimiter=',', usecols=range(5, 6))

    clf = SVC()
    # Floating forward selection down to 3 features, scored by accuracy.
    # cv=0 is passed through to mlxtend unchanged.
    sfs = SFS(clf, k_features=3, forward=True, floating=True, scoring='accuracy', cv=0)
    sfs = sfs.fit(X, y)

    # `with` guarantees the file is closed even if a write fails; join() emits
    # the same "a,b,c" text (no trailing comma) as the former counter loop.
    with open("../../../../selected_floating_features.txt", "w") as text_file:
        text_file.write(",".join("%s" % feature for feature in sfs.k_feature_idx_))
y_pred_proba_cdt =clfXGboost.predict_proba(X_test)[:, 1] confmat_test_c = confusion_matrix(y_true=y_test, y_pred=y_pred_c) print('confmat_test:\n', confmat_test_c) print('the acc is:', accuracy_score(y_test, y_pred_c)) print('the classification_report:', classification_report(y_test, y_pred_c)) print('the auc of XGboost is:', roc_auc_score(y_test, y_pred_proba_cdt)) feature_names = X.columns.values.tolist() print(feature_names) print('ok2') f_number = 50 sfs = SFS(clfXGboost, k_features=f_number, forward=True, floating=False, scoring='roc_auc', cv=5) print('ok') result2 = sfs.fit(X_train, y_train, custom_feature_names=feature_names) #print(X) result2.subsets_ result2.k_score_ selection_res = pd.DataFrame.from_dict(sfs.get_metric_dict()).T # print(selection_res) selection_res.to_csv("/Users/shuojiawang/Documents/ppdmodel/result1907/selection_log_withhistoryxgboost.csv", sep='\t')
'Annual performance C', 'Age level', 'Marital Status', 'Job tenure level A', 'Job tenure level B', 'Average working years', 'Graduated School', 'Family numbers' ]] # # Sequential Feature Selector # In[28]: X.shape # In[29]: sfsl = SFS(clf_pipeline, k_features=26, forward=True, scoring=make_scorer(fbeta_score, beta=1.5), cv=10) sfsl.fit(X, Y) sfsl.subsets_ # In[36]: X = X[[ 'yyyy', 'Job classification', 'Work experience5', 'Special project', 'Training hours B', 'Training hours C', 'leave this three mon. A', 'leave this year A', 'leave this three mon. B', 'leave this year B', 'Annual performance C', 'Job tenure level A', 'Job tenure level B', 'Family numbers' ]]
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Project : tql-Python. # @File : featureSelector # @Time : 2019-07-26 13:39 # @Author : yuanjie # @Email : [email protected] # @Software : PyCharm # @Description : from mlxtend.feature_selection import SequentialFeatureSelector as SFS from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs from sklearn.neighbors import NearestNeighbors from sklearn.feature_selection import GenericUnivariateSelect, \ SelectPercentile, SelectKBest, f_classif, mutual_info_classif, RFE from lightgbm import LGBMClassifier import matplotlib.pyplot as plt sfs = SFS(LGBMClassifier(), k_features=10, forward=True, floating=False, verbose=2, scoring='roc_auc', cv=5, n_jobs=-1) sfs.fit(X, y)
from sklearn.neighbors import KNeighborsClassifier import math from mlxtend.feature_selection import SequentialFeatureSelector as SFS mnist_dataset = datasets.load_digits() X = mnist_dataset.data Y = mnist_dataset.target target_names = mnist_dataset.target_names train, test, train_targets, test_targets = model_selection.train_test_split( X, Y, train_size=0.5, test_size=0.5) knn = KNeighborsClassifier(round(math.sqrt(train.shape[0] + test.shape[0]))) sfbs = SFS(knn, k_features=round(train.shape[1] * 0.05), forward=True, floating=True, scoring="accuracy", cv=0) sfbs = sfbs.fit(train, train_targets) best_k_features = round(train.shape[1] * 0.05) best_score = sfbs.k_score_ features = 2 for i in range(1, 5): features = features * i sfbs = SFS(knn, k_features=features, forward=False, floating=True, scoring="accuracy", cv=0)
#Separación train y test X_train, X_test, y_train, y_test = train_test_split( df4, df3["descCanalRadicacion"], test_size=0.2, random_state=10, stratify=df3["descCanalRadicacion"]) #ML Model: Model Selection knn = KNeighborsClassifier(n_neighbors=50, weights='distance') sfs1 = SFS(knn, k_features=11, forward=True, floating=False, verbose=1, scoring=make_scorer(f1_score, average='weighted'), cv=5) sfs1 = sfs1.fit(X_train, y_train) X_train_sfs = sfs1.transform(X_train) X_test_sfs = sfs1.transform(X_test) clfKnn_sfs = knn.fit(X_train_sfs, y_train) #clases LE_name_mapping = dict(zip(LE.classes_, LE.transform(LE.classes_))) clases_Canal = (LE_name_mapping) predicted_clases = list(clases_Canal.keys())
'learning_rate': 0.013, 'max_depth': 5, 'nthread': 4, 'silent': 1, 'subsample': 0.463, 'reg_lambda': 0.715, 'gamma': 0.01, 'min_child_weight': 30.4, } estimator = xgb.XGBClassifier(**params_est) sfs1 = SFS(estimator, k_features=(1, 26), forward=True, floating=False, verbose=2, scoring='log_loss', cv=4, n_jobs=4) sfs1 = sfs1.fit(train.as_matrix(), Y) results = pd.DataFrame.from_dict(sfs1.get_metric_dict()).T fig1 = plot_sfs(sfs1.get_metric_dict(), kind='std_dev') plt.title('Sequential Forward Selection (w. StdDev)') plt.grid() plt.show() print(sfs1.subsets_) print(sfs1.k_feature_idx_) print(sfs1.k_score_) """
# In[9]: X = X.values y = y.values # In[11]: from mlxtend.feature_selection import SequentialFeatureSelector as SFS from sklearn.neighbors import KNeighborsClassifier from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs import matplotlib.pyplot as plt from sklearn import linear_model import pandas as pd lr = linear_model.LinearRegression() sfs = SFS(lr, k_features=30, forward=True, floating=False, scoring='r2', cv=4) sfs = sfs.fit(X, y) print('\nSequential Floating Forward Selection (k=30):') print(sfs.k_feature_idx_) print('CV Score:') print(sfs.k_score_) pd.DataFrame.from_dict(sfs.get_metric_dict()).T plt.figure(figsize=(19, 10)) fig = plot_sfs(sfs.get_metric_dict(), kind=None) plt.title('Sequential Forward Selection (R Sqaure)') plt.grid() plt.show()
start_features = df.tail(15) #print(these_choices) #print(df) start_features = list(start_features['feature'].values)[::-1] #test_cols = df.tail(40)['feature'].values for start_feature in start_features: for k_features in range(2, 20): print(start_feature) sfs = SFS( estimator=rfc, k_features=k_features, forward=True, floating=True, verbose=1, scoring='accuracy', n_jobs=15, fixed_features=[start_feature], cv=4, ) start_time = time.time() try: sfs = sfs.fit(X_train[test_cols], y_train) except: continue end_time = time.time() #print() #print(feature_choices, end_time - start_time) best_features = list(sfs.k_feature_names_)
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Metric callables, in the same order as the last four result columns.
metr = [accuracy_score, recall_score, precision_score, f1_score]
result = pd.DataFrame(columns=['N_features', 'Fold', 'Acc', 'Recall', 'Precision', 'F1'])

# For every candidate feature count `s`, take the first `s` entries of
# Fscore.index and evaluate `clf` on each fold produced by `skfold`.
for s in ks:
    selvars = Fscore.index[:s]
    for pos, (train, valid) in enumerate(skfold.split(data[selvars], data['target'])):
        clf.fit(data.iloc[train][selvars], data.iloc[train]['target'])
        y_pred = clf.predict(data.iloc[valid][selvars])
        # One row per (feature count, fold): [s, fold number, acc, recall, precision, f1].
        result.loc[len(result)] = [s, pos + 1] + [m(data.iloc[valid]['target'], y_pred) for m in metr]

# Mean of every metric per feature count. The columns are selected with a
# list: selecting several columns with a bare tuple was deprecated in
# pandas 1.0 and is rejected by pandas 2.x.
result.groupby(['N_features'])[['Acc', 'Recall', 'Precision', 'F1']].mean()

# Group features by the first three characters of their name and rank the
# groups by their mean 'F' value.
Fscore['Group'] = [x[:3] for x in Fscore.index]
group_F = Fscore.groupby(['Group'])['F'].mean()
group_F = group_F.sort_values(ascending=False)

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import sklearn

# Forward (non-floating) selection of 10 features, scored by F1 with 5-fold CV.
clf = LogisticRegression(C=1, max_iter=300)
sfs = SFS(clf, k_features=10, forward=True, floating=False, scoring='f1', cv=5)
sfs.fit(data[cols], data['target'])
sfs.subsets_
from sklearn import datasets
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
import math
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Script: sequential backward selection (forward=False, floating=False) with a
# KNN classifier, trying several target feature counts and reporting the one
# with the highest score.

# Despite the `mnist_` prefix, the data comes from sklearn's load_digits().
mnist_dataset = datasets.load_digits()
X = mnist_dataset.data
Y = mnist_dataset.target
target_names = mnist_dataset.target_names
# 50/50 split; only the training half is used below.
train, test, train_targets, test_targets = model_selection.train_test_split(X, Y, train_size=0.5,test_size=0.5)
# Number of neighbours = round(sqrt(total number of samples)).
knn = KNeighborsClassifier(round(math.sqrt(train.shape[0]+test.shape[0])))
best_k_features=0
best_score=0
features=1
for i in range (1,5):
    # Running product: the feature counts tried are 1, 2, 6 and 24.
    features=features*i
    sbs = SFS(knn, k_features=features, forward=False,floating=False,scoring="accuracy", cv=0)
    sbs = sbs.fit(train, train_targets)
    #print("For number of featres: {0}, best features: {1}, prediction score: {2}".format(features, sffs.k_feature_idx_, sffs.k_score_))
    # Keep the feature count with the strictly highest score seen so far.
    if best_score<sbs.k_score_:
        best_score=sbs.k_score_
        best_k_features=features
print("The best score: {0} for number of features: {1}".format(best_score,best_k_features ))
#data visualization data_numeric = data[['power', 'kilometer', 'brand_amount', 'brand_price_average', 'brand_price_max', 'brand_price_median']] correlation = data_numeric.corr() f , ax = plt.subplots(figsize = (7, 7)) plt.title('Correlation of Numeric Features with Price',y=1,size=16) sns.heatmap(correlation,square = True, vmax=0.8) plt.show() # (2) wrapper from mlxtend.feature_selection import SequentialFeatureSelector as SFS from sklearn.linear_model import LinearRegression sfs = SFS(LinearRegression(), k_features=10, forward=True, floating=False, scoring = 'r2', cv = 0) x = data.drop(['price'], axis=1) x = x.fillna(0) #use LabelEncoder to deal with string data print(x.info()) le = preprocessing.LabelEncoder() for column in x.columns: if x[column].dtype == object: x[column] = le.fit_transform(x[column]) y = data['price'] y = y.fillna(0) sfs.fit(x, y)
def step_feature_selection(keras_est, x_train, y_train, x_test, y_test,
                           features_lower_bound, features_upper_bound, *,
                           scoring='accuracy', cv=0, n_jobs=-1):
    """Run sequential forward selection and reduce both data sets to the chosen features.

    An ``SFS`` selector (forward, non-floating) is fitted on the training data
    with ``k_features=(features_lower_bound, features_upper_bound)`` and then
    used to transform the training and test matrices.

    Side effects:
      * the module-level ``feature_names`` list is replaced by the names of
        the selected features, so a later call indexes into the reduced list;
      * the selected names are written to ``RUNDIR + "feature_names_SFS.csv"``;
      * ``k.clear_session()`` is called (``k`` presumably is the Keras
        backend -- confirm against the file's imports).

    Args:
        keras_est: estimator handed to ``SFS``.
        x_train, y_train: data the selector is fitted on.
        x_test: data that is only transformed.
        y_test: not used by this function; kept for interface compatibility.
        features_lower_bound, features_upper_bound: bounds passed as the
            ``k_features`` tuple.
        scoring, cv, n_jobs: forwarded to ``SFS`` unchanged.

    Returns:
        Tuple ``(x_train_sfs, x_test_sfs)`` with the transformed matrices.
    """
    global feature_names

    # Forward, non-floating selection within the requested size bounds.
    selector = SFS(keras_est,
                   k_features=(features_lower_bound, features_upper_bound),
                   forward=True,
                   floating=False,
                   verbose=2,
                   scoring=scoring,
                   cv=cv,
                   n_jobs=n_jobs)
    selector = selector.fit(x_train, y_train)

    # Keep only the chosen feature columns in both data sets.
    x_train_sfs = selector.transform(x_train)
    x_test_sfs = selector.transform(x_test)

    # Map the selected column indices back to names and persist them.
    feature_names = [feature_names[i] for i in selector.k_feature_idx_]
    pd.DataFrame(feature_names).to_csv(RUNDIR + "feature_names_SFS.csv", index=False)

    k.clear_session()
    return x_train_sfs, x_test_sfs
X_test = scaler.transform(X_test) print(labels) print('X_train shape:', X_train.shape) print('y_train shape:', y_train.shape) print('X_test shape:', X_test.shape) print('y_test shape:', y_test.shape) # Use sequential feature selection to decide what features to use. Grid search to determine best hyperparameter values. # In[3]: knn = KNeighborsRegressor() sfs1 = SFS(estimator=knn, k_features='best', forward=False, floating=True, cv=5) pipe = Pipeline([('sfs', sfs1), ('knn', knn)]) param_grid = [{ 'sfs__estimator__n_neighbors': range(1, len(X_idx)), 'sfs__estimator__weights': ['distance', 'uniform'], 'sfs__estimator__metric': ['euclidean', 'manhattan', 'chebyshev'] }] gs = GridSearchCV(estimator=pipe, param_grid=param_grid, n_jobs=-1, cv=5,
def objective_function(args):
    """Evaluate one search-space sample and return the loss to minimise.

    The function preprocesses the module-level ``dataset``, builds the
    regressor named by ``args['model']`` from ``args['params']``, wraps it in
    a forward, non-floating ``SFS`` selector (``cv=4``) and scores it.

    Relies on module-level names that are not defined here: ``dataset``,
    ``target``, ``eval_metric``, ``metrics_names``, ``percent_train``,
    ``length_train``, ``n_features`` and the preprocessing helpers.

    Args:
        args: mapping with the keys ``'n_components'``, ``'quantiles'``,
            ``'preprocessing'``, ``'model'``, ``'params'`` and (unless the
            preprocessing is ``'PCA'``) ``'k_features'``.

    Returns:
        When ``eval_metric == 'mse'``: the mean squared error of the fitted
        estimator on the rows after ``length_train``. Otherwise:
        ``1 - reg.k_score_`` of the fitted selector.
    """
    n_components = args['n_components']
    quantiles = args['quantiles']
    preprocessing = args['preprocessing']

    # NOTE(review): 'MinMaxScaler' calls the same helper as 'NoTransform' in
    # the original code; confirm that this is intended.
    if preprocessing in ('NoTransform', 'MinMaxScaler'):
        X, Y, scaler = transform(dataset)
    elif preprocessing == 'StandardScaler':
        X, Y, scaler = standard_scaler(dataset)
    elif preprocessing == 'RobustScaler':
        X, Y, scaler = robust_scaler(dataset)
    elif preprocessing == 'QuantileTransformer':
        X, Y, scaler = quantile_transformer(dataset, quantiles)
    elif preprocessing == 'PowerTransformer':
        X, Y, scaler = power_transformer(dataset)
    elif preprocessing == 'PCA':
        X, Y, scaler = pca_transform(dataset, n_components)

    # With PCA every component is kept; otherwise the sampled count is used.
    if preprocessing != 'PCA':
        k_features = args['k_features']
    else:
        k_features = X.shape[1]

    model = args['model']
    # Extra keyword arguments for SFS; only the xgboost branch sets any.
    sfs_kwargs = {}

    if model == RandomForestRegressor:
        params = args['params']
        estimator = RandomForestRegressor(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            min_samples_leaf=params['min_samples_leaf'],
            max_leaf_nodes=params['max_leaf_nodes'],
            min_weight_fraction_leaf=params['min_weight_fraction_leaf'],
            max_features=params['max_features'],
            n_jobs=-1)
    elif model == AdaBoostRegressor:
        params = args['params']
        base = params['base_estimator']
        estimator = AdaBoostRegressor(
            base_estimator=DecisionTreeRegressor(
                max_depth=base['max_depth'],
                min_samples_split=base['min_samples_split'],
                min_samples_leaf=base['min_samples_leaf'],
                min_weight_fraction_leaf=base['min_weight_fraction_leaf'],
                max_features=base['max_features']),
            learning_rate=params['learning_rate'],
            n_estimators=params['n_estimators'],
            loss=params['loss'])
    elif model == ExtraTreesRegressor:
        params = args['params']
        estimator = ExtraTreesRegressor(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            max_features=params['max_features'],
            max_leaf_nodes=params['max_leaf_nodes'],
            min_weight_fraction_leaf=params['min_weight_fraction_leaf'],
            min_samples_leaf=params['min_samples_leaf'],
            n_jobs=-1)
    elif model == GradientBoostingRegressor:
        params = args['params']
        estimator = GradientBoostingRegressor(
            loss=params['loss'],
            n_estimators=params['n_estimators'],
            subsample=params['subsample'],
            min_samples_split=params['min_samples_split'],
            learning_rate=params['learning_rate'],
            max_depth=params['max_depth'],
            tol=params['tol'])
    elif model == SGDRegressor:
        params = args['params']
        estimator = SGDRegressor(
            loss=params['loss'],
            penalty=params['penalty'],
            alpha=params['alpha'],
            max_iter=13000,
            l1_ratio=params['l1_ratio'],
            tol=params['tol'],
            learning_rate=params['learning_rate'],
            power_t=params['power_t'])
    elif model == ElasticNet:
        params = args['params']
        estimator = ElasticNet(alpha=params['alpha'],
                               l1_ratio=params['l1_ratio'],
                               tol=params['tol'])
    elif model == Ridge:
        params = args['params']
        estimator = Ridge(alpha=params['alpha'],
                          tol=params['tol'],
                          solver=params['solver'])
    elif model == KNeighborsRegressor:
        params = args['params']
        estimator = KNeighborsRegressor(
            n_neighbors=params['n_neighbors'],
            weights=params['weights'],
            algorithm=params['algorithm'],
            leaf_size=params['leaf_size'],
            p=params['p'],
            n_jobs=-1)
    elif model == GaussianProcessRegressor:
        params = args['params']
        estimator = GaussianProcessRegressor(alpha=params['alpha'])
    elif model == SVR:
        params = args['params']
        kernel = params['kernel']
        degree = params['degree'] if kernel == 'poly' else 3
        # The original test `kernel == 'rbf' or 'poly' or 'sigmoid'` was
        # always true (non-empty strings are truthy), so the 'auto' fallback
        # could never be reached. A membership test expresses the intent.
        if kernel in ('rbf', 'poly', 'sigmoid'):
            gamma = params['gamma']
        else:
            gamma = 'auto'
        estimator = SVR(kernel=kernel,
                        degree=degree,
                        gamma=gamma,
                        tol=params['tol'],
                        C=params['C'],
                        shrinking=params['shrinking'])
    elif model == xgb:
        params = args['params']
        booster = params['booster']
        xgb_kwargs = dict(
            booster=booster,
            eta=params['eta'],
            gamma=params['gamma'],
            max_depth=params['max_depth'],
            n_estimators=params['n_estimators'],
            min_child_weight=params['min_child_weight'],
            subsample=params['subsample'],
            alpha=params['alpha'],
            random_state=params['random_state'],
            colsample_bytree=params['colsample_bytree'],
            colsample_bylevel=params['colsample_bylevel'],
            colsample_bynode=params['colsample_bynode'],
            reg_lambda=params['reg_lambda'],
            grow_policy=params['grow_policy'],
            n_jobs=-1)
        if booster == 'dart':
            # Dropout-specific settings exist only for the dart booster.
            xgb_kwargs.update(
                sample_type=params['sample_type'],
                normalize_type=params['normalize_type'],
                rate_drop=params['rate_drop'],
                skip_drop=params['skip_drop'])
            num_round = 50

        # For xgboost under PCA the feature count is sampled instead of
        # being fixed to the number of components.
        if preprocessing != 'PCA':
            k_features = args['k_features']
        else:
            k_features = sample(scope.int(hp.quniform('k_features', 1, X.shape[1], 1)))

        # NOTE(review): any other booster leaves `estimator` undefined, as in
        # the original code, and fails below with a NameError.
        if booster in ('gbtree', 'dart'):
            estimator = xgb.XGBRegressor(**xgb_kwargs)
            sfs_kwargs['scoring'] = metrics_names[eval_metric]

    # Same selector configuration for every model.
    reg = SFS(estimator, cv=4, k_features=k_features, forward=True,
              floating=False, **sfs_kwargs)

    if eval_metric != 'mse':
        reg.fit(X, Y)
        print('Model: {}, r2 value: {}, Selected variables {}'.format(args['model'], reg.k_score_, reg.k_feature_names_))
        loss_function = 1 - reg.k_score_
        return loss_function

    # Only the training targets of the chronological (unshuffled) split are used.
    _, _, y_train, _ = train_test_split(X, Y, test_size=1 - percent_train,
                                        random_state=1, shuffle=False)
    sfsl = reg.fit(X, Y)
    x_sfs = sfsl.transform(X)
    x_train_sfs = x_sfs[:length_train]
    x_test_sfs = x_sfs[length_train:]
    estimator.fit(x_train_sfs, y_train)

    if args['model'] == xgb:
        if booster == "gbtree":
            y_pred = estimator.predict(x_test_sfs)
        elif booster == "dart":
            y_pred = estimator.predict(x_test_sfs, ntree_limit=num_round)
    else:
        y_pred = estimator.predict(x_test_sfs)

    if preprocessing != 'NoTransform':
        # Undo the scaling: place the predictions in column 0 of a zero
        # matrix of the scaler's width and invert the transform.
        predictions = y_pred.reshape(-1, 1)
        width = n_features if preprocessing != 'PCA' else X.shape[1]
        for i in range(predictions.shape[1]):
            tmp = np.zeros((predictions.shape[0], width))
            tmp[:, 0] = predictions[:, i]
            predictions[:, i] = scaler.inverse_transform(tmp)[:, 0]
    else:
        predictions = y_pred

    mse = mean_squared_error(dataset[target][length_train:], predictions)
    print('mse value: {}, model: {}'.format(mse, args['model']))
    return mse
def _run_reduced_feature_models(feature_train, class_label, dropped_columns, tuning_banner):
    """Drop columns, re-split, print the model table, then tune on the reduced training set."""
    reduced = feature_train.drop(dropped_columns, axis=1)
    # Only the training part of the split is used.
    x_train, _, y_train, _ = train_test_split(reduced,
                                              class_label,
                                              test_size=0.2,
                                              random_state=10)
    print(runModels(x_train, y_train))
    print(tuning_banner)
    hyperParameter(x_train, y_train)


def main():
    """Run the bank-data experiment: baseline models, then three feature-selection studies.

    Reads the bank CSV, preprocesses it, reports baseline models, repeats the
    evaluation after dropping hand-picked columns, and finally compares
    Recursive Feature Elimination, Extra-Trees feature importance and
    Sequential Feature Selection. The columns dropped after each technique
    are hard-coded lists, not derived from the fitted selectors.
    """
    # read the data using pandas
    bank_data = pd.read_csv("E:/Study/AI_Sem1/ML/bank.csv", delimiter=",")

    # Run pre-processing on data frame
    feature_train, class_label = performPreprocessing(bank_data)

    # ----------------- create baseline models ------------------------- #
    # split the data into test and train (the test part is not used here)
    X_train, _, Y_train, _ = train_test_split(feature_train,
                                              class_label,
                                              test_size=0.2,
                                              random_state=11)
    print(
        '-------------------------------- Baseline Models before Feature Selection Pre-processing --------------------------------------'
    )
    # run the models and the metrics report without feature selection
    model_df = runModels(X_train, Y_train)
    print(model_df)
    report(X_train, Y_train)

    # ----------------- create models after applying feature selection technique in pre-processing ------------------------- #
    # feature selection is run on the full data, before splitting
    featureSelection(feature_train, class_label)

    # remove features which have lower importance ranking
    feature_train1 = feature_train.drop(['marital', 'default', 'loan'], axis=1)
    X_train1, _, Y_train1, _ = train_test_split(feature_train1,
                                                class_label,
                                                test_size=0.2,
                                                random_state=10)
    print(
        '--------------------------------Models After Feature Selection --------------------------------------'
    )
    model_df_feature = runModels(X_train1, Y_train1)
    print(model_df_feature)
    report(X_train1, Y_train1)
    print(
        '-------------------------------- Hyper Parameter optimization on top 3 models ----------------------------------------'
    )
    hyperParameter(X_train1, Y_train1)

    print(
        '-------------------------------- Research Topic - Feature Selection -----------------------------------'
    )

    # 1. Recursive Feature Elimination
    print(
        '-------------------------------- 1. Recursive Feature Elimination -------------------------------------'
    )
    # Logistic Regression provides the per-feature scores; keep 10 attributes.
    # n_features_to_select is passed by keyword: newer scikit-learn releases
    # no longer accept it positionally.
    model = LogisticRegression()
    rfe = RFE(model, n_features_to_select=10)
    rfe = rfe.fit(X_train, Y_train)
    # summarize the selection of the attributes
    print(rfe.support_)
    print(rfe.ranking_)
    # plot the ranking
    plt.bar(range(len(rfe.ranking_)), rfe.ranking_)
    plt.show()
    _run_reduced_feature_models(
        feature_train, class_label,
        ['age', 'job', 'balance', 'day', 'duration', 'pdays'],
        '-------------- Hyper Parameter optimization on top 3 models for Recursive Feature Elimination technique ------------------'
    )

    # 2. Feature Importance
    print(
        '-------------------------------- 2. Feature Importance -------------------------------------'
    )
    # fit an Extra Trees model to the data
    model = ExtraTreesClassifier()
    model.fit(X_train, Y_train)
    # display and plot the relative importance of each feature
    print('Score values of each fetaure: ', model.feature_importances_)
    plt.bar(range(len(model.feature_importances_)), model.feature_importances_)
    plt.show()
    _run_reduced_feature_models(
        feature_train, class_label,
        ['marital', 'education', 'default', 'housing', 'loan', 'duration',
         'pdays', 'previous'],
        '------------------- Hyper Parameter optimization on top 3 models for Feature Importance technique ----------------------'
    )

    # 3. Sequential Feature Selector
    print(
        '-------------------------------- 3. Sequential Feature Selector -------------------------------------'
    )
    sfs1 = SFS(KNeighborsClassifier(),
               k_features=10,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=0)
    sfs1 = sfs1.fit(X_train, Y_train)
    print('Indices of the 10 best features: ', sfs1.k_feature_idx_)
    _run_reduced_feature_models(
        feature_train, class_label,
        ['age', 'job', 'education', 'balance', 'day', 'campaign'],
        '------------------- Hyper Parameter optimization on top 3 models for Sequential Feature Selector ----------------------'
    )
def main():
    """Run every SFS variant on the EEG data and pickle the selected feature indices.

    For each label transform option, each selection mode and each target
    column, a linear SVC is wrapped in an ``SFS`` selector
    (``k_features='best'``, accuracy, 4-fold CV) and the resulting
    ``k_feature_idx_`` is pickled to ``sequence/eeg_<option>_<selection>_<target>``.

    The two characters of a selection code are flags: the first enables
    forward selection (otherwise backward), the second enables floating.
    """
    options = ['5', 'mean', 'median']
    targets = [0, 1]
    selections = ['00', '01', '10', '11']
    filename = 'EEG.csv'

    train, label = load(filename)
    train = norm(train)

    ############### define classifier ##################
    clf = SVC(C=0.25, kernel='linear')

    for option in options:
        # NOTE(review): `label` is overwritten here, so the 'mean' and
        # 'median' options operate on labels already transformed by the
        # previous option rather than on the loaded labels. Behaviour is
        # kept as in the original; confirm whether this is intended.
        label, _ = transform(label, type=option)
        for selection in selections:
            for target in targets:
                forward = selection[0] == '1'
                floating = selection[1] == '1'

                # Progress header, e.g. "forward floating selection --- target: arousal".
                print('')
                print('forward ' if forward else 'backward ', end='')
                if floating:
                    print('floating ', end='')
                print('selection --- target:', end='')
                if target == 0:
                    print(' arousal')
                elif target == 1:
                    print(' valence')
                else:
                    print('target error ({})'.format(target))
                    sys.exit()
                print('')

                ############### target the label ##################
                # 0 : arousal, 1 : valence
                train_y = label[:, target].reshape(-1)

                sfs = SFS(clf,
                          k_features='best',
                          forward=forward,
                          floating=floating,
                          scoring='accuracy',
                          cv=4,
                          n_jobs=-1,
                          verbose=1)
                sfs.fit(train, train_y)

                # Save the selected indices; `with` closes the file even if
                # pickling raises.
                with open('sequence/eeg_{}_{}_{}'.format(option, selection, target),
                          "wb") as pickle_on:
                    pickle.dump(sfs.k_feature_idx_, pickle_on)
def ExecuteSFFS(x, y, featureNames, featureList, clusters, clusterNames, svc,
                kFolds, nbOfSplit, featMaxNbrSFFS, standardizationType,
                removedData, permutation_flag, nbPermutation, balance_flag,
                currentDateTime, resultDir, debug_flag, verbose):
    """Repeat train/test splits with optional sequential feature selection.

    For each of ``nbOfSplit`` random stratified splits (33 % test) the
    function standardizes the training data, tunes ``svc`` with a randomized
    search, optionally runs a forward sequential feature selection, evaluates
    the classifier on the test split, saves diagnostic plots under
    ``resultDir`` and optionally runs a permutation test.  All per-split
    results are gathered column-wise in one DataFrame.

    Parameters
    ----------
    x : feature table; indexed by column label (``x[featureList]``), so
        presumably a pandas DataFrame.
    y : label table; its ``'Cluster'`` column is used as the class label.
    featureNames : sequence of feature names; indexed by the positions
        returned by the selector and used to size the selection histogram.
    featureList : controls the mode of operation:
        * empty -> use every feature and run feature selection;
        * first element is a ``float`` -> same as above, then refit using only
          the features selected more than ``featureList[0] * nbOfSplit``
          times;
        * otherwise -> column labels to keep; feature selection is skipped.
    clusters : string inserted in output file names and passed to ``Permute``.
    clusterNames : class names for the confusion matrix; their count is used
        as the number of classes for the chance level.
    svc : estimator handed to ``RandomizedSearchCV``.
    kFolds : ``cv`` argument of the sequential feature selector.
    nbOfSplit : number of random train/test splits.
    featMaxNbrSFFS : upper bound of the selector's ``k_features`` range.
    standardizationType : forwarded to ``Standardize`` and ``Permute``.
    removedData : appended as-is to the returned results.
    permutation_flag : run the permutation assessment when true; otherwise a
        binomial chance level is computed.
    nbPermutation : number of permutations forwarded to ``Permute``.
    balance_flag : balance the classes with ``BalanceClasses`` when true.
    currentDateTime : forwarded to ``PlotPermHist``.
    resultDir : path prefix concatenated directly with file names, so it must
        already end with a path separator.
    debug_flag : forwarded to ``Standardize``.
    verbose : currently unused.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the per-split metrics, the permutation
        results or chance level, the feature-selection summary (when selection
        ran), ``removedData`` and the number of subjects; plus the refit
        accuracies in the threshold mode.
    """
    import numpy as np
    import pandas as pd
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded.
    import scipy.stats
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split as tts
    from sklearn.metrics import confusion_matrix
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
    from sklearn.model_selection import RandomizedSearchCV
    from slpClass_toolbox import BalanceClasses
    from slpClass_toolbox import Standardize
    from slpClass_toolbox import Permute
    from slpClass_toolbox import ComputePermutationAvgDA
    from slpClass_toolbox import PlotPermHist
    from slpClass_toolbox import ApplyStandardization
    from slpClass_toolbox import plot_confusion_matrix

    plt.rcParams.update({'figure.max_open_warning': 0})

    bestFeaturesHist = np.zeros([len(featureNames)])
    CvResult = pd.DataFrame()
    permResults = pd.DataFrame()
    tmpBest = []
    DA = []
    avg_perm_DA = []
    skipFS = False  # flag to skip feature selection
    fitFeatOverTresh = False  # fit classifier with most frequent features in best set

    #********************** TRAIN pre-procesing *******************************
    for it in range(nbOfSplit):
        print('\nSplit #{}'.format(str(it)))

        # Use all features or given ones only
        if len(featureList) == 0:
            xx = x
        elif isinstance(featureList[0], float):
            xx = x
            fitFeatOverTresh = True
        else:
            xx = x[featureList]
            skipFS = True

        # Balance the classes or not
        if balance_flag:
            X, Y = BalanceClasses(xx, y)
        else:
            X, Y = xx, y

        # split dataset into train and test random subsets
        X_train, X_test, y_train, y_test = tts(X,
                                               Y['Cluster'],
                                               test_size=0.33,
                                               stratify=Y['Cluster'])

        # Data z-score standardisation
        xTrainSet, zPrm = Standardize(X_train, y_train, standardizationType,
                                      debug_flag)

        #**************************** SVM optimisation ************************
        params_dict = {
            'C': scipy.stats.expon(scale=100),
            'kernel': ['linear'],
            'class_weight': ['balanced', None]
        }
        n_iter_search = 20
        random_search = RandomizedSearchCV(svc,
                                           param_distributions=params_dict,
                                           n_iter=n_iter_search)
        random_search.fit(xTrainSet, y_train)
        optimClf = random_search.best_estimator_

        #*************************** TRAIN ************************************
        print('Fitting...')
        # The selector works on numpy arrays.  ``DataFrame.as_matrix`` was
        # removed in pandas 1.0, so convert with ``np.asarray`` instead.
        xTrainArr = np.asarray(xTrainSet)
        if skipFS:
            optimClf = optimClf.fit(xTrainArr, y_train)
            yPred = optimClf.predict(xTrainArr)

            # Compute the accuracy of the prediction on the training set
            acc = float((y_train == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))
            fitRes = pd.DataFrame(data=[acc],
                                  columns=['CV_DA_' + str(it + 1)])
        else:
            # set k_features = (1,X.shape[1]) to test all possible combinations
            sffs = SFS(optimClf,
                       k_features=(1, featMaxNbrSFFS),
                       forward=True,
                       floating=False,
                       scoring='accuracy',
                       cv=kFolds,
                       n_jobs=-1)
            sffs = sffs.fit(xTrainArr, y_train)
            print('Best combination for fit #%d (ACC: %.3f): %s' %
                  (it, sffs.k_score_, sffs.k_feature_idx_))

            # Fit the estimator using the new feature subset
            X_train_sfs = sffs.transform(xTrainArr)
            optimClf.fit(X_train_sfs, y_train)

            fitRes = pd.DataFrame.from_dict(sffs.get_metric_dict()).T
            fitRes['avg_over_std'] = fitRes['avg_score'] / fitRes['std_dev']

            # Record the best subset of this split unconditionally: the
            # summary after the loop indexes ``tmpBest`` by split, whether or
            # not the plots below are produced.
            tmpBest.append(sffs.k_feature_idx_)
            bestFeaturesHist[list(sffs.k_feature_idx_)] += 1

            if featMaxNbrSFFS > 1:
                # plot feature selection process metrics
                fig1 = plot_sfs(sffs.get_metric_dict(), kind='std_err')
                savedPlotName = (resultDir + 'Decoding_accuracy_' + clusters +
                                 '_' + str(it) + '_' + str(nbOfSplit) + '.png')
                fig1.set_dpi(300)
                plt.tight_layout()
                plt.savefig(savedPlotName, bbox_inches='tight')
                plt.clf()
                plt.close(fig1)

                # plot mean / std
                plt.figure(dpi=300)
                plt.title('Moyenne sur ecart-type')
                plt.xlabel("nb attributs dans combinaison")
                plt.xticks(range(featMaxNbrSFFS))
                plt.ylabel("Moyenne sur ecart-type")
                plt.plot(list(range(1, featMaxNbrSFFS + 1)),
                         fitRes['avg_over_std'])
                figName = (resultDir + 'SFFS_' + clusters +
                           '_bestSet_metric_' + str(it) + '_' + str(nbOfSplit))
                plt.savefig(figName, bbox_inches='tight')
                plt.clf()
                plt.close()

            # add metrics iteration identifier
            fitRes = fitRes.add_suffix('_' + str(it + 1))

        CvResult = pd.concat([CvResult, fitRes], axis=1)

        #***************************** TEST ***********************************
        print('Testing...')
        # standardize test set using trainset standardization parameters
        xTestSet = ApplyStandardization(X_test, zPrm)

        # prepare test data
        if skipFS:
            xTest = xTestSet
            savedPlotName = (resultDir + clusters + '_ConfusionMatrix_' +
                             str(it + 1) + '_' + str(nbOfSplit))
        else:
            # Generate a new subset of data according to selected features
            xTest = sffs.transform(np.asarray(xTestSet))
            savedPlotName = (resultDir + 'SFFS_' + clusters +
                             '_ConfusionMatrix_' + str(it + 1) + '_' +
                             str(nbOfSplit))

        # actually test classifier and compute decoding accuracy on predictions
        y_pred = optimClf.predict(xTest)
        acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
        print('Test set accuracy: %.2f %%' % (acc * 100))
        DA.append(acc)  # stack test DA for further use

        # plot confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        fig_CM = plt.figure(dpi=300)
        plot_confusion_matrix(cm,
                              clusterNames,
                              title=savedPlotName,
                              normalize=True,
                              precision=2)
        plt.clf()
        plt.close(fig_CM)

        #**************** STATISTICAL ASSESSMENT (PERMUTATION) ****************
        if permutation_flag:
            permResults['permutation_DA_' + str(it)] = Permute(
                clusters,
                xTrainSet,
                xTestSet,
                y_train,
                y_test,
                nbPermutation,
                standardizationType,
                debug_flag=0)
            avg_perm_DA.append(
                np.mean(permResults['permutation_DA_' + str(it)]))

    dfDA = pd.DataFrame(data=DA, columns=['DA_test'])
    CvResult = pd.concat([
        CvResult, dfDA[:],
        pd.DataFrame(data=[np.mean(DA)], columns=['avg_DA'])
    ],
                         axis=1)

    #***************** COMPUTE STATISTICAL ASSESSMENT RESULTS *****************
    if permutation_flag:
        # compute permutation DA average and keep results in a dataframe
        print('\nAverage permutation DA')
        for permAvg in avg_perm_DA:
            print('\t' + str(permAvg))

        savedHistName = (resultDir + 'Average_Permutation_hist_' + clusters +
                         '.png')
        PlotPermHist(permResults, CvResult['avg_DA'].iloc[0], currentDateTime,
                     savedHistName)

        # formatting permutation results to save in excel file
        permResults = pd.concat(
            [permResults,
             ComputePermutationAvgDA(avg_perm_DA)], axis=1)
        print('Mean permutation decoding accuracy : {}'.format(
            np.mean(permResults['Avg_Permutation_DA_per_epoch'])))
    else:
        # binomial law
        from scipy.stats import binom
        q = 0.001  # p value
        n = X.shape[0] + 1  # number of observations (subjects)
        p = 1 / len(clusterNames)  # probability of one trial being correct
        # The DataFrame keyword is ``data``; the former ``date=`` spelling
        # raised a TypeError.
        luckLvl = pd.DataFrame(data=[binom.isf(q, n, p) / n],
                               columns=['Chance_Level'])

    #****************************** Compute results ***************************
    if not skipFS:
        # Build structure of histogram data to save in excel
        hist = pd.DataFrame(data=featureNames, columns=['Features_Name'])
        hist['Occurence_Best'] = bestFeaturesHist

        # Search best set across every iteration best set
        best_Combination = tmpBest[np.argmax(DA)]

        # Compute average size of best combination
        totalSize = sum(len(subset) for subset in tmpBest)
        avgBestCombSize = pd.DataFrame(
            data=[np.ceil(totalSize / len(tmpBest))],
            columns=['avgBestCombSize'])

        # Get best set's feature names (first row holds the best test DA)
        tmp = [np.max(DA)]
        for i in best_Combination:
            tmp.append(featureNames[i])
            print('\t' + featureNames[i])
        bestFeatNames = pd.DataFrame(data=tmp, columns=['Best_Features_Set'])

        sffsRes = pd.concat([hist, bestFeatNames, avgBestCombSize], axis=1)

        # Plot best combination custom metric (mean / std_dev)
        from slpClass_toolbox import PlotBestCombinationMetrics
        filteredData = CvResult.filter(regex=r'avg_over_std_', axis=1)
        metrics = pd.DataFrame(data=filteredData)
        metrics.dropna(inplace=True)
        figName = resultDir + 'SFFS_' + clusters + '_bestSet_metric_aggreg.png'
        PlotBestCombinationMetrics(metrics, figName)

    # save training and permutation results in an excel file
    nbSubject = pd.DataFrame(data=[len(X)], columns=['Number_Of_Subjects'])

    #************************ Build results structure *************************
    resultParts = [CvResult, permResults if permutation_flag else luckLvl]
    if not skipFS:
        resultParts.append(sffsRes)
    resultParts.extend([removedData, nbSubject])
    excelResults = pd.concat(resultParts, axis=1)
    print('Mean Decoding accuracy :{}'.format(np.mean(DA)))

    if fitFeatOverTresh:
        # Keep only the features selected in more than the requested fraction
        # of the splits, then refit and test on fresh random splits.
        tresh = featureList[0] * nbOfSplit
        bestFeatColumns = hist.iloc[:, 0][hist.iloc[:, 1] > tresh]
        bestDataSet = xx[bestFeatColumns]
        classes = y
        DABestFeat = []
        print('Fitting with features occuring over %d times in best sets' %
              tresh)
        for i in range(nbOfSplit):
            print('\rFit #{} of {}\n'.format(i + 1, nbOfSplit),
                  end='\r',
                  flush=True)

            # Balance the classes or not
            if balance_flag:
                XX, YY = BalanceClasses(bestDataSet, classes)
            else:
                XX, YY = bestDataSet, classes

            # split dataset into train and test random subsets
            XXtrain, XXtest, yytrain, yytest = tts(XX,
                                                   YY['Cluster'],
                                                   test_size=0.33,
                                                   stratify=YY['Cluster'])

            # Data z-score standardisation
            xxTrainSet, zzPrm = Standardize(XXtrain, yytrain,
                                            standardizationType, debug_flag)

            # fit and predict on training data
            xxTrainArr = np.asarray(xxTrainSet)
            optimClf = optimClf.fit(xxTrainArr, yytrain)
            yPred = optimClf.predict(xxTrainArr)

            # Compute accuracy of prediction on training set
            acc = float((yytrain == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))

            # test classifier and compute decoding accuracy on predictions
            xxTestSet = ApplyStandardization(XXtest, zzPrm)
            yypred = optimClf.predict(xxTestSet)
            acc = float((yytest == yypred).sum()) / yypred.shape[0]
            print('Test set accuracy: %.2f %%' % (acc * 100))
            DABestFeat.append(acc)  # stack test DA for further use

            # plot confusion matrix
            # NOTE(review): the title reuses the name computed for the last
            # split of the main loop - confirm this is intended.
            cm = confusion_matrix(yytest, yypred)
            fig_CM = plt.figure(dpi=300)
            plot_confusion_matrix(cm,
                                  clusterNames,
                                  title=savedPlotName,
                                  normalize=True,
                                  precision=2)
            plt.clf()
            plt.close(fig_CM)

        df = pd.DataFrame(data=DABestFeat, columns=['optim DA'])
        df = pd.concat([
            df,
            pd.DataFrame(data=[np.mean(DABestFeat)], columns=['optim avg DA'])
        ],
                       axis=1)
        print('Classifier trained with best features (occ > %d) only' % tresh)
        print(df)
        excelResults = pd.concat([excelResults, df], axis=1)

    return excelResults
for j in range(i): if abs(corr_matrix.iloc[i,j])>threshold: colname=corr_matrix.columns[i] col_corr.add(colname) return col_corr corr_features=correlation(X_train,0.8) print('correlated features:',len(set(corr_features))) X_train.drop(labels=corr_features,axis=1,inplace=True) X_test.drop(labels=corr_features,axis=1,inplace=True) sfs1=SFS(RandomForestClassifier(n_jobs=4), k_features=10, forward=True, floating=False, verbose=2, scoring='roc_auc', cv=3 ) sfs1=sfs1.fit(np.array(X_train.fillna(0)),y_train) select_feat= X_train.columns[list(sfs1.k_feature_idx_)] select_feat def run_randomForests(X_train,X_test,y_train,y_test): rf=RandomForestClassifier(n_estimators=200,random_state=39,max_depth=4) rf.fit(X_train,y_train) print('Train set') pred=rf.predict_proba(X_train) print('Random Forests roc_auc :{}'.format(roc_auc_score(y_train,pred[:,1]))) print('Test set')
import warnings

# Silence every warning for the rest of the run.
warnings.filterwarnings('ignore')

print(
    "\n\nWrapper-based Method (using K-Nearest Neighbor classifier as the underlying classification algorithm)\n"
)

X = pd.read_csv("glass_features.csv")
y = pd.read_csv("glass_target.csv")

# Try every neighbour count from 1 to 9: the value changes which samples the
# KNN classifier compares against, and therefore which features get selected.
for n in range(1, 10):
    print('When The Number of nearest neighbors selected are', n)
    knn = KNeighborsClassifier(n_neighbors=n)

    # forward=False runs sequential backward selection, and
    # k_features='best' keeps the subset with the best cross-validation score.
    sbs = SFS(knn, k_features='best', forward=False, scoring='accuracy')
    sbs = sbs.fit(X, y)
    chosen_idx = sbs.k_feature_idx_
    print(
        "Best features Subset by SFS for this KNN algorithm when selected number of neighbors are : ",
        n, 'are :', chosen_idx)

    # Translate the selected positions into column labels, echoing each one.
    Data2 = []
    for ig in chosen_idx:
        k = int(ig)
        print(k)
        Data2.append(X.columns[k])

    # Refit on the selected columns only and score on the same data.
    new_X = X[Data2]
    knn.fit(new_X, y)
    y_predict = knn.predict(new_X)

    rmse = math.sqrt(mean_squared_error(y, y_predict))
    print('The Corresponding RMSE valueof Nearest neighbor selected :', n,
          'is :', rmse)