def test_knn_cv3_groups():
    """SFS with GroupKFold CV and a `groups` argument forwarded to fit()."""
    iris = load_iris()
    X, y = iris.data, iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(knn,
                   k_features=3,
                   forward=True,
                   floating=False,
                   cv=GroupKFold(n_splits=3),
                   verbose=0)
    np.random.seed(1630672634)
    groups = np.random.randint(0, 6, size=len(y))
    selector = selector.fit(X, y, groups=groups)
    # print(selector.subsets_)
    expected = {
        1: {'cv_scores': np.array([0.97916667, 0.93877551, 0.96226415]),
            'feature_idx': (3,),
            'avg_score': 0.9600687759380482},
        2: {'cv_scores': np.array([0.95833333, 0.93877551, 0.98113208]),
            'feature_idx': (1, 3),
            'avg_score': 0.9594136396697044},
        3: {'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
            'feature_idx': (1, 2, 3),
            'avg_score': 0.9605821888503829},
    }
    dict_compare_utility(d_actual=selector.subsets_,
                         d_desired=expected,
                         decimal=3)
def test_knn_cv3():
    """Forward SFS on iris with 4-fold CV; checks the recorded subsets_."""
    iris = load_iris()
    X, y = iris.data, iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(knn,
                   k_features=3,
                   forward=True,
                   floating=False,
                   scoring='accuracy',
                   cv=4,
                   skip_if_stuck=True,
                   print_progress=False)
    selector = selector.fit(X, y)
    selector.subsets_
    expected = {
        1: {'avg_score': 0.95299145299145294,
            'cv_scores': np.array([0.97435897, 0.94871795, 0.88888889, 1.0]),
            'feature_idx': (3,)},
        2: {'avg_score': 0.95993589743589736,
            'cv_scores': np.array([0.97435897, 0.94871795, 0.91666667, 1.0]),
            'feature_idx': (2, 3)},
        3: {'avg_score': 0.97275641025641035,
            'cv_scores': np.array([0.97435897, 1.0, 0.94444444, 0.97222222]),
            'feature_idx': (1, 2, 3)},
    }
    dict_compare_utility(d1=expected, d2=selector.subsets_)
def test_knn_scoring_metric():
    """SBFS k_score_ under 'accuracy' vs. 'precision' scoring."""
    iris = load_iris()
    X, y = iris.data, iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    acc_sel = SFS(knn, k_features=3, forward=False, floating=True,
                  scoring='accuracy', cv=4, skip_if_stuck=True,
                  print_progress=False)
    acc_sel = acc_sel.fit(X, y)
    assert round(acc_sel.k_score_, 4) == 0.9728
    prec_sel = SFS(knn, k_features=3, forward=False, floating=True,
                   scoring='precision', cv=4, skip_if_stuck=True,
                   print_progress=False)
    prec_sel = prec_sel.fit(X, y)
    assert round(prec_sel.k_score_, 4) == 0.9737
def test_knn_cv3():
    """Forward SFS on iris, cv=4/verbose=0 option set; checks subsets_."""
    iris = load_iris()
    X, y = iris.data, iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(knn,
                   k_features=3,
                   forward=True,
                   floating=False,
                   cv=4,
                   verbose=0)
    selector = selector.fit(X, y)
    selector.subsets_
    expected = {
        1: {'avg_score': 0.95299145299145294,
            'cv_scores': np.array([0.97435897, 0.94871795, 0.88888889, 1.0]),
            'feature_idx': (3,)},
        2: {'avg_score': 0.95993589743589736,
            'cv_scores': np.array([0.97435897, 0.94871795, 0.91666667, 1.0]),
            'feature_idx': (2, 3)},
        3: {'avg_score': 0.97275641025641035,
            'cv_scores': np.array([0.97435897, 1.0, 0.94444444, 0.97222222]),
            'feature_idx': (1, 2, 3)},
    }
    dict_compare_utility(d_actual=selector.subsets_, d_desired=expected)
def get_best_logisitc(y):
    """Grid-search LogisticRegression (C x solver) with SFS feature selection.

    For every (C, solver) pair, runs forward sequential selection scored by
    ROC AUC, cross-validates the best feature subset, and returns the
    (feature list, model) pair with the highest mean AUC.

    Note: the `y` parameter is immediately overwritten by the CSV's target
    column; it is kept only so the signature stays backward compatible.
    """
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from sklearn.cross_validation import StratifiedKFold
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.cross_validation import cross_val_score

    my_data = pd.read_csv('data/my_data_test.csv', encoding='utf-8')
    y = my_data.target
    my_data = my_data.drop('target', axis=1)

    # To have better CV
    skf = StratifiedKFold(y, n_folds=5, random_state=17, shuffle=False)

    C_params = [0.01, 1, 10, 50, 70, 100]
    solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag']

    my_result_list = []
    for C_param in C_params:
        for solver in solvers:
            # BUG FIX: was a Python 2 print statement, which is a
            # SyntaxError on Python 3; converted to the print() function.
            print("Looking for C : %s and solver : %s" % (C_param, solver))
            model = LogisticRegression(class_weight='balanced',
                                       random_state=17,
                                       solver=solver,
                                       C=C_param)
            sfs = SFS(model,
                      k_features=len(my_data.columns),
                      forward=True,
                      floating=False,
                      scoring='roc_auc',
                      print_progress=False,
                      cv=skf,
                      n_jobs=-1)
            sfs = sfs.fit(my_data.values, y.values)
            # rank all visited subsets by CV score; keep the best one
            result_sfs = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
            result_sfs.sort_values('avg_score', ascending=0, inplace=True)
            features_sfs = result_sfs.feature_idx.head(1).tolist()
            select_features_sfs = list(my_data.columns[features_sfs])
            scores = cross_val_score(model, my_data[select_features_sfs], y,
                                     cv=skf, scoring='roc_auc')
            my_result_list.append({'C': C_param,
                                   'solver': solver,
                                   'auc': scores.mean(),
                                   'std': scores.std(),
                                   'best_columns': select_features_sfs,
                                   'estimator': model})
    my_result = pd.DataFrame(my_result_list)
    my_result.sort_values('auc', ascending=0, inplace=True)
    best_features = my_result.best_columns.head(1).values[0]
    best_model = my_result.estimator.head(1).values[0]
    return best_features, best_model
def test_run_default():
    """SFS with all-default options selects the single best iris feature."""
    iris = load_iris()
    knn = KNeighborsClassifier()
    selector = SFS(estimator=knn, verbose=0)
    selector.fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (3,)
def test_fit_params():
    """fit_params (sample_weight) are forwarded to the estimator's fit()."""
    iris = load_iris()
    X, y = iris.data, iris.target
    weights = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    selector = SFS(estimator=forest, verbose=0)
    selector.fit(X, y, sample_weight=weights)
    assert selector.k_feature_idx_ == (3,)
def test_max_feature_subset_best():
    """k_features='best' keeps the highest-scoring subset on boston."""
    boston = load_boston()
    X, y = boston.data, boston.target
    selector = SFS(LinearRegression(),
                   k_features='best',
                   forward=True,
                   floating=False,
                   cv=10)
    selector = selector.fit(X, y)
    assert selector.k_feature_idx_ == (1, 3, 5, 7, 8, 9, 10, 11, 12)
def test_regression_sbfs():
    """Backward floating selection on the boston regression data."""
    boston = load_boston()
    X, y = boston.data, boston.target
    selector = SFS(LinearRegression(),
                   k_features=3,
                   forward=False,
                   floating=True,
                   scoring='neg_mean_squared_error',
                   cv=10,
                   verbose=0)
    selector = selector.fit(X, y)
    assert selector.k_feature_idx_ == (7, 10, 12), selector.k_feature_idx_
def test_knn_option_sfbs():
    """SFBS (backward, floating) on iris ends at features (1, 2, 3)."""
    iris = load_iris()
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(knn, k_features=3, forward=False, floating=True,
                   cv=4, verbose=0)
    selector = selector.fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (1, 2, 3)
def test_max_feature_subset_parsimonious():
    """k_features='parsimonious' favors the smaller near-best subset."""
    boston = load_boston()
    X, y = boston.data, boston.target
    selector = SFS(LinearRegression(),
                   k_features='parsimonious',
                   forward=True,
                   floating=False,
                   cv=10)
    selector = selector.fit(X, y)
    assert selector.k_feature_idx_ == (5, 10, 11, 12)
def test_regression_in_range():
    """k_features given as a (min, max) range on the boston data."""
    boston = load_boston()
    X, y = boston.data, boston.target
    selector = SFS(LinearRegression(),
                   k_features=(1, 13),
                   forward=True,
                   floating=False,
                   scoring='neg_mean_squared_error',
                   cv=10,
                   verbose=0)
    selector = selector.fit(X, y)
    assert len(selector.k_feature_idx_) == 9
    assert round(selector.k_score_, 4) == -31.1537
def test_knn_option_sbs_tuplerange_1():
    """Backward selection with a (1, 3) feature-count range."""
    iris = load_iris()
    knn = KNeighborsClassifier(n_neighbors=3)
    selector = SFS(knn, k_features=(1, 3), forward=False, floating=False,
                   cv=4, verbose=0)
    selector = selector.fit(iris.data, iris.target)
    assert round(selector.k_score_, 3) == 0.967, selector.k_score_
    assert selector.k_feature_idx_ == (0, 2, 3), selector.k_feature_idx_
def test_regression():
    """Forward selection of all 13 boston features; checks the MSE score."""
    boston = load_boston()
    X, y = boston.data, boston.target
    selector = SFS(LinearRegression(),
                   k_features=13,
                   forward=True,
                   floating=False,
                   scoring='mean_squared_error',
                   cv=10,
                   skip_if_stuck=True,
                   print_progress=False)
    selector = selector.fit(X, y)
    assert round(selector.k_score_, 4) == -34.7631
def test_knn_rbf_groupkfold():
    """GroupKFold CV where one fold can yield a NaN ROC-AUC score.

    Class 0 becomes group 'attribute_A' (label always 0); classes 1/2 are
    randomly split between groups 'attribute_B'/'attribute_C' with random
    binary labels, so a fold may contain a single class.
    """
    nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
    rng = np.random.RandomState(123)
    iris = load_iris()
    X = iris.data
    # knn = KNeighborsClassifier(n_neighbors=4)
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    bool_01 = [True if item == 0 else False for item in iris['target']]
    bool_02 = [True if (item == 1 or item == 2) else False
               for item in iris['target']]
    groups = []
    y_new = []
    for ind, _ in enumerate(bool_01):
        if bool_01[ind]:
            groups.append('attribute_A')
            y_new.append(0)
        if bool_02[ind]:
            throw = rng.rand()
            if throw < 0.5:
                groups.append('attribute_B')
            else:
                groups.append('attribute_C')
            throw2 = rng.rand()
            if throw2 < 0.5:
                y_new.append(0)
            else:
                y_new.append(1)
    # BUG FIX: the original used `item is 1`, an identity comparison with an
    # int literal (relies on CPython small-int caching, SyntaxWarning on
    # 3.8+); value equality is what is meant.
    y_new_bool = [True if item == 1 else False for item in y_new]
    cv_obj = GroupKFold(n_splits=3)
    cv_obj_list = list(cv_obj.split(X, np.array(y_new_bool), groups))
    sfs1 = SFS(forest,
               k_features=3,
               forward=True,
               floating=False,
               cv=cv_obj_list,
               scoring=nan_roc_auc_scorer,
               verbose=0)
    sfs1 = sfs1.fit(X, y_new)
    expect = {
        1: {'cv_scores': np.array([0.52, nan, 0.72]),
            'avg_score': 0.62,
            'feature_idx': (1,)},
        2: {'cv_scores': np.array([0.42, nan, 0.65]),
            'avg_score': 0.53,
            'feature_idx': (1, 2)},
        3: {'cv_scores': np.array([0.47, nan, 0.63]),
            'avg_score': 0.55,
            'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=1)
def test_predefinedholdoutsplit_in_sfs():
    """PredefinedHoldoutSplit yields exactly one CV score per subset."""
    holdout = PredefinedHoldoutSplit(valid_indices=[0, 1, 99])
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(knn,
                   k_features=3,
                   forward=True,
                   floating=False,
                   verbose=2,
                   scoring='accuracy',
                   cv=holdout)
    selector = selector.fit(X, y)  # module-level X, y fixtures
    metric_dict = selector.get_metric_dict()
    assert metric_dict[1]['cv_scores'].shape[0] == 1
def test_randomholdoutsplit_in_sfs():
    """RandomHoldoutSplit yields exactly one CV score per subset."""
    holdout = RandomHoldoutSplit(valid_size=0.3, random_seed=123)
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(knn,
                   k_features=3,
                   forward=True,
                   floating=False,
                   verbose=2,
                   scoring='accuracy',
                   cv=holdout)
    selector = selector.fit(X, y)  # module-level X, y fixtures
    metric_dict = selector.get_metric_dict()
    assert metric_dict[1]['cv_scores'].shape[0] == 1
def test_regression():
    """Forward selection of all 13 boston features; count and score checks."""
    boston = load_boston()
    X, y = boston.data, boston.target
    selector = SFS(LinearRegression(),
                   k_features=13,
                   forward=True,
                   floating=False,
                   scoring='mean_squared_error',
                   cv=10,
                   skip_if_stuck=True,
                   print_progress=False)
    selector = selector.fit(X, y)
    assert len(selector.k_feature_idx_) == 13
    assert round(selector.k_score_, 4) == -34.7631
def test_knn_option_sffs():
    """SFFS (forward, floating) on iris selects features (1, 2, 3)."""
    iris = load_iris()
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(knn, k_features=3, forward=True, floating=True,
                   scoring='accuracy', cv=4, skip_if_stuck=True, verbose=0)
    selector = selector.fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (1, 2, 3)
def test_max_feature_subset_size_in_tuple_range():
    """With a (1, 5) range, backward floating selection keeps 5 features."""
    boston = load_boston()
    X, y = boston.data, boston.target
    selector = SFS(LinearRegression(),
                   k_features=(1, 5),
                   forward=False,
                   floating=True,
                   scoring='neg_mean_squared_error',
                   cv=10)
    selector = selector.fit(X, y)
    assert len(selector.k_feature_idx_) == 5
def feature_selection(data, label, num_channel):
    """Select `num_channel` of 32 channels via forward SFS and return the
    indices of the channels to REMOVE (the complement of the selection).

    Parameters
    ----------
    data, label : training features and targets, passed straight to SFS.fit
    num_channel : number of channels to keep

    Returns
    -------
    list[int]
        Channel indices NOT selected, for the caller to drop.
    """
    # CLEANUP: removed leftover debug print and a redundant list init;
    # set(range(32)) replaces set(list(range(32))).
    all_channels = set(range(32))
    sfs = SFS(LinearRegression(),
              k_features=num_channel,
              forward=True,
              floating=False,
              scoring='r2',
              cv=0)
    sfs.fit(data, label)
    # k_feature_names_ holds stringified column indices; map back to ints.
    selected = {int(name) for name in sfs.k_feature_names_}
    return list(all_channels.difference(selected))
def test_regression_in_range():
    """(1, 13) range with skip_if_stuck; expects 9 features, fixed score."""
    boston = load_boston()
    X, y = boston.data, boston.target
    selector = SFS(LinearRegression(),
                   k_features=(1, 13),
                   forward=True,
                   floating=False,
                   scoring='neg_mean_squared_error',
                   cv=10,
                   skip_if_stuck=True,
                   verbose=0)
    selector = selector.fit(X, y)
    assert len(selector.k_feature_idx_) == 9
    assert round(selector.k_score_, 4) == -31.1537
def test_knn_option_sfs():
    """Plain forward SFS on iris selects features (1, 2, 3)."""
    iris = load_iris()
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(knn, k_features=3, forward=True, floating=False,
                   scoring='accuracy', cv=4, skip_if_stuck=True,
                   print_progress=False)
    selector = selector.fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (1, 2, 3)
def test_knn_option_sfbs():
    """SFBS with explicit scoring/skip_if_stuck/print_progress options."""
    iris = load_iris()
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(knn, k_features=3, forward=False, floating=True,
                   scoring='accuracy', cv=4, skip_if_stuck=True,
                   print_progress=False)
    selector = selector.fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (1, 2, 3)
def test_regression():
    """Forward selection scored by the MEAN_SQUARED_ERROR scorer constant."""
    boston = load_boston()
    X, y = boston.data, boston.target
    selector = SFS(LinearRegression(),
                   k_features=13,
                   forward=True,
                   floating=False,
                   scoring=MEAN_SQUARED_ERROR,
                   cv=10,
                   skip_if_stuck=True,
                   verbose=0)
    selector = selector.fit(X, y)
    assert len(selector.k_feature_idx_) == 13
    assert round(selector.k_score_, 4) == -34.7631
def main():
    """Evaluate KNN on feature subsets chosen by sequential floating
    BACKWARD selection (forward=False, floating=True) for several sizes.

    For n == 2 the two selected features are scatter-plotted for both the
    train and test splits.
    """
    x_train, y_train, x_test, y_test = get_data()
    for n in [2, 3, 5, 10, 16]:
        sfs = SFS(KNeighborsClassifier(n_neighbors=7),
                  k_features=n,
                  forward=False,
                  floating=True,
                  scoring='accuracy',
                  cv=0)
        sfs = sfs.fit(x_train, y_train)
        # BUG FIX: the banner said "Forward" although forward=False runs
        # backward floating selection (SFBS).
        print('\nSequential Floating Backward Selection: ', n)
        feat_cols = list(sfs.k_feature_idx_)
        print(feat_cols)
        # retrain a fresh KNN on just the selected columns
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(x_train[:, feat_cols], y_train)
        y_train_pred = knn.predict(x_train[:, feat_cols])
        print('Training accuracy on selected features: %.3f'
              % acc(y_train, y_train_pred))
        y_test_pred = knn.predict(x_test[:, feat_cols])
        print('Testing accuracy on selected features: %.3f'
              % acc(y_test, y_test_pred))
        print(confusion_matrix(y_test, y_test_pred))
        print(classification_report(y_test, y_test_pred))
        if n == 2:
            fig, axs = plt.subplots(2)
            fig.suptitle("SFS(KNN) Scatter Plot", fontsize='small')
            axs[0].scatter(x_train[:, feat_cols[0]], x_train[:, feat_cols[1]],
                           marker='o', c=y_train, s=25, edgecolor='k')
            axs[1].scatter(x_test[:, feat_cols[0]], x_test[:, feat_cols[1]],
                           marker='o', c=y_test, s=25, edgecolor='k')
            plt.show()
def run_experiment(X, y, clf, protected_groups, unfairness_metric, unfairness_weight):
    """Repeated 4-fold evaluation of `clf` with fairness-aware feature selection.

    For each of ITERATIONS repetitions (KFold seeded by the iteration index):
      * builds a combined scorer (ACCURACY_METRIC minus weighted unfairness
        w.r.t. `protected_groups`),
      * wraps `clf` in a standardize -> SFS('best') -> model pipeline,
      * cross-validates, recording the mean unfairness and the mean
        ACCURACY_METRIC score (stored under the 'auc' key),
      * accumulates the proportion of fold-estimators that kept each feature.

    Returns (unfairness_means, auc_means, selected_feature_props) where the
    last is an ITERATIONS x n_features array of selection proportions.
    """
    metric = unfairness_metrics.UnfairnessMetric(protected_groups, unfairness_metric)
    unfairness_scorer = metrics.make_scorer(metric)
    unfairness_means = []
    auc_means = []
    selected_feature_props = np.zeros([ITERATIONS, X.shape[1]])
    for i in tqdm(range(ITERATIONS), desc=' Training ' + clf.__class__.__name__):
        # fresh shuffled split per iteration, seeded for reproducibility
        xval = model_selection.KFold(4, shuffle=True, random_state=i)
        # Make a metric combining accuracy and subtracting unfairness w.r.t. the protected groups
        metric = unfairness_metrics.CombinedMetric(ACCURACY_METRIC, protected_groups, unfairness_metric, unfairness_weight)
        combined_scorer = metrics.make_scorer(metric)
        # 'best' -> SFS keeps the subset with the highest combined score
        sfs = SequentialFeatureSelector(clf, 'best', verbose=0, cv=xval, scoring=combined_scorer, n_jobs=2)
        pipe = pipeline.Pipeline([
            ('standardize', preprocessing.StandardScaler()),
            ('feature_selection', sfs),
            ('model', clf),
        ])
        result = model_selection.cross_validate(pipe, X, y, verbose=0, cv=xval, scoring={
            'unfairness': unfairness_scorer,
            'auc': metrics.make_scorer(ACCURACY_METRIC),
        }, return_estimator=True)
        unfairness_means.append(result['test_unfairness'].mean())
        auc_means.append(result['test_auc'].mean())
        # fraction of this iteration's fold-estimators that kept each feature
        for estimator in result['estimator']:
            for feature_i in estimator.named_steps['feature_selection'].k_feature_idx_:
                selected_feature_props[i][feature_i] += 1 / len(result['estimator'])
    return unfairness_means, auc_means, selected_feature_props
def test_pandas():
    """fixed_features given as DataFrame column names are always kept.

    BUG FIX: the original column list named 'petal width' twice; the third
    iris column is 'petal length'. The assertions are unaffected (they check
    the fixed columns 0 and 1), but duplicate DataFrame column labels are
    ambiguous on lookup.
    """
    X_df = pd.DataFrame(X_iris, columns=['sepal length', 'sepal width',
                                         'petal length', 'petal width'])
    knn = KNeighborsClassifier()
    sfs = SFS(estimator=knn,
              k_features=3,
              forward=True,
              floating=False,
              fixed_features=('sepal length', 'sepal width'),
              verbose=0)
    sfs.fit(X_df, y_iris)
    print(sfs.subsets_)
    # columns 0 and 1 are fixed, so every visited subset must contain them
    for k in sfs.subsets_:
        assert 0 in sfs.subsets_[k]['feature_idx']
        assert 1 in sfs.subsets_[k]['feature_idx']
def test_knn_option_sfbs_tuplerange_2():
    """Backward floating selection with a (1, 4) feature-count range."""
    iris = load_iris()
    knn = KNeighborsClassifier(n_neighbors=3)
    selector = SFS(knn, k_features=(1, 4), forward=False, floating=True,
                   scoring='accuracy', cv=4, skip_if_stuck=True, verbose=0)
    selector = selector.fit(iris.data, iris.target)
    assert round(selector.k_score_, 3) == 0.966, selector.k_score_
    assert selector.k_feature_idx_ == (1, 2, 3), selector.k_feature_idx_
def test_clone_params_pass():
    """clone_estimator=True works with a cloneable SoftmaxRegression."""
    iris = load_iris()
    lr = SoftmaxRegression(random_seed=1)
    selector = SFS(lr, k_features=2, forward=True, floating=False,
                   scoring='accuracy', cv=0, clone_estimator=True,
                   verbose=0, n_jobs=1)
    selector = selector.fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (1, 3)
def test_string_scoring_clf():
    """Default, string, and callable scorers must all give the same k_score_."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn, k_features=3, cv=0)
    sfs1 = sfs1.fit(X, y)
    sfs2 = SFS(knn, k_features=3, scoring='accuracy', cv=0)
    sfs2 = sfs2.fit(X, y)
    sfs3 = SFS(knn, k_features=3, scoring=make_scorer(accuracy_score), cv=0)
    # BUG FIX: the original called sfs2.fit here, so the make_scorer variant
    # was never fitted and the third assertion compared sfs2 with itself.
    sfs3 = sfs3.fit(X, y)
    assert sfs1.k_score_ == sfs2.k_score_
    assert sfs1.k_score_ == sfs3.k_score_
def sequential_feature_selection(data_set, y_values, want_graph):
    """Forward-select all 13 features with linear regression; optionally
    plot the score curve with a std-err band. Returns the fitted selector.
    """
    selector = SFS(LinearRegression(),
                   k_features=13,
                   forward=True,
                   floating=False,
                   scoring='neg_mean_squared_error',
                   cv=10)
    selector = selector.fit(data_set, y_values)
    if want_graph:
        fig = plot_sfs(selector.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection (w. StdErr)')
        plt.grid()
        plt.show()
    return selector
def test_knn_option_sbs_tuplerange_1():
    """Backward selection over a (1, 3) range with legacy option names."""
    iris = load_iris()
    knn = KNeighborsClassifier(n_neighbors=3)
    selector = SFS(knn, k_features=(1, 3), forward=False, floating=False,
                   scoring='accuracy', cv=4, skip_if_stuck=True,
                   print_progress=False)
    selector = selector.fit(iris.data, iris.target)
    assert round(selector.k_score_, 3) == 0.967, selector.k_score_
    assert selector.k_feature_idx_ == (0, 2, 3), selector.k_feature_idx_
def SFS_test(input, how_many_attrs, cv_scores):
    """Forward-select `how_many_attrs` features with a 5-NN classifier and
    return the reduced data with the target column re-appended.

    The last column of `input` is treated as the target. The `cv_scores`
    parameter is unused; it is kept for signature compatibility.
    """
    y = np.array(input[:, -1])
    x = np.array(input[:, :-1])
    sfs = SequentialFeatureSelector(
        KNeighborsClassifier(n_neighbors=5, metric="euclidean"),
        k_features=how_many_attrs,
        forward=True,
        floating=False,
        verbose=0,
        scoring='accuracy',
        n_jobs=-1,
        cv=4)
    sfs = sfs.fit(x, y)
    # print(sfs.k_feature_idx_)
    # GENERALIZED: reshape(-1, 1) instead of the hard-coded 475-row shape,
    # so inputs of any length work (identical result for 475 rows).
    target = np.array(input[:, -1]).reshape(-1, 1)
    return np.hstack((input[:, sfs.k_feature_idx_], target))
def filter_with_sfs(train_X, valid_X, test_X, train_Y, i):
    """Keep the i best features (forward SFS over a random forest) and drop
    the rest from the train/valid/test frames."""
    all_features = {col for col in train_X.head(0)}
    selector = SequentialFeatureSelector(
        RandomForestClassifier(n_estimators=30, random_state=0),
        k_features=i,
        forward=True,
        verbose=0,
        scoring='accuracy',
        cv=4)
    selector.fit(train_X, train_Y)
    kept = set(selector.k_feature_names_)
    to_drop = list(all_features - kept)
    return (train_X.drop(to_drop, axis=1),
            valid_X.drop(to_drop, axis=1),
            test_X.drop(to_drop, axis=1))
def select_features(self, model, X_train, y_train, k_features=(1, 30), scorer='r2', cv=0):
    """Run forward SFS with `model` over X_train/y_train and print the
    selected feature indices; the fitted selector is discarded, nothing
    is returned.
    """
    sfs = SFS(model, k_features=k_features, forward=True, floating=False, scoring=scorer, cv=cv, verbose=2)
    sfs.fit(np.array(X_train), np.array(y_train))
    print(sfs.k_feature_idx_)

# NOTE(review): the triple-quote below looks stray/unterminated within this
# chunk — it probably opens a commented-out region that continues past the
# visible text; confirm against the full file before removing it.
"""
def test_clone_params_pass():
    """clone_estimator=False with a SoftmaxRegression estimator."""
    iris = load_iris()
    lr = SoftmaxRegression(random_seed=1)
    selector = SFS(lr, k_features=3, forward=True, floating=False,
                   scoring='accuracy', cv=0, skip_if_stuck=True,
                   clone_estimator=False, print_progress=False, n_jobs=1)
    selector = selector.fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (0, 1, 2)
def sub_window_creation(self, images, kernels):
    """Build Gabor-filtered sub-windows, embed them with Laplacian Eigenmaps,
    and run SFFS feature selection with a 5-NN classifier.

    images  -- sequence of images; labels are derived as int(k / 4), i.e.
               every 4 consecutive images appear to share one class
               (assumption from the code — confirm with the dataset layout).
    kernels -- Gabor kernels forwarded to Preprocessing.process.
    """
    gb_all_sw = []
    label = []
    # Slide a 50x50 crop over each image in steps of 11 px; every crop is
    # shrunk to 12x12 before Gabor filtering.
    for i in range(0, 100, 11):
        for j in range(0, 50, 11):
            for k in range(len(images)):
                image = images[k]
                sw_image = image[i:i + 50, j:j + 50]
                sw_image = cv2.resize(sw_image, dsize=(12, 12), interpolation=cv2.INTER_NEAREST)
                # print('sw size', sw_image.shape)
                gabored_image = Preprocessing.process(
                    self, sw_image, kernels)
                # print('gab size', gabored_image.shape)
                # model = SpectralEmbedding(n_components=100, n_neighbors=10)
                # reduced_sw = model.fit_transform(gabored_image.reshape(-1, 1))
                # print('gab size', gabored_image.reshape(1, -1).shape)
                # gb_all_sw.append(gabored_image)
                gb_all_sw.append(gabored_image)
                # one label per sub-window; int(k / 4) groups images in fours
                label.append(int(k / 4))
                # print('red size', reduced_sw.reshape(-1, 1).shape)
                # plt.imshow(image[i:i+50, j:j+50], cmap='gray')
                # plt.show()
                # plt.imshow(gabored_image, cmap='gray')
                # plt.show()
    print(len(gb_all_sw))
    print(len(gb_all_sw[0]))
    # LEM dimension reduction
    model = SpectralEmbedding(n_components=100, n_neighbors=10)
    # reduced_sw = model.fit_transform(gb_all_sw)
    reduced_sw = model.fit_transform(gb_all_sw)
    knn = KNeighborsClassifier(n_neighbors=5)
    sffs = SFS(knn, k_features=5, forward=True, floating=True, scoring='accuracy', cv=4, n_jobs=-1)
    sffs = sffs.fit(reduced_sw, label)
    # NOTE(review): 'k=' below prints the leftover loop variable i (last
    # window offset), not k_features=5 — probably unintended; confirm.
    print('\nSequential Forward Floating Selection (k=', i, '):')
    print(sffs.k_feature_idx_)
    print('CV Score:')
    print(sffs.k_score_)
def test_check_pandas_dataframe_fit_backward():
    """Backward SFS on a DataFrame: feature_names come from the columns,
    and an interrupted fit still exposes usable partial results."""
    for floating in [True, False]:
        iris = load_iris()
        X = iris.data
        y = iris.target
        lr = SoftmaxRegression(random_seed=1)
        sfs1 = SFS(lr, k_features=2, forward=False, floating=floating, scoring='accuracy', cv=0, verbose=0, n_jobs=1)
        df = pd.DataFrame(
            X, columns=['sepal len', 'sepal width', 'petal len', 'petal width'])
        # ndarray input: names fall back to stringified column indices
        sfs1 = sfs1.fit(X, y)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('1', '2')
        assert sfs1.subsets_[2]['feature_names'] == ('1', '2')
        # DataFrame input: names come from the column labels
        sfs1 = sfs1.fit(df, y)
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len', 'sepal width', 'petal len')
        assert sfs1.subsets_[2]['feature_names'] == ('sepal width', 'petal len')
        assert sfs1.subsets_[3]['feature_idx'] == (0, 1, 2)
        assert sfs1.subsets_[2]['feature_idx'] == (1, 2)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('sepal width', 'petal len')
        # simulated KeyboardInterrupt: partial subsets_ remain accessible,
        # and the best-so-far 3-feature subset becomes the final result
        sfs1._TESTING_INTERRUPT_MODE = True
        out = sfs1.fit(df, y)
        assert len(out.subsets_.keys()) > 0
        assert sfs1.interrupted_
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len', 'sepal width', 'petal len')
        assert sfs1.k_feature_idx_ == (0, 1, 2)
        assert sfs1.k_feature_names_ == ('sepal len', 'sepal width', 'petal len')
def sff_selection(k_features, pipeline, x, y, fwd=True, flt=True):
    '''
    Selects a subset of available features

    Input: k_features - number of features to select
           pipeline - predictor pipeline
           x, y - features and labels (pandas objects)
           fwd,flt - boolean parameters for SFFS algorithm, see mlxtend docs
    Output: tuple of accuracy score and list of k selected fatures

    The mlxtend SFS function implements four related feature selection
    algorithms; if the default parameters (fwd=True, flt=True) are not
    changed, this is Sequential Floating Forward Selection (SFFS)

    SFFS has been identified as a good way of performing dimension reduction
    through feature selection on triaxial accelerometer data: Gupta and Tim
    Dallas (2014) "Feature Selection and Activity Recognition System Using
    a Single Triaxial Accelerometer" IEEE Trans. Bomed. Eng., 61(6)
    '''
    # hyperparameters
    sffs_scoring = 'accuracy'
    sffs_cv_folds = 10

    # Feature selection
    sffs = SFS(pipeline,
               k_features=k_features,
               forward=fwd,
               floating=flt,
               scoring=sffs_scoring,
               cv=sffs_cv_folds,
               n_jobs=-1)
    # BUG FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and
    # removed in 1.0; .values returns the same ndarray on all versions.
    sffs = sffs.fit(x.values, y.values)

    # list of the k best features
    feat_names = list(x.columns.values)
    feat_list = [feat_names[i] for i in sffs.k_feature_idx_]

    # return the prediction score and feature name list
    return sffs.k_score_, feat_list
def run_decision_tree(self):
    """Fit SFFS over a depth-limited decision tree and store the resulting
    score and selected feature names on the instance."""
    regressor = DecisionTreeRegressor(random_state=7, max_depth=self.max_depth)
    selector = SFS(regressor,
                   k_features=self.k_features,
                   forward=True,
                   floating=True,
                   scoring=self.scoring,
                   n_jobs=-1,
                   cv=4)
    # candidate columns: everything except the identifiers and the target
    candidate_cols = list(self.train.columns.drop(['date', 'ticker', 'return']))
    selector = selector.fit(self.train[candidate_cols], self.train['return'])
    self.score = selector.k_score_
    self.features = list(selector.k_feature_names_)
def FeatureSelection(xSet, ySet, nFeatures=10):
    """SFFS with an RBF-kernel SVM; prints the chosen feature indices and
    CV score, then returns the fitted selector."""
    classifier = svm.SVC(kernel='rbf', C=1)
    selector = SFS(classifier,
                   k_features=nFeatures,
                   forward=True,
                   floating=True,
                   verbose=2,
                   scoring='accuracy',
                   cv=10,
                   n_jobs=-1)
    selector = selector.fit(xSet, ySet)
    print('\nSequential Forward Floating Selection:')
    print(selector.k_feature_idx_)
    print('CV Score:')
    print(selector.k_score_)
    return selector
def get_core_features(self, X, y) -> List[str]:
    """Return core feature names via SFS or RFE, depending on self.method.

    Raises ValueError for any other method string.
    """
    if self.method == "SFS":
        selector = SFS(
            LogisticRegression(),
            k_features=10,
            forward=True,
            cv=0,
            scoring="roc_auc",
        )
        fitted = selector.fit(X.values, y.values)
        return [X.columns[idx] for idx in fitted.k_feature_idx_]
    if self.method == "RFE":
        eliminator = RFE(self.model, self.n_features)
        fitted = eliminator.fit(X, y)
        return [name for keep, name in zip(fitted.support_, X.columns) if keep]
    raise ValueError("Unknown method for core feature selection")
def make_features_selection(X_train, y_train, is_forward):
    """Floating forward/backward selection with a LinearSVC whose C comes
    from argv[1]; results go to make_plot/make_debug_info."""
    curr_C = float(sys.argv[1])
    rkf = RepeatedKFold(n_splits=Q, n_repeats=T)
    if is_forward:
        features_number = 90
    else:
        features_number = len(X_train.columns) - 12
    classifier = LinearSVC(penalty='l2', dual=False, C=curr_C)
    selector = SFS(estimator=classifier,
                   k_features=features_number,
                   forward=is_forward,
                   floating=True,
                   n_jobs=-1,
                   verbose=2,
                   scoring=SCORING,
                   cv=rkf)
    selector = selector.fit(X_train.values, y_train)
    make_plot(selector, curr_C, is_forward)
    make_debug_info(selector, curr_C, is_forward)
def select_r2(df_in, ss_label, f_n, eps):
    """Keep f_n features via forward SFS over a random forest.

    Returns (feature matrix, comma-joined feature names, feature count).
    If df_in already has f_n or fewer columns, it is returned unchanged.
    """
    dfx = df_in.copy()
    if len(dfx.columns) <= f_n:
        # nothing to select; pass the frame through as-is
        f_list = dfx.columns.tolist()
        return dfx.values, ','.join(f_list), len(f_list)
    selector = SFS(RandomForestClassifier(n_estimators=eps, random_state=1),
                   k_features=f_n,
                   forward=True,
                   floating=False,
                   scoring='accuracy',
                   cv=4,
                   n_jobs=3)
    selector.fit(dfx.values, ss_label.values)
    chosen = selector.k_feature_idx_
    x_sfs = selector.transform(dfx.values)
    chosen_names = dfx.columns[[idx for idx in chosen]]
    return x_sfs, ','.join(chosen_names), len(chosen_names)
def perform_sfs(curr_classifier, X_train, X_test, y_train, y_test):
    """Forward-select up to 100 features, then attach held-out test metrics
    for every visited subset. Returns the metric DataFrame."""
    selector = SFS(curr_classifier,
                   k_features=100,
                   verbose=0,
                   forward=True,
                   floating=False,
                   scoring='accuracy',
                   cv=5,
                   n_jobs=8)
    selector = selector.fit(X_train, y_train)
    df = pd.DataFrame.from_dict(selector.get_metric_dict(), orient='index')
    metric_cols = ['accuracy', 'precision', 'recall', 'f1_score',
                   'confusion_matrix']
    # evaluate each subset on the held-out split and expand into columns
    df[metric_cols] = df['feature_idx'].apply(
        lambda idx: get_test_score(X_train, X_test, y_train, y_test, idx,
                                   curr_classifier)).apply(pd.Series)
    return df
def select_features_wrapper(X, y, forward=True, k_features=20):
    """Greedy wrapper feature selection (no CV) with an SGD classifier;
    prints and returns the fitted selector."""
    # alternative estimators kept for reference:
    # svc = SVC(gamma='auto')
    # linearSVC = LinearSVC(random_state=0, tol=1e-5, class_weight='balanced')
    random_forest_clssifier = RandomForestClassifier(max_depth=7, random_state=0)
    sgd = SGDClassifier(max_iter=1000, tol=1e-3)
    # knn = KNNeighborsClassifier(n_neighbors=3)
    selector = SequentialFeatureSelector(sgd,
                                         k_features=k_features,
                                         forward=forward,
                                         floating=False,
                                         verbose=5,
                                         cv=0,
                                         n_jobs=-1)
    selector.fit(X, y.values.ravel())
    print(selector.k_feature_names_)
    return selector
def test_check_pandas_dataframe_fit_backward():
    """Backward SFS on a DataFrame: feature_names mirror the columns, and an
    interrupted fit still exposes usable partial results."""
    for floating in [True, False]:
        iris = load_iris()
        X = iris.data
        y = iris.target
        lr = SoftmaxRegression(random_seed=1)
        sfs1 = SFS(lr, k_features=2, forward=False, floating=floating, scoring='accuracy', cv=0, verbose=0, n_jobs=1)
        df = pd.DataFrame(X, columns=['sepal len', 'sepal width', 'petal len', 'petal width'])
        # ndarray input: names fall back to stringified column indices
        sfs1 = sfs1.fit(X, y)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('1', '2')
        assert sfs1.subsets_[2]['feature_names'] == ('1', '2')
        # DataFrame input: names come from the column labels
        sfs1 = sfs1.fit(df, y)
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len', 'sepal width', 'petal len')
        assert sfs1.subsets_[2]['feature_names'] == ('sepal width', 'petal len')
        assert sfs1.subsets_[3]['feature_idx'] == (0, 1, 2)
        assert sfs1.subsets_[2]['feature_idx'] == (1, 2)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('sepal width', 'petal len')
        # simulated KeyboardInterrupt: partial subsets_ remain accessible,
        # and the best-so-far 3-feature subset becomes the final result
        sfs1._TESTING_INTERRUPT_MODE = True
        out = sfs1.fit(df, y)
        assert len(out.subsets_.keys()) > 0
        assert sfs1.interrupted_
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len', 'sepal width', 'petal len')
        assert sfs1.k_feature_idx_ == (0, 1, 2)
        assert sfs1.k_feature_names_ == ('sepal len', 'sepal width', 'petal len')
def test_check_pandas_dataframe_transform():
    """transform() on a DataFrame returns only the two selected columns."""
    iris = load_iris()
    X, y = iris.data, iris.target
    lr = SoftmaxRegression(random_seed=1)
    selector = SFS(lr, k_features=2, forward=True, floating=False,
                   scoring='accuracy', cv=0, verbose=0, n_jobs=1)
    df = pd.DataFrame(X, columns=['sepal length', 'sepal width',
                                  'petal length', 'petal width'])
    selector = selector.fit(df, y)
    assert selector.k_feature_idx_ == (1, 3)
    assert (150, 2) == selector.transform(df).shape
def sfs_selection(X, y, n_features, forward):
    """
    Performs the Sequential Forward/Backward Selection method and selects
    the top ranking features

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    n_features -- n best ranked features
    forward -- True for forward selection, False for backward elimination
    """
    if verbose:
        # BUG FIX: was a Python 2 print statement (SyntaxError on Python 3);
        # converted to the print() function.
        print('\nPerforming Feature Selection based on the Sequential Feature Selection method ...')
    clf = RandomForestClassifierWithCoef(n_estimators=5, n_jobs=-1)
    sfs = SFS(clf, k_features=n_features, forward=forward,
              scoring='accuracy', cv=0, n_jobs=-1, print_progress=True)
    sfs = sfs.fit(X, y)
    feature_indexes = sfs.k_feature_idx_
    # return selected features and original index features
    return X[:, feature_indexes[0:n_features]], feature_indexes[0:n_features]
def test_regression():
    """Final score differs slightly across sklearn versions; branch on it."""
    boston = load_boston()
    X, y = boston.data, boston.target
    selector = SFS(LinearRegression(),
                   k_features=13,
                   forward=True,
                   floating=False,
                   scoring='neg_mean_squared_error',
                   cv=10,
                   verbose=0)
    selector = selector.fit(X, y)
    assert len(selector.k_feature_idx_) == 13
    if Version(sklearn_version) < '0.20':
        assert round(selector.k_score_, 4) == -34.7631, \
            round(selector.k_score_, 4)
    else:
        assert round(selector.k_score_, 4) == -34.7053, \
            round(selector.k_score_, 4)
def test_custom_feature_names():
    """custom_feature_names given to fit() show up in names and subsets_."""
    iris = load_iris()
    lr = SoftmaxRegression(random_seed=1)
    selector = SFS(lr, k_features=2, forward=True, floating=False,
                   scoring='accuracy', cv=0, verbose=0, n_jobs=1)
    names = ('sepal length', 'sepal width', 'petal length', 'petal width')
    selector = selector.fit(iris.data, iris.target,
                            custom_feature_names=names)
    assert selector.k_feature_idx_ == (1, 3)
    assert selector.k_feature_names_ == ('sepal width', 'petal width')
    assert selector.subsets_[2]['feature_names'] == ('sepal width',
                                                     'petal width')
def test_knn_scoring_metric():
    """k_score_ under explicit 'accuracy', default, and 'f1_macro' scoring."""
    iris = load_iris()
    X, y = iris.data, iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    with_accuracy = SFS(knn, k_features=3, forward=False, floating=True,
                        scoring='accuracy', cv=4, skip_if_stuck=True,
                        verbose=0)
    with_accuracy = with_accuracy.fit(X, y)
    assert round(with_accuracy.k_score_, 4) == 0.9728
    # default scoring for a classifier should match explicit 'accuracy'
    with_default = SFS(knn, k_features=3, forward=False, floating=True,
                       cv=4, skip_if_stuck=True, verbose=0)
    with_default = with_default.fit(X, y)
    assert round(with_default.k_score_, 4) == 0.9728
    with_f1 = SFS(knn, k_features=3, forward=False, floating=True,
                  scoring='f1_macro', cv=4, skip_if_stuck=True)
    with_f1 = with_f1.fit(X, y)
    assert round(with_f1.k_score_, 4) == 0.9727, with_f1.k_score_
def test_keyboard_interrupt():
    """A simulated KeyboardInterrupt leaves partial subsets_ and sets the
    interrupted_ flag."""
    iris = load_iris()
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(
        knn,
        k_features=3,
        forward=True,
        floating=False,
        cv=3,
        clone_estimator=False,
        verbose=5,
        n_jobs=1
    )
    selector._TESTING_INTERRUPT_MODE = True
    out = selector.fit(iris.data, iris.target)
    assert len(out.subsets_.keys()) > 0
    assert selector.interrupted_
def test_knn_wo_cv():
    """cv=0 trains/scores on the full data; one score per subset."""
    iris = load_iris()
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(knn, k_features=3, forward=True, floating=False,
                   cv=0, verbose=0)
    selector = selector.fit(iris.data, iris.target)
    expected = {
        1: {'avg_score': 0.95999999999999996,
            'cv_scores': np.array([0.96]),
            'feature_idx': (3,)},
        2: {'avg_score': 0.97333333333333338,
            'cv_scores': np.array([0.97333333]),
            'feature_idx': (2, 3)},
        3: {'avg_score': 0.97333333333333338,
            'cv_scores': np.array([0.97333333]),
            'feature_idx': (1, 2, 3)},
    }
    dict_compare_utility(d_actual=selector.subsets_, d_desired=expected)
def test_knn_wo_cv():
    """cv=0 with legacy option names; one score per subset."""
    iris = load_iris()
    knn = KNeighborsClassifier(n_neighbors=4)
    selector = SFS(knn, k_features=3, forward=True, floating=False,
                   scoring='accuracy', cv=0, skip_if_stuck=True,
                   print_progress=False)
    selector = selector.fit(iris.data, iris.target)
    expected = {
        1: {'avg_score': 0.95999999999999996,
            'cv_scores': np.array([0.96]),
            'feature_idx': (3,)},
        2: {'avg_score': 0.97333333333333338,
            'cv_scores': np.array([0.97333333]),
            'feature_idx': (2, 3)},
        3: {'avg_score': 0.97333333333333338,
            'cv_scores': np.array([0.97333333]),
            'feature_idx': (1, 2, 3)},
    }
    dict_compare_utility(d1=expected, d2=selector.subsets_)