def test_selects_all():
    """Fit SFS with ``k_features=13`` on the wine data and expect 13 indices."""
    from sklearn.neighbors import KNeighborsClassifier
    from mlxtend.data import wine_data

    features, labels = wine_data()
    selector = SFS(
        KNeighborsClassifier(n_neighbors=4),
        k_features=13,
        scoring='accuracy',
        cv=3,
        print_progress=False,
    )
    selector.fit(features, labels)

    n_selected = len(selector.indices_)
    assert n_selected == 13
def test_Iris():
    """Select two iris features with a 4-NN classifier and check the result.

    The selected indices must be ``(2, 3)`` and the score must round to 0.97.
    """
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.datasets import load_iris

    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    selector = SFS(
        KNeighborsClassifier(n_neighbors=4),
        k_features=2,
        scoring='accuracy',
        cv=5,
        print_progress=False,
    )
    selector.fit(features, labels)

    assert selector.indices_ == (2, 3)
    assert round(selector.k_score_, 2) == 0.97
def test_regression():
    """Forward selection of 13 features with linear regression on Boston data.

    Checks the number of selected features and the score rounded to four
    decimals.
    """
    dataset = load_boston()
    features, target = dataset.data, dataset.target

    options = dict(
        k_features=13,
        forward=True,
        floating=False,
        scoring='mean_squared_error',
        cv=10,
        skip_if_stuck=True,
        print_progress=False,
    )
    selector = SFS(LinearRegression(), **options)
    selector = selector.fit(features, target)

    assert len(selector.k_feature_idx_) == 13
    assert round(selector.k_score_, 4) == -34.7631
def test_knn_scoring_metric():
    """Compare scores for explicit, default and ``f1_macro`` scoring on iris.

    The same 4-NN estimator instance is handed to three selectors configured
    with ``forward=False, floating=True``; each fitted score is checked after
    rounding to four decimals.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target
    classifier = KNeighborsClassifier(n_neighbors=4)

    # Explicit accuracy scoring.
    with_accuracy = SFS(
        classifier,
        k_features=3,
        forward=False,
        floating=True,
        scoring='accuracy',
        cv=4,
        skip_if_stuck=True,
        verbose=0,
    )
    with_accuracy = with_accuracy.fit(features, labels)
    assert round(with_accuracy.k_score_, 4) == 0.9728

    # No scoring argument given.
    with_default = SFS(
        classifier,
        k_features=3,
        forward=False,
        floating=True,
        cv=4,
        skip_if_stuck=True,
        verbose=0,
    )
    with_default = with_default.fit(features, labels)
    assert round(with_default.k_score_, 4) == 0.9728

    # Macro-averaged F1 scoring.
    with_f1 = SFS(
        classifier,
        k_features=3,
        forward=False,
        floating=True,
        scoring='f1_macro',
        cv=4,
        skip_if_stuck=True,
    )
    with_f1 = with_f1.fit(features, labels)
    assert round(with_f1.k_score_, 4) == 0.9727, with_f1.k_score_
def test_knn_option_sffs():
    """Forward floating selection of three iris features with 4-NN.

    The selected feature indices must be ``(1, 2, 3)``.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    selector = SFS(
        KNeighborsClassifier(n_neighbors=4),
        k_features=3,
        forward=True,
        floating=True,
        scoring='accuracy',
        cv=4,
        skip_if_stuck=True,
        print_progress=False,
    )
    selector = selector.fit(features, labels)

    expected_idx = (1, 2, 3)
    assert selector.k_feature_idx_ == expected_idx
def test_knn_option_sbs_tuplerange_1():
    """Backward (non-floating) selection with ``k_features=(1, 3)`` on iris.

    Checks the score rounded to three decimals and the selected indices; the
    observed value is attached to each assertion as its failure message.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    options = dict(
        k_features=(1, 3),
        forward=False,
        floating=False,
        cv=4,
        skip_if_stuck=True,
        verbose=0,
    )
    selector = SFS(KNeighborsClassifier(n_neighbors=3), **options)
    selector = selector.fit(features, labels)

    assert round(selector.k_score_, 3) == 0.967, selector.k_score_
    assert selector.k_feature_idx_ == (0, 2, 3), selector.k_feature_idx_
def test_regression():
    """Forward selection of 13 features with linear regression on Boston data.

    The scorer is taken from the ``MEAN_SQUARED_ERROR`` name that this function
    expects to find in the enclosing scope.
    """
    dataset = load_boston()
    features, target = dataset.data, dataset.target

    selector = SFS(
        LinearRegression(),
        k_features=13,
        forward=True,
        floating=False,
        scoring=MEAN_SQUARED_ERROR,
        cv=10,
        skip_if_stuck=True,
        verbose=0,
    )
    selector = selector.fit(features, target)

    n_selected = len(selector.k_feature_idx_)
    assert n_selected == 13
    assert round(selector.k_score_, 4) == -34.7631
def test_regression_in_range():
    """Forward selection with ``k_features=(1, 13)`` on the Boston data.

    Expects nine selected features and a score of -31.1537 (four decimals).
    """
    dataset = load_boston()
    features, target = dataset.data, dataset.target

    options = dict(
        k_features=(1, 13),
        forward=True,
        floating=False,
        scoring='neg_mean_squared_error',
        cv=10,
        skip_if_stuck=True,
        verbose=0,
    )
    selector = SFS(LinearRegression(), **options)
    selector = selector.fit(features, target)

    assert len(selector.k_feature_idx_) == 9
    assert round(selector.k_score_, 4) == -31.1537
def test_get_metric_dict_not_fitted():
    """``get_metric_dict`` on an unfitted selector must raise AttributeError."""
    unfitted = SFS(
        KNeighborsClassifier(n_neighbors=4),
        k_features=2,
        forward=True,
        floating=False,
        cv=0,
        clone_estimator=False,
        verbose=0,
        n_jobs=1,
    )
    message = 'SequentialFeatureSelector has not been fitted, yet.'
    assert_raises(AttributeError, message, unfitted.get_metric_dict)
def feature_selection(data, label, num_channel):
    """Return the channel indices in ``0..31`` that forward SFS did not keep.

    A linear-regression SFS (``r2`` scoring, ``cv=0``) selects ``num_channel``
    features from ``data``; the selected feature names are converted to
    integers and subtracted from the fixed set of 32 channel indices.

    Parameters
    ----------
    data, label
        Passed unchanged to ``SFS.fit``.
    num_channel
        Passed to SFS as ``k_features``.

    Returns
    -------
    list of int
        Channel indices to remove.
    """
    print("test")
    every_channel = set(range(32))

    selector = SFS(
        LinearRegression(),
        k_features=num_channel,
        forward=True,
        floating=False,
        scoring='r2',
        cv=0,
    )
    selector.fit(data, label)

    # Feature names must be integer-like for the int() conversion to succeed.
    kept_channels = {int(name) for name in selector.k_feature_names_}
    return list(every_channel.difference(kept_channels))
def test_knn_option_sfbs():
    """Backward floating selection of three iris features with 4-NN.

    The selected feature indices must be ``(1, 2, 3)``.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    options = dict(
        k_features=3,
        forward=False,
        floating=True,
        scoring='accuracy',
        cv=4,
        skip_if_stuck=True,
        verbose=0,
    )
    selector = SFS(KNeighborsClassifier(n_neighbors=4), **options)
    selector = selector.fit(features, labels)

    assert selector.k_feature_idx_ == (1, 2, 3)
def test_kfeatures_type_1():
    """Fitting with ``k_features=0`` must raise AttributeError."""
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    message = ('k_features must be a positive integer'
               ' between 1 and X.shape[1],'
               ' got 0')
    selector = SFS(estimator=KNeighborsClassifier(),
                   verbose=0,
                   k_features=0)
    assert_raises(AttributeError, message, selector.fit, features, labels)
def test_predefinedholdoutsplit_in_sfs():
    """Using a PredefinedHoldoutSplit as ``cv`` must give one CV score.

    ``X`` and ``y`` are read from the enclosing scope.
    """
    holdout = PredefinedHoldoutSplit(valid_indices=[0, 1, 99])

    selector = SFS(
        KNeighborsClassifier(n_neighbors=4),
        k_features=3,
        forward=True,
        floating=False,
        verbose=2,
        scoring='accuracy',
        cv=holdout,
    )
    selector = selector.fit(X, y)

    metrics = selector.get_metric_dict()
    n_scores = metrics[1]['cv_scores'].shape[0]
    assert n_scores == 1
def test_randomholdoutsplit_in_sfs():
    """Using a RandomHoldoutSplit as ``cv`` must give one CV score.

    ``X`` and ``y`` are read from the enclosing scope.
    """
    holdout = RandomHoldoutSplit(valid_size=0.3, random_seed=123)

    options = dict(
        k_features=3,
        forward=True,
        floating=False,
        verbose=2,
        scoring='accuracy',
        cv=holdout,
    )
    selector = SFS(KNeighborsClassifier(n_neighbors=4), **options)
    selector = selector.fit(X, y)

    metrics = selector.get_metric_dict()
    assert metrics[1]['cv_scores'].shape[0] == 1
def test_kfeatures_type_5():
    """Fitting with the tuple ``k_features=(3, 1)`` must raise AttributeError."""
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    # The expected text is kept exactly as the original test spelled it.
    message = ('he min k_features value must be'
               ' larger than the max k_features value.')
    selector = SFS(estimator=KNeighborsClassifier(),
                   verbose=0,
                   k_features=(3, 1))
    assert_raises(AttributeError, message, selector.fit, features, labels)
def test_max_feature_subset_size_in_tuple_range():
    """Backward floating selection with ``k_features=(1, 5)`` on Boston data.

    The selected subset must contain five features.
    """
    dataset = load_boston()
    features, target = dataset.data, dataset.target

    selector = SFS(
        LinearRegression(),
        k_features=(1, 5),
        forward=False,
        floating=True,
        scoring='neg_mean_squared_error',
        cv=10,
    )
    selector = selector.fit(features, target)

    n_selected = len(selector.k_feature_idx_)
    assert n_selected == 5
def test_string_scoring_clf():
    """Default, string and ``make_scorer`` accuracy scoring must agree.

    Three selectors are fitted on iris with the same 4-NN estimator and
    ``cv=0``: one without a ``scoring`` argument, one with
    ``scoring='accuracy'`` and one with ``make_scorer(accuracy_score)``.
    All three must report the same ``k_score_``.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn, k_features=3, cv=0)
    sfs1 = sfs1.fit(X, y)

    sfs2 = SFS(knn, k_features=3, scoring='accuracy', cv=0)
    sfs2 = sfs2.fit(X, y)

    sfs3 = SFS(knn, k_features=3, scoring=make_scorer(accuracy_score), cv=0)
    # Fit the make_scorer selector itself. The previous code assigned
    # ``sfs2.fit(X, y)`` to ``sfs3``, so this configuration was never fitted
    # and the final assertion compared sfs1 against sfs2 a second time.
    sfs3 = sfs3.fit(X, y)

    assert sfs1.k_score_ == sfs2.k_score_
    assert sfs1.k_score_ == sfs3.k_score_
def main():
    """Run floating SFS with 7-NN for several subset sizes and report results.

    For each size, a 3-NN classifier is trained on the selected columns and
    its training/testing accuracy, confusion matrix and classification report
    are printed. For the two-feature case, scatter plots of the training and
    test data are shown.
    """
    x_train, y_train, x_test, y_test = get_data()

    for n in [2, 3, 5, 10, 16]:
        selector = SFS(
            KNeighborsClassifier(n_neighbors=7),
            k_features=n,
            forward=False,
            floating=True,
            scoring='accuracy',
            cv=0,
        )
        selector = selector.fit(x_train, y_train)

        # NOTE(review): the label says "Forward" while the selector is built
        # with forward=False -- confirm which of the two is intended.
        print('\nSequential Floating Forward Selection: ', n)
        feat_cols = list(selector.k_feature_idx_)
        print(feat_cols)

        train_subset = x_train[:, feat_cols]
        test_subset = x_test[:, feat_cols]

        evaluator = KNeighborsClassifier(n_neighbors=3)
        evaluator.fit(train_subset, y_train)

        train_pred = evaluator.predict(train_subset)
        print('Training accuracy on selected features: %.3f'
              % acc(y_train, train_pred))

        test_pred = evaluator.predict(test_subset)
        print('Testing accuracy on selected features: %.3f'
              % acc(y_test, test_pred))

        print(confusion_matrix(y_test, test_pred))
        print(classification_report(y_test, test_pred))

        if n != 2:
            continue

        # Two selected features: draw train (top) and test (bottom) scatter.
        fig, axs = plt.subplots(2)
        fig.suptitle("SFS(KNN) Scatter Plot", fontsize='small')
        first_col, second_col = feat_cols[0], feat_cols[1]
        panels = ((x_train, y_train), (x_test, y_test))
        for row, (points, colours) in enumerate(panels):
            axs[row].scatter(points[:, first_col], points[:, second_col],
                             marker='o', c=colours, s=25, edgecolor='k')
        plt.show()
def sequential_feature_selection(data_set, y_values, want_graph):
    """Fit a 13-feature forward SFS with linear regression and return it.

    Parameters
    ----------
    data_set, y_values
        Passed unchanged to ``SFS.fit``.
    want_graph
        When truthy, plot the selector's metric dict with ``plot_sfs``
        (``kind='std_err'``) and show the figure.

    Returns
    -------
    The fitted selector.
    """
    options = dict(
        k_features=13,
        forward=True,
        floating=False,
        scoring='neg_mean_squared_error',
        cv=10,
    )
    selector = SFS(LinearRegression(), **options)
    selector = selector.fit(data_set, y_values)

    if not want_graph:
        return selector

    plot_sfs(selector.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection (w. StdErr)')
    plt.grid()
    plt.show()
    return selector
def test_pandas():
    """Fixed features given by column name must appear in every subset.

    ``X_iris`` and ``y_iris`` are read from the enclosing scope. The two
    sepal columns are passed as ``fixed_features``; their positional indices
    (0 and 1) must be present in the ``feature_idx`` of every entry in
    ``subsets_``.
    """
    # The third column was previously also named 'petal width', which gave the
    # frame two identically named columns; it is now 'petal length'.
    X_df = pd.DataFrame(X_iris, columns=['sepal length',
                                         'sepal width',
                                         'petal length',
                                         'petal width'])
    knn = KNeighborsClassifier()
    sfs = SFS(estimator=knn,
              k_features=3,
              forward=True,
              floating=False,
              fixed_features=('sepal length', 'sepal width'),
              verbose=0)
    sfs.fit(X_df, y_iris)
    print(sfs.subsets_)

    for k in sfs.subsets_:
        assert 0 in sfs.subsets_[k]['feature_idx']
        assert 1 in sfs.subsets_[k]['feature_idx']
def test_knn_option_sfbs_tuplerange_2():
    """Backward floating selection with ``k_features=(1, 4)`` on iris (3-NN).

    Checks the score rounded to three decimals and the selected indices; the
    observed value is attached to each assertion as its failure message.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    selector = SFS(
        KNeighborsClassifier(n_neighbors=3),
        k_features=(1, 4),
        forward=False,
        floating=True,
        scoring='accuracy',
        cv=4,
        skip_if_stuck=True,
        print_progress=False,
    )
    selector = selector.fit(features, labels)

    assert round(selector.k_score_, 3) == 0.966, selector.k_score_
    assert selector.k_feature_idx_ == (1, 2, 3), selector.k_feature_idx_
def test_clone_params_pass():
    """Two-feature forward selection with a cloned SoftmaxRegression on iris.

    With ``clone_estimator=True`` the selected indices must be ``(1, 3)``.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    options = dict(
        k_features=2,
        forward=True,
        floating=False,
        scoring='accuracy',
        cv=0,
        clone_estimator=True,
        verbose=0,
        n_jobs=1,
    )
    selector = SFS(SoftmaxRegression(random_seed=1), **options)
    selector = selector.fit(features, labels)

    assert selector.k_feature_idx_ == (1, 3)
def test_clone_params_pass():
    """Three-feature forward selection with an uncloned SoftmaxRegression.

    With ``clone_estimator=False`` the selected indices must be ``(0, 1, 2)``.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    selector = SFS(
        SoftmaxRegression(random_seed=1),
        k_features=3,
        forward=True,
        floating=False,
        scoring='accuracy',
        cv=0,
        skip_if_stuck=True,
        clone_estimator=False,
        print_progress=False,
        n_jobs=1,
    )
    selector = selector.fit(features, labels)

    expected_idx = (0, 1, 2)
    assert selector.k_feature_idx_ == expected_idx
def test_transform_not_fitted():
    """``transform`` on an unfitted selector must raise AttributeError."""
    features = load_iris().data

    options = dict(
        k_features=2,
        forward=True,
        floating=False,
        cv=0,
        clone_estimator=False,
        verbose=0,
        n_jobs=1,
    )
    unfitted = SFS(KNeighborsClassifier(n_neighbors=4), **options)

    message = 'SequentialFeatureSelector has not been fitted, yet.'
    assert_raises(AttributeError, message, unfitted.transform, features)
def sub_window_creation(self, images, kernels):
    """Build sub-window features, embed them and run floating forward SFS.

    For every window origin ``(i, j)`` on an 11-pixel grid and every image,
    a ``[i:i+50, j:j+50]`` slice is taken, resized to 12x12 with
    nearest-neighbour interpolation and passed through
    ``Preprocessing.process``. Each result is labelled with ``k // 4`` where
    ``k`` is the image's position in ``images``. The collected features are
    reduced with ``SpectralEmbedding`` and a 5-NN forward floating SFS is
    fitted on the embedding; the selected indices and score are printed.

    Parameters
    ----------
    images
        Indexable sequence of 2-D image arrays.
        NOTE(review): the ``k // 4`` labelling presumably means four
        consecutive images per class -- confirm with the caller.
    kernels
        Forwarded unchanged to ``Preprocessing.process``.
    """
    k_features = 5
    gb_all_sw = []
    label = []

    for i in range(0, 100, 11):
        for j in range(0, 50, 11):
            for k, image in enumerate(images):
                sw_image = image[i:i + 50, j:j + 50]
                sw_image = cv2.resize(sw_image, dsize=(12, 12),
                                      interpolation=cv2.INTER_NEAREST)
                gabored_image = Preprocessing.process(self, sw_image, kernels)
                gb_all_sw.append(gabored_image)
                label.append(k // 4)

    print(len(gb_all_sw))
    print(len(gb_all_sw[0]))

    # Spectral-embedding dimensionality reduction of the sub-window features.
    model = SpectralEmbedding(n_components=100, n_neighbors=10)
    reduced_sw = model.fit_transform(gb_all_sw)

    knn = KNeighborsClassifier(n_neighbors=5)
    sffs = SFS(knn,
               k_features=k_features,
               forward=True,
               floating=True,
               scoring='accuracy',
               cv=4,
               n_jobs=-1)
    sffs = sffs.fit(reduced_sw, label)

    # Report the subset size that was actually requested. The previous code
    # printed the leftover outer-loop index ``i`` as "k".
    print('\nSequential Forward Floating Selection (k=', k_features, '):')
    print(sffs.k_feature_idx_)
    print('CV Score:')
    print(sffs.k_score_)
# select_features: build a forward, non-floating SFS around `model` (scoring
# taken from `scorer`, CV from `cv`, verbose=2), fit it on `X_train` and
# `y_train` after converting both with np.array, and print the selected
# feature indices. Nothing is returned.
# NOTE(review): the line ends with an opening triple-quote whose closing
# delimiter is not visible here; it is left untouched.
def select_features(self, model, X_train, y_train, k_features=(1, 30), scorer='r2', cv=0): sfs = SFS(model, k_features=k_features, forward=True, floating=False, scoring=scorer, cv=cv, verbose=2) sfs.fit(np.array(X_train), np.array(y_train)) print(sfs.k_feature_idx_) """
def test_check_pandas_dataframe_fit_backward():
    """Backward SFS must report names/indices for arrays and DataFrames.

    Runs once with ``floating=True`` and once with ``floating=False``. For
    each, the selector is fitted on the raw array, then on a DataFrame with
    named columns, and finally again with ``_TESTING_INTERRUPT_MODE`` set.
    """
    column_names = ['sepal len', 'sepal width', 'petal len', 'petal width']
    three_names = ('sepal len', 'sepal width', 'petal len')
    two_names = ('sepal width', 'petal len')

    for floating in [True, False]:
        dataset = load_iris()
        features, labels = dataset.data, dataset.target

        selector = SFS(
            SoftmaxRegression(random_seed=1),
            k_features=2,
            forward=False,
            floating=floating,
            scoring='accuracy',
            cv=0,
            verbose=0,
            n_jobs=1,
        )
        frame = pd.DataFrame(features, columns=column_names)

        # Plain array input: feature names are stringified positions.
        selector = selector.fit(features, labels)
        assert selector.k_feature_idx_ == (1, 2)
        assert selector.k_feature_names_ == ('1', '2')
        assert selector.subsets_[2]['feature_names'] == ('1', '2')

        # DataFrame input: feature names are the column labels.
        selector = selector.fit(frame, labels)
        assert selector.subsets_[3]['feature_names'] == three_names
        assert selector.subsets_[2]['feature_names'] == two_names
        assert selector.subsets_[3]['feature_idx'] == (0, 1, 2)
        assert selector.subsets_[2]['feature_idx'] == (1, 2)
        assert selector.k_feature_idx_ == (1, 2)
        assert selector.k_feature_names_ == two_names

        # Interrupted fit: results up to the interruption must be kept.
        selector._TESTING_INTERRUPT_MODE = True
        interrupted = selector.fit(frame, labels)
        assert len(interrupted.subsets_.keys()) > 0
        assert selector.interrupted_
        assert selector.subsets_[3]['feature_names'] == three_names
        assert selector.k_feature_idx_ == (0, 1, 2)
        assert selector.k_feature_names_ == three_names
def sff_selection(k_features, pipeline, x, y, fwd=True, flt=True):
    '''
    Selects a subset of available features

    Input:
        k_features - number of features to select
        pipeline - predictor pipeline
        x, y - features and labels; both must expose ``.values`` and ``x``
            must expose ``.columns`` (e.g. pandas DataFrame / Series)
        fwd, flt - boolean parameters for the SFFS algorithm, see mlxtend docs

    Output:
        tuple of accuracy score and list of k selected features

    The mlxtend SFS function implements four related feature selection
    algorithms; if the default parameters (fwd=True, flt=True) are not
    changed, this is Sequential Floating Forward Selection (SFFS).

    SFFS has been identified as a good way of performing dimension reduction
    through feature selection on triaxial accelerometer data:
    Gupta and Tim Dallas (2014) "Feature Selection and Activity Recognition
    System Using a Single Triaxial Accelerometer" IEEE Trans. Biomed. Eng.,
    61(6)
    '''
    # hyperparameters
    sffs_scoring = 'accuracy'
    sffs_cv_folds = 10

    # Feature selection
    sffs = SFS(pipeline,
               k_features=k_features,
               forward=fwd,
               floating=flt,
               scoring=sffs_scoring,
               cv=sffs_cv_folds,
               n_jobs=-1)
    # ``.values`` replaces ``.as_matrix()``, which was deprecated in pandas
    # 0.23 and removed in pandas 1.0; both yield the underlying ndarray.
    sffs = sffs.fit(x.values, y.values)

    # list of the k best features
    feat_names = list(x.columns.values)
    feat_list = [feat_names[i] for i in sffs.k_feature_idx_]

    # return the prediction score and feature name list
    return sffs.k_score_, feat_list
def run_decision_tree(self):
    """Run forward floating SFS with a decision-tree regressor on ``self.train``.

    All columns of ``self.train`` except 'date', 'ticker' and 'return' are
    offered as candidate features, with 'return' as the target. The resulting
    score and selected feature names are stored on ``self.score`` and
    ``self.features``.
    """
    regressor = DecisionTreeRegressor(random_state=7,
                                      max_depth=self.max_depth)
    selector = SFS(
        regressor,
        k_features=self.k_features,
        forward=True,
        floating=True,
        scoring=self.scoring,
        n_jobs=-1,
        cv=4,
    )

    excluded = ['date', 'ticker', 'return']
    candidate_cols = list(self.train.columns.drop(excluded))

    selector = selector.fit(self.train[candidate_cols], self.train['return'])

    self.score = selector.k_score_
    self.features = list(selector.k_feature_names_)
def get_core_features(self, X, y) -> List[str]:
    """Return the column names chosen by the configured selection method.

    ``self.method == "SFS"`` runs a 10-feature forward SFS with logistic
    regression (``roc_auc`` scoring, ``cv=0``) on ``X.values`` / ``y.values``.
    ``self.method == "RFE"`` runs ``RFE(self.model, self.n_features)`` on
    ``X`` / ``y`` and keeps the columns flagged in ``support_``.

    Raises
    ------
    ValueError
        If ``self.method`` is neither "SFS" nor "RFE".
    """
    if self.method == "SFS":
        selector = SFS(
            LogisticRegression(),
            k_features=10,
            forward=True,
            cv=0,
            scoring="roc_auc",
        )
        fitted = selector.fit(X.values, y.values)
        return [X.columns[position] for position in fitted.k_feature_idx_]
    elif self.method == "RFE":
        eliminator = RFE(self.model, self.n_features)
        fitted = eliminator.fit(X, y)
        return [
            column
            for keep, column in zip(fitted.support_, X.columns)
            if keep
        ]
    else:
        raise ValueError("Unknown method for core feature selection")
def select_r2(df_in, ss_label, f_n, eps):
    """Reduce ``df_in`` to at most ``f_n`` columns via random-forest SFS.

    Parameters
    ----------
    df_in
        Input frame; it is copied and never modified.
    ss_label
        Labels; ``ss_label.values`` is used as the target for SFS.
    f_n
        Number of features to keep.
    eps
        Passed to ``RandomForestClassifier`` as ``n_estimators``.

    Returns
    -------
    tuple
        ``(values, comma_joined_column_names, number_of_columns)``. When the
        frame has no more than ``f_n`` columns it is returned unchanged.
    """
    frame = df_in.copy()

    # Nothing to select when the frame is already small enough.
    if len(frame.columns) <= f_n:
        all_names = frame.columns.tolist()
        return frame.values, ','.join(all_names), len(all_names)

    forest = RandomForestClassifier(n_estimators=eps, random_state=1)
    selector = SFS(
        forest,
        k_features=f_n,
        forward=True,
        floating=False,
        scoring='accuracy',
        cv=4,
        n_jobs=3,
    )
    selector.fit(frame.values, ss_label.values)

    chosen_idx = selector.k_feature_idx_
    reduced = selector.transform(frame.values)
    chosen_names = frame.columns[list(chosen_idx)]
    return reduced, ','.join(chosen_names), len(chosen_names)
def make_features_selection(X_train, y_train, is_forward):
    """Run floating SFS with a LinearSVC and hand the result to the reporters.

    The SVM regularisation constant ``C`` is read from ``sys.argv[1]``. The
    target subset size is 90 for forward selection, otherwise the number of
    columns minus 12. Cross-validation uses ``RepeatedKFold`` configured from
    the ``Q`` and ``T`` names, and scoring comes from ``SCORING``; all three
    are expected in the enclosing scope. The fitted selector is passed to
    ``make_plot`` and ``make_debug_info``.
    """
    curr_C = float(sys.argv[1])
    folds = RepeatedKFold(n_splits=Q, n_repeats=T)

    if is_forward:
        target_size = 90
    else:
        target_size = len(X_train.columns) - 12

    classifier = LinearSVC(penalty='l2', dual=False, C=curr_C)
    options = dict(
        estimator=classifier,
        k_features=target_size,
        forward=is_forward,
        floating=True,
        n_jobs=-1,
        verbose=2,
        scoring=SCORING,
        cv=folds,
    )
    selector = SFS(**options)
    selector = selector.fit(X_train.values, y_train)

    make_plot(selector, curr_C, is_forward)
    make_debug_info(selector, curr_C, is_forward)