def test_selects_all():
    # Snippets throughout assume: from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from sklearn.neighbors import KNeighborsClassifier
    from mlxtend.data import wine_data

    X, y = wine_data()
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs = SFS(knn, k_features=13, scoring='accuracy', cv=3, print_progress=False)
    sfs.fit(X, y)
    assert len(sfs.indices_) == 13

def test_Iris():
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.datasets import load_iris

    iris = load_iris()
    X = iris.data
    y = iris.target

    knn = KNeighborsClassifier(n_neighbors=4)

    sfs = SFS(knn, k_features=2, scoring='accuracy', cv=5, print_progress=False)
    sfs.fit(X, y)

    assert sfs.indices_ == (2, 3)
    assert round(sfs.k_score_, 2) == 0.97

def test_regression():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=13,
                forward=True,
                floating=False,
                scoring='mean_squared_error',  # legacy sklearn scorer name; 'neg_mean_squared_error' in sklearn >= 0.18
                cv=10,
                skip_if_stuck=True,
                print_progress=False)
    sfs_r = sfs_r.fit(X, y)
    assert len(sfs_r.k_feature_idx_) == 13
    assert round(sfs_r.k_score_, 4) == -34.7631
Example #4
def test_knn_scoring_metric():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs5 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs5 = sfs5.fit(X, y)
    assert round(sfs5.k_score_, 4) == 0.9728

    sfs6 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs6 = sfs6.fit(X, y)
    assert round(sfs6.k_score_, 4) == 0.9728

    sfs7 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='f1_macro',
               cv=4,
               skip_if_stuck=True,
    )
    sfs7 = sfs7.fit(X, y)
    assert round(sfs7.k_score_, 4) == 0.9727, sfs7.k_score_
Example #5
def test_knn_option_sffs():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs2 = SFS(knn,
               k_features=3,
               forward=True,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs2 = sfs2.fit(X, y)
    assert sfs2.k_feature_idx_ == (1, 2, 3)

def test_knn_option_sbs_tuplerange_1():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=3)
    sfs4 = SFS(knn,
               k_features=(1, 3),
               forward=False,
               floating=False,
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs4 = sfs4.fit(X, y)
    assert round(sfs4.k_score_, 3) == 0.967, sfs4.k_score_
    assert sfs4.k_feature_idx_ == (0, 2, 3), sfs4.k_feature_idx_

def test_regression():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=13,
                forward=True,
                floating=False,
                scoring=MEAN_SQUARED_ERROR,  # presumably a module-level scoring-string constant in the original test file
                cv=10,
                skip_if_stuck=True,
                verbose=0)
    sfs_r = sfs_r.fit(X, y)
    assert len(sfs_r.k_feature_idx_) == 13
    assert round(sfs_r.k_score_, 4) == -34.7631

def test_regression_in_range():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=(1, 13),
                forward=True,
                floating=False,
                scoring='neg_mean_squared_error',
                cv=10,
                skip_if_stuck=True,
                verbose=0)
    sfs_r = sfs_r.fit(X, y)
    assert len(sfs_r.k_feature_idx_) == 9
    assert round(sfs_r.k_score_, 4) == -31.1537

def test_get_metric_dict_not_fitted():
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn,
               k_features=2,
               forward=True,
               floating=False,
               cv=0,
               clone_estimator=False,
               verbose=0,
               n_jobs=1)

    expect = 'SequentialFeatureSelector has not been fitted, yet.'

    assert_raises(AttributeError, expect, sfs1.get_metric_dict)

def feature_selection(data, label, num_channel):
    channel_all_list = set(range(32))
    sfs = SFS(LinearRegression(),
              k_features=num_channel,
              forward=True,
              floating=False,
              scoring='r2',
              cv=0)
    sfs.fit(data, label)
    x = sfs.k_feature_names_  # the final set of selected features
    channel_list = set(int(a) for a in x)
    channel_rm_list = list(channel_all_list.difference(channel_list))
    return channel_rm_list
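
A minimal usage sketch for the helper above (hedged: 'data', 'label', and the 8-channel request below are illustrative placeholders, not from the original source):

import numpy as np

data = np.random.rand(100, 32)   # placeholder: 100 samples, 32 channels
label = np.random.rand(100)      # placeholder continuous target for the 'r2' scorer

channels_to_remove = feature_selection(data, label, num_channel=8)
reduced = np.delete(data, channels_to_remove, axis=1)
print(reduced.shape)  # -> (100, 8): only the 8 SFS-selected channels remain
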
def test_knn_option_sfbs():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs4 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs4 = sfs4.fit(X, y)
    assert sfs4.k_feature_idx_ == (1, 2, 3)

def test_kfeatures_type_1():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier()
    expect = ('k_features must be a positive integer between 1 and X.shape[1],'
              ' got 0')
    sfs = SFS(estimator=knn,
              verbose=0,
              k_features=0)
    assert_raises(AttributeError,
                  expect,
                  sfs.fit,
                  X,
                  y)
Example #13
def test_predefinedholdoutsplit_in_sfs():
    h_iter = PredefinedHoldoutSplit(valid_indices=[0, 1, 99])
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=h_iter)

    sfs1 = sfs1.fit(X, y)  # X, y: presumably module-level Iris arrays in the original test file
    d = sfs1.get_metric_dict()
    assert d[1]['cv_scores'].shape[0] == 1
Example #14
def test_randomholdoutsplit_in_sfs():
    h_iter = RandomHoldoutSplit(valid_size=0.3, random_seed=123)
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=h_iter)

    sfs1 = sfs1.fit(X, y)  # X, y as above: module-level data in the original test file
    d = sfs1.get_metric_dict()
    assert d[1]['cv_scores'].shape[0] == 1
Example #15
def test_kfeatures_type_5():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier()
    expect = ('The min k_features value must be smaller'
              ' than the max k_features value.')
    sfs = SFS(estimator=knn,
              verbose=0,
              k_features=(3, 1))
    assert_raises(AttributeError,
                  expect,
                  sfs.fit,
                  X,
                  y)

def test_max_feature_subset_size_in_tuple_range():
    boston = load_boston()
    X, y = boston.data, boston.target

    lr = LinearRegression()

    sfs = SFS(lr,
              k_features=(1, 5),
              forward=False,
              floating=True,
              scoring='neg_mean_squared_error',
              cv=10)

    sfs = sfs.fit(X, y)
    assert len(sfs.k_feature_idx_) == 5

def test_string_scoring_clf():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               cv=0)
    sfs1 = sfs1.fit(X, y)

    sfs2 = SFS(knn,
               k_features=3,
               scoring='accuracy',
               cv=0)
    sfs2 = sfs2.fit(X, y)

    sfs3 = SFS(knn,
               k_features=3,
               scoring=make_scorer(accuracy_score),
               cv=0)
    sfs3 = sfs3.fit(X, y)

    assert sfs1.k_score_ == sfs2.k_score_
    assert sfs1.k_score_ == sfs3.k_score_

def main():

    x_train, y_train, x_test, y_test = get_data()

    for n in [2, 3, 5, 10, 16]:
        sfs = SFS(KNeighborsClassifier(n_neighbors=7),
                  k_features=n,
                  forward=False,
                  floating=True,
                  scoring='accuracy',
                  cv=0)
        sfs = sfs.fit(x_train, y_train)

        print('\nSequential Floating Backward Selection: ', n)
        feat_cols = list(sfs.k_feature_idx_)
        print(feat_cols)

        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(x_train[:, feat_cols], y_train)

        y_train_pred = knn.predict(x_train[:, feat_cols])
        print('Training accuracy on selected features: %.3f' %
              acc(y_train, y_train_pred))

        y_test_pred = knn.predict(x_test[:, feat_cols])
        print('Testing accuracy on selected features: %.3f' %
              acc(y_test, y_test_pred))

        print(confusion_matrix(y_test, y_test_pred))
        print(classification_report(y_test, y_test_pred))

        if n == 2:
            fig, axs = plt.subplots(2)
            fig.suptitle("SFS(KNN) Scatter Plot", fontsize='small')
            axs[0].scatter(x_train[:, feat_cols[0]],
                           x_train[:, feat_cols[1]],
                           marker='o',
                           c=y_train,
                           s=25,
                           edgecolor='k')
            axs[1].scatter(x_test[:, feat_cols[0]],
                           x_test[:, feat_cols[1]],
                           marker='o',
                           c=y_test,
                           s=25,
                           edgecolor='k')

            plt.show()

def sequential_feature_selection(data_set, y_values, want_graph):
    lr = LinearRegression()
    sfs = SFS(lr,
              k_features=13,
              forward=True,
              floating=False,
              scoring='neg_mean_squared_error',
              cv=10)
    sfs = sfs.fit(data_set, y_values)
    if want_graph:
        fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection (w. StdErr)')
        plt.grid()
        plt.show()

    return sfs
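
A hedged usage sketch for the helper above, reusing the Boston housing data that the regression tests in this section already rely on:

boston = load_boston()  # same dataset the regression tests above use
selector = sequential_feature_selection(boston.data, boston.target, want_graph=True)  # True pops the SFS plot
print(selector.k_feature_idx_)
print(round(selector.k_score_, 4))
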
def test_pandas():

    # X_iris, y_iris are presumably module-level Iris arrays in the original test file
    X_df = pd.DataFrame(X_iris, columns=['sepal length', 'sepal width',
                                         'petal length', 'petal width'])
    knn = KNeighborsClassifier()
    sfs = SFS(estimator=knn,
              k_features=3,
              forward=True,
              floating=False,
              fixed_features=('sepal length', 'sepal width'),
              verbose=0)
    sfs.fit(X_df, y_iris)
    print(sfs.subsets_)
    for k in sfs.subsets_:
        assert 0 in sfs.subsets_[k]['feature_idx']
        assert 1 in sfs.subsets_[k]['feature_idx']

def test_knn_option_sfbs_tuplerange_2():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=3)
    sfs4 = SFS(knn,
               k_features=(1, 4),
               forward=False,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs4 = sfs4.fit(X, y)
    assert round(sfs4.k_score_, 3) == 0.966, sfs4.k_score_
    assert sfs4.k_feature_idx_ == (1, 2, 3), sfs4.k_feature_idx_
Example #22
def test_clone_params_pass():
    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    sfs1 = SFS(lr,
               k_features=2,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=0,
               clone_estimator=True,
               verbose=0,
               n_jobs=1)
    sfs1 = sfs1.fit(X, y)
    assert sfs1.k_feature_idx_ == (1, 3)
Example #23
def test_clone_params_pass():
    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    sfs1 = SFS(lr,
               k_features=3,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=0,
               skip_if_stuck=True,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)
    sfs1 = sfs1.fit(X, y)
    assert sfs1.k_feature_idx_ == (0, 1, 2)
Example #24
def test_transform_not_fitted():
    iris = load_iris()
    X = iris.data
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn,
               k_features=2,
               forward=True,
               floating=False,
               cv=0,
               clone_estimator=False,
               verbose=0,
               n_jobs=1)

    expect = 'SequentialFeatureSelector has not been fitted, yet.'

    assert_raises(AttributeError, expect, sfs1.transform, X)
Example #25
    def sub_window_creation(self, images, kernels):
        gb_all_sw = []
        label = []
        for i in range(0, 100, 11):
            for j in range(0, 50, 11):
                for k in range(len(images)):
                    image = images[k]
                    sw_image = image[i:i + 50, j:j + 50]
                    sw_image = cv2.resize(sw_image,
                                          dsize=(12, 12),
                                          interpolation=cv2.INTER_NEAREST)
                    # print('sw size', sw_image.shape)
                    gabored_image = Preprocessing.process(
                        self, sw_image, kernels)
                    # print('gab size', gabored_image.shape)
                    # model = SpectralEmbedding(n_components=100, n_neighbors=10)
                    # reduced_sw = model.fit_transform(gabored_image.reshape(-1, 1))
                    # print('gab size', gabored_image.reshape(1, -1).shape)
                    gb_all_sw.append(gabored_image)
                    label.append(int(k / 4))
                    # print('red size', reduced_sw.reshape(-1, 1).shape)
                    # plt.imshow(image[i:i+50, j:j+50], cmap='gray')
                    # plt.show()
                    # plt.imshow(gabored_image, cmap='gray')
                    # plt.show()
        print(len(gb_all_sw))
        print(len(gb_all_sw[0]))
        # LEM dimension reduction
        model = SpectralEmbedding(n_components=100, n_neighbors=10)
        reduced_sw = model.fit_transform(gb_all_sw)

        knn = KNeighborsClassifier(n_neighbors=5)
        sffs = SFS(knn,
                   k_features=5,
                   forward=True,
                   floating=True,
                   scoring='accuracy',
                   cv=4,
                   n_jobs=-1)
        sffs = sffs.fit(reduced_sw, label)
        print('\nSequential Forward Floating Selection (k=5):')
        print(sffs.k_feature_idx_)
        print('CV Score:')
        print(sffs.k_score_)
Example #26
    def select_features(self,
                        model,
                        X_train,
                        y_train,
                        k_features=(1, 30),
                        scorer='r2',
                        cv=0):
        sfs = SFS(model,
                  k_features=k_features,
                  forward=True,
                  floating=False,
                  scoring=scorer,
                  cv=cv,
                  verbose=2)
        sfs.fit(np.array(X_train), np.array(y_train))
        print(sfs.k_feature_idx_)
     """
Example #27
def test_check_pandas_dataframe_fit_backward():
    for floating in [True, False]:
        iris = load_iris()
        X = iris.data
        y = iris.target
        lr = SoftmaxRegression(random_seed=1)
        sfs1 = SFS(lr,
                   k_features=2,
                   forward=False,
                   floating=floating,
                   scoring='accuracy',
                   cv=0,
                   verbose=0,
                   n_jobs=1)

        df = pd.DataFrame(
            X,
            columns=['sepal len', 'sepal width', 'petal len', 'petal width'])

        sfs1 = sfs1.fit(X, y)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('1', '2')
        assert sfs1.subsets_[2]['feature_names'] == ('1', '2')

        sfs1 = sfs1.fit(df, y)
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
                                                     'sepal width',
                                                     'petal len')
        assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
                                                     'petal len')
        assert sfs1.subsets_[3]['feature_idx'] == (0, 1, 2)
        assert sfs1.subsets_[2]['feature_idx'] == (1, 2)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('sepal width', 'petal len')

        sfs1._TESTING_INTERRUPT_MODE = True
        out = sfs1.fit(df, y)
        assert len(out.subsets_.keys()) > 0
        assert sfs1.interrupted_
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
                                                     'sepal width',
                                                     'petal len')
        assert sfs1.k_feature_idx_ == (0, 1, 2)
        assert sfs1.k_feature_names_ == ('sepal len', 'sepal width',
                                         'petal len')
Example #28
def sff_selection(k_features, pipeline, x, y, fwd=True, flt=True):
    '''
    Selects a subset of available features

    Input:
      k_features - number of features to select
      pipeline - predictor pipeline
      x, y - features and labels
      fwd,flt - boolean parameters for SFFS algorithm, see mlxtend docs
    Output:
      tuple of accuracy score and list of the k selected features

    The mlxtend SFS function implements four related feature selection algorithms;
    if the default parameters (fwd=True, flt=True) are not changed, this is
    Sequential Floating Forward Selection (SFFS)

    SFFS has been identified as a good way of performing dimension reduction
    through feature selection on triaxial accelerometer data:

      Gupta and Tim Dallas (2014) "Feature Selection and Activity Recognition
      System Using a Single Triaxial Accelerometer"
      IEEE Trans. Biomed. Eng., 61(6)

    '''
    # hyperparameters
    sffs_scoring = 'accuracy'
    sffs_cv_folds = 10

    # Feature selection
    sffs = SFS(pipeline,
               k_features=k_features,
               forward=fwd,
               floating=flt,
               scoring=sffs_scoring,
               cv=sffs_cv_folds,
               n_jobs=-1)

    sffs = sffs.fit(x.to_numpy(), y.to_numpy())  # .as_matrix() was removed in pandas 1.0

    # list of the k best features
    feat_names = list(x.columns.values)
    feat_list = [feat_names[i] for i in sffs.k_feature_idx_]

    # return the prediction score and feature name list
    return sffs.k_score_, feat_list
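
As the docstring above notes, the forward/floating flags select among four related algorithms. A quick sketch of the mapping (names per the mlxtend docs; 'pipeline', 'x', and 'y' stand for the same objects passed to sff_selection above):

# forward=True,  floating=False -> Sequential Forward Selection (SFS)
# forward=False, floating=False -> Sequential Backward Selection (SBS)
# forward=True,  floating=True  -> Sequential Floating Forward Selection (SFFS)
# forward=False, floating=True  -> Sequential Floating Backward Selection (SFBS)
for fwd, flt in [(True, False), (False, False), (True, True), (False, True)]:
    score, feats = sff_selection(5, pipeline, x, y, fwd=fwd, flt=flt)
    print('forward=%s floating=%s -> %.3f %s' % (fwd, flt, score, feats))
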
Example #29
    def run_decision_tree(self):
        clf = DecisionTreeRegressor(random_state=7, max_depth=self.max_depth)

        sfs = SFS(clf,
                  k_features=self.k_features,
                  forward=True,
                  floating=True,
                  scoring=self.scoring,
                  n_jobs=-1,
                  cv=4)
        test_features = self.train.columns
        test_features = list(test_features.drop(['date', 'ticker', 'return']))

        sfs = sfs.fit(self.train[test_features], self.train['return'])

        self.score = sfs.k_score_

        self.features = list(sfs.k_feature_names_)
Example #30
    def get_core_features(self, X, y) -> List[str]:
        if self.method == "SFS":
            mySFS = SFS(
                LogisticRegression(),
                k_features=10,
                forward=True,
                cv=0,
                scoring="roc_auc",
            )
            myVars = mySFS.fit(X.values, y.values)
            return [X.columns[i] for i in myVars.k_feature_idx_]

        if self.method == "RFE":
            rfe = RFE(self.model, n_features_to_select=self.n_features)
            fit = rfe.fit(X, y)
            return [i[1] for i in zip(fit.support_, X.columns) if i[0]]

        raise ValueError("Unknown method for core feature selection")
Example #31
def select_r2(df_in, ss_label, f_n, eps):
    dfx = df_in.copy()
    if len(dfx.columns) > f_n:
        select = SFS(RandomForestClassifier(n_estimators=eps, random_state=1),
                     k_features=f_n,
                     forward=True,
                     floating=False,
                     scoring='accuracy',
                     cv=4,
                     n_jobs=3)
        select.fit(dfx.values, ss_label.values)
        mask = select.k_feature_idx_
        x_sfs = select.transform(dfx.values)
        m_mir_list = dfx.columns[list(mask)]
        return x_sfs, ','.join(m_mir_list), len(m_mir_list)
    else:
        f_list = dfx.columns.tolist()
        return dfx.values, ','.join(f_list), len(f_list)
Example #32
def make_features_selection(X_train, y_train, is_forward):
    curr_C = float(sys.argv[1])

    rkf = RepeatedKFold(n_splits=Q, n_repeats=T)  # Q, T (and SCORING below) are presumably module-level constants
    features_number = 90 if is_forward else len(X_train.columns) - 12

    curr_svm_classifier = LinearSVC(penalty='l2', dual=False, C=curr_C)
    sfs = SFS(estimator=curr_svm_classifier,
              k_features=features_number,
              forward=is_forward,
              floating=True,
              n_jobs=-1,
              verbose=2,
              scoring=SCORING,
              cv=rkf)
    sfs = sfs.fit(X_train.values, y_train)
    make_plot(sfs, curr_C, is_forward)
    make_debug_info(sfs, curr_C, is_forward)