Code Example #1
def test_knn_cv3_groups():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               cv=GroupKFold(n_splits=3),
               verbose=0)
    np.random.seed(1630672634)
    groups = np.random.randint(0, 6, size=len(y))
    sfs1 = sfs1.fit(X, y, groups=groups)
    # print(sfs1.subsets_)
    expect = {
        1: {'cv_scores': np.array([0.97916667, 0.93877551, 0.96226415]),
            'feature_idx': (3,),
            'avg_score': 0.9600687759380482},
        2: {'cv_scores': np.array([0.95833333, 0.93877551, 0.98113208]),
            'feature_idx': (1, 3),
            'avg_score': 0.9594136396697044},
        3: {'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
            'feature_idx': (1, 2, 3),
            'avg_score': 0.9605821888503829}}
    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=3)
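Worth noting in the example above: mlxtend's fit forwards the groups= argument to the GroupKFold splitter, so samples sharing a group never straddle a train/test boundary during the subset search. A minimal self-contained sketch of the same pattern (assuming only scikit-learn and mlxtend are installed):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

X, y = load_iris(return_X_y=True)
groups = np.random.randint(0, 6, size=len(y))  # arbitrary demo groups

sfs = SFS(KNeighborsClassifier(n_neighbors=4),
          k_features=3, forward=True, floating=False,
          cv=GroupKFold(n_splits=3))
sfs = sfs.fit(X, y, groups=groups)  # groups are passed to GroupKFold.split
print(sfs.k_feature_idx_, round(sfs.k_score_, 3))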
Code Example #2
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs1 = sfs1.fit(X, y)
    sfs1.subsets_
    expect = {1: {'avg_score': 0.95299145299145294,
                  'cv_scores': np.array([0.97435897,
                                         0.94871795,
                                         0.88888889,
                                         1.0]),
                  'feature_idx': (3,)},
              2: {'avg_score': 0.95993589743589736,
                  'cv_scores': np.array([0.97435897,
                                         0.94871795,
                                         0.91666667,
                                         1.0]),
                  'feature_idx': (2, 3)},
              3: {'avg_score': 0.97275641025641035,
                  'cv_scores': np.array([0.97435897,
                                         1.0,
                                         0.94444444,
                                         0.97222222]),
                  'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d1=expect, d2=sfs1.subsets_)
Code Example #3
def test_knn_scoring_metric():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs5 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs5 = sfs5.fit(X, y)
    assert round(sfs5.k_score_, 4) == 0.9728

    sfs6 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='precision',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs6 = sfs6.fit(X, y)
    assert round(sfs6.k_score_, 4) == 0.9737
Code Example #4
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               cv=4,
               verbose=0)
    sfs1 = sfs1.fit(X, y)
    sfs1.subsets_
    expect = {1: {'avg_score': 0.95299145299145294,
                  'cv_scores': np.array([0.97435897,
                                         0.94871795,
                                         0.88888889,
                                         1.0]),
                  'feature_idx': (3,)},
              2: {'avg_score': 0.95993589743589736,
                  'cv_scores': np.array([0.97435897,
                                         0.94871795,
                                         0.91666667,
                                         1.0]),
                  'feature_idx': (2, 3)},
              3: {'avg_score': 0.97275641025641035,
                  'cv_scores': np.array([0.97435897,
                                         1.0,
                                         0.94444444,
                                         0.97222222]),
                  'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect)
Code Example #5
File: utils.py Project: armgilles/frateli
def get_best_logisitc(y):

  from mlxtend.feature_selection import SequentialFeatureSelector as SFS
  from sklearn.model_selection import StratifiedKFold
  import pandas as pd
  from sklearn.linear_model import LogisticRegression
  from sklearn.model_selection import cross_val_score

  my_data = pd.read_csv('data/my_data_test.csv', encoding='utf-8')

  y = my_data.target
  my_data = my_data.drop('target', axis=1)

  # To have better CV. Updated from the removed sklearn.cross_validation API:
  # StratifiedKFold now takes n_splits and receives y later via split()
  skf = StratifiedKFold(n_splits=5, shuffle=False)

  C_params = [0.01 , 1, 10, 50, 70, 100]
  solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag']

  my_result_list = []
  for C_param in C_params:
      for solver in solvers:
          print "Looking for C : %s and solver : %s" % (C_param, solver)
          model = LogisticRegression(class_weight='balanced', random_state=17, 
                                     solver=solver, C=C_param)
          sfs = SFS(model, 
                    k_features=len(my_data.columns), 
                    forward=True, 
                    floating=False, 
                    scoring='roc_auc',
                    verbose=0,  # print_progress was removed from mlxtend
                    cv=skf,
                    n_jobs=-1)
          
          sfs = sfs.fit(my_data.values, y.values)

          result_sfs = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
          result_sfs.sort_values('avg_score', ascending=0, inplace=True)
          # the best row's feature_idx is a tuple of column positions
          features_sfs = list(result_sfs.feature_idx.iloc[0])
          select_features_sfs = list(my_data.columns[features_sfs])

          scores = cross_val_score(model, my_data[select_features_sfs], y, cv=skf, scoring='roc_auc')
          my_result_list.append({'C' : C_param,
                               'solver' : solver,
                               'auc' : scores.mean(),
                               'std' : scores.std(),
                               'best_columns' : select_features_sfs,
                               'estimator' : model})

  my_result = pd.DataFrame(my_result_list)
  my_result.sort_values('auc', ascending=0, inplace=True)

  best_features = my_result.best_columns.head(1).values[0]
  best_model = my_result.estimator.head(1).values[0]

  return best_features, best_model
Code Example #6
def test_run_default():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier()
    sfs = SFS(estimator=knn,
              verbose=0)
    sfs.fit(X, y)
    assert sfs.k_feature_idx_ == (3,)
Code Example #7
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    sfs = SFS(estimator=forest,
              verbose=0)
    sfs.fit(X, y, sample_weight=sample_weight)
    assert sfs.k_feature_idx_ == (3,)
Code Example #8
def test_max_feature_subset_best():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()

    sfs = SFS(lr,
              k_features='best',
              forward=True,
              floating=False,
              cv=10)

    sfs = sfs.fit(X, y)
    assert sfs.k_feature_idx_ == (1, 3, 5, 7, 8, 9, 10, 11, 12)
Code Example #9
def test_regression_sbfs():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=3,
                forward=False,
                floating=True,
                scoring='neg_mean_squared_error',
                cv=10,
                verbose=0)
    sfs_r = sfs_r.fit(X, y)
    assert sfs_r.k_feature_idx_ == (7, 10, 12), sfs_r.k_feature_idx_
Code Example #10
def test_knn_option_sfbs():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs4 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               cv=4,
               verbose=0)
    sfs4 = sfs4.fit(X, y)
    assert sfs4.k_feature_idx_ == (1, 2, 3)
Code Example #11
def test_max_feature_subset_parsimonious():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()

    sfs = SFS(lr,
              k_features='parsimonious',
              forward=True,
              floating=False,
              cv=10)

    sfs = sfs.fit(X, y)
    assert sfs.k_feature_idx_ == (5, 10, 11, 12)
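For reference, k_features='best' (Example #8) returns the subset with the highest cross-validated score over all sizes visited, while 'parsimonious' returns the smallest subset whose score is within one standard error of that best score. A minimal sketch contrasting the two; it uses load_diabetes as a stand-in, since load_boston was removed in scikit-learn 1.2:

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

X, y = load_diabetes(return_X_y=True)
lr = LinearRegression()
for k in ('best', 'parsimonious'):
    sfs = SFS(lr, k_features=k, forward=True, floating=False, cv=10)
    sfs = sfs.fit(X, y)
    # 'parsimonious' typically reports fewer features than 'best'
    print(k, sfs.k_feature_idx_, round(sfs.k_score_, 3))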
Code Example #12
def test_regression_in_range():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=(1, 13),
                forward=True,
                floating=False,
                scoring='neg_mean_squared_error',
                cv=10,
                verbose=0)
    sfs_r = sfs_r.fit(X, y)
    assert len(sfs_r.k_feature_idx_) == 9
    assert round(sfs_r.k_score_, 4) == -31.1537
Code Example #13
def test_knn_option_sbs_tuplerange_1():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=3)
    sfs4 = SFS(knn,
               k_features=(1, 3),
               forward=False,
               floating=False,
               cv=4,
               verbose=0)
    sfs4 = sfs4.fit(X, y)
    assert round(sfs4.k_score_, 3) == 0.967, sfs4.k_score_
    assert sfs4.k_feature_idx_ == (0, 2, 3), sfs4.k_feature_idx_
Code Example #14
def test_regression():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=13,
                forward=True,
                floating=False,
                scoring='mean_squared_error',
                cv=10,
                skip_if_stuck=True,
                print_progress=False)
    sfs_r = sfs_r.fit(X, y)
    assert round(sfs_r.k_score_, 4) == -34.7631
Code Example #15
def test_knn_rbf_groupkfold():
    nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
    rng = np.random.RandomState(123)
    iris = load_iris()
    X = iris.data
    # knn = KNeighborsClassifier(n_neighbors=4)
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    bool_01 = [True if item == 0 else False for item in iris['target']]
    bool_02 = [True if (item == 1 or item == 2) else False for item in
               iris['target']]
    groups = []
    y_new = []
    for ind, _ in enumerate(bool_01):
        if bool_01[ind]:
            groups.append('attribute_A')
            y_new.append(0)
        if bool_02[ind]:
            throw = rng.rand()
            if throw < 0.5:
                groups.append('attribute_B')
            else:
                groups.append('attribute_C')
            throw2 = rng.rand()
            if throw2 < 0.5:
                y_new.append(0)
            else:
                y_new.append(1)
    y_new_bool = [True if item == 1 else False for item in y_new]
    cv_obj = GroupKFold(n_splits=3)
    cv_obj_list = list(cv_obj.split(X, np.array(y_new_bool), groups))
    sfs1 = SFS(forest,
               k_features=3,
               forward=True,
               floating=False,
               cv=cv_obj_list,
               scoring=nan_roc_auc_scorer,
               verbose=0
               )
    sfs1 = sfs1.fit(X, y_new)
    expect = {
        1: {'cv_scores': np.array([0.52, np.nan, 0.72]), 'avg_score': 0.62,
            'feature_idx': (1,)},
        2: {'cv_scores': np.array([0.42, np.nan, 0.65]), 'avg_score': 0.53,
            'feature_idx': (1, 2)},
        3: {'cv_scores': np.array([0.47, np.nan, 0.63]),
            'avg_score': 0.55,
            'feature_idx': (1, 2, 3)}}

    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=1)
Code Example #16
def test_predefinedholdoutsplit_in_sfs():
    h_iter = PredefinedHoldoutSplit(valid_indices=[0, 1, 99])
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=h_iter)

    sfs1 = sfs1.fit(X, y)
    d = sfs1.get_metric_dict()
    assert d[1]['cv_scores'].shape[0] == 1
Code Example #17
def test_randomholdoutsplit_in_sfs():
    h_iter = RandomHoldoutSplit(valid_size=0.3, random_seed=123)
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=h_iter)

    sfs1 = sfs1.fit(X, y)
    d = sfs1.get_metric_dict()
    assert d[1]['cv_scores'].shape[0] == 1
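Both holdout splitters above come from mlxtend.evaluate and behave like scikit-learn CV objects that yield exactly one train/validation split, which is why cv_scores has shape (1,). A small sketch, assuming mlxtend is installed:

import numpy as np
from mlxtend.evaluate import RandomHoldoutSplit, PredefinedHoldoutSplit

X = np.arange(40).reshape(20, 2)
y = np.array([0, 1] * 10)

r_iter = RandomHoldoutSplit(valid_size=0.3, random_seed=123)
for train_idx, valid_idx in r_iter.split(X, y):
    print(len(train_idx), len(valid_idx))  # one 70/30 split

p_iter = PredefinedHoldoutSplit(valid_indices=[0, 1, 19])
for train_idx, valid_idx in p_iter.split(X, y):
    print(valid_idx)  # exactly the indices listed above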
Code Example #18
def test_regression():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=13,
                forward=True,
                floating=False,
                scoring='mean_squared_error',
                cv=10,
                skip_if_stuck=True,
                print_progress=False)
    sfs_r = sfs_r.fit(X, y)
    assert len(sfs_r.k_feature_idx_) == 13
    assert round(sfs_r.k_score_, 4) == -34.7631
Code Example #19
def test_knn_option_sffs():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs2 = SFS(knn,
               k_features=3,
               forward=True,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs2 = sfs2.fit(X, y)
    assert sfs2.k_feature_idx_ == (1, 2, 3)
Code Example #20
def test_max_feature_subset_size_in_tuple_range():
    boston = load_boston()
    X, y = boston.data, boston.target

    lr = LinearRegression()

    sfs = SFS(lr,
              k_features=(1, 5),
              forward=False,
              floating=True,
              scoring='neg_mean_squared_error',
              cv=10)

    sfs = sfs.fit(X, y)
    assert len(sfs.k_feature_idx_) == 5
Code Example #22
def feature_selection(data, label, num_channel):
    print("test")
    channel_rm_list = []
    channel_all_list = set(list(range(32)))
    sfs = SFS(LinearRegression(),
        k_features=num_channel,
        forward=True,
        floating=False,
        scoring='r2',
        cv=0)
    sfs.fit(data, label)
    x = sfs.k_feature_names_     # to get the final set of features
    channel_list = set([int(a) for a in list(x)])
    channel_rm_list = list(channel_all_list.difference(channel_list))
    return channel_rm_list
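The int(a) cast above works because, when fit receives a plain NumPy array, k_feature_names_ holds string column indices ('0', '1', ...) rather than real names; with a DataFrame it holds the column labels instead. A quick sketch (assuming mlxtend; the toy data is made up):

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

rng = np.random.RandomState(0)
X = rng.rand(50, 4)
y = X[:, 0] + 2 * X[:, 2] + 0.1 * rng.rand(50)

sfs = SFS(LinearRegression(), k_features=2, forward=True,
          floating=False, scoring='r2', cv=0)
print(sfs.fit(X, y).k_feature_names_)   # string indices, e.g. ('0', '2')
df = pd.DataFrame(X, columns=['a', 'b', 'c', 'd'])
print(sfs.fit(df, y).k_feature_names_)  # column labels, e.g. ('a', 'c')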
Code Example #23
def test_regression_in_range():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=(1, 13),
                forward=True,
                floating=False,
                scoring='neg_mean_squared_error',
                cv=10,
                skip_if_stuck=True,
                verbose=0)
    sfs_r = sfs_r.fit(X, y)
    assert len(sfs_r.k_feature_idx_) == 9
    assert round(sfs_r.k_score_, 4) == -31.1537
Code Example #24
def test_knn_option_sfs():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs1 = sfs1.fit(X, y)
    assert sfs1.k_feature_idx_ == (1, 2, 3)
Code Example #25
def test_knn_option_sfbs():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs4 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs4 = sfs4.fit(X, y)
    assert sfs4.k_feature_idx_ == (1, 2, 3)
Code Example #26
def test_max_feature_subset_size_in_tuple_range():
    boston = load_boston()
    X, y = boston.data, boston.target

    lr = LinearRegression()

    sfs = SFS(lr,
              k_features=(1, 5),
              forward=False,
              floating=True,
              scoring='neg_mean_squared_error',
              cv=10)

    sfs = sfs.fit(X, y)
    assert len(sfs.k_feature_idx_) == 5
Code Example #27
def test_regression():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=13,
                forward=True,
                floating=False,
                scoring=MEAN_SQUARED_ERROR,
                cv=10,
                skip_if_stuck=True,
                verbose=0)
    sfs_r = sfs_r.fit(X, y)
    assert len(sfs_r.k_feature_idx_) == 13
    assert round(sfs_r.k_score_, 4) == -34.7631
Code Example #28
def main():

    x_train, y_train, x_test, y_test = get_data()

    for n in [2, 3, 5, 10, 16]:
        sfs = SFS(KNeighborsClassifier(n_neighbors=7),
                  k_features=n,
                  forward=False,
                  floating=True,
                  scoring='accuracy',
                  cv=0)
        sfs = sfs.fit(x_train, y_train)

        print('\nSequential Floating Backward Selection: ', n)
        feat_cols = list(sfs.k_feature_idx_)
        print(feat_cols)

        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(x_train[:, feat_cols], y_train)

        y_train_pred = knn.predict(x_train[:, feat_cols])
        print('Training accuracy on selected features: %.3f' %
              acc(y_train, y_train_pred))

        y_test_pred = knn.predict(x_test[:, feat_cols])
        print('Testing accuracy on selected features: %.3f' %
              acc(y_test, y_test_pred))

        print(confusion_matrix(y_test, y_test_pred))
        print(classification_report(y_test, y_test_pred))

        if n == 2:
            fig, axs = plt.subplots(2)
            fig.suptitle("SFS(KNN) Scatter Plot", fontsize='small')
            axs[0].scatter(x_train[:, feat_cols[0]],
                           x_train[:, feat_cols[1]],
                           marker='o',
                           c=y_train,
                           s=25,
                           edgecolor='k')
            axs[1].scatter(x_test[:, feat_cols[0]],
                           x_test[:, feat_cols[1]],
                           marker='o',
                           c=y_test,
                           s=25,
                           edgecolor='k')

            plt.show()
Code Example #29
def run_experiment(X, y, clf, protected_groups, unfairness_metric, unfairness_weight):
    metric = unfairness_metrics.UnfairnessMetric(protected_groups, unfairness_metric)
    unfairness_scorer = metrics.make_scorer(metric)
    unfairness_means = []
    auc_means = []
    selected_feature_props = np.zeros([ITERATIONS, X.shape[1]])
    for i in tqdm(range(ITERATIONS), desc=' Training ' + clf.__class__.__name__):
        xval = model_selection.KFold(4, shuffle=True, random_state=i)
        # Make a metric combining accuracy and subtracting unfairness w.r.t. the protected groups
        metric = unfairness_metrics.CombinedMetric(ACCURACY_METRIC, protected_groups,
                                                   unfairness_metric, unfairness_weight)
        combined_scorer = metrics.make_scorer(metric)
        sfs = SequentialFeatureSelector(clf, 'best', verbose=0, cv=xval, scoring=combined_scorer,
                                        n_jobs=2)
        pipe = pipeline.Pipeline([
            ('standardize', preprocessing.StandardScaler()),
            ('feature_selection', sfs),
            ('model', clf),
        ])
        result = model_selection.cross_validate(pipe, X, y, verbose=0, cv=xval, scoring={
            'unfairness': unfairness_scorer,
            'auc': metrics.make_scorer(ACCURACY_METRIC),
        }, return_estimator=True)
        unfairness_means.append(result['test_unfairness'].mean())
        auc_means.append(result['test_auc'].mean())
        for estimator in result['estimator']:
            for feature_i in estimator.named_steps['feature_selection'].k_feature_idx_:
                selected_feature_props[i][feature_i] += 1 / len(result['estimator'])
    return unfairness_means, auc_means, selected_feature_props
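One pattern worth isolating from run_experiment: SFS can sit inside a scikit-learn Pipeline because it implements the usual fit/transform selector interface. A stripped-down sketch with placeholder data, assuming only scikit-learn and mlxtend:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000)
pipe = Pipeline([
    ('standardize', StandardScaler()),
    ('feature_selection', SFS(clf, k_features=2, forward=True,
                              floating=False, cv=3)),
    ('model', clf),
])
pipe.fit(X, y)  # SFS selects 2 features, then the model trains on them
print(pipe.named_steps['feature_selection'].k_feature_idx_)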
Code Example #30
def test_pandas():

    X_df = pd.DataFrame(X_iris, columns=['sepal length', 'sepal width',
                                         'petal width', 'petal width'])
    knn = KNeighborsClassifier()
    sfs = SFS(estimator=knn,
              k_features=3,
              forward=True,
              floating=False,
              fixed_features=('sepal length', 'sepal width'),
              verbose=0)
    sfs.fit(X_df, y_iris)
    print(sfs.subsets_)
    for k in sfs.subsets_:
        assert 0 in sfs.subsets_[k]['feature_idx']
        assert 1 in sfs.subsets_[k]['feature_idx']
Code Example #31
def test_knn_option_sfbs_tuplerange_2():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=3)
    sfs4 = SFS(knn,
               k_features=(1, 4),
               forward=False,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs4 = sfs4.fit(X, y)
    assert round(sfs4.k_score_, 3) == 0.966, sfs4.k_score_
    assert sfs4.k_feature_idx_ == (1, 2, 3), sfs4.k_feature_idx_
Code Example #32
def test_clone_params_pass():
    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    sfs1 = SFS(lr,
               k_features=2,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=0,
               clone_estimator=True,
               verbose=0,
               n_jobs=1)
    sfs1 = sfs1.fit(X, y)
    assert (sfs1.k_feature_idx_ == (1, 3))
Code Example #33
def test_string_scoring_clf():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn, k_features=3, cv=0)
    sfs1 = sfs1.fit(X, y)

    sfs2 = SFS(knn, k_features=3, scoring='accuracy', cv=0)
    sfs2 = sfs2.fit(X, y)

    sfs3 = SFS(knn, k_features=3, scoring=make_scorer(accuracy_score), cv=0)
    sfs3 = sfs3.fit(X, y)

    assert sfs1.k_score_ == sfs2.k_score_
    assert sfs1.k_score_ == sfs3.k_score_
Code Example #34
def sequential_feature_selection(data_set, y_values, want_graph):
    lr = LinearRegression()
    sfs = SFS(lr,
              k_features=13,
              forward=True,
              floating=False,
              scoring='neg_mean_squared_error',
              cv=10)
    sfs = sfs.fit(data_set, y_values)
    if want_graph:
        fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection (w. StdErr)')
        plt.grid()
        plt.show()

    return sfs
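The snippet assumes plot_sfs is already in scope; in mlxtend the plotting helper lives under mlxtend.plotting, so the imports would look roughly like:

import matplotlib.pyplot as plt
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs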
Code Example #35
def test_knn_option_sbs_tuplerange_1():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=3)
    sfs4 = SFS(knn,
               k_features=(1, 3),
               forward=False,
               floating=False,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs4 = sfs4.fit(X, y)
    assert round(sfs4.k_score_, 3) == 0.967, sfs4.k_score_
    assert sfs4.k_feature_idx_ == (0, 2, 3), sfs4.k_feature_idx_
Code Example #37
def SFS_test(input, how_many_attrs, cv_scores):
    y = np.array(input[:, -1])
    x = np.array(input[:, :-1])
    sfs = SequentialFeatureSelector(KNeighborsClassifier(n_neighbors=5,
                                                         metric="euclidean"),
                                    k_features=how_many_attrs,
                                    forward=True,
                                    floating=False,
                                    verbose=0,
                                    scoring='accuracy',
                                    n_jobs=-1,
                                    cv=4)
    sfs = sfs.fit(x, y)
    # print(sfs.k_feature_idx_)
    target = np.array(input[:, -1]).reshape(475, 1)
    return np.hstack((input[:, sfs.k_feature_idx_], target))
Code Example #38
def filter_with_sfs(train_X, valid_X, test_X, train_Y, i):
    features = {item for item in train_X.head(0)}
    fs = SequentialFeatureSelector(RandomForestClassifier(n_estimators=30,
                                                          random_state=0),
                                   k_features=i,
                                   forward=True,
                                   verbose=0,
                                   scoring='accuracy',
                                   cv=4)
    fs.fit(train_X, train_Y)

    selected_features = set(fs.k_feature_names_)
    features_to_drop = list(features - selected_features)

    return train_X.drop(features_to_drop, axis=1), valid_X.drop(features_to_drop, axis=1), \
           test_X.drop(features_to_drop, axis=1)
Code Example #39
    def select_features(self,
                        model,
                        X_train,
                        y_train,
                        k_features=(1, 30),
                        scorer='r2',
                        cv=0):
        sfs = SFS(model,
                  k_features=k_features,
                  forward=True,
                  floating=False,
                  scoring=scorer,
                  cv=cv,
                  verbose=2)
        sfs.fit(np.array(X_train), np.array(y_train))
        print(sfs.k_feature_idx_)
Code Example #40
def test_clone_params_pass():
    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    sfs1 = SFS(lr,
               k_features=3,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=0,
               skip_if_stuck=True,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)
    sfs1 = sfs1.fit(X, y)
    assert sfs1.k_feature_idx_ == (0, 1, 2)
Code Example #41
    def sub_window_creation(self, images, kernels):
        gb_all_sw = []
        label = []
        for i in range(0, 100, 11):
            for j in range(0, 50, 11):
                for k in range(len(images)):
                    image = images[k]
                    sw_image = image[i:i + 50, j:j + 50]
                    sw_image = cv2.resize(sw_image,
                                          dsize=(12, 12),
                                          interpolation=cv2.INTER_NEAREST)
                    # print('sw size', sw_image.shape)
                    gabored_image = Preprocessing.process(
                        self, sw_image, kernels)
                    # print('gab size', gabored_image.shape)
                    # model = SpectralEmbedding(n_components=100, n_neighbors=10)
                    # reduced_sw = model.fit_transform(gabored_image.reshape(-1, 1))
                    # print('gab size', gabored_image.reshape(1, -1).shape)
                    # gb_all_sw.append(gabored_image)
                    gb_all_sw.append(gabored_image)
                    label.append(int(k / 4))
                    # print('red size', reduced_sw.reshape(-1, 1).shape)
                    # plt.imshow(image[i:i+50, j:j+50], cmap='gray')
                    # plt.show()
                    # plt.imshow(gabored_image, cmap='gray')
                    # plt.show()
        print(len(gb_all_sw))
        print(len(gb_all_sw[0]))
        # LEM demension reduction
        model = SpectralEmbedding(n_components=100, n_neighbors=10)
        # reduced_sw = model.fit_transform(gb_all_sw)
        reduced_sw = model.fit_transform(gb_all_sw)

        knn = KNeighborsClassifier(n_neighbors=5)
        sffs = SFS(knn,
                   k_features=5,
                   forward=True,
                   floating=True,
                   scoring='accuracy',
                   cv=4,
                   n_jobs=-1)
        sffs = sffs.fit(reduced_sw, label)
        print('\nSequential Forward Floating Selection (k=', i, '):')
        print(sffs.k_feature_idx_)
        print('CV Score:')
        print(sffs.k_score_)
Code Example #42
def test_check_pandas_dataframe_fit_backward():
    for floating in [True, False]:
        iris = load_iris()
        X = iris.data
        y = iris.target
        lr = SoftmaxRegression(random_seed=1)
        sfs1 = SFS(lr,
                   k_features=2,
                   forward=False,
                   floating=floating,
                   scoring='accuracy',
                   cv=0,
                   verbose=0,
                   n_jobs=1)

        df = pd.DataFrame(
            X,
            columns=['sepal len', 'sepal width', 'petal len', 'petal width'])

        sfs1 = sfs1.fit(X, y)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('1', '2')
        assert sfs1.subsets_[2]['feature_names'] == ('1', '2')

        sfs1 = sfs1.fit(df, y)
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
                                                     'sepal width',
                                                     'petal len')
        assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
                                                     'petal len')
        assert sfs1.subsets_[3]['feature_idx'] == (0, 1, 2)
        assert sfs1.subsets_[2]['feature_idx'] == (1, 2)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('sepal width', 'petal len')

        sfs1._TESTING_INTERRUPT_MODE = True
        out = sfs1.fit(df, y)
        assert len(out.subsets_.keys()) > 0
        assert sfs1.interrupted_
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
                                                     'sepal width',
                                                     'petal len')
        assert sfs1.k_feature_idx_ == (0, 1, 2)
        assert sfs1.k_feature_names_ == ('sepal len', 'sepal width',
                                         'petal len')
Code Example #43
File: notebook_funcs.py Project: sjmercer65/tremor
def sff_selection(k_features, pipeline, x, y, fwd=True, flt=True):
    '''
    Selects a subset of available features

    Input:
      k_features - number of features to select
      pipeline - predictor pipeline
      x, y - features and labels
      fwd,flt - boolean parameters for SFFS algorithm, see mlxtend docs
    Output:
      tuple of accuracy score and list of k selected features

    The mlxtend SFS function implements four related feature selection algorithms;
    if the default parameters (fwd=True, flt=True) are not changed, this is
    Sequential Floating Forward Selection (SFFS)

    SFFS has been identified as a good way of performing dimension reduction
    through feature selection on triaxial accelerometer data:

      Gupta and Tim Dallas (2014) "Feature Selection and Activity Recognition
      System Using a Single Triaxial Accelerometer"
      IEEE Trans. Biomed. Eng., 61(6)

    '''
    # hyperparameters
    sffs_scoring = 'accuracy'
    sffs_cv_folds = 10

    # Feature selection
    sffs = SFS(pipeline,
               k_features=k_features,
               forward=fwd,
               floating=flt,
               scoring=sffs_scoring,
               cv=sffs_cv_folds,
               n_jobs=-1)

    sffs = sffs.fit(x.to_numpy(), y.to_numpy())  # as_matrix() was removed in pandas 1.0

    # list of the k best features
    feat_names = list(x.columns.values)
    feat_list = [feat_names[i] for i in sffs.k_feature_idx_]

    # return the prediction score and feature name list
    return sffs.k_score_, feat_list
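A hypothetical call site for the helper above (the pipeline and data here are placeholders, not from the source project):

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

iris = load_iris()
x = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
score, feat_list = sff_selection(2, pipe, x, y)  # SFFS with default fwd/flt
print(round(score, 3), feat_list)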
Code Example #44
    def run_decision_tree(self):
        clf = DecisionTreeRegressor(random_state=7, max_depth=self.max_depth)

        sfs = SFS(clf,
                  k_features=self.k_features,
                  forward=True,
                  floating=True,
                  scoring=self.scoring,
                  n_jobs=-1,
                  cv=4)
        test_features = self.train.columns
        test_features = list(test_features.drop(['date', 'ticker', 'return']))

        sfs = sfs.fit(self.train[test_features], self.train['return'])

        self.score = sfs.k_score_

        self.features = list(sfs.k_feature_names_)
Code Example #45
def FeatureSelection(xSet, ySet, nFeatures=10):

    sffs = SFS(svm.SVC(kernel='rbf', C=1),
               k_features=nFeatures,
               forward=True,
               floating=True,
               verbose=2,
               scoring='accuracy',
               cv=10,
               n_jobs=-1)  #-1

    sffs = sffs.fit(xSet, ySet)

    print('\nSequential Forward Floating Selection:')
    print(sffs.k_feature_idx_)
    print('CV Score:')
    print(sffs.k_score_)
    return sffs
Code Example #46
    def get_core_features(self, X, y) -> List[str]:
        if self.method == "SFS":
            mySFS = SFS(
                LogisticRegression(),
                k_features=10,
                forward=True,
                cv=0,
                scoring="roc_auc",
            )
            myVars = mySFS.fit(X.values, y.values)
            return [X.columns[i] for i in myVars.k_feature_idx_]

        if self.method == "RFE":
            rfe = RFE(self.model, self.n_features)
            fit = rfe.fit(X, y)
            return [i[1] for i in zip(fit.support_, X.columns) if i[0]]

        raise ValueError("Unknown method for core feature selection")
Code Example #47
File: svm.py Project: codepictor/static-pe-analysis
def make_features_selection(X_train, y_train, is_forward):
    curr_C = float(sys.argv[1])

    rkf = RepeatedKFold(n_splits=Q, n_repeats=T)
    features_number = 90 if is_forward else len(X_train.columns) - 12

    curr_svm_classifier = LinearSVC(penalty='l2', dual=False, C=curr_C)
    sfs = SFS(estimator=curr_svm_classifier,
              k_features=features_number,
              forward=is_forward,
              floating=True,
              n_jobs=-1,
              verbose=2,
              scoring=SCORING,
              cv=rkf)
    sfs = sfs.fit(X_train.values, y_train)
    make_plot(sfs, curr_C, is_forward)
    make_debug_info(sfs, curr_C, is_forward)
Code Example #48
File: rice_ml_PL3.py Project: JRLi/untitled
def select_r2(df_in, ss_label, f_n, eps):
    dfx = df_in.copy()
    if len(dfx.columns) > f_n:
        select = SFS(RandomForestClassifier(n_estimators=eps, random_state=1),
                     k_features=f_n,
                     forward=True,
                     floating=False,
                     scoring='accuracy',
                     cv=4,
                     n_jobs=3)
        select.fit(dfx.values, ss_label.values)
        mask = select.k_feature_idx_
        x_sfs = select.transform(dfx.values)
        m_mir_list = dfx.columns[[x for x in mask]]
        return x_sfs, ','.join(m_mir_list), len(m_mir_list)
    else:
        f_list = dfx.columns.tolist()
        return dfx.values, ','.join(f_list), len(f_list)
Code Example #49
def perform_sfs(curr_classifier, X_train, X_test, y_train, y_test):
    sfs1 = SFS(curr_classifier,
               k_features=100,
               verbose=0,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=5,
               n_jobs=8)

    sfs1 = sfs1.fit(X_train, y_train)
    df = pd.DataFrame.from_dict(sfs1.get_metric_dict(), orient='index')
    df[[
        'accuracy', 'precision', 'recall', 'f1_score', 'confusion_matrix'
    ]] = df['feature_idx'].apply(lambda x: get_test_score(
        X_train, X_test, y_train, y_test, x, curr_classifier)).apply(pd.Series)

    return df
Code Example #50
def select_features_wrapper(X, y, forward=True, k_features=20):
    # svc = SVC(gamma='auto')
    # linearSVC = LinearSVC(random_state=0, tol=1e-5, class_weight='balanced')
    random_forest_clssifier = RandomForestClassifier(max_depth=7,
                                                     random_state=0)

    sgd = SGDClassifier(max_iter=1000, tol=1e-3)
    #     knn = KNeighborsClassifier(n_neighbors=3)
    sfs = SequentialFeatureSelector(sgd,
                                    k_features=k_features,
                                    forward=forward,
                                    floating=False,
                                    verbose=5,
                                    cv=0,
                                    n_jobs=-1)
    sfs.fit(X, y.values.ravel())
    print(sfs.k_feature_names_)
    return sfs
Code Example #51
def test_check_pandas_dataframe_fit_backward():
    for floating in [True, False]:
        iris = load_iris()
        X = iris.data
        y = iris.target
        lr = SoftmaxRegression(random_seed=1)
        sfs1 = SFS(lr,
                   k_features=2,
                   forward=False,
                   floating=floating,
                   scoring='accuracy',
                   cv=0,
                   verbose=0,
                   n_jobs=1)

        df = pd.DataFrame(X, columns=['sepal len', 'sepal width',
                                      'petal len', 'petal width'])

        sfs1 = sfs1.fit(X, y)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('1', '2')
        assert sfs1.subsets_[2]['feature_names'] == ('1', '2')

        sfs1 = sfs1.fit(df, y)
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
                                                     'sepal width',
                                                     'petal len')
        assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
                                                     'petal len')
        assert sfs1.subsets_[3]['feature_idx'] == (0, 1, 2)
        assert sfs1.subsets_[2]['feature_idx'] == (1, 2)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('sepal width', 'petal len')

        sfs1._TESTING_INTERRUPT_MODE = True
        out = sfs1.fit(df, y)
        assert len(out.subsets_.keys()) > 0
        assert sfs1.interrupted_
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
                                                     'sepal width',
                                                     'petal len')
        assert sfs1.k_feature_idx_ == (0, 1, 2)
        assert sfs1.k_feature_names_ == ('sepal len', 'sepal width',
                                         'petal len')
Code Example #52
def test_check_pandas_dataframe_transform():
    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    sfs1 = SFS(lr,
               k_features=2,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=0,
               verbose=0,
               n_jobs=1)

    df = pd.DataFrame(X, columns=['sepal length', 'sepal width',
                                  'petal length', 'petal width'])
    sfs1 = sfs1.fit(df, y)
    assert sfs1.k_feature_idx_ == (1, 3)
    assert (150, 2) == sfs1.transform(df).shape
Code Example #53
def sfs_selection(X,y,n_features,forward):
	"""
	Performs the Sequential Forward/Backward Selection method and selects the top ranking features

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	n_features -- n best ranked features
	"""

	if verbose:
		print('\nPerforming Feature Selection based on the Sequential Feature Selection method ...')

	clf=RandomForestClassifierWithCoef(n_estimators=5,n_jobs=-1)
	sfs = SFS(clf,k_features=n_features,forward=forward,scoring='accuracy',cv=0,n_jobs=-1, print_progress=True,)
	sfs = sfs.fit(X, y)

	feature_indexes=sfs.k_feature_idx_
	return X[:,feature_indexes[0:n_features]],feature_indexes[0:n_features]		#return selected features and original index features
Code Example #54
def test_regression():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=13,
                forward=True,
                floating=False,
                scoring='neg_mean_squared_error',
                cv=10,
                verbose=0)
    sfs_r = sfs_r.fit(X, y)
    assert len(sfs_r.k_feature_idx_) == 13

    if Version(sklearn_version) < '0.20':
        assert round(sfs_r.k_score_, 4) == -34.7631, \
            round(sfs_r.k_score_, 4)
    else:
        assert round(sfs_r.k_score_, 4) == -34.7053, \
            round(sfs_r.k_score_, 4)
Code Example #55
def test_custom_feature_names():

    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    sfs1 = SFS(lr,
               k_features=2,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=0,
               verbose=0,
               n_jobs=1)

    sfs1 = sfs1.fit(X, y, custom_feature_names=(
          'sepal length', 'sepal width', 'petal length', 'petal width'))
    assert sfs1.k_feature_idx_ == (1, 3)
    assert sfs1.k_feature_names_ == ('sepal width', 'petal width')
    assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
                                                 'petal width')
Code Example #56
def test_knn_scoring_metric():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs5 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs5 = sfs5.fit(X, y)
    assert round(sfs5.k_score_, 4) == 0.9728

    sfs6 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs6 = sfs6.fit(X, y)
    assert round(sfs6.k_score_, 4) == 0.9728

    sfs7 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='f1_macro',
               cv=4,
               skip_if_stuck=True)
    sfs7 = sfs7.fit(X, y)
    assert round(sfs7.k_score_, 4) == 0.9727, sfs7.k_score_
Code Example #57
def test_keyboard_interrupt():
    iris = load_iris()
    X = iris.data
    y = iris.target

    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(
        knn,
        k_features=3,
        forward=True,
        floating=False,
        cv=3,
        clone_estimator=False,
        verbose=5,
        n_jobs=1
    )

    sfs1._TESTING_INTERRUPT_MODE = True
    out = sfs1.fit(X, y)

    assert len(out.subsets_.keys()) > 0
    assert sfs1.interrupted_
Code Example #58
def test_knn_wo_cv():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               cv=0,
               verbose=0)
    sfs1 = sfs1.fit(X, y)
    expect = {1: {'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96]),
                  'feature_idx': (3,)},
              2: {'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333]),
                  'feature_idx': (2, 3)},
              3: {'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333]),
                  'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect)
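In mlxtend, cv=0 disables cross-validation: the estimator is fit and scored on the full training set, so cv_scores holds a single value. The 0.96 for feature 3 above is just the training accuracy, which a plain scikit-learn sketch reproduces:

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
knn = KNeighborsClassifier(n_neighbors=4).fit(X[:, [3]], y)
print(knn.score(X[:, [3]], y))  # 0.96, matching subsets_[1] above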
Code Example #59
def test_string_scoring_clf():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               cv=0)
    sfs1 = sfs1.fit(X, y)

    sfs2 = SFS(knn,
               k_features=3,
               scoring='accuracy',
               cv=0)
    sfs2 = sfs2.fit(X, y)

    sfs3 = SFS(knn,
               k_features=3,
               scoring=make_scorer(accuracy_score),
               cv=0)
    sfs3 = sfs3.fit(X, y)

    assert sfs1.k_score_ == sfs2.k_score_
    assert sfs1.k_score_ == sfs3.k_score_
Code Example #60
def test_knn_wo_cv():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=0,
               skip_if_stuck=True,
               print_progress=False)
    sfs1 = sfs1.fit(X, y)
    expect = {1: {'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96]),
                  'feature_idx': (3,)},
              2: {'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333]),
                  'feature_idx': (2, 3)},
              3: {'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333]),
                  'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d1=expect, d2=sfs1.subsets_)