def test_knn_scoring_metric():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs5 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs5 = sfs5.fit(X, y)
    assert round(sfs5.k_score_, 4) == 0.9728

    sfs6 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs6 = sfs6.fit(X, y)
    assert round(sfs6.k_score_, 4) == 0.9728

    sfs7 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='f1_macro',
               cv=4,
               skip_if_stuck=True)
    sfs7 = sfs7.fit(X, y)
    assert round(sfs7.k_score_, 4) == 0.9727, sfs7.k_score_
def test_knn_scoring_metric():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs5 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs5 = sfs5.fit(X, y)
    assert round(sfs5.k_score_, 4) == 0.9728

    sfs6 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               scoring='precision',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs6 = sfs6.fit(X, y)
    assert round(sfs6.k_score_, 4) == 0.9737
def test_run_default():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier()
    sfs = SFS(estimator=knn,
              verbose=0)
    sfs.fit(X, y)
    assert sfs.k_feature_idx_ == (3,)
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    sfs = SFS(estimator=forest,
              verbose=0)
    sfs.fit(X, y, sample_weight=sample_weight)
    assert sfs.k_feature_idx_ == (3,)
def test_knn_cv3_groups():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               cv=GroupKFold(n_splits=3),
               verbose=0)
    np.random.seed(1630672634)
    groups = np.random.randint(0, 6, size=len(y))
    sfs1 = sfs1.fit(X, y, groups=groups)
    # print(sfs1.subsets_)
    expect = {
        1: {'cv_scores': np.array([0.97916667, 0.93877551, 0.96226415]),
            'feature_idx': (3,),
            'avg_score': 0.9600687759380482},
        2: {'cv_scores': np.array([0.95833333, 0.93877551, 0.98113208]),
            'feature_idx': (1, 3),
            'avg_score': 0.9594136396697044},
        3: {'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
            'feature_idx': (1, 2, 3),
            'avg_score': 0.9605821888503829}}
    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=3)
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               cv=4,
               verbose=0)
    sfs1 = sfs1.fit(X, y)
    sfs1.subsets_
    expect = {1: {'avg_score': 0.95299145299145294,
                  'cv_scores': np.array([0.97435897,
                                         0.94871795,
                                         0.88888889,
                                         1.0]),
                  'feature_idx': (3,)},
              2: {'avg_score': 0.95993589743589736,
                  'cv_scores': np.array([0.97435897,
                                         0.94871795,
                                         0.91666667,
                                         1.0]),
                  'feature_idx': (2, 3)},
              3: {'avg_score': 0.97275641025641035,
                  'cv_scores': np.array([0.97435897,
                                         1.0,
                                         0.94444444,
                                         0.97222222]),
                  'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect)
Example #7
def forward_feature_selection_linear_regression(X_train, y_train):
    """
    Selects features using Feedforward Feature Selection using a Linear Regression.
    -- RATIONALE  I had aimed to write my own function to let the number of features to select be variable, however
    due to time constraints I did not implement such a version. For now I selected the number of features (9), based on
    visual inspection of the Forward Feature Selection plots. --
    Parameters
    -----------
    Returns
    -----------
    """
    regr = LinearRegression()
    # Build step forward feature selection
    sfs = SequentialFeatureSelector(regr,
                                    k_features=9,
                                    forward=True,
                                    floating=False,
                                    verbose=2,
                                    scoring='r2',
                                    cv=5)

    # Perform Sequential Forward Selection
    sfs = sfs.fit(X_train, y_train)
    selected_feature_names = sfs.k_feature_names_
    return selected_feature_names
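The variable-k version described in the rationale above does not require a hand-rolled loop: mlxtend's SequentialFeatureSelector also accepts a (min, max) tuple or the string 'best' for k_features (both appear in other examples on this page) and keeps the best-scoring subset size. A minimal sketch under that assumption; the function name is hypothetical:

from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector


def forward_feature_selection_variable_k(X_train, y_train):
    # Let SFS evaluate every subset size and keep the best-scoring one.
    regr = LinearRegression()
    sfs = SequentialFeatureSelector(regr,
                                    k_features='best',
                                    forward=True,
                                    floating=False,
                                    scoring='r2',
                                    cv=5)
    sfs = sfs.fit(X_train, y_train)
    return sfs.k_feature_names_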
Example #8
def select_SFS(X_tr, y_tr, num_feat=100, knn_parameter=1, forward_=False, floating_=True):
    """
    Secuential Feature Selection
    :param X_tr:
    :param y_tr:
    :param num_feat:
    :param knn_parameter:
    :param forward_:
    :param floating_:
    :return:
    """
    X = X_tr
    y = y_tr
    knn = KNeighborsClassifier(n_neighbors=knn_parameter)
    sfs1 = SFS(knn,
               k_features=(1, num_feat),
               forward=forward_,
               floating=floating_,
               verbose=1,
               scoring='accuracy',
               cv=3,
               n_jobs=4)
    sfs1 = sfs1.fit(X, y)
    out = sfs1.k_feature_idx_
    return np.asarray(out)
Example #9
def forward_selection_regression(data, target, k_features=3):
    """

    :param data: pandas dataframe of input data
    :param target: pandas dataframe of input data's corresponding target
    :param k_features: number of desired features to fit the regression upon;
    features are chosen based on their importance
    :return: prints out the mean squared error and regression coefficients
    """

    reg = LinearRegression()
    sfs = SFS(reg,
              k_features,
              forward=True,
              floating=False,
              verbose=0,
              scoring='r2',
              cv=5)

    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=.3)
    sfs = sfs.fit(X_train, y_train)
    X_train_sfs = sfs.transform(X_train)
    X_test_sfs = sfs.transform(X_test)
    reg = reg.fit(X_train_sfs, y_train)

    print('estimated coefficients for the linear regression:', reg.coef_)
    print('intercept coefficient b_0:', reg.intercept_)
    print('MSE_train:', metrics.mean_squared_error(y_train, reg.predict(X_train_sfs)))
    print('MSE_test:', metrics.mean_squared_error(y_test, reg.predict(X_test_sfs)))
Example #10
def fsel(bcl, X, d, m, forward=True, floating=False, cv=0, show=0):
    if show > 0:
        print('Feature Selection - ' + bcl[0] +
              ':  - number of features reducing from ' + str(X.shape[1]) +
              ' to ' + str(m) + ' ...')
    if bcl[0] == 'Fisher':
        sel = sfsfisher(X, d, m)
    else:
        estimator = defineModel(bcl)
        sfs = SFS(estimator,
                  k_features=m,
                  forward=forward,
                  floating=floating,
                  verbose=show,
                  scoring='accuracy',
                  cv=cv)
        sfs = sfs.fit(X, d)
        sel = list(sfs.k_feature_idx_)
        if show > 0:
            print(' ')
            plot_sfs(sfs.get_metric_dict(), kind='std_err')
            plt.title('Sequential Feature Selection')
            plt.grid()
            plt.show()
    return sel
Example #11
def filter_with_sfs(train_X, valid_X, test_X, train_Y, i):
    features = set(train_X.columns)
    fs = SequentialFeatureSelector(RandomForestClassifier(n_estimators=30,
                                                          random_state=0),
                                   k_features=i,
                                   forward=True,
                                   verbose=0,
                                   scoring='accuracy',
                                   cv=4)
    fs.fit(train_X, train_Y)

    selected_features = set(fs.k_feature_names_)
    features_to_drop = list(features - selected_features)

    return train_X.drop(features_to_drop, axis=1), valid_X.drop(features_to_drop, axis=1), \
           test_X.drop(features_to_drop, axis=1)
def test_knn_cv3_groups():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               cv=GroupKFold(n_splits=3),
               verbose=0)
    np.random.seed(1630672634)
    groups = np.random.randint(0, 6, size=len(y))
    sfs1 = sfs1.fit(X, y, groups=groups)
    # print(sfs1.subsets_)
    expect = {
        1: {
            'cv_scores': np.array([0.97916667, 0.93877551, 0.96226415]),
            'feature_idx': (3, ),
            'avg_score': 0.9600687759380482
        },
        2: {
            'cv_scores': np.array([0.95833333, 0.93877551, 0.98113208]),
            'feature_idx': (1, 3),
            'avg_score': 0.9594136396697044
        },
        3: {
            'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
            'feature_idx': (1, 2, 3),
            'avg_score': 0.9605821888503829
        }
    }
    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=3)
Example #13
def forward_feature_selection_decision_tree(X_train, y_train_binned):
    """
    Selects features using Feedforward Feature Selection using a Decision Tree Classifier.
    -- RATIONALE  I had aimed to write my own function to let the number of features to select be variable, however
    due to time constraints I did not implement such a version. For now I selected the number of features (7), based on
    visual inspection of the Forward Feature Selection plots. --
    Parameters
    -----------
    X_train: training split of feature variables with continuous values
    y_train_binned: training split of feature variables with 3 class values
    Returns
    -----------
    """
    clf = tree.DecisionTreeClassifier()
    # Build step forward feature selection
    sfs = SequentialFeatureSelector(clf,
                                    k_features=7,
                                    forward=True,
                                    floating=False,
                                    verbose=2,
                                    scoring='accuracy',  # classification scorer for the 3-class target
                                    cv=5)

    # Perform Sequential Feature Selection
    sfs = sfs.fit(X_train, y_train_binned)
    selected_feature_names = sfs.k_feature_names_
    return selected_feature_names
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs1 = sfs1.fit(X, y)
    sfs1.subsets_
    expect = {1: {'avg_score': 0.95299145299145294,
                  'cv_scores': np.array([0.97435897,
                                         0.94871795,
                                         0.88888889,
                                         1.0]),
                  'feature_idx': (3,)},
              2: {'avg_score': 0.95993589743589736,
                  'cv_scores': np.array([0.97435897,
                                         0.94871795,
                                         0.91666667,
                                         1.0]),
                  'feature_idx': (2, 3)},
              3: {'avg_score': 0.97275641025641035,
                  'cv_scores': np.array([0.97435897,
                                         1.0,
                                         0.94444444,
                                         0.97222222]),
                  'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d1=expect, d2=sfs1.subsets_)
Example #15
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               cv=4,
               verbose=0)
    sfs1 = sfs1.fit(X, y)
    sfs1.subsets_
    expect = {
        1: {
            'avg_score': 0.95299145299145294,
            'cv_scores': np.array([0.97435897, 0.94871795, 0.88888889, 1.0]),
            'feature_idx': (3, )
        },
        2: {
            'avg_score': 0.95993589743589736,
            'cv_scores': np.array([0.97435897, 0.94871795, 0.91666667, 1.0]),
            'feature_idx': (2, 3)
        },
        3: {
            'avg_score': 0.97275641025641035,
            'cv_scores': np.array([0.97435897, 1.0, 0.94444444, 0.97222222]),
            'feature_idx': (1, 2, 3)
        }
    }
    dict_compare_utility(d1=expect, d2=sfs1.subsets_)
Example #16
def test_knn_wo_cv():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               cv=0,
               verbose=0)
    sfs1 = sfs1.fit(X, y)
    expect = {
        1: {
            'avg_score': 0.95999999999999996,
            'cv_scores': np.array([0.96]),
            'feature_idx': (3, )
        },
        2: {
            'avg_score': 0.97333333333333338,
            'cv_scores': np.array([0.97333333]),
            'feature_idx': (2, 3)
        },
        3: {
            'avg_score': 0.97333333333333338,
            'cv_scores': np.array([0.97333333]),
            'feature_idx': (1, 2, 3)
        }
    }
    dict_compare_utility(d1=expect, d2=sfs1.subsets_)
def test_string_scoring_clf():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn, k_features=3, cv=0)
    sfs1 = sfs1.fit(X, y)

    sfs2 = SFS(knn, k_features=3, scoring='accuracy', cv=0)
    sfs2 = sfs2.fit(X, y)

    sfs3 = SFS(knn, k_features=3, scoring=make_scorer(accuracy_score), cv=0)
    sfs3 = sfs3.fit(X, y)

    assert sfs1.k_score_ == sfs2.k_score_
    assert sfs1.k_score_ == sfs3.k_score_
def feature_selection_sfs(X, y, parameters):
    # TODO check n_neighbor
    #     k_features = parameters['sfs_k_features']
    #     forward = parameters['sfs_forward']
    #     floating = parameters['floating']
    #     scoring = parameters['scoring']
    #     n_folds = parameters['n_folds']
    #     n_jobs = parameters['n_jobs']

    knn = KNeighborsClassifier(n_neighbors=parameters['sfs_k_neighbors'])
    sfs1 = SFS(knn,
               k_features=parameters['sfs_k_features'],
               forward=parameters['sfs_forward'],
               floating=parameters['sfs_floating'],
               verbose=2,
               scoring=parameters['sfs_scoring'],
               cv=parameters['sfs_cv'],
               n_jobs=parameters['sfs_n_jobs'])

    custom_feature_names = None
    if parameters['feature_names'].any():
        custom_feature_names = parameters['feature_names']

    sfs1 = sfs1.fit(X, y, custom_feature_names=custom_feature_names)
    return sfs1
Example #19
def test_pandas():

    X_df = pd.DataFrame(
        X_iris,
        columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
    knn = KNeighborsClassifier()
    sfs = SFS(estimator=knn,
              k_features=3,
              forward=True,
              floating=False,
              fixed_features=('sepal length', 'sepal width'),
              verbose=0)
    sfs.fit(X_df, y_iris)
    print(sfs.subsets_)
    for k in sfs.subsets_:
        assert 0 in sfs.subsets_[k]['feature_idx']
        assert 1 in sfs.subsets_[k]['feature_idx']
def stepwiseFeatureSelection(label, features):
    lr = linear_model.LinearRegression()
    sfs = SFS(lr, k_features=1)
    sfs = sfs.fit(features, label)
    # refit the regression on the selected feature(s), then score the fit
    X_sfs = sfs.transform(features)
    lr = lr.fit(X_sfs, label)
    prediction = lr.predict(X_sfs)
    r2Score = r2_score(label, prediction)
    print(r2Score)
    return sfs
Example #21
def test_knn_option_sffs():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs2 = SFS(knn, k_features=3, forward=True, floating=True, cv=4, verbose=0)
    sfs2 = sfs2.fit(X, y)
    assert sfs2.k_feature_idx_ == (1, 2, 3)
Example #22
    def select_features(self,
                        model,
                        X_train,
                        y_train,
                        k_features=(1, 30),
                        scorer='r2',
                        cv=0):
        sfs = SFS(model,
                  k_features=k_features,
                  forward=True,
                  floating=False,
                  scoring=scorer,
                  cv=cv,
                  verbose=2)
        sfs.fit(np.array(X_train), np.array(y_train))
        print(sfs.k_feature_idx_)
Example #23
def test_knn_rbf_groupkfold():
    nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
    rng = np.random.RandomState(123)
    iris = load_iris()
    X = iris.data
    # knn = KNeighborsClassifier(n_neighbors=4)
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    bool_01 = [True if item == 0 else False for item in iris['target']]
    bool_02 = [
        True if (item == 1 or item == 2) else False for item in iris['target']
    ]
    groups = []
    y_new = []
    for ind, _ in enumerate(bool_01):
        if bool_01[ind]:
            groups.append('attribute_A')
            y_new.append(0)
        if bool_02[ind]:
            throw = rng.rand()
            if throw < 0.5:
                groups.append('attribute_B')
            else:
                groups.append('attribute_C')
            throw2 = rng.rand()
            if throw2 < 0.5:
                y_new.append(0)
            else:
                y_new.append(1)
    y_new_bool = [True if item == 1 else False for item in y_new]
    cv_obj = GroupKFold(n_splits=3)
    cv_obj_list = list(cv_obj.split(X, np.array(y_new_bool), groups))
    sfs1 = SFS(forest,
               k_features=3,
               forward=True,
               floating=False,
               cv=cv_obj_list,
               scoring=nan_roc_auc_scorer,
               verbose=0)
    sfs1 = sfs1.fit(X, y_new)
    expect = {
        1: {
            'cv_scores': np.array([0.52, nan, 0.72]),
            'avg_score': 0.62,
            'feature_idx': (1, )
        },
        2: {
            'cv_scores': np.array([0.42, nan, 0.65]),
            'avg_score': 0.53,
            'feature_idx': (1, 2)
        },
        3: {
            'cv_scores': np.array([0.47, nan, 0.63]),
            'avg_score': 0.55,
            'feature_idx': (1, 2, 3)
        }
    }

    dict_compare_utility(d1=expect, d2=sfs1.subsets_, decimal=1)
Example #24
def vote(X_train, Y_train, X_test, Y_test, voting_type, feature_selection,
         k_features):
    """Invokation of a soft voting/majority rule classification.

    This is a wrapper around `sklearn.ensemble.VotingClassifier` which
    automatically uses all classifiers that are known to `gumpy` in
    `gumpy.classification.available_classifiers`.

    Args:
        X_train: training data (values)
        Y_train: training data (labels)
        X_test: evaluation data (values)
        Y_test: evaluation data (labels)
        voting_type (str): either of 'soft' or 'hard'. See the
            sklearn.ensemble.VotingClassifier documentation for more details
        feature_selection (bool): whether to run sequential feature selection
            on the training data before fitting the classifiers
        k_features: number of features to select if `feature_selection` is set

    Returns:
        2-element tuple containing

        - **ClassificationResult**: The result of the classification.
        - **Classifier**:  The instance of `sklearn.ensemble.VotingClassifier`
          that was used during the classification.

    """

    k_cross_val = 10
    N_JOBS = -1

    clfs = []
    for classifier in available_classifiers:
        # determine kwargs such that the classifiers get initialized with
        # proper default settings. This avoids cross-validation, for instance
        opts = available_classifiers[classifier].static_opts('vote',
                                                             X_train=X_train)

        # retrieve instance
        cobj = available_classifiers[classifier](**opts)
        clfs.append((classifier, cobj.clf))

    # instantiate the VotingClassifier
    soft_vote_clf = VotingClassifier(estimators=clfs, voting=voting_type)

    if feature_selection:
        sfs = SFS(soft_vote_clf,
                  k_features,
                  forward=True,
                  floating=True,
                  verbose=2,
                  scoring='accuracy',
                  cv=k_cross_val,
                  n_jobs=N_JOBS)
        sfs = sfs.fit(X_train, Y_train)
        X_train = sfs.transform(X_train)
        X_test = sfs.transform(X_test)

    soft_vote_clf.fit(X_train, Y_train)
    Y_pred = soft_vote_clf.predict(X_test)
    return ClassificationResult(Y_test, Y_pred), soft_vote_clf
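For orientation, a call sketch under this snippet's assumptions (gumpy data splits already prepared; the parameter values are illustrative only):

result, clf = vote(X_train, Y_train, X_test, Y_test,
                   voting_type='soft', feature_selection=True, k_features=10)
print(result)  # ClassificationResult comparing Y_test with the predictions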
Example #25
def test_check_pandas_dataframe_fit_backward():
    for floating in [True, False]:
        iris = load_iris()
        X = iris.data
        y = iris.target
        lr = SoftmaxRegression(random_seed=1)
        sfs1 = SFS(lr,
                   k_features=2,
                   forward=False,
                   floating=floating,
                   scoring='accuracy',
                   cv=0,
                   verbose=0,
                   n_jobs=1)

        df = pd.DataFrame(
            X,
            columns=['sepal len', 'sepal width', 'petal len', 'petal width'])

        sfs1 = sfs1.fit(X, y)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('1', '2')
        assert sfs1.subsets_[2]['feature_names'] == ('1', '2')

        sfs1 = sfs1.fit(df, y)
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
                                                     'sepal width',
                                                     'petal len')
        assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
                                                     'petal len')
        assert sfs1.subsets_[3]['feature_idx'] == (0, 1, 2)
        assert sfs1.subsets_[2]['feature_idx'] == (1, 2)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('sepal width', 'petal len')

        sfs1._TESTING_INTERRUPT_MODE = True
        out = sfs1.fit(df, y)
        assert len(out.subsets_.keys()) > 0
        assert sfs1.interrupted_
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
                                                     'sepal width',
                                                     'petal len')
        assert sfs1.k_feature_idx_ == (0, 1, 2)
        assert sfs1.k_feature_names_ == ('sepal len', 'sepal width',
                                         'petal len')
def backward(X_train, Y_train):
    rf_sfs = RandomForestRegressor(n_estimators=100, max_depth=50, oob_score=False, n_jobs=-1)
    SFS_b = SequentialFeatureSelector(rf_sfs, forward=False, k_features=6, scoring='neg_mean_squared_error',
                                      n_jobs=-1)
    SFS_b = SFS_b.fit(X_train.values, Y_train.values)
    indxs = list(SFS_b.k_feature_names_)
    str_cols = X_train.columns
    # k_feature_names_ holds string indices when fitting on .values;
    # map them back to the original column names
    features = [str_cols[int(i)] for i in indxs]
    print(features)
Example #27
def test_max_feature_subset_best():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()

    sfs = SFS(lr, k_features='best', forward=True, floating=False, cv=10)

    sfs = sfs.fit(X, y)
    assert sfs.k_feature_idx_ == (1, 3, 5, 7, 8, 9, 10, 11, 12)
def sequential_feature_selection(X, y, k):
    sfs = SFS(LinearRegression(),
              k_features=k,
              forward=True,
              floating=False,
              scoring='r2',
              cv=0)
    fit = sfs.fit(X, y)
    return fit
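A quick usage sketch for the helper above, assuming the SFS and LinearRegression imports used throughout these examples are in scope, on sklearn's diabetes toy data:

from sklearn.datasets import load_diabetes

X, y = load_diabetes(return_X_y=True)
fit = sequential_feature_selection(X, y, k=4)
print(fit.k_feature_idx_, round(fit.k_score_, 4))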
Example #29
def select_r2(df_in, ss_label, f_n, eps):
    dfx = df_in.copy()
    if len(dfx.columns) > f_n:
        select = SFS(RandomForestClassifier(n_estimators=eps, random_state=1),
                     k_features=f_n,
                     forward=True,
                     floating=False,
                     scoring='accuracy',
                     cv=4,
                     n_jobs=3)
        select.fit(dfx.values, ss_label.values)
        mask = select.k_feature_idx_
        x_sfs = select.transform(dfx.values)
        m_mir_list = dfx.columns[list(mask)]
        return x_sfs, ','.join(m_mir_list), len(m_mir_list)
    else:
        f_list = dfx.columns.tolist()
        return dfx.values, ','.join(f_list), len(f_list)
Example #30
def sfs(X_train, y_train, estimator, metric):
    sfs1 = SFS(estimator,
               k_features=(1, X_train.shape[1]),
               forward=True,
               floating=False,
               scoring=metric,
               cv=0)
    sfs1 = sfs1.fit(X_train, y_train)
    return sfs1.k_feature_idx_
Example #31
def select_features_wrapper(X, y, forward=True, k_features=20):
    # svc = SVC(gamma='auto')
    # linearSVC = LinearSVC(random_state=0, tol=1e-5, class_weight='balanced')
    random_forest_clssifier = RandomForestClassifier(max_depth=7,
                                                     random_state=0)

    sgd = SGDClassifier(max_iter=1000, tol=1e-3)
    #     knn = KNeighborsClassifier(n_neighbors=3)
    sfs = SequentialFeatureSelector(sgd,
                                    k_features=k_features,
                                    forward=forward,
                                    floating=False,
                                    verbose=5,
                                    cv=0,
                                    n_jobs=-1)
    sfs.fit(X, y.values.ravel())
    print(sfs.k_feature_names_)
    return sfs
Example #32
def get_best_logisitc(y):

    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from sklearn.cross_validation import StratifiedKFold
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.cross_validation import cross_val_score

    my_data = pd.read_csv('data/my_data_test.csv', encoding='utf-8')

    y = my_data.target
    my_data = my_data.drop('target', axis=1)

    # To have better CV
    skf = StratifiedKFold(y, n_folds=5, random_state=17, shuffle=False)

    C_params = [0.01, 1, 10, 50, 70, 100]
    solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag']

    my_result_list = []
    for C_param in C_params:
        for solver in solvers:
            print("Looking for C : %s and solver : %s" % (C_param, solver))
            model = LogisticRegression(class_weight='balanced', random_state=17,
                                       solver=solver, C=C_param)
            sfs = SFS(model,
                      k_features=len(my_data.columns),
                      forward=True,
                      floating=False,
                      scoring='roc_auc',
                      print_progress=False,
                      cv=skf,
                      n_jobs=-1)

            sfs = sfs.fit(my_data.values, y.values)

            result_sfs = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
            result_sfs.sort_values('avg_score', ascending=False, inplace=True)
            # feature_idx holds a tuple of column positions for the best subset
            features_sfs = list(result_sfs.feature_idx.head(1).values[0])
            select_features_sfs = list(my_data.columns[features_sfs])

            scores = cross_val_score(model, my_data[select_features_sfs], y,
                                     cv=skf, scoring='roc_auc')
            my_result_list.append({'C': C_param,
                                   'solver': solver,
                                   'auc': scores.mean(),
                                   'std': scores.std(),
                                   'best_columns': select_features_sfs,
                                   'estimator': model})

    my_result = pd.DataFrame(my_result_list)
    my_result.sort_values('auc', ascending=False, inplace=True)

    best_features = my_result.best_columns.head(1).values[0]
    best_model = my_result.estimator.head(1).values[0]

    return best_features, best_model
def test_check_pandas_dataframe_fit_backward():
    for floating in [True, False]:
        iris = load_iris()
        X = iris.data
        y = iris.target
        lr = SoftmaxRegression(random_seed=1)
        sfs1 = SFS(lr,
                   k_features=2,
                   forward=False,
                   floating=floating,
                   scoring='accuracy',
                   cv=0,
                   verbose=0,
                   n_jobs=1)

        df = pd.DataFrame(X, columns=['sepal len', 'sepal width',
                                      'petal len', 'petal width'])

        sfs1 = sfs1.fit(X, y)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('1', '2')
        assert sfs1.subsets_[2]['feature_names'] == ('1', '2')

        sfs1 = sfs1.fit(df, y)
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
                                                     'sepal width',
                                                     'petal len')
        assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
                                                     'petal len')
        assert sfs1.subsets_[3]['feature_idx'] == (0, 1, 2)
        assert sfs1.subsets_[2]['feature_idx'] == (1, 2)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('sepal width', 'petal len')

        sfs1._TESTING_INTERRUPT_MODE = True
        out = sfs1.fit(df, y)
        assert len(out.subsets_.keys()) > 0
        assert sfs1.interrupted_
        assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
                                                     'sepal width',
                                                     'petal len')
        assert sfs1.k_feature_idx_ == (0, 1, 2)
        assert sfs1.k_feature_names_ == ('sepal len', 'sepal width',
                                         'petal len')
def feature_selection(regr, train):
    x, y = train.drop(columns=['Id', 'SalePrice']), train['SalePrice']

    regr.fit(x, y)

    sfs = SFS(regr, k_features=x.shape[1] - 10, forward=False, verbose=2,
              scoring='neg_mean_squared_error', cv=4)
    sfs.fit(x, y)
    selected_features = (pd.DataFrame(sfs.get_metric_dict())
                         .T
                         .loc[:, ['feature_names', 'avg_score', 'std_dev', 'std_err']]
                         .sort_values(['avg_score', 'std_dev'], ascending=False)
                         .reset_index(drop=True))

    best_features = selected_features.at[0, 'feature_names']
    best_features = list(best_features)
    bad_features = [f for f in x if f not in best_features]

    return bad_features
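A brief usage sketch for the helper above, assuming a house-prices-style frame train with Id and SalePrice columns is already loaded:

from sklearn.linear_model import LinearRegression

regr = LinearRegression()
bad_features = feature_selection(regr, train)
train_reduced = train.drop(columns=bad_features)  # prune before modeling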
def feature_selection(X, y,  method=1, k_features=5, save_params=False, seed=127):
    logit = LogisticRegression(C=1, random_state=seed, solver='liblinear')

    if method == 1:
        rfe = RFE(logit, n_features_to_select=k_features, verbose=2)
        rfe.fit(X, y)
        if save_params:
            with open('rfe.pkl', 'wb') as file:
                pickle.dump(rfe, file, pickle.HIGHEST_PROTOCOL)
        return rfe
    elif method == 2:
        sfs = SequentialFeatureSelector(logit, cv=0, k_features=k_features,
                                        forward=False, scoring='roc_auc',
                                        verbose=2, n_jobs=-1)
        sfs.fit(X, y)
        if save_params:
            with open('sfs.pkl', 'wb') as file:
                pickle.dump(sfs, file, pickle.HIGHEST_PROTOCOL)
        return sfs
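Since the snippet persists the fitted selector, restoring it later is straightforward with pickle (assuming 'sfs.pkl' was produced by the call above):

import pickle

with open('sfs.pkl', 'rb') as file:
    sfs = pickle.load(file)
print(sfs.k_feature_idx_)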
def callSBS():
    sbs = SFS(knn,
              k_features=8,
              forward=False,
              floating=False,
              scoring='accuracy',
              cv=0,
              verbose=2)

    sbs = sbs.fit(X, yfinal)
    print(sbs.subsets_)
def callSFS():
    sfs1 = SFS(knn,
               k_features=8,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=0)

    sfs1 = sfs1.fit(X, yfinal)
    print(sfs1.subsets_)
def callSFBS():
    sfbs = SFS(knn,
               k_features=8,
               forward=False,
               floating=True,
               scoring='accuracy',
               cv=0,
               n_jobs=-1)

    sfbs = sfbs.fit(X, yfinal)
    print(sfbs.subsets_)
def callSFFS():
    sffs = SFS(knn,
               k_features=8,
               forward=True,
               floating=True,
               scoring='accuracy',
               cv=0,
               verbose=2)

    sffs = sffs.fit(X, yfinal)
    print(sffs.subsets_)
def test_knn_option_sfbs():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs4 = SFS(knn,
               k_features=3,
               forward=False,
               floating=True,
               cv=4,
               verbose=0)
    sfs4 = sfs4.fit(X, y)
    assert sfs4.k_feature_idx_ == (1, 2, 3)
def test_regression_sbfs():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=3,
                forward=False,
                floating=True,
                scoring='neg_mean_squared_error',
                cv=10,
                verbose=0)
    sfs_r = sfs_r.fit(X, y)
    assert sfs_r.k_feature_idx_ == (7, 10, 12), sfs_r.k_feature_idx_
def test_max_feature_subset_best():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()

    sfs = SFS(lr,
              k_features='best',
              forward=True,
              floating=False,
              cv=10)

    sfs = sfs.fit(X, y)
    assert sfs.k_feature_idx_ == (1, 3, 5, 7, 8, 9, 10, 11, 12)
def test_max_feature_subset_parsimonious():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()

    sfs = SFS(lr,
              k_features='parsimonious',
              forward=True,
              floating=False,
              cv=10)

    sfs = sfs.fit(X, y)
    assert sfs.k_feature_idx_ == (5, 10, 11, 12)
def test_regression():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=13,
                forward=True,
                floating=False,
                scoring='mean_squared_error',
                cv=10,
                skip_if_stuck=True,
                print_progress=False)
    sfs_r = sfs_r.fit(X, y)
    assert round(sfs_r.k_score_, 4) == -34.7631
def test_knn_option_sbs_tuplerange_1():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=3)
    sfs4 = SFS(knn,
               k_features=(1, 3),
               forward=False,
               floating=False,
               cv=4,
               verbose=0)
    sfs4 = sfs4.fit(X, y)
    assert round(sfs4.k_score_, 3) == 0.967, sfs4.k_score_
    assert sfs4.k_feature_idx_ == (0, 2, 3), sfs4.k_feature_idx_
def test_regression_in_range():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=(1, 13),
                forward=True,
                floating=False,
                scoring='neg_mean_squared_error',
                cv=10,
                verbose=0)
    sfs_r = sfs_r.fit(X, y)
    assert len(sfs_r.k_feature_idx_) == 9
    assert round(sfs_r.k_score_, 4) == -31.1537
def test_knn_rbf_groupkfold():
    nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
    rng = np.random.RandomState(123)
    iris = load_iris()
    X = iris.data
    # knn = KNeighborsClassifier(n_neighbors=4)
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    bool_01 = [True if item == 0 else False for item in iris['target']]
    bool_02 = [True if (item == 1 or item == 2) else False for item in
               iris['target']]
    groups = []
    y_new = []
    for ind, _ in enumerate(bool_01):
        if bool_01[ind]:
            groups.append('attribute_A')
            y_new.append(0)
        if bool_02[ind]:
            throw = rng.rand()
            if throw < 0.5:
                groups.append('attribute_B')
            else:
                groups.append('attribute_C')
            throw2 = rng.rand()
            if throw2 < 0.5:
                y_new.append(0)
            else:
                y_new.append(1)
    y_new_bool = [True if item == 1 else False for item in y_new]
    cv_obj = GroupKFold(n_splits=3)
    cv_obj_list = list(cv_obj.split(X, np.array(y_new_bool), groups))
    sfs1 = SFS(forest,
               k_features=3,
               forward=True,
               floating=False,
               cv=cv_obj_list,
               scoring=nan_roc_auc_scorer,
               verbose=0
               )
    sfs1 = sfs1.fit(X, y_new)
    expect = {
        1: {'cv_scores': np.array([0.52, nan, 0.72]), 'avg_score': 0.62,
            'feature_idx': (1,)},
        2: {'cv_scores': np.array([0.42, nan, 0.65]), 'avg_score': 0.53,
            'feature_idx': (1, 2)},
        3: {'cv_scores': np.array([0.47, nan, 0.63]),
            'avg_score': 0.55,
            'feature_idx': (1, 2, 3)}}

    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=1)
def test_knn_option_sffs():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs2 = SFS(knn,
               k_features=3,
               forward=True,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs2 = sfs2.fit(X, y)
    assert sfs2.k_feature_idx_ == (1, 2, 3)
def test_string_scoring_clf():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               cv=0)
    sfs1 = sfs1.fit(X, y)

    sfs2 = SFS(knn,
               k_features=3,
               scoring='accuracy',
               cv=0)
    sfs2 = sfs2.fit(X, y)

    sfs3 = SFS(knn,
               k_features=3,
               scoring=make_scorer(accuracy_score),
               cv=0)
    sfs3 = sfs3.fit(X, y)

    assert sfs1.k_score_ == sfs2.k_score_
    assert sfs1.k_score_ == sfs3.k_score_
def test_knn_option_sfs():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               print_progress=False)
    sfs1 = sfs1.fit(X, y)
    assert sfs1.k_feature_idx_ == (1, 2, 3)
def test_max_feature_subset_size_in_tuple_range():
    boston = load_boston()
    X, y = boston.data, boston.target

    lr = LinearRegression()

    sfs = SFS(lr,
              k_features=(1, 5),
              forward=False,
              floating=True,
              scoring='neg_mean_squared_error',
              cv=10)

    sfs = sfs.fit(X, y)
    assert len(sfs.k_feature_idx_) == 5
def test_clone_params_pass():
    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    sfs1 = SFS(lr,
               k_features=2,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=0,
               clone_estimator=True,
               verbose=0,
               n_jobs=1)
    sfs1 = sfs1.fit(X, y)
    assert (sfs1.k_feature_idx_ == (1, 3))
def test_knn_option_sfbs_tuplerange_2():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=3)
    sfs4 = SFS(knn,
               k_features=(1, 4),
               forward=False,
               floating=True,
               scoring='accuracy',
               cv=4,
               skip_if_stuck=True,
               verbose=0)
    sfs4 = sfs4.fit(X, y)
    assert round(sfs4.k_score_, 3) == 0.966, sfs4.k_score_
    assert sfs4.k_feature_idx_ == (1, 2, 3), sfs4.k_feature_idx_
def test_check_pandas_dataframe_transform():
    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    sfs1 = SFS(lr,
               k_features=2,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=0,
               verbose=0,
               n_jobs=1)

    df = pd.DataFrame(X, columns=['sepal length', 'sepal width',
                                  'petal length', 'petal width'])
    sfs1 = sfs1.fit(df, y)
    assert sfs1.k_feature_idx_ == (1, 3)
    assert (150, 2) == sfs1.transform(df).shape
def sfs_selection(X,y,n_features,forward):
	"""
	Performs the Sequential Forward/Backward Selection method and selects the top ranking features

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	n_features -- n best ranked features
	forward -- True for forward selection, False for backward elimination
	"""

	if verbose:
		print('\nPerforming Feature Selection based on the Sequential Feature Selection method ...')

	clf = RandomForestClassifierWithCoef(n_estimators=5, n_jobs=-1)
	sfs = SFS(clf, k_features=n_features, forward=forward, scoring='accuracy', cv=0, n_jobs=-1, print_progress=True)
	sfs = sfs.fit(X, y)

	feature_indexes=sfs.k_feature_idx_
	return X[:,feature_indexes[0:n_features]],feature_indexes[0:n_features]		#return selected features and original index features
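For reference, a hedged call sketch; it assumes the module-level verbose flag and the project's RandomForestClassifierWithCoef wrapper, neither of which is defined in the snippet itself:

X_selected, selected_idx = sfs_selection(X, y, n_features=10, forward=True)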
def test_regression():
    boston = load_boston()
    X, y = boston.data, boston.target
    lr = LinearRegression()
    sfs_r = SFS(lr,
                k_features=13,
                forward=True,
                floating=False,
                scoring='neg_mean_squared_error',
                cv=10,
                verbose=0)
    sfs_r = sfs_r.fit(X, y)
    assert len(sfs_r.k_feature_idx_) == 13

    if Version(sklearn_version) < '0.20':
        assert round(sfs_r.k_score_, 4) == -34.7631, \
            round(sfs_r.k_score_, 4)
    else:
        assert round(sfs_r.k_score_, 4) == -34.7053, \
            round(sfs_r.k_score_, 4)
def test_custom_feature_names():

    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    sfs1 = SFS(lr,
               k_features=2,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=0,
               verbose=0,
               n_jobs=1)

    sfs1 = sfs1.fit(X, y, custom_feature_names=(
          'sepal length', 'sepal width', 'petal length', 'petal width'))
    assert sfs1.k_feature_idx_ == (1, 3)
    assert sfs1.k_feature_names_ == ('sepal width', 'petal width')
    assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
                                                 'petal width')
def test_knn_wo_cv():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               cv=0,
               verbose=0)
    sfs1 = sfs1.fit(X, y)
    expect = {1: {'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96]),
                  'feature_idx': (3,)},
              2: {'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333]),
                  'feature_idx': (2, 3)},
              3: {'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333]),
                  'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect)
def test_keyboard_interrupt():
    iris = load_iris()
    X = iris.data
    y = iris.target

    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(
        knn,
        k_features=3,
        forward=True,
        floating=False,
        cv=3,
        clone_estimator=False,
        verbose=5,
        n_jobs=1
    )

    sfs1._TESTING_INTERRUPT_MODE = True
    out = sfs1.fit(X, y)

    assert len(out.subsets_.keys()) > 0
    assert sfs1.interrupted_
def test_knn_wo_cv():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    sfs1 = SFS(knn,
               k_features=3,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=0,
               skip_if_stuck=True,
               print_progress=False)
    sfs1 = sfs1.fit(X, y)
    expect = {1: {'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96]),
                  'feature_idx': (3,)},
              2: {'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333]),
                  'feature_idx': (2, 3)},
              3: {'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333]),
                  'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d1=expect, d2=sfs1.subsets_)