class DFExhaustiveFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.selector = ExhaustiveFeatureSelector(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def fit(self, X, y):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.selector.fit(X[self.transform_cols], y)

        # Collect the per-subset CV statistics and flag the best-scoring subset.
        self.stat_df = pd.DataFrame.from_dict(
            self.selector.get_metric_dict()).T
        self.stat_df.at[self.stat_df['avg_score'].astype(float).idxmax(),
                        'support'] = True
        self.stat_df['support'] = self.stat_df['support'].fillna(False)

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        features = list(
            self.stat_df[self.stat_df['support']]['feature_names'].values[0])
        new_X = X[features].copy()

        return new_X

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)
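A minimal usage sketch for the wrapper above (illustrative only; the iris data, KNN estimator, and CV settings are assumptions, not part of the original snippet):

# Hypothetical usage of the DFExhaustiveFeatureSelector wrapper defined above.
# The estimator and settings below are illustrative assumptions.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

selector = DFExhaustiveFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=4),
                                       min_features=2,
                                       max_features=3,
                                       scoring='accuracy',
                                       cv=3)
X_selected = selector.fit_transform(X, y)  # keeps only the best-scoring column subset
print(X_selected.columns.tolist())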
Example 2
def select_EFS(tr, num_feat=100):
    X = tr
    y = [i for i in range(9) for j in range(60)]
    knn = KNeighborsClassifier(n_neighbors=1)
    efs = EFS(knn, min_features=num_feat, max_features=num_feat, cv=5, n_jobs=4)
    efs.fit(X, y)
    out = efs.best_idx_
    return out
Example 3
def test_knn_cv3_groups():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=GroupKFold(n_splits=3),
               print_progress=False)
    np.random.seed(1630672634)
    groups = np.random.randint(0, 6, size=len(y))
    efs1 = efs1.fit(X, y, groups=groups)

    expect = {0: {'cv_scores': np.array([0.97916667, 0.93877551, 0.9245283]),
                  'feature_idx': (0, 1, 2),
                  'avg_score': 0.9474901595858469,
                  'feature_names': ('0', '1', '2')},
              1: {'cv_scores': np.array([1., 0.93877551, 0.9245283]),
                  'feature_idx': (0, 1, 3),
                  'avg_score': 0.9544346040302915,
                  'feature_names': ('0', '1', '3')},
              2: {'cv_scores': np.array([0.97916667, 0.95918367, 0.9245283]),
                  'feature_idx': (0, 2, 3),
                  'avg_score': 0.9542928806742822,
                  'feature_names': ('0', '2', '3')},
              3: {'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
                  'feature_idx': (1, 2, 3),
                  'avg_score': 0.9605821888503829,
                  'feature_names': ('1', '2', '3')}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {0: {'avg_score': 0.9391025641025641,
                  'feature_idx': (0, 1, 2),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.88888889, 0.94444444])},
              1: {'avg_score': 0.94017094017094016,
                  'feature_idx': (0, 1, 3),
                  'cv_scores': np.array([0.92307692, 0.94871795,
                                         0.91666667, 0.97222222])},
              2: {'avg_score': 0.95299145299145294,
                  'feature_idx': (0, 2, 3),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.91666667, 0.97222222])},
              3: {'avg_score': 0.97275641025641035,
                  'feature_idx': (1, 2, 3),
                  'cv_scores': np.array([0.97435897, 1.,
                                         0.94444444, 0.97222222])}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (1, 2, 3)
    assert round(efs1.best_score_, 4) == 0.9728
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    efs1 = EFS(forest,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y, sample_weight=sample_weight)
    expect = {0: {'feature_idx': (0, 1, 2),
                  'cv_scores': np.array([0.94871795, 0.92307692,
                                         0.91666667, 0.97222222]),
                  'avg_score': 0.9401709401709402},
              1: {'feature_idx': (0, 1, 3),
                  'cv_scores': np.array([0.92307692, 0.92307692,
                                         0.88888889, 1.]),
                  'avg_score': 0.9337606837606838},
              2: {'feature_idx': (0, 2, 3),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.94444444, 0.97222222]),
                  'avg_score': 0.9599358974358974},
              3: {'feature_idx': (1, 2, 3),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.91666667, 1.]),
                  'avg_score': 0.9599358974358974}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (0, 2, 3)
    assert round(efs1.best_score_, 4) == 0.9599
Example 7
def test_check_pandas_dataframe_fit():

    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    df = pd.DataFrame(
        X,
        columns=['sepal length', 'sepal width', 'petal length', 'petal width'])

    sfs1 = efs1.fit(X, y)
    assert efs1.best_idx_ == (2, 3), efs1.best_idx_
    assert efs1.best_feature_names_ == ('2', '3')
    assert efs1.interrupted_ is False

    sfs1._TESTING_INTERRUPT_MODE = True
    sfs1 = sfs1.fit(df, y)
    assert efs1.best_idx_ == (0, 1), efs1.best_idx_
    assert efs1.best_feature_names_ == ('sepal length', 'sepal width')
    assert efs1.interrupted_ is True
    def exhaustive_feature_selection(X: dt.Frame = None):
        if X is None:
            return []
        # X[:, 'default payment next month leak'] = X[:, 'default payment next month']
        datadf = X.to_pandas()
        data_y = datadf['default payment next month']
        data_X = datadf.iloc[:, :datadf.shape[1] - 1]  # all columns except the target
        XX = data_X
        y = np.ravel(data_y)
        #
        knn = KNeighborsClassifier(n_neighbors=3)

        efs1 = EFS(knn, 
                min_features=5,
                max_features=10,
                scoring='accuracy',
                print_progress=True,
                cv=5)

        efs1 = efs1.fit(XX, y)
        support = efs1.best_feature_names_
        feat_list = list(support)
        # get the new features
        col_names_to_pick = feat_list + ['default payment next month']
        new_df = datadf[col_names_to_pick]
        new_dt = dt.Frame(new_df)
        return new_dt
Example 11
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    efs1 = EFS(forest,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y, sample_weight=sample_weight)
    expect = {
        0: {
            'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores': np.array([0.947, 0.868, 0.919, 0.973]),
            'avg_score': 0.9269203413940257
        },
        1: {
            'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores': np.array([0.921, 0.921, 0.892, 1.]),
            'avg_score': 0.9337606837606838
        },
        2: {
            'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.919, 0.973]),
            'avg_score': 0.9532361308677098
        },
        3: {
            'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.892, 1.]),
            'avg_score': 0.9532361308677098
        }
    }

    if Version(sklearn_version) < Version("0.22"):
        expect[0]['avg_score'] = 0.9401709401709402
        expect[0]['cv_scores'] = np.array(
            [0.94871795, 0.92307692, 0.91666667, 0.97222222])
        expect[1]['cv_scores'] = np.array(
            [0.94871795, 0.92307692, 0.91666667, 0.97222222])
        expect[2]['cv_scores'] = np.array(
            [0.94871795, 0.92307692, 0.91666667, 0.97222222])
        expect[2]['avg_score'] = 0.9599358974358974
        expect[3]['avg_score'] = 0.9599358974358974
        expect[3]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.91666667, 1.])
        assert round(efs1.best_score_, 4) == 0.9599

    else:
        assert round(efs1.best_score_, 4) == 0.9532

    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (0, 2, 3)
Example 12
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {
        0: {
            'avg_score': 0.9391025641025641,
            'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores': np.array([0.974, 0.947, 0.892, 0.946])
        },
        1: {
            'avg_score': 0.9400782361308677,
            'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores': np.array([0.921, 0.947, 0.919, 0.973])
        },
        2: {
            'avg_score': 0.95299145299145294,
            'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.919, 0.973])
        },
        3: {
            'avg_score': 0.97275641025641035,
            'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.974, 1., 0.946, 0.973])
        }
    }

    if Version(sklearn_version) < Version("0.22"):
        expect[0]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.88888889, 0.94444444])
        expect[1]['cv_scores'] = np.array(
            [0.92307692, 0.94871795, 0.91666667, 0.97222222])
        expect[2]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.91666667, 0.97222222])
        expect[3]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.91666667, 0.97222222])
        expect[1]['avg_score'] = 0.94017094017094016
        assert round(efs1.best_score_, 4) == 0.9728
    else:
        assert round(efs1.best_score_, 4) == 0.9732

    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (1, 2, 3)
    assert efs1.best_feature_names_ == ('1', '2', '3')
Example 13
def exhaustive_feature_selection(x_data, y_data, min_feat, max_feat):
    print(f"Applying exhaustive feature selection to numeric data")
    print(
        f"cat variables before backward feature selection {x_data.select_dtypes(include='object').columns.shape}"
    )
    print(
        f"numeric variables before backward feature selection {x_data.select_dtypes(include='number').columns.shape}"
    )

    numeric_cols = x_data.select_dtypes(include='number').columns

    temp = x_data[numeric_cols]

    efs = EFS(RandomForestRegressor(n_jobs=4),
              max_features=max_feat,
              min_features=min_feat,
              scoring='r2',
              print_progress=True,
              cv=2)

    efs.fit(temp, y_data)

    idx = efs.best_idx_

    print(idx)

    idx = list(idx)

    cols_to_keep = numeric_cols[idx]
    cols_to_drop = [x for x in numeric_cols if x not in cols_to_keep]

    print(len(cols_to_drop))

    x_data.drop(labels=cols_to_drop, axis=1, inplace=True)
    print(
        f"cat variables after exhaustive feature selection {x_data.select_dtypes(include='object').columns}"
    )
    print(
        f"numeric variables after exhaustive  feature selection {x_data.select_dtypes(include='number').columns}"
    )
    return x_data
Example 14
def test_knn_wo_cv():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=2,
               max_features=3,
               scoring='accuracy',
               cv=0,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {0: {'feature_idx': (0, 1),
                  'feature_names': ('0', '1'),
                  'avg_score': 0.82666666666666666,
                  'cv_scores': np.array([0.82666667])},
              1: {'feature_idx': (0, 2),
                  'feature_names': ('0', '2'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              2: {'feature_idx': (0, 3),
                  'feature_names': ('0', '3'),
                  'avg_score': 0.96666666666666667,
                  'cv_scores': np.array([0.96666667])},
              3: {'feature_idx': (1, 2),
                  'feature_names': ('1', '2'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              4: {'feature_idx': (1, 3),
                  'feature_names': ('1', '3'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              5: {'feature_idx': (2, 3),
                  'feature_names': ('2', '3'),
                  'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333])},
              6: {'feature_idx': (0, 1, 2),
                  'feature_names': ('0', '1', '2'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              7: {'feature_idx': (0, 1, 3),
                  'feature_names': ('0', '1', '3'),
                  'avg_score': 0.96666666666666667,
                  'cv_scores': np.array([0.96666667])},
              8: {'feature_idx': (0, 2, 3),
                  'feature_names': ('0', '2', '3'),
                  'avg_score': 0.96666666666666667,
                  'cv_scores': np.array([0.96666667])},
              9: {'feature_idx': (1, 2, 3),
                  'feature_names': ('1', '2', '3'),
                  'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333])}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
Example 16
    def brute_force(self, X, y, y_type):
        if y_type == "binary":
            est = LinearRegression()
            efs = EFS(
                estimator=est,
                min_features=1,
                max_features=2,
                scoring="neg_mean_squared_error",
                cv=5,
            )

            efs = efs.fit(X, y)
            efs_df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
            efs_df.sort_values("avg_score", inplace=True, ascending=False)

        else:
            est = LogisticRegression()
            efs = EFS(
                estimator=est,
                min_features=1,
                max_features=2,
                scoring="neg_mean_squared_error",
                cv=5,
            )

            efs = efs.fit(X, y)
            efs_df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
            efs_df.sort_values("avg_score", inplace=True, ascending=False)

        # horizontal bar chart
        fig, ax = plt.subplots(figsize=(12, 9))
        y_pos = np.arange(len(efs_df))
        ax.barh(y_pos, efs_df["avg_score"], xerr=efs_df["std_dev"])
        ax.set_yticks(y_pos)
        ax.set_xlabel("Avg Score")
        ax.set_ylabel("Feature Names")
        ax.tick_params(labelleft=False)
        plt.show()

        return efs_df
def test_regression():
    boston = load_boston()
    X, y = boston.data[:, [1, 2, 6, 8, 12]], boston.target
    lr = LinearRegression()
    efs_r = EFS(lr,
                min_features=3,
                max_features=4,
                scoring='neg_mean_squared_error',
                cv=10,
                print_progress=False)
    efs_r = efs_r.fit(X, y)
    assert efs_r.best_idx_ == (0, 2, 4)
    assert round(efs_r.best_score_, 4) == -40.8777
    def create_data(X: dt.Frame = None) -> pd.DataFrame:
        if X is None:
            return []

        from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

        X = X.to_pandas()
        y = X[TARGET_COLUMN].values
        X.drop(TARGET_COLUMN, axis=1, inplace=True)

        efs = EFS(ESTIMATOR,
                  min_features=MIN_FEATURES,
                  max_features=MAX_FEATURES,
                  scoring=SCORING,
                  cv=CV,
                  n_jobs=-1)

        efs.fit(X, y)

        X_fs = X.iloc[:, list(efs.best_idx_)]

        return X_fs
def test_clone_params_pass():
    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    efs1 = EFS(lr,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)
    efs1 = efs1.fit(X, y)
    assert(efs1.best_idx_ == (1, 3))
Example 22
def ExhaustiveFeatureSelector(X, y, min_features=1, max_features=4):
    from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
    from sklearn.linear_model import LinearRegression
    lr = LinearRegression()

    efs1 = EFS(lr,
               min_features=min_features,
               max_features=max_features,
               scoring='r2',
               print_progress=True,
               cv=5)

    efs1 = efs1.fit(X, y)

    #print('Best subset:', efs1.best_idx_)
    print('Best subset (corresponding names):', efs1.best_feature_names_)
    print('Best R² score: %.2f' % efs1.best_score_)
    return efs1.best_feature_names_, efs1.best_score_
def perform_efs(curr_model, X, y, min_cols, max_cols):

    efs1 = EFS(curr_model,
               min_features=min_cols,
               max_features=max_cols,
               print_progress=True,
               scoring='accuracy',
               cv=5,
               n_jobs=-1)

    efs1 = efs1.fit(X, y)

    df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
    #    df['test_acc'] = df['feature_idx'].apply(
    #        lambda x: make_predictions_on_test(efs1, curr_model, X_train, X_test, y_train, y_test, x)
    #    )

    return df
Example 25
def test_custom_feature_names():
    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    efs1 = efs1.fit(X, y, custom_feature_names=(
          'sepal length', 'sepal width', 'petal length', 'petal width'))
    assert efs1.best_idx_ == (2, 3), efs1.best_idx_
    assert efs1.best_feature_names_ == ('petal length', 'petal width')
Example 27
def test_check_pandas_dataframe_transform():
    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    df = pd.DataFrame(X, columns=['sepal length', 'sepal width',
                                  'petal length', 'petal width'])
    efs1 = efs1.fit(df, y)
    assert efs1.best_idx_ == (2, 3)
    assert (150, 2) == efs1.transform(df).shape
Example 28
def wrapper_selection():
    print('--------------------------------------------------------')
    print('Using the wrapper technique...')
    models = [
        svm.SVC(),
        RandomForestClassifier(),
        GaussianNB(),
        LogisticRegression(),
        KNeighborsClassifier()
    ]
    for model in models:
        efs = ExhaustiveFeatureSelector(model,
                                        min_features=1,
                                        max_features=5,
                                        scoring='accuracy',
                                        cv=5)
        efs = efs.fit(data, labels)
        selected_features = columns[list(efs.best_idx_)]
        print(
            f'Selected features using {model}: {selected_features}')
Example 29
def wrapper(x_train_df):
    x_train = x_train_df.drop(["id", "failed test"], axis=1)
    y_train = x_train_df["failed test"]
    feature_selector = EFS(RandomForestClassifier(max_depth=17,
                                                  n_estimators=136,
                                                  max_features=0.307,
                                                  min_samples_split=30,
                                                  random_state=42),
                           min_features=6,
                           max_features=7,
                           scoring='neg_log_loss',
                           print_progress=True,
                           n_jobs=1,
                           cv=5)
    features = feature_selector.fit(x_train, y_train)
    print('Best score (neg. log loss): %.2f' % feature_selector.best_score_)
    print('Best subset (indices):', feature_selector.best_idx_)
    print('Best subset (corresponding names):',
          feature_selector.best_feature_names_)
    print('Subsets_: ', feature_selector.subsets_)
Example 30
def exhaustive_feature_selection(x_train, y_train, model=None, num_features=[2, 5], classification_tasks=True,
                                 scoring=None):
    print("============== Exhaustive feature selection ===================")
    if not model:
        if classification_tasks:
            model = LogisticRegression(multi_class='multinomial',
                                       solver='lbfgs',
                                       random_state=123)
        else:
            model = Ridge()

    if not scoring:
        if classification_tasks:
            scoring = "accuracy"
        else:
            scoring = "neg_mean_absolute_error"

    efs = EFS(estimator=model,
              min_features=num_features[0],
              max_features=num_features[1],
              scoring=scoring,
              print_progress=False,
              clone_estimator=False,
              cv=10,
              n_jobs=2)

    efs = efs.fit(x_train.values, y_train.values)
    print('Best %s score: %.2f' % (scoring, efs.best_score_))
    col_list = []
    col_list.extend(efs.best_idx_)
    col_names = x_train.columns
    print('Best subset:', col_names[col_list].values)
    x_train = x_train.iloc[:, col_list]

    print("=================================")
    return x_train
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    efs1 = EFS(forest,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y, sample_weight=sample_weight)
    expect = {0: {'feature_idx': (0, 1, 2),
                  'feature_names': ('0', '1', '2'),
                  'cv_scores': np.array([0.94871795, 0.92307692,
                                         0.91666667, 0.97222222]),
                  'avg_score': 0.9401709401709402},
              1: {'feature_idx': (0, 1, 3),
                  'feature_names': ('0', '1', '3'),
                  'cv_scores': np.array([0.92307692, 0.92307692,
                                         0.88888889, 1.]),
                  'avg_score': 0.9337606837606838},
              2: {'feature_idx': (0, 2, 3),
                  'feature_names': ('0', '2', '3'),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.94444444, 0.97222222]),
                  'avg_score': 0.9599358974358974},
              3: {'feature_idx': (1, 2, 3),
                  'feature_names': ('1', '2', '3'),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.91666667, 1.]),
                  'avg_score': 0.9599358974358974}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (0, 2, 3)
    assert round(efs1.best_score_, 4) == 0.9599
Example 32
        #print(df)
        test_cols = these_choices['feature'].values
        print(test_cols)
        efs = EFS(
            estimator=rfc,
            min_features=3,
            max_features=max_len,
            print_progress=False,
            scoring='accuracy',
            n_jobs=15,
            cv=4,
        )

        start_time = time.time()
        try:
            efs = efs.fit(X_train[test_cols], y_train)
        except Exception:
            continue
        end_time = time.time()
        #print()
        #print(feature_choices, end_time - start_time)
        best_features = list(efs.best_feature_names_)
        best_score = efs.best_score_

        print(feature_choices, max_len, 'time', end_time - start_time, 'score',
              best_score, 'features', len(best_features), best_features)

        #print()

        rfc.fit(X_train[best_features], y_train)
        #y_pred = test_pipe.predict(X_test[best_features])
Example 33
        'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Gender_Male',
        'Married_Yes', 'Dependents_1', 'Dependents_2', 'Dependents_3+',
        'Education_Not Graduate', 'Self_Employed_Yes'
    ])
# Target class
Y = pd.DataFrame(newdata['Loan_Status'])
#Visualization
sns.pairplot(df1, hue="Loan_Status")
#Splitting data into train and test samples
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
#Running ExhaustiveFeatureSelector() for feature_selection on 3 different classifiers
efs = ExhaustiveFeatureSelector(RandomForestClassifier(),
                                max_features=6,
                                scoring='roc_auc',
                                cv=5)
efs_fit = efs.fit(X_train, Y_train)
selected_features = X_train.columns[list(efs_fit.best_idx_)]
print(selected_features)
print(efs_fit.best_score_)
rClassifier = RandomForestClassifier(random_state=0)
rClassifier.fit(X_train[selected_features], Y_train)
Y_RCF = rClassifier.predict(X_test[selected_features])
print(classification_report(Y_test, Y_RCF))
efs_naive = ExhaustiveFeatureSelector(GaussianNB(),
                                      max_features=6,
                                      scoring='roc_auc',
                                      cv=4)
efs_naive_fit = efs_naive.fit(X_train, Y_train)
selected_features_naive = X_train.columns[list(efs_naive_fit.best_idx_)]
print(selected_features_naive)
print(efs_naive_fit.best_score_)
Example 34
                 
corr_features = correlation(X_train, 0.8)
print('correlated features:', len(set(corr_features)))

X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

efs1 = EFS(RandomForestClassifier(n_jobs=4, random_state=0),
           min_features=1,
           max_features=4,
           scoring='roc_auc',
           print_progress=True,
           cv=2)

efs1 = efs1.fit(np.array(X_train[X_train.columns[0:4]].fillna(0)), y_train)
select_feat = X_train.columns[list(efs1.best_idx_)]
select_feat


def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc_auc: {}'.format(roc_auc_score(y_train, pred[:, 1])))
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc_auc: {}'.format(roc_auc_score(y_test, pred[:, 1])))


run_randomForests(X_train[select_feat].fillna(0), X_test[select_feat].fillna(0), y_train, y_test)
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:  # 0.8 is the correlation threshold
            column_name = correlation_matrix.columns[i]
            correlated_features.add(column_name)

train_features.drop(labels=correlated_features, axis=1, inplace=True)
test_features.drop(labels=correlated_features, axis=1, inplace=True)

feature_selector = ExhaustiveFeatureSelector(RandomForestClassifier(n_jobs=-1),
                                             min_features=2,
                                             max_features=4,
                                             scoring='roc_auc',
                                             print_progress=True,
                                             cv=2)
features = feature_selector.fit(np.array(train_features.fillna(0)),
                                train_labels)

filtered_features = train_features.columns[list(features.best_idx_)]

clf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)
clf.fit(train_features[filtered_features].fillna(0), train_labels)

train_pred = clf.predict_proba(train_features[filtered_features].fillna(0))
print('ROC AUC on training set: {}'.format(
    roc_auc_score(train_labels, train_pred[:, 1])))

test_pred = clf.predict_proba(test_features[filtered_features].fillna(0))
print('ROC AUC on test set: {}'.format(
    roc_auc_score(test_labels, test_pred[:, 1])))
This method searches across all possible feature combinations.
Its aim is to find the best performing feature subset.
"""
# import the algorithm you want to evaluate on your features.
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.ensemble import RandomForestClassifier

# create the ExhaustiveFeatureSelector object.
efs = ExhaustiveFeatureSelector(RandomForestClassifier(),
                                min_features=45,
                                max_features=70,
                                scoring='accuracy',
                                cv=2)

# fit the object to the training data.
efs = efs.fit(x, y)

# print the selected features.
selected_features1 = x.columns[list(efs.best_idx_)]
print('selected features from exhaustive selection:', selected_features1)

# print the final prediction score.
print('accuracy:', efs.best_score_)

# transform to the newly selected features.
#X_train = efs.transform(X_train)
#X_test = efs.transform(X_test)
"""
FORWARD FEATURE SELECTION:
an iterative method in which we start by evaluating all features individually
and then select the one that results in the best performance (see the sketch below).
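A minimal sketch of that forward procedure (an illustration, not taken from any snippet above), assuming mlxtend's SequentialFeatureSelector and the iris data used throughout these examples:

# Forward selection sketch: start from an empty set and greedily add the
# best-scoring feature each round until k_features are selected.
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
knn = KNeighborsClassifier(n_neighbors=4)

sfs_forward = SFS(knn,
                  k_features=3,
                  forward=True,        # forward (additive) search
                  floating=False,
                  scoring='accuracy',
                  cv=5)
sfs_forward = sfs_forward.fit(iris.data, iris.target)
print('Selected feature indices:', sfs_forward.k_feature_idx_)
print('CV accuracy: %.4f' % sfs_forward.k_score_)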
Example 37
    print("The selected feature list:")
    print(feat_cols)
elif (choice == 2):
    sfs1 = sfs(clf,
               k_features=4,
               forward=False,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=5)
    # Perform SFFS
    sfs1 = sfs1.fit(X_train, y_train)

    feat_cols = list(sfs1.k_feature_idx_)
    print("******")
    print("The selected feature list:")
    print(feat_cols)
elif (choice == 3):
    efs1 = EFS(knn,
               min_features=4,
               max_features=5,
               scoring='accuracy',
               print_progress=True,
               cv=5)
    efs1 = efs1.fit(X_train, y_train)
    feat_cols = list(efs1.best_idx_)
    print("******")
    print("The selected feature list:")
    print(feat_cols)
else:
    print("Wrong Input")
Example 38
iris = load_iris()
x = pd.DataFrame(iris.data,
                 columns=iris.feature_names)

#%% create a logistic regression object
lr = LogisticRegression()

#%% create an EFS object
efs = EFS(estimator=lr,        
          min_features=1,      
          max_features=3,      
          scoring='accuracy',  
          cv=5)

#%% fit the model
efs = efs.fit(x, iris.target)

#%% show the selected features
efs.best_feature_names_
# console output:
# ('sepal length (cm)', 'petal length (cm)', 
# 'petal width (cm)')

#%% show a full report on the feature selection
efs_results = (pd.DataFrame(efs.get_metric_dict())
               .T
               .sort_values(by='avg_score', ascending=False))

#%% show feature importance visually
# create figure and axes
fig, ax = plt.subplots()
Example 39
                         usecols=list(range(969)),
                         chunksize=100000,
                         dtype=np.float32)
X = pd.concat([
    pd.concat([dchunk, nchunk], axis=1).sample(frac=0.05)
    for dchunk, nchunk in zip(date_chunks, num_chunks)
])
y = pd.read_csv("train_numeric.csv",
                index_col=0,
                usecols=[0, 969],
                dtype=np.float32).loc[X.index].values.ravel()
X = X.values
model = XGBClassifier()
efs1 = EFS(model, min_features=100, max_features=900, scoring='accuracy', cv=5)

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
important_indices = efs1.best_idx_

# Got important_indices from above code
#important_indices = []
print("Found important features %s" % important_indices)
# load entire dataset for these features.
# note where the feature indices are split so we can load the correct ones straight from read_csv
n_date_features = 1156
X = np.concatenate([
    pd.read_csv("train_date.csv",
                index_col=0,
                dtype=np.float32,
                usecols=np.concatenate([[