Code example #1
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    efs1 = EFS(forest,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y, sample_weight=sample_weight)
    expect = {0: {'feature_idx': (0, 1, 2),
                  'cv_scores': np.array([0.94871795, 0.92307692,
                                         0.91666667, 0.97222222]),
                  'avg_score': 0.9401709401709402},
              1: {'feature_idx': (0, 1, 3),
                  'cv_scores': np.array([0.92307692, 0.92307692,
                                         0.88888889, 1.]),
                  'avg_score': 0.9337606837606838},
              2: {'feature_idx': (0, 2, 3),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.94444444, 0.97222222]),
                  'avg_score': 0.9599358974358974},
              3: {'feature_idx': (1, 2, 3),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.91666667, 1.]),
                  'avg_score': 0.9599358974358974}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (0, 2, 3)
    assert round(efs1.best_score_, 4) == 0.9599
Code example #2
def test_knn_cv3_groups():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=GroupKFold(n_splits=3),
               print_progress=False)
    np.random.seed(1630672634)
    groups = np.random.randint(0, 6, size=len(y))
    efs1 = efs1.fit(X, y, groups=groups)

    expect = {0: {'cv_scores': np.array([0.97916667, 0.93877551, 0.9245283]),
                  'feature_idx': (0, 1, 2),
                  'avg_score': 0.9474901595858469,
                  'feature_names': ('0', '1', '2')},
              1: {'cv_scores': np.array([1., 0.93877551, 0.9245283]),
                  'feature_idx': (0, 1, 3),
                  'avg_score': 0.9544346040302915,
                  'feature_names': ('0', '1', '3')},
              2: {'cv_scores': np.array([0.97916667, 0.95918367, 0.9245283]),
                  'feature_idx': (0, 2, 3),
                  'avg_score': 0.9542928806742822,
                  'feature_names': ('0', '2', '3')},
              3: {'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
                  'feature_idx': (1, 2, 3),
                  'avg_score': 0.9605821888503829,
                  'feature_names': ('1', '2', '3')}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
Code example #3
    def exhaustive_feature_selection(X: dt.Frame = None):
        if X is None:
            return []
        # X[:, 'default payment next month leak'] = X[:, 'default payment next month']
        datadf = X.to_pandas()
        data_y = datadf['default payment next month']
        data_X = datadf.iloc[:, :datadf.shape[1] - 1]  # all columns except the target
        XX = data_X
        y = np.ravel(data_y)
        #
        knn = KNeighborsClassifier(n_neighbors=3)

        efs1 = EFS(knn,
                   min_features=5,
                   max_features=10,
                   scoring='accuracy',
                   print_progress=True,
                   cv=5)

        efs1 = efs1.fit(XX, y)
        support = efs1.best_feature_names_
        feat_list = list(support)
        # get the new features
        col_names_to_pick = feat_list + ['default payment next month']
        new_df = datadf[col_names_to_pick]
        new_dt = dt.Frame(new_df)
        return new_dt
Code example #4
def test_check_pandas_dataframe_fit():

    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    df = pd.DataFrame(
        X,
        columns=['sepal length', 'sepal width', 'petal length', 'petal width'])

    efs1 = efs1.fit(X, y)
    assert efs1.best_idx_ == (2, 3), efs1.best_idx_
    assert efs1.best_feature_names_ == ('2', '3')
    assert efs1.interrupted_ is False

    efs1._TESTING_INTERRUPT_MODE = True
    efs1 = efs1.fit(df, y)
    assert efs1.best_idx_ == (0, 1), efs1.best_idx_
    assert efs1.best_feature_names_ == ('sepal length', 'sepal width')
    assert efs1.interrupted_ is True
Code example #5
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {0: {'avg_score': 0.9391025641025641,
                  'feature_idx': (0, 1, 2),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.88888889, 0.94444444])},
              1: {'avg_score': 0.94017094017094016,
                  'feature_idx': (0, 1, 3),
                  'cv_scores': np.array([0.92307692, 0.94871795,
                                         0.91666667, 0.97222222])},
              2: {'avg_score': 0.95299145299145294,
                  'feature_idx': (0, 2, 3),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.91666667, 0.97222222])},
              3: {'avg_score': 0.97275641025641035,
                  'feature_idx': (1, 2, 3),
                  'cv_scores': np.array([0.97435897, 1.,
                                         0.94444444, 0.97222222])}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (1, 2, 3)
    assert round(efs1.best_score_, 4) == 0.9728
Code example #6
    def __init__(self,
                 estimator=LGBMClassifier(n_jobs=30),
                 scoring='roc_auc',
                 selector_name='rfe',
                 cv_worker=1,
                 step=1):
        """

        :param estimator:
        :param scoring:
        :param selector_name: {'sfs', 'efs', 'rfe'}  # rfe can do a coarse first pass and can be GPU-accelerated
        :param cv_worker: must be set to 1 when running on a GPU
        :param step:
        """
        self.selector_name = selector_name
        if selector_name == 'sfs':
            """
            An optimized (greedy) alternative to efs: selects features according to `scoring`.
            Sequential feature selection algorithms are a family of greedy search
            algorithms that reduce an initial d-dimensional feature space to a
            k-dimensional subspace, where k < d. The motivation is to automatically
            select the subset of features most relevant to the problem; the goal is
            twofold: improve computational efficiency and reduce the model's
            generalization error by removing irrelevant features or noise.
            Wrapper methods such as sequential feature selection are especially
            useful when embedded feature selection (e.g., a regularization penalty
            like LASSO) is not applicable.
            """
            self.selector = SFS(
                estimator,
                scoring=scoring,
                cv=5,
                n_jobs=cv_worker,
                verbose=2,
                k_features='best',
                forward=True,
                floating=False,
            )
        elif selector_name == 'efs':  # exhaustive enumeration: up to 2^n subsets
            self.selector = EFS(estimator,
                                scoring=scoring,
                                cv=5,
                                n_jobs=cv_worker,
                                print_progress=True,
                                max_features=1000)
        elif selector_name == 'rfe':  # select features by weights such as tree-model feature importances
            """https://www.jianshu.com/p/025395835591
            # print the rank assigned to the attribute at each position
            print(rfe.ranking_)
            # boolean mask of the selection: selected attributes are True, the rest False
            print(rfe.support_)
            # rank of the attribute at index 1
            print(rfe.ranking_[1])
            # information about the fitted external estimator
            print(rfe.estimator_)
            """
            """https://www.kaggle.com/roydatascience/recursive-feature-selection-new-transactions-elo"""
            self.selector = RFECV(
                estimator,
                scoring=scoring,
                cv=5,
                n_jobs=cv_worker,
                verbose=2,
                step=step,  # number (or fraction) of features to remove at each iteration
            )
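
The `# exhaustive enumeration: up to 2^n subsets` comment in the efs branch above is worth making concrete: EFS fits one cross-validated model for every candidate subset, and the candidate count grows combinatorially with the number of features. A minimal sketch (the helper n_efs_candidates is illustrative, not part of the class above):

from math import comb

def n_efs_candidates(n_features, min_features, max_features):
    # number of feature subsets an exhaustive search must evaluate:
    # the sum of C(n, k) for k in [min_features, max_features]
    return sum(comb(n_features, k)
               for k in range(min_features, max_features + 1))

print(n_efs_candidates(20, 1, 20))  # 1048575, i.e. 2**20 - 1

With the full size range this approaches 2^n model fits (times the number of cv folds), which is why the greedy 'sfs' branch is described above as the practical alternative for larger feature spaces.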
Code example #7
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    efs1 = EFS(forest,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y, sample_weight=sample_weight)
    expect = {
        0: {
            'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores': np.array([0.947, 0.868, 0.919, 0.973]),
            'avg_score': 0.9269203413940257
        },
        1: {
            'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores': np.array([0.921, 0.921, 0.892, 1.]),
            'avg_score': 0.9337606837606838
        },
        2: {
            'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.919, 0.973]),
            'avg_score': 0.9532361308677098
        },
        3: {
            'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.892, 1.]),
            'avg_score': 0.9532361308677098
        }
    }

    if Version(sklearn_version) < Version("0.22"):
        expect[0]['avg_score'] = 0.9401709401709402
        expect[0]['cv_scores'] = np.array(
            [0.94871795, 0.92307692, 0.91666667, 0.97222222])
        expect[1]['cv_scores'] = np.array(
            [0.94871795, 0.92307692, 0.91666667, 0.97222222])
        expect[2]['cv_scores'] = np.array(
            [0.94871795, 0.92307692, 0.91666667, 0.97222222])
        expect[2]['avg_score'] = 0.9599358974358974
        expect[3]['avg_score'] = 0.9599358974358974
        expect[3]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.91666667, 1.])
        assert round(efs1.best_score_, 4) == 0.9599

    else:
        assert round(efs1.best_score_, 4) == 0.9532

    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (0, 2, 3)
Code example #8
def select_EFS(tr, num_feat=100):
    X = tr
    y = [i for i in range(9) for j in range(60)]  # labels: 9 classes, 60 samples each
    knn = KNeighborsClassifier(n_neighbors=1)
    efs = EFS(knn, min_features=num_feat, max_features=num_feat, cv=5, n_jobs=4)
    efs.fit(X, y)
    out = efs.best_idx_
    return out
Code example #9
def test_maxfeatures_1():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier()

    efs = EFS(estimator=knn, min_features=1, max_features=0)
    expect = ('max_features must be smaller than 5 and larger than 0')
    assert_raises(AttributeError, expect, efs.fit, X, y)
Code example #10
def test_minmaxfeatures_1():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier()

    efs = EFS(estimator=knn, min_features=3, max_features=2)
    expect = ('min_features must be <= max_features')
    assert_raises(AttributeError, expect, efs.fit, X, y)
Code example #11
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {
        0: {
            'avg_score': 0.9391025641025641,
            'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores': np.array([0.974, 0.947, 0.892, 0.946])
        },
        1: {
            'avg_score': 0.9400782361308677,
            'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores': np.array([0.921, 0.947, 0.919, 0.973])
        },
        2: {
            'avg_score': 0.95299145299145294,
            'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.919, 0.973])
        },
        3: {
            'avg_score': 0.97275641025641035,
            'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.974, 1., 0.946, 0.973])
        }
    }

    if Version(sklearn_version) < Version("0.22"):
        expect[0]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.88888889, 0.94444444])
        expect[1]['cv_scores'] = np.array(
            [0.92307692, 0.94871795, 0.91666667, 0.97222222])
        expect[2]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.91666667, 0.97222222])
        expect[3]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.91666667, 0.97222222])
        expect[1]['avg_score'] = 0.94017094017094016
        assert round(efs1.best_score_, 4) == 0.9728
    else:
        assert round(efs1.best_score_, 4) == 0.9732

    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (1, 2, 3)
    assert efs1.best_feature_names_ == ('1', '2', '3')
Code example #12
def test_knn_wo_cv():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=2,
               max_features=3,
               scoring='accuracy',
               cv=0,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {0: {'feature_idx': (0, 1),
                  'feature_names': ('0', '1'),
                  'avg_score': 0.82666666666666666,
                  'cv_scores': np.array([0.82666667])},
              1: {'feature_idx': (0, 2),
                  'feature_names': ('0', '2'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              2: {'feature_idx': (0, 3),
                  'feature_names': ('0', '3'),
                  'avg_score': 0.96666666666666667,
                  'cv_scores': np.array([0.96666667])},
              3: {'feature_idx': (1, 2),
                  'feature_names': ('1', '2'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              4: {'feature_idx': (1, 3),
                  'feature_names': ('1', '3'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              5: {'feature_idx': (2, 3),
                  'feature_names': ('2', '3'),
                  'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333])},
              6: {'feature_idx': (0, 1, 2),
                  'feature_names': ('0', '1', '2'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              7: {'feature_idx': (0, 1, 3),
                  'feature_names': ('0', '1', '3'),
                  'avg_score': 0.96666666666666667,
                  'cv_scores': np.array([0.96666667])},
              8: {'feature_idx': (0, 2, 3),
                  'feature_names': ('0', '2', '3'),
                  'avg_score': 0.96666666666666667,
                  'cv_scores': np.array([0.96666667])},
              9: {'feature_idx': (1, 2, 3),
                  'feature_names': ('1', '2', '3'),
                  'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333])}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
Code example #13
    def brute_force(self, X, y, y_type):
        # choose an estimator appropriate for the target type
        # (the original swapped these: LinearRegression cannot serve a binary
        # target here, and LogisticRegression fails on continuous targets)
        if y_type == "binary":
            est = LogisticRegression()
        else:
            est = LinearRegression()

        efs = EFS(
            estimator=est,
            min_features=1,
            max_features=2,
            scoring="neg_mean_squared_error",
            cv=5,
        )

        efs = efs.fit(X, y)
        efs_df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
        efs_df.sort_values("avg_score", inplace=True, ascending=False)

        # horizontal bar chart
        fig, ax = plt.subplots(figsize=(12, 9))
        y_pos = np.arange(len(efs_df))
        ax.barh(y_pos, efs_df["avg_score"], xerr=efs_df["std_dev"])
        ax.set_yticks(y_pos)
        ax.set_xlabel("Avg Score")
        ax.set_ylabel("Feature Names")
        ax.tick_params(labelleft=False)
        plt.show()

        return efs_df
Code example #14
def test_regression():
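    # note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
    # so this test assumes an older scikit-learn release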
    boston = load_boston()
    X, y = boston.data[:, [1, 2, 6, 8, 12]], boston.target
    lr = LinearRegression()
    efs_r = EFS(lr,
                min_features=3,
                max_features=4,
                scoring='neg_mean_squared_error',
                cv=10,
                print_progress=False)
    efs_r = efs_r.fit(X, y)
    assert efs_r.best_idx_ == (0, 2, 4)
    assert round(efs_r.best_score_, 4) == -40.8777
Code example #15
def test_get_metric_dict_not_fitted():
    knn = KNeighborsClassifier(n_neighbors=4)

    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    expect = 'ExhaustiveFeatureSelector has not been fitted, yet.'

    assert_raises(AttributeError, expect, efs1.get_metric_dict)
Code example #16
def test_clone_params_pass():
    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    efs1 = EFS(lr,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)
    efs1 = efs1.fit(X, y)
    assert(efs1.best_idx_ == (1, 3))
Code example #17
def test_transform_not_fitted():
    iris = load_iris()
    X = iris.data
    knn = KNeighborsClassifier(n_neighbors=4)

    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    expect = 'ExhaustiveFeatureSelector has not been fitted, yet.'

    assert_raises(AttributeError, expect, efs1.transform, X)
Code example #18
def test_fit_transform():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)

    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    X_t = efs1.fit_transform(X, y)
    assert X_t.shape == (150, 2)
Code example #19
def ExhaustiveFeatureSelector(X, y, min_features=1, max_features=4):
    from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
    from sklearn.linear_model import LinearRegression
    lr = LinearRegression()

    efs1 = EFS(lr,
               min_features=min_features,
               max_features=max_features,
               scoring='r2',
               print_progress=True,
               cv=5)

    efs1 = efs1.fit(X, y)

    #print('Best subset:', efs1.best_idx_)
    print('Best subset (corresponding names):', efs1.best_feature_names_)
    print('Best R² score: %.2f' % efs1.best_score_)
    return efs1.best_feature_names_, efs1.best_score_
Code example #20
def test_custom_feature_names():
    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    efs1 = efs1.fit(X, y, custom_feature_names=(
          'sepal length', 'sepal width', 'petal length', 'petal width'))
    assert efs1.best_idx_ == (2, 3), efs1.best_idx_
    assert efs1.best_feature_names_ == ('petal length', 'petal width')
Code example #21
def perform_efs(curr_model, X, y, min_cols, max_cols):

    efs1 = EFS(curr_model,
               min_features=min_cols,
               max_features=max_cols,
               print_progress=True,
               scoring='accuracy',
               cv=5,
               n_jobs=-1)

    efs1 = efs1.fit(X, y)

    df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
    #    df['test_acc'] = df['feature_idx'].apply(
    #        lambda x: make_predictions_on_test(efs1, curr_model, X_train, X_test, y_train, y_test, x)
    #    )

    return df
Code example #22
def test_check_pandas_dataframe_transform():
    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    df = pd.DataFrame(X, columns=['sepal length', 'sepal width',
                                  'petal length', 'petal width'])
    efs1 = efs1.fit(df, y)
    assert efs1.best_idx_ == (2, 3)
    assert (150, 2) == efs1.transform(df).shape
Code example #23
def wrapper(x_train_df):
    x_train = x_train_df.drop(["id", "failed test"], axis=1)
    y_train = x_train_df["failed test"]
    feature_selector = EFS(RandomForestClassifier(max_depth=17,
                                                  n_estimators=136,
                                                  max_features=0.307,
                                                  min_samples_split=30,
                                                  random_state=42),
                           min_features=6,
                           max_features=7,
                           scoring='neg_log_loss',
                           print_progress=True,
                           n_jobs=1,
                           cv=5)
    feature_selector = feature_selector.fit(x_train, y_train)
    print('Best score (neg. log loss): %.2f' % feature_selector.best_score_)
    print('Best subset (indices):', feature_selector.best_idx_)
    print('Best subset (corresponding names):',
          feature_selector.best_feature_names_)
    print('Subsets_: ', feature_selector.subsets_)
Code example #24
def exhaustive_feature_selection(x_data, y_data, min_feat, max_feat):
    print(f"Applying exhaustive feature selection to numeric data")
    print(
        f"cat variables before backward feature selection {x_data.select_dtypes(include='object').columns.shape}"
    )
    print(
        f"numeric variables before backward feature selection {x_data.select_dtypes(include='number').columns.shape}"
    )

    numeric_cols = x_data.select_dtypes(include='number').columns

    temp = x_data[numeric_cols]

    efs = EFS(RandomForestRegressor(n_jobs=4),
              max_features=max_feat,
              min_features=min_feat,
              scoring='r2',
              print_progress=True,
              cv=2)

    efs.fit(temp, y_data)

    idx = efs.best_idx_

    print(idx)

    idx = list(idx)

    cols_to_keep = numeric_cols[idx]  # idx indexes into the numeric-only frame
    cols_to_drop = [x for x in numeric_cols if x not in cols_to_keep]

    print(len(cols_to_drop))

    x_data.drop(labels=cols_to_drop, axis=1, inplace=True)
    print(
        f"cat variables after exhaustive feature selection {x_data.select_dtypes(include='object').columns}"
    )
    print(
        f"numeric variables after exhaustive feature selection {x_data.select_dtypes(include='number').columns}"
    )
    return x_data
Code example #25
    def create_data(X: dt.Frame = None) -> pd.DataFrame:
        if X is None:
            return []

        from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

        X = X.to_pandas()
        y = X[TARGET_COLUMN].values
        X.drop(TARGET_COLUMN, axis=1, inplace=True)

        efs = EFS(ESTIMATOR,
                  min_features=MIN_FEATURES,
                  max_features=MAX_FEATURES,
                  scoring=SCORING,
                  cv=CV,
                  n_jobs=-1)

        efs.fit(X, y)

        X_fs = X.iloc[:, list(efs.best_idx_)]

        return X_fs
Code example #26
def exhaustive_feature_selection(x_train, y_train, model=None, num_features=[2, 5], classification_tasks=True,
                                 scoring=None):
    print("============== Exhaustive feature selection ===================")
    if not model:
        if classification_tasks:
            model = LogisticRegression(multi_class='multinomial',
                                       solver='lbfgs',
                                       random_state=123)
        else:
            model = Ridge()

    if not scoring:
        if classification_tasks:
            scoring = "accuracy"
        else:
            scoring = "neg_mean_absolute_error"

    efs = EFS(estimator=model,
              min_features=num_features[0],
              max_features=num_features[1],
              scoring=scoring,
              print_progress=False,
              clone_estimator=False,
              cv=10,
              n_jobs=2)

    efs = efs.fit(x_train.values, y_train.values)
    print('Best %s score: %.2f' % (scoring, efs.best_score_))
    col_list = []
    col_list.extend(efs.best_idx_)
    col_names = x_train.columns
    print('Best subset:', col_names[col_list].values)
    x_train = x_train.iloc[:,col_list]

    print("=================================")
    return x_train
Code example #27
df = df.sort_values(by=['importances'])
print('\n\n')

for feature_choices in [10, 20, 30, 40, 50]:
    for max_len in [5, 10]:

        these_choices = df.tail(feature_choices)
        #print(these_choices)
        #print(df)
        test_cols = these_choices['feature'].values
        print(test_cols)
        efs = EFS(
            estimator=rfc,
            min_features=3,
            max_features=max_len,
            print_progress=False,
            scoring='accuracy',
            n_jobs=15,
            cv=4,
        )

        start_time = time.time()
        try:
            efs = efs.fit(X_train[test_cols], y_train)
        except Exception:
            # skip feature-set sizes that fail to fit
            continue
        end_time = time.time()
        #print()
        #print(feature_choices, end_time - start_time)
        best_features = list(efs.best_feature_names_)
        best_score = efs.best_score_
Code example #28
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import pandas as pd
import matplotlib.pyplot as plt

#%% load sample data
iris = load_iris()
x = pd.DataFrame(iris.data, \
    columns=iris.feature_names)

#%% create a logistic regression object
lr = LogisticRegression()

#%% create an EFS object
efs = EFS(estimator=lr,        
          min_features=1,      
          max_features=3,      
          scoring='accuracy',  
          cv=5)

#%% fit the model
efs = efs.fit(x, iris.target)

#%% show the selected features
efs.best_feature_names_
# console output:
# ('sepal length (cm)', 'petal length (cm)', 
# 'petal width (cm)')

#%% show a full report on the feature selection
efs_results = pd.DataFrame(efs.get_metric_dict()).\
    T.\
    sort_values('avg_score', ascending=False)
Code example #29
# plt.show()
# print(p_values[p_values<0.05])
#######################################
##################### Chi-squared test p-values
chi_scores = chi2(X_train, y_train)
p_values = pd.Series(chi_scores[1])
p_values.sort_values(ascending=True, inplace=True)
# print("P-values",p_values)
p_values.plot.bar()
# plt.show()
###########################
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
efs = EFS(RandomForestClassifier(n_jobs=2, random_state=0),
          min_features=1,
          max_features=6,
          scoring='accuracy',
          cv=None,
          n_jobs=-1).fit(X_train, y_train)
# sfs=SFS(RandomForestClassifier(n_jobs=2,random_state=1),k_features=5,forward=True,floating=False,verbose=2,scoring='accuracy',cv=None,n_jobs=-1).fit(X_train,y_train)
# print(sfs.k_feature_names_)
# print(sfs.k_score_)
# x=df[['TEMP','DO','PH','CONDUCTIVITY','BOD','NITRATE']].values
x = df[['TEMP', 'CONDUCTIVITY', 'BOD']].values
y = df["CLASS"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)
print(efs.best_score_)
print(efs.best_feature_names_)
Code example #30
File: HyperEFS.py  Project: Aniruddha25/sklearn1
num_chunks = pd.read_csv("train_numeric.csv",
                         index_col=0,
                         usecols=list(range(969)),
                         chunksize=100000,
                         dtype=np.float32)
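# date_chunks is assumed to be an analogous chunked reader over
# "train_date.csv"; its definition is not shown in this excerpt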
X = pd.concat([
    pd.concat([dchunk, nchunk], axis=1).sample(frac=0.05)
    for dchunk, nchunk in zip(date_chunks, num_chunks)
])
y = pd.read_csv("train_numeric.csv",
                index_col=0,
                usecols=[0, 969],
                dtype=np.float32).loc[X.index].values.ravel()
X = X.values
model = XGBClassifier()
efs1 = EFS(model, min_features=100, max_features=900, scoring='accuracy', cv=5)

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
important_indices = efs1.best_idx_

# Got important_indices from above code
#important_indices = []
print("Found important features %s" % important_indices)
# load entire dataset for these features.
# note where the feature indices are split so we can load the correct ones straight from read_csv
n_date_features = 1156
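# A hedged sketch of the split described above (date_idx/num_idx are
# illustrative names, not from the original file): date columns come first
# in X, so indices below n_date_features map to train_date.csv and the
# remainder, shifted down, to train_numeric.csv.
date_idx = [i for i in important_indices if i < n_date_features]
num_idx = [i - n_date_features for i in important_indices
           if i >= n_date_features]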
X = np.concatenate([
    pd.read_csv("train_date.csv",
                index_col=0,