def test_fit_params():
    """Run EFS with a ``sample_weight`` fit parameter and check every subset.

    A random forest is evaluated on all 3-feature subsets of iris with
    4-fold CV; uniform sample weights are handed to ``fit`` as an extra
    keyword. The recorded subsets are compared against reference scores.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target
    weights = np.ones(features.shape[0])

    clf = RandomForestClassifier(n_estimators=100, random_state=123)
    selector = EFS(clf,
                   min_features=3,
                   max_features=3,
                   scoring='accuracy',
                   cv=4,
                   print_progress=False)
    selector = selector.fit(features, labels, sample_weight=weights)

    # One row per subset: (feature indices, per-fold scores, mean score)
    reference = [
        ((0, 1, 2),
         [0.94871795, 0.92307692, 0.91666667, 0.97222222],
         0.9401709401709402),
        ((0, 1, 3),
         [0.92307692, 0.92307692, 0.88888889, 1.],
         0.9337606837606838),
        ((0, 2, 3),
         [0.97435897, 0.94871795, 0.94444444, 0.97222222],
         0.9599358974358974),
        ((1, 2, 3),
         [0.97435897, 0.94871795, 0.91666667, 1.],
         0.9599358974358974),
    ]
    expect = {
        position: {'feature_idx': subset,
                   'cv_scores': np.array(fold_scores),
                   'avg_score': mean_score}
        for position, (subset, fold_scores, mean_score)
        in enumerate(reference)
    }

    dict_compare_utility(d1=expect, d2=selector.subsets_)
    assert selector.best_idx_ == (0, 2, 3)
    assert round(selector.best_score_, 4) == 0.9599
def test_knn_cv3_groups():
    """Run EFS with a ``GroupKFold`` splitter and a ``groups`` fit argument.

    Group labels are drawn from a seeded NumPy RNG so the three folds are
    reproducible; every 3-feature subset is then checked against reference
    scores.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    selector = EFS(KNeighborsClassifier(n_neighbors=4),
                   min_features=3,
                   max_features=3,
                   scoring='accuracy',
                   cv=GroupKFold(n_splits=3),
                   print_progress=False)

    # Fixed seed -> identical group assignment on every run
    np.random.seed(1630672634)
    group_ids = np.random.randint(0, 6, size=len(labels))
    selector = selector.fit(features, labels, groups=group_ids)

    # One row per subset: (feature indices, per-fold scores, mean score)
    reference = [
        ((0, 1, 2), [0.97916667, 0.93877551, 0.9245283],
         0.9474901595858469),
        ((0, 1, 3), [1., 0.93877551, 0.9245283],
         0.9544346040302915),
        ((0, 2, 3), [0.97916667, 0.95918367, 0.9245283],
         0.9542928806742822),
        ((1, 2, 3), [0.97916667, 0.95918367, 0.94339623],
         0.9605821888503829),
    ]
    expect = {}
    for position, (subset, fold_scores, mean_score) in enumerate(reference):
        expect[position] = {
            'cv_scores': np.array(fold_scores),
            'feature_idx': subset,
            'avg_score': mean_score,
            # Expected names are the stringified column indices
            'feature_names': tuple(str(col) for col in subset),
        }

    dict_compare_utility(d1=expect, d2=selector.subsets_)
def exhaustive_feature_selection(X: dt.Frame = None):
    """Keep only the feature subset chosen by exhaustive feature selection.

    The frame is converted to pandas, a 3-nearest-neighbour classifier is
    scored (accuracy, 5-fold CV) on every subset of 5 to 10 features, and a
    new datatable frame holding the winning columns plus the target column
    is returned.

    :param X: input frame containing the 'default payment next month' column.
    :return: ``[]`` when ``X`` is None, otherwise a ``dt.Frame`` with the
        selected feature columns followed by the target column.
    """
    if X is None:
        return []

    # Imported locally so the block does not depend on a module-level import.
    from sklearn.neighbors import KNeighborsClassifier

    target = 'default payment next month'
    datadf = X.to_pandas()
    data_y = datadf[target]
    # Every column except the last one is offered as a candidate feature.
    # NOTE(review): this presumes the target is the last column -- confirm.
    data_X = datadf.iloc[:, :datadf.shape[1] - 1]
    y = np.ravel(data_y)

    knn = KNeighborsClassifier(n_neighbors=3)
    efs1 = EFS(knn,
               min_features=5,
               max_features=10,
               scoring='accuracy',
               print_progress=True,
               cv=5)
    efs1 = efs1.fit(data_X, y)

    # Fix: the original read `sfs1.k_feature_names_` -- `sfs1` was never
    # defined, and the exhaustive selector exposes `best_feature_names_`.
    feat_list = list(efs1.best_feature_names_)

    col_names_to_pick = feat_list + [target]
    new_df = datadf[col_names_to_pick]
    return dt.Frame(new_df)
def test_check_pandas_dataframe_fit():
    """Fit on an array, then on a DataFrame with the interrupt flag set.

    The first fit (plain array) must report index-based feature names and
    ``interrupted_ is False``. After ``_TESTING_INTERRUPT_MODE`` is switched
    on, the second fit (DataFrame) must report the first subset, the
    DataFrame column names, and ``interrupted_ is True``.
    """
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    column_names = ['sepal length', 'sepal width',
                    'petal length', 'petal width']
    frame = pd.DataFrame(X, columns=column_names)

    efs1 = EFS(KNeighborsClassifier(n_neighbors=4),
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    # Keep the object returned by fit() under its own name, as the
    # original test did; the assertions still inspect `efs1`.
    fitted = efs1.fit(X, y)
    assert efs1.best_idx_ == (2, 3), efs1.best_idx_
    assert efs1.best_feature_names_ == ('2', '3')
    assert efs1.interrupted_ is False

    fitted._TESTING_INTERRUPT_MODE = True
    fitted = fitted.fit(frame, y)
    assert efs1.best_idx_ == (0, 1), efs1.best_idx_
    assert efs1.best_feature_names_ == ('sepal length', 'sepal width')
    assert efs1.interrupted_ is True
def test_knn_cv3():
    """Check all 3-feature iris subsets scored by a 4-NN classifier.

    Uses 4-fold CV with accuracy scoring and compares the recorded subsets,
    the best subset and the best (rounded) score with reference values.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    selector = EFS(KNeighborsClassifier(n_neighbors=4),
                   min_features=3,
                   max_features=3,
                   scoring='accuracy',
                   cv=4,
                   print_progress=False)
    selector = selector.fit(features, labels)

    # One row per subset: (mean score, feature indices, per-fold scores)
    reference = [
        (0.9391025641025641, (0, 1, 2),
         [0.97435897, 0.94871795, 0.88888889, 0.94444444]),
        (0.94017094017094016, (0, 1, 3),
         [0.92307692, 0.94871795, 0.91666667, 0.97222222]),
        (0.95299145299145294, (0, 2, 3),
         [0.97435897, 0.94871795, 0.91666667, 0.97222222]),
        (0.97275641025641035, (1, 2, 3),
         [0.97435897, 1., 0.94444444, 0.97222222]),
    ]
    expect = {
        position: {'avg_score': mean_score,
                   'feature_idx': subset,
                   'cv_scores': np.array(fold_scores)}
        for position, (mean_score, subset, fold_scores)
        in enumerate(reference)
    }

    dict_compare_utility(d1=expect, d2=selector.subsets_)
    assert selector.best_idx_ == (1, 2, 3)
    assert round(selector.best_score_, 4) == 0.9728
def __init__(self, estimator=None, scoring='roc_auc', selector_name='rfe', cv_worker=1, step=1):
    """Create the underlying feature selector.

    :param estimator: model handed to the selector. When ``None`` a fresh
        ``LGBMClassifier(n_jobs=30)`` is created for this instance (the
        previous signature built one shared instance at definition time).
    :param scoring: scoring passed through to the selector.
    :param selector_name: one of {'sfs', 'efs', 'rfe'}. 'rfe' can serve as
        a coarse first ranking and may be GPU-accelerated.
    :param cv_worker: forwarded as ``n_jobs``; set it to 1 when using a GPU.
    :param step: forwarded to ``RFECV`` -- number (or fraction) of features
        removed per iteration.
    :raises ValueError: if ``selector_name`` is not a supported name.
    """
    self.selector_name = selector_name

    if estimator is None:
        # Build the default lazily so instances never share one estimator.
        estimator = LGBMClassifier(n_jobs=30)

    if selector_name == 'sfs':
        # Optimised counterpart of 'efs': selects features by `scoring`.
        # Sequential feature selection is a family of greedy search
        # algorithms that reduce an initial d-dimensional feature space to
        # a k-dimensional one (k < d). The motivation is to pick the subset
        # of features most relevant to the problem automatically. The goal
        # is twofold: better computational efficiency and lower
        # generalisation error by removing irrelevant features or noise.
        # Wrapper methods such as this one are especially useful when
        # embedded selection (e.g. a LASSO-style penalty) is not applicable.
        self.selector = SFS(
            estimator,
            scoring=scoring,
            cv=5,
            n_jobs=cv_worker,
            verbose=2,
            k_features='best',
            forward=True,
            floating=False,
        )

    elif selector_name == 'efs':
        # Enumeration: on the order of 2^n candidate subsets.
        self.selector = EFS(estimator,
                            scoring=scoring,
                            cv=5,
                            n_jobs=cv_worker,
                            print_progress=True,
                            max_features=1000)

    elif selector_name == 'rfe':
        # Selects features from weight information such as tree-model
        # feature importances.
        # Usage notes (https://www.jianshu.com/p/025395835591):
        #   rfe.ranking_     ranking of the attribute at each position
        #   rfe.support_     mask form of the selection (True = selected)
        #   rfe.ranking_[1]  ranking of a single attribute
        #   rfe.estimator_   information about the wrapped estimator
        # See also https://www.kaggle.com/roydatascience/recursive-feature-selection-new-transactions-elo
        self.selector = RFECV(
            estimator,
            scoring=scoring,
            cv=5,
            n_jobs=cv_worker,
            verbose=2,
            step=step,  # features (count or fraction) removed per iteration
        )

    else:
        # Previously an unknown name left `self.selector` unset and the
        # failure surfaced later as an unrelated AttributeError.
        raise ValueError(
            "selector_name must be one of 'sfs', 'efs' or 'rfe', got %r"
            % (selector_name,))
def test_fit_params():
    """Run EFS with a ``sample_weight`` fit parameter and check every subset.

    A random forest is evaluated on all 3-feature iris subsets with 4-fold
    CV. Reference scores depend on the installed scikit-learn: the values
    in the literal apply from 0.22 on, and the branch below overrides them
    for older releases.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    efs1 = EFS(forest,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y, sample_weight=sample_weight)

    expect = {
        0: {
            'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores': np.array([0.947, 0.868, 0.919, 0.973]),
            'avg_score': 0.9269203413940257
        },
        1: {
            'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores': np.array([0.921, 0.921, 0.892, 1.]),
            'avg_score': 0.9337606837606838
        },
        2: {
            'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.919, 0.973]),
            'avg_score': 0.9532361308677098
        },
        3: {
            'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.892, 1.]),
            'avg_score': 0.9532361308677098
        }
    }

    if Version(sklearn_version) < Version("0.22"):
        expect[0]['avg_score'] = 0.9401709401709402
        expect[0]['cv_scores'] = np.array(
            [0.94871795, 0.92307692, 0.91666667, 0.97222222])
        # Fix: subsets 1 and 2 previously reused subset 0's fold scores,
        # which contradicted their own avg_score. The arrays below average
        # to 0.93376 and 0.95994 respectively.
        expect[1]['cv_scores'] = np.array(
            [0.92307692, 0.92307692, 0.88888889, 1.])
        expect[2]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.94444444, 0.97222222])
        expect[2]['avg_score'] = 0.9599358974358974
        expect[3]['avg_score'] = 0.9599358974358974
        expect[3]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.91666667, 1.])
        assert round(efs1.best_score_, 4) == 0.9599
    else:
        assert round(efs1.best_score_, 4) == 0.9532

    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (0, 2, 3)
def select_EFS(tr, num_feat=100, *, n_classes=9, samples_per_class=60):
    """Return the best ``num_feat``-feature subset found by exhaustive search.

    Labels are synthesised rather than passed in: the rows of ``tr`` are
    taken to be ordered by class, ``samples_per_class`` consecutive rows
    per class. The defaults reproduce the original fixed 9 x 60 layout.

    :param tr: feature matrix with ``n_classes * samples_per_class`` rows.
    :param num_feat: exact subset size to search (used as both the minimum
        and the maximum number of features).
    :param n_classes: number of distinct class labels (0 .. n_classes - 1).
    :param samples_per_class: consecutive rows belonging to each class.
    :return: ``best_idx_`` of the fitted selector.
    """
    X = tr
    y = [label for label in range(n_classes)
         for _ in range(samples_per_class)]

    knn = KNeighborsClassifier(n_neighbors=1)
    efs = EFS(knn,
              min_features=num_feat,
              max_features=num_feat,
              cv=5,
              n_jobs=4)
    efs.fit(X, y)
    return efs.best_idx_
def test_maxfeatures_1():
    """``max_features=0`` must make ``fit`` raise an AttributeError."""
    dataset = load_iris()
    selector = EFS(estimator=KNeighborsClassifier(),
                   min_features=1,
                   max_features=0)
    message = 'max_features must be smaller than 5 and larger than 0'
    assert_raises(AttributeError,
                  message,
                  selector.fit,
                  dataset.data,
                  dataset.target)
def test_minmaxfeatures_1():
    """``min_features > max_features`` must make ``fit`` raise."""
    dataset = load_iris()
    selector = EFS(estimator=KNeighborsClassifier(),
                   min_features=3,
                   max_features=2)
    message = 'min_features must be <= max_features'
    assert_raises(AttributeError,
                  message,
                  selector.fit,
                  dataset.data,
                  dataset.target)
def test_knn_cv3():
    """Check all 3-feature iris subsets scored by a 4-NN classifier.

    Uses 4-fold CV with accuracy scoring. Reference scores depend on the
    installed scikit-learn: the values in the literal apply from 0.22 on,
    and the branch below overrides them for older releases.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y)

    expect = {
        0: {
            'avg_score': 0.9391025641025641,
            'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores': np.array([0.974, 0.947, 0.892, 0.946])
        },
        1: {
            'avg_score': 0.9400782361308677,
            'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores': np.array([0.921, 0.947, 0.919, 0.973])
        },
        2: {
            'avg_score': 0.95299145299145294,
            'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.919, 0.973])
        },
        3: {
            'avg_score': 0.97275641025641035,
            'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.974, 1., 0.946, 0.973])
        }
    }

    if Version(sklearn_version) < Version("0.22"):
        expect[0]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.88888889, 0.94444444])
        expect[1]['cv_scores'] = np.array(
            [0.92307692, 0.94871795, 0.91666667, 0.97222222])
        expect[2]['cv_scores'] = np.array(
            [0.97435897, 0.94871795, 0.91666667, 0.97222222])
        # Fix: subset 3 previously reused subset 2's fold scores, whose
        # mean (0.95299) contradicts avg_score 0.97276. The array below
        # averages to 0.97276.
        expect[3]['cv_scores'] = np.array(
            [0.97435897, 1., 0.94444444, 0.97222222])
        expect[1]['avg_score'] = 0.94017094017094016
        assert round(efs1.best_score_, 4) == 0.9728
    else:
        assert round(efs1.best_score_, 4) == 0.9732

    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (1, 2, 3)
    assert efs1.best_feature_names_ == ('1', '2', '3')
def test_knn_wo_cv():
    """Check 2- and 3-feature subsets scored without cross-validation.

    With ``cv=0`` each subset carries a single score, so ``cv_scores`` is a
    one-element array next to ``avg_score``.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    selector = EFS(KNeighborsClassifier(n_neighbors=4),
                   min_features=2,
                   max_features=3,
                   scoring='accuracy',
                   cv=0,
                   print_progress=False)
    selector = selector.fit(features, labels)

    # One row per subset: (feature indices, avg_score, single cv score)
    reference = [
        ((0, 1), 0.82666666666666666, 0.82666667),
        ((0, 2), 0.95999999999999996, 0.96),
        ((0, 3), 0.96666666666666667, 0.96666667),
        ((1, 2), 0.95999999999999996, 0.96),
        ((1, 3), 0.95999999999999996, 0.96),
        ((2, 3), 0.97333333333333338, 0.97333333),
        ((0, 1, 2), 0.95999999999999996, 0.96),
        ((0, 1, 3), 0.96666666666666667, 0.96666667),
        ((0, 2, 3), 0.96666666666666667, 0.96666667),
        ((1, 2, 3), 0.97333333333333338, 0.97333333),
    ]
    expect = {}
    for position, (subset, mean_score, only_score) in enumerate(reference):
        expect[position] = {
            'feature_idx': subset,
            # Expected names are the stringified column indices
            'feature_names': tuple(str(col) for col in subset),
            'avg_score': mean_score,
            'cv_scores': np.array([only_score]),
        }

    dict_compare_utility(d1=expect, d2=selector.subsets_)
def brute_force(self, X, y, y_type):
    """Score every 1- and 2-feature subset and plot the results.

    An exhaustive selector is fitted with 5-fold CV and
    ``neg_mean_squared_error`` scoring. Its metric table is sorted by
    ``avg_score`` (best first), drawn as a horizontal bar chart with
    ``std_dev`` error bars, and returned.

    :param X: feature data passed to the selector.
    :param y: target passed to the selector.
    :param y_type: ``"binary"`` selects ``LinearRegression``; any other
        value selects ``LogisticRegression``.
    :return: DataFrame of subset metrics sorted by ``avg_score`` descending.
    """
    # Both branches used to repeat the whole pipeline; only the estimator
    # differs, so it is chosen up front.
    # NOTE(review): a regressor for "binary" and a classifier otherwise
    # looks inverted -- mapping kept as-is, confirm the intent.
    est = LinearRegression() if y_type == "binary" else LogisticRegression()

    efs = EFS(
        estimator=est,
        min_features=1,
        max_features=2,
        scoring="neg_mean_squared_error",
        cv=5,
    )
    efs = efs.fit(X, y)
    efs_df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
    efs_df.sort_values("avg_score", inplace=True, ascending=False)

    # Horizontal bar chart: one bar per subset, error bars from std_dev.
    fig, ax = plt.subplots(figsize=(12, 9))
    y_pos = np.arange(len(efs_df))
    ax.barh(y_pos, efs_df["avg_score"], xerr=efs_df["std_dev"])
    ax.set_yticks(y_pos)
    ax.set_xlabel("Avg Score")
    ax.set_ylabel("Feature Names")
    ax.tick_params(labelleft=False)  # tick labels on the y axis are hidden
    plt.show()
    return efs_df
def test_regression():
    """Exhaustive search with a linear regressor on five Boston columns.

    Subsets of 3 to 4 features are scored with 10-fold CV and
    ``neg_mean_squared_error``; the best subset and score are pinned.
    """
    # NOTE(review): load_boston is no longer shipped by recent scikit-learn
    # releases -- confirm which versions this test is expected to run on.
    dataset = load_boston()
    chosen_columns = [1, 2, 6, 8, 12]
    X = dataset.data[:, chosen_columns]
    y = dataset.target

    selector = EFS(LinearRegression(),
                   min_features=3,
                   max_features=4,
                   scoring='neg_mean_squared_error',
                   cv=10,
                   print_progress=False)
    selector = selector.fit(X, y)

    assert selector.best_idx_ == (0, 2, 4)
    assert round(selector.best_score_, 4) == -40.8777
def test_get_metric_dict_not_fitted():
    """``get_metric_dict`` on an unfitted selector must raise."""
    selector = EFS(KNeighborsClassifier(n_neighbors=4),
                   min_features=2,
                   max_features=2,
                   scoring='accuracy',
                   cv=0,
                   clone_estimator=False,
                   print_progress=False,
                   n_jobs=1)
    message = 'ExhaustiveFeatureSelector has not been fitted, yet.'
    assert_raises(AttributeError, message, selector.get_metric_dict)
def test_clone_params_pass():
    """With ``clone_estimator=False`` the search must still pick (1, 3)."""
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    selector = EFS(SoftmaxRegression(random_seed=1),
                   min_features=2,
                   max_features=2,
                   scoring='accuracy',
                   cv=0,
                   clone_estimator=False,
                   print_progress=False,
                   n_jobs=1)
    selector = selector.fit(features, labels)

    assert selector.best_idx_ == (1, 3)
def test_transform_not_fitted():
    """``transform`` on an unfitted selector must raise."""
    features = load_iris().data
    selector = EFS(KNeighborsClassifier(n_neighbors=4),
                   min_features=2,
                   max_features=2,
                   scoring='accuracy',
                   cv=0,
                   clone_estimator=False,
                   print_progress=False,
                   n_jobs=1)
    message = 'ExhaustiveFeatureSelector has not been fitted, yet.'
    assert_raises(AttributeError, message, selector.transform, features)
def test_fit_transform():
    """``fit_transform`` must return the 150 rows reduced to 2 columns."""
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    selector = EFS(KNeighborsClassifier(n_neighbors=4),
                   min_features=2,
                   max_features=2,
                   scoring='accuracy',
                   cv=0,
                   clone_estimator=False,
                   print_progress=False,
                   n_jobs=1)
    reduced = selector.fit_transform(features, labels)

    assert reduced.shape == (150, 2)
def ExhaustiveFeatureSelector(X, y, min_features=1, max_features=4):
    """Exhaustively search feature subsets for a linear regression model.

    Every subset of ``min_features`` to ``max_features`` columns is scored
    with R² under 5-fold CV. The winning feature names and score are
    printed and returned.

    :return: tuple ``(best_feature_names_, best_score_)`` of the selector.
    """
    # Imported under an alias: this function's own name would otherwise
    # shadow the mlxtend class.
    from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
    from sklearn.linear_model import LinearRegression

    selector = EFS(LinearRegression(),
                   min_features=min_features,
                   max_features=max_features,
                   scoring='r2',
                   print_progress=True,
                   cv=5)
    selector = selector.fit(X, y)

    best_names = selector.best_feature_names_
    best_score = selector.best_score_
    print('Best subset (corresponding names):', best_names)
    print('Best R² score: %.2f' % best_score)
    return best_names, best_score
def test_custom_feature_names():
    """Names given via ``custom_feature_names`` must label the best subset."""
    dataset = load_iris()
    features, labels = dataset.data, dataset.target
    names = ('sepal length', 'sepal width', 'petal length', 'petal width')

    selector = EFS(KNeighborsClassifier(n_neighbors=4),
                   min_features=2,
                   max_features=2,
                   scoring='accuracy',
                   cv=0,
                   clone_estimator=False,
                   print_progress=False,
                   n_jobs=1)
    selector = selector.fit(features, labels, custom_feature_names=names)

    assert selector.best_idx_ == (2, 3), selector.best_idx_
    assert selector.best_feature_names_ == ('petal length', 'petal width')
def perform_efs(curr_model, X, y, min_cols, max_cols):
    """Return the metric table of an exhaustive feature search.

    ``curr_model`` is scored on every subset of ``min_cols`` to
    ``max_cols`` features using accuracy, 5-fold CV and ``n_jobs=-1``.

    :return: DataFrame built from the selector's metric dict, transposed so
        that each row describes one subset.
    """
    selector = EFS(curr_model,
                   min_features=min_cols,
                   max_features=max_cols,
                   print_progress=True,
                   scoring='accuracy',
                   cv=5,
                   n_jobs=-1)
    selector = selector.fit(X, y)

    metrics = selector.get_metric_dict()
    # A per-subset test-accuracy column was sketched here earlier but is
    # disabled; only the cross-validation metrics are returned.
    return pd.DataFrame.from_dict(metrics).T
def test_check_pandas_dataframe_transform():
    """Fitting and transforming a DataFrame must yield two columns."""
    dataset = load_iris()
    labels = dataset.target
    frame = pd.DataFrame(
        dataset.data,
        columns=['sepal length', 'sepal width',
                 'petal length', 'petal width'])

    selector = EFS(KNeighborsClassifier(n_neighbors=4),
                   min_features=2,
                   max_features=2,
                   scoring='accuracy',
                   cv=0,
                   clone_estimator=False,
                   print_progress=False,
                   n_jobs=1)
    selector = selector.fit(frame, labels)

    assert selector.best_idx_ == (2, 3)
    assert (150, 2) == selector.transform(frame).shape
def wrapper(x_train_df):
    """Run an exhaustive feature search with a random forest and print it.

    The ``id`` and ``failed test`` columns are removed from the features;
    ``failed test`` is the target. Subsets of 6 to 7 features are scored
    with 5-fold CV and the best score, best subset (indices and names) and
    the full subset table are printed. Nothing is returned.

    :param x_train_df: DataFrame with ``id``, ``failed test`` and the
        candidate feature columns.
    """
    x_train = x_train_df.drop(["id", "failed test"], axis=1)
    y_train = x_train_df["failed test"]

    feature_selector = EFS(
        RandomForestClassifier(max_depth=17,
                               n_estimators=136,
                               max_features=0.307,
                               min_samples_split=30,
                               random_state=42),
        min_features=6,
        max_features=7,
        # Fix: 'log_loss' is not a scorer name; scikit-learn registers the
        # negated form so that "greater is better" holds.
        scoring='neg_log_loss',
        print_progress=True,
        n_jobs=1,
        cv=5)
    feature_selector.fit(x_train, y_train)

    # The label used to say "recall", which is not the metric being scored.
    print('Best neg_log_loss score: %.2f' % feature_selector.best_score_)
    print('Best subset (indices):', feature_selector.best_idx_)
    print('Best subset (corresponding names):',
          feature_selector.best_feature_names_)
    print('Subsets_: ', feature_selector.subsets_)
def exhaustive_feature_selection(x_data, y_data, min_feat, max_feat):
    """Drop the numeric columns not chosen by exhaustive feature selection.

    Only numeric columns take part in the search (random forest regressor,
    R² scoring, 2-fold CV); non-numeric columns are left untouched.
    ``x_data`` is modified in place and also returned.

    :param x_data: DataFrame of candidate features.
    :param y_data: target passed to the selector.
    :param min_feat: minimum subset size to search.
    :param max_feat: maximum subset size to search.
    :return: ``x_data`` with the unselected numeric columns removed.
    """
    print("Applying exhaustive feature selection to numeric data")
    print(
        f"cat variables before exhaustive feature selection {x_data.select_dtypes(include='object').columns.shape}"
    )
    print(
        f"numeric variables before exhaustive feature selection {x_data.select_dtypes(include='number').columns.shape}"
    )

    numeric_cols = x_data.select_dtypes(include='number').columns
    temp = x_data[numeric_cols]

    efs = EFS(RandomForestRegressor(n_jobs=4),
              max_features=max_feat,
              min_features=min_feat,
              scoring='r2',
              print_progress=True,
              cv=2)
    efs.fit(temp, y_data)

    idx = efs.best_idx_
    print(idx)
    idx = list(idx)

    # Fix: best_idx_ holds positions within the numeric-only frame that was
    # fitted, so it must index numeric_cols. Indexing x_data.columns picked
    # the wrong columns whenever non-numeric columns were present.
    cols_to_keep = set(numeric_cols[idx])
    cols_to_drop = [col for col in numeric_cols if col not in cols_to_keep]
    print(len(cols_to_drop))
    x_data.drop(labels=cols_to_drop, axis=1, inplace=True)

    print(
        f"cat variables after exhaustive feature selection {x_data.select_dtypes(include='object').columns}"
    )
    print(
        f"numeric variables after exhaustive feature selection {x_data.select_dtypes(include='number').columns}"
    )
    return x_data
def create_data(X: dt.Frame = None) -> pd.DataFrame:
    """Return only the columns picked by exhaustive feature selection.

    The frame is converted to pandas, the target column is split off, and
    the selector is fitted with the module-level settings (``ESTIMATOR``,
    ``MIN_FEATURES``, ``MAX_FEATURES``, ``SCORING``, ``CV``).

    :param X: input frame containing ``TARGET_COLUMN``.
    :return: ``[]`` when ``X`` is None, otherwise a pandas DataFrame with
        the selected feature columns (the target is not included).
    """
    if X is None:
        return []

    from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

    frame = X.to_pandas()
    target = frame[TARGET_COLUMN].values
    frame.drop(TARGET_COLUMN, axis=1, inplace=True)

    selector = EFS(ESTIMATOR,
                   min_features=MIN_FEATURES,
                   max_features=MAX_FEATURES,
                   scoring=SCORING,
                   cv=CV,
                   n_jobs=-1)
    selector.fit(frame, target)

    chosen_positions = list(selector.best_idx_)
    return frame.iloc[:, chosen_positions]
def exhaustive_feature_selection(x_train, y_train, model=None, num_features=(2, 5), classification_tasks=True, scoring=None):
    """Reduce ``x_train`` to the best feature subset found exhaustively.

    :param x_train: DataFrame of candidate features.
    :param y_train: target; its ``.values`` are passed to the selector.
    :param model: estimator to score. When ``None`` a multinomial
        ``LogisticRegression`` (classification) or ``Ridge`` (otherwise)
        is used.
    :param num_features: ``(min_features, max_features)`` bounds; only the
        first two items are read.
    :param classification_tasks: chooses the default model and scoring.
    :param scoring: scorer for the selector. When falsy, ``"accuracy"``
        (classification) or ``"neg_mean_absolute_error"`` is used.
    :return: ``x_train`` restricted to the selected columns.
    """
    print("============== Exhaustive feature selection ===================")

    # Fix: compare with None instead of testing truthiness. `not model`
    # calls the estimator's __len__ when it has one, which can raise on an
    # unfitted estimator.
    if model is None:
        if classification_tasks:
            model = LogisticRegression(multi_class='multinomial',
                                       solver='lbfgs',
                                       random_state=123)
        else:
            model = Ridge()

    # Truthiness is kept here so an empty string still means "use default".
    if not scoring:
        if classification_tasks:
            scoring = "accuracy"
        else:
            scoring = "neg_mean_absolute_error"

    efs = EFS(estimator=model,
              min_features=num_features[0],
              max_features=num_features[1],
              scoring=scoring,
              print_progress=False,
              clone_estimator=False,
              cv=10,
              n_jobs=2)
    efs.fit(x_train.values, y_train.values)

    print('Best accuracy score: %.2f' % efs.best_score_)
    col_list = list(efs.best_idx_)
    col_names = x_train.columns
    print('Best subset:', col_names[col_list].values)
    x_train = x_train.iloc[:, col_list]
    print("=================================")
    return x_train
# Rank features by importance (ascending) so that `tail(k)` yields the k
# most important ones.
df = df.sort_values(by=['importances'])
print('\n\n')

for feature_choices in [10, 20, 30, 40, 50]:
    # The candidate pool depends only on `feature_choices`, so it is built
    # once per outer iteration instead of once per `max_len`.
    these_choices = df.tail(feature_choices)
    test_cols = these_choices['feature'].values

    for max_len in [5, 10]:
        print(test_cols)
        efs = EFS(
            estimator=rfc,
            min_features=3,
            max_features=max_len,
            print_progress=False,
            scoring='accuracy',
            n_jobs=15,
            cv=4,
        )

        start_time = time.time()
        try:
            efs = efs.fit(X_train[test_cols], y_train)
        except Exception as err:
            # Fix: a bare `except` also swallowed KeyboardInterrupt and
            # SystemExit and hid the reason for the failure. The search
            # stays best-effort but now reports what went wrong.
            print('EFS failed for feature_choices=%s, max_len=%s: %r'
                  % (feature_choices, max_len, err))
            continue
        end_time = time.time()

        best_features = list(efs.best_feature_names_)
        best_score = efs.best_score_
import ExhaustiveFeatureSelector as EFS from sklearn.linear_model import LogisticRegression import matplotlib.pyplot as plt #%% load sample data iris = load_iris() x = pd.DataFrame(iris.data, \ columns=iris.feature_names) #%% create a logistic regression object lr = LogisticRegression() #%% create an EFS object efs = EFS(estimator=lr, min_features=1, max_features=3, scoring='accuracy', cv=5) #%% fit the model efs = efs.fit(x, iris.target) #%% show the selected features efs.best_feature_names_ # console output: # ('sepal length (cm)', 'petal length (cm)', # 'petal width (cm)') #%% show a full report on the feature selection efs_results = pd.DataFrame(efs.get_metric_dict()).\ T. \
# plt.show() # print(p_values[p_values<0.05]) ####################################### #####################F-score f_score = chi2(X_train, y_train) p_values = pd.Series(f_score[1]) p_values.sort_values(ascending=True, inplace=True) # print("P-values",p_values) p_values.plot.bar() # plt.show() ########################### from sklearn.metrics import classification_report from sklearn.preprocessing import StandardScaler sfs = EFS(RandomForestClassifier(n_jobs=2, random_state=0), min_features=1, max_features=6, scoring='accuracy', cv=None, n_jobs=-1).fit(X_train, y_train) # sfs=SFS(RandomForestClassifier(n_jobs=2,random_state=1),k_features=5,forward=True,floating=False,verbose=2,scoring='accuracy',cv=None,n_jobs=-1).fit(X_train,y_train) # print(sfs.k_feature_names_) # print(sfs.k_score_) # x=df[['TEMP','DO','PH','CONDUCTIVITY','BOD','NITRATE']].values x = df[['TEMP', 'CONDUCTIVITY', 'BOD']].values y = df["CLASS"] from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1) print(sfs.best_score_) print(sfs.best_feature_names_)
num_chunks = pd.read_csv("train_numeric.csv", index_col=0, usecols=list(range(969)), chunksize=100000, dtype=np.float32) X = pd.concat([ pd.concat([dchunk, nchunk], axis=1).sample(frac=0.05) for dchunk, nchunk in zip(date_chunks, num_chunks) ]) y = pd.read_csv("train_numeric.csv", index_col=0, usecols=[0, 969], dtype=np.float32).loc[X.index].values.ravel() X = X.values model = XGBClassifier() efs1 = EFS(model, min_features=100, max_features=900, scoring='accuracy', cv=5) efs1 = efs1.fit(X, y) print('Best accuracy score: %.2f' % efs1.best_score_) important_indices = efs1.best_idx_ # Got important_indices from above code #important_indices = [] print("Found important features %s" % important_indices) # load entire dataset for these features. # note where the feature indices are split so we can load the correct ones straight from read_csv n_date_features = 1156 X = np.concatenate([ pd.read_csv("train_date.csv", index_col=0,