def test_check_pandas_dataframe_fit():
    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)
    df = pd.DataFrame(X, columns=['sepal length', 'sepal width',
                                  'petal length', 'petal width'])

    efs1 = efs1.fit(X, y)
    assert efs1.best_idx_ == (2, 3), efs1.best_idx_
    assert efs1.best_feature_names_ == ('2', '3')
    assert efs1.interrupted_ is False

    efs1._TESTING_INTERRUPT_MODE = True
    efs1 = efs1.fit(df, y)
    assert efs1.best_idx_ == (0, 1), efs1.best_idx_
    assert efs1.best_feature_names_ == ('sepal length', 'sepal width')
    assert efs1.interrupted_ is True
def test_knn_cv3_groups():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=GroupKFold(n_splits=3),
               print_progress=False)
    np.random.seed(1630672634)
    groups = np.random.randint(0, 6, size=len(y))
    efs1 = efs1.fit(X, y, groups=groups)
    # print(efs1.subsets_)
    expect = {
        0: {'cv_scores': np.array([0.97916667, 0.93877551, 0.9245283]),
            'feature_idx': (0, 1, 2),
            'avg_score': 0.9474901595858469,
            'feature_names': ('0', '1', '2')},
        1: {'cv_scores': np.array([1., 0.93877551, 0.9245283]),
            'feature_idx': (0, 1, 3),
            'avg_score': 0.9544346040302915,
            'feature_names': ('0', '1', '3')},
        2: {'cv_scores': np.array([0.97916667, 0.95918367, 0.9245283]),
            'feature_idx': (0, 2, 3),
            'avg_score': 0.9542928806742822,
            'feature_names': ('0', '2', '3')},
        3: {'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
            'feature_idx': (1, 2, 3),
            'avg_score': 0.9605821888503829,
            'feature_names': ('1', '2', '3')},
    }
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {
        0: {'avg_score': 0.9391025641025641,
            'feature_idx': (0, 1, 2),
            'cv_scores': np.array([0.97435897, 0.94871795, 0.88888889, 0.94444444])},
        1: {'avg_score': 0.94017094017094016,
            'feature_idx': (0, 1, 3),
            'cv_scores': np.array([0.92307692, 0.94871795, 0.91666667, 0.97222222])},
        2: {'avg_score': 0.95299145299145294,
            'feature_idx': (0, 2, 3),
            'cv_scores': np.array([0.97435897, 0.94871795, 0.91666667, 0.97222222])},
        3: {'avg_score': 0.97275641025641035,
            'feature_idx': (1, 2, 3),
            'cv_scores': np.array([0.97435897, 1., 0.94444444, 0.97222222])},
    }
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (1, 2, 3)
    assert round(efs1.best_score_, 4) == 0.9728
def test_knn_wo_cv():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=2,
               max_features=3,
               scoring='accuracy',
               cv=0,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {
        0: {'feature_idx': (0, 1),
            'feature_names': ('0', '1'),
            'avg_score': 0.82666666666666666,
            'cv_scores': np.array([0.82666667])},
        1: {'feature_idx': (0, 2),
            'feature_names': ('0', '2'),
            'avg_score': 0.95999999999999996,
            'cv_scores': np.array([0.96])},
        2: {'feature_idx': (0, 3),
            'feature_names': ('0', '3'),
            'avg_score': 0.96666666666666667,
            'cv_scores': np.array([0.96666667])},
        3: {'feature_idx': (1, 2),
            'feature_names': ('1', '2'),
            'avg_score': 0.95999999999999996,
            'cv_scores': np.array([0.96])},
        4: {'feature_idx': (1, 3),
            'feature_names': ('1', '3'),
            'avg_score': 0.95999999999999996,
            'cv_scores': np.array([0.96])},
        5: {'feature_idx': (2, 3),
            'feature_names': ('2', '3'),
            'avg_score': 0.97333333333333338,
            'cv_scores': np.array([0.97333333])},
        6: {'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'avg_score': 0.95999999999999996,
            'cv_scores': np.array([0.96])},
        7: {'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'avg_score': 0.96666666666666667,
            'cv_scores': np.array([0.96666667])},
        8: {'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'avg_score': 0.96666666666666667,
            'cv_scores': np.array([0.96666667])},
        9: {'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'avg_score': 0.97333333333333338,
            'cv_scores': np.array([0.97333333])},
    }
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
def test_regression():
    boston = load_boston()
    X, y = boston.data[:, [1, 2, 6, 8, 12]], boston.target
    lr = LinearRegression()
    efs_r = EFS(lr,
                min_features=3,
                max_features=4,
                scoring='neg_mean_squared_error',
                cv=10,
                print_progress=False)
    efs_r = efs_r.fit(X, y)
    assert efs_r.best_idx_ == (0, 2, 4)
    assert round(efs_r.best_score_, 4) == -40.8777
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    efs1 = EFS(forest,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y, sample_weight=sample_weight)
    expect = {
        0: {'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores': np.array([0.947, 0.868, 0.919, 0.973]),
            'avg_score': 0.9269203413940257},
        1: {'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores': np.array([0.921, 0.921, 0.892, 1.]),
            'avg_score': 0.9337606837606838},
        2: {'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.919, 0.973]),
            'avg_score': 0.9532361308677098},
        3: {'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.892, 1.]),
            'avg_score': 0.9532361308677098},
    }

    if Version(sklearn_version) < Version("0.22"):
        expect[0]['avg_score'] = 0.9401709401709402
        expect[0]['cv_scores'] = np.array([0.94871795, 0.92307692, 0.91666667, 0.97222222])
        expect[1]['cv_scores'] = np.array([0.94871795, 0.92307692, 0.91666667, 0.97222222])
        expect[2]['cv_scores'] = np.array([0.94871795, 0.92307692, 0.91666667, 0.97222222])
        expect[2]['avg_score'] = 0.9599358974358974
        expect[3]['avg_score'] = 0.9599358974358974
        expect[3]['cv_scores'] = np.array([0.97435897, 0.94871795, 0.91666667, 1.])
        assert round(efs1.best_score_, 4) == 0.9599
    else:
        assert round(efs1.best_score_, 4) == 0.9532

    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (0, 2, 3)
def test_clone_params_pass():
    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    efs1 = EFS(lr,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)
    efs1 = efs1.fit(X, y)
    assert efs1.best_idx_ == (1, 3)
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {
        0: {'avg_score': 0.9391025641025641,
            'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores': np.array([0.974, 0.947, 0.892, 0.946])},
        1: {'avg_score': 0.9400782361308677,
            'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores': np.array([0.921, 0.947, 0.919, 0.973])},
        2: {'avg_score': 0.95299145299145294,
            'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores': np.array([0.974, 0.947, 0.919, 0.973])},
        3: {'avg_score': 0.97275641025641035,
            'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.974, 1., 0.946, 0.973])},
    }

    if Version(sklearn_version) < Version("0.22"):
        expect[0]['cv_scores'] = np.array([0.97435897, 0.94871795, 0.88888889, 0.94444444])
        expect[1]['cv_scores'] = np.array([0.92307692, 0.94871795, 0.91666667, 0.97222222])
        expect[2]['cv_scores'] = np.array([0.97435897, 0.94871795, 0.91666667, 0.97222222])
        expect[3]['cv_scores'] = np.array([0.97435897, 0.94871795, 0.91666667, 0.97222222])
        expect[1]['avg_score'] = 0.94017094017094016
        assert round(efs1.best_score_, 4) == 0.9728
    else:
        assert round(efs1.best_score_, 4) == 0.9732

    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (1, 2, 3)
    assert efs1.best_feature_names_ == ('1', '2', '3')
def test_fit_transform():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)
    X_t = efs1.fit_transform(X, y)
    assert X_t.shape == (150, 2)
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {
        0: {'avg_score': 0.9391025641025641,
            'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores': np.array([0.97435897, 0.94871795, 0.88888889, 0.94444444])},
        1: {'avg_score': 0.94017094017094016,
            'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores': np.array([0.92307692, 0.94871795, 0.91666667, 0.97222222])},
        2: {'avg_score': 0.95299145299145294,
            'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores': np.array([0.97435897, 0.94871795, 0.91666667, 0.97222222])},
        3: {'avg_score': 0.97275641025641035,
            'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.97435897, 1., 0.94444444, 0.97222222])},
    }
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (1, 2, 3)
    assert efs1.best_feature_names_ == ('1', '2', '3')
    assert round(efs1.best_score_, 4) == 0.9728
def perform_efs(curr_model, X, y, min_cols, max_cols):
    efs1 = EFS(curr_model,
               min_features=min_cols,
               max_features=max_cols,
               print_progress=True,
               scoring='accuracy',
               cv=5,
               n_jobs=-1)
    efs1 = efs1.fit(X, y)
    df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
    # df['test_acc'] = df['feature_idx'].apply(
    #     lambda x: make_predictions_on_test(efs1, curr_model, X_train, X_test,
    #                                        y_train, y_test, x)
    # )
    return df
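# Hypothetical usage sketch for perform_efs; the dataset and model below are
# illustrative assumptions, not from the source. The returned DataFrame has
# one row per evaluated subset, with 'feature_idx' and 'avg_score' columns.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
results = perform_efs(KNeighborsClassifier(n_neighbors=3),
                      iris.data, iris.target,
                      min_cols=2, max_cols=3)
print(results[['feature_idx', 'avg_score']])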
def test_custom_feature_names():
    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)
    efs1 = efs1.fit(X, y, custom_feature_names=('sepal length', 'sepal width',
                                                'petal length', 'petal width'))
    assert efs1.best_idx_ == (2, 3), efs1.best_idx_
    assert efs1.best_feature_names_ == ('petal length', 'petal width')
def ExhaustiveFeatureSelector(X, y, min_features=1, max_features=4):
    from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
    from sklearn.linear_model import LinearRegression

    lr = LinearRegression()
    efs1 = EFS(lr,
               min_features=min_features,
               max_features=max_features,
               scoring='r2',
               print_progress=True,
               cv=5)
    efs1 = efs1.fit(X, y)
    # print('Best subset:', efs1.best_idx_)
    print('Best subset (corresponding names):', efs1.best_feature_names_)
    print('Best R² score: %.2f' % efs1.best_score_)
    return efs1.best_feature_names_, efs1.best_score_
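# Hypothetical call to the helper above; the dataset choice is an illustrative
# assumption — any numeric X/y regression pair works.
from sklearn.datasets import load_diabetes

data = load_diabetes()
best_names, best_r2 = ExhaustiveFeatureSelector(data.data, data.target,
                                                min_features=1, max_features=2)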
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    efs1 = EFS(forest,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y, sample_weight=sample_weight)
    expect = {
        0: {'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores': np.array([0.94871795, 0.92307692, 0.91666667, 0.97222222]),
            'avg_score': 0.9401709401709402},
        1: {'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores': np.array([0.92307692, 0.92307692, 0.88888889, 1.]),
            'avg_score': 0.9337606837606838},
        2: {'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores': np.array([0.97435897, 0.94871795, 0.94444444, 0.97222222]),
            'avg_score': 0.9599358974358974},
        3: {'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.97435897, 0.94871795, 0.91666667, 1.]),
            'avg_score': 0.9599358974358974},
    }
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (0, 2, 3)
    assert round(efs1.best_score_, 4) == 0.9599
def test_check_pandas_dataframe_transform():
    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)
    df = pd.DataFrame(X, columns=['sepal length', 'sepal width',
                                  'petal length', 'petal width'])
    efs1 = efs1.fit(df, y)
    assert efs1.best_idx_ == (2, 3)
    assert (150, 2) == efs1.transform(df).shape
def wrapper_selection():
    print('--------------------------------------------------------')
    print('Using the wrapper technique...')
    models = [
        svm.SVC(),
        RandomForestClassifier(),
        GaussianNB(),
        LogisticRegression(),
        KNeighborsClassifier()
    ]
    for model in models:
        efs = ExhaustiveFeatureSelector(model,
                                        min_features=1,
                                        max_features=5,
                                        scoring='accuracy',
                                        cv=5)
        efs = efs.fit(data, labels)
        selected_features = columns[list(efs.best_idx_)]
        print(f'Features selected using {model}: {selected_features}')
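# wrapper_selection relies on module-level `data`, `labels`, and `columns`
# that the snippet does not show; a hedged setup sketch (the dataset is an
# assumption — `columns` just needs to be indexable by a list of indices):
import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
data, labels = iris.data, iris.target
columns = np.array(iris.feature_names)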
def wrapper(x_train_df):
    x_train = x_train_df.drop(["id", "failed test"], axis=1)
    y_train = x_train_df["failed test"]
    feature_selector = EFS(RandomForestClassifier(max_depth=17,
                                                  n_estimators=136,
                                                  max_features=0.307,
                                                  min_samples_split=30,
                                                  random_state=42),
                           min_features=6,
                           max_features=7,
                           # scikit-learn's scorer is named 'neg_log_loss';
                           # 'log_loss' is not a valid scoring string
                           scoring='neg_log_loss',
                           print_progress=True,
                           n_jobs=1,
                           cv=5)
    feature_selector = feature_selector.fit(x_train, y_train)
    print('Best log-loss score: %.2f' % feature_selector.best_score_)
    print('Best subset (indices):', feature_selector.best_idx_)
    print('Best subset (corresponding names):',
          feature_selector.best_feature_names_)
    print('Subsets_:', feature_selector.subsets_)
def exhaustive_feature_selection(x_data, y_data, min_feat, max_feat):
    print("Applying exhaustive feature selection to numeric data")
    print(f"cat variables before exhaustive feature selection "
          f"{x_data.select_dtypes(include='object').columns.shape}")
    print(f"numeric variables before exhaustive feature selection "
          f"{x_data.select_dtypes(include='number').columns.shape}")
    numeric_cols = x_data.select_dtypes(include='number').columns
    temp = x_data[numeric_cols]
    efs = EFS(RandomForestRegressor(n_jobs=4),
              max_features=max_feat,
              min_features=min_feat,
              scoring='r2',
              print_progress=True,
              cv=2)
    efs.fit(temp, y_data)
    idx = list(efs.best_idx_)
    print(idx)
    # best_idx_ indexes into the numeric columns that were searched,
    # not into x_data.columns as a whole
    cols_to_keep = numeric_cols[idx]
    cols_to_drop = [x for x in numeric_cols if x not in cols_to_keep]
    print(len(cols_to_drop))
    x_data.drop(labels=cols_to_drop, axis=1, inplace=True)
    print(f"cat variables after exhaustive feature selection "
          f"{x_data.select_dtypes(include='object').columns}")
    print(f"numeric variables after exhaustive feature selection "
          f"{x_data.select_dtypes(include='number').columns}")
    return x_data
def create_data(X: dt.Frame = None) -> pd.DataFrame:
    if X is None:
        return []

    from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

    X = X.to_pandas()
    y = X[TARGET_COLUMN].values
    X.drop(TARGET_COLUMN, axis=1, inplace=True)

    efs = EFS(ESTIMATOR,
              min_features=MIN_FEATURES,
              max_features=MAX_FEATURES,
              scoring=SCORING,
              cv=CV,
              n_jobs=-1)
    efs.fit(X, y)

    X_fs = X.iloc[:, list(efs.best_idx_)]
    return X_fs
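# The recipe above expects module-level configuration constants that the
# snippet does not show; a hedged sketch of what they might look like
# (every value here is an assumption):
from sklearn.ensemble import RandomForestClassifier

TARGET_COLUMN = "target"                              # assumption
ESTIMATOR = RandomForestClassifier(n_estimators=100)  # assumption
MIN_FEATURES = 2                                      # assumption
MAX_FEATURES = 5                                      # assumption
SCORING = "accuracy"                                  # assumption
CV = 3                                                # assumption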
def exhaustive_feature_selection(x_train, y_train, model=None,
                                 num_features=[2, 5],
                                 classification_tasks=True,
                                 scoring=None):
    print("============== Exhaustive feature selection ===================")
    if not model:
        if classification_tasks:
            model = LogisticRegression(multi_class='multinomial',
                                       solver='lbfgs',
                                       random_state=123)
        else:
            model = Ridge()
    if not scoring:
        scoring = "accuracy" if classification_tasks else "neg_mean_absolute_error"

    efs = EFS(estimator=model,
              min_features=num_features[0],
              max_features=num_features[1],
              scoring=scoring,
              print_progress=False,
              clone_estimator=False,
              cv=10,
              n_jobs=2)
    efs = efs.fit(x_train.values, y_train.values)

    # report the score under whichever metric was actually used
    print('Best %s score: %.2f' % (scoring, efs.best_score_))
    col_list = list(efs.best_idx_)
    col_names = x_train.columns
    print('Best subset:', col_names[col_list].values)
    x_train = x_train.iloc[:, col_list]
    print("=================================")
    return x_train
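# Hypothetical usage of the helper above; the dataset loading is an
# illustrative assumption — any pandas X/y pair works, since the function
# calls .values and .columns on its inputs.
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
x_selected = exhaustive_feature_selection(iris.data, iris.target,
                                          num_features=[2, 3])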
def brute_force(self, X, y, y_type):
    # the two original branches were identical except for the estimator
    est = LinearRegression() if y_type == "binary" else LogisticRegression()
    efs = EFS(
        estimator=est,
        min_features=1,
        max_features=2,
        scoring="neg_mean_squared_error",
        cv=5,
    )
    efs = efs.fit(X, y)
    efs_df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
    efs_df.sort_values("avg_score", inplace=True, ascending=False)

    # horizontal bar chart of subset scores
    fig, ax = plt.subplots(figsize=(12, 9))
    y_pos = np.arange(len(efs_df))
    ax.barh(y_pos, efs_df["avg_score"], xerr=efs_df["std_dev"])
    ax.set_yticks(y_pos)
    ax.set_xlabel("Avg Score")
    ax.set_ylabel("Feature Names")
    ax.tick_params(labelleft=False)
    plt.show()
    return efs_df
# Feature matrix of scaled predictors (the `X = pd.DataFrame(` opening of this
# call is cut off in the source and reconstructed from context)
X = pd.DataFrame(
    sc.fit_transform(newdata.drop(['Loan_ID', 'Loan_Status'], axis=1)),
    columns=[
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Gender_Male',
        'Married_Yes', 'Dependents_1', 'Dependents_2', 'Dependents_3+',
        'Education_Not Graduate', 'Self_Employed_Yes'
    ])

# Target class
Y = pd.DataFrame(newdata['Loan_Status'])

# Visualization
sns.pairplot(df1, hue="Loan_Status")

# Splitting data into train and test samples
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Running ExhaustiveFeatureSelector() for feature selection on different classifiers
efs = ExhaustiveFeatureSelector(RandomForestClassifier(),
                                max_features=6,
                                scoring='roc_auc',
                                cv=5)
efs_fit = efs.fit(X_train, Y_train)
selected_features = X_train.columns[list(efs_fit.best_idx_)]
print(selected_features)
print(efs_fit.best_score_)

rClassifier = RandomForestClassifier(random_state=0)
rClassifier.fit(X_train[selected_features], Y_train)
Y_RCF = rClassifier.predict(X_test[selected_features])
print(classification_report(Y_test, Y_RCF))

efs_naive = ExhaustiveFeatureSelector(GaussianNB(),
                                      max_features=6,
                                      scoring='roc_auc',
                                      cv=4)
efs_naive_fit = efs_naive.fit(X_train, Y_train)
selected_features_naive = X_train.columns[list(efs_naive_fit.best_idx_)]
# the values printed are ROC AUC scores, not accuracy
print('ROC AUC on training set: {}'.format(
    roc_auc_score(train_labels, train_pred[:, 1])))
test_pred = clf.predict_proba(test_features[filtered_features].fillna(0))
print('ROC AUC on test set: {}'.format(
    roc_auc_score(test_labels, test_pred[:, 1])))

########################### XGBoost - Exhaustive Feature Selector ###########################
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

feature_selector = ExhaustiveFeatureSelector(XGBClassifier(),
                                             min_features=2,
                                             max_features=10,
                                             scoring='roc_auc',
                                             print_progress=True,
                                             cv=2)
features = feature_selector.fit(np.array(train_features.fillna(0)),
                                train_labels)
print(type(features))
filtered_features = train_features.columns[list(features.best_idx_)]
print(filtered_features)

# see result of XGBoost (this call is truncated in the source):
clf = XGBClassifier(min_child_weight=5, gamma=0.5, subsample=0.6,
def __init__(self, columns=None, **kwargs):
    self.columns = columns
    self.selector = ExhaustiveFeatureSelector(**kwargs)
    self.transform_cols = None
    self.stat_df = None
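# A minimal sketch of the companion fit/transform methods such a wrapper
# might define; the rest of the class body is not shown in the source, so
# the logic below is an assumption based on the attributes set in __init__.
def fit(self, X, y):
    # restrict the search to the configured columns, or use all of them
    cols = self.columns if self.columns is not None else list(X.columns)
    self.selector.fit(X[cols], y)
    self.transform_cols = [cols[i] for i in self.selector.best_idx_]
    self.stat_df = pd.DataFrame.from_dict(self.selector.get_metric_dict()).T
    return self

def transform(self, X):
    return X[self.transform_cols]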
correlated_features = set()
correlation_matrix = paribas_data.corr()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:  # 0.8 is the correlation threshold
            column_name = correlation_matrix.columns[i]
            correlated_features.add(column_name)

train_features.drop(labels=correlated_features, axis=1, inplace=True)
test_features.drop(labels=correlated_features, axis=1, inplace=True)

feature_selector = ExhaustiveFeatureSelector(RandomForestClassifier(n_jobs=-1),
                                             min_features=2,
                                             max_features=4,
                                             scoring='roc_auc',
                                             print_progress=True,
                                             cv=2)
features = feature_selector.fit(np.array(train_features.fillna(0)),
                                train_labels)
# ExhaustiveFeatureSelector exposes best_idx_, not k_feature_idx_
# (k_feature_idx_ belongs to SequentialFeatureSelector)
filtered_features = train_features.columns[list(features.best_idx_)]

clf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)
clf.fit(train_features[filtered_features].fillna(0), train_labels)

train_pred = clf.predict_proba(train_features[filtered_features].fillna(0))
print('ROC AUC on training set: {}'.format(
    roc_auc_score(train_labels, train_pred[:, 1])))
test_pred = clf.predict_proba(test_features[filtered_features].fillna(0))
that employs a search strategy to look through the space of candidate feature
subsets, evaluating each subset by the performance of a given algorithm.
"""
"""
EXHAUSTIVE FEATURE SELECTION.

This method searches across all possible feature combinations.
Its aim is to find the best-performing feature subset.
"""
# import the algorithm you want to evaluate on your features.
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.ensemble import RandomForestClassifier

# create the ExhaustiveFeatureSelector object.
efs = ExhaustiveFeatureSelector(RandomForestClassifier(),
                                min_features=45,
                                max_features=70,
                                scoring='accuracy',
                                cv=2)

# fit the object to the training data.
efs = efs.fit(x, y)

# print the selected features
# (ExhaustiveFeatureSelector exposes best_idx_ and best_score_, not the
# k_feature_idx_/k_score_ attributes of SequentialFeatureSelector).
selected_features1 = x.columns[list(efs.best_idx_)]
print('selected features from exhaustive selection:', selected_features1)

# print the final prediction score.
print('accuracy:', efs.best_score_)

# transform to the newly selected features.
# X_train = efs.transform(X_train)
""" mlxtend ExhaustiveFeatureSelector, selects best subset of features from all possible combinations of the features link :http://rasbt.github.io/mlxtend/user_guide/feature_selection/ExhaustiveFeatureSelector/ """ from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS efs_1 = EFS(knn, min_features=1, max_features=4, scoring='accuracy', print_progress=True, cv=5) ##where knn is the model efs_1.get_metric_dict() print('Selected features:', efs1.best_idx_) """ Sequential feature selection is better computationally compared to EFS, however they should not be applied along with embedded feature selection methods like LASSO Compared to RFE its more computation intensive as it relies on a metric for feature selection, whereas RFE relies on weight coefficients(linear) or feature importances(tree based algos) There are 4 different flavors of SFAs available via the SequentialFeatureSelector: Sequential Forward Selection (SFS) Sequential Backward Selection (SBS) Sequential Forward Floating Selection (SFFS) Sequential Backward Floating Selection (SBFS) link: http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/ """ from mlxtend.feature_selection import SequentialFeatureSelector as SFS sfs1 = SFS(knn, k_features=3,
print("The selected feature list:") print(feat_cols) elif (choice == 2): sfs1 = sfs(clf, k_features=4, forward=False, floating=False, verbose=2, scoring='accuracy', cv=5) # Perform SFFS sfs1 = sfs1.fit(X_train, y_train) feat_cols = list(sfs1.k_feature_idx_) print("******") print("The selected feature list:") print(feat_cols) elif (choice == 3): efs1 = EFS(knn, min_features=4, max_features=5, scoring='accuracy', print_progress=True, cv=5) efs1 = efs1.fit(X_train, y_train) feat_cols = list(efs1.best_idx_) print("******") print("The selected feature list:") print(feat_cols) else: print("Wrong Input")
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import pandas as pd

#%% load sample data
iris = load_iris()
x = pd.DataFrame(iris.data,
                 columns=iris.feature_names)

#%% create a logistic regression object
lr = LogisticRegression()

#%% create an EFS object
efs = EFS(estimator=lr,
          min_features=1,
          max_features=3,
          scoring='accuracy',
          cv=5)

#%% fit the model
efs = efs.fit(x, iris.target)

#%% show the selected features
efs.best_feature_names_
# console output:
# ('sepal length (cm)', 'petal length (cm)',
#  'petal width (cm)')

#%% show a full report on the feature selection
efs_results = pd.DataFrame(efs.get_metric_dict()).T
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.8)
print('correlated features:', len(set(corr_features)))

X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

efs1 = EFS(RandomForestClassifier(n_jobs=4, random_state=0),
           min_features=1,
           max_features=4,
           scoring='roc_auc',
           print_progress=True,
           cv=2)
efs1 = efs1.fit(np.array(X_train[X_train.columns[0:4]].fillna(0)), y_train)
select_feat = X_train.columns[list(efs1.best_idx_)]
select_feat

def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc_auc: {}'.format(roc_auc_score(y_train, pred[:, 1])))
    print('Test set')
df = df.sort_values(by=['importances'])
print('\n\n')

for feature_choices in [10, 20, 30, 40, 50]:
    for max_len in [5, 10]:
        these_choices = df.tail(feature_choices)
        # print(these_choices)
        # print(df)
        test_cols = these_choices['feature'].values
        print(test_cols)
        efs = EFS(
            estimator=rfc,
            min_features=3,
            max_features=max_len,
            print_progress=False,
            scoring='accuracy',
            n_jobs=15,
            cv=4,
        )
        start_time = time.time()
        try:
            efs = efs.fit(X_train[test_cols], y_train)
        except Exception:
            continue
        end_time = time.time()
        # print()
        # print(feature_choices, end_time - start_time)
        best_features = list(efs.best_feature_names_)
        best_score = efs.best_score_
num_chunks = pd.read_csv("train_numeric.csv",
                         index_col=0,
                         usecols=list(range(969)),
                         chunksize=100000,
                         dtype=np.float32)
X = pd.concat([
    pd.concat([dchunk, nchunk], axis=1).sample(frac=0.05)
    for dchunk, nchunk in zip(date_chunks, num_chunks)
])
y = pd.read_csv("train_numeric.csv",
                index_col=0,
                usecols=[0, 969],
                dtype=np.float32).loc[X.index].values.ravel()
X = X.values

model = XGBClassifier()
efs1 = EFS(model,
           min_features=100,
           max_features=900,
           scoring='accuracy',
           cv=5)
efs1 = efs1.fit(X, y)
print('Best accuracy score: %.2f' % efs1.best_score_)

important_indices = efs1.best_idx_
# Got important_indices from above code
# important_indices = []
print("Found important features %s" % important_indices)

# load entire dataset for these features.
# note where the feature indices are split so we can load the correct ones
# straight from read_csv
n_date_features = 1156
X = np.concatenate([
    pd.read_csv("train_date.csv", index_col=0,