import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression, LogisticRegression
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS


class DFExhaustiveFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.selector = EFS(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def fit(self, X, y):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.selector.fit(X[self.transform_cols], y)

        self.stat_df = pd.DataFrame.from_dict(
            self.selector.get_metric_dict()).T
        # Mark the best-scoring subset as supported, all others as not
        self.stat_df.at[self.stat_df['avg_score'].astype(float).idxmax(),
                        'support'] = True
        self.stat_df['support'] = self.stat_df['support'].fillna(False)

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        features = list(
            self.stat_df[self.stat_df['support']]['feature_names'].values[0])
        new_X = X[features].copy()

        return new_X

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)

    def brute_force(self, X, y, y_type):
        if y_type == "binary":
            est = LinearRegression()
            efs = EFS(
                estimator=est,
                min_features=1,
                max_features=2,
                scoring="neg_mean_squared_error",
                cv=5,
            )

            efs = efs.fit(X, y)
            efs_df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
            efs_df.sort_values("avg_score", inplace=True, ascending=False)

        else:
            est = LogisticRegression()
            efs = EFS(
                estimator=est,
                min_features=1,
                max_features=2,
                scoring="neg_mean_squared_error",
                cv=5,
            )

            efs = efs.fit(X, y)
            efs_df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
            efs_df.sort_values("avg_score", inplace=True, ascending=False)

        # horizontal bar chart
        fig, ax = plt.subplots(figsize=(12, 9))
        y_pos = np.arange(len(efs_df))
        ax.barh(y_pos, efs_df["avg_score"], xerr=efs_df["std_dev"])
        ax.set_yticks(y_pos)
        ax.set_xlabel("Avg Score")
        ax.set_ylabel("Feature Names")
        ax.tick_params(labelleft=False)
        plt.show()

        return efs_df
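
# A minimal usage sketch for DFExhaustiveFeatureSelector, assuming the iris
# data and a LogisticRegression estimator (illustrative, not part of the
# original snippet):
from sklearn.datasets import load_iris

iris_frame = load_iris(as_frame=True)
selector = DFExhaustiveFeatureSelector(
    estimator=LogisticRegression(max_iter=1000),  # kwargs are forwarded to EFS
    min_features=1,
    max_features=2,
    scoring='accuracy',
    cv=5)
X_selected = selector.fit_transform(iris_frame.data, iris_frame.target)
print(X_selected.columns.tolist())  # columns of the best-scoring subset
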
def perform_efs(curr_model, X, y, min_cols, max_cols):
    efs1 = EFS(curr_model,
               min_features=min_cols,
               max_features=max_cols,
               print_progress=True,
               scoring='accuracy',
               cv=5,
               n_jobs=-1)

    efs1 = efs1.fit(X, y)

    df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
    #    df['test_acc'] = df['feature_idx'].apply(
    #        lambda x: make_predictions_on_test(efs1, curr_model, X_train, X_test, y_train, y_test, x)
    #    )

    return df
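
# A short usage sketch for perform_efs; the estimator and the synthetic
# dataset below are assumptions for illustration:
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=200, n_features=5,
                                     random_state=0)
demo_df = perform_efs(DecisionTreeClassifier(random_state=0),
                      X_demo, y_demo, min_cols=1, max_cols=2)
print(demo_df.sort_values('avg_score', ascending=False).head())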
Example #4
#%% set up the feature selector
# (the head of this snippet was truncated; the imports, iris data, and
# KNN estimator below are assumed in order to make it runnable)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

iris = load_iris()
x = iris.data
knn = KNeighborsClassifier(n_neighbors=3)

efs = EFS(knn,
          min_features=1,
          max_features=3,
          scoring='accuracy',
          cv=5)

#%% fit the model
efs = efs.fit(x, iris.target)

#%% show the selected features
efs.best_feature_names_
# console output:
# ('sepal length (cm)', 'petal length (cm)', 
# 'petal width (cm)')

#%% show a full report on the feature selection
efs_results = (pd.DataFrame(efs.get_metric_dict())
               .T
               .sort_values(by='avg_score', ascending=False))

#%% show feature importance visually
# create figure and axes
fig, ax = plt.subplots()

# plot bars
y_pos = np.arange(len(efs_results))
ax.barh(y_pos, efs_results['avg_score'],
        xerr=efs_results['std_err'])

# set axis ticks and labels
ax.set_yticks(y_pos)
ax.set_yticklabels(efs_results['feature_names'])
"""
mlxtend's ExhaustiveFeatureSelector selects the best subset of features from all possible combinations of the features.
link: http://rasbt.github.io/mlxtend/user_guide/feature_selection/ExhaustiveFeatureSelector/
"""
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
efs_1 = EFS(knn,
            min_features=1,
            max_features=4,
            scoring='accuracy',
            print_progress=True,
            cv=5)  # where knn is the model

# fit before querying results (X and y are assumed to be the training data)
efs_1 = efs_1.fit(X, y)
efs_1.get_metric_dict()
print('Selected features:', efs_1.best_idx_)
"""
Sequential feature selection is computationally cheaper than EFS; however, it should not be applied
together with embedded feature selection methods like LASSO.
Compared to RFE it is more computation-intensive, since it relies on a scoring metric for feature selection,
whereas RFE relies on weight coefficients (linear models) or feature importances (tree-based algorithms).

There are 4 different flavors of SFAs available via the SequentialFeatureSelector
(see the sketch after the example below):

Sequential Forward Selection (SFS)
Sequential Backward Selection (SBS)
Sequential Forward Floating Selection (SFFS)
Sequential Backward Floating Selection (SBFS)
link: http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
"""
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

sfs1 = SFS(knn,
           k_features=3,
           # the rest of this call was truncated; typical arguments from
           # the mlxtend docs are assumed to complete it
           forward=True,
           floating=False,
           scoring='accuracy',
           cv=5)
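
# A minimal sketch (reusing the same knn estimator) of how the forward and
# floating flags map onto the four SFA flavors listed above:
sfa_flavors = {
    'SFS':  dict(forward=True,  floating=False),  # Sequential Forward Selection
    'SBS':  dict(forward=False, floating=False),  # Sequential Backward Selection
    'SFFS': dict(forward=True,  floating=True),   # Sequential Forward Floating Selection
    'SBFS': dict(forward=False, floating=True),   # Sequential Backward Floating Selection
}
for flavor_name, flags in sfa_flavors.items():
    # each selector would then be fitted with selector.fit(X, y)
    selector = SFS(knn, k_features=3, scoring='accuracy', cv=5, **flags)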
Example #6
gbc = GradientBoostingClassifier(n_estimators=ml_cfg["gbc"]["n_estimators"],
                                 learning_rate=ml_cfg["gbc"]["learning_rate"],
                                 criterion=ml_cfg["gbc"]["criterion"],
                                 max_depth=ml_cfg["gbc"]["max_depth"],
                                 loss=ml_cfg["gbc"]["loss"])

classifiers = [knn, svc, nusvc, rf, gbc]
classifiers_test = [knn, svc]

# For each specified model, run exhaustive feature selection
for model in classifiers:
    efs = ExhaustiveFeatureSelector(
        model,
        min_features=3,
        max_features=len(features),
        scoring='accuracy',
        print_progress=True,
        # Specify to use all available CPUs -> "-1"
        n_jobs=-1,
        # Number of cross-validation splits
        cv=5)
    efs.fit(X_train, y_train, custom_feature_names=feature_names)
    df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
    df.sort_values('avg_score', inplace=True, ascending=False)
    name = f"{model}".split("(")[0]
    df.to_csv(f"Feature_Selection/{name}.csv")
    print(f"\nModel: {name}")
    print(f"Best accuracy score: {efs.best_score_}")
    print(f"Best subset (indices): {efs.best_idx_}")
    print(f"Best subset (corresponding names): {efs.best_feature_names_}")