Example #1
    def feature_selection(self, direction=None):
        sfs = SequentialFeatureSelector(self.model,
                                        n_features_to_select=3,
                                        direction=direction)
        self.sfs = sfs.fit(self.X_train, self.Y_train)
        # Keep only the selected columns in both splits.
        self.X_train_columns = self.X_train.columns[sfs.get_support()]
        self.X_test_columns = self.X_test.columns[sfs.get_support()]
        self.X_train = self.X_train[self.X_train_columns]
        self.X_test = self.X_test[self.X_test_columns]
        # Refit the model on the reduced feature set.
        self.model = LinearRegression().fit(self.X_train, self.Y_train)
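For context, here is a minimal self-contained sketch of the same pattern; the surrounding class (with its self.model, self.X_train, self.Y_train attributes) is omitted from the excerpt, so the data below is made up for illustration:

import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

X, y = make_regression(n_samples=100, n_features=8, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(8)])

# Forward selection by default; direction="backward" drops features instead.
sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=3)
sfs.fit(X, y)
selected = X.columns[sfs.get_support()]         # names of the kept columns
model = LinearRegression().fit(X[selected], y)  # refit on the reduced data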
Example #2
    def run_sklearn(self):
        """ Train and evaluate models from sklearn """

        classifier_options = {
            #'lr': linear_model.LogisticRegressionCV(cv=10, n_jobs=10, max_iter=10000, verbose=0),
            'lr':
            linear_model.LogisticRegression(n_jobs=10,
                                            max_iter=100000,
                                            verbose=0),
            'svm':
            model_selection.GridSearchCV(svm.LinearSVC(dual=False,
                                                       max_iter=10000,
                                                       verbose=0),
                                         {
                                             'C': [.01, .1, 1, 10, 100],
                                             'penalty': ['l2']
                                         },
                                         n_jobs=10,
                                         cv=10,
                                         verbose=2),
            'mlp':
            neural_network.MLPClassifier(hidden_layer_sizes=(32, 50),
                                         activation='relu',
                                         early_stopping=True,
                                         verbose=2)
        }
        self.clf = classifier_options[self.clf_type]
        if self.sfs_k > 0:
            # Forward feature selection
            print("Doing forward feature selection...")
            sfs = SequentialFeatureSelector(self.clf,
                                            n_features_to_select=self.sfs_k,
                                            n_jobs=-1)
            sfs.fit(self.data.X_train, self.data.y_train)
            # Save out selected features
            outpath = os.path.join(
                '/projects/tumblr_community_identity/tmp/',
                f'sfs{self.sfs_k}_{self.extractor.select_k}.txt')
            np.savetxt(outpath, sfs.get_support())
            print(f"Saved forward feature selection mask to {outpath}")
            sfs_mask = sfs.get_support()
            #sfs_mask = np.loadtxt(
            #    '/projects/tumblr_community_identity/tmp/sfs20_500.txt').astype(bool)
            X_train = self.data.X_train[:, sfs_mask]
            X_dev = self.data.X_dev[:, sfs_mask]
            X_test = self.data.X_test[:, sfs_mask]
            self.data.X_train, self.data.X_dev, self.data.X_test = X_train, X_dev, X_test

        self.model = self.clf.fit(self.data.X_train, self.data.y_train)
        self.test_score = self.model.score(self.data.X_test, self.data.y_test)
        self.train_pred = self.model.predict(self.data.X_train)
        if self.data.X_dev is not None:
            self.dev_score = self.model.score(self.data.X_dev, self.data.y_dev)
            self.dev_pred = self.model.predict(self.data.X_dev)
        self.test_pred = self.model.predict(self.data.X_test)
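One note on the mask indexing above: since get_support() returns the selector's boolean column mask, the three slicing lines are equivalent to calling the fitted selector's transform on each split:

# Equivalent to indexing with sfs_mask:
X_train = sfs.transform(self.data.X_train)
X_dev = sfs.transform(self.data.X_dev)
X_test = sfs.transform(self.data.X_test)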
Example #3
def seq_feature_selection(data, target, n_features=None):
    predictors = data.drop(columns=target).select_dtypes(np.number)
    selector = SequentialFeatureSelector(estimator=LinearRegression(),
                                         n_features_to_select=n_features)
    selector = selector.fit(predictors, data[target])
    selected = selector.get_support(indices=True)
    return predictors.iloc[:, selected].columns.to_list()
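A usage sketch for this helper; the toy DataFrame, its column names, and the printed result are made up for illustration:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=["a", "b", "c"])
df["y"] = 2 * df["a"] - df["c"] + rng.normal(scale=0.1, size=100)

# Forward-select 2 of the 3 numeric predictors for target "y".
print(seq_feature_selection(df, target="y", n_features=2))  # e.g. ['a', 'c']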
Example #4
def test_n_features_to_select(direction, n_features_to_select):
    # Make sure n_features_to_select is respected

    X, y = make_regression(n_features=10)
    sfs = SequentialFeatureSelector(LinearRegression(),
                                    n_features_to_select=n_features_to_select,
                                    direction=direction, cv=2)
    sfs.fit(X, y)
    if n_features_to_select is None:
        n_features_to_select = 5  # n_features // 2
    assert sfs.get_support(indices=True).shape[0] == n_features_to_select
    assert sfs.n_features_to_select_ == n_features_to_select
    assert sfs.transform(X).shape[1] == n_features_to_select
Example #5
def test_n_features_to_select_auto(direction):
    """Check the behaviour of `n_features_to_select="auto"` with different
    values for the parameter `tol`.
    """

    n_features = 10
    tol = 1e-3
    X, y = make_regression(n_features=n_features, random_state=0)
    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=tol,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)

    max_features_to_select = n_features - 1

    assert sfs.get_support(indices=True).shape[0] <= max_features_to_select
    assert sfs.n_features_to_select_ <= max_features_to_select
    assert sfs.transform(X).shape[1] <= max_features_to_select
    assert sfs.get_support(indices=True).shape[0] == sfs.n_features_to_select_
Example #6
    def select_greedy(data):
        X, X_test, y = data

        svr = svm.SVR(kernel="rbf", C=100, tol=1).fit(X, y)
        tic = time()
        select = SequentialFeatureSelector(svr,
                                           direction=direction,
                                           n_features_to_select=n_features,
                                           n_jobs=-1).fit(X, y)
        toc = time()

        joblib.dump(select.get_support(), "joblib/greedy_support")

        print(f"features selected: {select.get_support()}")
        print(f"done in: {toc - tic:.2f}s")

        return select.transform(X), select.transform(X_test), y
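select_greedy closes over direction and n_features and relies on module-level imports the excerpt omits; a plausible reconstruction of that context, purely as an assumption about the missing code:

import joblib
from time import time
from sklearn import svm
from sklearn.feature_selection import SequentialFeatureSelector

direction = "forward"  # assumed free variable from the enclosing scope
n_features = 10        # assumed free variable from the enclosing scope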
Example #7
def test_sanity(seed, direction, n_features_to_select,
                expected_selected_features):
    # Basic sanity check: 3 features, only f0 and f2 are correlated with the
    # target, f2 having a stronger correlation than f0. We expect f1 to be
    # dropped, and f2 to always be selected.

    rng = np.random.RandomState(seed)
    n_samples = 100
    X = rng.randn(n_samples, 3)
    y = 3 * X[:, 0] - 10 * X[:, 2]

    sfs = SequentialFeatureSelector(LinearRegression(),
                                    n_features_to_select=n_features_to_select,
                                    direction=direction, cv=2)
    sfs.fit(X, y)
    assert_array_equal(sfs.get_support(indices=True),
                       expected_selected_features)
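The listing strips the test's decorators; given the comment's reasoning (f1 is always dropped, f2 always kept), a plausible reconstruction of the missing parametrization looks like:

import pytest

# (decorators that would sit directly above test_sanity)
@pytest.mark.parametrize("seed", range(10))
@pytest.mark.parametrize("direction", ("forward", "backward"))
@pytest.mark.parametrize("n_features_to_select, expected_selected_features",
                         [(1, [2]),      # f2 alone: the strongest correlation
                          (2, [0, 2])])  # f0 and f2: the informative pair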
Example #8
from sklearn.feature_selection import SequentialFeatureSelector
model = LinearSVC(C=0.01, penalty='l2', dual=False,
                  random_state=0).fit(x, y)  # the smaller C, the stronger the penalty
names = np.array(columns)
sfs_forward = SequentialFeatureSelector(model,
                                        n_features_to_select=7,
                                        scoring='accuracy',
                                        cv=5).fit(x, y)
sfs_backward = SequentialFeatureSelector(model,
                                         n_features_to_select=7,
                                         scoring='accuracy',
                                         direction='backward',
                                         cv=5).fit(x, y)
print('Features selected by forward SFS selection l2:',
      names[sfs_forward.get_support()])
print('Features selected by backward SFS selection l2:',
      names[sfs_backward.get_support()])
'''
L1-penalty dimensionality reduction works by keeping only one of several features
that are equally correlated with the target, so a feature that was not selected is
not necessarily unimportant. It can therefore be refined with the L2 penalty.
Concretely: if a feature has a weight of 1 under L1, collect the features whose L2
weights are similar but whose L1 weights are 0 into a group, and split the L1 weight
evenly across that group. This requires building a new model:
'''


class LS(LinearSVC):
    def __init__(self,
                 threshold=0.01,
                 dual=False,
                 tol=1e-4,
                 C=0.01,
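The class definition above is cut off in this listing. As a minimal sketch of the L1+L2 idea the comment describes (binary classification assumed; the helper name and the threshold default are inventions for illustration):

import numpy as np
from sklearn.svm import LinearSVC

def l1_l2_support(X, y, C=0.01, threshold=0.01):
    # Fit the same linear model under L1 and L2 penalties.
    w1 = LinearSVC(C=C, penalty="l1", dual=False).fit(X, y).coef_[0]
    w2 = LinearSVC(C=C, penalty="l2", dual=False).fit(X, y).coef_[0]
    support = w1 != 0
    for i in np.flatnonzero(w1):
        # Rescue zero-L1 features whose L2 weight is close to a kept feature's.
        support |= (np.abs(w2 - w2[i]) < threshold) & (w1 == 0)
    return support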
Example #9
# Reconstructed loop header (the excerpt begins mid-loop):
for i in range(train_df.shape[1]):
    valid_df[valid_df.columns[i]] = clean_Dirt_Data(
        valid_df[valid_df.columns[i]])
    train_df[train_df.columns[i]] = clean_Dirt_Data(
        train_df[train_df.columns[i]])

x_train, y_train = prepareData(train_df)
x_valid, y_valid = prepareData(valid_df)

print('Done Read Train and Validation data!')

# Evaluate the Naive Bayes classifier
classificador_nb = GaussianNB(priors=None, var_smoothing=1e-9)
# Sequential Forward Selection (SFS); n_features_to_select must be an int
# (or a float strictly between 0 and 1), so cast the 15% feature count.
sfs = SequentialFeatureSelector(classificador_nb,
                                n_features_to_select=int(x_train.shape[1] * 0.15))
sfs.fit(x_train, y_train)
result_nb = sfs.get_support()

result_nb_out = pd.DataFrame(result_nb)
result_nb_out.to_csv('nb_interpretavel.csv')

# Evaluate the linear SVC
classificador_svm = LinearSVC()
# Sequential Forward Selection (SFS), with the same cast applied.
sfs = SequentialFeatureSelector(classificador_svm,
                                n_features_to_select=int(x_train.shape[1] * 0.15))
sfs.fit(x_train, y_train)
result_svc = sfs.get_support()

result_svc_out = pd.DataFrame(result_svc)
result_svc_out.to_csv('svc_interpretavel.csv')
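As a follow-up to the int(...) cast above: n_features_to_select also accepts a float strictly between 0 and 1, interpreted as the fraction of features to select, which avoids the cast entirely:

sfs = SequentialFeatureSelector(classificador_svm, n_features_to_select=0.15)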
Example #10
def test_n_features_to_select_stopping_criterion(direction):
    """Check the behaviour of the stopping criterion for feature selection,
    depending on the values of `n_features_to_select` and `tol`.

    When `direction` is `'forward'`, select a new feature at random among
    those not currently in `selector.support_`, and build a new version of
    the data that includes all the features in `selector.support_` plus the
    newly selected one. Then check that the cross-validation score of a model
    trained on this variant is lower than that of the model trained on the
    forward-selected features, or at least does not improve by more than the
    `tol` margin.

    When `direction` is `'backward'`, instead of adding a feature to
    `selector.support_`, try removing one of the selected features at random,
    and check that the cross-validation score either decreases or does not
    improve by more than the `tol` margin.
    """

    X, y = make_regression(n_features=50, n_informative=10, random_state=0)

    tol = 1e-3

    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=tol,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)
    selected_X = sfs.transform(X)

    rng = np.random.RandomState(0)

    added_candidates = list(
        set(range(X.shape[1])) - set(sfs.get_support(indices=True)))
    added_X = np.hstack([
        selected_X,
        (X[:, rng.choice(added_candidates)])[:, np.newaxis],
    ])

    removed_candidate = rng.choice(list(range(sfs.n_features_to_select_)))
    removed_X = np.delete(selected_X, removed_candidate, axis=1)

    plain_cv_score = cross_val_score(LinearRegression(), X, y, cv=2).mean()
    sfs_cv_score = cross_val_score(LinearRegression(), selected_X, y,
                                   cv=2).mean()
    added_cv_score = cross_val_score(LinearRegression(), added_X, y,
                                     cv=2).mean()
    removed_cv_score = cross_val_score(LinearRegression(), removed_X, y,
                                       cv=2).mean()

    assert sfs_cv_score >= plain_cv_score

    if direction == "forward":
        assert (sfs_cv_score - added_cv_score) <= tol
        assert (sfs_cv_score - removed_cv_score) >= tol
    else:
        assert (added_cv_score - sfs_cv_score) <= tol
        assert (removed_cv_score - sfs_cv_score) <= tol