Example #1
    def SelectBestFeatures(self):
        """
            Approximates best features for the model using L1 regularization
        """

        logreg = LogisticRegression(C=1,
                                    penalty='l1',
                                    solver='liblinear',
                                    random_state=42).fit(
                                        self.features, self.target)
        model = SelectFromModel(logreg, prefit=True)

        self.features_new = model.transform(self.features)

        selected_features = pd.DataFrame(model.inverse_transform(
            self.features_new),
                                         index=self.features.index,
                                         columns=self.features.columns)
        self.top_selected_columns = selected_features.columns[
            selected_features.var() != 0]
        self.top_selected_columns = self.train[self.top_selected_columns]

        print('Best selected columns :\n{}'.format(
            self.top_selected_columns.columns))
        return self.top_selected_columns
Example #2
def feature_selection_l1(Xtrain, ytrain, c=0.07):
    """ Return selected features using logistic regression with an L1 penalty """
    logistic = LogisticRegression(C=c, penalty="l1", solver="liblinear",
                                  random_state=7).fit(Xtrain, ytrain)
    model = SelectFromModel(logistic, prefit=True)
    Xtrain_new = model.transform(Xtrain)
    selected_features = pd.DataFrame(model.inverse_transform(Xtrain_new),
                                     index=Xtrain.index,
                                     columns=Xtrain.columns)

    selected_columns = selected_features.columns[selected_features.var() != 0]
    dropped_columns = selected_features.columns[selected_features.var() == 0]

    return selected_columns, dropped_columns
Example #3
def select_from_model():
    iris = load_iris()
    x = iris.data
    y = iris.target

    estimator = LinearSVC(penalty="l1", dual=False)
    selector = SelectFromModel(estimator=estimator, threshold="mean")
    selector.fit(x, y)
    selector.transform(x)

    print(selector.threshold_)
    print(selector.get_support(indices=True))
    print(selector.get_support(indices=False))
    print(selector.inverse_transform(selector.transform(x)))
Example #4
def select_features_l1(X, y):
    """ Return selected features using logistic regression with an L1 penalty """
    logistic = LogisticRegression(C=0.1, penalty="l1", solver="liblinear",
                                  random_state=7).fit(X, y)
    model = SelectFromModel(logistic, prefit=True)

    X_new = model.transform(X)

    # Get back the kept features as a DataFrame with dropped columns as all 0s
    selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                     index=X.index,
                                     columns=X.columns)

    # Dropped columns have values of all 0s, keep other columns
    selected = selected_features.columns[selected_features.var() != 0]

    return selected
Example #5
        def select_features_l1(X, y):
            logistic_model = LogisticRegression(C=0.1,
                                                penalty="l1",
                                                random_state=7,
                                                solver='liblinear').fit(X, y)
            model = SelectFromModel(logistic_model, prefit=True)

            X_new = model.transform(X)

            # Get back the kept features as a DataFrame with dropped columns as all 0s
            selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                             index=X.index,
                                             columns=X.columns)

            # Dropped columns have values of all 0s, keep other columns
            cols_to_keep = selected_features.columns[selected_features.var() != 0]

            return cols_to_keep
Example #6
def feature_selectionfrommodel(data, y, num_feature):
    xx = data.sort_values('pid').values
    # 'sep' is the name of the target column, assumed to be defined in the enclosing scope
    xx_label = y.sort_values('pid')[sep].values
    # select = SelectKBest(f_classif, k=num_feature).fit(xx,xx_label)
    # select = SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False, max_iter=10000), threshold= "median", max_features=num_feature).fit(xx,xx_label)
    select = SelectFromModel(RandomForestClassifier(n_estimators=20000,
                                                    random_state=0,
                                                    n_jobs=-1),
                             threshold="median",
                             max_features=num_feature).fit(xx, xx_label)
    reduced_xx = select.transform(xx)
    new_data = select.inverse_transform(reduced_xx)
    new_data = pd.DataFrame(new_data,
                            index=data.sort_values('pid').index,
                            columns=data.sort_values('pid').columns)
    # idx = select.get_support()
    # print(idx)
    # new_data = np.delete(new_data,idx,1)
    return new_data
Example #7
def l1_regularization_selection(X_train, y_train, features, reg_parameter,
                                rand_state):

    # This function uses L1 regularization to select the best features.
    # Note: it does not return the best k features, but rather the ones that remain
    # selected as most relevant after the regularization is applied.

    from sklearn.linear_model import LogisticRegression
    from sklearn.feature_selection import SelectFromModel

    logistic = LogisticRegression(C=reg_parameter,
                                  penalty='l1',
                                  solver='liblinear',
                                  random_state=rand_state).fit(
                                      X_train, y_train)
    model = SelectFromModel(logistic, prefit=True)
    X_new = model.transform(X_train)

    selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                     index=X_train.index,
                                     columns=X_train.columns)

    cols_to_keep = selected_features.columns[selected_features.var() != 0]

    return cols_to_keep
Example #8
def lesson_4():
    print_("Lesson 4: Feature Selection", 0, 1)
    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    baseline_data = ks[data_cols].join(encoded)

    cat_features = ['category', 'currency', 'country']
    interactions = pd.DataFrame(index=ks.index)
    for col1, col2 in itertools.combinations(cat_features, 2):
        new_col_name = '_'.join([col1, col2])
        # Convert to strings and combine
        new_values = ks[col1].map(str) + "_" + ks[col2].map(str)
        label_enc = LabelEncoder()
        interactions[new_col_name] = label_enc.fit_transform(new_values)
    baseline_data = baseline_data.join(interactions)

    launched = pd.Series(ks.index, index=ks.launched,
                         name="count_7_days").sort_index()
    count_7_days = launched.rolling('7d').count() - 1
    count_7_days.index = launched.values
    count_7_days = count_7_days.reindex(ks.index)

    baseline_data = baseline_data.join(count_7_days)

    def time_since_last_project(series):
        # Return the time in hours
        return series.diff().dt.total_seconds() / 3600.

    df = ks[['category', 'launched']].sort_values('launched')
    timedeltas = df.groupby('category').transform(time_since_last_project)
    timedeltas = timedeltas.fillna(timedeltas.max())

    baseline_data = baseline_data.join(
        timedeltas.rename({'launched': 'time_since_last_project'}, axis=1))

    def get_data_splits(dataframe, valid_fraction=0.1):
        valid_size = int(len(dataframe) * valid_fraction)

        train = dataframe[:-valid_size * 2]
        # valid size == test size, last two sections of the data
        valid = dataframe[-valid_size * 2:-valid_size]
        test = dataframe[-valid_size:]

        return train, valid, test

    def train_model(train, valid):
        feature_cols = train.columns.drop('outcome')

        dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

        param = {
            'num_leaves': 64,
            'objective': 'binary',
            'metric': 'auc',
            'seed': 7
        }
        print("Training model!")
        bst = lgb.train(param,
                        dtrain,
                        num_boost_round=1000,
                        valid_sets=[dvalid],
                        early_stopping_rounds=10,
                        verbose_eval=False)

        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
        print(f"Validation AUC score: {valid_score:.4f}")
        return bst

    # ----------------------------
    # Univariate Feature Selection
    # ----------------------------
    feature_cols = baseline_data.columns.drop('outcome')

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)

    # NOTE: we should select features using only a training set, not the whole
    # dataset as we do here (this is fixed next)
    X_new = selector.fit_transform(baseline_data[feature_cols],
                                   baseline_data['outcome'])
    print_("X_new (after selecting 5 best features)", 0)
    print_(X_new)

    # Fix: select features using only a training set
    feature_cols = baseline_data.columns.drop('outcome')
    train, valid, _ = get_data_splits(baseline_data)

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)

    X_new = selector.fit_transform(train[feature_cols], train['outcome'])
    print_("X_new FIXED [Using Train Only]", 0)
    print_(X_new)

    # Get back the features we've kept, zero out all other features
    selected_features = pd.DataFrame(selector.inverse_transform(X_new),
                                     index=train.index,
                                     columns=feature_cols)
    print_(
        "First 5 rows from the train set including the 5 best features only (others set at 0)",
        0)
    print_(selected_features.head())

    # Dropped columns have values of all 0s, so var is 0, drop them
    selected_columns = selected_features.columns[selected_features.var() != 0]

    # Get the valid dataset with the selected features.
    print_("Valid dataset with the selected features only", 0)
    print_(valid[selected_columns].head())

    # -----------------
    # L1 regularization
    # -----------------
    train, valid, _ = get_data_splits(baseline_data)

    X, y = train[train.columns.drop("outcome")], train['outcome']

    # Set the regularization parameter C=1
    logistic = LogisticRegression(C=1,
                                  penalty="l1",
                                  solver='liblinear',
                                  random_state=7).fit(X, y)
    model = SelectFromModel(logistic, prefit=True)

    X_new = model.transform(X)
    print_("X_new with L1 regularization", 0)
    print_(X_new)

    # Get back the kept features as a DataFrame with dropped columns as all 0s
    selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                     index=X.index,
                                     columns=X.columns)

    # Dropped columns have values of all 0s, keep other columns
    selected_columns = selected_features.columns[selected_features.var() != 0]
    print_("Rejected columns: {}".format(
        selected_features.columns.difference(selected_columns).to_list()))

    # Get the valid dataset with the selected features.
    print_("Valid dataset with the selected features using L1 regularization",
           0)
    print_(valid[selected_columns].head())
Example #9
# (4) apply the feature selector to the training dataset
# (5) get a dataframe with the same index and columns as the training data but the unselected columns are filled with zeros
# (6) find selected columns by choosing features with nonzero variance
feature_cols = data.columns.drop('outcome')
train, valid, test = get_data_splits(data)
selector = SelectKBest(f_classif, k=6)
X_new = selector.fit_transform(train[feature_cols], train['outcome'])
selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=train.index, 
                                 columns=feature_cols)
selected_columns = selected_features.columns[selected_features.var()!=0]

# L1 regularization
# feature selection using L1 regularization should use training data only
# (1) split the data into training, validation and testing
# (2) drop the target column
# (3) fit a logistic regression model to the training dataset (the smaller the parameter C, the stronger the penalty)
# (4) select the nonzero coefficients using SelectFromModel
# (5) select features based on the nonzero coefficients
# (6) get a dataframe with the same index and columns as the training data but the unselected columns are filled with zeros
# (7) find selected columns by choosing features with nonzero variance
train, valid, test = get_data_splits(data)
X, y = train[train.columns.drop("outcome")], train['outcome']
logistic = LogisticRegression(C=0.00001, penalty="l1", solver="liblinear",
                              random_state=7).fit(X, y)
model = SelectFromModel(logistic, prefit=True)
X_new = model.transform(X)
selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                 index=X.index,
                                 columns=X.columns)
selected_columns = selected_features.columns[selected_features.var()!=0]
Example #10
X_train = pd.read_csv(                # training-data file path not shown in this snippet
                      index_col=0)
y_train = pd.read_csv('data/y_train.csv', index_col=0)
test_df = pd.read_csv('data/test_df_with_division_non_unique_words.csv',
                      index_col=0)

# Set the regularization parameter C=1
logistic = LogisticRegression(C=1, penalty="l1", solver="liblinear",
                              random_state=7).fit(X_train, y_train)
model = SelectFromModel(logistic, prefit=True)
print("Model trained")

X_new = model.transform(X_train)
test_new = model.transform(test_df)

# Get back the kept features as a DataFrame with dropped columns as all 0s
selected_features_train = pd.DataFrame(model.inverse_transform(X_new),
                                       index=X_train.index,
                                       columns=X_train.columns)
selected_features_test = pd.DataFrame(model.inverse_transform(test_new),
                                      index=test_df.index,
                                      columns=X_train.columns)
print("Features selected")

# Dropped columns have values of all 0s, keep other columns
selected_columns = selected_features_train.columns[
    selected_features_train.var() != 0]

selected_features_train = selected_features_train[selected_columns]
selected_features_test = selected_features_test[selected_columns]

selected_features_train.to_csv('data/selected_features_train.csv')
Example #11
#train = train.loc[:,(train != -1).any(axis=0)]

label = train.WnvPresent
train = train.drop('WnvPresent', axis=1)
sfm = SelectFromModel(LinearSVC(penalty='l1', loss='squared_hinge',
                                dual=False))
data = sfm.fit_transform(train, label)
data = preprocessing.scale(data)
#data = preprocessing.scale(train)
transformer = FunctionTransformer(np.log1p, validate=True)
transformer.transform(data)  # note: the result is not assigned, so this transform has no effect here
data = preprocessing.normalize(data, norm='l2')

feature_cols = train.columns
databackup = data
data = pd.DataFrame(sfm.inverse_transform(data),
                    index=train.index,
                    columns=feature_cols)
selCols = data.columns[data.var() != 0]
data = data[selCols]

TrainX, TestX, TrainY, TestY = train_test_split(data,
                                                label,
                                                test_size=0.2,
                                                random_state=1)
########################################################################################################################


def plotCurves(model):
    results = model.evals_result()
    epochs = len(results['validation_0']['auc'])
Example #12
    def run_grid_pipeline(self, features, labels, standardization_colms,
                          parameters, estimator,
                          feature_selection_threshold_type):

        # Preprocessing for numerical data
        numerical_transformer = StandardScaler()

        # Preprocessing for categorical data
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')

        # Bundle preprocessing for numerical and categorical data
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, standardization_colms),
                # ('cat', categorical_transformer, self.onehot_colms)
                # ], n_jobs = self.n_jobs)
            ],
            n_jobs=self.n_jobs,
            remainder='passthrough')

        feature_selection_clf = RandomForestClassifier(
            random_state=self.random_state, n_jobs=self.n_jobs)
        feature_selection_model = SelectFromModel(
            feature_selection_clf, threshold=feature_selection_threshold_type)

        grid = GridSearchCV(estimator=estimator,
                            param_grid=parameters,
                            cv=5,
                            scoring='accuracy',
                            refit=True,
                            n_jobs=-1)

        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('feature_selection', feature_selection_model),
                                   ('grid_search', grid)])

        pipeline.fit(features, labels)

        def print_results(results):
            print('BEST PARAMS: {}\n'.format(results.best_params_))

            means = results.cv_results_['mean_test_score']
            stds = results.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         results.cv_results_['params']):
                print('{} (+/-{}) for {}'.format(round(mean, 3),
                                                 round(std * 2, 3), params))

        print_results(pipeline['grid_search'])

        # print(features.columns)
        feature_selection_model = pipeline['feature_selection']
        selected_features = feature_selection_model.transform(features)
        selected_features = pd.DataFrame(
            feature_selection_model.inverse_transform(selected_features),
            index=features.index,
            columns=features.columns)
        self.selected_columns = selected_features.columns[
            selected_features.var() != 0]
        print(
            '\nColumns selected for {0} threshold'.format(
                feature_selection_threshold_type), self.selected_columns)

        # print('\nBest estimator:\n')
        # print(pipeline['grid_search'].best_estimator_)
        # print(pipeline['grid_search'].best_score_)
        # print(pipeline['grid_search'].best_params_)
        # print(pipeline['grid_search'].scorer_)

        return pipeline
Example #13
    def feature_selection(self):

        onehot_features = self.original_features
        onehot_labels = self.original_labels

        onehot_encoder = OneHotEncoder(handle_unknown='error', sparse=False)
        onehot_encoder.fit(onehot_features[self.onehot_colms])
        onehot_transformed_colms = onehot_encoder.get_feature_names(
            self.onehot_colms)
        onehot_transformed_features = onehot_encoder.transform(
            onehot_features[self.onehot_colms])
        onehot_features = onehot_features.join(
            pd.DataFrame(onehot_transformed_features,
                         index=onehot_features.index,
                         columns=onehot_transformed_colms),
            how='inner')
        # print(onehot_features.info())
        # print(onehot_transformed_colms)
        onehot_features = onehot_features.drop(columns=self.onehot_colms)
        # print(onehot_features.info())
        # print(self.original_features.loc[0:5,'Region'])
        # print(onehot_features.loc[0:5, ['Region_1', 'Region_2', 'Region_3', 'Region_4', 'Region_5', 'Region_6', 'Region_7', 'Region_8', 'Region_9'] ] )

        sss = StratifiedShuffleSplit(n_splits=1,
                                     train_size=self.train_ratio,
                                     random_state=self.random_state)
        for train_indx, test_indx in sss.split(onehot_features, onehot_labels):
            # print(len(train_indx)/len(features), len(test_indx)/len(features))
            # print('% Survived:', labels[test_indx].mean())

            # Using RandomForestClassifier gives non-linear decision boundary
            clf = RandomForestClassifier(random_state=self.random_state,
                                         n_jobs=self.n_jobs)

            # Using LogisticRegression (default L2 penalty) gives a linear decision boundary
            # clf = LogisticRegression()

            clf.fit(onehot_features.iloc[train_indx],
                    onehot_labels.iloc[train_indx])

            # Using mean threshold in SelectFromModel
            feature_selection_model = SelectFromModel(clf,
                                                      prefit=True,
                                                      threshold='mean')
            selected_features = feature_selection_model.transform(
                onehot_features.iloc[train_indx])
            selected_features = pd.DataFrame(
                feature_selection_model.inverse_transform(selected_features),
                index=onehot_features.iloc[train_indx].index,
                columns=onehot_features.iloc[train_indx].columns)
            self.selected_columns_mean = selected_features.columns[
                selected_features.var() != 0]
            print('Mean threshold:', self.selected_columns_mean)

            # Using Median threshold for SelectFromModel
            feature_selection_model = SelectFromModel(clf,
                                                      prefit=True,
                                                      threshold='median')
            selected_features = feature_selection_model.transform(
                onehot_features.iloc[train_indx])
            selected_features = pd.DataFrame(
                feature_selection_model.inverse_transform(selected_features),
                index=onehot_features.iloc[train_indx].index,
                columns=onehot_features.iloc[train_indx].columns)
            self.selected_columns_median = selected_features.columns[
                selected_features.var() != 0]
            print('Median threshold', self.selected_columns_median)
Example #14
x_train, x_devel, y_train, y_devel = train_test_split(  # feature/label arrays not shown in this snippet
                                                      test_size=0.2,
                                                      random_state=0)
N, d = np.shape(x_train)

#METHOD 0: no feature selection; all features are used in the ML model
print('No feature selection:')
train_eval_tree(x_train, y_train, x_devel, y_devel)

#METHOD 1: LASSO/L1 regularization
lsvc = LinearSVC(C=1.0, penalty='l1', dual=False,
                 max_iter=1000).fit(x_train, y_train)
svc_mod = SelectFromModel(lsvc, prefit=True)
x_train_new = svc_mod.transform(x_train)

#get the selected/most important features and extract from validation set
selected_feats = pd.DataFrame(svc_mod.inverse_transform(x_train_new),
                              index=x_train.index,
                              columns=x_train.columns)
selected_cols = selected_feats.columns[selected_feats.var() != 0]
x_devel_new = x_devel[selected_cols]

#now train and test a decision tree using these selected features
print('L1 regularization:')
train_eval_tree(x_train_new, y_train, x_devel_new, y_devel)

#METHOD 2: SelectKBest using the f_classif score
select_feats = SelectKBest(f_classif, k=10)
x_train_new = select_feats.fit_transform(x_train, y_train)
selected_feats = pd.DataFrame(select_feats.inverse_transform(x_train_new),
                              index=x_train.index,
                              columns=x_train.columns)
Example #15
# Get back the features we've kept, zero out all other features
selected_features = pd.DataFrame(selector.inverse_transform(X_new), index=train.index, columns=feature_cols)
# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns = selected_features.columns[selected_features.var() != 0]

=================================================================================
L1 regularization
Univariate methods consider only one feature at a time when making a selection decision.
Instead, we can make our selection using all of the features by including them in a linear model with L1 regularization. 
This type of regularization (sometimes called Lasso) penalizes the absolute magnitude of the coefficients, 
as compared to L2 (Ridge) regression which penalizes the square of the coefficients.
As the strength of regularization is increased, features which are less important for predicting the target are driven to 0.
This lets us perform feature selection by adjusting the regularization parameter: we choose it by finding the best
performance on a hold-out set, or we decide ahead of time how many features to keep (a sketch of such a hold-out sweep follows the snippet below).


from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

train, valid, _ = get_data_splits(baseline_data)
X, y = train[train.columns.drop("outcome")], train['outcome']
# Set the regularization parameter C=1
logistic = LogisticRegression(C=1, penalty="l1", solver="liblinear", random_state=7).fit(X, y)
model = SelectFromModel(logistic, prefit=True)
X_new = model.transform(X)
# Get back the kept features as a DataFrame with dropped columns as all 0s
selected_features = pd.DataFrame(model.inverse_transform(X_new), index=X.index, columns=X.columns)
# Dropped columns have values of all 0s, keep other columns 
selected_columns = selected_features.columns[selected_features.var() != 0]
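
The snippet above fixes C=1. To actually pick C by hold-out performance, as the paragraph above suggests, one option is a small sweep over candidate values. The sketch below assumes the get_data_splits helper and baseline_data frame from Example #8; the candidate grid and the plain logistic-regression scorer are illustrative choices, not part of the original code.

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics

train, valid, _ = get_data_splits(baseline_data)
X, y = train[train.columns.drop("outcome")], train["outcome"]

best_c, best_score, best_cols = None, -1.0, None
for c in [0.01, 0.1, 1, 10]:
    logistic = LogisticRegression(C=c, penalty="l1", solver="liblinear",
                                  random_state=7).fit(X, y)
    # Columns whose L1-regularized coefficients were not shrunk to zero
    kept = X.columns[SelectFromModel(logistic, prefit=True).get_support()]
    if len(kept) == 0:
        continue  # every coefficient was shrunk to zero at this C
    # Score this feature subset on the hold-out (validation) split
    scorer = LogisticRegression(max_iter=1000).fit(X[kept], y)
    score = metrics.roc_auc_score(valid["outcome"],
                                  scorer.predict_proba(valid[kept])[:, 1])
    if score > best_score:
        best_c, best_score, best_cols = c, score, kept

print("Best C: {}, validation AUC: {:.4f}".format(best_c, best_score))
print("Selected columns:", list(best_cols))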