import numpy as np
from numpy.testing import assert_almost_equal
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from stability_selection import StabilitySelection, RandomizedLasso


def test_with_stratified_bootstrap():
    n, p, k = 1000, 1000, 5

    X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
    selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25),
                                  verbose=1,
                                  bootstrap_func='stratified')
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)
    assert_almost_equal(important_betas, chosen_betas)
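
# The tests here rely on data-generation helpers from the package's test
# suite. A plausible sketch of _generate_dummy_classification_data (an
# assumption, not a verified copy): n samples, p standard-normal features,
# only k of them carrying signal through a logistic link.
def _generate_dummy_classification_data(p=1000, n=1000, k=5,
                                        random_state=123321):
    rng = np.random.RandomState(random_state)
    X = rng.normal(loc=0.0, scale=1.0, size=(n, p))
    betas = np.zeros(p)
    important_betas = np.sort(rng.choice(a=np.arange(p), size=k))
    betas[important_betas] = rng.uniform(size=k)
    probs = 1.0 / (1.0 + np.exp(-np.matmul(X, betas)))
    y = (probs > 0.5).astype(int)
    return X, y, important_betas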


def test_stability_selection_classification():
    n, p, k = 1000, 1000, 5

    X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
    selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25),
                                  verbose=1)
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)
    X_r = selector.transform(X)

    assert_almost_equal(important_betas, chosen_betas)
    assert (X_r.shape == (n, k))
    assert (selector.stability_scores_.shape == (
        p, selector.lambda_grid.shape[0]))


    def stability_selection(self, X, y):
        """
        Wrapper around the stability-selection package, which implements the
        stability selection feature-selection algorithm [1]. Bootstrapping
        can alternatively be performed with complementary pairs
        subsampling [2].

        https://github.com/scikit-learn-contrib/stability-selection

        [1] Meinshausen, N. and Bühlmann, P., 2010. Stability selection.
            Journal of the Royal Statistical Society: Series B (Statistical
            Methodology), 72(4), pp.417-473.
        [2] Shah, R.D. and Samworth, R.J., 2013. Variable selection with
            error control: another look at stability selection. Journal of
            the Royal Statistical Society: Series B (Statistical
            Methodology), 75(1), pp.55-80.
        """
        # Constructor arguments for StabilitySelection; any value supplied
        # in self.fit_params overrides the default shown here.
        init_params_dic = {
            'base_estimator': self.fit_params.get(
                'base_estimator',
                LogisticRegression(penalty='l1', solver='liblinear')),
            'lambda_name': self.fit_params.get('lambda_name', 'C'),
            'lambda_grid': self.fit_params.get('lambda_grid',
                                               np.logspace(-5, -2, 25)),
            'n_bootstrap_iterations': self.fit_params.get(
                'n_bootstrap_iterations', self.n_bsamples),
            'sample_fraction': self.fit_params.get('sample_fraction', 0.5),
            'threshold': self.fit_params.get('threshold', 0.6),
            'bootstrap_func': self.fit_params.get('bootstrap_func',
                                                  'subsample'),
            'bootstrap_threshold': self.fit_params.get('bootstrap_threshold',
                                                       None),
            'verbose': self.fit_params.get('verbose', 0),
            'n_jobs': self.fit_params.get('n_jobs', 1),
            'pre_dispatch': self.fit_params.get('pre_dispatch', '2*n_jobs'),
            'random_state': self.fit_params.get('random_state',
                                                self.random_state),
        }

        # Remove constructor arguments from the copied fit parameters,
        # leaving only genuine fit-time options.
        fit_params = self.fit_params.copy()
        for param in init_params_dic:
            fit_params.pop(param, None)

        feature_selector = StabilitySelection(**init_params_dic)
        feature_selector.fit(X, y)
        self.accepted_features_index = feature_selector.get_support(
            indices=True)
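
# For reference, the wrapper's defaults above amount to calling the package
# directly like this (a sketch: the demo data and the omitted
# n_bootstrap_iterations, which the wrapper takes from self.n_bsamples, are
# assumptions added so the example runs end to end):
X_demo, y_demo, _ = _generate_dummy_classification_data(n=500, k=5)
selector = StabilitySelection(
    base_estimator=LogisticRegression(penalty='l1', solver='liblinear'),
    lambda_name='C',
    lambda_grid=np.logspace(-5, -2, 25),
    sample_fraction=0.5,
    threshold=0.6,
    bootstrap_func='subsample',
)
selector.fit(X_demo, y_demo)
accepted = selector.get_support(indices=True)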


def test_stability_selection_regression():
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([('scaler', StandardScaler()),
                               ('model', Lasso())])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)

    assert_almost_equal(important_betas, chosen_betas)
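
# A matching sketch of the regression helper used above (again an
# assumption about the test suite, not a verified copy): the same random
# design, but y is the noiseless linear response.
def _generate_dummy_regression_data(p=1000, n=1000, k=5,
                                    random_state=123321):
    rng = np.random.RandomState(random_state)
    X = rng.normal(loc=0.0, scale=1.0, size=(n, p))
    betas = np.zeros(p)
    important_betas = np.sort(rng.choice(a=np.arange(p), size=k))
    betas[important_betas] = rng.uniform(size=k)
    y = np.matmul(X, betas)
    return X, y, important_betas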


def test_randomized_lasso():
    n, p = 200, 200
    rho = 0.6
    weakness = 0.2

    X, y = generate_experiment_data(n, p, rho)
    lambda_grid = np.linspace(0.01, 0.5, num=100)

    estimator = RandomizedLasso(weakness=weakness)
    selector = StabilitySelection(base_estimator=estimator,
                                  lambda_name='alpha',
                                  lambda_grid=lambda_grid,
                                  threshold=0.9,
                                  verbose=1)
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)

    assert_almost_equal(np.array([0, 1]), chosen_betas)
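
# test_randomized_lasso presumes a design in which only the first two
# coefficients are truly nonzero and a third feature is correlated (rho)
# with both of them; the `weakness` randomization is what lets stability
# selection still recover exactly {0, 1}. A plausible sketch of
# generate_experiment_data (an assumption, not a verified copy):
def generate_experiment_data(n=200, p=200, rho=0.6, random_state=3245):
    rng = np.random.RandomState(random_state)
    sigma = np.eye(p)
    sigma[0, 2] = sigma[2, 0] = rho  # feature 2 correlated with feature 0
    sigma[1, 2] = sigma[2, 1] = rho  # ... and with feature 1
    X = rng.multivariate_normal(mean=np.zeros(p), cov=sigma, size=(n,))
    beta = np.zeros(p)
    beta[:2] = 1.0  # only the first two coefficients are nonzero
    y = np.matmul(X, beta) + rng.normal(loc=0.0, scale=0.25, size=(n,))
    return X, y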

    # The definitions that preceded this fragment were lost in extraction;
    # the imports and the L1-penalized model below are assumptions
    # consistent with lambda_name='model__C' and the C grid that follow.
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from stability_selection import StabilitySelection, plot_stability_path

    model = LogisticRegression(penalty='l1', solver='liblinear')
    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    selector = StabilitySelection(base_estimator=base_estimator, lambda_name='model__C',
                                  lambda_grid=np.logspace(-5, -1, 50))
    selector.fit(X, y)

    fig, ax = plot_stability_path(selector)
    fig.show()
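    # Each curve in the stability path shows, for one variable, the fraction
    # of bootstrap fits that selected it as the regularization strength
    # varies; the stable set is the variables whose curves cross `threshold`.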

    selected_variables = selector.get_support(indices=True)
    selected_scores = selector.stability_scores_.max(axis=1)

    print('Selected variables are:')
    print('-----------------------')

    for idx, (variable, score) in enumerate(zip(selected_variables, selected_scores[selected_variables])):
        print('Variable %d: [%d], score %.3f' % (idx + 1, variable, score))



def stepwise_feature_selection(model, df, original_columns, target):

    train_columns = list(df.columns[13:])
    useful_columns = []
    not_useful_columns = []
# Extract names of each column (using pandas)
headers = np.array(list(data.columns.values))
names = headers[2:]
# print("Feature names shape is {}".format(names.shape))

# Extract features (using pandas and numpy)
np_array = data.to_numpy()
X = np_array[:, 2:]
# print("Features shape is {}".format(X.shape))

# Extract labels (using pandas)
Y = data['class_label'].to_numpy()

lambda_grid = np.linspace(0.001, 0.5, num=100)  # unused; the selector below supplies its own grid
base_estimator = Pipeline([('scaler', StandardScaler()),
                           ('model', LogisticRegression(penalty='l1',
                                                        solver='liblinear'))])
selector = StabilitySelection(base_estimator=base_estimator,
                              lambda_name='model__C',
                              lambda_grid=np.logspace(-5, -1, 50))
selector.fit(X, Y)
selected_variables = selector.get_support(indices=True, threshold=0.00001)
selected_scores = selector.stability_scores_.max(axis=1)

print('Ranking is:')
print('-----------------------')

# Rank every feature by its maximum stability score across the lambda grid,
# including features that never cross the selection threshold:
# for idx, (variable, score) in enumerate(zip(selected_variables, selected_scores[selected_variables])):
#     print('Variable %d: [%d], score %.3f' % (idx + 1, variable, score))
rank = sorted(zip(selected_scores, names), reverse=True)
for el in rank:
    print(el)




    binary_data = pd.concat([binary_data, dummy], axis=1)

train_x = np.array(binary_data.drop(columns=['outcome']))
train_y = np.array(binary_data['outcome']).astype('int')

#Feature Selection
from stability_selection import StabilitySelection
from sklearn.svm import LinearSVC

lsvc = LinearSVC()
selector = StabilitySelection(base_estimator=lsvc).fit(train_x, train_y)
train_x_name = binary_data.drop(columns=['outcome']).columns
# Average each feature's stability score over the lambda grid and keep the
# 30 highest-scoring features.
score_table = pd.DataFrame(np.mean(selector.stability_scores_, axis=1),
                           index=train_x_name, columns=['Scores'])
score_table = score_table.sort_values(['Scores'], ascending=False)[0:30]

train_x_varselect = binary_data[train_x_name[selector.get_support()]]

#Modeling
from sklearn.linear_model import LogisticRegression
Logistic_model = LogisticRegression()
Logistic_model.fit(train_x_varselect, train_y)

#CV
from sklearn.model_selection import cross_val_score
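# cross_val_score defaults to 5-fold cross-validation (scikit-learn >= 0.22)
# and returns one score per fold; np.mean collapses the folds into a single
# estimate.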
Logistic_scores = np.mean(cross_val_score(Logistic_model, train_x_varselect, train_y))

###############################################################################################

#Tree Model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier