def test_with_stratified_bootstrap():
    """Stratified bootstrap sampling should still recover the true support."""
    n_samples, n_features, n_informative = 1000, 1000, 5
    X, y, important_betas = _generate_dummy_classification_data(
        n=n_samples, k=n_informative)

    selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25),
                                  verbose=1,
                                  bootstrap_func='stratified')
    selector.fit(X, y)

    assert_almost_equal(important_betas, selector.get_support(indices=True))
def test_issparse():
    """The selector should accept a scipy CSR sparse design matrix."""
    n_samples, n_features = 200, 200
    rho = 0.6
    weakness = 0.2
    X, y = generate_experiment_data(n_samples, n_features, rho)

    alphas = np.linspace(0.01, 0.5, num=100)
    selector = StabilitySelection(
        base_estimator=RandomizedLasso(weakness=weakness),
        lambda_name='alpha',
        lambda_grid=alphas,
        threshold=0.9,
        verbose=1)
    selector.fit(csr_matrix(X), y)
def test_stability_plot():
    """plot_stability_path should run on a fitted selector without error."""
    n_samples, n_features, n_informative = 500, 200, 5
    X, y, _ = _generate_dummy_regression_data(n=n_samples, k=n_informative)

    pipeline = Pipeline([('scaler', StandardScaler()), ('model', Lasso())])
    selector = StabilitySelection(base_estimator=pipeline,
                                  lambda_name='model__alpha',
                                  lambda_grid=np.logspace(-1, 1, num=10))
    selector.fit(X, y)

    plot_stability_path(selector, threshold_highlight=0.5)
def test_no_features():
    """With zero informative features, transform yields an (n, 0) array."""
    n_samples, n_features, n_informative = 100, 200, 0
    X, y, _ = _generate_dummy_regression_data(n=n_samples, k=n_informative)

    pipeline = Pipeline([('scaler', StandardScaler()), ('model', Lasso())])
    selector = StabilitySelection(base_estimator=pipeline,
                                  lambda_name='model__alpha',
                                  lambda_grid=np.logspace(-1, 1, num=10))
    selector.fit(X, y)

    expected = np.empty(0).reshape((X.shape[0], 0))
    assert_almost_equal(selector.transform(X), expected)
def stability_selection(self, X, y):
    """Select features with the stability-selection package.

    Wrapper around the stability selection feature selection algorithm [1].
    Bootstrap can be performed using complementary pairs subsampling [2].
    https://github.com/scikit-learn-contrib/stability-selection

    Each ``StabilitySelection`` constructor argument is taken from
    ``self.fit_params`` when present, otherwise the default below is used.
    The indices of the accepted features are stored on
    ``self.accepted_features_index``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data.
    y : array-like of shape (n_samples,)
        Target values.

    [1]: Meinshausen, N. and Buhlmann, P., 2010. Stability selection.
         Journal of the Royal Statistical Society: Series B (Statistical
         Methodology), 72(4), pp.417-473.
    [2]: Shah, R.D. and Samworth, R.J., 2013. Variable selection with error
         control: another look at stability selection. Journal of the Royal
         Statistical Society: Series B (Statistical Methodology), 75(1),
         pp.55-80.
    """
    init_params = {
        'base_estimator': self.fit_params.get(
            'base_estimator',
            LogisticRegression(penalty='l1', solver='liblinear')),
        'lambda_name': self.fit_params.get('lambda_name', 'C'),
        'lambda_grid': self.fit_params.get('lambda_grid',
                                           np.logspace(-5, -2, 25)),
        'n_bootstrap_iterations': self.fit_params.get(
            'n_bootstrap_iterations', self.n_bsamples),
        'sample_fraction': self.fit_params.get('sample_fraction', 0.5),
        'threshold': self.fit_params.get('threshold', 0.6),
        'bootstrap_func': self.fit_params.get('bootstrap_func', 'subsample'),
        'bootstrap_threshold': self.fit_params.get('bootstrap_threshold',
                                                   None),
        'verbose': self.fit_params.get('verbose', 0),
        'n_jobs': self.fit_params.get('n_jobs', 1),
        'pre_dispatch': self.fit_params.get('pre_dispatch', '2*n_jobs'),
        'random_state': self.fit_params.get('random_state',
                                            self.random_state),
    }
    # NOTE(review): the original also built a copy of self.fit_params with
    # the constructor keys popped out, but never used the result —
    # StabilitySelection.fit(X, y) takes no extra fit parameters — so that
    # dead computation is removed here.
    feature_selector = StabilitySelection(**init_params)
    feature_selector.fit(X, y)
    self.accepted_features_index = feature_selector.get_support(indices=True)
def test_stability_selection_regression():
    """The regression pipeline should recover the informative coefficients."""
    n_samples, n_features, n_informative = 500, 1000, 5
    X, y, important_betas = _generate_dummy_regression_data(
        n=n_samples, k=n_informative)

    pipeline = Pipeline([('scaler', StandardScaler()), ('model', Lasso())])
    selector = StabilitySelection(base_estimator=pipeline,
                                  lambda_name='model__alpha',
                                  lambda_grid=np.logspace(-1, 1, num=10))
    selector.fit(X, y)

    assert_almost_equal(important_betas, selector.get_support(indices=True))
def test_randomized_lasso():
    """RandomizedLasso as base estimator should pick the two true features."""
    n_samples, n_features = 200, 200
    rho = 0.6
    weakness = 0.2
    X, y = generate_experiment_data(n_samples, n_features, rho)

    alphas = np.linspace(0.01, 0.5, num=100)
    selector = StabilitySelection(
        base_estimator=RandomizedLasso(weakness=weakness),
        lambda_name='alpha',
        lambda_grid=alphas,
        threshold=0.9,
        verbose=1)
    selector.fit(X, y)

    assert_almost_equal(np.array([0, 1]), selector.get_support(indices=True))
def test_different_shape():
    """Transforming data with a different feature count than fit-time data."""
    n_samples, n_features, n_informative = 100, 200, 5
    X, y, _ = _generate_dummy_regression_data(n=n_samples, k=n_informative)

    pipeline = Pipeline([('scaler', StandardScaler()), ('model', Lasso())])
    selector = StabilitySelection(base_estimator=pipeline,
                                  lambda_name='model__alpha',
                                  lambda_grid=np.logspace(-1, 1, num=10))
    selector.fit(X, y)

    # Two fewer columns than the matrix the selector was fitted on.
    selector.transform(X[:, :-2])
def test_stability_selection_classification():
    """End-to-end classification: support, transformed shape, score shape."""
    n, p, k = 1000, 1000, 5
    X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)

    selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25),
                                  verbose=1)
    selector.fit(X, y)

    chosen = selector.get_support(indices=True)
    reduced = selector.transform(X)

    assert_almost_equal(important_betas, chosen)
    assert reduced.shape == (n, k)
    assert selector.stability_scores_.shape == (p,
                                                selector.lambda_grid.shape[0])
# Data transforming: one-hot encode every categorical column, dropping the
# first level of each (avoids the dummy-variable trap) and replacing the
# original column in binary_data.
for cat_col in all_cate_var:
    encoded = pd.get_dummies(binary_data[cat_col], prefix=cat_col)
    encoded.drop(encoded.columns[0], axis=1, inplace=True)
    binary_data.drop(cat_col, axis=1, inplace=True)
    binary_data = pd.concat([binary_data, encoded], axis=1)

train_x = np.array(binary_data.drop(columns=['outcome']))
train_y = np.array(binary_data['outcome']).astype('int')

# Feature Selection
from stability_selection import StabilitySelection
from sklearn.svm import LinearSVC

selector = StabilitySelection(base_estimator=LinearSVC()).fit(train_x, train_y)

train_x_name = binary_data.drop(columns=['outcome']).columns
# Average stability score per feature across the lambda grid; keep the top 30
# for inspection.
mean_scores = np.mean(selector.stability_scores_, axis=1)
score_table = pd.DataFrame(mean_scores, train_x_name, columns=['Scores'])
score_table = score_table.sort_values(['Scores'], ascending=False)[0:30]

train_x_varselect = binary_data[train_x_name[selector.get_support()]]

# Modeling
from sklearn.linear_model import LogisticRegression

Logistic_model = LogisticRegression()
Logistic_model.fit(train_x_varselect, train_y)

# CV
from sklearn.model_selection import cross_val_score

Logistic_scores = np.mean(cross_val_score(Logistic_model,
                                          train_x_varselect, train_y))
def test_check_string_threshold():
    """A non-numeric threshold must be rejected by input validation."""
    selector = StabilitySelection(threshold='wrong_value')
    selector._validate_input()
def test_check_threshold_too_large():
    """A threshold greater than 1 must be rejected by input validation."""
    selector = StabilitySelection(threshold=1.5)
    selector._validate_input()
from stability_selection import StabilitySelection

# Build the classifier named by ``model``.
if model == 'LogisticRegression':
    # BUG FIX: 'auto' is not a valid LogisticRegression solver, and the l1
    # penalty requires 'liblinear' (or 'saga'); random_state had no value at
    # all (a syntax error) — use None, matching the ridge branch.
    clf = LogisticRegression(penalty='l1', class_weight='balanced',
                             solver='liblinear', random_state=None)
if model == 'RidgeClassifier':
    # BUG FIX: this branch was guarded by the same 'LogisticRegression'
    # condition as the one above, so RidgeClassifier could never be selected.
    clf = RidgeClassifier(alpha=1.0, class_weight='balanced',
                          solver='auto', random_state=None)
self.model = clf

# BUG FIX: the pipeline step must receive the estimator instance, not the
# ``model`` name string.
# NOTE(review): lambda_name='model__C' assumes the logistic model;
# RidgeClassifier exposes 'alpha', not 'C' — confirm against callers.
base_estimator = Pipeline([
    ('scaler', StandardScaler()),
    ('model', clf)
])
selector = StabilitySelection(base_estimator=base_estimator,
                              lambda_name='model__C',
                              lambda_grid=np.logspace(-5, -1, 50))
selector.fit(X, y)

fig, ax = plot_stability_path(selector)
fig.show()

selected_variables = selector.get_support(indices=True)
selected_scores = selector.stability_scores_.max(axis=1)

print('Selected variables are:')
print('-----------------------')
for idx, (variable, score) in enumerate(
        zip(selected_variables, selected_scores[selected_variables])):
    print('Variable %d: [%d], score %.3f' % (idx + 1, variable, score))
    # (fragment — the enclosing ``def generate_experiment_data`` header lies
    # outside this view; indentation reconstructed accordingly)
    sigma[2, 1] = rho
    # Draw n correlated Gaussian samples; only the first two features carry
    # signal (beta[:2] = 1), plus N(0, 0.25) observation noise.
    X = rng.multivariate_normal(mean=np.zeros(p), cov=sigma, size=(n, ))
    beta = np.zeros(p)
    beta[:2] = 1.0
    epsilon = rng.normal(0.0, 0.25, size=(n, ))
    y = np.matmul(X, beta) + epsilon
    return X, y


if __name__ == '__main__':
    n, p = 200, 200
    rho = 0.6
    # NOTE(review): n, p and rho are set here but not passed to
    # generate_experiment_data() — presumably the function's defaults match
    # these values; confirm, or pass them explicitly.
    X, y = generate_experiment_data()
    lambda_grid = np.linspace(0.001, 0.5, num=100)
    # Plot a stability path for each randomized-lasso weakness setting.
    for weakness in [0.2, 0.5, 1.0]:
        estimator = RandomizedLasso(weakness=weakness)
        selector = StabilitySelection(base_estimator=estimator,
                                      lambda_name='alpha',
                                      lambda_grid=lambda_grid,
                                      threshold=0.9, verbose=1)
        selector.fit(X, y)
        fig, ax = plot_stability_path(selector)
        fig.show()
def test_check_arguments():
    """Invalid constructor arguments must be caught by _validate_input."""
    selector = StabilitySelection(threshold='wrong_value')
    selector._validate_input()
def test_lambda_name():
    """A lambda_name the base estimator does not expose must fail validation."""
    selector = StabilitySelection(lambda_name='n_estimators')
    selector._validate_input()
def test_sample_fraction():
    """A sample_fraction of zero must be rejected by input validation."""
    selector = StabilitySelection(sample_fraction=0.0)
    selector._validate_input()
def test_callable_bootstrap_func():
    """A bootstrap_func that is neither a string nor callable must fail."""
    selector = StabilitySelection(bootstrap_func=0.5)
    selector._validate_input()
def test_check_threshold_too_small():
    """A threshold of zero must be rejected by input validation."""
    selector = StabilitySelection(threshold=0.0)
    selector._validate_input()
def test_automatic_lambda_grid():
    """When no grid is given, validation installs logspace(-5, -2, 25)."""
    selector = StabilitySelection()
    selector._validate_input()

    expected_grid = np.logspace(-5, -2, 25)
    assert_array_equal(expected_grid, selector.lambda_grid)
def test_check_wrong_lambda_name():
    """A negative n_bootstrap_iterations must be rejected by validation."""
    # NOTE(review): despite the function name, this case exercises
    # n_bootstrap_iterations (and duplicates the name of the lambda_name
    # test below) — behavior preserved here.
    selector = StabilitySelection(n_bootstrap_iterations=-1)
    selector._validate_input()
def test_check_wrong_lambda_name():
    """lambda_name='alpha' must fail when the default estimator lacks it."""
    selector = StabilitySelection(lambda_name='alpha')
    selector._validate_input()
def test_transformer():
    """Run sklearn's estimator compliance checks against StabilitySelection."""
    # With the defaults this can fail: in the low-sample-size checks some
    # bootstrap draws can contain zero positive-class cases, so use few
    # iterations and the full sample.
    estimator = StabilitySelection(n_bootstrap_iterations=10,
                                   sample_fraction=1.0)
    return check_estimator(estimator)
def test_bootstrap_func():
    """An unknown bootstrap_func name must be rejected by validation."""
    selector = StabilitySelection(bootstrap_func='nonexistent')
    selector._validate_input()
def test_check_threshold_too_small():
    """get_support must reject a non-numeric threshold override."""
    # NOTE(review): duplicates the name of the constructor-threshold test —
    # this one targets get_support's threshold argument instead.
    selector = StabilitySelection()
    selector.get_support(threshold='wrong_value')