def test_sparse_support():
    """Check that the selector can be fitted on, and transform, sparse input."""
    dense_X, y = make_regression(n_features=10)
    sparse_X = scipy.sparse.csr_matrix(dense_X)

    selector = SequentialFeatureSelector(LinearRegression(), cv=2)
    selector.fit(sparse_X, y)
    selector.transform(sparse_X)
def test_nan_support():
    """Check that NaNs are accepted when the wrapped estimator accepts them.

    The selector is first fitted with an estimator that handles missing
    values, then with ``LinearRegression``, which is expected to reject them
    with a ``ValueError``.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X, y = make_regression(n_samples, n_features, random_state=0)

    # Blank out a random subset of the entries.
    missing = rng.randint(0, 2, size=(n_samples, n_features), dtype=bool)
    X[missing] = np.nan

    nan_tolerant = SequentialFeatureSelector(HistGradientBoostingRegressor(), cv=2)
    nan_tolerant.fit(X, y)
    nan_tolerant.transform(X)

    # LinearRegression does not support nans
    nan_intolerant = SequentialFeatureSelector(LinearRegression(), cv=2)
    with pytest.raises(ValueError, match='Input contains NaN'):
        nan_intolerant.fit(X, y)
def run_sfs(x, y, output=None, caption=''):
    """Reduce ``x`` with sequential feature selection around a k-NN classifier.

    Parameters
    ----------
    x : object exposing ``.columns`` (a pandas DataFrame is assumed -- the
        selected column labels are taken from ``x.columns``).
    y : target values passed to the selector's ``fit``.
    output : directory in which ``{caption}-sfs.csv`` is written. When
        ``None`` (the default) nothing is written to disk.
    caption : prefix of the CSV file name.

    Returns
    -------
    pandas.DataFrame holding only the selected columns.
    """
    sfs = SequentialFeatureSelector(estimator=KNeighborsClassifier())
    sfs.fit(x, y)

    x_reduced = pd.DataFrame(sfs.transform(x), columns=x.columns[sfs.support_])
    print(f'reduced columns: {x_reduced.columns}')

    # Without this guard the default ``output=None`` was formatted into the
    # path, producing the literal directory name "None".
    if output is not None:
        x_reduced.to_csv(f'{output}/{caption}-sfs.csv', index=False)
    return x_reduced
def select_greedy(data):
    """Run greedy (sequential) feature selection with an RBF support vector regressor.

    ``data`` is unpacked as ``(X, X_test, y)``. The selection direction and
    the number of features to keep are read from the names ``direction`` and
    ``n_features``, which are not defined in this function (presumably
    module-level settings -- verify against the rest of the file).

    The boolean support mask is dumped to ``"joblib/greedy_support"`` and
    printed together with the elapsed selection time.

    Returns
    -------
    tuple ``(X_selected, X_test_selected, y)``.
    """
    X, X_test, y = data

    # The estimator is handed over unfitted: SequentialFeatureSelector clones
    # it internally, so fitting it on the full data beforehand is wasted work.
    svr = svm.SVR(kernel="rbf", C=100, tol=1)

    tic = time()
    select = SequentialFeatureSelector(
        svr,
        direction=direction,
        n_features_to_select=n_features,
        n_jobs=-1,
    ).fit(X, y)
    toc = time()

    support = select.get_support()
    joblib.dump(support, "joblib/greedy_support")
    print(f"features selected: {support}")
    print(f"done in: {toc - tic:.2f}s")
    return select.transform(X), select.transform(X_test), y
def test_pipeline_support():
    """Check both nestings: a pipeline inside the selector and the selector inside a pipeline."""
    n_samples, n_features = 50, 3
    X, y = make_regression(n_samples, n_features, random_state=0)

    # pipeline in SFS
    inner_pipeline = make_pipeline(StandardScaler(), LinearRegression())
    selector = SequentialFeatureSelector(inner_pipeline, cv=2)
    selector.fit(X, y)
    selector.transform(X)

    # SFS in pipeline
    selector = SequentialFeatureSelector(LinearRegression(), cv=2)
    outer_pipeline = make_pipeline(StandardScaler(), selector)
    outer_pipeline.fit(X, y)
    outer_pipeline.transform(X)
def test_unsupervised_model_fit(n_features_to_select):
    """Check that the selector can be fitted without labels around an unsupervised model.

    Models without classification labels must not be validated, so ``fit``
    is called with ``X`` only.
    """
    X, _ = make_blobs(n_features=6)
    selector = SequentialFeatureSelector(
        KMeans(),
        n_features_to_select=n_features_to_select,
    )
    selector.fit(X)

    n_kept = selector.transform(X).shape[1]
    assert n_kept == n_features_to_select
def test_n_features_to_select(direction, n_features_to_select):
    """Check that the requested number of features is the number actually selected."""
    X, y = make_regression(n_features=10)
    selector = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select=n_features_to_select,
        direction=direction,
        cv=2,
    )
    selector.fit(X, y)

    # With None the test expects half of the 10 features (n_features // 2).
    expected = 5 if n_features_to_select is None else n_features_to_select

    assert selector.get_support(indices=True).shape[0] == expected
    assert selector.n_features_to_select_ == expected
    assert selector.transform(X).shape[1] == expected
def test_n_features_to_select_auto(direction):
    """Check `n_features_to_select="auto"` combined with a `tol` value.

    The number of selected features must stay below the total number of
    features and must agree with the fitted `n_features_to_select_`.
    """
    n_features = 10
    tol = 1e-3
    X, y = make_regression(n_features=n_features, random_state=0)

    selector = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=tol,
        direction=direction,
        cv=2,
    )
    selector.fit(X, y)

    upper_bound = n_features - 1
    n_support = selector.get_support(indices=True).shape[0]

    assert n_support <= upper_bound
    assert selector.n_features_to_select_ <= upper_bound
    assert selector.transform(X).shape[1] <= upper_bound
    assert selector.get_support(indices=True).shape[0] == selector.n_features_to_select_
'AY_Dias_passive', 'AZ_Dias_passive', 'AT_Dias_passive' ] sfs1 = SequentialFeatureSelector(reg_model, n_features_to_select=5, direction='forward', scoring='neg_root_mean_squared_error') sfs1.fit(X_train_test, y_train_test) select_features = sfs1.get_feature_names_out(input_features=feature_set) #print selected features print('Selected features: ') print(select_features) print('\n') #transform the data to only use the selected features for the model development X_select_train_test = sfs1.transform(X_train_test) X_select_val = sfs1.transform(X_val) X_train_test = X_select_train_test X_val = X_select_val ####get feature importance with SVR########## fit = reg_model.fit(X_train_test, y_train_test) weights = fit.coef_ print('Feature Weights: ') print(weights) print('\n') ##############perform grid search on training-testing set############ estimator = SVR() #regression model for this analysis param_dist = {
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(security): unpickling can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def main():
    """Evaluate previously pickled models on the NEWS test data.

    Steps, in order:

    1. Read the training/test CSV files and load the pickled models.
    2. Report a trivial (mean of the training targets) and a baseline
       prediction, then score the models on the raw test features.
    3. Pre-process the features column group by column group (pass-through,
       z-score, L2 normalisation, min-max) and score again.
    4. Score once more after each feature-selection method: univariate
       (``f_regression`` and ``mutual_info_regression``), recursive feature
       elimination and backward sequential feature selection, each keeping
       50 features.

    Raises
    ------
    FileNotFoundError
        If one of the model pickle files is missing.
    """
    train_data_file_name = "NEWS_Training_data.csv"
    train_label_file_name = "NEWS_Training_label.csv"
    test_data_file_name = "NEWS_Test_data.csv"
    test_label_file_name = "NEWS_Test_label.csv"

    tr_data_frame_X = read_csv(train_data_file_name)
    tr_data_frame_y = read_csv(train_label_file_name)
    te_data_frame_X = read_csv(test_data_file_name)
    te_data_frame_y = read_csv(test_label_file_name)

    alias = test_data_file_name.split('/')[-1].split('_')[0]
    model_filename = "model_{}.pkl".format(alias)
    prefix = "rfe"  # prefix to open the models
    try:
        linear_reg_model = _load_pickle(model_filename)
        lasso_reg_model = _load_pickle(prefix + "lasso_reg_model.pkl")
        logistic_reg_model = _load_pickle(prefix + "logistic_reg_model.pkl")
        extra_forest_reg_model = _load_pickle(
            prefix + "extra_forest_reg_model.pkl")
        svm_reg_model = _load_pickle(prefix + "svm_reg_model.pkl")
        knn_reg_model = _load_pickle(prefix + "knn_reg_model.pkl")
    except FileNotFoundError as err:
        # Only a missing file is reported as such; any other failure (e.g. a
        # corrupt pickle) propagates with its own exception type.
        print(
            "ERROR: You must provide model.pkl file in the current directory.")
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                err.filename or model_filename) from err

    X_train, y_train = toNumpy(tr_data_frame_X, tr_data_frame_y)
    X_test, y_test = toNumpy(te_data_frame_X, te_data_frame_y)
    print(X_test.shape[0])

    def evaluate(features):
        """Score every loaded model on ``features`` against the test targets."""
        measure_model(lasso_reg_model,
                      logistic_reg_model,
                      extra_forest_reg_model,
                      svm_reg_model,
                      knn_reg_model,
                      X_test=features,
                      y_test=y_test)

    trivial_y_pred = np.repeat(np.mean(y_train), y_test.shape[0])
    baseline_y_pred = model_predict(linear_reg_model, X_test)
    measure_performance(y_pred=trivial_y_pred,
                        y_test=y_test,
                        mode="trivial testing")
    measure_performance(y_pred=baseline_y_pred,
                        y_test=y_test,
                        mode="baseline testing")
    evaluate(X_test)

    # Column indices of each pre-processing group. Together the four lists
    # cover columns 0..57 exactly once.
    one_hot_cols = [11, 12, 13, 14, 15, 16, 29, 30, 31, 32, 33, 34, 35, 36]
    z_score_cols = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 37, 42, 44, 45, 46, 47, 48, 49,
        50, 54, 56, 57
    ]
    normal_cols = [17, 19, 38, 39, 40, 41, 43, 51, 52, 53, 55]
    min_max_cols = [20, 21, 22, 23, 24, 25, 26, 27, 28]

    prepro_tr_X_data = np.zeros((X_train.shape[0], X_train.shape[1]),
                                dtype='float')
    prepro_te_X_data = np.zeros((X_test.shape[0], X_test.shape[1]),
                                dtype='float')

    # The lists hold COLUMN indices, so every selection uses ``[:, cols]``.
    # Plain ``X[cols]`` would pick rows of a 2-D array instead.
    # NOTE(review): assumes toNumpy returns 2-D numpy arrays -- confirm.
    prepro_tr_X_data[:, one_hot_cols] = X_train[:, one_hot_cols]
    prepro_te_X_data[:, one_hot_cols] = X_test[:, one_hot_cols]

    # Standardize: statistics are learned on the training data only.
    z_score_scaler = StandardScaler()
    z_score_scaler.fit(X_train[:, z_score_cols])
    prepro_tr_X_data[:, z_score_cols] = z_score_scaler.transform(
        X_train[:, z_score_cols])
    prepro_te_X_data[:, z_score_cols] = z_score_scaler.transform(
        X_test[:, z_score_cols])

    # Normalize each selected column to unit L2 norm.
    # NOTE(review): train and test are normalized independently of each
    # other here -- confirm this is intended.
    prepro_tr_X_data[:, normal_cols] = normalize(X_train[:, normal_cols],
                                                 norm='l2',
                                                 axis=0)
    prepro_te_X_data[:, normal_cols] = normalize(X_test[:, normal_cols],
                                                 norm='l2',
                                                 axis=0)

    # MinMax scale: ranges are learned on the training data only.
    minmax_scaler = MinMaxScaler()
    minmax_scaler.fit(X_train[:, min_max_cols])
    prepro_tr_X_data[:, min_max_cols] = minmax_scaler.transform(
        X_train[:, min_max_cols])
    prepro_te_X_data[:, min_max_cols] = minmax_scaler.transform(
        X_test[:, min_max_cols])
    evaluate(prepro_te_X_data)

    # Univariate Feature Selection (UFS), F-test scoring
    usf_f = SelectKBest(f_regression, k=50)
    usf_f.fit(prepro_tr_X_data, y_train)
    ufs_f_te_X_data = usf_f.transform(prepro_te_X_data)
    evaluate(ufs_f_te_X_data)

    # Univariate Feature Selection (UFS), mutual-information scoring
    usf_mu = SelectKBest(mutual_info_regression, k=50)
    usf_mu.fit(prepro_tr_X_data, y_train)
    ufs_mu_te_X_data = usf_mu.transform(prepro_te_X_data)
    evaluate(ufs_mu_te_X_data)

    estimator = linear_model.LinearRegression()

    # Recursive feature elimination
    rfe_selector = RFE(estimator, n_features_to_select=50, step=1)
    rfe_selector.fit(prepro_tr_X_data, y_train)
    rfe_X_data = rfe_selector.transform(prepro_te_X_data)
    evaluate(rfe_X_data)

    # Sequential Feature Selection (SFS)
    sfs = SequentialFeatureSelector(estimator,
                                    n_features_to_select=50,
                                    direction='backward')
    sfs.fit(prepro_tr_X_data, y_train)
    sfs_X_data = sfs.transform(prepro_te_X_data)
    evaluate(sfs_X_data)
def test_n_features_to_select_stopping_criterion(direction):
    """Check the stopping criterion used with `n_features_to_select="auto"` and `tol`.

    After fitting, two variants of the selected data are built:

    * one with an extra feature, drawn at random among the features that are
      not in the selector's support;
    * one with a feature removed, drawn at random among the selected ones.

    For `direction="forward"`, adding the extra feature must not improve the
    cross-validation score by more than `tol`, while removing a selected
    feature must cost at least `tol`.

    For `direction="backward"`, neither adding nor removing a feature may
    improve the cross-validation score by more than `tol`.

    In both cases the selected features must score at least as well as the
    full feature set.
    """
    X, y = make_regression(n_features=50, n_informative=10, random_state=0)

    tol = 1e-3

    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=tol,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)
    selected_X = sfs.transform(X)

    def mean_cv_score(features):
        # Mean 2-fold cross-validation score of a fresh LinearRegression.
        return cross_val_score(LinearRegression(), features, y, cv=2).mean()

    rng = np.random.RandomState(0)

    # The two draws below must stay in this order: both consume the same
    # random stream.
    added_candidates = list(
        set(range(X.shape[1])) - set(sfs.get_support(indices=True)))
    extra_feature = rng.choice(added_candidates)
    added_X = np.hstack([selected_X, X[:, extra_feature][:, np.newaxis]])

    removed_candidate = rng.choice(list(range(sfs.n_features_to_select_)))
    removed_X = np.delete(selected_X, removed_candidate, axis=1)

    plain_cv_score = mean_cv_score(X)
    sfs_cv_score = mean_cv_score(selected_X)
    added_cv_score = mean_cv_score(added_X)
    removed_cv_score = mean_cv_score(removed_X)

    assert sfs_cv_score >= plain_cv_score

    if direction != "forward":
        assert (added_cv_score - sfs_cv_score) <= tol
        assert (removed_cv_score - sfs_cv_score) <= tol
    else:
        assert (sfs_cv_score - added_cv_score) <= tol
        assert (sfs_cv_score - removed_cv_score) >= tol