def seq_feature_selection(data, target, n_features=None):
    """Pick predictor columns with sequential feature selection.

    A ``LinearRegression`` is wrapped in a ``SequentialFeatureSelector`` and
    fitted on the numeric columns of ``data`` other than ``target``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the predictors and the target column.
    target : label
        Column of ``data`` used as the regression target; it is dropped from
        the predictors.
    n_features : int, float, str or None, default=None
        Forwarded as ``n_features_to_select``. When ``None`` the argument is
        left out so the selector's own default applies.

    Returns
    -------
    list
        Names of the selected predictor columns.
    """
    predictors = data.drop(columns=target).select_dtypes(np.number)

    # An explicit ``n_features_to_select=None`` was deprecated in
    # scikit-learn 1.1 and is rejected by later releases; omitting the
    # keyword keeps the library default on every version.
    selector_kwargs = {}
    if n_features is not None:
        selector_kwargs["n_features_to_select"] = n_features

    selector = SequentialFeatureSelector(estimator=LinearRegression(),
                                         **selector_kwargs)
    selector = selector.fit(predictors, data[target])
    selected = selector.get_support(indices=True)
    return predictors.iloc[:, selected].columns.to_list()
def test_bad_direction():
    """Fitting with an unknown ``direction`` must raise a ValueError."""
    features, target = make_regression(n_features=5)
    selector = SequentialFeatureSelector(
        LinearRegression(),
        direction="bad",
        n_features_to_select="auto",
    )
    expected_message = "must be either 'forward' or"
    with pytest.raises(ValueError, match=expected_message):
        selector.fit(features, target)
def run_sfs(x, y, output=None, caption=''):
    """Reduce ``x`` with sequential feature selection around a KNN classifier.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table; its column labels are reused for the reduced frame.
    y : array-like
        Labels passed to the selector's ``fit``.
    output : str or None, default=None
        Directory receiving ``<caption>-sfs.csv``. When ``None`` nothing is
        written.
    caption : str, default=''
        Prefix of the CSV file name.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``x`` (rebuilt from the transformed array).
    """
    sfs = SequentialFeatureSelector(estimator=KNeighborsClassifier())
    sfs.fit(x, y)
    x_reduced = pd.DataFrame(sfs.transform(x), columns=x.columns[sfs.support_])
    print(f'reduced columns: {x_reduced.columns}')
    # Without this guard the default ``output=None`` would be formatted into
    # the literal path "None/<caption>-sfs.csv".
    if output is not None:
        x_reduced.to_csv(f'{output}/{caption}-sfs.csv', index=False)
    return x_reduced
def test_n_features_to_select_float(direction, n_features_to_select, expected):
    # A fractional n_features_to_select must resolve to the expected count.
    data, target = make_regression(n_features=10)
    selector = SequentialFeatureSelector(
        LinearRegression(),
        cv=2,
        direction=direction,
        n_features_to_select=n_features_to_select,
    )
    selector.fit(data, target)
    resolved = selector.n_features_to_select_
    assert resolved == expected
def feature_selection(self, direction=None):
    """Keep three sequentially selected features and refit a linear model.

    Fits a ``SequentialFeatureSelector`` around ``self.model`` on the training
    data, narrows ``self.X_train`` / ``self.X_test`` to the selected columns
    and replaces ``self.model`` with a ``LinearRegression`` fitted on the
    reduced training set.

    Parameters
    ----------
    direction : {'forward', 'backward'} or None, default=None
        Search direction handed to the selector. ``None`` falls back to
        ``'forward'``.

    Side effects
    ------------
    Sets ``self.sfs``, ``self.X_train_columns``, ``self.X_test_columns``,
    ``self.X_train``, ``self.X_test`` and ``self.model``.
    """
    # The selector only accepts 'forward' or 'backward'; passing the default
    # None through would make fit() fail.
    if direction is None:
        direction = 'forward'

    sfs = SequentialFeatureSelector(self.model, n_features_to_select=3, direction=direction)
    self.sfs = sfs.fit(self.X_train, self.Y_train)

    support = sfs.get_support()
    self.X_train_columns = self.X_train.columns[support]
    self.X_test_columns = self.X_test.columns[support]
    self.X_train = self.X_train[self.X_train_columns]
    self.X_test = self.X_test[self.X_test_columns]
    self.model = LinearRegression().fit(self.X_train, self.Y_train)
def test_unsupervised_model_fit(n_features_to_select):
    # An estimator that needs no labels must be fittable without passing y,
    # and the transform must keep the requested number of columns.
    points, _ = make_blobs(n_features=6)
    selector = SequentialFeatureSelector(
        KMeans(),
        n_features_to_select=n_features_to_select,
    )
    selector.fit(points)
    reduced = selector.transform(points)
    assert reduced.shape[1] == n_features_to_select
def test_no_y_validation_model_fit(y):
    # Unconventional y values must be rejected with a TypeError or ValueError.
    points, _ = make_blobs(n_features=6)
    selector = SequentialFeatureSelector(
        KMeans(),
        n_features_to_select=3,
    )
    accepted_errors = (TypeError, ValueError)
    with pytest.raises(accepted_errors):
        selector.fit(points, y)
def test_sanity(seed, direction, n_features_to_select, expected_selected_features):
    # Sanity check on three features: the target is built from f0 and f2 only,
    # with f2 weighted more heavily than f0. f1 is therefore expected to be
    # dropped and f2 to always be kept.
    n_samples = 100
    generator = np.random.RandomState(seed)
    features = generator.randn(n_samples, 3)
    target = 3 * features[:, 0] - 10 * features[:, 2]

    selector = SequentialFeatureSelector(
        LinearRegression(),
        cv=2,
        direction=direction,
        n_features_to_select=n_features_to_select,
    )
    selector.fit(features, target)

    chosen = selector.get_support(indices=True)
    assert_array_equal(chosen, expected_selected_features)
def select_greedy(data):
    """Run sequential feature selection around an RBF SVR and reduce the data.

    ``data`` is unpacked as ``(X, X_test, y)``. The search direction and the
    number of features come from the module-level names ``direction`` and
    ``n_features``. The support mask is dumped to ``joblib/greedy_support``
    and printed together with the elapsed selection time.

    Returns the reduced ``X``, the reduced ``X_test`` and ``y`` unchanged.
    """
    X, X_test, y = data
    base_model = svm.SVR(kernel="rbf", C=100, tol=1).fit(X, y)

    started = time()
    selector = SequentialFeatureSelector(
        base_model,
        direction=direction,
        n_features_to_select=n_features,
        n_jobs=-1,
    )
    selector = selector.fit(X, y)
    finished = time()

    joblib.dump(selector.get_support(), "joblib/greedy_support")
    print(f"features selected: {selector.get_support()}")
    print(f"done in: {finished - started:.2f}s")

    X_reduced = selector.transform(X)
    X_test_reduced = selector.transform(X_test)
    return X_reduced, X_test_reduced, y
def test_raise_deprecation_warning():
    """Fitting without `n_features_to_select` must emit a FutureWarning."""
    n_samples, n_features = 50, 3
    data, target = make_regression(n_samples, n_features, random_state=0)

    selector = SequentialFeatureSelector(LinearRegression())
    expected_warning = "Leaving `n_features_to_select` to None is deprecated"
    with pytest.warns(FutureWarning, match=expected_warning):
        selector.fit(data, target)
def evaluate_model_sfs(model, predictors, labels, direction='backward', n_features=50, n_jobs=-1):
    """Cross-validate ``model`` on features chosen by sequential selection.

    Parameters
    ----------
    model : estimator
        Estimator used both inside the selector and for the final scoring.
    predictors : pandas.DataFrame
        Candidate feature table.
    labels : array-like
        Target values.
    direction : str, default='backward'
        Search direction passed to the selector.
    n_features : int, default=50
        Number of features to keep.
    n_jobs : int, default=-1
        Parallelism for both the selector and the cross-validation.

    Returns
    -------
    numpy.ndarray
        Mean absolute error of each of the 5 cross-validation folds.
    """
    sfs = SequentialFeatureSelector(estimator=model, n_features_to_select=n_features, cv=5,
                                    direction=direction, n_jobs=n_jobs)
    sfs.fit(predictors, labels)
    predictors_reduced = predictors[predictors.columns[sfs.support_]]
    scores = cross_val_score(model, predictors_reduced, labels,
                             scoring="neg_mean_absolute_error", cv=5, n_jobs=n_jobs)
    # The scorer yields negated MAE, so flipping the sign gives MAE. No square
    # root is taken: that conversion applies to MSE -> RMSE, not to MAE.
    return -scores
def select_features(model_name):
    """Print the features chosen by four sequential-selection configurations.

    Loads the data through ``read_data`` / ``split_data`` and the model from
    ``pathToModels + model_name + '.pkl'``, then runs, in order: backward with
    16 features, forward with 16 features, forward with the selector's default
    count, and backward with the selector's default count. Every run uses
    ``scoring='f1'`` and ``cv=10`` and prints the boolean mask, the selected
    indices and the selected feature names.
    """
    data = read_data()
    X, y = split_data(data)
    model = joblib.load(pathToModels + model_name + '.pkl')
    all_features = X.columns.values
    print('Исходный набор:' + str(all_features))

    # (label, extra selector options) for each run, in reporting order.
    runs = (
        ('First', {'n_features_to_select': 16, 'direction': 'backward'}),
        ('Second', {'n_features_to_select': 16, 'direction': 'forward'}),
        ('Third', {'direction': 'forward'}),
        ('Fourth', {'direction': 'backward'}),
    )
    for label, options in runs:
        print('--------------------------------')
        print(label)
        fitted = SFS(model, scoring='f1', cv=10, **options).fit(X, y)
        chosen = all_features[fitted.get_support(indices=True)]
        print(fitted.get_support())
        print(fitted.get_support(indices=True))
        print(str(chosen))
def test_sparse_support():
    # Fitting and transforming must work on a CSR sparse matrix.
    dense, target = make_regression(n_features=10)
    sparse = scipy.sparse.csr_matrix(dense)
    selector = SequentialFeatureSelector(LinearRegression(), cv=2)
    selector.fit(sparse, target)
    selector.transform(sparse)
def run_sklearn(self):
    """ Train and evaluate models from sklearn.

    Picks the classifier keyed by ``self.clf_type``. When ``self.sfs_k > 0``
    the train/dev/test matrices on ``self.data`` are first narrowed to
    ``self.sfs_k`` columns chosen by forward sequential feature selection, and
    the selection mask is saved to a text file. The classifier is then fitted
    on the training split.

    Side effects: sets ``self.clf``, ``self.model``, ``self.test_score``,
    ``self.train_pred`` and ``self.test_pred``; sets ``self.dev_score`` and
    ``self.dev_pred`` only when ``self.data.X_dev`` is not None.
    """
    classifier_options = {
        #'lr': linear_model.LogisticRegressionCV(cv=10, n_jobs=10, max_iter=10000, verbose=0),
        'lr': linear_model.LogisticRegression(n_jobs=10, max_iter=100000, verbose=0),
        'svm': model_selection.GridSearchCV(
            svm.LinearSVC(dual=False, max_iter=10000, verbose=0),
            {'C': [.01, .1, 1, 10, 100], 'penalty': ['l2']},
            n_jobs=10, cv=10, verbose=2),
        'mlp': neural_network.MLPClassifier(hidden_layer_sizes=(32, 50), activation='relu',
                                            early_stopping=True, verbose=2)
    }
    self.clf = classifier_options[self.clf_type]

    if self.sfs_k > 0:
        # Forward feature selection
        print("Doing forward feature selection...")
        sfs = SequentialFeatureSelector(self.clf, n_features_to_select=self.sfs_k, n_jobs=-1)
        sfs.fit(self.data.X_train, self.data.y_train)

        # Save out selected features
        outpath = os.path.join(
            '/projects/tumblr_community_identity/tmp/',
            f'sfs{self.sfs_k}_{self.extractor.select_k}.txt')
        np.savetxt(outpath, sfs.get_support())
        print(f"Saved forward feature selection mask to {outpath}")

        sfs_mask = sfs.get_support()
        #sfs_mask = np.loadtxt(
        #    '/projects/tumblr_community_identity/tmp/sfs20_500.txt').astype(bool)
        X_train = self.data.X_train[:, sfs_mask]
        # The dev split is optional (see the None check further down), so it
        # must not be indexed when it is absent.
        X_dev = None if self.data.X_dev is None else self.data.X_dev[:, sfs_mask]
        X_test = self.data.X_test[:, sfs_mask]
        self.data.X_train, self.data.X_dev, self.data.X_test = X_train, X_dev, X_test

    self.model = self.clf.fit(self.data.X_train, self.data.y_train)
    self.test_score = self.model.score(self.data.X_test, self.data.y_test)
    self.train_pred = self.model.predict(self.data.X_train)
    if self.data.X_dev is not None:
        self.dev_score = self.model.score(self.data.X_dev, self.data.y_dev)
        self.dev_pred = self.model.predict(self.data.X_dev)
    self.test_pred = self.model.predict(self.data.X_test)
def test_pipeline_support():
    # A pipeline must work as the selector's estimator, and the selector must
    # work as a step of a pipeline.
    n_samples, n_features = 50, 3
    data, target = make_regression(n_samples, n_features, random_state=0)

    # pipeline wrapped by the selector
    inner_pipeline = make_pipeline(StandardScaler(), LinearRegression())
    selector = SequentialFeatureSelector(inner_pipeline, cv=2)
    selector.fit(data, target)
    selector.transform(data)

    # selector used as a pipeline step
    selector = SequentialFeatureSelector(LinearRegression(), cv=2)
    outer_pipeline = make_pipeline(StandardScaler(), selector)
    outer_pipeline.fit(data, target)
    outer_pipeline.transform(data)
def test_nan_support():
    # NaNs are fine when the wrapped estimator handles them, and raise otherwise.
    n_samples, n_features = 100, 10
    generator = np.random.RandomState(0)
    data, target = make_regression(n_samples, n_features, random_state=0)
    missing = generator.randint(0, 2, size=(n_samples, n_features), dtype=bool)
    data[missing] = np.nan

    tolerant = SequentialFeatureSelector(HistGradientBoostingRegressor(), cv=2)
    tolerant.fit(data, target)
    tolerant.transform(data)

    # LinearRegression cannot be fitted on NaN input
    strict = SequentialFeatureSelector(LinearRegression(), cv=2)
    with pytest.raises(ValueError, match='Input contains NaN'):
        strict.fit(data, target)
def test_n_features_to_select(direction, n_features_to_select):
    # The selector must keep exactly the requested number of features.
    data, target = make_regression(n_features=10)
    selector = SequentialFeatureSelector(
        LinearRegression(),
        cv=2,
        direction=direction,
        n_features_to_select=n_features_to_select,
    )
    selector.fit(data, target)

    # None means half of the 10 features (n_features // 2)
    expected_count = 5 if n_features_to_select is None else n_features_to_select

    assert selector.get_support(indices=True).shape[0] == expected_count
    assert selector.n_features_to_select_ == expected_count
    assert selector.transform(data).shape[1] == expected_count
def feature_selection_k(k, X, y, model_name=None, dataset_name=None):
    """Select ``k`` features with the named selector and measure the result.

    Parameters
    ----------
    k : int
        Number of features to keep.
    X, y : array-like
        Data and labels handed to ``fit_transform``.
    model_name : {"RFE", "FSFS"}
        ``"RFE"`` builds an ``RFE`` selector, ``"FSFS"`` a forward
        ``SequentialFeatureSelector``; both wrap a ``DecisionTreeClassifier``.
    dataset_name : str, optional
        Copied into the result record.

    Returns
    -------
    tuple
        ``(record, model)`` where ``record`` is a dict with the keys
        ``dataset``, ``model``, ``k``, ``runtime`` (process time spent in
        ``fit_transform``) and ``reconstruction_error`` (mean squared
        difference between ``X`` and the inverse-transformed selection), and
        ``model`` is the fitted selector.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported selectors.
    """
    # Validate first: an unknown name used to leave ``model`` unbound and
    # surface later as an UnboundLocalError.
    if model_name == "RFE":
        model = RFE(n_features_to_select=k, estimator=DecisionTreeClassifier())
    elif model_name == "FSFS":
        model = SequentialFeatureSelector(n_features_to_select=k,
                                          estimator=DecisionTreeClassifier(),
                                          direction="forward")
    else:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected 'RFE' or 'FSFS'."
        )

    start = time.process_time()
    X_trans = model.fit_transform(X, y)
    end = time.process_time()
    runtime = end - start

    X_re = model.inverse_transform(X_trans)
    error = ((X_re - X)**2).mean().mean()
    return {
        "dataset": dataset_name,
        "model": model_name,
        "k": k,
        "runtime": runtime,
        "reconstruction_error": error
    }, model
def test_n_features_to_select_auto(direction):
    """Check `n_features_to_select="auto"` combined with a `tol` value.

    The number of selected features must stay below the total feature count
    and be reported consistently by the selector.
    """
    tol = 1e-3
    n_features = 10
    data, target = make_regression(n_features=n_features, random_state=0)

    selector = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=tol,
        direction=direction,
        cv=2,
    )
    selector.fit(data, target)

    upper_bound = n_features - 1
    assert selector.get_support(indices=True).shape[0] <= upper_bound
    assert selector.n_features_to_select_ <= upper_bound
    assert selector.transform(data).shape[1] <= upper_bound
    assert selector.get_support(indices=True).shape[0] == selector.n_features_to_select_
from sklearn.feature_selection import SelectFromModel

# Feature selection with SelectFromModel on top of an L2-penalised LinearSVC.
model1 = LinearSVC(C=0.01, penalty='l2', dual=False, random_state=0).fit(x, y) # the smaller the value, the stronger the penalty
sfm = SelectFromModel(model1).fit(x, y)
# NOTE(review): `names` is read here but only assigned further down in this
# fragment; presumably it is also defined earlier in the file - confirm.
print('Features selected by SelectFromModel selection l2:', names[sfm.get_support()])

# Use LinearSVC with penalty='l2' as the base model and SFS for feature selection
from sklearn.feature_selection import SequentialFeatureSelector
model = LinearSVC(C=0.01, penalty='l2', dual=False, random_state=0).fit(x, y) # the smaller C is, the stronger the penalty
names = np.array(columns)
sfs_forward = SequentialFeatureSelector(model, n_features_to_select=7, scoring='accuracy', cv=5).fit(x, y)
sfs_backward = SequentialFeatureSelector(model, n_features_to_select=7, scoring='accuracy', direction='backward', cv=5).fit(x, y)
print('Features selected by forward SFS selection l2:', names[sfs_forward.get_support()])
print('Features selected by backward SFS selection l2:', names[sfs_backward.get_support()])

# English rendering of the note kept in the string literal below:
# L1-penalty dimensionality reduction keeps only one of several features that
# are equally correlated with the target, so a feature that was not selected
# is not necessarily unimportant. An L2 penalty can be combined with it to
# improve on this. Concretely: if a feature has weight 1 under L1, the
# features whose L2 weights differ little from it and whose L1 weight is 0 are
# grouped with it, and the L1 weight is shared equally within that group;
# this requires building a new model:
''' L1惩罚项降维的原理在于保留多个对目标值具有同等相关性的特征中的一个,所以没选到的特征不代表不重要。 故,可结合L2惩罚项来优化。具体操作为:若一个特征在L1中的权值为1,选择在L2中权值差别不大且在L1中权值为0的特征构成同类集合, 将这一集合中的特征平分L1中的权值,故需要构建一个新的模型: '''
def test_bad_n_features_to_select(n_features_to_select):
    """An invalid ``n_features_to_select`` must raise a ValueError at fit time."""
    data, target = make_regression(n_features=5)
    selector = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select=n_features_to_select,
    )
    expected_message = "must be either None"
    with pytest.raises(ValueError, match=expected_message):
        selector.fit(data, target)
# :class:`~sklearn.feature_selection.SequentialFeatureSelector` # (SFS). SFS is a greedy procedure where, at each iteration, we choose the best # new feature to add to our selected features based a cross-validation score. # That is, we start with 0 features and choose the best single feature with the # highest score. The procedure is repeated until we reach the desired number of # selected features. # # We can also go in the reverse direction (backward SFS), *i.e.* start with all # the features and greedily choose features to remove one by one. We illustrate # both approaches here. from sklearn.feature_selection import SequentialFeatureSelector tic_fwd = time() sfs_forward = SequentialFeatureSelector(ridge, n_features_to_select=2, direction="forward").fit(X, y) toc_fwd = time() tic_bwd = time() sfs_backward = SequentialFeatureSelector(ridge, n_features_to_select=2, direction="backward").fit(X, y) toc_bwd = time() print("Features selected by forward sequential selection: " f"{feature_names[sfs_forward.get_support()]}") print(f"Done in {toc_fwd - tic_fwd:.3f}s") print("Features selected by backward sequential selection: " f"{feature_names[sfs_backward.get_support()]}") print(f"Done in {toc_bwd - tic_bwd:.3f}s")
# Columns 0..55 are the predictors, column 56 is the target.
X = data[:, 0:56] # x_s, time_s, Temp, ni_n, na_n, rho, v, p, E, H
y = data[:, 56:57] # rhs[0:50]
print(data.shape)
print("x=", X.shape)
print("y=", y.shape)

# Synthetic labels, one per predictor column.
feature_names = [f"feature {i}" for i in range(X.shape[1])]

est = ExtraTreesRegressor(n_estimators=50)
est = est.fit(X, y.ravel())

# Time the forward search that keeps two features.
tic_fwd = time()
sfs_forward = SequentialFeatureSelector(est, n_features_to_select=2, direction="forward").fit(X, y.ravel())
toc_fwd = time()

# Time the backward search that keeps two features.
tic_bwd = time()
sfs_backward = SequentialFeatureSelector(est, n_features_to_select=2, direction="backward").fit(
    X, y.ravel())
toc_bwd = time()

# `feature_names` is a plain list, which cannot be indexed with the boolean
# support mask (that raises TypeError); filter it against the mask instead.
selected_fwd = [name for name, keep in zip(feature_names, sfs_forward.get_support()) if keep]
selected_bwd = [name for name, keep in zip(feature_names, sfs_backward.get_support()) if keep]

print("Features selected by forward sequential selection: "
      f"{selected_fwd}")
print(f"Done in {toc_fwd - tic_fwd:.3f}s")
print("Features selected by backward sequential selection: "
      f"{selected_bwd}")
# Hold out 30% of the samples for testing.
X_train, X_test, y_train, y_test =\
    T_T_S(X,y,test_size = .3, random_state = 0, stratify = y)
# stratify ensures same class proportions of training and test data sets
print('Training Data Size = ', len(X_train))
print('Test Data Size = ', len(X_test))
print()
pause()

# Column labels of df_wine from the second column on.
feat_labels = df_wine.columns[1:]
forest = RFC(n_estimators=500, random_state=1)
forest.fit(X_train, y_train)

# Time the forward search that keeps five features.
tic_fwd = time()
sfs_forward = SequentialFeatureSelector(forest, n_features_to_select=5, direction='forward').fit(
    X_train, y_train)
toc_fwd = time()

# Time the backward search that keeps five features.
tic_bwd = time()
sfs_backward = SequentialFeatureSelector(forest, n_features_to_select=5, direction='backward').fit(
    X_train, y_train)
toc_bwd = time()

print("Features selected by forward sequential selection: "
      f"{feat_labels[sfs_forward.get_support()]}")
print(f"Done in {toc_fwd - tic_fwd:.3f}s")
# NOTE(review): the backward timing (toc_bwd - tic_bwd) is not printed within
# this fragment.
print("Features selected by backward sequential selection: "
      f"{feat_labels[sfs_backward.get_support()]}")
# Plot
# Mean and standard deviation of the first five result columns, per row.
scores_train = df_results_train.iloc[:, 0:5].values.tolist()
scores_test = df_results_test.iloc[:, 0:5].values.tolist()
mean_train = np.mean(scores_train, axis=1)
sd_train = np.std(scores_train, axis=1)
mean_test = np.mean(scores_test, axis=1)
sd_test = np.std(scores_test, axis=1)
pyplot.plot(df_results_train.n_features, mean_train, 'o-', color='r', label="Training")
pyplot.plot(df_results_test.n_features, mean_test, 'o-', color='g', label="Cross-validation")
pyplot.fill_between(df_results_train.n_features, mean_train - sd_train, mean_train + sd_train, alpha=0.1)
# NOTE(review): the test band is drawn against df_results_train.n_features;
# presumably both frames share the same n_features values - confirm.
pyplot.fill_between(df_results_train.n_features, mean_test - sd_test, mean_test + sd_test, alpha=0.1)
pyplot.ylabel('MAE', fontsize=16)
pyplot.xlabel('N features', fontsize=16)
pyplot.legend(loc="best")

# Select n_features:
sfs = SequentialFeatureSelector(estimator=model, n_features_to_select=26, cv=myCViterator, direction='forward', n_jobs=6)
sfs.fit(predictors_train, labels_train)
# Keep the selected predictor columns plus the 'log_visit_rate' column.
data_reduced_train = train_prepared[ np.append(np.array(predictors_train.columns[sfs.support_]),['log_visit_rate']) ]
data_reduced_test = test_prepared[ np.append(np.array(predictors_test.columns[sfs.support_]),['log_visit_rate']) ]
# NOTE(review): the file names say "15" while 26 features are selected above -
# confirm which is intended.
data_reduced_train.to_csv('C:/Users/angel/git/Observ_models/data/ML/Regression/train/data_reduced_15.csv', index=False)
data_reduced_test.to_csv('C:/Users/angel/git/Observ_models/data/ML/Regression/test/data_reduced_15.csv', index=False)

# #######################################
# # Permutation importance
# #######################################
# model = SVR(C=2.62, epsilon=0.05, gamma=0.21) #{'C': 2.6180503547870377, 'coef0': -0.5901821308327051, 'epsilon': 0.045644987037295054, 'gamma': 0.2112333725279757, 'kernel': 'rbf'}
# model.fit(predictors_train, labels_train)
# perm_importance = permutation_importance(model, predictors_train, labels_train, random_state=135, n_jobs=6)
# feature_names = predictors_train.columns
# feature_importance = pd.DataFrame(sorted(zip(perm_importance.importances_mean, feature_names), reverse=True))
# pyplot.barh(feature_importance.loc[:,1], feature_importance.loc[:,0])
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.pipeline import Pipeline

my_url = ('https://raw.githubusercontent.com/taroyabuki'
          '/fromzero/master/data/wine.csv')
my_data = pd.read_csv(my_url)

# Append two extra columns built from the row index (i % 2 and i % 3).
n = len(my_data)
my_data2 = my_data.assign(v1=[i % 2 for i in range(n)],
                          v2=[i % 3 for i in range(n)])
X, y = my_data2.drop(columns=['LPRICE2']), my_data2['LPRICE2']

my_sfs = SequentialFeatureSelector(
    estimator=LinearRegression(),
    direction='forward', # forward (step-up) variable selection
    cv=LeaveOneOut(),
    scoring='neg_mean_squared_error')

my_pipeline = Pipeline([ # make sure the model is retrained after variable selection
    ('sfs', my_sfs), # variable selection
    ('lr', LinearRegression()) ]) # regression analysis

my_params = {'sfs__n_features_to_select': range(1, 6)} # upper limit on the number of selected variables
my_search = GridSearchCV(estimator=my_pipeline,
                         param_grid=my_params,
                         cv=LeaveOneOut(),
                         scoring='neg_mean_squared_error',
                         n_jobs=-1).fit(X, y)
my_model = my_search.best_estimator_ # model retrained with the best parameters
def evaluate_model_sfs(model, predictors, labels, direction='backward', n_features=50, n_jobs=-1, cv=5):
    """Cross-validate ``model`` on the columns kept by sequential selection.

    A ``SequentialFeatureSelector`` around ``model`` picks ``n_features``
    columns of ``predictors``; the reduced table is then scored with
    ``cross_validate`` using ``neg_mean_absolute_error`` and the same ``cv``
    and ``n_jobs``. The ``cross_validate`` result, including train scores, is
    returned as-is.
    """
    selector = SequentialFeatureSelector(
        estimator=model,
        n_features_to_select=n_features,
        cv=cv,
        direction=direction,
        n_jobs=n_jobs,
    )
    selector.fit(predictors, labels)

    kept_columns = predictors.columns[selector.support_]
    reduced = predictors[kept_columns]

    return cross_validate(
        model,
        reduced,
        labels,
        scoring="neg_mean_absolute_error",
        cv=cv,
        n_jobs=n_jobs,
        return_train_score=True,
    )
from sklearn.feature_selection import SequentialFeatureSelector # In[25]: lsvc = LinearSVC(C=0.01, penalty="l1", dual=False) tree_clf = ExtraTreesClassifier(n_estimators=70) # In[26]: sfs = SequentialFeatureSelector(lsvc, n_features_to_select=9) sfs_back = SequentialFeatureSelector(lsvc, n_features_to_select=9, direction='backward') Sequential_SVC_dataset = sfs.fit_transform(X, y) Sequential_SVC_dataset = pd.DataFrame(Sequential_SVC_dataset) Sequential_SVC_dataset.name = 'Sequential_SVC_dataset' datasets.append(Sequential_SVC_dataset) Sequential_back_SVC_dataset = sfs_back.fit_transform(X, y) Sequential_back_SVC_dataset = pd.DataFrame(Sequential_back_SVC_dataset) Sequential_back_SVC_dataset.name = 'Sequential_back_SVC_dataset' datasets.append(Sequential_back_SVC_dataset)
# :class:`~sklearn.feature_selection.SequentialFeatureSelector` # (SFS). SFS is a greedy procedure where, at each iteration, we choose the best # new feature to add to our selected features based a cross-validation score. # That is, we start with 0 features and choose the best single feature with the # highest score. The procedure is repeated until we reach the desired number of # selected features. # # We can also go in the reverse direction (backward SFS), *i.e.* start with all # the features and greedily choose features to remove one by one. We illustrate # both approaches here. from sklearn.feature_selection import SequentialFeatureSelector tic_fwd = time() sfs_forward = SequentialFeatureSelector(lasso, n_features_to_select=2, direction='forward').fit(X, y) toc_fwd = time() tic_bwd = time() sfs_backward = SequentialFeatureSelector(lasso, n_features_to_select=2, direction='backward').fit(X, y) toc_bwd = time() print("Features selected by forward sequential selection: " f"{feature_names[sfs_forward.get_support()]}") print(f"Done in {toc_fwd - tic_fwd:.3f}s") print("Features selected by backward sequential selection: " f"{feature_names[sfs_backward.get_support()]}") print(f"Done in {toc_bwd - tic_bwd:.3f}s")
# ----------------------------------------- # A new iterative transformer to select features is available: # :class:`~sklearn.feature_selection.SequentialFeatureSelector`. # Sequential Feature Selection can add features one at a time (forward # selection) or remove features from the list of the available features # (backward selection), based on a cross-validated score maximization. # See the :ref:`User Guide <sequential_feature_selection>`. from sklearn.feature_selection import SequentialFeatureSelector from sklearn.neighbors import KNeighborsClassifier from sklearn.datasets import load_iris X, y = load_iris(return_X_y=True, as_frame=True) feature_names = X.columns knn = KNeighborsClassifier(n_neighbors=3) sfs = SequentialFeatureSelector(knn, n_features_to_select=2) sfs.fit(X, y) print("Features selected by forward sequential selection: " f"{feature_names[sfs.get_support().tolist()]}") ############################################################################## # New PolynomialCountSketch kernel approximation function # ------------------------------------------------------- # The new :class:`~sklearn.kernel_approximation.PolynomialCountSketch` # approximates a polynomial expansion of a feature space when used with linear # models, but uses much less memory than # :class:`~sklearn.preprocessing.PolynomialFeatures`. from sklearn.datasets import fetch_covtype from sklearn.pipeline import make_pipeline from sklearn.model_selection import train_test_split