def test_knn_scoring_metric():
    """Pin the final score of backward floating selection for three scoring setups.

    The same 4-neighbour KNN is selected down to 3 iris features with
    explicit 'accuracy' scoring, with no scoring argument, and with
    'f1_macro' scoring; each run's rounded ``k_score_`` is checked.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    # Keyword arguments common to all three selectors.
    shared = dict(k_features=3, forward=False, floating=True, cv=4,
                  skip_if_stuck=True)

    sfs5 = SFS(knn, scoring='accuracy', verbose=0, **shared).fit(X, y)
    assert round(sfs5.k_score_, 4) == 0.9728

    sfs6 = SFS(knn, verbose=0, **shared).fit(X, y)
    assert round(sfs6.k_score_, 4) == 0.9728

    sfs7 = SFS(knn, scoring='f1_macro', **shared).fit(X, y)
    assert round(sfs7.k_score_, 4) == 0.9727, sfs7.k_score_
def test_knn_scoring_metric():
    """Pin the final score of backward floating selection per scoring string.

    Runs the selector on iris with 'accuracy' and then with 'precision'
    scoring and checks the rounded ``k_score_`` of each run.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    # Keyword arguments common to both selectors.
    shared = dict(k_features=3, forward=False, floating=True, cv=4,
                  skip_if_stuck=True, print_progress=False)

    sfs5 = SFS(knn, scoring='accuracy', **shared).fit(X, y)
    assert round(sfs5.k_score_, 4) == 0.9728

    sfs6 = SFS(knn, scoring='precision', **shared).fit(X, y)
    assert round(sfs6.k_score_, 4) == 0.9737
def test_run_default():
    """A selector built with default settings picks feature 3 on iris."""
    iris = load_iris()
    selector = SFS(estimator=KNeighborsClassifier(), verbose=0)
    selector.fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (3,)
def test_fit_params():
    """Extra keyword arguments to ``fit`` (here ``sample_weight``) are accepted."""
    iris = load_iris()
    X, y = iris.data, iris.target
    # One unit weight per sample.
    weights = np.ones(X.shape[0])
    selector = SFS(
        estimator=RandomForestClassifier(n_estimators=100, random_state=123),
        verbose=0,
    )
    selector.fit(X, y, sample_weight=weights)
    assert selector.k_feature_idx_ == (3,)
def test_knn_cv3_groups():
    """Forward selection with a GroupKFold splitter and ``groups`` passed to ``fit``."""
    iris = load_iris()
    X, y = iris.data, iris.target
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=False,
                   cv=GroupKFold(n_splits=3),
                   verbose=0)
    # Fixed seed so the random group assignment is reproducible.
    np.random.seed(1630672634)
    groups = np.random.randint(0, 6, size=len(y))
    selector = selector.fit(X, y, groups=groups)

    expect = {
        1: dict(cv_scores=np.array([0.97916667, 0.93877551, 0.96226415]),
                feature_idx=(3,),
                avg_score=0.9600687759380482),
        2: dict(cv_scores=np.array([0.95833333, 0.93877551, 0.98113208]),
                feature_idx=(1, 3),
                avg_score=0.9594136396697044),
        3: dict(cv_scores=np.array([0.97916667, 0.95918367, 0.94339623]),
                feature_idx=(1, 2, 3),
                avg_score=0.9605821888503829),
    }
    dict_compare_utility(d_actual=selector.subsets_, d_desired=expect, decimal=3)
def test_knn_cv3():
    """Forward selection with 4-fold CV reproduces the recorded per-size subsets."""
    iris = load_iris()
    X, y = iris.data, iris.target
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=False,
                   cv=4,
                   verbose=0).fit(X, y)
    observed = selector.subsets_
    expect = {
        1: dict(avg_score=0.95299145299145294,
                cv_scores=np.array([0.97435897, 0.94871795, 0.88888889, 1.0]),
                feature_idx=(3,)),
        2: dict(avg_score=0.95993589743589736,
                cv_scores=np.array([0.97435897, 0.94871795, 0.91666667, 1.0]),
                feature_idx=(2, 3)),
        3: dict(avg_score=0.97275641025641035,
                cv_scores=np.array([0.97435897, 1.0, 0.94444444, 0.97222222]),
                feature_idx=(1, 2, 3)),
    }
    dict_compare_utility(d_actual=observed, d_desired=expect)
def forward_feature_selection_linear_regression(X_train, y_train, k_features=9):
    """
    Select features by sequential forward selection with a linear regression.

    Candidate subsets are scored with 'r2' under 5-fold cross-validation.

    -- RATIONALE
    The default of 9 features was chosen by the original author from visual
    inspection of the forward feature selection plots. The count is now a
    parameter so callers can override it without editing this function.
    --

    Parameters
    -----------
    X_train: training split of the feature variables
    y_train: training split of the target variable
    k_features: number of features to select (default 9)

    Returns
    -----------
    The fitted selector's ``k_feature_names_`` for the chosen subset.
    """
    regr = LinearRegression()

    # Build step forward feature selection
    sfs = SequentialFeatureSelector(regr,
                                    k_features=k_features,
                                    forward=True,
                                    floating=False,
                                    verbose=2,
                                    scoring='r2',
                                    cv=5)

    # Perform sequential forward selection
    sfs = sfs.fit(X_train, y_train)

    selected_feature_names = sfs.k_feature_names_
    return selected_feature_names
def select_SFS(X_tr, y_tr, num_feat=100, knn_parameter=1, forward_=False, floating_=True):
    """
    Sequential feature selection driven by a k-nearest-neighbours classifier.

    :param X_tr: training samples handed to the selector
    :param y_tr: training labels handed to the selector
    :param num_feat: upper end of the ``(1, num_feat)`` subset-size range
    :param knn_parameter: ``n_neighbors`` of the KNN estimator
    :param forward_: passed through as the selector's ``forward`` flag
    :param floating_: passed through as the selector's ``floating`` flag
    :return: numpy array built from the fitted selector's ``k_feature_idx_``
    """
    selector = SFS(KNeighborsClassifier(n_neighbors=knn_parameter),
                   k_features=(1, num_feat),
                   forward=forward_,
                   floating=floating_,
                   verbose=1,
                   scoring='accuracy',
                   cv=3,
                   n_jobs=4).fit(X_tr, y_tr)
    return np.asarray(selector.k_feature_idx_)
def forward_selection_regression(data, target, k_features=3):
    """
    Forward-select features, refit a linear regression on them and report it.

    :param data: input data (the original notes say a pandas dataframe)
    :param target: target values corresponding to ``data``
    :param k_features: number of features the selector should keep
    :return: nothing; prints the fitted coefficients, the intercept and the
        mean squared error on the 70/30 train/test split
    """
    reg = LinearRegression()
    selector = SFS(reg, k_features, forward=True, floating=False,
                   verbose=0, scoring='r2', cv=5)

    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=.3)

    # Selection only ever sees the training part of the split.
    selector = selector.fit(X_train, y_train)
    train_selected = selector.transform(X_train)
    test_selected = selector.transform(X_test)

    reg = reg.fit(train_selected, y_train)
    train_mse = metrics.mean_squared_error(y_train, reg.predict(train_selected))
    test_mse = metrics.mean_squared_error(y_test, reg.predict(test_selected))

    print('estimated coefficients for the linear regression:', reg.coef_)
    print('interception coefficient b_0:', reg.intercept_)
    print('MSE_train:', train_mse)
    print('MSE_test:', test_mse)
def fsel(bcl, X, d, m, forward=True, floating=False, cv=0, show=0):
    """Select ``m`` features, either with ``sfsfisher`` or with a wrapped estimator.

    Parameters
    ----------
    bcl : sequence whose first item names the method; 'Fisher' dispatches to
        ``sfsfisher``, anything else is handed to ``defineModel`` to build
        the estimator used by the sequential selector
    X : feature matrix (``X.shape[1]`` is reported as the feature count)
    d : labels passed to the selector
    m : number of features to keep
    forward, floating : forwarded to the sequential selector. They were
        previously accepted but ignored (the call hard-coded
        ``forward=True, floating=False``).
    cv : forwarded to the selector as ``cv``
    show : verbosity; values > 0 print progress, any truthy value also plots
        the selection curve for the sequential selector

    Returns
    -------
    The selected features: the result of ``sfsfisher`` on the 'Fisher' path,
    otherwise a list of the selector's ``k_feature_idx_``.
    """
    if show > 0:
        print('Feature Selection - ' + bcl[0] + ': - number of features reducing from ' + str(X.shape[1]) + ' to ' + str(m) + ' ...')

    if bcl[0] == 'Fisher':
        # No selector object exists on this path, so there is nothing to plot.
        return sfsfisher(X, d, m)

    estimator = defineModel(bcl)
    sfs = SFS(estimator, k_features=m, forward=forward, floating=floating,
              verbose=show, scoring='accuracy', cv=cv)
    sfs = sfs.fit(X, d)
    sel = list(sfs.k_feature_idx_)
    if show > 0:
        print(' ')
    if show:
        # Plotting stays on the selector path only: `sfs` is unbound for 'Fisher'.
        plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection')
        plt.grid()
        plt.show()
    return sel
def filter_with_sfs(train_X, valid_X, test_X, train_Y, i):
    """Keep only the ``i`` forward-selected columns in all three frames.

    The selector (random forest, accuracy scoring, 4-fold CV) is fitted on
    the training frame only; every column it does not select is dropped
    from the train, validation and test frames, returned in that order.
    """
    all_columns = set(train_X.head(0))
    selector = SequentialFeatureSelector(
        RandomForestClassifier(n_estimators=30, random_state=0),
        k_features=i,
        forward=True,
        verbose=0,
        scoring='accuracy',
        cv=4,
    )
    selector.fit(train_X, train_Y)
    rejected = list(all_columns - set(selector.k_feature_names_))
    return tuple(frame.drop(rejected, axis=1)
                 for frame in (train_X, valid_X, test_X))
def test_knn_cv3_groups():
    """Forward selection with a GroupKFold splitter and ``groups`` passed to ``fit``."""
    iris = load_iris()
    X, y = iris.data, iris.target
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=False,
                   cv=GroupKFold(n_splits=3),
                   verbose=0)
    # Fixed seed so the random group assignment is reproducible.
    np.random.seed(1630672634)
    groups = np.random.randint(0, 6, size=len(y))
    selector = selector.fit(X, y, groups=groups)

    expect = {
        1: dict(cv_scores=np.array([0.97916667, 0.93877551, 0.96226415]),
                feature_idx=(3, ),
                avg_score=0.9600687759380482),
        2: dict(cv_scores=np.array([0.95833333, 0.93877551, 0.98113208]),
                feature_idx=(1, 3),
                avg_score=0.9594136396697044),
        3: dict(cv_scores=np.array([0.97916667, 0.95918367, 0.94339623]),
                feature_idx=(1, 2, 3),
                avg_score=0.9605821888503829),
    }
    dict_compare_utility(d_actual=selector.subsets_, d_desired=expect, decimal=3)
def forward_feature_selection_decision_tree(X_train, y_train_binned, k_features=7, scoring='r2'):
    """
    Select features by sequential forward selection with a decision tree classifier.

    Candidate subsets are evaluated with 5-fold cross-validation.

    -- RATIONALE
    The default of 7 features was chosen by the original author from visual
    inspection of the forward feature selection plots. Both the feature count
    and the scoring string are now parameters; their defaults reproduce the
    previous hard-coded behaviour.
    --

    Parameters
    -----------
    X_train: training split of feature variables with continuous values
    y_train_binned: training split of the binned (class-valued) target
    k_features: number of features to select (default 7)
    scoring: scoring string passed to the selector (default 'r2').
        NOTE(review): 'r2' is a regression metric while the estimator is a
        classifier; a classification metric such as 'accuracy' is probably
        what was intended -- confirm before changing the default.

    Returns
    -----------
    The fitted selector's ``k_feature_names_`` for the chosen subset.
    """
    clf = tree.DecisionTreeClassifier()

    # Build step forward feature selection
    sfs = SequentialFeatureSelector(clf,
                                    k_features=k_features,
                                    forward=True,
                                    floating=False,
                                    verbose=2,
                                    scoring=scoring,
                                    cv=5)

    # Perform sequential feature selection
    sfs = sfs.fit(X_train, y_train_binned)

    selected_feature_names = sfs.k_feature_names_
    return selected_feature_names
def test_knn_cv3():
    """Forward selection with 4-fold CV reproduces the recorded per-size subsets."""
    iris = load_iris()
    X, y = iris.data, iris.target
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=False,
                   scoring='accuracy',
                   cv=4,
                   skip_if_stuck=True,
                   print_progress=False).fit(X, y)
    observed = selector.subsets_
    expect = {
        1: dict(avg_score=0.95299145299145294,
                cv_scores=np.array([0.97435897, 0.94871795, 0.88888889, 1.0]),
                feature_idx=(3,)),
        2: dict(avg_score=0.95993589743589736,
                cv_scores=np.array([0.97435897, 0.94871795, 0.91666667, 1.0]),
                feature_idx=(2, 3)),
        3: dict(avg_score=0.97275641025641035,
                cv_scores=np.array([0.97435897, 1.0, 0.94444444, 0.97222222]),
                feature_idx=(1, 2, 3)),
    }
    dict_compare_utility(d1=expect, d2=observed)
def test_knn_cv3():
    """Forward selection with 4-fold CV reproduces the recorded per-size subsets."""
    iris = load_iris()
    X, y = iris.data, iris.target
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=False,
                   cv=4,
                   verbose=0).fit(X, y)
    observed = selector.subsets_
    expect = {
        1: dict(avg_score=0.95299145299145294,
                cv_scores=np.array([0.97435897, 0.94871795, 0.88888889, 1.0]),
                feature_idx=(3, )),
        2: dict(avg_score=0.95993589743589736,
                cv_scores=np.array([0.97435897, 0.94871795, 0.91666667, 1.0]),
                feature_idx=(2, 3)),
        3: dict(avg_score=0.97275641025641035,
                cv_scores=np.array([0.97435897, 1.0, 0.94444444, 0.97222222]),
                feature_idx=(1, 2, 3)),
    }
    dict_compare_utility(d1=expect, d2=observed)
def test_knn_wo_cv():
    """Forward selection with ``cv=0`` yields a single score per subset size."""
    iris = load_iris()
    X, y = iris.data, iris.target
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=False,
                   cv=0,
                   verbose=0).fit(X, y)
    expect = {
        1: dict(avg_score=0.95999999999999996,
                cv_scores=np.array([0.96]),
                feature_idx=(3, )),
        2: dict(avg_score=0.97333333333333338,
                cv_scores=np.array([0.97333333]),
                feature_idx=(2, 3)),
        3: dict(avg_score=0.97333333333333338,
                cv_scores=np.array([0.97333333]),
                feature_idx=(1, 2, 3)),
    }
    dict_compare_utility(d1=expect, d2=selector.subsets_)
def test_string_scoring_clf():
    """Default, string and ``make_scorer`` accuracy scoring give the same score.

    Three selectors are fitted on iris without cross-validation: one with no
    scoring argument, one with ``scoring='accuracy'`` and one with
    ``make_scorer(accuracy_score)``. Their final ``k_score_`` must agree.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn, k_features=3, cv=0)
    sfs1 = sfs1.fit(X, y)

    sfs2 = SFS(knn, k_features=3, scoring='accuracy', cv=0)
    sfs2 = sfs2.fit(X, y)

    sfs3 = SFS(knn, k_features=3, scoring=make_scorer(accuracy_score), cv=0)
    # Fit sfs3 itself: the previous `sfs3 = sfs2.fit(X, y)` rebound the name
    # to sfs2, so the make_scorer variant was never fitted or compared.
    sfs3 = sfs3.fit(X, y)

    assert sfs1.k_score_ == sfs2.k_score_
    assert sfs1.k_score_ == sfs3.k_score_
def feature_selection_sfs(X, y, parameters):
    """Fit a KNN-based sequential feature selector configured from ``parameters``.

    Keys read from ``parameters``: 'sfs_k_neighbors', 'sfs_k_features',
    'sfs_forward', 'sfs_floating', 'sfs_scoring', 'sfs_cv', 'sfs_n_jobs'
    and 'feature_names'. The feature names are passed to ``fit`` as
    ``custom_feature_names`` only when ``.any()`` on them is truthy,
    otherwise ``None`` is passed. Returns the value returned by ``fit``.
    """
    # TODO check n_neighbor
    estimator = KNeighborsClassifier(n_neighbors=parameters['sfs_k_neighbors'])
    selector = SFS(estimator,
                   k_features=parameters['sfs_k_features'],
                   forward=parameters['sfs_forward'],
                   floating=parameters['sfs_floating'],
                   verbose=2,
                   scoring=parameters['sfs_scoring'],
                   cv=parameters['sfs_cv'],
                   n_jobs=parameters['sfs_n_jobs'])

    names = parameters['feature_names']
    custom_feature_names = names if names.any() else None

    return selector.fit(X, y, custom_feature_names=custom_feature_names)
def test_pandas():
    """Fixed features given by column name stay in every subset for DataFrame input."""
    # Column labels must be unique; the third label previously duplicated
    # 'petal width', leaving two columns with the same name.
    X_df = pd.DataFrame(
        X_iris,
        columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
    knn = KNeighborsClassifier()
    sfs = SFS(estimator=knn,
              k_features=3,
              forward=True,
              floating=False,
              fixed_features=('sepal length', 'sepal width'),
              verbose=0)
    sfs.fit(X_df, y_iris)
    print(sfs.subsets_)
    # Columns 0 and 1 are the two fixed features named above.
    for k in sfs.subsets_:
        assert 0 in sfs.subsets_[k]['feature_idx']
        assert 1 in sfs.subsets_[k]['feature_idx']
def stepwiseFeatureSelection(label, features):
    """Select the single best feature for a linear regression and print its R^2.

    The selector is fitted with ``k_features=1``. The regression is then
    fitted on the selected column and its predictions on the same data are
    scored with ``r2_score``; the score is printed.

    Parameters
    ----------
    label : target values
    features : candidate feature matrix

    Returns
    -------
    The fitted selector.
    """
    lr = linear_model.LinearRegression()
    sfs = SFS(lr, k_features=1)
    sfs = sfs.fit(features, label)

    # The selector only selects/transforms; it has no predict(). Reduce the
    # data to the chosen column and fit the regression on it explicitly.
    # NOTE(review): assumes `lr` is still unfitted here because the selector
    # clones its estimator by default -- confirm.
    selected = sfs.transform(features)
    lr.fit(selected, label)
    prediction = lr.predict(selected)

    r2Score = r2_score(label, prediction)
    print(r2Score)
    return sfs
def test_knn_option_sffs():
    """Forward floating selection on iris ends with features (1, 2, 3)."""
    iris = load_iris()
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=True,
                   cv=4,
                   verbose=0).fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (1, 2, 3)
def select_features(self, model, X_train, y_train, k_features=(1, 30), scorer='r2', cv=0):
    # Runs forward, non-floating sequential selection for `model` and prints
    # the selected feature indices. `scorer` and `cv` are forwarded to the
    # selector as `scoring` and `cv`.
    # NOTE(review): the snippet is truncated after the dangling triple quote
    # below, so anything that followed (including a return) is not visible.
    sfs = SFS(model, k_features=k_features, forward=True, floating=False, scoring=scorer, cv=cv, verbose=2)
    # Inputs are converted to numpy arrays before fitting.
    sfs.fit(np.array(X_train), np.array(y_train))
    print(sfs.k_feature_idx_)
    """
def test_knn_rbf_groupkfold():
    """Forward selection with a pre-computed split list and a NaN-tolerant scorer.

    Iris samples of class 0 go to group 'attribute_A' with label 0; samples
    of class 1 or 2 are assigned at random to 'attribute_B'/'attribute_C'
    and get a random 0/1 label. The GroupKFold splits are materialised as a
    list and handed to the selector as ``cv``.
    """
    nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
    rng = np.random.RandomState(123)
    iris = load_iris()
    X = iris.data
    forest = RandomForestClassifier(n_estimators=100, random_state=123)

    groups = []
    y_new = []
    for label in iris['target']:
        if label == 0:
            groups.append('attribute_A')
            y_new.append(0)
        if label == 1 or label == 2:
            # Two draws per sample, group first and label second; the order
            # must stay fixed so the seeded generator reproduces the data.
            groups.append('attribute_B' if rng.rand() < 0.5 else 'attribute_C')
            y_new.append(0 if rng.rand() < 0.5 else 1)

    # Compare by value: `item is 1` tested object identity, which only works
    # through CPython's small-int caching and is a SyntaxWarning on 3.8+.
    y_new_bool = [item == 1 for item in y_new]

    cv_obj = GroupKFold(n_splits=3)
    cv_obj_list = list(cv_obj.split(X, np.array(y_new_bool), groups))
    sfs1 = SFS(forest,
               k_features=3,
               forward=True,
               floating=False,
               cv=cv_obj_list,
               scoring=nan_roc_auc_scorer,
               verbose=0)
    sfs1 = sfs1.fit(X, y_new)
    expect = {
        1: {
            'cv_scores': np.array([0.52, nan, 0.72]),
            'avg_score': 0.62,
            'feature_idx': (1, )
        },
        2: {
            'cv_scores': np.array([0.42, nan, 0.65]),
            'avg_score': 0.53,
            'feature_idx': (1, 2)
        },
        3: {
            'cv_scores': np.array([0.47, nan, 0.63]),
            'avg_score': 0.55,
            'feature_idx': (1, 2, 3)
        }
    }
    dict_compare_utility(d1=expect, d2=sfs1.subsets_, decimal=1)
def vote(X_train, Y_train, X_test, Y_test, voting_type, feature_selection, k_features):
    """Invocation of a soft voting/majority rule classification.

    This is a wrapper around `sklearn.ensemble.VotingClassifier` which
    automatically uses all classifiers that are known to `gumpy` in
    `gumpy.classification.available_classifiers`.

    Args:
        X_train: training data (values)
        Y_train: training data (labels)
        X_test: evaluation data (values)
        Y_test: evaluation data (labels)
        voting_type (str): either of 'soft' or 'hard'. See the
            sklearn.ensemble.VotingClassifier documentation for more details
        feature_selection: when truthy, a forward floating sequential
            feature selection (accuracy scoring, 10-fold CV) is fitted on the
            training data and both data sets are reduced to the selected
            features before the ensemble is fitted
        k_features: number of features passed to the selector; only used
            when ``feature_selection`` is truthy

    Returns:
        2-element tuple containing

        - **ClassificationResult**: The result of the classification.
        - **Classifier**: The instance of `sklearn.ensemble.VotingClassifier`
          that was used during the classification.

    """
    k_cross_val = 10
    N_JOBS = -1

    estimators = []
    for classifier in available_classifiers:
        klass = available_classifiers[classifier]
        # Ask each classifier for its 'vote' defaults so it is initialised
        # with proper settings (this avoids cross-validation, for instance).
        opts = klass.static_opts('vote', X_train=X_train)
        estimators.append((classifier, klass(**opts).clf))

    # instantiate the VotingClassifier
    soft_vote_clf = VotingClassifier(estimators=estimators, voting=voting_type)

    if feature_selection:
        selector = SFS(soft_vote_clf,
                       k_features,
                       forward=True,
                       floating=True,
                       verbose=2,
                       scoring='accuracy',
                       cv=k_cross_val,
                       n_jobs=N_JOBS).fit(X_train, Y_train)
        X_train = selector.transform(X_train)
        X_test = selector.transform(X_test)

    soft_vote_clf.fit(X_train, Y_train)
    predicted = soft_vote_clf.predict(X_test)
    return ClassificationResult(Y_test, predicted), soft_vote_clf
def test_check_pandas_dataframe_fit_backward():
    """Backward selection reports index-strings for arrays and column names for DataFrames.

    Covers floating and non-floating mode, and finally the testing interrupt
    mode, after which the selector holds the 3-feature subset.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    columns = ['sepal len', 'sepal width', 'petal len', 'petal width']
    names3 = ('sepal len', 'sepal width', 'petal len')
    names2 = ('sepal width', 'petal len')

    for floating in (True, False):
        lr = SoftmaxRegression(random_seed=1)
        sfs1 = SFS(lr,
                   k_features=2,
                   forward=False,
                   floating=floating,
                   scoring='accuracy',
                   cv=0,
                   verbose=0,
                   n_jobs=1)
        df = pd.DataFrame(X, columns=columns)

        # Plain array input: names are the stringified column indices.
        sfs1 = sfs1.fit(X, y)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('1', '2')
        assert sfs1.subsets_[2]['feature_names'] == ('1', '2')

        # DataFrame input: names are the column labels.
        sfs1 = sfs1.fit(df, y)
        assert sfs1.subsets_[3]['feature_names'] == names3
        assert sfs1.subsets_[2]['feature_names'] == names2
        assert sfs1.subsets_[3]['feature_idx'] == (0, 1, 2)
        assert sfs1.subsets_[2]['feature_idx'] == (1, 2)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == names2

        # Interrupted fit.
        sfs1._TESTING_INTERRUPT_MODE = True
        out = sfs1.fit(df, y)
        assert len(out.subsets_.keys()) > 0
        assert sfs1.interrupted_
        assert sfs1.subsets_[3]['feature_names'] == names3
        assert sfs1.k_feature_idx_ == (0, 1, 2)
        assert sfs1.k_feature_names_ == names3
def backward(X_train, Y_train):
    """Run backward selection down to 6 features with a random forest and print them.

    The selector is fitted on ``X_train.values`` / ``Y_train.values`` with
    'neg_mean_squared_error' scoring. The printed set pairs each selected
    feature name reported by the selector with the matching column label of
    ``X_train``.

    Parameters
    ----------
    X_train : object exposing ``.values`` and ``.columns`` (presumably a
        pandas DataFrame -- TODO confirm)
    Y_train : object exposing ``.values``
    """
    rf_sfs = RandomForestRegressor(n_estimators=100, max_depth=50,
                                   oob_score=False, n_jobs=-1)
    SFS_b = SequentialFeatureSelector(rf_sfs,
                                      forward=False,
                                      k_features=6,
                                      scoring='neg_mean_squared_error',
                                      n_jobs=-1)
    SFS_b = SFS_b.fit(X_train.values, Y_train.values)
    indxs = list(SFS_b.k_feature_names_)
    # Look the labels up by selected position. Zipping against all of
    # X_train.columns paired the selection with the first six columns,
    # whichever features were actually chosen.
    str_cols = X_train.columns[list(SFS_b.k_feature_idx_)]
    features = set(zip(indxs, str_cols))
    print(features)
def test_max_feature_subset_best():
    """``k_features='best'`` picks the recorded 9-feature subset on Boston."""
    boston = load_boston()
    selector = SFS(LinearRegression(),
                   k_features='best',
                   forward=True,
                   floating=False,
                   cv=10).fit(boston.data, boston.target)
    assert selector.k_feature_idx_ == (1, 3, 5, 7, 8, 9, 10, 11, 12)
def sequential_feature_selection(X, y, k):
    """Fit forward selection of ``k`` features for a linear regression.

    Scoring is 'r2' with ``cv=0``. Returns whatever the selector's ``fit``
    returns.
    """
    selector = SFS(LinearRegression(),
                   k_features=k,
                   forward=True,
                   floating=False,
                   scoring='r2',
                   cv=0)
    return selector.fit(X, y)
def select_r2(df_in, ss_label, f_n, eps):
    """Reduce ``df_in`` to at most ``f_n`` columns by forward selection.

    When the frame has no more than ``f_n`` columns nothing is selected and
    all columns are returned. Otherwise a random forest with ``eps`` trees
    drives a forward selection (accuracy scoring, 4-fold CV).

    Returns a 3-tuple: the resulting value array, the kept column names
    joined with ',', and the number of kept columns.
    """
    dfx = df_in.copy()

    # Nothing to select: hand everything back.
    if not len(dfx.columns) > f_n:
        all_names = dfx.columns.tolist()
        return dfx.values, ','.join(all_names), len(all_names)

    select = SFS(RandomForestClassifier(n_estimators=eps, random_state=1),
                 k_features=f_n,
                 forward=True,
                 floating=False,
                 scoring='accuracy',
                 cv=4,
                 n_jobs=3)
    select.fit(dfx.values, ss_label.values)
    chosen = select.k_feature_idx_
    x_sfs = select.transform(dfx.values)
    kept_names = dfx.columns[list(chosen)]
    return x_sfs, ','.join(kept_names), len(kept_names)
def sfs(X_train, y_train, estimator, metric):
    """Return the feature indices chosen by forward selection.

    The subset size may range from 1 to the number of columns of
    ``X_train``; subsets are scored with ``metric`` and ``cv=0``.
    """
    n_columns = X_train.shape[1]
    selector = SFS(estimator,
                   k_features=(1, n_columns),
                   forward=True,
                   floating=False,
                   scoring=metric,
                   cv=0).fit(X_train, y_train)
    return selector.k_feature_idx_
def select_features_wrapper(X, y, forward=True, k_features=20):
    """Wrapper-style feature selection driven by an SGD classifier.

    Parameters
    ----------
    X : feature data passed to the selector
    y : target; ``y.values.ravel()`` is what the selector is fitted on
    forward : passed through as the selector's ``forward`` flag
    k_features : number of features to select

    Returns
    -------
    The fitted selector. The selected feature names are also printed.
    """
    # Only the SGD classifier is used; an unused random forest instance and
    # commented-out alternative estimators were removed.
    sgd = SGDClassifier(max_iter=1000, tol=1e-3)
    sfs = SequentialFeatureSelector(sgd,
                                    k_features=k_features,
                                    forward=forward,
                                    floating=False,
                                    verbose=5,
                                    cv=0,
                                    n_jobs=-1)
    sfs.fit(X, y.values.ravel())
    print(sfs.k_feature_names_)
    return sfs
def get_best_logisitc(y):
    """Grid over C and solver for a logistic regression with forward selection.

    The data is read from 'data/my_data_test.csv'; its 'target' column
    replaces the ``y`` argument, which is therefore not used. For every
    (C, solver) pair a forward selection scored with 'roc_auc' is run, the
    best-scoring subset is re-evaluated with ``cross_val_score`` and the
    outcome is recorded. Returns the columns and the estimator of the
    record with the highest mean AUC.
    """
    import pandas as pd
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.cross_validation import cross_val_score
    from sklearn.linear_model import LogisticRegression

    my_data = pd.read_csv('data/my_data_test.csv', encoding='utf-8')
    y = my_data.target
    my_data = my_data.drop('target', axis=1)

    # To have better CV
    skf = StratifiedKFold(y, n_folds=5, random_state=17, shuffle=False)

    C_params = [0.01, 1, 10, 50, 70, 100]
    solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag']
    n_columns = len(my_data.columns)

    records = []
    for C_param in C_params:
        for solver in solvers:
            print("Looking for C : %s and solver : %s" % (C_param, solver))
            model = LogisticRegression(class_weight='balanced',
                                       random_state=17,
                                       solver=solver,
                                       C=C_param)
            selector = SFS(model,
                           k_features=n_columns,
                           forward=True,
                           floating=False,
                           scoring='roc_auc',
                           print_progress=False,
                           cv=skf,
                           n_jobs=-1).fit(my_data.values, y.values)

            # One row per subset size, best average score first.
            per_size = pd.DataFrame.from_dict(selector.get_metric_dict()).T
            per_size.sort_values('avg_score', ascending=0, inplace=True)
            top_idx = per_size.feature_idx.head(1).tolist()
            top_columns = list(my_data.columns[top_idx])

            scores = cross_val_score(model, my_data[top_columns], y,
                                     cv=skf, scoring='roc_auc')
            records.append({'C': C_param,
                            'solver': solver,
                            'auc': scores.mean(),
                            'std': scores.std(),
                            'best_columns': top_columns,
                            'estimator': model})

    my_result = pd.DataFrame(records)
    my_result.sort_values('auc', ascending=0, inplace=True)
    best_features = my_result.best_columns.head(1).values[0]
    best_model = my_result.estimator.head(1).values[0]
    return best_features, best_model
def test_check_pandas_dataframe_fit_backward():
    """Backward selection reports index-strings for arrays and column names for DataFrames.

    Covers floating and non-floating mode, and finally the testing interrupt
    mode, after which the selector holds the 3-feature subset.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    columns = ['sepal len', 'sepal width', 'petal len', 'petal width']
    names3 = ('sepal len', 'sepal width', 'petal len')
    names2 = ('sepal width', 'petal len')

    for floating in (True, False):
        lr = SoftmaxRegression(random_seed=1)
        sfs1 = SFS(lr,
                   k_features=2,
                   forward=False,
                   floating=floating,
                   scoring='accuracy',
                   cv=0,
                   verbose=0,
                   n_jobs=1)
        df = pd.DataFrame(X, columns=columns)

        # Plain array input: names are the stringified column indices.
        sfs1 = sfs1.fit(X, y)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == ('1', '2')
        assert sfs1.subsets_[2]['feature_names'] == ('1', '2')

        # DataFrame input: names are the column labels.
        sfs1 = sfs1.fit(df, y)
        assert sfs1.subsets_[3]['feature_names'] == names3
        assert sfs1.subsets_[2]['feature_names'] == names2
        assert sfs1.subsets_[3]['feature_idx'] == (0, 1, 2)
        assert sfs1.subsets_[2]['feature_idx'] == (1, 2)
        assert sfs1.k_feature_idx_ == (1, 2)
        assert sfs1.k_feature_names_ == names2

        # Interrupted fit.
        sfs1._TESTING_INTERRUPT_MODE = True
        out = sfs1.fit(df, y)
        assert len(out.subsets_.keys()) > 0
        assert sfs1.interrupted_
        assert sfs1.subsets_[3]['feature_names'] == names3
        assert sfs1.k_feature_idx_ == (0, 1, 2)
        assert sfs1.k_feature_names_ == names3
def feature_selection(regr, train):
    """Return the columns that backward selection would discard.

    'Id' and 'SalePrice' are removed from ``train`` to form the features and
    'SalePrice' is the target. ``regr`` is fitted on the full feature set,
    then a backward selection (4-fold CV, 'neg_mean_squared_error') runs
    down to ten features fewer than available. The subset with the highest
    average score (ties broken by larger standard deviation) is kept; every
    other feature column is returned as a list.
    """
    target = train['SalePrice']
    x = train.drop(columns=['Id', 'SalePrice'])
    regr.fit(x, target)

    selector = SFS(regr,
                   k_features=x.shape[1] - 10,
                   forward=False,
                   verbose=2,
                   scoring='neg_mean_squared_error',
                   cv=4)
    selector.fit(x, target)

    # One row per evaluated subset size, best row first.
    report = pd.DataFrame(selector.get_metric_dict()).T
    report = report.loc[:, ['feature_names', 'avg_score', 'std_dev', 'std_err']]
    report = report.sort_values(['avg_score', 'std_dev'], ascending=False)
    report = report.reset_index(drop=True)

    keep = list(report.at[0, 'feature_names'])
    return [column for column in x if column not in keep]
def feature_selection(X, y, method=1, k_features=5, save_params=False, seed=127):
    """Fit a feature selector around a liblinear logistic regression.

    ``method=1`` uses RFE, ``method=2`` uses backward sequential selection
    scored with 'roc_auc'; any other value returns ``None`` without fitting.
    With ``save_params`` the fitted selector is pickled to 'rfe.pkl' or
    'sfs.pkl' respectively. Returns the fitted selector.
    """
    logit = LogisticRegression(C=1, random_state=seed, solver='liblinear')

    if method == 1:
        selector = RFE(logit, n_features_to_select=k_features, verbose=2)
        dump_path = 'rfe.pkl'
    elif method == 2:
        selector = SequentialFeatureSelector(logit,
                                             cv=0,
                                             k_features=k_features,
                                             forward=False,
                                             scoring='roc_auc',
                                             verbose=2,
                                             n_jobs=-1)
        dump_path = 'sfs.pkl'
    else:
        return None

    selector.fit(X, y)
    if save_params:
        with open(dump_path, 'wb') as file:
            pickle.dump(selector, file, pickle.HIGHEST_PROTOCOL)
    return selector
def callSBS():
    """Run backward, non-floating selection to 8 features and print the subsets.

    Reads ``knn``, ``X`` and ``yfinal`` from the enclosing module scope.
    """
    selector = SFS(knn,
                   k_features=8,
                   forward=False,
                   floating=False,
                   scoring='accuracy',
                   cv=0,
                   verbose=2).fit(X, yfinal)
    # Parenthesised single-argument form prints the same under Python 2 and 3.
    print(selector.subsets_)
def callSFS():
    """Run forward, non-floating selection to 8 features and print the subsets.

    Reads ``knn``, ``X`` and ``yfinal`` from the enclosing module scope.
    """
    selector = SFS(knn,
                   k_features=8,
                   forward=True,
                   floating=False,
                   verbose=2,
                   scoring='accuracy',
                   cv=0).fit(X, yfinal)
    # Parenthesised single-argument form prints the same under Python 2 and 3.
    print(selector.subsets_)
def callSFBS():
    """Run backward floating selection to 8 features and print the subsets.

    Reads ``knn``, ``X`` and ``yfinal`` from the enclosing module scope.
    """
    selector = SFS(knn,
                   k_features=8,
                   forward=False,
                   floating=True,
                   scoring='accuracy',
                   cv=0,
                   n_jobs=-1).fit(X, yfinal)
    # Parenthesised single-argument form prints the same under Python 2 and 3.
    print(selector.subsets_)
def callSFFS():
    """Run forward floating selection to 8 features and print the subsets.

    Reads ``knn``, ``X`` and ``yfinal`` from the enclosing module scope.
    """
    selector = SFS(knn,
                   k_features=8,
                   forward=True,
                   floating=True,
                   scoring='accuracy',
                   cv=0,
                   verbose=2).fit(X, yfinal)
    # Parenthesised single-argument form prints the same under Python 2 and 3.
    print(selector.subsets_)
def test_knn_option_sfbs():
    """Backward floating selection on iris ends with features (1, 2, 3)."""
    iris = load_iris()
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=False,
                   floating=True,
                   cv=4,
                   verbose=0).fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (1, 2, 3)
def test_regression_sbfs():
    """Backward floating selection for a linear regression on Boston picks (7, 10, 12)."""
    boston = load_boston()
    selector = SFS(LinearRegression(),
                   k_features=3,
                   forward=False,
                   floating=True,
                   scoring='neg_mean_squared_error',
                   cv=10,
                   verbose=0).fit(boston.data, boston.target)
    assert selector.k_feature_idx_ == (7, 10, 12), selector.k_feature_idx_
def test_max_feature_subset_best():
    """``k_features='best'`` picks the recorded 9-feature subset on Boston."""
    boston = load_boston()
    selector = SFS(LinearRegression(),
                   k_features='best',
                   forward=True,
                   floating=False,
                   cv=10).fit(boston.data, boston.target)
    assert selector.k_feature_idx_ == (1, 3, 5, 7, 8, 9, 10, 11, 12)
def test_max_feature_subset_parsimonious():
    """``k_features='parsimonious'`` picks the recorded 4-feature subset on Boston."""
    boston = load_boston()
    selector = SFS(LinearRegression(),
                   k_features='parsimonious',
                   forward=True,
                   floating=False,
                   cv=10).fit(boston.data, boston.target)
    assert selector.k_feature_idx_ == (5, 10, 11, 12)
def test_regression():
    """Forward selection of all 13 Boston features pins the final MSE-based score."""
    boston = load_boston()
    selector = SFS(LinearRegression(),
                   k_features=13,
                   forward=True,
                   floating=False,
                   scoring='mean_squared_error',
                   cv=10,
                   skip_if_stuck=True,
                   print_progress=False).fit(boston.data, boston.target)
    assert round(selector.k_score_, 4) == -34.7631
def test_knn_option_sbs_tuplerange_1():
    """Backward selection over the size range (1, 3) picks (0, 2, 3) with score 0.967."""
    iris = load_iris()
    selector = SFS(KNeighborsClassifier(n_neighbors=3),
                   k_features=(1, 3),
                   forward=False,
                   floating=False,
                   cv=4,
                   verbose=0).fit(iris.data, iris.target)
    assert round(selector.k_score_, 3) == 0.967, selector.k_score_
    assert selector.k_feature_idx_ == (0, 2, 3), selector.k_feature_idx_
def test_regression_in_range():
    """Forward selection over the size range (1, 13) settles on 9 Boston features."""
    boston = load_boston()
    selector = SFS(LinearRegression(),
                   k_features=(1, 13),
                   forward=True,
                   floating=False,
                   scoring='neg_mean_squared_error',
                   cv=10,
                   verbose=0).fit(boston.data, boston.target)
    assert len(selector.k_feature_idx_) == 9
    assert round(selector.k_score_, 4) == -31.1537
def test_knn_rbf_groupkfold():
    """Forward selection with a pre-computed split list and a NaN-tolerant scorer.

    Iris samples of class 0 go to group 'attribute_A' with label 0; samples
    of class 1 or 2 are assigned at random to 'attribute_B'/'attribute_C'
    and get a random 0/1 label. The GroupKFold splits are materialised as a
    list and handed to the selector as ``cv``.
    """
    nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
    rng = np.random.RandomState(123)
    iris = load_iris()
    X = iris.data
    forest = RandomForestClassifier(n_estimators=100, random_state=123)

    groups = []
    y_new = []
    for label in iris['target']:
        if label == 0:
            groups.append('attribute_A')
            y_new.append(0)
        if label == 1 or label == 2:
            # Two draws per sample, group first and label second; the order
            # must stay fixed so the seeded generator reproduces the data.
            groups.append('attribute_B' if rng.rand() < 0.5 else 'attribute_C')
            y_new.append(0 if rng.rand() < 0.5 else 1)

    # Compare by value: `item is 1` tested object identity, which only works
    # through CPython's small-int caching and is a SyntaxWarning on 3.8+.
    y_new_bool = [item == 1 for item in y_new]

    cv_obj = GroupKFold(n_splits=3)
    cv_obj_list = list(cv_obj.split(X, np.array(y_new_bool), groups))
    sfs1 = SFS(forest,
               k_features=3,
               forward=True,
               floating=False,
               cv=cv_obj_list,
               scoring=nan_roc_auc_scorer,
               verbose=0)
    sfs1 = sfs1.fit(X, y_new)
    expect = {
        1: {'cv_scores': np.array([0.52, nan, 0.72]),
            'avg_score': 0.62,
            'feature_idx': (1,)},
        2: {'cv_scores': np.array([0.42, nan, 0.65]),
            'avg_score': 0.53,
            'feature_idx': (1, 2)},
        3: {'cv_scores': np.array([0.47, nan, 0.63]),
            'avg_score': 0.55,
            'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=1)
def test_knn_option_sffs():
    """Forward floating selection with accuracy scoring ends with features (1, 2, 3)."""
    iris = load_iris()
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=True,
                   scoring='accuracy',
                   cv=4,
                   skip_if_stuck=True,
                   verbose=0).fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (1, 2, 3)
def test_string_scoring_clf():
    """Default, string and ``make_scorer`` accuracy scoring give the same score.

    Three selectors are fitted on iris without cross-validation: one with no
    scoring argument, one with ``scoring='accuracy'`` and one with
    ``make_scorer(accuracy_score)``. Their final ``k_score_`` must agree.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)

    sfs1 = SFS(knn, k_features=3, cv=0)
    sfs1 = sfs1.fit(X, y)

    sfs2 = SFS(knn, k_features=3, scoring='accuracy', cv=0)
    sfs2 = sfs2.fit(X, y)

    sfs3 = SFS(knn, k_features=3, scoring=make_scorer(accuracy_score), cv=0)
    # Fit sfs3 itself: the previous `sfs3 = sfs2.fit(X, y)` rebound the name
    # to sfs2, so the make_scorer variant was never fitted or compared.
    sfs3 = sfs3.fit(X, y)

    assert sfs1.k_score_ == sfs2.k_score_
    assert sfs1.k_score_ == sfs3.k_score_
def test_knn_option_sfs():
    """Plain forward selection with accuracy scoring ends with features (1, 2, 3)."""
    iris = load_iris()
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=False,
                   scoring='accuracy',
                   cv=4,
                   skip_if_stuck=True,
                   print_progress=False).fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (1, 2, 3)
def test_max_feature_subset_size_in_tuple_range():
    """Backward floating selection over the size range (1, 5) keeps 5 Boston features."""
    boston = load_boston()
    selector = SFS(LinearRegression(),
                   k_features=(1, 5),
                   forward=False,
                   floating=True,
                   scoring='neg_mean_squared_error',
                   cv=10).fit(boston.data, boston.target)
    assert len(selector.k_feature_idx_) == 5
def test_clone_params_pass():
    """With ``clone_estimator=True`` forward selection picks features (1, 3)."""
    iris = load_iris()
    selector = SFS(SoftmaxRegression(random_seed=1),
                   k_features=2,
                   forward=True,
                   floating=False,
                   scoring='accuracy',
                   cv=0,
                   clone_estimator=True,
                   verbose=0,
                   n_jobs=1).fit(iris.data, iris.target)
    assert selector.k_feature_idx_ == (1, 3)
def test_knn_option_sfbs_tuplerange_2():
    """Backward floating selection over the size range (1, 4) picks (1, 2, 3) with score 0.966."""
    iris = load_iris()
    selector = SFS(KNeighborsClassifier(n_neighbors=3),
                   k_features=(1, 4),
                   forward=False,
                   floating=True,
                   scoring='accuracy',
                   cv=4,
                   skip_if_stuck=True,
                   verbose=0).fit(iris.data, iris.target)
    assert round(selector.k_score_, 3) == 0.966, selector.k_score_
    assert selector.k_feature_idx_ == (1, 2, 3), selector.k_feature_idx_
def test_check_pandas_dataframe_transform():
    """``transform`` on a DataFrame returns one column per selected feature."""
    iris = load_iris()
    frame = pd.DataFrame(
        iris.data,
        columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
    selector = SFS(SoftmaxRegression(random_seed=1),
                   k_features=2,
                   forward=True,
                   floating=False,
                   scoring='accuracy',
                   cv=0,
                   verbose=0,
                   n_jobs=1).fit(frame, iris.target)
    assert selector.k_feature_idx_ == (1, 3)
    assert (150, 2) == selector.transform(frame).shape
def sfs_selection(X,y,n_features,forward):
    """
    Run sequential forward/backward selection and keep the selected features.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    n_features -- number of features to select
    forward -- passed through as the selector's ``forward`` flag

    Returns the selected columns of ``X`` and their original indices. A
    message is printed first when the module-level ``verbose`` is truthy.
    """
    if verbose:
        # Parenthesised single-argument form prints the same under Python 2 and 3.
        print('\nPerforming Feature Selection based on the Sequential Feature Selection method ...')

    clf = RandomForestClassifierWithCoef(n_estimators=5, n_jobs=-1)
    selector = SFS(clf,
                   k_features=n_features,
                   forward=forward,
                   scoring='accuracy',
                   cv=0,
                   n_jobs=-1,
                   print_progress=True).fit(X, y)

    kept = selector.k_feature_idx_[0:n_features]
    return X[:, kept], kept
def test_regression():
    """Forward selection of all 13 Boston features pins the final score.

    The expected score depends on whether the installed scikit-learn
    version compares below '0.20'.
    """
    boston = load_boston()
    selector = SFS(LinearRegression(),
                   k_features=13,
                   forward=True,
                   floating=False,
                   scoring='neg_mean_squared_error',
                   cv=10,
                   verbose=0).fit(boston.data, boston.target)
    assert len(selector.k_feature_idx_) == 13

    expected = -34.7631 if Version(sklearn_version) < '0.20' else -34.7053
    assert round(selector.k_score_, 4) == expected, \
        round(selector.k_score_, 4)
def test_custom_feature_names():
    """Names given via ``custom_feature_names`` show up in the selection results."""
    iris = load_iris()
    names = ('sepal length', 'sepal width', 'petal length', 'petal width')
    selector = SFS(SoftmaxRegression(random_seed=1),
                   k_features=2,
                   forward=True,
                   floating=False,
                   scoring='accuracy',
                   cv=0,
                   verbose=0,
                   n_jobs=1).fit(iris.data, iris.target,
                                 custom_feature_names=names)
    chosen = ('sepal width', 'petal width')
    assert selector.k_feature_idx_ == (1, 3)
    assert selector.k_feature_names_ == chosen
    assert selector.subsets_[2]['feature_names'] == chosen
def test_knn_wo_cv():
    """Forward selection with ``cv=0`` yields a single score per subset size."""
    iris = load_iris()
    X, y = iris.data, iris.target
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=False,
                   cv=0,
                   verbose=0).fit(X, y)
    expect = {
        1: dict(avg_score=0.95999999999999996,
                cv_scores=np.array([0.96]),
                feature_idx=(3,)),
        2: dict(avg_score=0.97333333333333338,
                cv_scores=np.array([0.97333333]),
                feature_idx=(2, 3)),
        3: dict(avg_score=0.97333333333333338,
                cv_scores=np.array([0.97333333]),
                feature_idx=(1, 2, 3)),
    }
    dict_compare_utility(d_actual=selector.subsets_, d_desired=expect)
def test_keyboard_interrupt():
    """In testing interrupt mode ``fit`` still returns subsets and flags the interruption."""
    iris = load_iris()
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=False,
                   cv=3,
                   clone_estimator=False,
                   verbose=5,
                   n_jobs=1)
    # Must be switched on before fitting.
    selector._TESTING_INTERRUPT_MODE = True
    out = selector.fit(iris.data, iris.target)

    assert len(out.subsets_.keys()) > 0
    assert selector.interrupted_
def test_knn_wo_cv():
    """Forward selection with ``cv=0`` yields a single score per subset size."""
    iris = load_iris()
    X, y = iris.data, iris.target
    selector = SFS(KNeighborsClassifier(n_neighbors=4),
                   k_features=3,
                   forward=True,
                   floating=False,
                   scoring='accuracy',
                   cv=0,
                   skip_if_stuck=True,
                   print_progress=False).fit(X, y)
    expect = {
        1: dict(avg_score=0.95999999999999996,
                cv_scores=np.array([0.96]),
                feature_idx=(3,)),
        2: dict(avg_score=0.97333333333333338,
                cv_scores=np.array([0.97333333]),
                feature_idx=(2, 3)),
        3: dict(avg_score=0.97333333333333338,
                cv_scores=np.array([0.97333333]),
                feature_idx=(1, 2, 3)),
    }
    dict_compare_utility(d1=expect, d2=selector.subsets_)