def _fs_rfe(self, data, labels, plot_filename, n_features=10): svc = SVC(kernel="linear", C=1) transformer = RFE(estimator=svc, n_features_to_select=n_features, step=1) data = transformer.fit_transform(data, labels) attributes = OrderedDict() #produce a plot if requested and supported (for RFE) if plot_filename: try: grid_scores = transformer.grid_scores_ except: return transformer, data, attributes plt.figure() plt.xlabel("Number of features selected") plt.ylabel("Cross validation score (nb of correct classifications)") plt.plot(range(1, len(grid_scores) + 1), transformer.grid_scores) plt.savefig(plot_filename, bbox_inches='tight') #put ranks in an array, so that we can get them in the log file for i, rank_strings in enumerate(transformer.ranking_): attributes["RFE_rank_f{}".format(i)] = rank_strings for i, rank_strings in enumerate(transformer.support_): attributes["RFE_mask_f{}".format(i)] = rank_strings return transformer, data, attributes
def test_rfe(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] X_sparse = sparse.csr_matrix(X) y = iris.target # dense model clf = SVC(kernel="linear") rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) rfe.fit(X, y) X_r = rfe.transform(X) clf.fit(X_r, y) assert_equal(len(rfe.ranking_), X.shape[1]) # sparse model clf_sparse = SVC(kernel="linear") rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1) rfe_sparse.fit(X_sparse, y) X_r_sparse = rfe_sparse.transform(X_sparse) assert_equal(X_r.shape, iris.data.shape) assert_array_almost_equal(X_r[:10], iris.data[:10]) assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data)) assert_equal(rfe.score(X, y), clf.score(iris.data, iris.target)) assert_array_almost_equal(X_r, X_r_sparse.toarray())
def test_number_of_subsets_of_features(): # In RFE, 'number_of_subsets_of_features' # = the number of iterations in '_fit' # = max(ranking_) # = 1 + (n_features + step - n_features_to_select - 1) // step # After optimization #4534, this number # = 1 + np.ceil((n_features - n_features_to_select) / float(step)) # This test case is to test their equivalence, refer to #4534 and #3824 def formula1(n_features, n_features_to_select, step): return 1 + ((n_features + step - n_features_to_select - 1) // step) def formula2(n_features, n_features_to_select, step): return 1 + np.ceil((n_features - n_features_to_select) / float(step)) # RFE # Case 1, n_features - n_features_to_select is divisible by step # Case 2, n_features - n_features_to_select is not divisible by step n_features_list = [11, 11] n_features_to_select_list = [3, 3] step_list = [2, 3] for n_features, n_features_to_select, step in zip( n_features_list, n_features_to_select_list, step_list): generator = check_random_state(43) X = generator.normal(size=(100, n_features)) y = generator.rand(100).round() rfe = RFE(estimator=SVC(kernel="linear"), n_features_to_select=n_features_to_select, step=step) rfe.fit(X, y) # this number also equals to the maximum of ranking_ assert_equal(np.max(rfe.ranking_), formula1(n_features, n_features_to_select, step)) assert_equal(np.max(rfe.ranking_), formula2(n_features, n_features_to_select, step)) # In RFECV, 'fit' calls 'RFE._fit' # 'number_of_subsets_of_features' of RFE # = the size of 'grid_scores' of RFECV # = the number of iterations of the for loop before optimization #4534 # RFECV, n_features_to_select = 1 # Case 1, n_features - 1 is divisible by step # Case 2, n_features - 1 is not divisible by step n_features_to_select = 1 n_features_list = [11, 10] step_list = [2, 2] for n_features, step in zip(n_features_list, step_list): generator = check_random_state(43) X = generator.normal(size=(100, n_features)) y = generator.rand(100).round() rfecv = RFECV(estimator=SVC(kernel="linear"), step=step, cv=5) rfecv.fit(X, y) assert_equal(rfecv.grid_scores_.shape[0], formula1(n_features, n_features_to_select, step)) assert_equal(rfecv.grid_scores_.shape[0], formula2(n_features, n_features_to_select, step))
def test_rfe_set_params(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target clf = SVC(kernel="linear") rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) y_pred = rfe.fit(X, y).predict(X) clf = SVC() rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1, estimator_params={"kernel": "linear"}) y_pred2 = rfe.fit(X, y).predict(X) assert_array_equal(y_pred, y_pred2)
def test_rfe_mockclassifier(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target # dense model clf = MockClassifier() rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) rfe.fit(X, y) X_r = rfe.transform(X) clf.fit(X_r, y) assert_equal(len(rfe.ranking_), X.shape[1]) assert_equal(X_r.shape, iris.data.shape)
def test_rfe_deprecation_estimator_params(): deprecation_message = ("The parameter 'estimator_params' is deprecated as " "of version 0.16 and will be removed in 0.18. The " "parameter is no longer necessary because the " "value is set via the estimator initialisation or " "set_params method.") generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target assert_warns_message(DeprecationWarning, deprecation_message, RFE(estimator=SVC(), n_features_to_select=4, step=0.1, estimator_params={ 'kernel': 'linear' }).fit, X=X, y=y) assert_warns_message(DeprecationWarning, deprecation_message, RFECV(estimator=SVC(), step=1, cv=5, estimator_params={ 'kernel': 'linear' }).fit, X=X, y=y)
def test_rfe_estimator_tags(): rfe = RFE(SVC(kernel='linear')) assert_equal(rfe._estimator_type, "classifier") # make sure that cross-validation is stratified iris = load_iris() score = cross_val_score(rfe, iris.data, iris.target) assert_greater(score.min(), .7)
def test_rfe_set_params(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target clf = SVC(kernel="linear") rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) y_pred = rfe.fit(X, y).predict(X) clf = SVC() with warnings.catch_warnings(record=True): # estimator_params is deprecated rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1, estimator_params={'kernel': 'linear'}) y_pred2 = rfe.fit(X, y).predict(X) assert_array_equal(y_pred, y_pred2)
def test_rfe_features_importance(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target clf = RandomForestClassifier(n_estimators=10, n_jobs=1) rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) rfe.fit(X, y) assert_equal(len(rfe.ranking_), X.shape[1]) clf_svc = SVC(kernel="linear") rfe_svc = RFE(estimator=clf_svc, n_features_to_select=4, step=0.1) rfe_svc.fit(X, y) # Check if the supports are equal diff_support = rfe.get_support() == rfe_svc.get_support() assert_true(sum(diff_support) == len(diff_support))
def test_rfe_features_importance(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target clf = RandomForestClassifier(n_estimators=20, random_state=generator, max_depth=2) rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) rfe.fit(X, y) assert len(rfe.ranking_) == X.shape[1] clf_svc = SVC(kernel="linear") rfe_svc = RFE(estimator=clf_svc, n_features_to_select=4, step=0.1) rfe_svc.fit(X, y) # Check if the supports are equal assert_array_equal(rfe.get_support(), rfe_svc.get_support())
def test_rfe_min_step(): n_features = 10 X, y = make_friedman1(n_samples=50, n_features=n_features, random_state=0) n_samples, n_features = X.shape estimator = SVR(kernel="linear") # Test when floor(step * n_features) <= 0 selector = RFE(estimator, step=0.01) sel = selector.fit(X, y) assert_equal(sel.support_.sum(), n_features // 2) # Test when step is between (0,1) and floor(step * n_features) > 0 selector = RFE(estimator, step=0.20) sel = selector.fit(X, y) assert_equal(sel.support_.sum(), n_features // 2) # Test when step is an integer selector = RFE(estimator, step=5) sel = selector.fit(X, y) assert_equal(sel.support_.sum(), n_features // 2)
def test_rfe_features_importance(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target clf = RandomForestClassifier(n_estimators=20, random_state=generator, max_depth=2) rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) rfe.fit(X, y) assert_equal(len(rfe.ranking_), X.shape[1]) clf_svc = SVC(kernel="linear") rfe_svc = RFE(estimator=clf_svc, n_features_to_select=4, step=0.1) rfe_svc.fit(X, y) # Check if the supports are equal assert_array_equal(rfe.get_support(), rfe_svc.get_support())
def test_rfe(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target clf = SVC(kernel="linear") rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) rfe.fit(X, y) X_r = rfe.transform(X) assert_true(X_r.shape == iris.data.shape) assert_array_almost_equal(X_r[:10], iris.data[:10]) assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data)) assert_true(rfe.score(X, y) == clf.score(iris.data, iris.target))
def test_rfe_set_params(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target clf = SVC(kernel="linear") rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) y_pred = rfe.fit(X, y).predict(X) clf = SVC() rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1, estimator_params={'kernel': 'linear'}) y_pred2 = rfe.fit(X, y).predict(X) assert_array_equal(y_pred, y_pred2)
model = linear_model.Ridge (alpha = 0.5) RMSE = 0 for i in range(10): x = [slices[(i+1) % 9], slices[(i+2) % 9], slices[(i+3) % 9], slices[(i+4) % 9], slices[(i+5) % 9], slices[(i+6) % 9], slices[(i+7) % 9], slices[(i+8) % 9]] x = pd.concat(x) y = [slicesOfc7[(i+1) % 9], slicesOfc7[(i+2) % 9], slicesOfc7[(i+3) % 9], slicesOfc7[(i+4) % 9], slicesOfc7[(i+5) % 9], slicesOfc7[(i+6) % 9], slicesOfc7[(i+7) % 9], slicesOfc7[(i+8) % 9]] y = pd.concat(y) model.fit(x, y) pr = model.predict(slices[i]) RMSE += math.pow(((pr - slicesOfc7[i]) ** 2).sum() / len(slicesOfc7[i]), 0.5) print "RMSE for validation is:" print RMSE/k # run SVM for extra features svc = SVC(kernel="linear", C=1) sel = RFE(estimator=svc, n_features_to_select=20, step=0.5, verbose=5) sel.fit(training, c7) training = training[:][sel.get_support(True)] # after SVM print "afteeeeeeeeeeeeeeeeeeeeeeer SVM:" modelLR = linear_model.LinearRegression() modelL = linear_model.Lasso(normalize=True) modelGBR = ensemble.GradientBoostingRegressor(n_estimators=500, max_depth=4, min_samples_split=2,learning_rate=0.01, loss='ls') modelR = linear_model.Ridge(alpha = .5) RMSELR = 0 RMSEL = 0 RMSEGBR = 0 RMSER = 0 for i in range(10): x = [slices[(i+1) % 9], slices[(i+2) % 9], slices[(i+3) % 9], slices[(i+4) % 9], slices[(i+5) % 9], slices[(i+6) % 9], slices[(i+7) % 9], slices[(i+8) % 9]] x = pd.concat(x)
from sklearn import datasets from sklearn.feature_selection.rfe import RFE from sklearn.linear_model.logistic import LogisticRegression iris = datasets.load_iris() rfe = RFE(estimator=LogisticRegression(), n_features_to_select=2) fit = rfe.fit(iris.data, iris.target) print(fit.n_features_) print(fit.support_) print(fit.ranking_)