def test_rfecv():
    # Iris features padded with six columns of random noise; after RFECV
    # the reduced matrix is expected to match the original iris columns.
    rng = check_random_state(0)
    dataset = load_iris()
    noise = rng.normal(size=(len(dataset.data), 6))
    features = np.c_[dataset.data, noise]
    labels = dataset.target

    # First pass relies on the estimator's score function, the second one
    # on a customized loss function.
    for extra_params in ({}, {"loss_func": zero_one}):
        selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
                         **extra_params)
        selector.fit(features, labels)
        reduced = selector.transform(features)
        assert_true(reduced.shape == dataset.data.shape)
        assert_array_almost_equal(reduced[:10], dataset.data[:10])
def test_number_of_subsets_of_features():
    # The number of feature subsets visited by RFE equals the number of
    # iterations in '_fit', which is also max(ranking_). Two closed forms
    # exist for it:
    #   floor form: 1 + (n_features + step - n_features_to_select - 1) // step
    #   ceil form (after optimization #4534):
    #               1 + np.ceil((n_features - n_features_to_select) / float(step))
    # This test checks that both agree with the fitted estimators; see
    # #4534 and #3824.

    def floor_form(total, keep, stride):
        return 1 + ((total + stride - keep - 1) // stride)

    def ceil_form(total, keep, stride):
        return 1 + np.ceil((total - keep) / float(stride))

    # RFE
    # (11, 3, 2): n_features - n_features_to_select is divisible by step
    # (11, 3, 3): n_features - n_features_to_select is not divisible by step
    for total, keep, stride in [(11, 3, 2), (11, 3, 3)]:
        rng = check_random_state(43)
        features = rng.normal(size=(100, total))
        labels = rng.rand(100).round()
        eliminator = RFE(estimator=SVC(kernel="linear"),
                         n_features_to_select=keep, step=stride)
        eliminator.fit(features, labels)
        # the subset count also equals the maximum of ranking_
        assert_equal(np.max(eliminator.ranking_),
                     floor_form(total, keep, stride))
        assert_equal(np.max(eliminator.ranking_),
                     ceil_form(total, keep, stride))

    # RFECV.fit calls RFE._fit, so the subset count of RFE is the size of
    # 'grid_scores_' of RFECV (the number of loop iterations before
    # optimization #4534). Here n_features_to_select is always 1.
    # (11, 2): n_features - 1 is divisible by step
    # (10, 2): n_features - 1 is not divisible by step
    keep = 1
    for total, stride in [(11, 2), (10, 2)]:
        rng = check_random_state(43)
        features = rng.normal(size=(100, total))
        labels = rng.rand(100).round()
        selector = RFECV(estimator=SVC(kernel="linear"), step=stride, cv=5)
        selector.fit(features, labels)

        n_scores = selector.grid_scores_.shape[0]
        assert_equal(n_scores, floor_form(total, keep, stride))
        assert_equal(n_scores, ceil_form(total, keep, stride))
def _fs_rfecv(self, data, labels, plot_filename, sample=0.05):
    """
    Helper function to perform feature selection with Recursive Feature
    Elimination based on a cross-validation over the entire or part of the
    data set.

    @param data: the values of the features
    @type data: numpy.array
    @param labels: the values of the training labels
    @type labels: numpy.array
    @param plot_filename: a string where the graph will be stored; a falsy
        value disables plotting
    @type plot_filename: string
    @param sample: controls sub-sampling for the fit. The data is split
        into int(round(1 / sample)) stratified folds and the selector is
        fitted on the test indices of the last fold only. A falsy value
        fits on the whole data set.
    @type sample: float
    @return: a trained transformer, the modified data and the
        co-efficients assigned to each feature
    @rtype: tuple of (scikit.transformer, numpy.array, OrderedDict)
    """
    attributes = OrderedDict()
    svc = SVC(kernel="linear")

    if sample:
        # Use the test part of the last stratified fold as the subsample.
        skf = StratifiedKFold(labels, n_folds=int(round(1.00/sample)))
        last_fold = list(skf)[-1]
        sampledata = np.array([data[index] for index in last_fold[1]])
        samplelabels = np.array([labels[index] for index in last_fold[1]])
        log.info("RFECV will be performed on data: {}".format(sampledata.shape))
    else:
        sampledata = data
        samplelabels = labels

    transformer = RFECV(estimator=svc, step=1,
                        cv=StratifiedKFold(samplelabels, 5),
                        scoring='accuracy')
    log.info("scikit: Running feature selection RFECV_SVC")
    log.info("scikit: data dimensions before fit_transform(): {}".format(data.shape))
    log.info("scikit: labels dimensions before fit_transform(): {}".format(labels.shape))

    # Fit on the (possibly sub-sampled) data, then reduce the full data set.
    transformer.fit(sampledata, samplelabels)
    log.info("scikit: Data fit. Proceeding with transforming...")
    data = transformer.transform(data)
    log.info("scikit: Dimensions after fit_transform(): %s,%s" % data.shape)

    # produce a plot if requested and supported (for RFE)
    if plot_filename:
        try:
            grid_scores = transformer.grid_scores_
        except AttributeError:
            # No per-subset scores available: keep the original contract and
            # return early with an empty attribute mapping.
            return transformer, data, attributes
        figure = plt.figure()
        try:
            plt.xlabel("Number of features selected")
            plt.ylabel("Cross validation score (nb of correct classifications)")
            plt.plot(range(1, len(grid_scores) + 1), grid_scores)
            plt.savefig(plot_filename, bbox_inches='tight')
        finally:
            # Release the figure so repeated calls do not accumulate figures.
            plt.close(figure)

    # put ranks and the support mask in a mapping, so that we can get them
    # in the log file
    for i, rank in enumerate(transformer.ranking_):
        attributes["RFE_rank_f{}".format(i)] = rank
    for i, selected in enumerate(transformer.support_):
        attributes["RFE_mask_f{}".format(i)] = selected

    return transformer, data, attributes
def test_rfecv_mockclassifier():
    rng = check_random_state(0)
    dataset = load_iris()
    noise = rng.normal(size=(len(dataset.data), 6))
    features = np.c_[dataset.data, noise]
    # regression test: a plain list of targets should be supported
    labels = list(dataset.target)

    # Test using the score function
    selector = RFECV(estimator=MockClassifier(), step=1, cv=5)
    selector.fit(features, labels)

    # non-regression test for missing worst feature: both fitted arrays
    # must have one entry per input column
    n_columns = features.shape[1]
    assert_equal(len(selector.grid_scores_), n_columns)
    assert_equal(len(selector.ranking_), n_columns)
def run_rfe1(lang_pair, results_fname, subset=None, step=100, folds=2): ambig_fname = config["sample"][lang_pair]["ambig_fname"] ambig_map = AmbiguityMap(ambig_fname, subset=subset) samples_fname = config["sample"][lang_pair]["samples_filt_fname"] sample_hdfile = h5py.File(samples_fname, "r") data_gen = DataSetGenerator(ambig_map, sample_hdfile) estimator = SGDClassifier() descriptor = [ ("lemma", "U32"), ("pos", "U32"), ("#cand", "i"), ("#feats", "i"), ("prec", "f"), ("rec", "f"), ("f-score", "f")] results = np.zeros(len(ambig_map), dtype=descriptor) i = 0 for data in data_gen: print i+1, data.source_lempos, if not data.target_lempos: print "*** no samples ***" continue lemma, pos = data.source_lempos.rsplit("/", 1) n_cand = len(data.target_lempos) samples = data.samples.tocsr() # Fix scoring func rfecv = RFECV(estimator=estimator, step=step, cv=StratifiedKFold(data.targets, folds), loss_func=loss_func, verbose=True ) rfecv.fit(samples, data.targets) samples = rfecv.transform(samples) scores = cross_val_score(estimator, samples, data.targets, score_func=score_func) scores = scores.mean(axis=1) * 100 results[i] = (lemma, pos, n_cand, rfecv.n_features_) + tuple(scores) print results[i] print rfecv.cv_scores_ i += 1 np.save(results_fname, results[:i])
def test_rfe_cv_n_jobs():
    # Fitting with n_jobs=2 must give the same ranking and grid scores as
    # the default fit.
    rng = check_random_state(0)
    dataset = load_iris()
    noise = rng.normal(size=(len(dataset.data), 6))
    features = np.c_[dataset.data, noise]
    labels = dataset.target

    selector = RFECV(estimator=SVC(kernel='linear'))
    selector.fit(features, labels)
    reference_ranking = selector.ranking_
    reference_scores = selector.grid_scores_

    # Refit the same object after switching on parallel execution.
    selector.set_params(n_jobs=2)
    selector.fit(features, labels)

    assert_array_almost_equal(selector.ranking_, reference_ranking)
    assert_array_almost_equal(selector.grid_scores_, reference_scores)
def test_rfe_cv_groups():
    # RFECV.fit must accept a `groups` argument when using a group-aware
    # splitter.
    rng = check_random_state(0)
    dataset = load_iris()

    n_groups = 4
    group_ids = np.floor(np.linspace(0, n_groups, len(dataset.target)))

    features = dataset.data
    # Binary target: class 0 versus the rest.
    labels = (dataset.target > 0).astype(int)

    selector = RFECV(
        estimator=RandomForestClassifier(random_state=rng),
        step=1,
        scoring='accuracy',
        cv=GroupKFold(n_splits=2))
    selector.fit(features, labels, groups=group_ids)

    assert selector.n_features_ > 0
def test_rfe_cv_groups():
    # Smoke test: fitting RFECV with GroupKFold and explicit groups selects
    # at least one feature.
    seed_state = check_random_state(0)
    iris_bunch = load_iris()
    group_count = 4
    sample_count = len(iris_bunch.target)
    sample_groups = np.floor(np.linspace(0, group_count, sample_count))
    inputs = iris_bunch.data
    binary_target = (iris_bunch.target > 0).astype(int)

    forest = RandomForestClassifier(random_state=seed_state)
    splitter = GroupKFold(n_splits=2)
    grouped_selector = RFECV(
        estimator=forest,
        step=1,
        scoring='accuracy',
        cv=splitter
    )
    grouped_selector.fit(inputs, binary_target, groups=sample_groups)
    assert grouped_selector.n_features_ > 0
def test_rfecv_verbose_output():
    # Check verbose=1 is producing an output.
    from io import StringIO
    import sys

    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)

    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, verbose=1)

    # Capture stdout only for the duration of the fit and always restore
    # the real stream, so a failure here cannot swallow the output of
    # other tests.
    captured = StringIO()
    original_stdout = sys.stdout
    sys.stdout = captured
    try:
        rfecv.fit(X, y)
    finally:
        sys.stdout = original_stdout

    captured.seek(0)
    assert_greater(len(captured.readline()), 0)
def test_rfecv_verbose_output():
    # Check verbose=1 is producing an output.
    from sklearn.externals.six.moves import cStringIO as StringIO
    import sys

    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)

    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, verbose=1)

    # Redirect stdout just around the fit and restore it unconditionally;
    # leaving sys.stdout replaced would hide the output of later tests.
    verbose_output = StringIO()
    saved_stdout = sys.stdout
    sys.stdout = verbose_output
    try:
        rfecv.fit(X, y)
    finally:
        sys.stdout = saved_stdout

    verbose_output.seek(0)
    assert_greater(len(verbose_output.readline()), 0)
def test_rfecv_grid_scores_size():
    rng = check_random_state(0)
    dataset = load_iris()
    noise = rng.normal(size=(len(dataset.data), 6))
    features = np.c_[dataset.data, noise]
    # regression test: a plain list of targets should be supported
    labels = list(dataset.target)

    # Non-regression test for varying combinations of step and
    # min_features_to_select.
    combinations = [[2, 1], [2, 2], [3, 3]]
    for stride, floor in combinations:
        selector = RFECV(estimator=MockClassifier(), step=stride,
                         min_features_to_select=floor, cv=5)
        selector.fit(features, labels)

        # One score per evaluated subset size.
        expected_len = np.ceil((features.shape[1] - floor) / stride) + 1

        assert len(selector.grid_scores_) == expected_len
        assert len(selector.ranking_) == features.shape[1]
        assert selector.n_features_ >= floor
def test_rfecv_grid_scores_size():
    seed_state = check_random_state(0)
    iris_bunch = load_iris()
    padded = np.c_[iris_bunch.data,
                   seed_state.normal(size=(len(iris_bunch.data), 6))]
    # regression test: a plain list of targets should be supported
    target_list = list(iris_bunch.target)

    # Non-regression test for varying combinations of step and
    # min_features_to_select (cv is left at its default here).
    for step_size, min_keep in [[2, 1], [2, 2], [3, 3]]:
        selector = RFECV(
            estimator=MockClassifier(),
            step=step_size,
            min_features_to_select=min_keep,
        )
        selector.fit(padded, target_list)

        n_expected_scores = np.ceil(
            (padded.shape[1] - min_keep) / step_size) + 1
        assert len(selector.grid_scores_) == n_expected_scores
        assert len(selector.ranking_) == padded.shape[1]
        assert selector.n_features_ >= min_keep
def test_rfecv():
    rng = check_random_state(0)
    dataset = load_iris()
    noise = rng.normal(size=(len(dataset.data), 6))
    features = np.c_[dataset.data, noise]
    # regression test: a plain list of targets should be supported
    labels = list(dataset.target)

    # Test using the score function
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    selector.fit(features, labels)
    # non-regression test for missing worst feature:
    assert_equal(len(selector.grid_scores_), features.shape[1])
    assert_equal(len(selector.ranking_), features.shape[1])
    reduced = selector.transform(features)

    # All the noisy variables were filtered out
    assert_array_equal(reduced, dataset.data)

    # same in sparse
    sparse_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    sparse_features = sparse.csr_matrix(features)
    sparse_selector.fit(sparse_features, labels)
    sparse_reduced = sparse_selector.transform(sparse_features)
    assert_array_equal(sparse_reduced.toarray(), dataset.data)

    # Test using a customized loss function
    loss_scorer = make_scorer(zero_one_loss, greater_is_better=False)
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=loss_scorer)
    ignore_warnings(selector.fit)(features, labels)
    reduced = selector.transform(features)
    assert_array_equal(reduced, dataset.data)

    # Test using a scorer
    accuracy_scorer = SCORERS['accuracy']
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=accuracy_scorer)
    selector.fit(features, labels)
    reduced = selector.transform(features)
    assert_array_equal(reduced, dataset.data)

    # Test fix on grid_scores: a scorer that always returns 1.0 must
    # produce grid scores that are all ones.
    def constant_scorer(estimator, X, y):
        return 1.0

    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=constant_scorer)
    selector.fit(features, labels)
    n_scores = len(selector.grid_scores_)
    assert_array_equal(selector.grid_scores_, np.ones(n_scores))
def test_rfecv():
    rng = check_random_state(0)
    dataset = load_iris()
    noise = rng.normal(size=(len(dataset.data), 6))
    features = np.c_[dataset.data, noise]
    # regression test: a plain list of targets should be supported
    labels = list(dataset.target)

    # Test using the score function
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    selector.fit(features, labels)
    # non-regression test for missing worst feature:
    assert_equal(len(selector.cv_scores_), features.shape[1])
    assert_equal(len(selector.ranking_), features.shape[1])
    reduced = selector.transform(features)

    # same in sparse
    sparse_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    sparse_features = sparse.csr_matrix(features)
    sparse_selector.fit(sparse_features, labels)
    sparse_reduced = sparse_selector.transform(sparse_features)

    # Dense result matches the iris columns; sparse result matches dense.
    assert_equal(reduced.shape, dataset.data.shape)
    assert_array_almost_equal(reduced[:10], dataset.data[:10])
    assert_array_almost_equal(sparse_reduced.toarray(), reduced)

    # Test using a customized loss function
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
                     loss_func=zero_one)
    selector.fit(features, labels)
    reduced = selector.transform(features)

    assert_equal(reduced.shape, dataset.data.shape)
    assert_array_almost_equal(reduced[:10], dataset.data[:10])
def test_rfecv():
    seed_state = check_random_state(0)
    iris_bunch = load_iris()
    padded = np.c_[iris_bunch.data,
                   seed_state.normal(size=(len(iris_bunch.data), 6))]
    target = iris_bunch.target
    n_columns = padded.shape[1]

    # Test using the score function
    dense_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    dense_selector.fit(padded, target)

    # non-regression test for missing worst feature:
    assert_equal(len(dense_selector.cv_scores_), n_columns)
    assert_equal(len(dense_selector.ranking_), n_columns)
    dense_out = dense_selector.transform(padded)

    # same in sparse
    csr_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    padded_csr = sparse.csr_matrix(padded)
    csr_selector.fit(padded_csr, target)
    csr_out = csr_selector.transform(padded_csr)

    assert_equal(dense_out.shape, iris_bunch.data.shape)
    assert_array_almost_equal(dense_out[:10], iris_bunch.data[:10])
    assert_array_almost_equal(csr_out.toarray(), dense_out)

    # Test using a customized loss function
    loss_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
                          loss_func=zero_one)
    loss_selector.fit(padded, target)
    loss_out = loss_selector.transform(padded)

    assert_equal(loss_out.shape, iris_bunch.data.shape)
    assert_array_almost_equal(loss_out[:10], iris_bunch.data[:10])
def test_rfecv():
    rng = check_random_state(0)
    dataset = load_iris()
    noise = rng.normal(size=(len(dataset.data), 6))
    features = np.c_[dataset.data, noise]
    labels = dataset.target
    expected_shape = dataset.data.shape

    # Test using the score function
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    selector.fit(features, labels)
    reduced = selector.transform(features)

    # same in sparse
    sparse_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    sparse_features = sparse.csr_matrix(features)
    sparse_selector.fit(sparse_features, labels)
    sparse_reduced = sparse_selector.transform(sparse_features)

    assert_equal(reduced.shape, expected_shape)
    assert_array_almost_equal(reduced[:10], dataset.data[:10])
    # Sparse and dense inputs must select the same columns.
    assert_array_almost_equal(sparse_reduced.toarray(), reduced)

    # Test using a customized loss function
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
                     loss_func=zero_one)
    selector.fit(features, labels)
    reduced = selector.transform(features)

    assert_equal(reduced.shape, expected_shape)
    assert_array_almost_equal(reduced[:10], dataset.data[:10])
def test_rfecv():
    seed_state = check_random_state(0)
    iris_bunch = load_iris()
    padded = np.c_[iris_bunch.data,
                   seed_state.normal(size=(len(iris_bunch.data), 6))]
    # regression test: a plain list of targets should be supported
    target_list = list(iris_bunch.target)

    # Test using the score function
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    selector.fit(padded, target_list)
    # non-regression test for missing worst feature:
    assert_equal(len(selector.grid_scores_), padded.shape[1])
    assert_equal(len(selector.ranking_), padded.shape[1])
    selected = selector.transform(padded)

    # All the noisy variables were filtered out
    assert_array_equal(selected, iris_bunch.data)

    # same in sparse
    csr_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    padded_csr = sparse.csr_matrix(padded)
    csr_selector.fit(padded_csr, target_list)
    selected_csr = csr_selector.transform(padded_csr)
    assert_array_equal(selected_csr.toarray(), iris_bunch.data)

    # Test using a customized loss function
    loss_scorer = make_scorer(zero_one_loss, greater_is_better=False)
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=loss_scorer)
    ignore_warnings(selector.fit)(padded, target_list)
    selected = selector.transform(padded)
    assert_array_equal(selected, iris_bunch.data)

    # Test using a scorer
    accuracy_scorer = get_scorer('accuracy')
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=accuracy_scorer)
    selector.fit(padded, target_list)
    selected = selector.transform(padded)
    assert_array_equal(selected, iris_bunch.data)

    # Test fix on grid_scores: with a scorer that is constantly 1.0 every
    # grid score must be exactly one.
    def always_one(estimator, X, y):
        return 1.0

    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=always_one)
    selector.fit(padded, target_list)
    assert_array_equal(selector.grid_scores_,
                       np.ones(len(selector.grid_scores_)))
def test_rfecv():
    seed_state = check_random_state(0)
    iris_bunch = load_iris()
    padded = np.c_[iris_bunch.data,
                   seed_state.normal(size=(len(iris_bunch.data), 6))]
    target = iris_bunch.target

    def check_selection(selector):
        # Fit, reduce, and compare the result against the iris columns.
        selector.fit(padded, target)
        kept = selector.transform(padded)
        assert_true(kept.shape == iris_bunch.data.shape)
        assert_array_almost_equal(kept[:10], iris_bunch.data[:10])

    # Test using the score function
    check_selection(
        RFECV(estimator=SVC(kernel="linear", C=100), step=1, cv=3))

    # Test using a customized loss function
    check_selection(
        RFECV(estimator=SVC(kernel="linear", C=100), step=1, cv=3,
              loss_func=zero_one))
def test_rfecv():
    rng = check_random_state(0)
    dataset = load_iris()
    noise = rng.normal(size=(len(dataset.data), 6))
    features = np.c_[dataset.data, noise]
    # regression test: a plain list of targets should be supported
    labels = list(dataset.target)

    # Test using the score function
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    selector.fit(features, labels)
    # non-regression test for missing worst feature:
    assert_equal(len(selector.grid_scores_), features.shape[1])
    assert_equal(len(selector.ranking_), features.shape[1])
    reduced = selector.transform(features)

    # same in sparse
    sparse_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    sparse_features = sparse.csr_matrix(features)
    sparse_selector.fit(sparse_features, labels)
    sparse_reduced = sparse_selector.transform(sparse_features)

    assert_equal(reduced.shape, dataset.data.shape)
    assert_array_almost_equal(reduced[:10], dataset.data[:10])
    assert_array_almost_equal(sparse_reduced.toarray(), reduced)

    # Test using a customized loss function; warnings raised during the
    # fit are recorded instead of being shown.
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
                     loss_func=zero_one_loss)
    with warnings.catch_warnings(record=True):
        selector.fit(features, labels)
    reduced = selector.transform(features)

    assert_equal(reduced.shape, dataset.data.shape)
    assert_array_almost_equal(reduced[:10], dataset.data[:10])

    # Test using a scorer
    accuracy_scorer = SCORERS['accuracy']
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
                     scoring=accuracy_scorer)
    selector.fit(features, labels)
    reduced = selector.transform(features)

    assert_equal(reduced.shape, dataset.data.shape)
    assert_array_almost_equal(reduced[:10], dataset.data[:10])
def test_rfecv():
    seed_state = check_random_state(0)
    iris_bunch = load_iris()
    padded = np.c_[iris_bunch.data,
                   seed_state.normal(size=(len(iris_bunch.data), 6))]
    # regression test: a plain list of targets should be supported
    target_list = list(iris_bunch.target)

    # Test using the score function
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    selector.fit(padded, target_list)
    # non-regression test for missing worst feature:
    assert_equal(len(selector.grid_scores_), padded.shape[1])
    assert_equal(len(selector.ranking_), padded.shape[1])
    selected = selector.transform(padded)

    # All the noisy variables were filtered out
    assert_array_equal(selected, iris_bunch.data)

    # same in sparse
    csr_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    padded_csr = sparse.csr_matrix(padded)
    csr_selector.fit(padded_csr, target_list)
    selected_csr = csr_selector.transform(padded_csr)
    assert_array_equal(selected_csr.toarray(), iris_bunch.data)

    # Test using a customized loss function; warnings raised during the
    # fit are recorded instead of being shown.
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     loss_func=zero_one_loss)
    with warnings.catch_warnings(record=True):
        selector.fit(padded, target_list)
    selected = selector.transform(padded)
    assert_array_equal(selected, iris_bunch.data)

    # Test using a scorer
    accuracy_scorer = SCORERS['accuracy']
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=accuracy_scorer)
    selector.fit(padded, target_list)
    selected = selector.transform(padded)
    assert_array_equal(selected, iris_bunch.data)
def test_rfecv(X, y):
    # Exploratory variant: runs several RFECV configurations on the
    # caller-supplied X and y and prints the results instead of asserting
    # on them.

    # Test using the score function (logistic regression, 10 folds)
    accuracy_scorer = get_scorer('accuracy')
    selector = RFECV(estimator=LogisticRegression(), step=1, cv=10,
                     scoring=accuracy_scorer)
    selector.fit(X, y)
    reduced = selector.transform(X)
    print(selector.ranking_)
    print(selector.support_)
    print(reduced.shape)

    # same in sparse
    sparse_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    sparse_input = sparse.csr_matrix(X)
    sparse_selector.fit(sparse_input, y)
    sparse_reduced = sparse_selector.transform(sparse_input)
    print(sparse_reduced.shape)

    # Test using a customized loss function
    loss_scorer = make_scorer(zero_one_loss, greater_is_better=False)
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=loss_scorer)
    ignore_warnings(selector.fit)(X, y)
    reduced = selector.transform(X)
    print(reduced.shape)

    # Test using a scorer
    accuracy_scorer = get_scorer('accuracy')
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=accuracy_scorer)
    selector.fit(X, y)
    reduced = selector.transform(X)
    print(reduced.shape)

    # Same as the dense/sparse SVC runs above, but with step=2
    selector = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    selector.fit(X, y)
    reduced = selector.transform(X)
    print(reduced.shape)

    sparse_selector = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    sparse_input = sparse.csr_matrix(X)
    sparse_selector.fit(sparse_input, y)
    sparse_reduced = sparse_selector.transform(sparse_input)
    print(sparse_reduced.shape)
def test_rfecv():
    rng = check_random_state(0)
    dataset = load_iris()
    noise = rng.normal(size=(len(dataset.data), 6))
    features = np.c_[dataset.data, noise]
    # regression test: a plain list of targets should be supported
    labels = list(dataset.target)

    # Test using the score function
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    selector.fit(features, labels)
    # non-regression test for missing worst feature:
    assert_equal(len(selector.grid_scores_), features.shape[1])
    assert_equal(len(selector.ranking_), features.shape[1])
    reduced = selector.transform(features)

    # All the noisy variables were filtered out
    assert_array_equal(reduced, dataset.data)

    # same in sparse
    sparse_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    sparse_features = sparse.csr_matrix(features)
    sparse_selector.fit(sparse_features, labels)
    sparse_reduced = sparse_selector.transform(sparse_features)
    assert_array_equal(sparse_reduced.toarray(), dataset.data)

    # Test using a customized loss function
    loss_scorer = make_scorer(zero_one_loss, greater_is_better=False)
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=loss_scorer)
    ignore_warnings(selector.fit)(features, labels)
    reduced = selector.transform(features)
    assert_array_equal(reduced, dataset.data)

    # Test using a scorer
    accuracy_scorer = get_scorer('accuracy')
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=accuracy_scorer)
    selector.fit(features, labels)
    reduced = selector.transform(features)
    assert_array_equal(reduced, dataset.data)

    # Test fix on grid_scores
    def constant_scorer(estimator, X, y):
        return 1.0

    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                     scoring=constant_scorer)
    selector.fit(features, labels)
    assert_array_equal(selector.grid_scores_,
                       np.ones(len(selector.grid_scores_)))
    # In the event of cross validation score ties, the expected behavior of
    # RFECV is to return the FEWEST features that maximize the CV score.
    # Because the scorer always returns 1.0 in this example, RFECV should
    # reduce the dimensionality to a single feature (i.e. n_features_ = 1)
    assert_equal(selector.n_features_, 1)

    # Same as the first two tests, but with step=2
    selector = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    selector.fit(features, labels)
    assert_equal(len(selector.grid_scores_), 6)
    assert_equal(len(selector.ranking_), features.shape[1])
    reduced = selector.transform(features)
    assert_array_equal(reduced, dataset.data)

    sparse_selector = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    sparse_features = sparse.csr_matrix(features)
    sparse_selector.fit(sparse_features, labels)
    sparse_reduced = sparse_selector.transform(sparse_features)
    assert_array_equal(sparse_reduced.toarray(), dataset.data)

    # Verifying that steps < 1 don't blow up.
    sparse_selector = RFECV(estimator=SVC(kernel="linear"), step=.2, cv=5)
    sparse_features = sparse.csr_matrix(features)
    sparse_selector.fit(sparse_features, labels)
    sparse_reduced = sparse_selector.transform(sparse_features)
    assert_array_equal(sparse_reduced.toarray(), dataset.data)