Esempio n. 1
0
def test_rfecv():
    generator = check_random_state(0)

    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)

    assert_true(X_r.shape == iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    # Test using a customized loss function
    rfecv = RFECV(estimator=SVC(kernel="linear"),
                  step=1,
                  cv=3,
                  loss_func=zero_one)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)

    assert_true(X_r.shape == iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])
Esempio n. 2
0
def test_number_of_subsets_of_features():
    # In RFE, 'number_of_subsets_of_features'
    # = the number of iterations in '_fit'
    # = max(ranking_)
    # = 1 + (n_features + step - n_features_to_select - 1) // step
    # After optimization #4534, this number
    # = 1 + np.ceil((n_features - n_features_to_select) / float(step))
    # This test case is to test their equivalence, refer to #4534 and #3824

    def formula1(n_features, n_features_to_select, step):
        return 1 + ((n_features + step - n_features_to_select - 1) // step)

    def formula2(n_features, n_features_to_select, step):
        return 1 + np.ceil((n_features - n_features_to_select) / float(step))

    # RFE
    # Case 1, n_features - n_features_to_select is divisible by step
    # Case 2, n_features - n_features_to_select is not divisible by step
    n_features_list = [11, 11]
    n_features_to_select_list = [3, 3]
    step_list = [2, 3]
    for n_features, n_features_to_select, step in zip(
            n_features_list, n_features_to_select_list, step_list):
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfe = RFE(estimator=SVC(kernel="linear"),
                  n_features_to_select=n_features_to_select,
                  step=step)
        rfe.fit(X, y)
        # this number also equals to the maximum of ranking_
        assert_equal(np.max(rfe.ranking_),
                     formula1(n_features, n_features_to_select, step))
        assert_equal(np.max(rfe.ranking_),
                     formula2(n_features, n_features_to_select, step))

    # In RFECV, 'fit' calls 'RFE._fit'
    # 'number_of_subsets_of_features' of RFE
    # = the size of 'grid_scores' of RFECV
    # = the number of iterations of the for loop before optimization #4534

    # RFECV, n_features_to_select = 1
    # Case 1, n_features - 1 is divisible by step
    # Case 2, n_features - 1 is not divisible by step

    n_features_to_select = 1
    n_features_list = [11, 10]
    step_list = [2, 2]
    for n_features, step in zip(n_features_list, step_list):
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfecv = RFECV(estimator=SVC(kernel="linear"), step=step, cv=5)
        rfecv.fit(X, y)

        assert_equal(rfecv.grid_scores_.shape[0],
                     formula1(n_features, n_features_to_select, step))
        assert_equal(rfecv.grid_scores_.shape[0],
                     formula2(n_features, n_features_to_select, step))
Esempio n. 3
0
 def _fs_rfecv(self, data, labels, plot_filename, sample = 0.05):
     """
     Helper function to perform feature selection with Recursive Feature Elimination based
     on a cross-validation over the entire or part of the data set. 
     @param data: the values of the features
     @type data: numpy.array
     @param labels: the values of the training labels
     @type labels: numpy.array
     @param plot_filename: a string where the graph will be stored
     @type plot_filename: string
     @return: a trained transformer, the modified data and the co-efficients assigned to each feature
     @rtype: tuple of (scikit.transformer, numpy.array, OrderedDict)
     """
     attributes = OrderedDict()
     svc = SVC(kernel="linear")
     
     if sample: 
         skf = StratifiedKFold(labels, n_folds=int(round(1.00/sample)))
         last_fold = list(skf)[-1]
         sampledata = np.array([data[index] for index in last_fold[1]])
         samplelabels = np.array([labels[index] for index in last_fold[1]])
         log.info("RFECV will be performed on data: {}".format(sampledata.shape))
     else:
         sampledata = data
         samplelabels = labels
     
     transformer = RFECV(estimator=svc, step=1, cv=StratifiedKFold(samplelabels, 5),
       scoring='accuracy')
     log.info("scikit: Running feature selection RFECV_SVC")
     
     log.info("scikit: data dimensions before fit_transform(): {}".format(data.shape))
     log.info("scikit: labels dimensions before fit_transform(): {}".format(labels.shape))
     
     
     transformer.fit(sampledata, samplelabels)
     log.info("scikit: Data fit. Proceeding with transforming...")
     data = transformer.transform(data)
     log.info("scikit: Dimensions after fit_transform(): %s,%s" % data.shape)
     
     #produce a plot if requested and supported (for RFE)
     if plot_filename:
         try:
             grid_scores = transformer.grid_scores_
         except:
             return transformer, data, attributes
         plt.figure()
         plt.xlabel("Number of features selected")
         plt.ylabel("Cross validation score (nb of correct classifications)")
         plt.plot(range(1, len(grid_scores) + 1), grid_scores)
         plt.savefig(plot_filename, bbox_inches='tight')
         
     #put ranks in an array, so that we can get them in the log file
     for i, rank_strings in enumerate(transformer.ranking_):
         attributes["RFE_rank_f{}".format(i)] = rank_strings
     
     for i, rank_strings in enumerate(transformer.support_):
         attributes["RFE_mask_f{}".format(i)] = rank_strings
             
     return transformer, data, attributes
Esempio n. 4
0
def test_number_of_subsets_of_features():
    # In RFE, 'number_of_subsets_of_features'
    # = the number of iterations in '_fit'
    # = max(ranking_)
    # = 1 + (n_features + step - n_features_to_select - 1) // step
    # After optimization #4534, this number
    # = 1 + np.ceil((n_features - n_features_to_select) / float(step))
    # This test case is to test their equivalence, refer to #4534 and #3824

    def formula1(n_features, n_features_to_select, step):
        return 1 + ((n_features + step - n_features_to_select - 1) // step)

    def formula2(n_features, n_features_to_select, step):
        return 1 + np.ceil((n_features - n_features_to_select) / float(step))

    # RFE
    # Case 1, n_features - n_features_to_select is divisible by step
    # Case 2, n_features - n_features_to_select is not divisible by step
    n_features_list = [11, 11]
    n_features_to_select_list = [3, 3]
    step_list = [2, 3]
    for n_features, n_features_to_select, step in zip(
            n_features_list, n_features_to_select_list, step_list):
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfe = RFE(estimator=SVC(kernel="linear"),
                  n_features_to_select=n_features_to_select, step=step)
        rfe.fit(X, y)
        # this number also equals to the maximum of ranking_
        assert_equal(np.max(rfe.ranking_),
                     formula1(n_features, n_features_to_select, step))
        assert_equal(np.max(rfe.ranking_),
                     formula2(n_features, n_features_to_select, step))

    # In RFECV, 'fit' calls 'RFE._fit'
    # 'number_of_subsets_of_features' of RFE
    # = the size of 'grid_scores' of RFECV
    # = the number of iterations of the for loop before optimization #4534

    # RFECV, n_features_to_select = 1
    # Case 1, n_features - 1 is divisible by step
    # Case 2, n_features - 1 is not divisible by step

    n_features_to_select = 1
    n_features_list = [11, 10]
    step_list = [2, 2]
    for n_features, step in zip(n_features_list, step_list):
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfecv = RFECV(estimator=SVC(kernel="linear"), step=step, cv=5)
        rfecv.fit(X, y)

        assert_equal(rfecv.grid_scores_.shape[0],
                     formula1(n_features, n_features_to_select, step))
        assert_equal(rfecv.grid_scores_.shape[0],
                     formula2(n_features, n_features_to_select, step))
Esempio n. 5
0
def test_rfecv_mockclassifier():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=MockClassifier(), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
Esempio n. 6
0
def test_rfecv_mockclassifier():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=MockClassifier(), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
Esempio n. 7
0
def run_rfe1(lang_pair, results_fname, subset=None, step=100, folds=2):
    ambig_fname = config["sample"][lang_pair]["ambig_fname"]
    ambig_map = AmbiguityMap(ambig_fname, subset=subset)
    
    samples_fname = config["sample"][lang_pair]["samples_filt_fname"]
    sample_hdfile = h5py.File(samples_fname, "r")
    
    data_gen = DataSetGenerator(ambig_map, sample_hdfile)
    
    estimator = SGDClassifier()
    
    descriptor = [ ("lemma", "U32"),
                   ("pos", "U32"),
                   ("#cand", "i"),
                   ("#feats", "i"),
                   ("prec", "f"),
                   ("rec", "f"),
                   ("f-score", "f")] 
    results = np.zeros(len(ambig_map), dtype=descriptor)
    i = 0
    
    for data in data_gen:
        print i+1, data.source_lempos, 
        if not data.target_lempos:
            print "*** no samples ***"
            continue
        lemma, pos = data.source_lempos.rsplit("/", 1)
        n_cand = len(data.target_lempos)
        samples = data.samples.tocsr()
        # Fix scoring func
        rfecv = RFECV(estimator=estimator, 
                      step=step, 
                      cv=StratifiedKFold(data.targets, folds),
                      loss_func=loss_func,
                      verbose=True
                      )
        rfecv.fit(samples, data.targets)
        samples = rfecv.transform(samples)
        scores = cross_val_score(estimator, 
                                 samples, 
                                 data.targets,
                                 score_func=score_func)
        scores = scores.mean(axis=1) * 100
        results[i] = (lemma, pos, n_cand, rfecv.n_features_) + tuple(scores)
        print results[i]
        print rfecv.cv_scores_
        i += 1

    np.save(results_fname, results[:i])
Esempio n. 8
0
def test_rfe_cv_n_jobs():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    rfecv = RFECV(estimator=SVC(kernel='linear'))
    rfecv.fit(X, y)
    rfecv_ranking = rfecv.ranking_
    rfecv_grid_scores = rfecv.grid_scores_

    rfecv.set_params(n_jobs=2)
    rfecv.fit(X, y)
    assert_array_almost_equal(rfecv.ranking_, rfecv_ranking)
    assert_array_almost_equal(rfecv.grid_scores_, rfecv_grid_scores)
Esempio n. 9
0
def test_rfe_cv_groups():
    generator = check_random_state(0)
    iris = load_iris()
    number_groups = 4
    groups = np.floor(np.linspace(0, number_groups, len(iris.target)))
    X = iris.data
    y = (iris.target > 0).astype(int)

    est_groups = RFECV(
        estimator=RandomForestClassifier(random_state=generator),
        step=1,
        scoring='accuracy',
        cv=GroupKFold(n_splits=2))
    est_groups.fit(X, y, groups=groups)
    assert est_groups.n_features_ > 0
Esempio n. 10
0
def test_rfe_cv_n_jobs():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    rfecv = RFECV(estimator=SVC(kernel='linear'))
    rfecv.fit(X, y)
    rfecv_ranking = rfecv.ranking_
    rfecv_grid_scores = rfecv.grid_scores_

    rfecv.set_params(n_jobs=2)
    rfecv.fit(X, y)
    assert_array_almost_equal(rfecv.ranking_, rfecv_ranking)
    assert_array_almost_equal(rfecv.grid_scores_, rfecv_grid_scores)
Esempio n. 11
0
def test_rfe_cv_groups():
    generator = check_random_state(0)
    iris = load_iris()
    number_groups = 4
    groups = np.floor(np.linspace(0, number_groups, len(iris.target)))
    X = iris.data
    y = (iris.target > 0).astype(int)

    est_groups = RFECV(
        estimator=RandomForestClassifier(random_state=generator),
        step=1,
        scoring='accuracy',
        cv=GroupKFold(n_splits=2)
    )
    est_groups.fit(X, y, groups=groups)
    assert est_groups.n_features_ > 0
Esempio n. 12
0
def test_rfecv_verbose_output():
    # Check verbose=1 is producing an output.
    from io import StringIO
    import sys
    sys.stdout = StringIO()

    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)

    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, verbose=1)
    rfecv.fit(X, y)

    verbose_output = sys.stdout
    verbose_output.seek(0)
    assert_greater(len(verbose_output.readline()), 0)
Esempio n. 13
0
def test_rfecv_verbose_output():
    # Check verbose=1 is producing an output.
    from sklearn.externals.six.moves import cStringIO as StringIO
    import sys
    sys.stdout = StringIO()

    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)

    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, verbose=1)
    rfecv.fit(X, y)

    verbose_output = sys.stdout
    verbose_output.seek(0)
    assert_greater(len(verbose_output.readline()), 0)
Esempio n. 14
0
def test_rfecv_grid_scores_size():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Non-regression test for varying combinations of step and
    # min_features_to_select.
    for step, min_features_to_select in [[2, 1], [2, 2], [3, 3]]:
        rfecv = RFECV(estimator=MockClassifier(), step=step,
                      min_features_to_select=min_features_to_select, cv=5)
        rfecv.fit(X, y)

        score_len = np.ceil(
            (X.shape[1] - min_features_to_select) / step) + 1
        assert len(rfecv.grid_scores_) == score_len
        assert len(rfecv.ranking_) == X.shape[1]
        assert rfecv.n_features_ >= min_features_to_select
Esempio n. 15
0
def test_rfecv_grid_scores_size():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Non-regression test for varying combinations of step and
    # min_features_to_select.
    for step, min_features_to_select in [[2, 1], [2, 2], [3, 3]]:
        rfecv = RFECV(estimator=MockClassifier(), step=step,
                      min_features_to_select=min_features_to_select)
        rfecv.fit(X, y)

        score_len = np.ceil(
            (X.shape[1] - min_features_to_select) / step) + 1
        assert len(rfecv.grid_scores_) == score_len
        assert len(rfecv.ranking_) == X.shape[1]
        assert rfecv.n_features_ >= min_features_to_select
Esempio n. 16
0
def test_rfecv():
    generator = check_random_state(0)

    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"),
                  step=1,
                  cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = SCORERS['accuracy']
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0

    rfecv = RFECV(estimator=SVC(kernel="linear"),
                  step=1,
                  cv=5,
                  scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))
Esempio n. 17
0
def test_rfecv():
    generator = check_random_state(0)

    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.cv_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])
    assert_array_almost_equal(X_r_sparse.toarray(), X_r)

    # Test using a customized loss function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
                  loss_func=zero_one)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])
Esempio n. 18
0
def test_rfecv():
    generator = check_random_state(0)

    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.cv_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])
    assert_array_almost_equal(X_r_sparse.toarray(), X_r)

    # Test using a customized loss function
    rfecv = RFECV(estimator=SVC(kernel="linear"),
                  step=1,
                  cv=3,
                  loss_func=zero_one)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])
Esempio n. 19
0
def test_rfecv():
    generator = check_random_state(0)

    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])
    assert_array_almost_equal(X_r_sparse.toarray(), X_r)

    # Test using a customized loss function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
            loss_func=zero_one)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])
Esempio n. 20
0
def test_rfecv():
    generator = check_random_state(0)

    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))
Esempio n. 21
0
def test_rfecv():
    generator = check_random_state(0)

    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear", C=100), step=1, cv=3)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)

    assert_true(X_r.shape == iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    # Test using a customized loss function
    rfecv = RFECV(estimator=SVC(kernel="linear", C=100), step=1, cv=3,
            loss_func=zero_one)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)

    assert_true(X_r.shape == iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])
def test_rfecv():
    generator = check_random_state(0)

    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])
    assert_array_almost_equal(X_r_sparse.toarray(), X_r)

    # Test using a customized loss function
    rfecv = RFECV(estimator=SVC(kernel="linear"),
                  step=1,
                  cv=3,
                  loss_func=zero_one_loss)
    with warnings.catch_warnings(record=True):
        rfecv.fit(X, y)
    X_r = rfecv.transform(X)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    # Test using a scorer
    scorer = SCORERS['accuracy']
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3, scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])
Esempio n. 23
0
def test_rfecv():
    generator = check_random_state(0)

    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  loss_func=zero_one_loss)
    with warnings.catch_warnings(record=True):
        rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = SCORERS['accuracy']
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)
def test_rfecv(X, y):
    # generator = check_random_state(0)
    # iris = load_iris()
    # X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    # y = list(iris.target)  # regression test: list should be supported
    # print(X)
    # print(y)

    # Test using the score function
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=LogisticRegression(),
                  step=1,
                  cv=10,
                  scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    print(rfecv.ranking_)
    print(rfecv.support_)

    print(X_r.shape)
    # All the noisy variable were filtered out
    #assert_array_equal(X_r, X)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    print(X_r_sparse.shape)
    #assert_array_equal(X_r_sparse.toarray(), X)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"),
                  step=1,
                  cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    print(X_r.shape)
    #assert_array_equal(X_r, X)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    print(X_r.shape)
    #assert_array_equal(X_r, X)

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    print(X_r.shape)
    #assert_array_equal(X_r, X)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    print(X_r_sparse.shape)
Esempio n. 25
0
def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"),
                  step=1,
                  cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0

    rfecv = RFECV(estimator=SVC(kernel="linear"),
                  step=1,
                  cv=5,
                  scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))
    # In the event of cross validation score ties, the expected behavior of
    # RFECV is to return the FEWEST features that maximize the CV score.
    # Because test_scorer always returns 1.0 in this example, RFECV should
    # reduce the dimensionality to a single feature (i.e. n_features_ = 1)
    assert_equal(rfecv.n_features_, 1)

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    rfecv.fit(X, y)
    assert_equal(len(rfecv.grid_scores_), 6)
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Verifying that steps < 1 don't blow up.
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=.2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)
Esempio n. 26
0
def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))
    # In the event of cross validation score ties, the expected behavior of
    # RFECV is to return the FEWEST features that maximize the CV score.
    # Because test_scorer always returns 1.0 in this example, RFECV should
    # reduce the dimensionality to a single feature (i.e. n_features_ = 1)
    assert_equal(rfecv.n_features_, 1)

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    rfecv.fit(X, y)
    assert_equal(len(rfecv.grid_scores_), 6)
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Verifying that steps < 1 don't blow up.
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=.2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)