Example #1
# imports needed by this snippet (they sit at module level in the original source)
from collections import OrderedDict

import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

def _fs_rfe(self, data, labels, plot_filename, n_features=10):
    svc = SVC(kernel="linear", C=1)
    transformer = RFE(estimator=svc, n_features_to_select=n_features, step=1)
    data = transformer.fit_transform(data, labels)

    attributes = OrderedDict()
    # produce a plot if requested and supported; plain RFE does not expose
    # grid_scores_ (only RFECV does), so bail out gracefully if it is missing
    if plot_filename:
        try:
            grid_scores = transformer.grid_scores_
        except AttributeError:
            return transformer, data, attributes
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation score (nb of correct classifications)")
        plt.plot(range(1, len(grid_scores) + 1), grid_scores)
        plt.savefig(plot_filename, bbox_inches='tight')

    # record per-feature ranks and the selection mask, so that they
    # end up in the log file
    for i, rank in enumerate(transformer.ranking_):
        attributes["RFE_rank_f{}".format(i)] = rank

    for i, selected in enumerate(transformer.support_):
        attributes["RFE_mask_f{}".format(i)] = selected

    return transformer, data, attributes
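The try/except above is there because plain RFE never has a grid_scores_ attribute; only RFECV, which cross-validates every candidate subset size, produces per-size scores. A minimal sketch of the variant that would actually reach the plotting code, on synthetic data; note that scikit-learn 1.0 replaced grid_scores_ with cv_results_, which is what this sketch reads:

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=20, random_state=0)

# RFECV cross-validates each candidate number of features,
# so it can report one score per subset size
selector = RFECV(estimator=SVC(kernel="linear", C=1), step=1, cv=5)
selector.fit(X, y)

scores = selector.cv_results_["mean_test_score"]  # grid_scores_ before 1.0
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Mean cross-validation score")
plt.plot(range(1, len(scores) + 1), scores)
plt.savefig("rfecv_scores.png", bbox_inches="tight")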
Example #2
def test_rfe():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    X_sparse = sparse.csr_matrix(X)
    y = iris.target

    # dense model
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert_equal(len(rfe.ranking_), X.shape[1])

    # sparse model
    clf_sparse = SVC(kernel="linear")
    rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1)
    rfe_sparse.fit(X_sparse, y)
    X_r_sparse = rfe_sparse.transform(X_sparse)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data))
    assert_equal(rfe.score(X, y), clf.score(iris.data, iris.target))
    assert_array_almost_equal(X_r, X_r_sparse.toarray())
def test_number_of_subsets_of_features():
    # In RFE, 'number_of_subsets_of_features'
    # = the number of iterations in '_fit'
    # = max(ranking_)
    # = 1 + (n_features + step - n_features_to_select - 1) // step
    # After optimization #4534, this number
    # = 1 + np.ceil((n_features - n_features_to_select) / float(step))
    # This test case checks their equivalence; refer to #4534 and #3824

    def formula1(n_features, n_features_to_select, step):
        return 1 + ((n_features + step - n_features_to_select - 1) // step)

    def formula2(n_features, n_features_to_select, step):
        return 1 + np.ceil((n_features - n_features_to_select) / float(step))

    # RFE
    # Case 1, n_features - n_features_to_select is divisible by step
    # Case 2, n_features - n_features_to_select is not divisible by step
    n_features_list = [11, 11]
    n_features_to_select_list = [3, 3]
    step_list = [2, 3]
    for n_features, n_features_to_select, step in zip(
            n_features_list, n_features_to_select_list, step_list):
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfe = RFE(estimator=SVC(kernel="linear"),
                  n_features_to_select=n_features_to_select,
                  step=step)
        rfe.fit(X, y)
        # this number also equals the maximum of ranking_
        assert_equal(np.max(rfe.ranking_),
                     formula1(n_features, n_features_to_select, step))
        assert_equal(np.max(rfe.ranking_),
                     formula2(n_features, n_features_to_select, step))

    # In RFECV, 'fit' calls 'RFE._fit'
    # 'number_of_subsets_of_features' of RFE
    # = the size of 'grid_scores' of RFECV
    # = the number of iterations of the for loop before optimization #4534

    # RFECV, n_features_to_select = 1
    # Case 1, n_features - 1 is divisible by step
    # Case 2, n_features - 1 is not divisible by step

    n_features_to_select = 1
    n_features_list = [11, 10]
    step_list = [2, 2]
    for n_features, step in zip(n_features_list, step_list):
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfecv = RFECV(estimator=SVC(kernel="linear"), step=step, cv=5)
        rfecv.fit(X, y)

        assert_equal(rfecv.grid_scores_.shape[0],
                     formula1(n_features, n_features_to_select, step))
        assert_equal(rfecv.grid_scores_.shape[0],
                     formula2(n_features, n_features_to_select, step))
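For a concrete check of the equivalence described in the comments: with n_features=11, n_features_to_select=3 and step=2, formula1 gives 1 + (11 + 2 - 3 - 1) // 2 = 1 + 4 = 5 and formula2 gives 1 + ceil((11 - 3) / 2) = 1 + 4 = 5; with step=3 the non-divisible case kicks in, and formula1 gives 1 + (11 + 3 - 3 - 1) // 3 = 1 + 3 = 4 while formula2 gives 1 + ceil(8 / 3) = 1 + 3 = 4.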
Example #5
def test_rfe_set_params():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    y_pred = rfe.fit(X, y).predict(X)

    clf = SVC()
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1,
              estimator_params={"kernel": "linear"})
    y_pred2 = rfe.fit(X, y).predict(X)
    assert_array_equal(y_pred, y_pred2)
def test_rfe_mockclassifier():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    # dense model
    clf = MockClassifier()
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert_equal(len(rfe.ranking_), X.shape[1])
    assert_equal(X_r.shape, iris.data.shape)
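MockClassifier is not defined in this excerpt; in scikit-learn's test suite it is a stub that merely looks like a fitted linear model. A minimal hypothetical stand-in with the two things RFE needs, a fit method that sets coef_ and clonability via BaseEstimator, would be:

import numpy as np
from sklearn.base import BaseEstimator

class MockClassifier(BaseEstimator):
    # every feature gets the same weight, so the RFE result depends only
    # on its own elimination bookkeeping, not on any real model
    def fit(self, X, y):
        self.coef_ = np.ones(X.shape[1])
        return self

    def predict(self, X):
        return np.zeros(X.shape[0])

    def score(self, X, y=None):
        return 0.0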
Example #7
def test_rfe_deprecation_estimator_params():
    deprecation_message = ("The parameter 'estimator_params' is deprecated as "
                           "of version 0.16 and will be removed in 0.18. The "
                           "parameter is no longer necessary because the "
                           "value is set via the estimator initialisation or "
                           "set_params method.")
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target
    assert_warns_message(DeprecationWarning,
                         deprecation_message,
                         RFE(estimator=SVC(),
                             n_features_to_select=4,
                             step=0.1,
                             estimator_params={
                                 'kernel': 'linear'
                             }).fit,
                         X=X,
                         y=y)

    assert_warns_message(DeprecationWarning,
                         deprecation_message,
                         RFECV(estimator=SVC(),
                               step=1,
                               cv=5,
                               estimator_params={
                                   'kernel': 'linear'
                               }).fit,
                         X=X,
                         y=y)
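
The replacement the deprecation message points to is simply configuring the wrapped estimator directly, either at construction time or through set_params; a short sketch of both, using the same linear-SVC setup as above:

# configure at construction time
rfe = RFE(estimator=SVC(kernel="linear"), n_features_to_select=4, step=0.1)

# or after the fact, via set_params on the estimator itself
clf = SVC()
clf.set_params(kernel="linear")
rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)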
def test_rfe_estimator_tags():
    rfe = RFE(SVC(kernel='linear'))
    assert_equal(rfe._estimator_type, "classifier")
    # make sure that cross-validation is stratified
    iris = load_iris()
    score = cross_val_score(rfe, iris.data, iris.target)
    assert_greater(score.min(), .7)
def test_rfe_set_params():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    y_pred = rfe.fit(X, y).predict(X)

    clf = SVC()
    with warnings.catch_warnings(record=True):
        # estimator_params is deprecated
        rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1,
                  estimator_params={'kernel': 'linear'})
        y_pred2 = rfe.fit(X, y).predict(X)
    assert_array_equal(y_pred, y_pred2)
Example #11
def test_rfe_features_importance():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    clf = RandomForestClassifier(n_estimators=10, n_jobs=1)
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    assert_equal(len(rfe.ranking_), X.shape[1])

    clf_svc = SVC(kernel="linear")
    rfe_svc = RFE(estimator=clf_svc, n_features_to_select=4, step=0.1)
    rfe_svc.fit(X, y)

    # Check that the supports are equal
    diff_support = rfe.get_support() == rfe_svc.get_support()
    assert_true(sum(diff_support) == len(diff_support))
Example #12
def test_rfe_features_importance():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    clf = RandomForestClassifier(n_estimators=20,
                                 random_state=generator, max_depth=2)
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    assert len(rfe.ranking_) == X.shape[1]

    clf_svc = SVC(kernel="linear")
    rfe_svc = RFE(estimator=clf_svc, n_features_to_select=4, step=0.1)
    rfe_svc.fit(X, y)

    # Check that the supports are equal
    assert_array_equal(rfe.get_support(), rfe_svc.get_support())
def test_rfe_min_step():
    n_features = 10
    X, y = make_friedman1(n_samples=50, n_features=n_features, random_state=0)
    n_samples, n_features = X.shape
    estimator = SVR(kernel="linear")

    # Test when floor(step * n_features) <= 0
    selector = RFE(estimator, step=0.01)
    sel = selector.fit(X, y)
    assert_equal(sel.support_.sum(), n_features // 2)

    # Test when step is between (0,1) and floor(step * n_features) > 0
    selector = RFE(estimator, step=0.20)
    sel = selector.fit(X, y)
    assert_equal(sel.support_.sum(), n_features // 2)

    # Test when step is an integer
    selector = RFE(estimator, step=5)
    sel = selector.fit(X, y)
    assert_equal(sel.support_.sum(), n_features // 2)
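All three cases land on the same selection because, when n_features_to_select is left unset, RFE keeps half the features, and a fractional step is read as a fraction of the current feature count, floored and clamped to at least one removal per iteration. A paraphrase of that logic (not the library source itself):

def features_removed_per_iteration(step, n_features):
    # fractional step: a proportion of the features, never less than 1
    if 0.0 < step < 1.0:
        return max(1, int(step * n_features))
    # integer step: taken literally
    return int(step)

So step=0.01 on 10 features still removes one feature per round and ends at the same 5-feature default as step=0.20 or step=5.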
Example #14
def test_rfe_features_importance():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    clf = RandomForestClassifier(n_estimators=20,
                                 random_state=generator, max_depth=2)
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    assert_equal(len(rfe.ranking_), X.shape[1])

    clf_svc = SVC(kernel="linear")
    rfe_svc = RFE(estimator=clf_svc, n_features_to_select=4, step=0.1)
    rfe_svc.fit(X, y)

    # Check that the supports are equal
    assert_array_equal(rfe.get_support(), rfe_svc.get_support())
Example #15
def test_rfe():
    generator = check_random_state(0)

    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)

    assert_true(X_r.shape == iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data))
    assert_true(rfe.score(X, y) == clf.score(iris.data, iris.target))
Example #16
def test_rfe_set_params():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    y_pred = rfe.fit(X, y).predict(X)

    clf = SVC()
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1,
              estimator_params={'kernel': 'linear'})
    y_pred2 = rfe.fit(X, y).predict(X)
    assert_array_equal(y_pred, y_pred2)
model = linear_model.Ridge(alpha=0.5)
RMSE = 0
for i in range(10):
    x = pd.concat([slices[(i + j) % 9] for j in range(1, 9)])
    y = pd.concat([slicesOfc7[(i + j) % 9] for j in range(1, 9)])
    model.fit(x, y)
    pr = model.predict(slices[i])
    RMSE += math.pow(((pr - slicesOfc7[i]) ** 2).sum() / len(slicesOfc7[i]), 0.5)
print("RMSE for validation is:")
print(RMSE / k)  # k is the fold count, defined earlier in the script
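
The hand-rolled fold bookkeeping above (note the % 9 indexing, which sits oddly against the range(10) loop) is what sklearn.model_selection.KFold automates. A sketch of the same Ridge evaluation with library folds, where X_all and y_all are hypothetical names for the concatenated feature frame and target built earlier in the script:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=0)
rmse = 0.0
for train_idx, test_idx in kf.split(X_all):
    model = Ridge(alpha=0.5)
    model.fit(X_all.iloc[train_idx], y_all.iloc[train_idx])
    pred = model.predict(X_all.iloc[test_idx])
    rmse += np.sqrt(np.mean((pred - y_all.iloc[test_idx]) ** 2))
print("RMSE for validation is:", rmse / kf.get_n_splits())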

# run RFE with a linear SVM to select extra features
svc = SVC(kernel="linear", C=1)
sel = RFE(estimator=svc, n_features_to_select=20, step=0.5, verbose=5)
sel.fit(training, c7)
training = sel.transform(training)  # keep only the selected columns
# after feature selection with the SVM
print("after SVM:")
modelLR = linear_model.LinearRegression()
modelL = linear_model.Lasso(normalize=True)
modelGBR = ensemble.GradientBoostingRegressor(n_estimators=500, max_depth=4,
                                              min_samples_split=2,
                                              learning_rate=0.01, loss='ls')
modelR = linear_model.Ridge(alpha=0.5)
RMSELR = 0
RMSEL = 0
RMSEGBR = 0
RMSER = 0
for i in range(10):
    x = pd.concat([slices[(i + j) % 9] for j in range(1, 9)])
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

iris = datasets.load_iris()

rfe = RFE(estimator=LogisticRegression(), n_features_to_select=2)

fit = rfe.fit(iris.data, iris.target)

print(fit.n_features_)
print(fit.support_)
print(fit.ranking_)
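
The three fitted attributes printed above relate as follows: n_features_ is the number of features kept, support_ is a boolean mask over the input columns, and ranking_ assigns 1 to every kept feature and increasing ranks in elimination order. A short follow-up, reusing the fitted selector, that maps the mask back to the iris column names:

import numpy as np

feature_names = np.array(iris.feature_names)
print(feature_names[fit.support_])  # the two columns RFE kept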