Example No. 1
def test_nzv_bad_freq_cut():
    X = pd.DataFrame.from_records(data=np.array([[1, 2, 3], [4, 5, 3],
                                                 [6, 7, 5]]),
                                  columns=['a', 'b', 'c'])

    # show that fit fails with a bad float value
    nzv_float = NearZeroVarianceFilter(freq_cut=1.)
    assert_raises(ValueError, nzv_float.fit, X)

    # show that fit fails with a value that is neither float nor int
    nzv_str = NearZeroVarianceFilter(freq_cut='1.')
    assert_raises(ValueError, nzv_str.fit, X)
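
The snippets on this page omit their imports. Below is a minimal sketch of what this and the other NearZeroVarianceFilter tests appear to assume; the skoot import path for the filter is an assumption and may differ in the actual test module.

import numpy as np
import pandas as pd
from numpy.testing import assert_array_equal, assert_raises

# Assumed import path for the filter under test (not shown on this page):
from skoot.feature_selection import NearZeroVarianceFilter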
Example No. 2
def test_nzv_non_constant():
    X = pd.DataFrame.from_records(data=np.array([[1, 2, 3], [4, 5, 3],
                                                 [6, 7, 5]]),
                                  columns=['a', 'b', 'c'])

    nzv = NearZeroVarianceFilter(freq_cut=2)  # show passes with an int
    trans = nzv.fit_transform(X)

    # show the output has one fewer column
    assert trans.shape[1] == 2
    assert nzv.drop_ == ['c']

    # show the ratios are as expected
    assert_array_equal(nzv.ratios_, np.array([1., 1., 2.]))
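
The ratios_ assertion reflects a caret-style near-zero-variance heuristic: for each column, the count of the most frequent value divided by the count of the second most frequent value (infinite for a constant column, as in the next example). A minimal sketch of that calculation, assuming this is indeed how the filter derives ratios_:

# Hypothetical re-computation of the frequency ratios asserted above.
# Assumption: ratios_ is (count of most frequent value) /
# (count of second most frequent value), computed per column.
import numpy as np
import pandas as pd

def freq_ratio(col):
    counts = col.value_counts().values   # counts sorted descending by frequency
    if len(counts) < 2:
        return np.inf                    # constant column: no runner-up value
    return counts[0] / counts[1]

X = pd.DataFrame({'a': [1, 4, 6], 'b': [2, 5, 7], 'c': [3, 3, 5]})
print(X.apply(freq_ratio).values)        # [1. 1. 2.] -- matches nzv.ratios_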
Example No. 3
def test_nzv_constant_col():
    X = pd.DataFrame.from_records(data=np.array([[1, 2, 3], [4, 5, 3],
                                                 [6, 7, 3], [8, 9, 3]]),
                                  columns=['a', 'b', 'c'])

    flt = NearZeroVarianceFilter(freq_cut=25)
    trans = flt.fit_transform(X)

    # show that the output is one column shorter
    assert trans.shape[1] == 2
    assert flt.drop_ == ['c']

    # show the ratios are as expected
    assert_array_equal(flt.ratios_, np.array([1., 1., np.inf]))
Example No. 4
def test_complex_grid_search():
    # build a pipeline
    pipe = Pipeline([
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # passes through unchanged since no values are missing
        ('scaler', SelectiveMaxAbsScaler()),
        ('boxcox', BoxCoxTransformer(suppress_warnings=True)),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('custom', make_transformer(subtract_k, k=1)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'custom__k': [1, 2, 3],
        'custom__func': [subtract_k, add_k],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the randomized search
    search = RandomizedSearchCV(
        pipe,
        hp,
        n_iter=2,  # just to test it even works
        scoring='accuracy',
        cv=cv,
        random_state=42,
        # in parallel so we are testing pickling of the classes
        n_jobs=2)

    # fit the search
    search.fit(X_train, y_train)

    # Show we can profile the best estimator
    profile_estimator(search.best_estimator_)

    # Assert that it's persistable
    assert_persistable(pipe, "location.pkl", X_train, y_train)
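
This test also leaves its imports and fixtures implicit. A sketch of what it appears to rely on follows; the scipy and scikit-learn imports are standard, while the library-specific names and the module-level fixtures are assumptions about the surrounding test suite.

from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

# Assumed to be provided by the library under test and the test module
# itself (import paths not shown on this page): FeatureFilter,
# MultiCorrFilter, SelectiveImputer, SelectiveMaxAbsScaler,
# BoxCoxTransformer, NearZeroVarianceFilter, SelectivePCA,
# make_transformer, profile_estimator, assert_persistable, plus the
# X_train, y_train, cv, subtract_k and add_k objects used in the test above.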
Example No. 5
def test_complex_grid_search():
    # build a pipeline
    pipe = Pipeline([
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # passes through unchanged since no values are missing
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [None, RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the randomized search
    search = RandomizedSearchCV(
        pipe,
        hp,
        n_iter=2,  # just to test it even works
        scoring='accuracy',
        cv=cv,
        random_state=42)

    # fit the search
    search.fit(X_train, y_train)
Example No. 6
def test_nzf_asdf():
    assert_transformer_asdf(NearZeroVarianceFilter(), iris)