Esempio n. 1
0
def test_feature_filter_none():
    dpr = FeatureFilter(cols=None)

    # none should be dropped
    trans = dpr.fit_transform(iris)  # type: pd.DataFrame
    assert trans.equals(iris)
    assert trans is not iris

    # assert empty drop list
    assert dpr.drop_ == []
Esempio n. 2
0
def test_feature_filter_some():
    dpr = FeatureFilter(cols=['a', 'b'])
    trans = dpr.fit_transform(iris)

    # only two should have been dropped
    assert 'a' not in trans.columns
    assert 'b' not in trans.columns

    # should be two left
    assert trans.shape[1] == 2
    assert trans.equals(iris[['c', 'd']])
Esempio n. 3
0
def test_complex_grid_search():
    # build a pipeline
    pipe = Pipeline([
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since all full
        ('scaler', SelectiveMaxAbsScaler()),
        ('boxcox', BoxCoxTransformer(suppress_warnings=True)),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('custom', make_transformer(subtract_k, k=1)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'custom__k': [1, 2, 3],
        'custom__func': [subtract_k, add_k],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the gridsearch
    search = RandomizedSearchCV(
        pipe,
        hp,
        n_iter=2,  # just to test it even works
        scoring='accuracy',
        cv=cv,
        random_state=42,
        # in parallel so we are testing pickling of the classes
        n_jobs=2)

    # fit the search
    search.fit(X_train, y_train)

    # Show we can profile the best estimator
    profile_estimator(search.best_estimator_)

    # Assert that it's persistable
    assert_persistable(pipe, "location.pkl", X_train, y_train)
Esempio n. 4
0
def test_complex_grid_search():
    # build a pipeline
    pipe = Pipeline([
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since all full
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [None, RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the gridsearch
    search = RandomizedSearchCV(
        pipe,
        hp,
        n_iter=2,  # just to test it even works
        scoring='accuracy',
        cv=cv,
        random_state=42)

    # fit the search
    search.fit(X_train, y_train)
Esempio n. 5
0
def test_filter_asdf():
    assert_transformer_asdf(FeatureFilter(), iris)