def test_selective_imputer_bad_strategies():
    """Check that ``SelectiveImputer.fit`` rejects malformed ``strategy`` input.

    Each case pairs the exception type expected from ``fit`` with the
    constructor keyword arguments that should trigger it. The imputer is
    always built first and only ``fit`` is wrapped by ``assert_raises``, so
    an error raised at construction time would surface as a test error
    rather than satisfying the assertion.

    ``X`` is the fixture data defined outside this function.
    """
    failing_configs = [
        # raises for a bad strategy string
        (ValueError, {'strategy': "bad strategy"}),

        # raises for a dim mismatch in cols and strategy
        (ValueError, {'cols': ['a'], 'strategy': ['mean', 'mean']}),

        # test type error for bad strategy
        (TypeError, {'strategy': 1}),

        # test dict input that does not match dim-wise
        (ValueError, {'cols': ['a'],
                      'strategy': {'a': 'mean', 'b': 'median'}}),

        # test a dict input with bad columns breaks
        (ValueError, {'strategy': {'a': 'mean', 'D': 'median'}}),
    ]

    for expected_error, init_kwargs in failing_configs:
        imputer = SelectiveImputer(**init_kwargs)
        assert_raises(expected_error, imputer.fit, X)
def test_complex_grid_search():
    """Smoke-test a randomized search over a long preprocessing pipeline.

    Builds a nine-step pipeline ending in a random forest, samples two
    hyper-parameter settings with ``RandomizedSearchCV`` using two jobs,
    fits on ``X_train``/``y_train``, profiles the best estimator and finally
    checks that the pipeline can be persisted.

    ``cv``, ``X_train`` and ``y_train`` are defined outside this function.

    NOTE(review): another function named ``test_complex_grid_search`` is
    defined later in the visible source. If both live in the same module,
    the later definition replaces this one and this test never runs --
    confirm, and rename one of them if so.
    """
    # build a pipeline
    steps = [
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since all full
        ('scaler', SelectiveMaxAbsScaler()),
        ('boxcox', BoxCoxTransformer(suppress_warnings=True)),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('custom', make_transformer(subtract_k, k=1)),
        ('model', RandomForestClassifier(n_jobs=1)),
    ]
    pipe = Pipeline(steps)

    # let's define a set of hyper-parameters over which to search,
    # grouped by the pipeline step they are routed to
    hp = {}
    hp.update({
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
    })
    hp.update({
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
    })
    hp.update({
        'custom__k': [1, 2, 3],
        'custom__func': [subtract_k, add_k],
    })
    hp.update({
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15),
    })

    # define the search
    search_options = {
        'n_iter': 2,  # just to test it even works
        'scoring': 'accuracy',
        'cv': cv,
        'random_state': 42,
        # in parallel so we are testing pickling of the classes
        'n_jobs': 2,
    }
    search = RandomizedSearchCV(pipe, hp, **search_options)

    # fit the search
    search.fit(X_train, y_train)

    # Show we can profile the best estimator
    best = search.best_estimator_
    profile_estimator(best)

    # Assert that it's persistable
    assert_persistable(pipe, "location.pkl", X_train, y_train)
def test_complex_grid_search():
    """Smoke-test a randomized search over a multi-step pipeline.

    Builds an eight-step pipeline ending in a random forest, samples two
    hyper-parameter settings with ``RandomizedSearchCV`` (including a choice
    of inner scaler for the ``scaler`` step) and fits the search on
    ``X_train``/``y_train``. The test passes if fitting completes without
    raising.

    ``cv``, ``X_train`` and ``y_train`` are defined outside this function.

    NOTE(review): another function named ``test_complex_grid_search`` is
    defined earlier in the visible source. If both live in the same module,
    this definition replaces the earlier one and only this test runs --
    confirm, and rename one of them if so.
    """
    # build a pipeline
    steps = [
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since all full
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1)),
    ]
    pipe = Pipeline(steps)

    # let's define a set of hyper-parameters over which to search,
    # grouped by the pipeline step they are routed to
    hp = {}
    hp.update({
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
    })
    hp.update({
        'scaler__scaler': [None, RobustScaler()],
    })
    hp.update({
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
    })
    hp.update({
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15),
    })

    # define the search
    search_options = {
        'n_iter': 2,  # just to test it even works
        'scoring': 'accuracy',
        'cv': cv,
        'random_state': 42,
    }
    search = RandomizedSearchCV(pipe, hp, **search_options)

    # fit the search
    search.fit(X_train, y_train)