def test_random_grid_custom_cv():
    # same pipeline search as ``test_random_grid`` below, but with an
    # explicitly shuffled KFold and wider hyper-parameter ranges

    # get our train/test
    X_train, X_test, y_train, y_test = train_test_split(X, iris.target,
                                                        train_size=0.75,
                                                        random_state=42)

    # default CV does not shuffle, so we define our own
    custom_cv = KFold(n=y_train.shape[0], n_folds=5, shuffle=True, random_state=42)

    # build a pipeline
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder', OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 100),
        'model__max_depth': randint(2, 25),
        'model__min_samples_leaf': randint(1, 15),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 75)
    }

    # define the gridsearch
    search = RandomizedSearchCV(pipe, hp,
                                n_iter=2,  # just to test it even works
                                scoring='accuracy',
                                cv=custom_cv,
                                random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    the_report = report_grid_score_detail(search, charts=False)
def test_random_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder', OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the gridsearch
    search = RandomizedSearchCV(
        pipe, hp,
        n_iter=2,  # just to test it even works
        scoring='accuracy',
        cv=2,
        random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    report_grid_score_detail(search, charts=False)
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to
    completely random data in order to assert that the test
    error will far exceed the train error.
    """
    custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(10, 150),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=5,
                              scoring='accuracy',
                              n_jobs=-1,
                              cv=custom_cv,
                              random_state=42)

    # fit the grid
    grid.fit(X_train, y_train)

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    tr_score, te_score = accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # do we want to do this?
    if not tr_score >= te_score:
        warnings.warn('expected training accuracy to be higher '
                      '(train: %.5f, test: %.5f)' % (tr_score, te_score))
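
# ---------------------------------------------------------------------------
# The tests above rely on module-level names (``iris``, ``X``, ``X_train``,
# ``X_test``, ``y_train``, ``y_test``, ``warnings``, ``uniform``, ``randint``,
# plus the sklearn estimators and the library's transformers) that are defined
# elsewhere in this module. If reproducing these tests in isolation, a setup
# roughly like the sketch below would supply the data fixtures; this is an
# inferred assumption, not necessarily the file's actual fixture code. The
# old-style ``KFold(n=..., n_folds=...)`` calls suggest the pre-0.18
# ``sklearn.cross_validation`` API; estimator and transformer imports are
# omitted here because their exact module paths are not shown in this section.
# ---------------------------------------------------------------------------
# import warnings
#
# import pandas as pd
# from scipy.stats import randint, uniform  # frozen distributions for RandomizedSearchCV
# from sklearn.cross_validation import KFold, train_test_split
# from sklearn.datasets import load_iris
#
# # load iris into a DataFrame and build the train/test split the tests expect
# iris = load_iris()
# X = pd.DataFrame.from_records(data=iris.data, columns=iris.feature_names)
# X_train, X_test, y_train, y_test = train_test_split(X, iris.target,
#                                                     train_size=0.75,
#                                                     random_state=42)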