Example #1
0
def test_random_grid():
    """Smoke-test ``report_grid_score_detail`` on a fitted randomized search.

    Splits ``X`` / ``iris.target`` with ``train_size=0.75``, fits a
    two-iteration ``RandomizedSearchCV`` over the full preprocessing
    pipeline using a shuffled 5-fold CV, and then builds the grid score
    report with charts disabled. Nothing is asserted about the report
    itself; the test passes as long as no step raises.
    """
    # get our train/test (only the training portion is used below)
    X_train, X_test, y_train, y_test = train_test_split(
        X, iris.target, train_size=0.75, random_state=42)

    # default CV does not shuffle, so we define our own
    custom_cv = KFold(n=y_train.shape[0], n_folds=5, shuffle=True,
                      random_state=42)

    # build a pipeline
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        # no object dtypes, so will pass through
        ('encoder', OneHotCategoricalEncoder()),
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1)),
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 100),
        'model__max_depth': randint(2, 25),
        'model__min_samples_leaf': randint(1, 15),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 75),
    }

    # define the gridsearch
    search = RandomizedSearchCV(
        pipe,
        hp,
        n_iter=2,  # just to test it even works
        scoring='accuracy',
        cv=custom_cv,
        random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report; the return value is not inspected, so it is not
    # bound to a local -- we only care that the call completes
    report_grid_score_detail(search, charts=False)
Example #2
0
def test_random_grid():
    """Run a tiny randomized search over the full pipeline and report on it.

    Fits a two-iteration, two-fold ``RandomizedSearchCV`` on the
    module-level ``X_train`` / ``y_train`` and passes the fitted search to
    ``report_grid_score_detail`` with charts disabled. No scores are
    asserted; the test passes when nothing raises.
    """
    # preprocessing stages followed by the classifier, in execution order
    steps = [
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        # no object dtypes, so the encoder will pass through
        ('encoder', OneHotCategoricalEncoder()),
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1)),
    ]
    pipeline = Pipeline(steps)

    # distributions / candidate lists sampled by the randomized search,
    # keyed as ``<step name>__<parameter>``
    param_distributions = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15),
    }

    # two iterations and two folds: just enough to prove it even works
    random_search = RandomizedSearchCV(
        pipeline, param_distributions,
        n_iter=2, scoring='accuracy', cv=2, random_state=42)
    random_search.fit(X_train, y_train)

    # exercise the report without drawing any charts
    report_grid_score_detail(random_search, charts=False)
Example #3
0
def test_large_grid():
    """Fit a randomized search and compare train vs. test accuracy.

    A scaler -> PCA -> random-forest pipeline is tuned with a five-iteration
    ``RandomizedSearchCV`` on the module-level training data. The fitted
    search then predicts on both the training and test sets, and a warning
    (not a failure) is emitted when training accuracy is not at least as
    high as test accuracy.
    """
    # shuffled 3-fold split over the training rows
    folds = KFold(n=y_train.shape[0], n_folds=3, shuffle=True,
                  random_state=42)

    # define the pipe
    steps = [
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42)),
    ]
    pipeline = Pipeline(steps)

    # define hyper parameters
    param_distributions = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(10, 150),
        'rf__max_depth': randint(5, 15),
    }

    # define and fit the grid
    search = RandomizedSearchCV(
        pipeline,
        param_distributions,
        n_iter=5,
        scoring='accuracy',
        n_jobs=-1,
        cv=folds,
        random_state=42)
    search.fit(X_train, y_train)

    # get predictions (train first, then test)
    tr_pred = search.predict(X_train)
    te_pred = search.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    tr_score = accuracy_score(y_train, tr_pred)
    te_score = accuracy_score(y_test, te_pred)

    # do we want to do this? only warn, never fail, on an unexpected ordering
    train_at_least_test = tr_score >= te_score
    if not train_at_least_test:
        warnings.warn('expected training accuracy to be higher (train: %.5f, test: %.5f)' % (tr_score, te_score))