Example #1
    def test_large_grid():
        """In this test, we purposely overfit a RandomForest to completely random data
        in order to assert that the test error will far supercede the train error.
        """

        if not SK18:
            custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
        else:
            custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # define the pipe
        pipe = Pipeline([
            ('scaler', SelectiveScaler()),
            ('pca', SelectivePCA(weight=True)),
            ('rf', RandomForestClassifier(random_state=42))
        ])

        # define hyperparameters
        hp = {
            'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
            'pca__whiten': [True, False],
            'pca__weight': [True, False],
            'pca__n_components': uniform(0.75, 0.15),
            'rf__n_estimators': randint(5, 10),
            'rf__max_depth': randint(5, 15)
        }

        # define the grid
        grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)

        # this will fail because we haven't fit yet
        assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

        # fit the grid
        grid.fit(X_train, y_train)

        # score for coverage -- this might warn...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid.score(X_train, y_train)

        # coverage:
        assert grid._estimator_type == 'classifier'

        # get predictions
        tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

        # evaluate score (SHOULD be better than random...)
        accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

        # grid score reports:
        # assert fails for bad percentile
        assert_fails(report_grid_score_detail, ValueError, random_search=grid, percentile=0.0)
        assert_fails(report_grid_score_detail, ValueError, random_search=grid, percentile=1.0)

        # assert fails for bad y_axis
        assert_fails(report_grid_score_detail, ValueError, random_search=grid, y_axis='bad_axis')

        # assert passes otherwise
        report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
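A note on the search space above: uniform and randint are the scipy.stats distributions, and RandomizedSearchCV samples candidate values from them via their rvs method. A minimal, self-contained sketch of what those two specifications draw (only scipy is assumed):

    from scipy.stats import randint, uniform

    # uniform(loc, scale) samples floats from [loc, loc + scale),
    # so uniform(0.75, 0.15) keeps pca__n_components in [0.75, 0.90)
    print(uniform(0.75, 0.15).rvs(3, random_state=42))

    # randint(low, high) samples integers from [low, high) -- high is exclusive,
    # so rf__n_estimators is drawn from {5, 6, 7, 8, 9}
    print(randint(5, 10).rvs(3, random_state=42))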
Example #2
def test_random_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder', OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the randomized search
    search = RandomizedSearchCV(
        pipe,
        hp,
        n_iter=2,  # just to test it even works
        scoring='accuracy',
        cv=2,
        random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    report_grid_score_detail(search, charts=False)
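Once the search is fit, the usual scikit-learn attributes are also available for inspecting what the sampled candidates scored. A short usage sketch, reusing the fitted search and X_train from the example above and assuming scikit-learn >= 0.18 for cv_results_ (older releases expose grid_scores_ instead):

    # best cross-validated accuracy and the parameters that produced it
    print(search.best_score_)
    print(search.best_params_)

    # predictions route through the refit best pipeline
    preds = search.predict(X_train)

    # per-candidate results as a dict of arrays; easy to tabulate
    import pandas as pd
    print(pd.DataFrame(search.cv_results_).head())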
Example #3
def test_random_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer',       FeatureRetainer()),  # will retain all
        ('dropper',        FeatureDropper()),  # won't drop any
        ('mapper',         FunctionMapper()),  # pass through
        ('encoder',        OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity',   MulticollinearityFilterer(threshold=0.85)),
        ('imputer',        SelectiveImputer()),  # pass through
        ('scaler',         SelectiveScaler()),
        ('boxcox',         BoxCoxTransformer()),
        ('nzv',            NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca',            SelectivePCA(n_components=0.9)),
        ('model',          RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold':    uniform(loc=.8, scale=.15),
        'collinearity__method':       ['pearson', 'kendall', 'spearman'],
        'scaler__scaler':             [StandardScaler(), RobustScaler()],
        'pca__n_components':          uniform(loc=.75, scale=.2),
        'pca__whiten':                [True, False],
        'model__n_estimators':        randint(5, 10),
        'model__max_depth':           randint(2, 5),
        'model__min_samples_leaf':    randint(1, 5),
        'model__max_features':        uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes':      randint(10, 15)
    }

    # define the randomized search
    search = RandomizedSearchCV(pipe, hp,
                                n_iter=2,  # just to test it even works
                                scoring='accuracy',
                                cv=2,
                                random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    report_grid_score_detail(search, charts=False)
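The keys in hp follow scikit-learn's step__parameter convention: the text before the double underscore names a Pipeline step, and the text after it names a parameter on that step's estimator. A minimal sketch of the same mechanics using plain scikit-learn estimators rather than the skutil wrappers above:

    from sklearn.decomposition import PCA
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.pipeline import Pipeline

    pipe = Pipeline([('pca', PCA(n_components=0.9)),
                     ('model', RandomForestClassifier(n_estimators=5))])

    # 'pca__n_components' targets the n_components parameter of the 'pca' step
    pipe.set_params(pca__n_components=0.75, model__max_depth=3)

    # get_params() lists every reachable step__parameter key the grid can tune
    print(sorted(k for k in pipe.get_params() if '__' in k)[:5])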
Example #4
def test_regular_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder', OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since no missing
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search (exhaustively, so for the test, just do one of each)
    hp = {
        'collinearity__threshold': [0.90],
        'collinearity__method': ['spearman'],
        'scaler__scaler': [RobustScaler()],
        'pca__n_components': [0.95],
        'pca__whiten': [True],
        'model__n_estimators': [5],
        'model__max_depth': [5],
        'model__min_samples_leaf': [8],
        'model__max_features': [0.75],
        'model__max_leaf_nodes': [20]
    }

    # define the gridsearch
    search = GridSearchCV(pipe,
                          hp,
                          scoring='accuracy',
                          cv=custom_cv,
                          verbose=1)

    # fit the search
    search.fit(X_train, y_train)
    # search.score(X_train, y_train) # throws a warning...
    search.predict(X_train)
    search.predict_proba(X_train)
    search.predict_log_proba(X_train)

    # this poses an issue... the models are trained on X as an
    # array, and selecting the best estimator causes the retained
    # column names to go away. We need a way to force the best_estimator_
    # to retain the names it was originally trained with.
    # search.best_estimator_.predict(X_train)

    # test the report
    report_grid_score_detail(search, charts=False)

    # test with invalid X and ys
    assert_fails(search.fit, Exception, X_train, None)

    # fit the search with a series
    search.fit(X_train, pd.Series(y_train))

    # fit the search with a DF as y
    search.fit(X_train, pd.DataFrame(pd.Series(y_train)))

    # test with invalid X and ys
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        assert_fails(search.fit, Exception, X_train,
                     pd.DataFrame([pd.Series(y_train),
                                   pd.Series(y_train)]))

    # test with short y
    assert_fails(search.fit, ValueError, X_train, [0, 1, 2])
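assert_fails is a helper from the surrounding test suite and its implementation is not shown here; judging from the call sites, it takes a callable, an expected exception type (or tuple of types), and the positional and keyword arguments to forward. A rough, hypothetical sketch of such a helper, for illustration only:

    def assert_fails(fun, expected=Exception, *args, **kwargs):
        # hypothetical stand-in for the suite's helper: the call must raise
        # one of the expected exception types, otherwise the test fails
        try:
            fun(*args, **kwargs)
        except expected:
            return  # raised as expected
        raise AssertionError('%r did not raise %r' % (fun, expected))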
Example #5
def test_regular_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer',      FeatureRetainer()),  # will retain all
        ('dropper',       FeatureDropper()),  # won't drop any
        ('mapper',        FunctionMapper()),  # pass through
        ('encoder',       OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity',  MulticollinearityFilterer(threshold=0.85)),
        ('imputer',       SelectiveImputer()),  # pass through since no missing
        ('scaler',        SelectiveScaler()),
        ('boxcox',        BoxCoxTransformer()),
        ('nzv',           NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca',           SelectivePCA(n_components=0.9)),
        ('model',         RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search (exhaustively, so for the test, just do one of each)
    hp = {
        'collinearity__threshold': [0.90],
        'collinearity__method':    ['spearman'],
        'scaler__scaler':          [RobustScaler()],
        'pca__n_components':       [0.95],
        'pca__whiten':             [True],
        'model__n_estimators':     [5],
        'model__max_depth':        [5],
        'model__min_samples_leaf': [8],
        'model__max_features':     [0.75],
        'model__max_leaf_nodes':   [20]
    }

    # define the gridsearch
    search = GridSearchCV(pipe, hp,
                          scoring='accuracy',
                          cv=custom_cv,
                          verbose=1)

    # fit the search
    search.fit(X_train, y_train)
    # search.score(X_train, y_train) # throws a warning...
    search.predict(X_train)
    search.predict_proba(X_train)
    search.predict_log_proba(X_train)

    # this poses an issue... the models are trained on X as an
    # array, and selecting the best estimator causes the retained
    # column names to go away. We need a way to force the best_estimator_
    # to retain the names it was originally trained with.
    # search.best_estimator_.predict(X_train)

    # test the report
    report_grid_score_detail(search, charts=False)

    # test with invalid X and ys
    assert_fails(search.fit, Exception, X_train, None)

    # fit the search with a series
    search.fit(X_train, pd.Series(y_train))

    # fit the search with a DF as y
    search.fit(X_train, pd.DataFrame(pd.Series(y_train)))

    # test with invalid X and ys
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        assert_fails(search.fit, Exception, X_train, pd.DataFrame([pd.Series(y_train), pd.Series(y_train)]))

    # test with short y
    assert_fails(search.fit, ValueError, X_train, [0, 1, 2])
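Note that custom_cv is referenced but never defined in this excerpt; it is a module-level fixture. Examples #1 and #6 show how it is built, with the SK18 flag selecting between the pre-0.18 sklearn.cross_validation.KFold signature and the model_selection one. A minimal sketch, assuming scikit-learn >= 0.18:

    from sklearn.model_selection import KFold

    # mirrors the SK18 branch used in Examples #1 and #6
    custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)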
Example #6
    def test_large_grid():
        """In this test, we purposely overfit a RandomForest to completely random data
        in order to assert that the test error will far supercede the train error.
        """

        if not SK18:
            custom_cv = KFold(n=y_train.shape[0],
                              n_folds=3,
                              shuffle=True,
                              random_state=42)
        else:
            custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # define the pipe
        pipe = Pipeline([('scaler', SelectiveScaler()),
                         ('pca', SelectivePCA(weight=True)),
                         ('rf', RandomForestClassifier(random_state=42))])

        # define hyperparameters
        hp = {
            'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
            'pca__whiten': [True, False],
            'pca__weight': [True, False],
            'pca__n_components': uniform(0.75, 0.15),
            'rf__n_estimators': randint(5, 10),
            'rf__max_depth': randint(5, 15)
        }

        # define the grid
        grid = RandomizedSearchCV(pipe,
                                  hp,
                                  n_iter=2,
                                  scoring='accuracy',
                                  n_jobs=1,
                                  cv=custom_cv,
                                  random_state=42)

        # this will fail because we haven't fit yet
        assert_fails(grid.score, (ValueError, AttributeError), X_train,
                     y_train)

        # fit the grid
        grid.fit(X_train, y_train)

        # score for coverage -- this might warn...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid.score(X_train, y_train)

        # coverage:
        assert grid._estimator_type == 'classifier'

        # get predictions
        tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

        # evaluate score (SHOULD be better than random...)
        accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

        # grid score reports:
        # assert fails for bad percentile
        assert_fails(report_grid_score_detail, ValueError,
                     random_search=grid, percentile=0.0)
        assert_fails(report_grid_score_detail, ValueError,
                     random_search=grid, percentile=1.0)

        # assert fails for bad y_axis
        assert_fails(report_grid_score_detail, ValueError,
                     random_search=grid, y_axis='bad_axis')

        # assert passes otherwise
        report_grid_score_detail(
            grid, charts=True, percentile=0.95)  # just ensure percentile works
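Finally, the X_train, X_test, y_train and y_test fixtures used throughout these examples are defined elsewhere in the test module. Per the docstring, the data are completely random, so a comparable fixture might look like the sketch below; the shapes, column names and class count are assumptions for illustration, not values taken from the original suite:

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    rs = np.random.RandomState(42)
    X = pd.DataFrame(rs.rand(150, 10), columns=['x%i' % i for i in range(10)])
    y = rs.randint(0, 2, 150)  # two random classes, so there is nothing real to learn

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=42)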