def test_input_errors_randomized(params, expected_error_message):
    """Check HalvingRandomSearchCV-specific input validation.

    ``params`` is forwarded to the search as extra keyword arguments, and
    fitting must then raise a ``ValueError`` whose message matches
    ``expected_error_message``.
    """
    estimator = FastClassifier()
    features, target = make_classification(100)
    search = HalvingRandomSearchCV(estimator, {'a': [1]}, **params)

    # The error is expected from fit(), not from the constructor.
    with pytest.raises(ValueError, match=expected_error_message):
        search.fit(features, target)
def test_random_search_discrete_distributions(param_distributions,
                                              expected_n_candidates):
    """Check the first-iteration candidate count for discrete distributions.

    The search is asked for 10 candidates, which can be more than what is
    possible. How many parameters are sampled depends on whether the
    distributions are 'all lists' or not (see ParameterSampler for details).
    This is somewhat redundant with the checks in ParameterSampler, but
    interaction bugs were discovered during development of SH.
    """
    features, target = make_classification(n_samples=1024, random_state=0)
    search = HalvingRandomSearchCV(FastClassifier(), param_distributions,
                                   n_candidates=10)
    search.fit(features, target)

    first_iter_candidates = search.n_candidates_[0]
    assert first_iter_candidates == expected_n_candidates
def test_random_search(max_resources, n_candidates, expected_n_candidates):
    """Check how many candidates the random search generates.

    The number of candidates in the first iteration must equal
    ``expected_n_candidates``. When ``n_candidates`` is 'exhaust', the last
    iteration must additionally use ``max_resources`` resources.
    """
    features, target = make_classification(n_samples=1024, random_state=0)
    distributions = {'a': norm, 'b': norm}
    search = HalvingRandomSearchCV(
        FastClassifier(),
        distributions,
        n_candidates=n_candidates,
        cv=2,
        max_resources=max_resources,
        factor=2,
        min_resources=4,
    )
    search.fit(features, target)

    assert search.n_candidates_[0] == expected_n_candidates
    if n_candidates != 'exhaust':
        return

    # 'exhaust' must make the last iteration use as much resources as we can.
    assert search.n_resources_[-1] == max_resources
    max_depth=[2, 5, 10],
    min_samples_leaf=[1, 5, 10, 20],
    min_samples_split=[5, 10, 20, 30, 50],
)
# Tune a quantile gradient boosting regressor for alpha = 0.05 with a
# successive-halving random search. `param_grid`, `X_train` and `y_train` are
# defined earlier in the script.
alpha = 0.05
# Scorer built from the pinball loss at the same alpha as the regressor.
neg_mean_pinball_loss_05p_scorer = make_scorer(
    mean_pinball_loss,
    alpha=alpha,
    greater_is_better=False,  # maximize the negative loss
)
gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, random_state=0)
# The resource being budgeted is the regressor's `n_estimators` parameter,
# bounded by min_resources=50 and max_resources=250. The search is fitted
# immediately, so `search_05p` is the fitted search object.
search_05p = HalvingRandomSearchCV(
    gbr,
    param_grid,
    resource="n_estimators",
    max_resources=250,
    min_resources=50,
    scoring=neg_mean_pinball_loss_05p_scorer,
    n_jobs=2,
    random_state=0,
).fit(X_train, y_train)
# Show the best hyper-parameter combination found by the search.
pprint(search_05p.best_params_)

# %%
# We observe that the hyper-parameters that were hand-tuned for the median
# regressor are in the same range as the hyper-parameters suitable for the 5th
# percentile regressor.
#
# Let's now tune the hyper-parameters for the 95th percentile regressor. We
# need to redefine the `scoring` metric used to select the best model, along
# with adjusting the alpha parameter of the inner gradient boosting estimator
# itself:
# A single RandomState instance is shared by the data generator, the forest
# and the search below, so each call advances the same random stream; keep
# these statements in this order to reproduce the same results.
rng = np.random.RandomState(0)

X, y = datasets.make_classification(n_samples=700, random_state=rng)

clf = RandomForestClassifier(n_estimators=20, random_state=rng)

# Search space: lists are explicit candidate values, `randint(...)` entries
# are distributions to sample from.
# NOTE(review): `randint` is presumably scipy.stats.randint (upper bound
# exclusive) -- confirm against the imports.
param_dist = {
    "max_depth": [3, None],
    "max_features": randint(1, 11),
    "min_samples_split": randint(2, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}

# Successive-halving random search over `param_dist` with factor=2.
rsh = HalvingRandomSearchCV(estimator=clf,
                            param_distributions=param_dist,
                            factor=2,
                            random_state=rng)
rsh.fit(X, y)

# %%
# We can now use the `cv_results_` attribute of the search estimator to inspect
# and plot the evolution of the search.

results = pd.DataFrame(rsh.cv_results_)
# Parameter dicts are turned into strings so they can be used as column
# labels in the pivot below.
results['params_str'] = results.params.apply(str)
# Keep one row per (params_str, iter) pair so the pivot has a single value
# per cell.
results.drop_duplicates(subset=('params_str', 'iter'), inplace=True)
# Rows: search iteration; columns: one per parameter setting; values: mean
# test score.
mean_scores = results.pivot(index='iter',
                            columns='params_str',
                            values='mean_test_score')
# One line per parameter setting; the legend is disabled.
ax = mean_scores.plot(legend=False, alpha=.6)
Esempio n. 6
0
                         n_iter=5,
                         scoring='roc_auc',
                         n_jobs=-1)
# Fit the search object built just above on the training data. `pipe`,
# `param_grid`, `cv`, `score_rf` and the train/test splits are defined
# earlier in the script.
clf = clf.fit(train_features, train_labels)
# Score model
# ROC AUC is computed from column 1 of predict_proba.
# NOTE(review): column 1 is presumably the positive class -- confirm the
# label encoding.
score_randomized = roc_auc_score(test_labels,
                                 clf.predict_proba(test_features)[:, 1])
print(f'ROC AUC Score for RandomizedSearchCV model: {score_randomized}')
print(clf.best_params_)

# Fit rf model with HalvingRandomSearchCV
# Same pipeline, parameter space, CV splitter and scoring as above, with the
# successive-halving settings factor=2, min_resources=20 and
# aggressive_elimination=True.
clf_halving = HalvingRandomSearchCV(pipe,
                                    param_grid,
                                    cv=cv,
                                    verbose=1,
                                    scoring='roc_auc',
                                    n_jobs=-1,
                                    aggressive_elimination=True,
                                    factor=2,
                                    min_resources=20)
clf_halving = clf_halving.fit(train_features, train_labels)
# Score model
score_halving = roc_auc_score(test_labels,
                              clf_halving.predict_proba(test_features)[:, 1])
print(f'ROC AUC Score for HalvingRandomSearchCV model: {score_halving}')
print(clf_halving.best_params_)

# Final side-by-side summary; the two search scores were already printed
# once above.
print(f'ROC AUC Score for out of the box model: {score_rf}')
print(f'ROC AUC Score for RandomizedSearchCV model: {score_randomized}')
print(f'ROC AUC Score for HalvingRandomSearchCV model: {score_halving}')
Esempio n. 7
0
    search_multi.fit(X_train, y_train)
    time_end = timeit.default_timer()
    time_elapsed = time_end - time_start
    print('Execution time (hour:min:sec): {}'.format(
        str(dt.timedelta(seconds=time_elapsed))))
    print('Best parameter (CV score = {:.3f}):'.format(
        search_multi.best_score_))
    print(search_multi.best_params_)

elif search_type == 'HalvingRandomSearchCV':
    # Bandit-based successive halving strategy.
    time_start = timeit.default_timer()
    search_multi = HalvingRandomSearchCV(estimator=pipe,
                                         param_distributions=param_dists,
                                         cv=TimeSeriesSplit(n_splits=3),
                                         scoring='neg_mean_squared_error',
                                         factor=2,
                                         refit=True,
                                         n_jobs=NJOBS)
    search_multi.fit(X_train, y_train)
    time_end = timeit.default_timer()
    time_elapsed = time_end - time_start
    print('Execution time (hour:min:sec): {}'.format(
        str(dt.timedelta(seconds=time_elapsed))))
    print('Best parameter (CV score = {:.3f}):'.format(
        search_multi.best_score_))
    print(search_multi.best_params_)

else:
    raise NotImplementedError('Search method "{}" is not recognized '
                              'or implemented!'.format(search_type))
Esempio n. 8
0
    def search_best_rf(self, n_trees=2500, saveStats=True):
        """
        Search for the best random forest model.

        Runs a ``HalvingRandomSearchCV`` over ``max_features``,
        ``max_depth`` and ``min_impurity_decrease`` for a
        ``RandomForestRegressor`` fitted on ``self.feat_tsf_dataset`` and
        ``self.labels_dataset``. Prints the best parameters, dumps the best
        estimator to 'model_params.joblib' and writes the best parameters as
        JSON text to the file 'model_params'.

        Parameters
        ----------
        n_trees : int, default=2500
            Number of trees (``n_estimators``) of the forest being tuned.
        saveStats : bool, default=True
            If true, write the search's ``cv_results_`` to
            'data/cv_hyperparams_model.csv'.

        Returns
        -------
        str
            Message with the elapsed time in minutes, to one decimal place.

        """
        # Process time
        start = time.time()

        # Datasets
        feat_tsf = self.feat_tsf_dataset
        labels = self.labels_dataset

        # max_features candidates: fractions starting at 0.20 in steps of
        # 0.01 (rounded to strip floating-point noise), plus two named
        # strategies.
        max_features_list = [
            round(elem, 2) for elem in np.arange(0.20, 0.66, 0.01).tolist()
        ]
        max_features_list.append('sqrt')
        # NOTE(review): max_features='auto' was deprecated in scikit-learn
        # 1.1 and removed in 1.3 for random forests -- confirm against the
        # installed version before relying on it.
        max_features_list.append('auto')

        # max_depth candidates: from a quarter of the maximum taken from
        # self.depth_of_trees up to (excluding) that maximum, plus None.
        # NOTE(review): presumably depth_of_trees is a DataFrame and this is
        # the largest observed tree depth -- verify against the caller.
        max_n_trees = self.depth_of_trees.max()[0]
        max_depth_list = np.arange(int(max_n_trees / 4), max_n_trees,
                                   1).tolist()
        max_depth_list.append(None)

        # min_impurity_decrease candidates, starting at 0.01 in steps of 0.01
        min_impurity_decrease_list = [
            round(elem, 2) for elem in np.arange(0.01, 0.26, 0.01).tolist()
        ]

        param_grid = {
            "max_features": max_features_list,
            "max_depth": max_depth_list,
            "min_impurity_decrease": min_impurity_decrease_list
        }

        # RF model to test
        rf = RandomForestRegressor(bootstrap=True,
                                   oob_score=True,
                                   n_estimators=n_trees,
                                   random_state=7)

        # Define and execute the search
        grid_cv = HalvingRandomSearchCV(estimator=rf,
                                        param_distributions=param_grid,
                                        random_state=7,
                                        max_resources='auto',
                                        verbose=3).fit(feat_tsf, labels)

        df_results = pd.DataFrame(grid_cv.cv_results_)

        # Save CV results
        if saveStats:
            df_results.to_csv('data/cv_hyperparams_model.csv')

        print("Best Params:")
        print(grid_cv.best_params_)

        print("Saving model in 'model_params.joblib'")
        # Write the best fitted model to a joblib file
        dump(grid_cv.best_estimator_, 'model_params.joblib')

        # Save the best model's parameters as JSON text
        json_txt = json.dumps(grid_cv.best_params_, indent=4)
        with open('model_params', 'w') as file:
            file.write(json_txt)

        # End time
        end = time.time()
        time_elapsed = round((end - start) / 60, 1)

        # '%.1f' keeps the one decimal computed above; the previous '%1.f'
        # meant width 1 / precision 0 and dropped it.
        return 'Time elapsed minutes: %.1f' % time_elapsed
Esempio n. 9
0
# Fit the previously built `halving_cv` search; the return value (the fitted
# search itself) is discarded.
_ = halving_cv.fit(X, y)


# deal with class imbalance

# Count how often each label occurs in y.
counts = pd.Series(y.flatten()).value_counts()

# Ratio of "No" labels to "Yes" labels, used below as scale_pos_weight.
scale_pos_weight = counts["No"] / counts["Yes"]


# Second search space: every entry is an explicit list of candidate values.
param_grid_2 = {
    "max_depth": [3, 4, 5],
    "gamma": [5, 30, 50],
    "learning_rate": [0.01, 0.1, 0.3, 0.5],
    "min_child_weight": [1, 3, 5],
    "reg_lambda": [50, 100, 300],
    "scale_pos_weight": [scale_pos_weight],  # Fix scale_pos_weight
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9],
}

# NOTE(review): HalvingRandomSearchCV is experimental in scikit-learn; this
# import only works if `from sklearn.experimental import
# enable_halving_search_cv` has already run -- confirm it is imported above.
from sklearn.model_selection import HalvingRandomSearchCV

# Successive-halving random search over `param_grid_2` for the `xgb_cl`
# estimator defined earlier, scored by ROC AUC, with n_candidates="exhaust"
# and factor=4.
halving_random_cv = HalvingRandomSearchCV(
    xgb_cl, param_grid_2, scoring="roc_auc", n_jobs=-1, n_candidates="exhaust", factor=4
)

_ = halving_random_cv.fit(X, y)


if do_search == "halving":
    distributions = dict(
        lr=expon(1e-2),
        sampler_lr=expon(1e-1),
        sampler=["mala", "langevin", "tempered mala", "tempered langevin"],
        weight_decay=expon(1e-3),
        #     max_iter=poisson(30),
        replay_prob=beta(a=9, b=1),
        adversary_weight=beta(a=1, b=1),
        num_units=poisson(32),
        num_layers=poisson(3),
        max_replay=poisson(10),
    )
    clf_cv = HalvingRandomSearchCV(clf,
                                   distributions,
                                   random_state=0,
                                   n_jobs=5,
                                   resource="max_iter",
                                   max_resources=max_resources)
    search = clf_cv.fit(X.values)
    clf = clf_cv.best_estimator_
elif do_search == "bohb":
    distributions = CS.ConfigurationSpace(seed=42)
    distributions.add_hyperparameter(
        CSH.UniformFloatHyperparameter("lr",
                                       1e-4,
                                       3e-1,
                                       log=True,
                                       default_value=6e-3))
    distributions.add_hyperparameter(
        CSH.UniformFloatHyperparameter("sampler_lr",
                                       1e-4,