from sklearn.ensemble import RandomForestClassifier
from tune_sklearn import TuneGridSearchCV


def train_grid_search(random_state=42):
    '''Grid-search over n_estimators, max_depth and ccp_alpha.'''
    # get_data is this project's own helper returning train/validation splits.
    X_valid, y_valid, _, _ = get_data(random_state=random_state, test_size=0.1)

    n_estimators = [10, 50]

    max_depth = [5, 50, 100]

    ccp_alpha = [0.001, 0.01]

    hyperparameter_grid = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'ccp_alpha': ccp_alpha
    }

    target_model = RandomForestClassifier(random_state=random_state)

    # Create the grid-search object with 4-fold cross-validation
    grid_cv = TuneGridSearchCV(estimator=target_model,
                               param_grid=hyperparameter_grid,
                               cv=4)

    # Fit the grid search on the split returned above
    grid_cv.fit(X_valid, y_valid)
    return grid_cv.best_estimator_
Example #2
import time

import numpy as np
import ray
from sklearn.ensemble import RandomForestClassifier
from tune_sklearn import TuneGridSearchCV

# Baseline: a RandomForest with default hyperparameters
# (x_train/x_test, y_train/y_test are assumed to be defined already)
default_model = RandomForestClassifier()
default_model.fit(x_train, y_train)
default_pred = default_model.predict(x_test)
default_params = default_model.get_params()
default_accuracy = np.count_nonzero(
    np.array(default_pred) == np.array(y_test)) / len(default_pred)

parameter_grid = {
    "n_estimators": [10, 50],
    "max_depth": [5, 50, 100],
    "ccp_alpha": [0.001, 0.01]
}

tune_search = TuneGridSearchCV(RandomForestClassifier(),
                               param_grid=parameter_grid,
                               scoring="accuracy")

start = time.time()
tune_search.fit(x_train, y_train)
end = time.time()

best_score = tune_search.best_score_
best_params = tune_search.best_params_

print('''This cluster consists of
    {} nodes in total
    {} CPU resources in total
'''.format(len(ray.nodes()),
           ray.cluster_resources()['CPU']))
Example #3
    def tune_model(
        self,
        estimator=None,
        fold=None,
        n_iter=2,
        optimize='accuracy',
        search_library: str = 'optuna',
        search_algorithm='random',
        early_stopping='asha',
        early_stopping_max_iters=10,
        estimator_params=None,
        n_jobs=-1,
        verbose=True,
    ):
        LOGGER.info('TUNING MODEL ...')
        # Avoid the mutable-default-argument pitfall.
        if estimator_params is None:
            estimator_params = {}
        best_params_model = {}
        model_grid = None

        # Build a name -> model mapping, honouring any per-model overrides
        # passed via estimator_params.
        estimator_model = {}
        if estimator is None:
            if len(self.estimator.keys()) > 0:
                for name_model, estimator in self.estimator.items():
                    if name_model in estimator_params.keys():
                        estimator_model[
                            name_model] = ModelFactory.create_executor(
                                name_model, **estimator_params[name_model])
                    else:
                        estimator_model[name_model] = estimator

            else:
                for name_model in ModelFactory.name_registry:
                    if name_model in estimator_params.keys():
                        estimator_model[
                            name_model] = ModelFactory.create_executor(
                                name_model, **estimator_params[name_model])
                    else:
                        estimator_model[
                            name_model] = ModelFactory.create_executor(
                                name_model)
        else:
            for name_model in estimator:
                if name_model in estimator_params.keys():
                    estimator_model[name_model] = ModelFactory.create_executor(
                        name_model, **estimator_params[name_model])
                else:
                    estimator_model[name_model] = ModelFactory.create_executor(
                        name_model)

        # update estimator_params
        for name_model, params in estimator_params.items():
            self.estimator_params[name_model] = params

        for name_model, model in estimator_model.items():
            LOGGER.info('tuning model_name: {}'.format(name_model))
            estimator = model.estimator

            parameter_distributions = model.tune_distributions

            # Grid/random search on scikit-learn or tune-sklearn uses the
            # discrete grid; other combinations use the distributions.
            if (search_library in ('scikit-learn', 'tune-sklearn')
                    and search_algorithm in ('grid', 'random')):
                parameter_grid = model.tune_grid
            else:
                parameter_grid = model.tune_distributions
            model_grid = None
            if search_library == 'optuna':
                pruner_translator = {
                    "asha": optuna.pruners.SuccessiveHalvingPruner(),
                    "hyperband": optuna.pruners.HyperbandPruner(),
                    "median": optuna.pruners.MedianPruner(),
                    False: optuna.pruners.NopPruner(),
                    None: optuna.pruners.NopPruner(),
                }
                pruner = early_stopping
                if pruner in pruner_translator:
                    pruner = pruner_translator[early_stopping]

                sampler_translator = {
                    "tpe": optuna.samplers.TPESampler(seed=24),
                    "random": optuna.samplers.RandomSampler(seed=24),
                }
                sampler = sampler_translator[search_algorithm]

                try:
                    param_grid = get_optuna_distributions(
                        parameter_distributions)
                except Exception:
                    LOGGER.warning(
                        "Couldn't convert param_grid to specific library distributions. Exception:"
                    )
                    LOGGER.warning(traceback.format_exc())
                study = optuna.create_study(direction='maximize',
                                            sampler=sampler,
                                            pruner=pruner)
                LOGGER.info('Initializing optuna.integration.OptunaSearchCV')
                model_grid = optuna.integration.OptunaSearchCV(
                    estimator=estimator,
                    param_distributions=param_grid,
                    max_iter=early_stopping_max_iters,
                    n_jobs=n_jobs,
                    n_trials=n_iter,
                    random_state=24,
                    scoring=optimize,
                    study=study,
                    refit=False,
                    verbose=verbose,
                    error_score='raise')
            elif search_library == 'tune-sklearn':
                early_stopping_translator = {
                    "asha": "ASHAScheduler",
                    "hyperband": "HyperBandScheduler",
                    "median": "MedianStoppingRule",
                }
                if early_stopping in early_stopping_translator:
                    early_stopping = early_stopping_translator[early_stopping]
                do_early_stop = early_stopping and can_early_stop(
                    estimator, True, True, True, parameter_grid)

                if search_algorithm == 'grid':

                    LOGGER.info('Initializing tune_sklearn.TuneGridSearchCV')
                    model_grid = TuneGridSearchCV(
                        estimator=estimator,
                        param_grid=parameter_grid,
                        early_stopping=do_early_stop,
                        scoring=optimize,
                        cv=fold,
                        max_iters=early_stopping_max_iters,
                        refit=True,
                        n_jobs=n_jobs)

            model_grid.fit(self.X, self.y)
            best_params = model_grid.best_params_
            best_params_model[name_model] = best_params

        # update estimator_params
        for name_model, params in best_params_model.items():
            self.estimator_params[name_model] = params
        LOGGER.info('best_params_model: {}'.format(best_params_model))
        return best_params_model
Example #4
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from tune_sklearn import TuneGridSearchCV

X, y = make_classification(n_samples=11000,
                           n_features=1000,
                           n_informative=50,
                           n_redundant=0,
                           n_classes=10,
                           class_sep=2.5)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=1000)

# Example parameters to tune for SGDClassifier
parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}

#######################################################################
# As you can see, the setup here is exactly how you would do it for Scikit-Learn. Now, let's try fitting a model.

tune_search = TuneGridSearchCV(SGDClassifier(),
                               parameter_grid,
                               early_stopping=True,
                               max_iters=10)

import time  # Just to compare fit times
start = time.time()
tune_search.fit(x_train, y_train)
end = time.time()
print("Tune GridSearch Fit Time:", end - start)
# Tune GridSearch Fit Time: 15.436315774917603 (on an 8-core laptop)

#######################################################################
# Note the slight differences we introduced above:
#
#  * an `early_stopping` parameter, and
#  * a `max_iters` cap on iterations per hyperparameter configuration
#
Example #5
    def tune_model(self,
                   X,
                   y,
                   estimator=None,
                   fold=None,
                   n_iter=2,
                   optimize='max_error',
                   search_library: str = 'optuna',
                   search_algorithm='random',
                   early_stopping='asha',
                   early_stopping_max_iters=10,
                   verbose=True,
                   n_jobs=-1):
        # Optionally fit/apply preprocessing, then reset indices so the
        # cross-validation splits align.
        if self.preprocess:
            self.preprocessor.fit(X, y)
            X, y = self.preprocessor.transform(X, y)
        X = pd.DataFrame(X).reset_index(drop=True)
        y = pd.DataFrame(y).reset_index(drop=True)
        LOGGER.info('tuning models')
        best_params_model = {}
        model_grid = None

        estimator_tune = {}
        if estimator is None:
            for name_model in ModelFactory.name_registry:
                estimator_tune[name_model] = ModelFactory.create_executor(
                    name_model)
        else:
            for name_model in estimator:
                estimator_tune[name_model] = ModelFactory.create_executor(
                    name_model)

        for name_model, model in estimator_tune.items():
            LOGGER.info('tuning model_name: {}'.format(name_model))
            estimator = model.estimator

            parameter_distributions = model.tune_distributions

            # Same grid-vs-distributions selection as in the previous example.
            if (search_library in ('scikit-learn', 'tune-sklearn')
                    and search_algorithm in ('grid', 'random')):
                parameter_grid = model.tune_grid
            else:
                parameter_grid = model.tune_distributions
            model_grid = None
            if search_library == 'optuna':
                pruner_translator = {
                    "asha": optuna.pruners.SuccessiveHalvingPruner(),
                    "hyperband": optuna.pruners.HyperbandPruner(),
                    "median": optuna.pruners.MedianPruner(),
                    False: optuna.pruners.NopPruner(),
                    None: optuna.pruners.NopPruner(),
                }
                pruner = early_stopping
                if pruner in pruner_translator:
                    pruner = pruner_translator[early_stopping]

                sampler_translator = {
                    "tpe": optuna.samplers.TPESampler(seed=24),
                    "random": optuna.samplers.RandomSampler(seed=24),
                }
                sampler = sampler_translator[search_algorithm]

                try:
                    param_grid = get_optuna_distributions(
                        parameter_distributions)
                except Exception:
                    LOGGER.warning(
                        "Couldn't convert param_grid to specific library distributions. Exception:"
                    )
                    LOGGER.warning(traceback.format_exc())
                study = optuna.create_study(direction='maximize',
                                            sampler=sampler,
                                            pruner=pruner)
                LOGGER.info('Initializing optuna.integration.OptunaSearchCV')
                model_grid = optuna.integration.OptunaSearchCV(
                    estimator=estimator,
                    param_distributions=param_grid,
                    max_iter=early_stopping_max_iters,
                    n_jobs=n_jobs,
                    n_trials=n_iter,
                    random_state=24,
                    scoring=optimize,
                    study=study,
                    refit=False,
                    verbose=verbose,
                    error_score='raise')
            elif search_library == 'tune-sklearn':
                early_stopping_translator = {
                    "asha": "ASHAScheduler",
                    "hyperband": "HyperBandScheduler",
                    "median": "MedianStoppingRule",
                }
                if early_stopping in early_stopping_translator:
                    early_stopping = early_stopping_translator[early_stopping]
                do_early_stop = early_stopping and can_early_stop(
                    estimator, True, True, True, parameter_grid)

                if search_algorithm == 'grid':

                    LOGGER.info('Initializing tune_sklearn.TuneGridSearchCV')
                    model_grid = TuneGridSearchCV(
                        estimator=estimator,
                        param_grid=parameter_grid,
                        early_stopping=do_early_stop,
                        scoring=optimize,
                        cv=fold,
                        max_iters=early_stopping_max_iters,
                        refit=True,
                        n_jobs=n_jobs)

            model_grid.fit(X, y)
            best_params = model_grid.best_params_
            LOGGER.info('best_params: {}'.format(best_params))
            best_params_model[name_model] = best_params
        return best_params_model
Example #6
import time

import joblib
from ray.util.joblib import register_ray
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from tune_sklearn import TuneGridSearchCV

# Report for the baseline model fitted earlier in this example (elided here);
# param_model and the X/y splits are likewise assumed to exist.
print(classification_report(y_test, y_pred))

start = time.time()
model = GridSearchCV(DecisionTreeClassifier(random_state=1),
                     param_grid=param_model,
                     scoring='accuracy',
                     n_jobs=-1)

register_ray()
with joblib.parallel_backend('ray'):
    model = model.fit(X_train, y_train)
print(
    f"executed GridSearchCV with joblib in {time.time() - start:.2f}s, "
    f"nodes {model.best_estimator_.tree_.node_count}, "
    f"max_depth {model.best_estimator_.tree_.max_depth}")

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

start = time.time()
model = TuneGridSearchCV(DecisionTreeClassifier(random_state=1),
                         param_grid=param_model,
                         scoring='accuracy')

model = model.fit(X_train, y_train)
print(
    f"executed TuneGridSearchCV in {time.time() - start:.2f}s, "
    f"nodes {model.best_estimator_.tree_.node_count}, "
    f"max_depth {model.best_estimator_.tree_.max_depth}")

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))