def train_grid_search(random_state=42):
    """Grid-search a random forest over max_depth, n_estimators and ccp_alpha.

    The search is fitted on the first two values returned by
    ``get_data(random_state=random_state, test_size=0.1)`` using 4-fold
    cross-validation.

    Args:
        random_state: Seed forwarded to both ``get_data`` and the
            ``RandomForestClassifier`` being tuned.

    Returns:
        The refitted best estimator found by ``TuneGridSearchCV``.
    """
    X_valid, y_valid, _, _ = get_data(random_state=random_state,
                                      test_size=0.1)

    hyperparameter_grid = {
        'n_estimators': [10, 50],
        'max_depth': [5, 50, 100],
        'ccp_alpha': [0.001, 0.01],
    }

    target_model = RandomForestClassifier(random_state=random_state)

    # Create the grid search object and fit it on the data loaded above.
    grid_cv = TuneGridSearchCV(estimator=target_model,
                               param_grid=hyperparameter_grid,
                               cv=4)
    grid_cv.fit(X_valid, y_valid)

    # Fitted results follow the scikit-learn convention of a trailing
    # underscore; `best_estimator` (without it) does not exist.
    return grid_cv.best_estimator_
# Baseline: a random forest with library defaults, scored on the test split.
default_model = RandomForestClassifier()
default_model.fit(x_train, y_train)
default_pred = default_model.predict(x_test)
default_params = default_model.get_params()
default_accuracy = (
    np.count_nonzero(np.array(default_pred) == np.array(y_test))
    / len(default_pred)
)

# Candidate hyperparameters explored by the grid search.
parameter_grid = {
    "n_estimators": [10, 50],
    "max_depth": [5, 50, 100],
    "ccp_alpha": [0.001, 0.01],
}

tune_search = TuneGridSearchCV(
    RandomForestClassifier(),
    param_grid=parameter_grid,
    scoring="accuracy",
)

# Wall-clock timestamps taken around the search fit.
start = time.time()
tune_search.fit(x_train, y_train)
end = time.time()

best_score = tune_search.best_score_
best_params = tune_search.best_params_

# Report the node count and total CPU resources of the Ray cluster.
print(
    '''This cluster consists of {} nodes in total {} CPU resources in total '''.format(
        len(ray.nodes()),
        ray.cluster_resources()['CPU'],
    )
)
def tune_model(
    self,
    estimator=None,
    n_iter=2,
    optimize='accuracy',
    search_library: str = 'optuna',
    search_algorithm='random',
    early_stopping='asha',
    early_stopping_max_iters=10,
    estimator_params=None,
    n_jobs=-1,
    verbose=True,
    fold=None,
):
    """Tune each selected model on ``self.X`` / ``self.y`` and store the best parameters.

    Args:
        estimator: Iterable of model names to tune. When ``None``, the
            executors already held in ``self.estimator`` are used if it is
            non-empty; otherwise every name in ``ModelFactory.name_registry``.
        n_iter: Forwarded as ``n_trials`` to ``OptunaSearchCV``.
        optimize: Forwarded as ``scoring`` to the search object.
        search_library: ``'optuna'`` or ``'tune-sklearn'``.
        search_algorithm: With optuna, selects the sampler (``'tpe'`` or
            ``'random'``). With tune-sklearn, only ``'grid'`` builds a
            search object.
        early_stopping: Short name (``'asha'``, ``'hyperband'``,
            ``'median'``) translated for the chosen library; any other value
            is passed through unchanged.
        early_stopping_max_iters: Forwarded as ``max_iter`` / ``max_iters``.
        estimator_params: Optional mapping ``model name -> kwargs`` used when
            creating executors through ``ModelFactory.create_executor``. It is
            also copied into ``self.estimator_params``.
        n_jobs: Forwarded to the search object.
        verbose: Forwarded to ``OptunaSearchCV``.
        fold: Forwarded as ``cv`` to ``TuneGridSearchCV``.

    Returns:
        dict mapping each tuned model name to its ``best_params_``. The same
        values are written into ``self.estimator_params``.

    Raises:
        KeyError: optuna is selected and ``search_algorithm`` is neither
            ``'tpe'`` nor ``'random'``.
        ValueError: no search object exists for the requested
            ``search_library`` / ``search_algorithm`` combination.
    """
    LOGGER.info('TUNNING MODEL ...')

    # A fresh dict per call; a `{}` default would be shared between calls.
    if estimator_params is None:
        estimator_params = {}

    def _create(name_model):
        # Build an executor, applying caller-supplied kwargs when present.
        if name_model in estimator_params:
            return ModelFactory.create_executor(
                name_model, **estimator_params[name_model])
        return ModelFactory.create_executor(name_model)

    estimator_model = {}
    if estimator is None:
        if len(self.estimator) > 0:
            # Reuse the existing executors unless overrides were supplied.
            for name_model, existing in self.estimator.items():
                if name_model in estimator_params:
                    estimator_model[name_model] = _create(name_model)
                else:
                    estimator_model[name_model] = existing
        else:
            for name_model in ModelFactory.name_registry:
                estimator_model[name_model] = _create(name_model)
    else:
        for name_model in estimator:
            estimator_model[name_model] = _create(name_model)

    # update estimator_params
    for name_model, params in estimator_params.items():
        self.estimator_params[name_model] = params

    best_params_model = {}
    for name_model, model in estimator_model.items():
        LOGGER.info('tunning model_name: {}'.format(name_model))
        base_estimator = model.estimator
        tune_grid = model.tune_grid
        parameter_distributions = model.tune_distributions

        uses_plain_grid = (
            search_library in ('scikit-learn', 'tune-sklearn')
            and search_algorithm in ('grid', 'random')
        )
        parameter_grid = tune_grid if uses_plain_grid else parameter_distributions

        model_grid = None
        if search_library == 'optuna':
            pruner_translator = {
                "asha": optuna.pruners.SuccessiveHalvingPruner(),
                "hyperband": optuna.pruners.HyperbandPruner(),
                "median": optuna.pruners.MedianPruner(),
                False: optuna.pruners.NopPruner(),
                None: optuna.pruners.NopPruner(),
            }
            # Unknown values are handed to optuna unchanged.
            pruner = pruner_translator.get(early_stopping, early_stopping)

            sampler_translator = {
                "tpe": optuna.samplers.TPESampler(seed=24),
                "random": optuna.samplers.RandomSampler(seed=24),
            }
            sampler = sampler_translator[search_algorithm]

            # Fall back to the raw distributions so a failed conversion does
            # not leave `param_grid` unbound below.
            param_grid = parameter_distributions
            try:
                param_grid = get_optuna_distributions(parameter_distributions)
            except Exception:
                LOGGER.warning(
                    "Couldn't convert param_grid to specific library distributions. Exception:"
                )
                LOGGER.warning(traceback.format_exc())

            study = optuna.create_study(direction='maximize',
                                        sampler=sampler,
                                        pruner=pruner)

            LOGGER.info('Initializing optuna.intergration.OptnaSearchCV')
            model_grid = optuna.integration.OptunaSearchCV(
                estimator=base_estimator,
                param_distributions=param_grid,
                max_iter=early_stopping_max_iters,
                n_jobs=n_jobs,
                n_trials=n_iter,
                random_state=24,
                scoring=optimize,
                study=study,
                refit=False,
                verbose=verbose,
                error_score='raise')

        elif search_library == 'tune-sklearn':
            early_stopping_translator = {
                "asha": "ASHAScheduler",
                "hyperband": "HyperBandScheduler",
                "median": "MedianStoppingRule",
            }
            # Local name so the caller-visible argument is not rebound
            # between loop iterations.
            scheduler = early_stopping_translator.get(early_stopping,
                                                      early_stopping)

            do_early_stop = scheduler and can_early_stop(
                base_estimator, True, True, True, parameter_grid)

            if search_algorithm == 'grid':
                LOGGER.info('Initializing tune_sklearn.TuneGridSearchCV')
                model_grid = TuneGridSearchCV(
                    estimator=base_estimator,
                    param_grid=parameter_grid,
                    early_stopping=do_early_stop,
                    scoring=optimize,
                    cv=fold,
                    max_iters=early_stopping_max_iters,
                    refit=True,
                    n_jobs=n_jobs)

        if model_grid is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'No search object available for search_library={!r} with '
                'search_algorithm={!r}'.format(search_library,
                                               search_algorithm))

        model_grid.fit(self.X, self.y)
        best_params_model[name_model] = model_grid.best_params_

    # update estimator_params
    for name_model, params in best_params_model.items():
        self.estimator_params[name_model] = params

    LOGGER.info('best_params_model: {}'.format(best_params_model))
    return best_params_model
# Synthetic 10-class dataset; 1000 samples are held out as the test split.
X, y = make_classification(
    n_samples=11000,
    n_features=1000,
    n_informative=50,
    n_redundant=0,
    n_classes=10,
    class_sep=2.5,
)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=1000)

# Example parameters to tune from SGDClassifier
parameter_grid = {
    "alpha": [1e-4, 1e-1, 1],
    "epsilon": [0.01, 0.1],
}

#######################################################################
# The setup here is exactly how you would write it for scikit-learn.
# Now, let's try fitting a model.
tune_search = TuneGridSearchCV(
    SGDClassifier(),
    parameter_grid,
    early_stopping=True,
    max_iters=10,
)

import time  # Just to compare fit times

start = time.time()
tune_search.fit(x_train, y_train)
end = time.time()
print("Tune GridSearch Fit Time:", end - start)
# Tune GridSearch Fit Time: 15.436315774917603 (for an 8 core laptop)

#######################################################################
# Note the slight differences we introduced above:
#
# * an `early_stopping` argument, and
# * a specification of the `max_iters` parameter
#
def tune_model(self,
               X,
               y,
               estimator=None,
               fold=None,
               n_iter=2,
               optimize='max_error',
               search_library: str = 'optuna',
               search_algorithm='random',
               early_stopping='asha',
               early_stopping_max_iters=10,
               verbose=True,
               n_jobs=-1):
    """Tune each selected model on ``X`` / ``y`` and return the best parameters.

    Args:
        X: Feature data. Run through ``self.preprocessor`` first when
            ``self.preprocess`` is enabled, then wrapped in a DataFrame with
            a reset index.
        y: Target data, handled the same way as ``X``.
        estimator: Iterable of model names to tune. When ``None``, every name
            in ``ModelFactory.name_registry`` is tuned.
        fold: Forwarded as ``cv`` to ``TuneGridSearchCV``.
        n_iter: Forwarded as ``n_trials`` to ``OptunaSearchCV``.
        optimize: Forwarded as ``scoring`` to the search object.
        search_library: ``'optuna'`` or ``'tune-sklearn'``.
        search_algorithm: With optuna, selects the sampler (``'tpe'`` or
            ``'random'``). With tune-sklearn, only ``'grid'`` builds a
            search object.
        early_stopping: Short name (``'asha'``, ``'hyperband'``,
            ``'median'``) translated for the chosen library; any other value
            is passed through unchanged.
        early_stopping_max_iters: Forwarded as ``max_iter`` / ``max_iters``.
        verbose: Forwarded to ``OptunaSearchCV``.
        n_jobs: Forwarded to the search object.

    Returns:
        dict mapping each tuned model name to its ``best_params_``.

    Raises:
        KeyError: optuna is selected and ``search_algorithm`` is neither
            ``'tpe'`` nor ``'random'``.
        ValueError: no search object exists for the requested
            ``search_library`` / ``search_algorithm`` combination.
    """
    # Explicit comparison kept so non-bool truthy values behave as before.
    if self.preprocess == True:
        self.preprocessor.fit(X, y)
        X, y = self.preprocessor.transform(X, y)
    # NOTE(review): the conversion is applied unconditionally here; confirm
    # it was not meant to run only after preprocessing.
    X = pd.DataFrame(X).reset_index(drop=True)
    y = pd.DataFrame(y).reset_index(drop=True)

    LOGGER.info('tune models')

    model_names = ModelFactory.name_registry if estimator is None else estimator
    estimator_tune = {}
    for name_model in model_names:
        estimator_tune[name_model] = ModelFactory.create_executor(name_model)

    best_params_model = {}
    for name_model, model in estimator_tune.items():
        LOGGER.info('tunning model_name: {}'.format(name_model))
        base_estimator = model.estimator
        tune_grid = model.tune_grid
        parameter_distributions = model.tune_distributions

        uses_plain_grid = (
            search_library in ('scikit-learn', 'tune-sklearn')
            and search_algorithm in ('grid', 'random')
        )
        parameter_grid = tune_grid if uses_plain_grid else parameter_distributions

        model_grid = None
        if search_library == 'optuna':
            pruner_translator = {
                "asha": optuna.pruners.SuccessiveHalvingPruner(),
                "hyperband": optuna.pruners.HyperbandPruner(),
                "median": optuna.pruners.MedianPruner(),
                False: optuna.pruners.NopPruner(),
                None: optuna.pruners.NopPruner(),
            }
            # Unknown values are handed to optuna unchanged.
            pruner = pruner_translator.get(early_stopping, early_stopping)

            sampler_translator = {
                "tpe": optuna.samplers.TPESampler(seed=24),
                "random": optuna.samplers.RandomSampler(seed=24),
            }
            sampler = sampler_translator[search_algorithm]

            # Fall back to the raw distributions so a failed conversion does
            # not leave `param_grid` unbound below.
            param_grid = parameter_distributions
            try:
                param_grid = get_optuna_distributions(parameter_distributions)
            except Exception:
                # Use the same LOGGER as the rest of this method.
                LOGGER.warning(
                    "Couldn't convert param_grid to specific library distributions. Exception:"
                )
                LOGGER.warning(traceback.format_exc())

            study = optuna.create_study(direction='maximize',
                                        sampler=sampler,
                                        pruner=pruner)

            LOGGER.info('Initializing optuna.intergration.OptnaSearchCV')
            model_grid = optuna.integration.OptunaSearchCV(
                estimator=base_estimator,
                param_distributions=param_grid,
                max_iter=early_stopping_max_iters,
                n_jobs=n_jobs,
                n_trials=n_iter,
                random_state=24,
                scoring=optimize,
                study=study,
                refit=False,
                verbose=verbose,
                error_score='raise')

        elif search_library == 'tune-sklearn':
            early_stopping_translator = {
                "asha": "ASHAScheduler",
                "hyperband": "HyperBandScheduler",
                "median": "MedianStoppingRule",
            }
            # Local name so the caller-visible argument is not rebound
            # between loop iterations.
            scheduler = early_stopping_translator.get(early_stopping,
                                                      early_stopping)

            do_early_stop = scheduler and can_early_stop(
                base_estimator, True, True, True, parameter_grid)

            if search_algorithm == 'grid':
                LOGGER.info('Initializing tune_sklearn.TuneGridSearchCV')
                model_grid = TuneGridSearchCV(
                    estimator=base_estimator,
                    param_grid=parameter_grid,
                    early_stopping=do_early_stop,
                    scoring=optimize,
                    cv=fold,
                    max_iters=early_stopping_max_iters,
                    refit=True,
                    n_jobs=n_jobs)

        if model_grid is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'No search object available for search_library={!r} with '
                'search_algorithm={!r}'.format(search_library,
                                               search_algorithm))

        model_grid.fit(X, y)
        best_params = model_grid.best_params_
        LOGGER.info('best_params: {}'.format(best_params))
        best_params_model[name_model] = best_params

    return best_params_model
print(classification_report(y_test, y_pred))

# --- scikit-learn GridSearchCV, parallelised through joblib's 'ray' backend ---
start = time.time()
model = GridSearchCV(
    DecisionTreeClassifier(random_state=1),
    param_grid=param_model,
    scoring='accuracy',
    n_jobs=-1,
)
register_ray()
with joblib.parallel_backend('ray'):
    model = model.fit(X_train, y_train)
print(
    f"executed GridSearchCV with joblib in {time.time() - start}, nodes {model.best_estimator_.tree_.node_count}, "
    f"max_depth {model.best_estimator_.tree_.max_depth}"
)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# --- tune-sklearn TuneGridSearchCV on the same estimator and grid ---
start = time.time()
model = TuneGridSearchCV(
    DecisionTreeClassifier(random_state=1),
    param_grid=param_model,
    scoring='accuracy',
)
model = model.fit(X_train, y_train)
print(
    f"executed TuneGridSearchCV in {time.time() - start}, nodes {model.best_estimator_.tree_.node_count}, "
    f"max_depth {model.best_estimator_.tree_.max_depth}"
)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))