def _tune_run(self, config, resources_per_trial):
    """Run the grid search through ``tune.run`` and return its result.

    When an early stopping scheduler is set, one clone of the estimator is
    created per CV split; otherwise the estimator itself is the only entry
    of the list stored in ``config``.

    Args:
        config (dict): Configurations such as hyperparameters to run
            ``tune.run`` on. The ``"estimator_list"`` key is written
            into it in place.
        resources_per_trial (dict): Resources to use per trial within Ray.
            Accepted keys are `cpu`, `gpu` and custom resources, and values
            are integers specifying the number of each resource to use.

    Returns:
        analysis (`ExperimentAnalysis`): Object returned by `tune.run`.

    """
    if (self.pipeline_auto_early_stop and check_is_pipeline(self.estimator)
            and self.early_stopping):
        trainable = _PipelineTrainable
    else:
        trainable = _Trainable

    if self.early_stopping is None:
        estimators = [self.estimator]
    else:
        estimators = [clone(self.estimator) for _ in range(self.n_splits)]
    config["estimator_list"] = estimators

    # A list-style grid needs a dedicated searcher and an explicit sample
    # count; a plain grid relies on the defaults of ``tune.run``.
    searcher_kwargs = {}
    if isinstance(self.param_grid, list):
        searcher_kwargs["search_alg"] = ListSearcher(self.param_grid)
        searcher_kwargs["num_samples"] = self._list_grid_num_samples()

    return tune.run(
        trainable,
        scheduler=self.early_stopping,
        reuse_actors=True,
        verbose=self.verbose,
        stop={"training_iteration": self.max_iters},
        config=config,
        fail_fast=True,
        resources_per_trial=resources_per_trial,
        local_dir=os.path.expanduser(self.local_dir),
        loggers=self.loggers,
        **searcher_kwargs)
def _tune_run(self, config, resources_per_trial):
    """Run the grid search through ``tune.run`` and return its result.

    The estimator is placed in the Ray object store and the resulting
    references are stored in ``config["estimator_ids"]``: one reference
    per CV split when an early stopping scheduler is set, a single
    reference otherwise.

    Args:
        config (dict): Configurations such as hyperparameters to run
            ``tune.run`` on. The ``"estimator_ids"`` key is written
            into it in place.
        resources_per_trial (dict): Resources to use per trial within Ray.
            Accepted keys are `cpu`, `gpu` and custom resources, and values
            are integers specifying the number of each resource to use.

    Returns:
        analysis (`ExperimentAnalysis`): Object returned by `tune.run`.

    """
    if (self.pipeline_auto_early_stop and check_is_pipeline(self.estimator)
            and self.early_stopping):
        trainable = _PipelineTrainable
    else:
        trainable = _Trainable

    ref_count = 1 if self.early_stopping is None else self.n_splits
    config["estimator_ids"] = [
        ray.put(self.estimator) for _ in range(ref_count)
    ]

    # The iteration cap always applies; a user-supplied stopper is combined
    # with it rather than replacing it.
    iteration_cap = MaximumIterationStopper(max_iter=self.max_iters)
    if self.stopper:
        stop_criterion = CombinedStopper(iteration_cap, self.stopper)
    else:
        stop_criterion = iteration_cap

    run_args = {
        "scheduler": self.early_stopping,
        "reuse_actors": True,
        "verbose": self.verbose,
        "stop": stop_criterion,
        "config": config,
        "fail_fast": "raise",
        "resources_per_trial": resources_per_trial,
        "local_dir": os.path.expanduser(self.local_dir),
        "loggers": self.loggers,
        "time_budget_s": self.time_budget_s,
    }
    if isinstance(self.param_grid, list):
        run_args["search_alg"] = ListSearcher(self.param_grid)
        run_args["num_samples"] = self._list_grid_num_samples()

    # Silence the warning emitted for fail_fast="raise" for this call only.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", message="fail_fast='raise' detected.")
        return tune.run(trainable, **run_args)
def _fit(self, X, y=None, groups=None, **fit_params):
    """Run the full fit procedure: search, pick the best, optionally refit.

    Args:
        X (:obj:`array-like` (shape = [n_samples, n_features])):
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y (:obj:`array-like`): Shape of array expected to be [n_samples]
            or [n_samples, n_output]). Target relative to X for
            classification or regression; None for unsupervised learning.
        groups (:obj:`array-like` (shape (n_samples,)), optional):
            Group labels for the samples used while splitting the dataset
            into train/test set. Only used in conjunction with a "Group"
            `cv` instance (e.g., `GroupKFold`).
        **fit_params (:obj:`dict` of str): Parameters passed to
            the ``fit`` method of the estimator.

    Returns:
        :obj:`TuneBaseSearchCV` child instance, after fitting.

    Raises:
        ValueError: With multimetric scoring, if ``refit`` is truthy but is
            not the name of one of the scorers.
        TypeError: If a callable ``refit`` returns a non-integer index.
        IndexError: If a callable ``refit`` returns an index outside the
            range of ``cv_results_["params"]``.

    """
    self._check_params()
    cv = check_cv(cv=self.cv, y=y, classifier=is_classifier(self.estimator))
    self.n_splits = cv.get_n_splits(X, y, groups)

    # ``is_multi`` is only (re)computed when it has not been set before;
    # ``scoring`` is always re-resolved.
    multi_already_known = hasattr(self, "is_multi")
    resolved_scoring, multi_flag = _check_multimetric_scoring(
        self.estimator, self.scoring)
    self.scoring = resolved_scoring
    if not multi_already_known:
        self.is_multi = multi_flag

    if self.is_multi and self.refit and (not isinstance(self.refit, str) or
                                         self.refit not in self.scoring):
        raise ValueError("When using multimetric scoring, refit "
                         "must be the name of the scorer used to "
                         "pick the best parameters. If not needed, "
                         "set refit to False")

    assert isinstance(
        self.n_jobs, int), ("Internal error: self.n_jobs must be an integer.")

    gpus_per_trial = 1 if self.use_gpu else 0
    if self.n_jobs < 0:
        cpus_per_trial = 1
        if self.n_jobs < -1:
            warnings.warn(
                "`self.n_jobs` is automatically set "
                "-1 for any negative values.",
                category=UserWarning)
    else:
        available_cpus = multiprocessing.cpu_count()
        if ray.is_initialized():
            available_cpus = ray.cluster_resources()["CPU"]
        # Share the CPUs between ``n_jobs`` trials; shares above one CPU
        # are rounded up to a whole number, smaller ones stay fractional.
        cpus_per_trial = available_cpus / self.n_jobs
        if cpus_per_trial > 1:
            cpus_per_trial = int(np.ceil(cpus_per_trial))
    resources_per_trial = {"cpu": cpus_per_trial, "gpu": gpus_per_trial}

    # The data goes through the Ray object store; only the references
    # travel inside the trial config.
    X_id = ray.put(X)
    y_id = ray.put(y)
    config = {
        "early_stopping": bool(self.early_stopping),
        "early_stop_type": self.early_stop_type,
        "X_id": X_id,
        "y_id": y_id,
        "groups": groups,
        "cv": cv,
        "fit_params": fit_params,
        "scoring": self.scoring,
        "max_iters": self.max_iters,
        "return_train_score": self.return_train_score,
        "n_jobs": self.sk_n_jobs,
    }
    self._fill_config_hyperparam(config)

    analysis = self._tune_run(config, resources_per_trial)
    self.cv_results_ = self._format_results(self.n_splits, analysis)

    metric = self._metric_name
    base_metric = self._base_metric_name

    # For multi-metric evaluation, best_index, best_params and best_score
    # are stored only if refit names one of the scorers. In single metric
    # evaluation the base metric is "score".
    if self.refit or not self.is_multi:
        if callable(self.refit):
            # A callable refit is expected to return the index of the
            # best parameter set.
            self.best_index = self.refit(self.cv_results_)
            if not isinstance(self.best_index, numbers.Integral):
                raise TypeError("best_index returned is not an integer")
            n_candidates = len(self.cv_results_["params"])
            if self.best_index < 0 or self.best_index >= n_candidates:
                raise IndexError("best_index index out of range")
        else:
            ranks = self.cv_results_["rank_test_%s" % base_metric]
            self.best_index = ranks.argmin()
            mean_scores = self.cv_results_["mean_test_%s" % base_metric]
            self.best_score = mean_scores[self.best_index]
        best_config = analysis.get_best_config(
            metric=metric, mode="max", scope="last")
        self.best_params = self._clean_config_dict(best_config)

    if self.refit:
        base_estimator = clone(self.estimator)
        if self.early_stop_type == EarlyStopping.WARM_START_ENSEMBLE:
            logger.info("tune-sklearn uses `n_estimators` to warm "
                        "start, so this parameter can't be "
                        "set when warm start early stopping. "
                        "`n_estimators` defaults to `max_iters`.")
            if check_is_pipeline(base_estimator):
                final_step_estimator = base_estimator.steps[-1][1]
                final_step_estimator.set_params(n_estimators=self.max_iters)
            else:
                self.best_params["n_estimators"] = self.max_iters
        # Clone again after setting params in case some of the params are
        # estimators as well.
        self.best_estimator = clone(
            base_estimator.set_params(**self.best_params))

        refit_start_time = time.time()
        if y is None:
            self.best_estimator.fit(X, **fit_params)
        else:
            self.best_estimator.fit(X, y, **fit_params)
        self.refit_time = time.time() - refit_start_time

    return self
def __init__(self,
             estimator,
             early_stopping=None,
             scoring=None,
             n_jobs=None,
             sk_n_jobs=-1,
             cv=5,
             refit=True,
             verbose=0,
             error_score="raise",
             return_train_score=False,
             local_dir="~/ray_results",
             max_iters=1,
             use_gpu=False,
             loggers=None,
             pipeline_auto_early_stop=True):
    """Validate the arguments and store the search configuration.

    Args:
        estimator: Estimator to tune. When ``pipeline_auto_early_stop`` is
            set and the estimator is a pipeline, its last step is used as
            ``base_estimator`` to determine the early stopping type.
        early_stopping: Falsy disables early stopping. ``True`` is
            replaced by ``"AsyncHyperBandScheduler"``; any truthy value is
            then passed through ``resolve_early_stopping``.
        scoring: Scoring specification, resolved with
            ``_check_multimetric_scoring`` unless ``is_multi`` is
            already set on the instance.
        n_jobs: Stored as ``int(n_jobs or -1)``.
        sk_n_jobs: Stored as ``sk_n_jobs`` unless the ``SKLEARN_N_JOBS``
            environment variable is set, which takes precedence.
        cv: Stored unchanged.
        refit: Stored unchanged; with multimetric scoring it is also used
            as the base metric name.
        verbose: Stored unchanged.
        error_score: Stored unchanged.
        return_train_score: Stored unchanged.
        local_dir: Stored unchanged.
        max_iters: Must be at least 1. Reset to 1 (with a warning) when
            early stopping is neither supported nor requested.
        use_gpu: Stored unchanged.
        loggers: Passed through ``resolve_loggers`` before being stored.
        pipeline_auto_early_stop: See ``estimator``.

    Raises:
        ValueError: If ``max_iters < 1``, or if early stopping is requested
            but ``_can_early_stop()`` reports it is unsupported.

    """
    if max_iters < 1:
        raise ValueError("max_iters must be greater than or equal to 1.")

    self.estimator = estimator
    self.base_estimator = estimator
    self.pipeline_auto_early_stop = pipeline_auto_early_stop
    if self.pipeline_auto_early_stop and check_is_pipeline(estimator):
        _step_name, final_estimator = estimator.steps[-1]
        self.base_estimator = final_estimator

    self.early_stop_type = get_early_stop_type(self.base_estimator,
                                               bool(early_stopping))

    if not self._can_early_stop():
        if early_stopping:
            raise ValueError("Early stopping is not supported because "
                             "the estimator does not have `partial_fit`, "
                             "does not support warm_start, or is a "
                             "tree classifier. Set "
                             "`early_stopping=False`.")
        # Reaching this point means early stopping was not requested.
        if max_iters > 1:
            warnings.warn(
                "max_iters is set > 1 but incremental/partial training "
                "is not enabled. To enable partial training, "
                "ensure the estimator has `partial_fit` or "
                "`warm_start` and set `early_stopping=True`. "
                "Automatically setting max_iters=1.",
                category=UserWarning)
            max_iters = 1

    # Resolve the scoring and derive the metric names used for reporting.
    self.scoring = scoring
    self.refit = refit
    if not hasattr(self, "is_multi"):
        self.scoring, self.is_multi = _check_multimetric_scoring(
            self.estimator, self.scoring)

    self._base_metric_name = self.refit if self.is_multi else "score"
    self._metric_name = "average_test_%s" % self._base_metric_name

    if early_stopping:
        # NOTE(review): this warning looks unreachable, since the
        # ValueError above already fires when early stopping is requested
        # but not supported -- confirm `_can_early_stop()` is deterministic.
        if not self._can_early_stop() and is_lightgbm_model(
                self.base_estimator):
            warnings.warn("lightgbm>=3.0.0 required for early_stopping "
                          "functionality.")
        assert self._can_early_stop()

        if max_iters == 1:
            warnings.warn(
                "early_stopping is enabled but max_iters = 1. "
                "To enable partial training, set max_iters > 1.",
                category=UserWarning)

        stop_type = self.early_stop_type
        if stop_type == EarlyStopping.XGB:
            warnings.warn(
                "tune-sklearn implements incremental learning "
                "for xgboost models following this: "
                "https://github.com/dmlc/xgboost/issues/1686. "
                "This may negatively impact performance. To "
                "disable, set `early_stopping=False`.",
                category=UserWarning)
        elif stop_type == EarlyStopping.LGBM:
            warnings.warn(
                "tune-sklearn implements incremental learning "
                "for lightgbm models following this: "
                "https://lightgbm.readthedocs.io/en/latest/pythonapi/"
                "lightgbm.LGBMModel.html#lightgbm.LGBMModel.fit "
                "This may negatively impact performance. To "
                "disable, set `early_stopping=False`.",
                category=UserWarning)
        elif stop_type == EarlyStopping.CATBOOST:
            warnings.warn(
                "tune-sklearn implements incremental learning "
                "for Catboost models following this: "
                "https://catboost.ai/docs/concepts/python-usages-"
                "examples.html#training-continuation "
                "This may negatively impact performance. To "
                "disable, set `early_stopping=False`.",
                category=UserWarning)

        if early_stopping is True:
            # ``True`` selects the default scheduler by name so that it is
            # resolved like any other scheduler specification below.
            early_stopping = "AsyncHyperBandScheduler"
        early_stopping = resolve_early_stopping(early_stopping, max_iters,
                                                self._metric_name)

    self.early_stopping = early_stopping
    self.max_iters = max_iters
    self.cv = cv
    self.n_jobs = int(n_jobs or -1)

    # The environment variable, when present, wins over the argument.
    env_sk_n_jobs = os.environ.get("SKLEARN_N_JOBS")
    self.sk_n_jobs = sk_n_jobs if env_sk_n_jobs is None else int(
        env_sk_n_jobs)

    self.verbose = verbose
    self.error_score = error_score
    self.return_train_score = return_train_score
    self.local_dir = local_dir
    self.use_gpu = use_gpu
    self.loggers = resolve_loggers(loggers)
    assert isinstance(self.n_jobs, int)
def _tune_run(self, config, resources_per_trial):
    """Wrapper to call ``tune.run``.

    Multiple estimators are generated when an early stopping scheduler is
    set, whereas a single estimator is used otherwise. When ``self.seed``
    is not None, the ``random`` and ``numpy.random`` global generators are
    seeded before anything else happens.

    Args:
        config (dict): Configurations such as hyperparameters to run
            ``tune.run`` on. It is updated in place with
            ``"estimator_list"`` and, when every entry of
            ``param_distributions`` is a Tune domain, with those
            distributions as well.
        resources_per_trial (dict): Resources to use per trial within Ray.
            Accepted keys are `cpu`, `gpu` and custom resources, and values
            are integers specifying the number of each resource to use.

    Returns:
        analysis (`ExperimentAnalysis`): Object returned by `tune.run`.

    Raises:
        ValueError: If ``self.search_optimization`` is not one of
            ``"random"``, ``"bayesian"``, ``"bohb"``, ``"optuna"`` or
            ``"hyperopt"``.

    """
    if self.seed is not None:
        random.seed(self.seed)
        np.random.seed(self.seed)

    trainable = _Trainable
    if (self.pipeline_auto_early_stop and check_is_pipeline(self.estimator)
            and self.early_stopping):
        trainable = _PipelineTrainable

    max_iter = self.max_iters
    if self.early_stopping is not None:
        config["estimator_list"] = [
            clone(self.estimator) for _ in range(self.n_splits)
        ]
        if hasattr(self.early_stopping, "_max_t_attr"):
            # Stopping is delegated to schedulers which support it, but
            # the run must still stop eventually, so the hard cap is made
            # very large instead of being removed.
            max_iter = self.max_iters * 10
    else:
        config["estimator_list"] = [self.estimator]

    stopper = MaximumIterationStopper(max_iter=max_iter)
    if self.stopper:
        stopper = CombinedStopper(stopper, self.stopper)

    run_args = dict(
        scheduler=self.early_stopping,
        reuse_actors=True,
        verbose=self.verbose,
        stop=stopper,
        num_samples=self.n_trials,
        config=config,
        fail_fast="raise",
        resources_per_trial=resources_per_trial,
        local_dir=os.path.expanduser(self.local_dir),
        loggers=self.loggers,
        time_budget_s=self.time_budget_s)

    if self.search_optimization == "random":
        if isinstance(self.param_distributions, list):
            search_algo = RandomListSearcher(self.param_distributions)
        else:
            search_algo = BasicVariantGenerator()
    else:
        # If the distributions are already Tune domains they go straight
        # into the trial config; otherwise a searcher-specific search
        # space is built and handed to the searcher.
        search_space = None
        override_search_space = True
        if self._is_param_distributions_all_tune_domains():
            run_args["config"].update(self.param_distributions)
            override_search_space = False

        search_kwargs = self.search_kwargs.copy()
        search_kwargs.update(metric=self._metric_name, mode="max")

        # Searcher backends are imported lazily so that only the selected
        # one needs to be installed.
        if self.search_optimization == "bayesian":
            from ray.tune.suggest.skopt import SkOptSearch
            if override_search_space:
                search_space = self.param_distributions
            search_algo = SkOptSearch(space=search_space, **search_kwargs)
        elif self.search_optimization == "bohb":
            from ray.tune.suggest.bohb import TuneBOHB
            if override_search_space:
                search_space = self._get_bohb_config_space()
            # ``0`` is a valid seed, so test against None (as done when
            # seeding above) instead of relying on truthiness.
            if self.seed is not None:
                warnings.warn("'seed' is not implemented for BOHB.")
            # NOTE: the seed is deliberately not forwarded to TuneBOHB.
            search_algo = TuneBOHB(space=search_space, **search_kwargs)
        elif self.search_optimization == "optuna":
            from ray.tune.suggest.optuna import OptunaSearch
            from optuna.samplers import TPESampler
            sampler = TPESampler(seed=self.seed)
            if override_search_space:
                search_space = self._get_optuna_params()
            search_algo = OptunaSearch(
                space=search_space, sampler=sampler, **search_kwargs)
        elif self.search_optimization == "hyperopt":
            from ray.tune.suggest.hyperopt import HyperOptSearch
            if override_search_space:
                search_space = self._get_hyperopt_params()
            search_algo = HyperOptSearch(
                space=search_space,
                random_state_seed=self.seed,
                **search_kwargs)
        else:
            # This should not happen as the input is validated before
            # this method. Still, just to be sure, raise an error here.
            raise ValueError(
                f"Invalid search optimizer: {self.search_optimization}")

        # Only the non-random searchers are capped at ``n_jobs``
        # concurrent trials.
        if isinstance(self.n_jobs, int) and self.n_jobs > 0:
            search_algo = ConcurrencyLimiter(
                search_algo, max_concurrent=self.n_jobs)

    run_args["search_alg"] = search_algo

    # Silence the warning emitted for fail_fast="raise" for this call only.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", message="fail_fast='raise' detected.")
        analysis = tune.run(trainable, **run_args)
    return analysis
def __init__(self,
             estimator,
             early_stopping=None,
             scoring=None,
             n_jobs=None,
             sk_n_jobs=-1,
             cv=5,
             refit=True,
             verbose=0,
             error_score="raise",
             return_train_score=False,
             local_dir="~/ray_results",
             max_iters=1,
             use_gpu=False,
             loggers=None,
             pipeline_auto_early_stop=True):
    """Validate the arguments and store the search configuration.

    Args:
        estimator: Estimator to tune. When ``pipeline_auto_early_stop`` is
            set and the estimator is a pipeline, its last step is used as
            ``base_estimator`` to determine the early stopping type.
        early_stopping: Falsy disables early stopping. ``True`` is
            replaced by ``"AsyncHyperBandScheduler"``; any truthy value is
            then passed through ``resolve_early_stopping``.
        scoring: Stored unchanged.
        n_jobs: Stored as ``int(n_jobs or -1)``.
        sk_n_jobs: Stored as ``sk_n_jobs`` unless the ``SKLEARN_N_JOBS``
            environment variable is set, which takes precedence.
        cv: Stored unchanged.
        refit: Stored unchanged.
        verbose: Stored unchanged.
        error_score: Stored unchanged.
        return_train_score: Stored unchanged.
        local_dir: Stored unchanged.
        max_iters: Must be at least 1. Reset to 1 (with a warning) when
            early stopping is neither supported nor requested.
        use_gpu: Stored unchanged.
        loggers: Passed through ``resolve_loggers`` before being stored.
        pipeline_auto_early_stop: See ``estimator``.

    Raises:
        ValueError: If ``max_iters < 1``, or if early stopping is requested
            but ``_can_early_stop()`` reports it is unsupported.

    """
    if max_iters < 1:
        raise ValueError("max_iters must be greater than or equal to 1.")

    self.estimator = estimator
    self.base_estimator = estimator
    self.pipeline_auto_early_stop = pipeline_auto_early_stop
    if self.pipeline_auto_early_stop and check_is_pipeline(estimator):
        _step_name, final_estimator = estimator.steps[-1]
        self.base_estimator = final_estimator

    self.early_stop_type = get_early_stop_type(self.base_estimator,
                                               bool(early_stopping))

    if not self._can_early_stop():
        if early_stopping:
            raise ValueError("Early stopping is not supported because "
                             "the estimator does not have `partial_fit`, "
                             "does not support warm_start, or is a "
                             "tree classifier. Set "
                             "`early_stopping=False`.")
        # Reaching this point means early stopping was not requested.
        if max_iters > 1:
            warnings.warn(
                "max_iters is set > 1 but incremental/partial training "
                "is not enabled. To enable partial training, "
                "ensure the estimator has `partial_fit` or "
                "`warm_start` and set `early_stopping=True`. "
                "Automatically setting max_iters=1.",
                category=UserWarning)
            max_iters = 1

    if early_stopping:
        assert self._can_early_stop()

        if max_iters == 1:
            warnings.warn(
                "early_stopping is enabled but max_iters = 1. "
                "To enable partial training, set max_iters > 1.",
                category=UserWarning)

        if self.early_stop_type == EarlyStopping.XGB:
            warnings.warn(
                "tune-sklearn implements incremental learning "
                "for xgboost models following this: "
                "https://github.com/dmlc/xgboost/issues/1686. "
                "This may negatively impact performance. To "
                "disable, set `early_stopping=False`.",
                category=UserWarning)

        if early_stopping is True:
            # ``True`` selects the default scheduler by name so that it is
            # resolved like any other scheduler specification below.
            early_stopping = "AsyncHyperBandScheduler"
        early_stopping = resolve_early_stopping(early_stopping, max_iters)

    self.early_stopping = early_stopping
    self.max_iters = max_iters
    self.cv = cv
    self.scoring = scoring
    self.n_jobs = int(n_jobs or -1)

    # The environment variable, when present, wins over the argument.
    env_sk_n_jobs = os.environ.get("SKLEARN_N_JOBS")
    self.sk_n_jobs = sk_n_jobs if env_sk_n_jobs is None else int(
        env_sk_n_jobs)

    self.refit = refit
    self.verbose = verbose
    self.error_score = error_score
    self.return_train_score = return_train_score
    self.local_dir = local_dir
    self.use_gpu = use_gpu
    self.loggers = resolve_loggers(loggers)
    assert isinstance(self.n_jobs, int)
def _tune_run(self, config, resources_per_trial):
    """Wrapper to call ``tune.run``.

    Multiple estimators are generated when an early stopping scheduler is
    set, whereas a single estimator is used otherwise.

    Args:
        config (dict): Configurations such as hyperparameters to run
            ``tune.run`` on. The ``"estimator_list"`` key is written
            into it in place.
        resources_per_trial (dict): Resources to use per trial within Ray.
            Accepted keys are `cpu`, `gpu` and custom resources, and values
            are integers specifying the number of each resource to use.

    Returns:
        analysis (`ExperimentAnalysis`): Object returned by `tune.run`.

    Raises:
        ValueError: If ``self.search_optimization`` is not one of
            ``"random"``, ``"bayesian"``, ``"bohb"``, ``"optuna"`` or
            ``"hyperopt"``.

    """
    trainable = _Trainable
    if (self.pipeline_auto_early_stop and check_is_pipeline(self.estimator)
            and self.early_stopping):
        trainable = _PipelineTrainable

    stop_condition = {"training_iteration": self.max_iters}
    if self.early_stopping is not None:
        config["estimator_list"] = [
            clone(self.estimator) for _ in range(self.n_splits)
        ]
        if hasattr(self.early_stopping, "_max_t_attr"):
            # Stopping is delegated to schedulers which support it, but
            # the run must still stop eventually, so the stop condition is
            # made very large instead of being removed.
            stop_condition = {"training_iteration": self.max_iters * 10}
    else:
        config["estimator_list"] = [self.estimator]

    # Arguments shared by every search strategy.
    run_args = dict(
        scheduler=self.early_stopping,
        reuse_actors=True,
        verbose=self.verbose,
        stop=stop_condition,
        num_samples=self.num_samples,
        config=config,
        fail_fast=True,
        resources_per_trial=resources_per_trial,
        local_dir=os.path.expanduser(self.local_dir),
        loggers=self.loggers)

    if self.search_optimization == "random":
        # Random search only needs an explicit searcher for list-style
        # distributions and is never wrapped in a concurrency limiter.
        if isinstance(self.param_distributions, list):
            run_args["search_alg"] = RandomListSearcher(
                self.param_distributions)
        return tune.run(trainable, **run_args)

    # Searcher backends are imported lazily so that only the selected one
    # needs to be installed.
    if self.search_optimization == "bayesian":
        from skopt import Optimizer
        from ray.tune.suggest.skopt import SkOptSearch
        hyperparameter_names, spaces = self._get_skopt_params()
        search_algo = SkOptSearch(
            Optimizer(spaces),
            hyperparameter_names,
            metric=self._metric_name,
            mode="max",
            **self.search_kwargs)
    elif self.search_optimization == "bohb":
        from ray.tune.suggest.bohb import TuneBOHB
        config_space = self._get_bohb_config_space()
        search_algo = TuneBOHB(
            config_space,
            metric=self._metric_name,
            mode="max",
            **self.search_kwargs)
    elif self.search_optimization == "optuna":
        from ray.tune.suggest.optuna import OptunaSearch
        config_space = self._get_optuna_params()
        search_algo = OptunaSearch(
            config_space,
            metric=self._metric_name,
            mode="max",
            **self.search_kwargs)
    elif self.search_optimization == "hyperopt":
        from ray.tune.suggest.hyperopt import HyperOptSearch
        config_space = self._get_hyperopt_params()
        search_algo = HyperOptSearch(
            config_space,
            metric=self._metric_name,
            mode="max",
            **self.search_kwargs)
    else:
        # Without this branch an unknown optimizer would reach the code
        # below with ``search_algo`` unbound and fail with an
        # UnboundLocalError; fail with a clear message instead.
        raise ValueError(
            f"Invalid search optimizer: {self.search_optimization}")

    if isinstance(self.n_jobs, int) and self.n_jobs > 0:
        search_algo = ConcurrencyLimiter(
            search_algo, max_concurrent=self.n_jobs)
    run_args["search_alg"] = search_algo

    analysis = tune.run(trainable, **run_args)
    return analysis