def __init__(
    self,
    restore_path: str = None,
    trainable: Optional[
        Union[str, Callable, Type[Trainable], BaseTrainer]
    ] = None,
    param_space: Optional[Dict[str, Any]] = None,
    tune_config: Optional[TuneConfig] = None,
    run_config: Optional[RunConfig] = None,
    _tuner_kwargs: Optional[Dict] = None,
):
    # Restored from Tuner checkpoint.
    if restore_path:
        trainable_ckpt = os.path.join(restore_path, _TRAINABLE_PKL)
        with open(trainable_ckpt, "rb") as fp:
            trainable = pickle.load(fp)

        tuner_ckpt = os.path.join(restore_path, _TUNER_PKL)
        with open(tuner_ckpt, "rb") as fp:
            tuner = pickle.load(fp)
            self.__dict__.update(tuner.__dict__)

        self._is_restored = True
        self._trainable = trainable
        self._experiment_checkpoint_dir = restore_path
        return

    # Start from fresh
    if not trainable:
        raise TuneError("You need to provide a trainable to tune.")

    # If no run config was passed to Tuner directly, use the one from the Trainer,
    # if available
    if not run_config and isinstance(trainable, BaseTrainer):
        run_config = trainable.run_config

    self._is_restored = False
    self._trainable = trainable
    self._tune_config = tune_config or TuneConfig()
    self._run_config = run_config or RunConfig()
    self._tuner_kwargs = copy.deepcopy(_tuner_kwargs) or {}
    self._experiment_checkpoint_dir = self._setup_create_experiment_checkpoint_dir(
        self._run_config
    )

    # Not used for restored Tuner.
    self._param_space = param_space or {}

    # This needs to happen before `tune.run()` is kicked in.
    # This is because currently tune does not exit gracefully if
    # run in ray client mode - if a crash happens, it just exits immediately
    # without allowing for checkpointing the tuner and trainable.
    # Thus this has to happen before tune.run() so that we can have something
    # to restore from.
    tuner_ckpt = os.path.join(self._experiment_checkpoint_dir, _TUNER_PKL)
    with open(tuner_ckpt, "wb") as fp:
        pickle.dump(self, fp)

    trainable_ckpt = os.path.join(self._experiment_checkpoint_dir, _TRAINABLE_PKL)
    with open(trainable_ckpt, "wb") as fp:
        pickle.dump(self._trainable, fp)
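# Hedged sketch (not part of the snippet above): the restore branch is normally
# reached through the public `Tuner.restore()` API rather than by constructing
# the internal tuner directly. `experiment_path` is a placeholder for an
# existing experiment directory containing the pickled tuner and trainable.
from ray.tune import Tuner

experiment_path = "~/ray_results/my_experiment"  # placeholder path
tuner = Tuner.restore(experiment_path)  # loads the tuner/trainable pickles from disk
results = tuner.fit()  # resumes the interrupted run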
def test_data_parallel_trainer(ray_start_8_cpus):
    num_workers = 2
    trainer = AssertingDataParallelTrainer(
        train_fn, scaling_config=ScalingConfig(num_workers=num_workers)
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "num_epochs": 100,
                "metric": tune.grid_search([1, 2, 3, 4, 5]),
            }
        },
        tune_config=TuneConfig(
            mode="max",
            metric="metric",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
    # + 1 for Trainable
    assert result_grid.get_dataframe()["num_cpus"].max() > num_workers + 1
def test_tuner_with_torch_trainer(self):
    """Test a successful run using torch trainer."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"), ignore_errors=True
    )
    # The following two should be tunable.
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
    scaling_config = {"num_workers": 1, "use_gpu": False}
    trainer = TorchTrainer(
        train_loop_per_worker=linear_train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        "train_loop_config": {
            "batch_size": tune.grid_search([4, 8]),
            "epochs": tune.grid_search([5, 10]),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="loss"),
    )
    results = tuner.fit()
    assert len(results) == 8
def test_retry(ray_start_4_cpus):
    def train_func():
        ckpt = session.get_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer, run_config=RunConfig(failure_config=FailureConfig(max_failures=3))
    )

    analysis = tuner.fit()._experiment_analysis
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
def __init__(
    self,
    *,
    scaling_config: Optional[ScalingConfig] = None,
    run_config: Optional[RunConfig] = None,
    datasets: Optional[Dict[str, GenDataset]] = None,
    preprocessor: Optional["Preprocessor"] = None,
    resume_from_checkpoint: Optional[Checkpoint] = None,
):
    self.scaling_config = (
        scaling_config if scaling_config is not None else ScalingConfig()
    )
    self.run_config = run_config if run_config is not None else RunConfig()
    self.datasets = datasets if datasets is not None else {}
    self.preprocessor = preprocessor
    self.resume_from_checkpoint = resume_from_checkpoint

    self._validate_attributes()

    if datasets and not self.scaling_config._max_cpu_fraction_per_node:
        logger.warning(
            "When passing `datasets` to a Trainer, it is recommended to "
            "reserve at least 20% of node CPUs for Dataset execution by setting "
            "`_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. "
            "Not doing so can lead to resource contention or hangs. "
            "See https://docs.ray.io/en/master/data/key-concepts.html"
            "#example-datasets-in-tune for more info."
        )
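# Hedged sketch: one way a caller might address the warning emitted above,
# following the advice in the message itself. `_max_cpu_fraction_per_node` is
# the experimental ScalingConfig field referenced by the warning; the exact
# value (0.8, i.e. reserving ~20% of node CPUs for Dataset execution) is the
# one the message recommends, not a requirement.
from ray.air.config import ScalingConfig

scaling_config = ScalingConfig(num_workers=2, _max_cpu_fraction_per_node=0.8)
# This would then be passed as `scaling_config=scaling_config` to a Trainer
# that also receives `datasets`.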
def test_tuner_trainer_fail(self):
    trainer = FailingTrainer()
    param_space = {
        "scaling_config": ScalingConfig(num_workers=tune.grid_search([1, 2]))
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner_trainer_fail"),
        param_space=param_space,
        tune_config=TuneConfig(mode="max", metric="iteration"),
    )
    results = tuner.fit()
    assert len(results) == 2
    for i in range(2):
        assert results[i].error
def main(num_workers=2, use_gpu=False):
    trainer = TorchTrainer(
        train_func,
        train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        run_config=RunConfig(
            callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")]
        ),
    )
    final_results = trainer.fit()

    print("Final metrics: ", final_results.metrics)
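# Hedged sketch: a minimal invocation of `main()` above. The original example
# script presumably wires these arguments up via argparse; that plumbing is
# omitted here.
if __name__ == "__main__":
    main(num_workers=2, use_gpu=False)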
def __init__(
    self,
    *,
    scaling_config: Optional[ScalingConfig] = None,
    run_config: Optional[RunConfig] = None,
    datasets: Optional[Dict[str, GenDataset]] = None,
    preprocessor: Optional["Preprocessor"] = None,
    resume_from_checkpoint: Optional[Checkpoint] = None,
):
    self.scaling_config = scaling_config if scaling_config is not None else {}
    self.run_config = run_config if run_config is not None else RunConfig()
    self.datasets = datasets if datasets is not None else {}
    self.preprocessor = preprocessor
    self.resume_from_checkpoint = resume_from_checkpoint

    self._validate_attributes()
def test_tuner_with_xgboost_trainer(self):
    """Test a successful run."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner"), ignore_errors=True
    )
    trainer = XGBoostTrainer(
        label_column="target",
        params={},
        datasets={"train": gen_dataset_func_eager()},
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        "datasets": {
            "train": tune.grid_search(
                [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
            ),
        },
        "params": {
            "objective": "binary:logistic",
            "tree_method": "approx",
            "eval_metric": ["logloss", "error"],
            "eta": tune.loguniform(1e-4, 1e-1),
            "subsample": tune.uniform(0.5, 1.0),
            "max_depth": tune.randint(1, 9),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="train-error"),
        # Limit the number of trials running at one time,
        # as the unit test only has access to 4 CPUs on Buildkite.
        _tuner_kwargs={"max_concurrent_trials": 1},
    )
    results = tuner.fit()
    assert not isinstance(results.get_best_result().checkpoint, TrialCheckpoint)
    assert len(results) == 4
def test_gbdt_trainer(ray_start_8_cpus):
    data_raw = load_breast_cancer()
    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
    dataset_df["target"] = data_raw["target"]
    train_ds = ray.data.from_pandas(dataset_df).repartition(16)
    trainer = AssertingXGBoostTrainer(
        datasets={TRAIN_DATASET_KEY: train_ds},
        label_column="target",
        scaling_config=ScalingConfig(num_workers=2),
        params={
            "objective": "binary:logistic",
            "eval_metric": ["logloss"],
        },
    )
    tuner = Tuner(
        trainer,
        param_space={
            "num_boost_round": 100,
            "params": {
                "eta": tune.grid_search([0.28, 0.29, 0.3, 0.31, 0.32]),
            },
        },
        tune_config=TuneConfig(
            mode="min",
            metric="train-logloss",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
def test_tuner_run_config_override(self):
    trainer = DummyTrainer(run_config=RunConfig(stop={"metric": 4}))
    tuner = Tuner(trainer)

    assert tuner._local_tuner._run_config.stop == {"metric": 4}
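# Hedged sketch, not a test from the suite: per the precedence logic in
# TunerInternal.__init__ above (the Trainer's run_config is only used when the
# Tuner is not given one directly), a RunConfig passed to Tuner is expected to
# take precedence over the Trainer's own run_config. `DummyTrainer` is reused
# from the snippet above.
trainer = DummyTrainer(run_config=RunConfig(stop={"metric": 4}))
tuner = Tuner(trainer, run_config=RunConfig(stop={"metric": 10}))

assert tuner._local_tuner._run_config.stop == {"metric": 10}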
def test_tuner_with_xgboost_trainer_driver_fail_and_resume(self):
    # So that we have some global checkpointing happening.
    os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "1"
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail"),
        ignore_errors=True,
    )
    trainer = XGBoostTrainer(
        label_column="target",
        params={},
        datasets={"train": gen_dataset_func_eager()},
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        "datasets": {
            "train": tune.grid_search(
                [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
            ),
        },
        "params": {
            "objective": "binary:logistic",
            "tree_method": "approx",
            "eval_metric": ["logloss", "error"],
            "eta": tune.loguniform(1e-4, 1e-1),
            "subsample": tune.uniform(0.5, 1.0),
            "max_depth": tune.randint(1, 9),
        },
    }

    class FailureInjectionCallback(Callback):
        """Inject failure at the configured iteration number."""

        def __init__(self, num_iters=10):
            self.num_iters = num_iters

        def on_step_end(self, iteration, trials, **kwargs):
            if iteration == self.num_iters:
                print(f"Failing after {self.num_iters} iters.")
                raise RuntimeError

    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(
            name="test_tuner_driver_fail", callbacks=[FailureInjectionCallback()]
        ),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="train-error"),
        # Limit the number of trials running at one time,
        # as the unit test only has access to 4 CPUs on Buildkite.
        _tuner_kwargs={"max_concurrent_trials": 1},
    )
    with self.assertRaises(TuneError):
        tuner.fit()

    # Test resume
    restore_path = os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail")
    tuner = Tuner.restore(restore_path)

    # A hack before we figure out RunConfig semantics across resumes.
    tuner._local_tuner._run_config.callbacks = None
    results = tuner.fit()
    assert len(results) == 4
    results = tuner.fit()
    assert len(results) == 8

def test_tuner_run_config_override(self):
    trainer = DummyTrainer(run_config=RunConfig(stop={"metric": 4}))
    tuner = Tuner(trainer)

    assert tuner._local_tuner._run_config.stop == {"metric": 4}


@pytest.mark.parametrize(
    "params_expected",
    [
        (
            {"run_config": RunConfig(progress_reporter=CLIReporter())},
            lambda kw: isinstance(kw["progress_reporter"], CLIReporter),
        ),
        (
            {"run_config": RunConfig(reuse_actors=True)},
            lambda kw: kw["reuse_actors"] is True,
        ),
        (
            {"run_config": RunConfig(log_to_file="some_file")},
            lambda kw: kw["log_to_file"] == "some_file",
        ),
tuner = Tuner(
    trainer,
    param_space={
        "train_loop_config": {
            "lr": tune.choice([0.001, 0.01, 0.1]),
            "momentum": 0.8,
            "head_location": None,
            "worker_locations": None,
            "test_mode": args.smoke_test,
            "batch_size": 128 * num_training_workers,
            # For the long running test, we want the training to run forever,
            # and it will be terminated by the release test infra.
            "epochs": 1 if args.smoke_test else sys.maxsize,
        }
    },
    tune_config=TuneConfig(
        num_samples=4, metric="loss", mode="min", scheduler=pbt_scheduler
    ),
    run_config=RunConfig(
        stop={"training_iteration": 1} if args.smoke_test else None,
        callbacks=[
            FailureInjectorCallback(time_between_checks=90),
            ProgressCallback(),
        ],
    ),
)
results = tuner.fit()
print(results.get_best_result(metric="loss", mode="min"))
"momentum": [0.8, 0.9, 0.99], } }, ) tuner = Tuner( trainer, param_space={ "train_loop_config": { "lr": tune.choice([0.001, 0.01, 0.1]), "momentum": 0.8, "batch_size": 128 * args.num_workers, "epochs": args.num_epochs, "test_mode": args.smoke_test, # whether to to subset the data } }, tune_config=TuneConfig(num_samples=4, metric="loss", mode="min", scheduler=pbt_scheduler), run_config=RunConfig( stop={"training_iteration": 2 if args.smoke_test else 100}, failure_config=FailureConfig( max_failures=3), # used for fault tolerance ), ) results = tuner.fit() print(results.get_best_result(metric="loss", mode="min"))