def test_reuse_checkpoint(ray_start_4_cpus):
    def train_func(config):
        itr = 0
        ckpt = session.get_checkpoint()
        if ckpt is not None:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1
        for i in range(itr, config["max_iter"]):
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 5}},
    )
    [trial] = tuner.fit()._experiment_analysis.trials
    checkpoint_path = trial.checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 4

    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 10}},
    ).restore(trial.local_dir)
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 5
def test_tuner_fn_trainable_checkpoint_at_end_none():
    tuner = Tuner(
        lambda config, checkpoint_dir: 1,
        run_config=ray.air.RunConfig(
            checkpoint_config=ray.air.CheckpointConfig(checkpoint_at_end=None)
        ),
    )
    tuner.fit()
def test_tuner_fn_trainable_checkpoint_at_end_true():
    tuner = Tuner(
        lambda config, checkpoint_dir: 1,
        run_config=ray.air.RunConfig(
            checkpoint_config=ray.air.CheckpointConfig(checkpoint_at_end=True)
        ),
    )
    with pytest.raises(TuneError):
        tuner.fit()
def test_tuner_api_kwargs(params_expected):
    tuner_params, assertion = params_expected
    tuner = Tuner(lambda config: 1, **tuner_params)

    caught_kwargs = {}

    def catch_kwargs(**kwargs):
        caught_kwargs.update(kwargs)

    with patch("ray.tune.impl.tuner_internal.run", catch_kwargs):
        tuner.fit()

    assert assertion(caught_kwargs)
def train_rl_ppo_online(num_workers: int, use_gpu: bool = False) -> Result:
    print("Starting online training")
    trainer = RLTrainer(
        run_config=RunConfig(stop={"training_iteration": 5}),
        scaling_config={
            "num_workers": num_workers,
            "use_gpu": use_gpu,
        },
        algorithm="PPO",
        config={
            "env": "CartPole-v0",
            "framework": "tf",
            "evaluation_num_workers": 1,
            "evaluation_interval": 1,
            "evaluation_config": {"input": "sampler"},
        },
    )
    # Todo (krfricke/xwjiang): Enable checkpoint config in RunConfig
    # result = trainer.fit()
    tuner = Tuner(
        trainer,
        _tuner_kwargs={"checkpoint_at_end": True},
    )
    result = tuner.fit()[0]
    return result
def train_rl_bc_offline(path: str, num_workers: int, use_gpu: bool = False) -> Result:
    print("Starting offline training")
    dataset = ray.data.read_json(
        path, parallelism=num_workers, ray_remote_args={"num_cpus": 1}
    )

    trainer = RLTrainer(
        run_config=RunConfig(stop={"training_iteration": 5}),
        scaling_config={
            "num_workers": num_workers,
            "use_gpu": use_gpu,
        },
        datasets={"train": dataset},
        algorithm=BCTrainer,
        config={
            "env": "CartPole-v0",
            "framework": "tf",
            "evaluation_num_workers": 1,
            "evaluation_interval": 1,
            "evaluation_config": {"input": "sampler"},
        },
    )

    # Todo (krfricke/xwjiang): Enable checkpoint config in RunConfig
    # result = trainer.fit()
    tuner = Tuner(
        trainer,
        _tuner_kwargs={"checkpoint_at_end": True},
    )
    result = tuner.fit()[0]
    return result
def tune_horovod(num_workers, num_samples, use_gpu, mode="square", x_max=1.0):
    horovod_trainer = HorovodTrainer(
        train_loop_per_worker=train_loop_per_worker,
        scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
        train_loop_config={"mode": mode, "x_max": x_max},
    )

    tuner = Tuner(
        horovod_trainer,
        param_space={"train_loop_config": {"lr": tune.uniform(0.1, 1)}},
        tune_config=TuneConfig(mode="min", metric="loss", num_samples=num_samples),
        _tuner_kwargs={"fail_fast": True},
    )

    result_grid = tuner.fit()

    print("Best hyperparameters found were: ", result_grid.get_best_result().config)
def test_data_parallel_trainer(ray_start_8_cpus):
    num_workers = 2
    trainer = AssertingDataParallelTrainer(
        train_fn, scaling_config=ScalingConfig(num_workers=num_workers)
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "num_epochs": 100,
                "metric": tune.grid_search([1, 2, 3, 4, 5]),
            }
        },
        tune_config=TuneConfig(
            mode="max",
            metric="metric",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
    # + 1 for Trainable
    assert result_grid.get_dataframe()["num_cpus"].max() > num_workers + 1
def test_tuner_with_torch_trainer(self):
    """Test a successful run using torch trainer."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"), ignore_errors=True
    )
    # The following two should be tunable.
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
    scaling_config = {"num_workers": 1, "use_gpu": False}
    trainer = TorchTrainer(
        train_loop_per_worker=linear_train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        "train_loop_config": {
            "batch_size": tune.grid_search([4, 8]),
            "epochs": tune.grid_search([5, 10]),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="loss"),
    )
    results = tuner.fit()
    assert len(results) == 8
def test_retry(ray_start_4_cpus):
    def train_func():
        ckpt = session.get_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer, run_config=RunConfig(failure_config=FailureConfig(max_failures=3))
    )
    analysis = tuner.fit()._experiment_analysis
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
def tune_linear(num_workers, num_samples, use_gpu):
    train_dataset, val_dataset = get_datasets()
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    scaling_config = {"num_workers": num_workers, "use_gpu": use_gpu}
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
        datasets={"train": train_dataset, "validation": val_dataset},
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([4, 16, 32]),
                "epochs": 3,
            }
        },
        tune_config=TuneConfig(num_samples=num_samples, metric="loss", mode="min"),
    )
    result_grid = tuner.fit()
    best_result = result_grid.get_best_result()
    print(best_result)
    return best_result
def test_tuner_trainer_fail(self):
    trainer = FailingTrainer()
    param_space = {
        "scaling_config": ScalingConfig(num_workers=tune.grid_search([1, 2]))
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner_trainer_fail"),
        param_space=param_space,
        tune_config=TuneConfig(mode="max", metric="iteration"),
    )
    results = tuner.fit()
    assert len(results) == 2
    for i in range(2):
        assert results[i].error
def test_tune_error(ray_start_4_cpus):
    def train_func(config):
        raise RuntimeError("Error in training function!")

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
    )
    result_grid = tuner.fit()
    with pytest.raises(RuntimeError):
        raise result_grid[0].error
def test_tuner_with_xgboost_trainer(self):
    """Test a successful run."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner"), ignore_errors=True
    )
    trainer = XGBoostTrainer(
        label_column="target",
        params={},
        datasets={"train": gen_dataset_func_eager()},
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        "datasets": {
            "train": tune.grid_search(
                [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
            ),
        },
        "params": {
            "objective": "binary:logistic",
            "tree_method": "approx",
            "eval_metric": ["logloss", "error"],
            "eta": tune.loguniform(1e-4, 1e-1),
            "subsample": tune.uniform(0.5, 1.0),
            "max_depth": tune.randint(1, 9),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="train-error"),
        # Limit the number of trials running at one time,
        # as the unit test only has access to 4 CPUs on Buildkite.
        _tuner_kwargs={"max_concurrent_trials": 1},
    )
    results = tuner.fit()
    assert not isinstance(results.get_best_result().checkpoint, TrialCheckpoint)
    assert len(results) == 4
def test_tuner_with_xgboost_trainer(self):
    """Test a successful run."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner"), ignore_errors=True
    )
    trainer = XGBoostTrainer(
        label_column="target",
        params={},
        # TODO(xwjiang): change when dataset out-of-band ser/des is landed.
        datasets={"train": gen_dataset_func_eager()},
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
        # is resolved.
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        # "datasets": {
        #     "train": tune.choice(
        #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
        #     ),
        # },
        "params": {
            "objective": "binary:logistic",
            "tree_method": "approx",
            "eval_metric": ["logloss", "error"],
            "eta": tune.loguniform(1e-4, 1e-1),
            "subsample": tune.uniform(0.5, 1.0),
            "max_depth": tune.randint(1, 9),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="train-error"),
    )
    results = tuner.fit()
    assert not isinstance(results.get_best_result().checkpoint, TrialCheckpoint)
    assert len(results) == 2
def tune_tensorflow_mnist(num_workers, num_samples):
    trainer = TensorflowTrainer(
        train_func, scaling_config=ScalingConfig(num_workers=num_workers)
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            },
        },
        tune_config=TuneConfig(num_samples=num_samples),
    )
    analysis = tuner.fit()
    best_loss = analysis.get_best_result(metric="loss", mode="min")
    best_accuracy = analysis.get_best_result(metric="accuracy", mode="max")
    print(f"Best loss result: {best_loss}")
    print(f"Best accuracy result: {best_accuracy}")
    return analysis
def test_tune(ray_start_4_cpus):
    def train_func(config):
        session.report({"loss": config["x"]})

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={"x": 100},
        scaling_config=scale_config,
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"x": tune.choice([200, 300])}},
        tune_config=TuneConfig(num_samples=2),
    )
    result_grid = tuner.fit()
    assert result_grid[0].metrics["loss"] in [200, 300]

    # Make sure the original Trainer is not affected.
    assert trainer._train_loop_config["x"] == 100
def test_tuner_trainer_fail(self):
    class DummyTrainer(Trainer):
        def training_loop(self) -> None:
            raise RuntimeError("There is an error in trainer!")

    trainer = DummyTrainer()
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        }
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner_trainer_fail"),
        param_space=param_space,
        tune_config=TuneConfig(mode="max", metric="iteration"),
    )
    results = tuner.fit()
    assert len(results) == 2
    for i in range(2):
        assert results[i].error
def test_tuner_with_torch_trainer(self):
    """Test a successful run using torch trainer."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"), ignore_errors=True
    )
    # The following two should be tunable.
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
    scaling_config = {"num_workers": 1, "use_gpu": False}
    trainer = TorchTrainer(
        train_loop_per_worker=linear_train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
        # is resolved.
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        # "datasets": {
        #     "train": tune.choice(
        #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
        #     ),
        # },
        "train_loop_config": {
            "batch_size": tune.grid_search([4, 8]),
            "epochs": tune.grid_search([5, 10]),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="loss"),
    )
    results = tuner.fit()
    assert len(results) == 8
def tune_tensorflow_mnist(
    num_workers: int = 2, num_samples: int = 2, use_gpu: bool = False
):
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        tune_config=TuneConfig(num_samples=num_samples, metric="accuracy", mode="max"),
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            }
        },
    )
    best_accuracy = tuner.fit().get_best_result().metrics["accuracy"]
    print(f"Best accuracy: {best_accuracy}")
def test_tune_checkpoint(ray_start_4_cpus):
    def train_func():
        for i in range(9):
            session.report(dict(test=i))
        session.report(
            dict(test=i + 1), checkpoint=Checkpoint.from_dict(dict(hello="world"))
        )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 5}},
    )
    [trial] = tuner.fit()._experiment_analysis.trials
    checkpoint_path = trial.checkpoint.dir_or_data
    assert os.path.exists(checkpoint_path)
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["hello"] == "world"
def torch_fashion_mnist(num_workers, use_gpu, num_samples):
    trainer = TorchTrainer(
        fashion_mnist_train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 2,
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis

    # Check that loss decreases in each trial.
    for path, df in analysis.trial_dataframes.items():
        assert df.loc[1, "loss"] < df.loc[0, "loss"]
def test_tune_torch_get_device_gpu(ray_2_node_4_gpu, num_gpus_per_worker):
    from ray import tune
    from ray.tune.tuner import Tuner, TuneConfig

    num_samples = 2

    @patch("torch.cuda.is_available", lambda: True)
    def train_func():
        train.report(device_id=train.torch.get_device().index)

    trainer = TorchTrainer(
        train_func,
        torch_config=TorchConfig(backend="gloo"),
        scaling_config=ScalingConfig(
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": num_gpus_per_worker},
        ),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "dummy": tune.choice([32, 64, 128]),
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    device_ids = [trial_df["device_id"].tolist() for trial_df in trial_dfs]

    assert len(device_ids) == num_samples
    for i in range(num_samples):
        assert device_ids[i][0] == 0
def test_gbdt_trainer(ray_start_8_cpus):
    data_raw = load_breast_cancer()
    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
    dataset_df["target"] = data_raw["target"]
    train_ds = ray.data.from_pandas(dataset_df).repartition(16)
    trainer = AssertingXGBoostTrainer(
        datasets={TRAIN_DATASET_KEY: train_ds},
        label_column="target",
        scaling_config=ScalingConfig(num_workers=2),
        params={
            "objective": "binary:logistic",
            "eval_metric": ["logloss"],
        },
    )
    tuner = Tuner(
        trainer,
        param_space={
            "num_boost_round": 100,
            "params": {
                "eta": tune.grid_search([0.28, 0.29, 0.3, 0.31, 0.32]),
            },
        },
        tune_config=TuneConfig(
            mode="min",
            metric="train-logloss",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
def fit(self) -> Result:
    """Runs training.

    Returns:
        A Result object containing the training result.

    Raises:
        TrainingFailedError: If any failures occur during the execution of
            ``self.as_trainable()``.
    """
    from ray.tune.tuner import Tuner

    trainable = self.as_trainable()

    tuner = Tuner(trainable=trainable, run_config=self.run_config)
    result_grid = tuner.fit()
    assert len(result_grid) == 1
    try:
        result = result_grid[0]
        if result.error:
            raise result.error
    except TuneError:
        raise TrainingFailedError
    return result
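# A minimal usage sketch for the ``fit()`` method above, assuming a
# DataParallelTrainer wired up like the ones in the surrounding tests
# (``train_func``, ``TestConfig``, and ``ScalingConfig`` are borrowed from
# those snippets and are not part of this method's contract).
def example_fit_usage():
    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    try:
        # ``fit()`` wraps the trainable in a single-trial Tuner run and
        # returns the lone Result, or raises TrainingFailedError on failure.
        result = trainer.fit()
        print(result.metrics)
    except TrainingFailedError:
        # Inspect logs / checkpoints of the failed trial here.
        raise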
tuner = Tuner(
    trainer,
    param_space={
        "train_loop_config": {
            "lr": tune.choice([0.001, 0.01, 0.1]),
            "momentum": 0.8,
            "head_location": None,
            "worker_locations": None,
            "test_mode": args.smoke_test,
            "batch_size": 128 * num_training_workers,
            # For the long running test, we want the training to run forever,
            # and it will be terminated by the release test infra.
            "epochs": 1 if args.smoke_test else sys.maxsize,
        }
    },
    tune_config=TuneConfig(
        num_samples=4, metric="loss", mode="min", scheduler=pbt_scheduler
    ),
    run_config=RunConfig(
        stop={"training_iteration": 1} if args.smoke_test else None,
        callbacks=[
            FailureInjectorCallback(time_between_checks=90),
            ProgressCallback(),
        ],
    ),
)
results = tuner.fit()
print(results.get_best_result(metric="loss", mode="min"))
from ray.tune.tuner import Tuner, TuneConfig

ray.init(num_cpus=5)

# Generate a synthetic 100MiB tensor dataset.
dataset = ray.data.range_tensor(500, shape=(80, 80, 4), parallelism=10)

# Create an example trainer that simply loops over the data a few times.
trainer = DummyTrainer(datasets={"train": dataset}, runtime_seconds=1)

# Run the Trainer 4x in parallel with Tune.
tuner = Tuner(
    trainer,
    tune_config=TuneConfig(num_samples=4),
)
tuner.fit()
# __shared_dataset_end__

ray.shutdown()

# __indep_dataset_start__
import ray
from ray import tune
from ray.ml.utils.check_ingest import DummyTrainer
from ray.tune.tuner import Tuner, TuneConfig

ray.init(num_cpus=5)


def make_ds_1():
    """Dataset creator function 1."""
def test_tuner_with_xgboost_trainer_driver_fail_and_resume(self):
    # So that we have some global checkpointing happening.
    os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "1"
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail"),
        ignore_errors=True,
    )
    trainer = XGBoostTrainer(
        label_column="target",
        params={},
        # TODO(xwjiang): change when dataset out-of-band ser/des is landed.
        datasets={"train": gen_dataset_func_eager()},
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
        # is resolved.
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        # "datasets": {
        #     "train": tune.choice(
        #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
        #     ),
        # },
        "params": {
            "objective": "binary:logistic",
            "tree_method": "approx",
            "eval_metric": ["logloss", "error"],
            "eta": tune.loguniform(1e-4, 1e-1),
            "subsample": tune.uniform(0.5, 1.0),
            "max_depth": tune.randint(1, 9),
        },
    }

    class FailureInjectionCallback(Callback):
        """Inject failure at the configured iteration number."""

        def __init__(self, num_iters=10):
            self.num_iters = num_iters

        def on_step_end(self, iteration, trials, **kwargs):
            if iteration == self.num_iters:
                print(f"Failing after {self.num_iters} iters.")
                raise RuntimeError

    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(
            name="test_tuner_driver_fail", callbacks=[FailureInjectionCallback()]
        ),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="train-error"),
    )
    with self.assertRaises(TuneError):
        tuner.fit()

    # Test resume
    restore_path = os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail")
    tuner = Tuner.restore(restore_path)
    # A hack before we figure out RunConfig semantics across resumes.
    tuner._local_tuner._run_config.callbacks = None
    results = tuner.fit()
    assert len(results) == 2
"train_loop_config": { "lr": 0.1 if args.smoke_test else tune.grid_search( [0.1 * i for i in range(1, 10)]) } }, tune_config=TuneConfig( num_samples=2 if args.smoke_test else 1, metric="loss", mode="min", scheduler=pbt, ), run_config=RunConfig( stop={"training_iteration": 1} if args.smoke_test else None, callbacks=[ProgressCallback()], ), _tuner_kwargs={ "fail_fast": False, "keep_checkpoints_num": 1 }, ) result_grid = tuner.fit() # Make sure trials do not fail. for result in result_grid: assert not result.error print("Best hyperparameters found were: ", result_grid.get_best_result().config)