def test_checkpoint(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        assert train.load_checkpoint() is None
        for i in range(3):
            train.save_checkpoint(epoch=i)
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    trainer.run(train_func)
    checkpoint = trainer.latest_checkpoint

    assert checkpoint is not None
    assert checkpoint["epoch"] == 2

    def train_func_checkpoint():
        checkpoint = train.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint["epoch"] == 2

        for i in range(checkpoint["epoch"], 5):
            train.save_checkpoint(epoch=i)
        return 1

    trainer.run(train_func_checkpoint, checkpoint=checkpoint)
    checkpoint = trainer.latest_checkpoint

    assert checkpoint is not None
    assert checkpoint["epoch"] == 4
def test_tf_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without TF MultiWorkerMirroredStrategy."""
    # Use a non-TF backend so no MultiWorkerMirroredStrategy is set up.
    trainer = Trainer(backend="torch", num_workers=1)
    trainer.start()
    trainer.run(tf_quick_start_train_func)
    trainer.shutdown()
def test_mismatch_checkpoint_report(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            train.report(index=i)

    def train_mismatch():
        train.save_checkpoint(epoch=0)
        train.report(index=0)
        # skip checkpoint
        train.report(index=1)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)
    callback = TestCallback()

    with patch.object(ray.train.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train_func, callbacks=[callback])

    # validate checkpoint
    assert trainer.latest_checkpoint["epoch"] == 0

    # validate callback
    result_list = callback.result_list
    assert len(result_list) == 1  # 1 epoch succeeded
    intermediate_results = result_list[0]
    assert len(intermediate_results) == 2  # both workers reported
    for worker_result in intermediate_results:
        assert worker_result["index"] == 0
def test_fast_slow(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            train.report(index=i)

    def train_slow():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            time.sleep(5)
            train.report(index=i)
            time.sleep(5)

    new_backend_executor_cls = gen_new_backend_executor(train_slow)
    callback = TestCallback()

    with patch.object(ray.train.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        trainer.run(train_func, callbacks=[callback])

    assert trainer.latest_checkpoint["epoch"] == 1

    result_list = callback.result_list
    assert len(result_list) == 2
    for index in range(len(result_list)):
        intermediate_results = result_list[index]
        assert len(intermediate_results) == 2
        for worker_result in intermediate_results:
            assert worker_result["index"] == index
def test_persisted_checkpoint(ray_start_2_cpus, logdir):
    config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            time.sleep(1)

    trainer = Trainer(config, num_workers=2, logdir=logdir)
    trainer.start()
    trainer.run(train_func)

    assert trainer.best_checkpoint_path is not None
    if logdir is not None:
        assert trainer.logdir == Path(logdir).expanduser().resolve()
    assert trainer.latest_checkpoint_dir.is_dir()
    assert trainer.best_checkpoint_path.is_file()
    assert trainer.best_checkpoint_path.name == f"checkpoint_{2:06d}"
    assert trainer.best_checkpoint_path.parent.name == "checkpoints"
    latest_checkpoint = trainer.latest_checkpoint

    def validate():
        checkpoint = train.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint == latest_checkpoint

    trainer.run(validate, checkpoint=trainer.best_checkpoint_path)
def test_torch_auto_unwrap(ray_start_2_cpus):
    """Tests if underlying model from DDP is extracted when saving ckpt."""

    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Save DDP wrapped model.
        train.save_checkpoint(model=model)

        # Report DDP wrapped model.
        train.report(model=model)

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()

    class ValidateEncodedCallback(TrainingCallback):
        def handle_result(self, results, **info):
            for result in results:
                model = result["model"]
                assert isinstance(model, torch.nn.Module) and not \
                    isinstance(model, torch.nn.parallel.DistributedDataParallel)

    trainer.run(train_fn, callbacks=[ValidateEncodedCallback()])

    last_checkpoint = trainer.latest_checkpoint
    model = last_checkpoint["model"]
    assert isinstance(model, torch.nn.Module) and not \
        isinstance(model, torch.nn.parallel.DistributedDataParallel)

    trainer.shutdown()
def test_torch_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without torch DDP."""
    trainer = Trainer(backend="torch", num_workers=1)
    trainer.start()
    trainer.run(torch_quick_start_train_func)
    trainer.shutdown()
def test_dataset_fault_tolerance(ray_start_4_cpus):
    dataset = ray.data.range(10)
    test_config = TestConfig()

    def train_func():
        return train.get_dataset_shard()

    def train_actor_failure():
        import sys
        sys.exit(0)

    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)

    class SingleGetDatasetShardsBackendExecutor(new_backend_executor_cls):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self._has_called_get_dataset_shards = False

        def _get_dataset_shards(self, dataset_or_dict):
            if self._has_called_get_dataset_shards:
                raise Exception
            self._has_called_get_dataset_shards = True
            return super()._get_dataset_shards(dataset_or_dict)

    with patch.object(ray.train.trainer, "BackendExecutor",
                      SingleGetDatasetShardsBackendExecutor):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        trainer.run(train_func, dataset=dataset)
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    class CudaTestBackend(TestBackend):
        share_cuda_visible_devices = True

    class CudaTestConfig(TestConfig):
        @property
        def backend_cls(self):
            return CudaTestBackend

    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            CudaTestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            CudaTestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        CudaTestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        CudaTestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
def test_json(monkeypatch, ray_start_4_cpus, make_temp_dir, workers_to_log,
              detailed, filename):
    if detailed:
        monkeypatch.setenv(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, "1")

    config = TestConfig()

    num_iters = 5
    num_workers = 4

    if workers_to_log is None:
        num_workers_to_log = num_workers
    elif isinstance(workers_to_log, int):
        num_workers_to_log = 1
    else:
        num_workers_to_log = len(workers_to_log)

    def train_func():
        for i in range(num_iters):
            train.report(index=i)
        return 1

    if filename is None:
        # if None, use default value
        callback = JsonLoggerCallback(workers_to_log=workers_to_log)
    else:
        callback = JsonLoggerCallback(
            filename=filename, workers_to_log=workers_to_log)

    trainer = Trainer(config, num_workers=num_workers, logdir=make_temp_dir)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    if filename is None:
        assert str(
            callback.log_path.name) == JsonLoggerCallback._default_filename
    else:
        assert str(callback.log_path.name) == filename

    with open(callback.log_path, "r") as f:
        log = json.load(f)
    print(log)

    assert len(log) == num_iters
    assert len(log[0]) == num_workers_to_log
    assert all(len(element) == len(log[0]) for element in log)
    assert all(
        all(worker["index"] == worker[TRAINING_ITERATION] - 1
            for worker in element) for element in log)
    assert all(
        all(
            all(key in worker for key in BASIC_AUTOFILLED_KEYS)
            for worker in element) for element in log)
    if detailed:
        assert all(
            all(
                all(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
    else:
        assert all(
            all(not any(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
def latency(amp: bool) -> float:
    trainer = Trainer("torch", num_workers=2, use_gpu=True)
    trainer.start()
    start_time = timer()
    trainer.run(train_func, {"amp": amp})
    end_time = timer()
    trainer.shutdown()
    return end_time - start_time
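# Hypothetical usage sketch, not part of the original benchmark code: compare the
# wall-clock latency of the same run with and without AMP, assuming `train_func`
# and a GPU-enabled Ray cluster are available as in the surrounding code.
def compare_amp_latency():
    amp_time = latency(amp=True)
    full_precision_time = latency(amp=False)
    print(f"AMP: {amp_time:.2f}s, full precision: {full_precision_time:.2f}s")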
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus):
    """Test that model with AMP is serializable."""

    def train_func():
        train.torch.accelerate(amp=True)

        model = torchvision.models.resnet101()
        model = train.torch.prepare_model(model)

        train.save_checkpoint(model=model)

    trainer = Trainer("torch", num_workers=1, use_gpu=True)
    trainer.start()
    trainer.run(train_func)
    trainer.shutdown()
def test_torch_amp(ray_start_2_cpus):
    def train_fn():
        train.torch.accelerate(amp=True)
        model = torch.nn.Linear(1, 1)
        model = train.torch.prepare_model(model)

        # Make sure model is serializable even with amp enabled.
        return model.module

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()

    trainer.run(train_fn)
    trainer.shutdown()
def test_mlflow(ray_start_4_cpus, tmp_path):
    config = TestConfig()

    params = {"p1": "p1"}

    temp_dir = tmp_path
    num_workers = 4

    def train_func(config):
        train.report(episode_reward_mean=4)
        train.report(episode_reward_mean=5)
        train.report(episode_reward_mean=6)
        return 1

    callback = MLflowLoggerCallback(experiment_name="test_exp", logdir=temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, config=params, callbacks=[callback])

    from mlflow.tracking import MlflowClient

    client = MlflowClient(
        tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri())

    experiment_id = client.get_experiment_by_name("test_exp").experiment_id
    all_runs = callback.mlflow_util._mlflow.search_runs(
        experiment_ids=[experiment_id])
    assert len(all_runs) == 1
    # all_runs is a pandas dataframe.
    all_runs = all_runs.to_dict(orient="records")
    run_id = all_runs[0]["run_id"]
    run = client.get_run(run_id)

    assert run.data.params == params
    assert ("episode_reward_mean" in run.data.metrics
            and run.data.metrics["episode_reward_mean"] == 6.0)
    assert (TRAINING_ITERATION in run.data.metrics
            and run.data.metrics[TRAINING_ITERATION] == 3.0)

    metric_history = client.get_metric_history(
        run_id=run_id, key="episode_reward_mean")

    assert len(metric_history) == 3
    iterations = [metric.step for metric in metric_history]
    assert iterations == [1, 2, 3]
    rewards = [metric.value for metric in metric_history]
    assert rewards == [4, 5, 6]
def test_dataset_pipeline(ray_start_4_cpus):
    """Checks that Pipeline is correctly sharded even with multiple epochs."""
    num_epochs = 2
    num_data = 10

    dataset = ray.data.range(num_data).repeat()

    def get_dataset():
        pipeline_iterator = train.get_dataset_shard().iter_epochs()
        data_all_epochs = []
        for _ in range(num_epochs):
            dataset_this_epoch = next(pipeline_iterator)
            data_this_epoch = []
            for batch in dataset_this_epoch.iter_batches(batch_format="native"):
                data_this_epoch.extend(batch)
            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)
    check_dataset_output(num_data, num_epochs, results)
def test_torch_get_device_dist(ray_2_node_4_gpu, num_gpus_per_worker):
    @patch("torch.cuda.is_available", lambda: True)
    def train_fn():
        return train.torch.get_device().index

    trainer = Trainer(
        TorchConfig(backend="gloo"),
        num_workers=int(8 / num_gpus_per_worker),
        use_gpu=True,
        resources_per_worker={"GPU": num_gpus_per_worker},
    )
    trainer.start()
    devices = trainer.run(train_fn)
    trainer.shutdown()

    count = Counter(devices)
    if num_gpus_per_worker == 0.5:
        for i in range(4):
            assert count[i] == 4
    elif num_gpus_per_worker == 1:
        for i in range(4):
            assert count[i] == 2
    elif num_gpus_per_worker == 2:
        for i in range(2):
            assert count[2 * i] == 2
    else:
        raise RuntimeError(
            "New parameter for this test has been added without checking that the "
            "correct devices have been returned.")
def test_dataset_pipeline_shuffle(ray_start_4_cpus):
    num_epochs = 2
    num_data = 20

    dataset = ray.data.range(num_data).repeat().random_shuffle_each_window()

    def get_dataset():
        pipeline_iterator = train.get_dataset_shard().iter_epochs()
        data_all_epochs = []
        for _ in range(num_epochs):
            dataset_this_epoch = next(pipeline_iterator)
            data_this_epoch = []
            for batch in dataset_this_epoch.iter_batches(batch_format="native"):
                data_this_epoch.extend(batch)

            if len(data_all_epochs) > 0:
                # Make sure data is shuffled per epoch.
                assert data_this_epoch != data_all_epochs[-1]

            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)
    check_dataset_output(num_data, num_epochs, results)
def test_run_after_user_error(ray_start_2_cpus):
    config = TestConfig()

    def fail_train():
        raise NotImplementedError

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    with pytest.raises(NotImplementedError):
        trainer.run(fail_train)

    def train_func():
        return 1

    output = trainer.run(train_func)
    assert output == [1, 1]
def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4):
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        config={"lr": 1e-3, "batch_size": 64, "epochs": epochs})
    trainer.shutdown()
    print(f"Results: {results[0]}")
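# Hypothetical entry point, not in the original snippet: run the MNIST example on a
# local Ray runtime; `ray` is assumed to be imported and `train_func` defined
# elsewhere in the example.
if __name__ == "__main__":
    ray.init(num_cpus=4)
    train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4)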
def test_load_checkpoint_from_path(ray_start_2_cpus, tmpdir):
    config = TestConfig()

    checkpoint_strategy = CheckpointStrategy(
        checkpoint_score_attribute="loss", checkpoint_score_order="min")

    def train_func_checkpoint():
        train.save_checkpoint(loss=3)
        train.save_checkpoint(loss=7)

    trainer = Trainer(config, num_workers=2, logdir=tmpdir)
    trainer.start()
    trainer.run(train_func_checkpoint, checkpoint_strategy=checkpoint_strategy)

    assert trainer.best_checkpoint["loss"] == 3
    assert (Trainer.load_checkpoint_from_path(
        trainer.best_checkpoint_path) == trainer.best_checkpoint)
def test_multiple_run(ray_start_2_cpus):
    config = TestConfig()

    def train_1():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()

    output_1 = trainer.run(train_1)
    assert output_1 == [1, 1]

    def train_2():
        return 2

    output_2 = trainer.run(train_2)
    assert output_2 == [2, 2]
def main():
    args = parse_args()
    config = {"args": args}

    if args.start_local or args.address or args.num_workers > 1 or args.use_gpu:
        if args.start_local:
            # Start a local Ray runtime.
            ray.init(num_cpus=args.num_workers)
        else:
            # Connect to a Ray cluster for distributed training.
            ray.init(address=args.address)
        trainer = Trainer(
            "torch", num_workers=args.num_workers, use_gpu=args.use_gpu)
        trainer.start()
        trainer.run(train_func, config)
    else:
        # Run training locally.
        train_func(config)
def test_mismatch_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)

    def train_mismatch():
        train.save_checkpoint(epoch=0)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)

    with patch.object(ray.train.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train_func)
def test_run_failure(ray_start_2_cpus):
    test_config = TestConfig()

    def train_invalid_signature(a, b):
        pass

    trainer = Trainer(test_config, num_workers=2)

    # Raise RuntimeError when trainer has not been started yet.
    with pytest.raises(RuntimeError):
        trainer.run(lambda: 1)

    trainer.start()

    with pytest.raises(ValueError):
        trainer.run(train_invalid_signature)

    trainer.shutdown()
def test_TBX(ray_start_4_cpus, tmp_path):
    config = TestConfig()

    temp_dir = tmp_path
    num_workers = 4

    def train_func():
        train.report(episode_reward_mean=4)
        train.report(episode_reward_mean=5)
        train.report(
            episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1})
        return 1

    callback = TBXLoggerCallback(temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    _validate_tbx_result(temp_dir)
def test_torch_prepare_model(ray_start_4_cpus_2_gpus):
    """Tests if ``prepare_model`` correctly wraps in DDP."""

    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Make sure model is wrapped in DDP.
        assert isinstance(model, DistributedDataParallel)

        # Make sure model is on cuda.
        assert next(model.parameters()).is_cuda

    trainer = Trainer("torch", num_workers=2, use_gpu=True)
    trainer.start()
    trainer.run(train_fn)
    trainer.shutdown()
def test_print(ray_start_4_cpus):
    num_workers = 4

    def train_func():
        train.report(rank=train.world_rank())

    stream = io.StringIO()
    with redirect_stdout(stream):
        trainer = Trainer(TestConfig(), num_workers=num_workers)
        trainer.start()
        trainer.run(train_func, callbacks=[PrintCallback()])
        trainer.shutdown()

    output = stream.getvalue()
    results = json.loads(output)

    assert len(results) == num_workers
    for i, result in enumerate(results):
        assert set(result.keys()) == (BASIC_AUTOFILLED_KEYS | {"rank"})
        assert result["rank"] == i
def test_world_rank(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return train.world_rank()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)

    assert set(results) == {0, 1}
def test_env_var(ray_start_2_cpus):
    """Tests if Train env vars are propagated to the BackendExecutor."""
    config = TestConfig()

    os.environ[TRAIN_ENABLE_WORKER_SPREAD_ENV] = "1"

    class EnvBackendExecutor(BackendExecutor):
        def __init__(self, *args, **kwargs):
            assert TRAIN_ENABLE_WORKER_SPREAD_ENV in os.environ and \
                os.environ[TRAIN_ENABLE_WORKER_SPREAD_ENV] == "1"
            super().__init__(*args, **kwargs)

    with patch.object(ray.train.trainer, "BackendExecutor", EnvBackendExecutor):
        trainer = Trainer(config, num_workers=1)
        trainer.start()
        trainer.run(lambda: 1)
        trainer.shutdown()

    del os.environ[TRAIN_ENABLE_WORKER_SPREAD_ENV]