def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4):
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        config={"lr": 1e-3, "batch_size": 64, "epochs": epochs})
    trainer.shutdown()
    print(f"Results: {results[0]}")
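# `train_func` above is defined elsewhere in the MNIST example. The sketch
# below shows the usual shape of a TensorFlow training function under Ray
# Train: Ray sets TF_CONFIG on each worker, so the function only needs to
# build and fit the model inside a MultiWorkerMirroredStrategy scope. The
# dataset and model here are illustrative assumptions, not the exact example
# code.
import numpy as np
import tensorflow as tf


def train_func(config):
    # The strategy must be created before the model; it picks up the
    # TF_CONFIG environment variable that Ray Train sets per worker.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    x_train = x_train.astype(np.float32) / 255.0

    with strategy.scope():
        model = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(10),
        ])
        model.compile(
            optimizer=tf.keras.optimizers.SGD(learning_rate=config["lr"]),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True),
            metrics=["accuracy"],
        )

    history = model.fit(
        x_train,
        y_train,
        batch_size=config["batch_size"],
        epochs=config["epochs"],
        verbose=0,
    )
    # Each worker returns its metric history; trainer.run collects one entry
    # per worker, which is what `results[0]` prints above.
    return history.history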
def test_run(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == 1 for result in results)
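# `TestConfig` is used throughout these tests but not defined in this section.
# A minimal sketch of a no-op backend configuration is given here; the
# `Backend`/`BackendConfig` base classes and the import path follow Ray
# Train's backend interface, but treat this as an illustrative assumption
# rather than the exact definition used by the tests above.
from ray.train.backend import Backend, BackendConfig


class TestBackend(Backend):
    # A backend that performs no setup or teardown on the worker group.
    def on_start(self, worker_group, backend_config):
        pass

    def on_shutdown(self, worker_group, backend_config):
        pass


class TestConfig(BackendConfig):
    @property
    def backend_cls(self):
        return TestBackend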
def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus):
    """Test that model with AMP is serializable."""

    def train_func():
        train.torch.accelerate(amp=True)

        model = torchvision.models.resnet101()
        model = train.torch.prepare_model(model)

        train.save_checkpoint(model=model)

    trainer = Trainer("torch", num_workers=1, use_gpu=True)
    trainer.start()
    trainer.run(train_func)
    trainer.shutdown()
def test_max_failures(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        import sys
        sys.exit(0)

    trainer = Trainer(test_config, num_workers=2)
    trainer.start()
    iterator = trainer.run_iterator(train_func)
    with pytest.raises(RuntimeError):
        iterator.get_final_results(force=True)
    assert ray.get(
        iterator._backend_executor_actor._get_num_failures.remote()) == 3
def test_worker_kill_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        checkpoint = train.load_checkpoint()
        if checkpoint:
            epoch = checkpoint["epoch"]
        else:
            epoch = 0
        print("Epoch: ", epoch)
        for i in range(epoch, 2):
            train.report(loss=1, iter=i)
            train.save_checkpoint(epoch=i + 1)

    trainer = Trainer(test_config, num_workers=2)
    trainer.start()
    kill_callback = KillCallback(fail_on=0, trainer=trainer)

    trainer.run(train_func, callbacks=[kill_callback])

    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint is saved.*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from beginning*
    # Run 2: epoch=0, counter=2, Successful
    # Run 3: epoch=1, counter=3, Successful
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(fail_on=1, trainer=trainer)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint saved*
    # *Latest checkpoint updated, epoch=1*
    # Run 2: epoch=1, counter=2, Successful
    # *Checkpoint saved*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from last checkpoint.*
    # Run 3: epoch=1, counter=3, Successful.
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
def test_horovod_torch_mnist_gpu(ray_start_4_cpus_2_gpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers, use_gpu=True)
    trainer.start()
    results = trainer.run(
        horovod_torch_train_func,
        config={"num_epochs": num_epochs, "lr": 1e-3})
    trainer.shutdown()

    assert len(results) == num_workers
    for worker_result in results:
        assert len(worker_result) == num_epochs
        assert worker_result[num_epochs - 1] < worker_result[0]
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    class CudaTestBackend(TestBackend):
        share_cuda_visible_devices = True

    class CudaTestConfig(TestConfig):
        @property
        def backend_cls(self):
            return CudaTestBackend

    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            CudaTestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            CudaTestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        CudaTestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        CudaTestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
def test_torch_amp(ray_start_2_cpus):
    def train_fn():
        train.torch.accelerate(amp=True)
        model = torch.nn.Linear(1, 1)
        model = train.torch.prepare_model(model)

        # Make sure model is serializable even with amp enabled.
        return model.module

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()

    trainer.run(train_fn)
    trainer.shutdown()
def train_linear(num_workers=2, use_gpu=False):
    datasets = get_datasets()

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        dataset=datasets,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()],
    )
    trainer.shutdown()
    print(results)
    return results
def test_torch_linear(ray_start_2_cpus, num_workers):
    num_workers = num_workers
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(linear_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers

    for result in results:
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]
def test_run_iterator_error(ray_start_2_cpus):
    config = TestConfig()

    def fail_train():
        raise NotImplementedError

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    iterator = trainer.run_iterator(fail_train)

    with pytest.raises(NotImplementedError):
        next(iterator)

    assert iterator.get_final_results() is None
    assert iterator.is_finished()
def test_torch_fashion_mnist(ray_start_2_cpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    results = trainer.run(fashion_mnist_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers

    for result in results:
        assert len(result) == epochs
        assert result[-1] < result[0]
def test_no_exhaust(ray_start_2_cpus, tmp_path):
    """Tests if training can finish even if queue is not exhausted."""

    def train_func():
        for _ in range(2):
            train.report(loss=1)
        return 2

    config = TestConfig()
    trainer = Trainer(config, num_workers=2)
    trainer.start()

    iterator = trainer.run_iterator(train_func)
    output = iterator.get_final_results(force=True)

    assert output == [2, 2]
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    trainer = Trainer(
        backend="torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()])
    trainer.shutdown()

    print(results)
    return results
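# The torch `train_func` / `linear_train_func` used by the linear examples and
# tests above is defined elsewhere. A minimal sketch of the shape such a
# function could take with the Ray Train function API (train.report,
# train.torch.prepare_model, train.torch.prepare_data_loader) is shown below;
# the synthetic data, model, and loop details are illustrative assumptions,
# not the exact example code.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from ray import train


def train_func(config):
    # Hypothetical synthetic linear data: y = 2x + noise.
    x = torch.rand(100, 1)
    y = 2 * x + 0.1 * torch.randn(100, 1)
    loader = DataLoader(
        TensorDataset(x, y), batch_size=config["batch_size"], shuffle=True)
    # Wrap the loader with a DistributedSampler and move batches to the
    # right device.
    loader = train.torch.prepare_data_loader(loader)

    model = nn.Linear(1, config["hidden_size"])
    # Wrap the model in DistributedDataParallel and move it to the device.
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])

    results = []
    for _ in range(config["epochs"]):
        for inputs, targets in loader:
            optimizer.zero_grad()
            loss = loss_fn(model(inputs), targets)
            loss.backward()
            optimizer.step()
        # Report the last batch loss of this epoch to Ray Train, and also
        # return one entry per epoch so the tests above can compare
        # result[-1]["loss"] with result[0]["loss"].
        result = {"loss": loss.item()}
        train.report(**result)
        results.append(result)
    return results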
def test_run_config(ray_start_2_cpus):
    backend_config = TestConfig()

    def train_func(config):
        return config["fruit"]

    config = {"fruit": "banana"}

    trainer = Trainer(backend_config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func, config)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == "banana" for result in results)
def test_mlflow(ray_start_4_cpus, tmp_path):
    config = TestConfig()

    params = {"p1": "p1"}

    temp_dir = tmp_path
    num_workers = 4

    def train_func(config):
        train.report(episode_reward_mean=4)
        train.report(episode_reward_mean=5)
        train.report(episode_reward_mean=6)
        return 1

    callback = MLflowLoggerCallback(
        experiment_name="test_exp", logdir=temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, config=params, callbacks=[callback])

    from mlflow.tracking import MlflowClient

    client = MlflowClient(
        tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri())

    experiment_id = client.get_experiment_by_name("test_exp").experiment_id
    all_runs = callback.mlflow_util._mlflow.search_runs(
        experiment_ids=[experiment_id])
    assert len(all_runs) == 1
    # all_runs is a pandas dataframe.
    all_runs = all_runs.to_dict(orient="records")
    run_id = all_runs[0]["run_id"]

    run = client.get_run(run_id)

    assert run.data.params == params
    assert ("episode_reward_mean" in run.data.metrics
            and run.data.metrics["episode_reward_mean"] == 6.0)
    assert (TRAINING_ITERATION in run.data.metrics
            and run.data.metrics[TRAINING_ITERATION] == 3.0)

    metric_history = client.get_metric_history(
        run_id=run_id, key="episode_reward_mean")

    assert len(metric_history) == 3

    iterations = [metric.step for metric in metric_history]
    assert iterations == [1, 2, 3]

    rewards = [metric.value for metric in metric_history]
    assert rewards == [4, 5, 6]
def test_start_shutdown(ray_start_2_cpus, num_workers):
    config = TestConfig()
    assert ray.available_resources()["CPU"] == 2
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    time.sleep(1)

    remaining = 2 - num_workers
    if remaining == 0:
        assert "CPU" not in ray.available_resources()
    else:
        assert ray.available_resources()["CPU"] == remaining

    trainer.shutdown()
    time.sleep(1)
    assert ray.available_resources()["CPU"] == 2
def train_tensorflow_linear(num_workers=2, use_gpu=False):
    dataset_pipeline = get_dataset_pipeline()
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        dataset=dataset_pipeline,
        config={"lr": 1e-3, "batch_size": 32, "epochs": 4})
    trainer.shutdown()
    print(f"Results: {results[0]}")
    return results
def main(num_workers=2, use_gpu=False):
    trainer = Trainer(
        backend="torch", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    final_results = trainer.run(
        train_func=train_func,
        config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
        callbacks=[
            MLflowLoggerCallback(experiment_name="train_fashion_mnist")
        ])

    print("Full losses for rank 0 worker: ", final_results)
def test_torch_linear_failure(ray_start_4_cpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    kill_callback = KillCallback(fail_on=1, trainer=trainer)
    results = trainer.run(
        linear_train_func, config, callbacks=[kill_callback])
    trainer.shutdown()

    assert len(results) == num_workers

    for result in results:
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]
def test_run_after_user_error(ray_start_2_cpus):
    config = TestConfig()

    def fail_train():
        raise NotImplementedError

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    with pytest.raises(NotImplementedError):
        trainer.run(fail_train)

    def train_func():
        return 1

    output = trainer.run(train_func)
    assert output == [1, 1]
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
def test_load_checkpoint_from_path(ray_start_2_cpus, tmpdir):
    config = TestConfig()

    checkpoint_strategy = CheckpointStrategy(
        checkpoint_score_attribute="loss", checkpoint_score_order="min")

    def train_func_checkpoint():
        train.save_checkpoint(loss=3)
        train.save_checkpoint(loss=7)

    trainer = Trainer(config, num_workers=2, logdir=tmpdir)
    trainer.start()
    trainer.run(
        train_func_checkpoint, checkpoint_strategy=checkpoint_strategy)

    assert trainer.best_checkpoint["loss"] == 3
    assert (Trainer.load_checkpoint_from_path(
        trainer.best_checkpoint_path) == trainer.best_checkpoint)
def test_multiple_run(ray_start_2_cpus):
    config = TestConfig()

    def train_1():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()

    output_1 = trainer.run(train_1)
    assert output_1 == [1, 1]

    def train_2():
        return 2

    output_2 = trainer.run(train_2)
    assert output_2 == [2, 2]
def test_mismatch_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)

    def train_mismatch():
        train.save_checkpoint(epoch=0)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)

    with patch.object(ray.train.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train_func)
def main():
    args = parse_args()
    config = {"args": args}

    if args.start_local or args.address or args.num_workers > 1 or args.use_gpu:
        if args.start_local:
            # Start a local Ray runtime.
            ray.init(num_cpus=args.num_workers)
        else:
            # Connect to a Ray cluster for distributed training.
            ray.init(address=args.address)
        trainer = Trainer(
            "torch", num_workers=args.num_workers, use_gpu=args.use_gpu)
        trainer.start()
        trainer.run(train_func, config)
    else:
        # Run training locally.
        train_func(config)
def start_ray_train(config, num_workers=4, use_gpu=False):
    """Train the model using Ray Train.

    `num_workers` determines the number of distributed training processes.
    Uses the same config as local training.
    """
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    start_time = time.time()
    results = trainer.run(train_epochs_remote, config=config)
    duration = time.time() - start_time
    trainer.shutdown()
    return None, results, duration
def test_run_iterator_returns(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        for i in range(3):
            train.report(index=i)
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    iterator = trainer.run_iterator(train_func)

    assert iterator.get_final_results() is None
    assert iterator.get_final_results(force=True) == [1, 1]

    with pytest.raises(StopIteration):
        next(iterator)
def test_worker_failure_1(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        return 1

    def train_actor_failure():
        import sys
        sys.exit(0)

    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)

    with patch.object(ray.train.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        results = trainer.run(train_func)
        assert results == [1, 1]
def test_run_failure(ray_start_2_cpus):
    test_config = TestConfig()

    def train_invalid_signature(a, b):
        pass

    trainer = Trainer(test_config, num_workers=2)

    # Raise RuntimeError when trainer has not been started yet.
    with pytest.raises(RuntimeError):
        trainer.run(lambda: 1)

    trainer.start()

    with pytest.raises(ValueError):
        trainer.run(train_invalid_signature)

    trainer.shutdown()