def test_checkpoint(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        assert sgd.load_checkpoint() is None
        for i in range(3):
            sgd.save_checkpoint(epoch=i)
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    trainer.run(train_func)
    checkpoint = trainer.latest_checkpoint
    assert checkpoint is not None
    assert checkpoint["epoch"] == 2

    def train_func_checkpoint():
        checkpoint = sgd.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint["epoch"] == 2

        for i in range(checkpoint["epoch"], 5):
            sgd.save_checkpoint(epoch=i)
        return 1

    trainer.run(train_func_checkpoint, checkpoint=checkpoint)
    checkpoint = trainer.latest_checkpoint
    assert checkpoint is not None
    assert checkpoint["epoch"] == 4

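# TestConfig is a helper defined elsewhere in this test module. A minimal
# sketch of what such a no-op backend config might look like (the base-class
# and hook names below are assumptions, not the authoritative definitions):
#
#     class TestBackend(BackendInterface):
#         def on_start(self, worker_group, backend_config):
#             pass  # nothing to set up for plain Python training functions
#
#         def on_shutdown(self, worker_group, backend_config):
#             pass  # nothing to tear down
#
#     class TestConfig(BackendConfig):
#         @property
#         def backend_cls(self):
#             return TestBackend
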
def test_fast_slow(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)
            sgd.report(index=i)

    def train_slow():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)
            time.sleep(5)
            sgd.report(index=i)
            time.sleep(5)

    new_backend_executor_cls = gen_new_backend_executor(train_slow)
    callback = TestCallback()

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        trainer.run(train, callbacks=[callback])

    assert trainer.latest_checkpoint["epoch"] == 1

    result_list = callback.result_list
    assert len(result_list) == 2
    for index in range(len(result_list)):
        intermediate_results = result_list[index]
        assert len(intermediate_results) == 2
        for worker_result in intermediate_results:
            assert worker_result["index"] == index

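# gen_new_backend_executor and TestCallback are helpers from this test module.
# gen_new_backend_executor(fn) is assumed to return a BackendExecutor subclass
# that runs `fn` on one worker while the remaining workers run the training
# function passed to trainer.run(), so the two can intentionally diverge.
# A minimal sketch of what TestCallback might look like, inferred from how
# `result_list` is used above (an assumption, not the real helper):
#
#     class TestCallback(SGDCallback):
#         def __init__(self):
#             self.result_list = []
#
#         def handle_result(self, results, **info):
#             # `results` is the list of dicts reported by all workers for a
#             # single sgd.report() call.
#             self.result_list.append(results)
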
def test_json(ray_start_4_cpus, make_temp_dir, workers_to_log, detailed,
              filename):
    if detailed:
        os.environ[ENABLE_DETAILED_AUTOFILLED_METRICS_ENV] = "1"
    else:
        os.environ.pop(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

    config = TestConfig()

    num_iters = 5
    num_workers = 4

    if workers_to_log is None:
        num_workers_to_log = num_workers
    elif isinstance(workers_to_log, int):
        num_workers_to_log = 1
    else:
        num_workers_to_log = len(workers_to_log)

    def train_func():
        for i in range(num_iters):
            sgd.report(index=i)
        return 1

    if filename is None:
        # if None, use default value
        callback = JsonLoggerCallback(
            make_temp_dir, workers_to_log=workers_to_log)
        assert str(
            callback.log_path.name) == JsonLoggerCallback._default_filename
    else:
        callback = JsonLoggerCallback(
            make_temp_dir, filename=filename, workers_to_log=workers_to_log)
        assert str(callback.log_path.name) == filename

    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    with open(callback.log_path, "r") as f:
        log = json.load(f)
    print(log)
    assert len(log) == num_iters
    assert len(log[0]) == num_workers_to_log
    assert all(len(element) == len(log[0]) for element in log)
    assert all(
        all(worker["index"] == worker[TRAINING_ITERATION] - 1
            for worker in element) for element in log)
    assert all(
        all(
            all(key in worker for key in BASIC_AUTOFILLED_KEYS)
            for worker in element) for element in log)
    if detailed:
        assert all(
            all(
                all(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
    else:
        assert all(
            all(not any(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)

def test_persisted_checkpoint(ray_start_2_cpus, logdir):
    config = TestConfig()

    def train():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)

    trainer = Trainer(config, num_workers=2, logdir=logdir)
    trainer.start()
    trainer.run(train)

    assert trainer.latest_checkpoint_path is not None
    if logdir is not None:
        assert trainer.logdir == Path(logdir).expanduser().resolve()
    assert trainer.latest_checkpoint_dir.is_dir()
    assert trainer.latest_checkpoint_path.is_file()
    assert trainer.latest_checkpoint_path.name == f"checkpoint_{2:06d}"
    assert trainer.latest_checkpoint_path.parent.name == "checkpoints"

    latest_checkpoint = trainer.latest_checkpoint

    def validate():
        checkpoint = sgd.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint == latest_checkpoint

    trainer.run(validate, checkpoint=trainer.latest_checkpoint_path)

def test_mismatch_checkpoint_report(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)
            sgd.report(index=i)

    def train_mismatch():
        sgd.save_checkpoint(epoch=0)
        sgd.report(index=0)
        # skip checkpoint
        sgd.report(index=1)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)
    callback = TestCallback()

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train, callbacks=[callback])

    # validate checkpoint
    assert trainer.latest_checkpoint["epoch"] == 0

    # validate callback
    result_list = callback.result_list
    assert len(result_list) == 1  # 1 epoch succeeded
    intermediate_results = result_list[0]
    assert len(intermediate_results) == 2  # both workers reported
    for worker_result in intermediate_results:
        assert worker_result["index"] == 0

def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()

def test_run_after_user_error(ray_start_2_cpus):
    config = TestConfig()

    def fail_train():
        raise NotImplementedError

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    with pytest.raises(NotImplementedError):
        trainer.run(fail_train)

    def train():
        return 1

    output = trainer.run(train)
    assert output == [1, 1]

def test_dataset_pipeline_shuffle(ray_start_4_cpus):
    num_epochs = 2
    num_data = 20

    dataset = ray.data.range(num_data).repeat().random_shuffle_each_window()

    def get_dataset():
        pipeline_iterator = sgd.get_dataset_shard().iter_datasets()
        data_all_epochs = []
        for _ in range(2):
            dataset_this_epoch = next(pipeline_iterator)
            data_this_epoch = []
            for batch in dataset_this_epoch.iter_batches():
                data_this_epoch.extend(batch)

            if len(data_all_epochs) > 0:
                # Make sure data is shuffled per epoch.
                assert data_this_epoch != data_all_epochs[-1]

            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)
    check_dataset_output(num_data, num_epochs, results)

def test_multiple_run(ray_start_2_cpus):
    config = TestConfig()

    def train_1():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()

    output_1 = trainer.run(train_1)
    assert output_1 == [1, 1]

    def train_2():
        return 2

    output_2 = trainer.run(train_2)
    assert output_2 == [2, 2]

def test_TBX(ray_start_4_cpus, make_temp_dir):
    config = TestConfig()
    temp_dir = make_temp_dir
    num_workers = 4

    def train_func():
        sgd.report(episode_reward_mean=4)
        sgd.report(episode_reward_mean=5)
        sgd.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1})
        return 1

    callback = TBXLoggerCallback(temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    _validate_tbx_result(temp_dir)

def test_mismatch_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)

    def train_mismatch():
        sgd.save_checkpoint(epoch=0)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train)

def test_run_failure(ray_start_2_cpus):
    test_config = TestConfig()

    def train_invalid_signature(a, b):
        pass

    trainer = Trainer(test_config, num_workers=2)

    # Raise RuntimeError when trainer has not been started yet.
    with pytest.raises(RuntimeError):
        trainer.run(lambda: 1)

    trainer.start()

    with pytest.raises(ValueError):
        trainer.run(train_invalid_signature)

    trainer.shutdown()

def test_world_rank(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return sgd.world_rank()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)

    assert set(results) == {0, 1}

def main():
    args = parse_args()
    config = {"args": args}

    if args.start_local or args.address or \
            args.num_workers > 1 or args.use_gpu:
        if args.start_local:
            # Start a local Ray runtime.
            ray.init(num_cpus=args.num_workers)
        else:
            # Connect to a Ray cluster for distributed training.
            ray.init(address=args.address)
        trainer = Trainer(
            "torch", num_workers=args.num_workers, use_gpu=args.use_gpu)
        trainer.start()
        trainer.run(train_func, config)
    else:
        # Run training locally.
        train_func(config)

def train_linear(num_workers=1):
    trainer = Trainer(TorchConfig(backend="gloo"), num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func, config, callbacks=[JsonLoggerCallback("./sgd_results")])
    trainer.shutdown()

    print(results)
    return results

def test_horovod_simple(ray_start_2_cpus):
    def simple_fn():
        hvd_torch.init()
        return hvd_torch.rank()

    num_workers = 2
    trainer = Trainer("horovod", num_workers)
    trainer.start()
    result = trainer.run(simple_fn)
    trainer.shutdown()

    assert result == list(range(num_workers))

def test_user_error(ray_start_2_cpus):
    """Tests that an error raised in the user training function is propagated."""
    config = TestConfig()

    def fail_train_1():
        raise NotImplementedError

    trainer = Trainer(config, num_workers=2)
    trainer.start()

    with pytest.raises(NotImplementedError):
        trainer.run(fail_train_1)

    def fail_train_2():
        for _ in range(2):
            sgd.report(loss=1)
        raise NotImplementedError

    with pytest.raises(NotImplementedError):
        trainer.run(fail_train_2)

def test_run(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == 1 for result in results)

def test_worker_failure_2(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        for _ in range(2):
            sgd.report(loss=1)
        return 1

    def train_actor_failure():
        for _ in range(2):
            sgd.report(loss=1)
        import sys
        sys.exit(0)

    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train)

def train_tensorflow_mnist(num_workers=2, use_gpu=False):
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": 4
        })
    trainer.shutdown()
    print(f"Results: {results[0]}")

def test_worker_failure_1(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        return 1

    def train_actor_failure():
        import sys
        sys.exit(0)

    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train)
        # Make sure Trainer is shutdown after worker failure.
        with pytest.raises(RuntimeError):
            trainer.run(train)

def test_dataset_fault_tolerance(ray_start_4_cpus):
    dataset = ray.data.range(10)
    dataset_splits = dataset.split(n=2, equal=True)
    test_config = TestConfig()

    def train():
        return 1

    def train_actor_failure():
        import sys
        sys.exit(0)

    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        with patch.object(
                new_backend_executor_cls,
                "_get_dataset_shards",
                return_value=dataset_splits) as mock_method:
            trainer = Trainer(test_config, num_workers=2)
            trainer.start()
            trainer.run(train, dataset=dataset)
            mock_method.assert_called_once()

def train_linear(num_workers=2, use_gpu=False):
    datasets = get_datasets()

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        dataset=datasets,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()])
    trainer.shutdown()

    print(results)
    return results

def test_run_config(ray_start_2_cpus):
    backend_config = TestConfig()

    def train_func(config):
        return config["fruit"]

    config = {"fruit": "banana"}

    trainer = Trainer(backend_config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func, config)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == "banana" for result in results)

def test_torch_fashion_mnist_gpu(ray_start_2_cpus_2_gpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=True)
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    results = trainer.run(fashion_mnist_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers
    for result in results:
        assert len(result) == epochs
        assert result[-1] < result[0]

def test_torch_linear(ray_start_2_cpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(linear_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers
    for result in results:
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]

def train_tensorflow_linear(num_workers=2, use_gpu=False):
    dataset_pipeline = get_dataset_pipeline()
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        dataset=dataset_pipeline,
        config={
            "lr": 1e-3,
            "batch_size": 32,
            "epochs": 4
        })
    trainer.shutdown()
    print(f"Results: {results[0]}")
    return results

def test_horovod_torch_mnist_gpu(ray_start_2_cpus_2_gpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers, use_gpu=True)
    trainer.start()
    results = trainer.run(
        horovod_torch_train_func,
        config={
            "num_epochs": num_epochs,
            "lr": 1e-3
        })
    trainer.shutdown()

    assert len(results) == num_workers
    for worker_result in results:
        assert len(worker_result) == num_epochs
        assert worker_result[num_epochs - 1] < worker_result[0]

def test_worker_kill_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        checkpoint = sgd.load_checkpoint()
        if checkpoint:
            epoch = checkpoint["epoch"]
        else:
            epoch = 0
        print("Epoch: ", epoch)
        for i in range(epoch, 2):
            sgd.report(loss=1, iter=i)
            sgd.save_checkpoint(epoch=i + 1)

    trainer = Trainer(test_config, num_workers=2)
    trainer.start()
    kill_callback = KillCallback(
        fail_on=0, worker_group=trainer._executor.worker_group)

    trainer.run(train, callbacks=[kill_callback])

    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint is saved.*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from beginning*
    # Run 2: epoch=0, counter=2, Successful
    # Run 3: epoch=1, counter=3, Successful
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(
        fail_on=1, worker_group=trainer._executor.worker_group)
    trainer.run(train, callbacks=[kill_callback])
    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint saved*
    # *Latest checkpoint updated, epoch=1*
    # Run 2: epoch=1, counter=2, Successful
    # *Checkpoint saved*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from last checkpoint.*
    # Run 3: epoch=1, counter=3, Successful.
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    def train():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train)

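# KillCallback is a helper defined elsewhere in this test module. Inferred
# from the usage above (an assumption, not the real helper): it counts
# sgd.report() calls and kills one worker actor on the `fail_on`-th report,
# so the test exercises checkpoint-based recovery. A minimal sketch under
# those assumptions:
#
#     class KillCallback(SGDCallback):
#         def __init__(self, fail_on, worker_group):
#             self.counter = 0
#             self.fail_on = fail_on
#             self.worker_group = worker_group
#
#         def handle_result(self, results, **info):
#             if self.counter == self.fail_on:
#                 # Kill one worker actor mid-run; exact handle access is an
#                 # assumption about the WorkerGroup internals.
#                 ray.kill(self.worker_group.workers[0])
#             self.counter += 1
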
def test_worker_failure_1(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        return 1

    def train_actor_failure():
        import sys
        sys.exit(0)

    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        results = trainer.run(train)
        assert results == [1, 1]