def train_linear(num_workers=1):
    trainer = Trainer(TorchConfig(backend="gloo"), num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func, config, callbacks=[JsonLoggerCallback("./sgd_results")])
    trainer.shutdown()
    print(results)
    return results

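# `train_func` is defined elsewhere in this example. Below is a minimal sketch
# of the shape it might take, assuming the `sgd.report` API used throughout
# this file: a torch linear-regression loop that reports loss once per epoch.
# The data, model shape, and return format here are illustrative assumptions,
# not the example's actual code.
import torch
import torch.nn as nn


def train_func(config):
    # Hypothetical sketch: fit y = 2x on a fixed random batch.
    lr = config["lr"]
    hidden_size = config["hidden_size"]
    batch_size = config["batch_size"]
    epochs = config["epochs"]

    x = torch.randn(batch_size, 1)
    y = 2 * x
    model = nn.Linear(1, hidden_size)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    results = []
    for _ in range(epochs):
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()
        # Report the per-epoch loss to the trainer, as the callbacks expect.
        sgd.report(loss=loss.item())
        results.append({"loss": loss.item()})
    return results
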
def test_horovod_simple(ray_start_2_cpus):
    def simple_fn():
        hvd_torch.init()
        return hvd_torch.rank()

    num_workers = 2
    trainer = Trainer("horovod", num_workers)
    trainer.start()
    result = trainer.run(simple_fn)
    trainer.shutdown()

    assert result == list(range(num_workers))

def test_worker_kill_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        checkpoint = sgd.load_checkpoint()
        if checkpoint:
            epoch = checkpoint["epoch"]
        else:
            epoch = 0
        print("Epoch: ", epoch)
        for i in range(epoch, 2):
            sgd.report(loss=1, iter=i)
            sgd.save_checkpoint(epoch=i + 1)

    trainer = Trainer(test_config, num_workers=2)
    trainer.start()
    kill_callback = KillCallback(
        fail_on=0, worker_group=trainer._executor.worker_group)
    trainer.run(train, callbacks=[kill_callback])

    # Run 1: epoch=0, counter=1, successful.
    # *Checkpoint is saved.*
    # *Worker is killed.*
    # *Getting checkpoint fails. Workers are restarted from the beginning.*
    # Run 2: epoch=0, counter=2, successful.
    # Run 3: epoch=1, counter=3, successful.
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(
        fail_on=1, worker_group=trainer._executor.worker_group)
    trainer.run(train, callbacks=[kill_callback])

    # Run 1: epoch=0, counter=1, successful.
    # *Checkpoint saved.*
    # *Latest checkpoint updated, epoch=1.*
    # Run 2: epoch=1, counter=2, successful.
    # *Checkpoint saved.*
    # *Worker is killed.*
    # *Getting checkpoint fails. Workers are restarted from the last checkpoint.*
    # Run 3: epoch=1, counter=3, successful.
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    def train():
        return 1

    # Make sure the Trainer is usable even after failure handling.
    trainer.run(train)

def test_run(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == 1 for result in results)

def train_tensorflow_mnist(num_workers=2, use_gpu=False):
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": 4
        })
    trainer.shutdown()
    print(f"Results: {results[0]}")

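# The MNIST `train_func` referenced above is defined elsewhere in the example.
# A hedged sketch of its likely shape, assuming the standard multi-worker
# Keras setup (the model architecture and preprocessing are illustrative
# assumptions, not the example's actual code):
import tensorflow as tf


def train_func(config):
    # Each worker builds the model under MultiWorkerMirroredStrategy so
    # variables are mirrored and gradients are all-reduced across workers.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    x_train = x_train / 255.0

    with strategy.scope():
        model = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(10),
        ])
        model.compile(
            optimizer=tf.keras.optimizers.SGD(learning_rate=config["lr"]),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True),
            metrics=["accuracy"])

    history = model.fit(
        x_train,
        y_train,
        batch_size=config["batch_size"],
        epochs=config["epochs"],
        verbose=0)
    # Return the per-epoch metrics so the driver can inspect them.
    return history.history
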
def test_torch_fashion_mnist_gpu(ray_start_2_cpus_2_gpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=True)
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    results = trainer.run(fashion_mnist_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers

    for result in results:
        assert len(result) == epochs
        assert result[-1] < result[0]

def test_torch_linear(ray_start_2_cpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(linear_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers

    for result in results:
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]

def test_run_config(ray_start_2_cpus):
    backend_config = TestConfig()

    def train_func(config):
        return config["fruit"]

    config = {"fruit": "banana"}

    trainer = Trainer(backend_config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func, config)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == "banana" for result in results)

def train_linear(num_workers=2, use_gpu=False):
    datasets = get_datasets()

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        dataset=datasets,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()])
    trainer.shutdown()
    print(results)
    return results

def test_start_shutdown(ray_start_2_cpus, num_workers):
    config = TestConfig()
    assert ray.available_resources()["CPU"] == 2
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    time.sleep(1)

    remaining = 2 - num_workers
    if remaining == 0:
        assert "CPU" not in ray.available_resources()
    else:
        assert ray.available_resources()["CPU"] == remaining

    trainer.shutdown()
    time.sleep(1)
    assert ray.available_resources()["CPU"] == 2

def train_tensorflow_linear(num_workers=2, use_gpu=False):
    dataset_pipeline = get_dataset_pipeline()
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        dataset=dataset_pipeline,
        config={
            "lr": 1e-3,
            "batch_size": 32,
            "epochs": 4
        })
    trainer.shutdown()
    print(f"Results: {results[0]}")
    return results

def test_horovod_torch_mnist_gpu(ray_start_2_cpus_2_gpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers, use_gpu=True)
    trainer.start()
    results = trainer.run(
        horovod_torch_train_func,
        config={
            "num_epochs": num_epochs,
            "lr": 1e-3
        })
    trainer.shutdown()

    assert len(results) == num_workers
    for worker_result in results:
        assert len(worker_result) == num_epochs
        assert worker_result[num_epochs - 1] < worker_result[0]

def test_horovod_torch_mnist_stateful(ray_start_2_cpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers)
    workers = trainer.to_worker_group(
        HorovodTrainClass, config={
            "num_epochs": num_epochs,
            "lr": 1e-3
        })
    results = []
    for epoch in range(num_epochs):
        results.append(
            ray.get([w.train.remote(epoch=epoch) for w in workers]))
    trainer.shutdown()

    assert len(results) == num_epochs
    for i in range(num_workers):
        assert results[num_epochs - 1][i] < results[0][i]

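# `HorovodTrainClass` is defined alongside these tests. A hedged sketch of the
# stateful pattern it exercises (the model, data, and optimizer details are
# illustrative assumptions): each worker keeps its own model across remote
# calls, and `train(epoch)` runs one epoch and returns the loss so later
# epochs can be compared against earlier ones.
import horovod.torch as hvd
import torch
import torch.nn as nn


class HorovodTrainClass:
    def __init__(self, config):
        hvd.init()
        self.model = nn.Linear(1, 1)
        optimizer = torch.optim.SGD(self.model.parameters(), lr=config["lr"])
        # Wrap the optimizer so gradients are averaged across workers.
        self.optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=self.model.named_parameters())
        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)

    def train(self, epoch):
        # One (toy) epoch of training; returns the scalar loss.
        x = torch.randn(4, 1)
        y = 2 * x
        self.optimizer.zero_grad()
        loss = nn.functional.mse_loss(self.model(x), y)
        loss.backward()
        self.optimizer.step()
        return loss.item()
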
def test_run_failure(ray_start_2_cpus):
    test_config = TestConfig()

    def train_invalid_signature(a, b):
        pass

    trainer = Trainer(test_config, num_workers=2)

    # Raise RuntimeError when the trainer has not been started yet.
    with pytest.raises(RuntimeError):
        trainer.run(lambda: 1)

    trainer.start()

    with pytest.raises(ValueError):
        trainer.run(train_invalid_signature)

    trainer.shutdown()

def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()

def test_worker_kill(ray_start_2_cpus, backend):
    if backend == "test":
        test_config = TestConfig()
    elif backend == "torch":
        test_config = TorchConfig()
    elif backend == "tf":
        test_config = TensorflowConfig()
    elif backend == "horovod":
        test_config = HorovodConfig()

    trainer = Trainer(test_config, num_workers=2)

    def train_func():
        for i in range(2):
            sgd.report(loss=1, iter=i)

    trainer.start()

    kill_callback = KillCallback(
        fail_on=0, worker_group=trainer._executor.worker_group)
    trainer.run(train_func, callbacks=[kill_callback])

    # Run 1: iter=0, counter=1, successful.
    # Run 2: iter=1, counter=1, unsuccessful; training restarts from the
    #     beginning.
    # Run 3: iter=0, counter=2, successful.
    # Run 4: iter=1, counter=3, successful.
    assert kill_callback.counter == 3

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(
        fail_on=1, worker_group=trainer._executor.worker_group)
    trainer.run(train_func, callbacks=[kill_callback])

    # Run 1: iter=0, counter=1, successful.
    # Run 2: iter=1, counter=2, successful.
    # Run 3: None, counter=2, unsuccessful; training restarts from the
    #     beginning.
    # Run 4: iter=0, counter=3, successful.
    # Run 5: iter=1, counter=4, successful.
    assert kill_callback.counter == 4

    def train():
        return 1

    # Make sure the Trainer is usable even after failure handling.
    trainer.run(train)

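# `KillCallback` is a test utility defined alongside these tests. A hedged
# sketch of the behavior the tests above rely on (the `SGDCallback` base class
# and the `worker_group.workers[0].actor` attribute path are assumptions about
# the test harness, not confirmed API): it counts reported results and, when
# the count reaches `fail_on`, kills one worker actor so the fault-tolerance
# path is exercised.
import ray


class KillCallback(SGDCallback):
    def __init__(self, fail_on, worker_group):
        self.counter = 0
        self.fail_on = fail_on
        self.worker_group = worker_group

    def handle_result(self, results):
        if self.counter == self.fail_on:
            # Hypothetical: kill the first worker's actor handle.
            ray.kill(self.worker_group.workers[0].actor)
        self.counter += 1
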
def test_resources(ray_start_4_cpus_4_gpus_4_extra, resource, num_requested):
    num_workers = 2
    config = TestConfig()
    original = ray.available_resources().get(resource)
    resources_per_worker = {resource: num_requested}
    use_gpu = resource == "GPU"
    trainer = Trainer(
        config,
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker=resources_per_worker)

    trainer.start()
    expected = original - num_workers * num_requested
    wait_for_condition(
        lambda: ray.available_resources().get(resource, 0) == expected)

    trainer.shutdown()
    wait_for_condition(
        lambda: ray.available_resources().get(resource, 0) == original)

def test_tensorflow_mnist_gpu(ray_start_2_cpus_2_gpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("tensorflow", num_workers=num_workers, use_gpu=True)
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    results = trainer.run(tensorflow_mnist_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers
    result = results[0]

    loss = result["loss"]
    assert len(loss) == epochs
    assert loss[-1] < loss[0]

    accuracy = result["accuracy"]
    assert len(accuracy) == epochs
    assert accuracy[-1] > accuracy[0]

def test_multiple_datasets(ray_start_4_cpus):
    num_epochs = 2
    num_data_1 = 10
    num_data_2 = 6
    train_data = ray.data.range(num_data_1)
    val_data = ray.data.range(num_data_2)

    def get_dataset():
        data_train_all_epochs = []
        data_val_all_epochs = []
        for _ in range(num_epochs):
            data_this_epoch_train = []
            train_dataset = sgd.get_dataset_shard("train")
            for batch in train_dataset.iter_batches():
                data_this_epoch_train.extend(batch)
            data_train_all_epochs.append(data_this_epoch_train)

            data_this_epoch_val = []
            val_dataset = sgd.get_dataset_shard("val")
            for batch in val_dataset.iter_batches():
                data_this_epoch_val.extend(batch)
            data_val_all_epochs.append(data_this_epoch_val)

        return data_train_all_epochs, data_val_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(
        get_dataset, dataset={
            "train": train_data,
            "val": val_data
        })
    check_dataset_output(num_data_1, num_epochs,
                         [worker_data[0] for worker_data in results])
    check_dataset_output(num_data_2, num_epochs,
                         [worker_data[1] for worker_data in results])
    trainer.shutdown()

def test_dataset(ray_start_4_cpus):
    """Checks that the Dataset is correctly sharded even with multiple epochs."""
    num_epochs = 2
    num_data = 10

    dataset = ray.data.range(num_data)

    def get_dataset():
        data_all_epochs = []
        for _ in range(num_epochs):
            data_this_epoch = []
            dataset = sgd.get_dataset_shard()
            for batch in dataset.iter_batches():
                data_this_epoch.extend(batch)
            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)
    check_dataset_output(num_data, num_epochs, results)
    trainer.shutdown()

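# `check_dataset_output` is a shared helper in this test module. A hedged
# sketch of the property it verifies (the exact implementation is an
# assumption): every worker produced the expected number of epochs, and within
# each epoch the shards across all workers together cover the full data range
# exactly once.
def check_dataset_output(num_data, num_epochs, data_all_epochs):
    assert all(
        len(worker_data) == num_epochs for worker_data in data_all_epochs)
    for i in range(num_epochs):
        # Combine epoch i's shards from every worker.
        epoch_data = []
        for worker_data in data_all_epochs:
            epoch_data.extend(worker_data[i])
        assert len(epoch_data) == num_data
        assert set(epoch_data) == set(range(num_data))
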
def main(num_workers, use_gpu, kwargs):
    trainer = Trainer("horovod", use_gpu=use_gpu, num_workers=num_workers)
    trainer.start()
    loss_per_epoch = trainer.run(train_func, config=kwargs)
    trainer.shutdown()
    print(loss_per_epoch)