import json
import os
import time

import ray
from ray.train import Trainer
from ray.train.examples.horovod.horovod_example import (
    train_func as horovod_torch_train_func,
)

if __name__ == "__main__":
    ray.init(address=os.environ.get("RAY_ADDRESS", "auto"))
    start_time = time.time()

    num_workers = 8
    num_epochs = 10
    trainer = Trainer("horovod", num_workers)
    trainer.start()
    results = trainer.run(
        horovod_torch_train_func,
        config={"num_epochs": num_epochs, "lr": 1e-3},
    )
    trainer.shutdown()

    # Each worker returns one loss value per epoch; the final loss should be
    # lower than the initial one.
    assert len(results) == num_workers
    for worker_result in results:
        assert len(worker_result) == num_epochs
        assert worker_result[num_epochs - 1] < worker_result[0]

    delta = time.time() - start_time
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        # The original excerpt ends at the `with` line; writing the elapsed
        # time as JSON is the natural completion, though the exact keys here
        # are an assumption.
        json.dump({"train_time": delta, "success": True}, f)
def test_json(ray_start_4_cpus, make_temp_dir, workers_to_log, detailed, filename):
    if detailed:
        os.environ[ENABLE_DETAILED_AUTOFILLED_METRICS_ENV] = "1"

    config = TestConfig()

    num_iters = 5
    num_workers = 4

    if workers_to_log is None:
        num_workers_to_log = num_workers
    elif isinstance(workers_to_log, int):
        num_workers_to_log = 1
    else:
        num_workers_to_log = len(workers_to_log)

    def train_func():
        for i in range(num_iters):
            train.report(index=i)
        return 1

    if filename is None:
        # if None, use default value
        callback = JsonLoggerCallback(workers_to_log=workers_to_log)
    else:
        callback = JsonLoggerCallback(
            filename=filename, workers_to_log=workers_to_log)

    trainer = Trainer(config, num_workers=num_workers, logdir=make_temp_dir)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    if filename is None:
        assert str(callback.log_path.name) == JsonLoggerCallback._default_filename
    else:
        assert str(callback.log_path.name) == filename

    with open(callback.log_path, "r") as f:
        log = json.load(f)
    print(log)

    assert len(log) == num_iters
    assert len(log[0]) == num_workers_to_log
    assert all(len(element) == len(log[0]) for element in log)
    assert all(
        all(worker["index"] == worker[TRAINING_ITERATION] - 1
            for worker in element) for element in log)
    assert all(
        all(
            all(key in worker for key in BASIC_AUTOFILLED_KEYS)
            for worker in element) for element in log)
    if detailed:
        assert all(
            all(
                all(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
    else:
        assert all(
            all(
                not any(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)

    os.environ.pop(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)
    assert ENABLE_DETAILED_AUTOFILLED_METRICS_ENV not in os.environ
# TODO: TBXLoggerCallback should create nonexistent logdir
#       and should also create 1 directory per file.
tbx_logdir = "./runs"
os.makedirs(tbx_logdir, exist_ok=True)
callbacks = [
    TBXLoggerCallback(logdir=tbx_logdir),
    MLflowLoggerCallback(
        experiment_name="cuj-big-data-training", save_artifact=True),
]

# Remove CPU resource so Datasets can be scheduled.
resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None

trainer = Trainer(
    backend="torch",
    num_workers=num_workers,
    use_gpu=use_gpu,
    resources_per_worker=resources_per_worker,
)
trainer.start()
results = trainer.run(
    train_func=train_func, config=config, callbacks=callbacks, dataset=datasets)
model = results[0]
trainer.shutdown()

if args.mlflow_register_model:
    mlflow.pytorch.log_model(
        model, artifact_path="models", registered_model_name="torch_model")
def test_torch_auto_gpu_to_cpu(ray_start_4_cpus_2_gpus):
    """Tests if GPU tensors are auto converted to CPU on driver."""
    # Disable GPU on the driver.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    num_workers = 2

    class ValidateCPUCallback(TrainingCallback):
        def handle_result(self, results, **info):
            for result in results:
                model = result["model"]
                assert not next(model.parameters()).is_cuda

    def train_func():
        model = torch.nn.Linear(1, 1)

        # Move to GPU device.
        model = ray.train.torch.prepare_model(model)

        assert next(model.parameters()).is_cuda

        ray.train.save_checkpoint(model=model)
        ray.train.report(model=model)

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=True)
    trainer.start()
    trainer.run(train_func, callbacks=[ValidateCPUCallback()])
    model = trainer.latest_checkpoint["model"]
    assert not next(model.parameters()).is_cuda
    trainer.shutdown()

    # Test the same thing for state dict.
    class ValidateCPUStateDictCallback(TrainingCallback):
        def handle_result(self, results, **info):
            for result in results:
                state_dict = result["state_dict"]
                for tensor in state_dict.values():
                    assert not tensor.is_cuda

    def train_func():
        model = torch.nn.Linear(1, 1)

        # Move to GPU device.
        model = ray.train.torch.prepare_model(model)

        assert next(model.parameters()).is_cuda

        state_dict = model.state_dict()

        for tensor in state_dict.values():
            assert tensor.is_cuda

        ray.train.save_checkpoint(state_dict=state_dict)
        ray.train.report(state_dict=state_dict)

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=True)
    trainer.start()
    trainer.run(train_func, callbacks=[ValidateCPUStateDictCallback()])
    state_dict = trainer.latest_checkpoint["state_dict"]
    for tensor in state_dict.values():
        assert not tensor.is_cuda
    trainer.shutdown()

    # Reset the env var.
    os.environ.pop("CUDA_VISIBLE_DEVICES")
action="store_true", default=False, help="Finish quickly for testing.") parser.add_argument( "--use-gpu", action="store_true", default=False, help="Enables GPU training") args, _ = parser.parse_known_args() if args.smoke_test: ray.init(num_cpus=4) else: ray.init(address=args.address) trainer = Trainer( "torch", num_workers=args.num_workers, use_gpu=args.use_gpu) Trainable = trainer.to_tune_trainable(train_func) pbt_scheduler = PopulationBasedTraining( time_attr="training_iteration", metric="loss", mode="min", perturbation_interval=1, hyperparam_mutations={ # distribution for resampling "lr": lambda: np.random.uniform(0.001, 1), # allow perturbations within this set of categorical values "momentum": [0.8, 0.9, 0.99], }) reporter = CLIReporter() reporter.add_metric_column("loss", "loss")
parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test",
    action="store_true",
    default=False,
    help="Finish quickly for testing.",
)
args = parser.parse_args()

ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)

num_training_workers = 1 if args.smoke_test else 3
trainer = Trainer(
    num_workers=num_training_workers,
    use_gpu=not args.smoke_test,
    backend=TorchConfig(backend="gloo"),
)
TorchTrainable = trainer.to_tune_trainable(train_func=train_func)

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="loss",
    mode="min",
    perturbation_interval=1,
    hyperparam_mutations={
        # distribution for resampling
        "lr": lambda: np.random.uniform(0.001, 1),
        # allow perturbations within this set of categorical values
        "momentum": [0.8, 0.9, 0.99],
    },
)
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    for epoch in range(num_epochs):
        output = model(input)
        loss = loss_fn(output, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"epoch: {epoch}, loss: {loss.item()}")


# __torch_distributed_end__


if __name__ == "__main__":
    # __torch_single_run_begin__

    train_func()

    # __torch_single_run_end__

    # __torch_trainer_begin__

    from ray.train import Trainer

    trainer = Trainer(backend="torch", num_workers=4)
    trainer.start()

    results = trainer.run(train_func_distributed)

    trainer.shutdown()

    # __torch_trainer_end__
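# The excerpt above only shows the tail of the distributed training function.
# Below is a self-contained sketch (an assumption, not the original file's
# `train_func_distributed`) of what such a function can look like: the same
# loop, with the model wrapped by `ray.train.torch.prepare_model`, which
# places it on the right device and wraps it in DistributedDataParallel.
# The sketch assumes CPU-only training, matching the Trainer above, which
# does not set `use_gpu`.
import torch
import torch.nn as nn
import torch.optim as optim

import ray.train.torch


def sketch_train_func_distributed():
    num_epochs = 3
    # Toy regression data; the shapes are illustrative only.
    input = torch.randn(32, 1)
    labels = 2 * input + 1

    model = nn.Linear(1, 1)
    model = ray.train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    for epoch in range(num_epochs):
        output = model(input)
        loss = loss_fn(output, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"epoch: {epoch}, loss: {loss.item()}")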
def test_worker_kill(ray_start_2_cpus, backend):
    if backend == "test":
        test_config = TestConfig()
    elif backend == "torch":
        test_config = TorchConfig()
    elif backend == "tf":
        test_config = TensorflowConfig()
    elif backend == "horovod":
        test_config = HorovodConfig()

    trainer = Trainer(test_config, num_workers=2)

    def train_func():
        for i in range(2):
            train.report(loss=1, iter=i)

    trainer.start()

    kill_callback = KillCallback(fail_on=0, trainer=trainer)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=1, Unsuccessful, starts training from beginning
    # Run 3: iter=0, counter=2, Successful
    # Run 4: iter=1, counter=3, Successful
    assert kill_callback.counter == 3

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(fail_on=1, trainer=trainer)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=2, Successful
    # Run 3: None, counter=2, Unsuccessful, starts training from beginning.
    # Run 4: iter=0, counter=3, Successful
    # Run 5: iter=1, counter=4, Successful
    assert kill_callback.counter == 4

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
def test_worker_kill_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        checkpoint = train.load_checkpoint()
        if checkpoint:
            epoch = checkpoint["epoch"]
        else:
            epoch = 0
        print("Epoch: ", epoch)
        for i in range(epoch, 2):
            train.report(loss=1, iter=i)
            train.save_checkpoint(epoch=i + 1)

    trainer = Trainer(test_config, num_workers=2)
    trainer.start()
    kill_callback = KillCallback(fail_on=0, trainer=trainer)

    trainer.run(train_func, callbacks=[kill_callback])

    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint is saved.*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from beginning*
    # Run 2: epoch=0, counter=2, Successful
    # Run 3: epoch=1, counter=3, Successful
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(fail_on=1, trainer=trainer)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint saved*
    # *Latest checkpoint updated, epoch=1*
    # Run 2: epoch=1, counter=2, Successful
    # *Checkpoint saved*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from last checkpoint.*
    # Run 3: epoch=1, counter=3, Successful.
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
def test_init_failure(ray_start_2_cpus):
    with pytest.raises(TypeError):
        Trainer(5, num_workers=2)

    with pytest.raises(ValueError):
        Trainer("invalid", num_workers=2)
def test_start_failure(ray_start_2_cpus):
    with pytest.raises(ValueError):
        trainer = Trainer("torch", num_workers=0)
        trainer.start()
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    class CudaTestBackend(TestBackend):
        share_cuda_visible_devices = True

    class CudaTestConfig(TestConfig):
        @property
        def backend_cls(self):
            return CudaTestBackend

    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            CudaTestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            CudaTestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        CudaTestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        CudaTestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
def main(num_workers, use_gpu, kwargs):
    trainer = Trainer("horovod", use_gpu=use_gpu, num_workers=num_workers)
    trainer.start()
    loss_per_epoch = trainer.run(train_func, config=kwargs)
    trainer.shutdown()
    print(loss_per_epoch)
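# A hypothetical entrypoint (not part of the original excerpt) showing one way
# `main` could be driven; the worker count and the config keys passed through
# `kwargs` are assumptions modeled on the Horovod release test above.
if __name__ == "__main__":
    import ray

    ray.init()
    main(num_workers=2, use_gpu=False, kwargs={"num_epochs": 2, "lr": 1e-3})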
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
from ray import train
from ray.train import Trainer
from ray.train.callbacks import MLflowLoggerCallback, TBXLoggerCallback


def train_func():
    for i in range(3):
        train.report(epoch=i)


trainer = Trainer(backend="torch", num_workers=2)
trainer.start()

# Run the training function, logging all the intermediate results
# to MLflow and Tensorboard.
result = trainer.run(
    train_func,
    callbacks=[
        MLflowLoggerCallback(experiment_name="train_experiment"),
        TBXLoggerCallback(),
    ],
)

# Print the latest run directory and keep note of it.
# For example: /home/ray_results/train_2021-09-01_12-00-00/run_001
print("Run directory:", trainer.latest_run_dir)

trainer.shutdown()
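# A short usage note (not from the original snippet): the TBXLoggerCallback
# writes TensorBoard event files under the run directory printed above, and
# the MLflowLoggerCallback logs to a local ./mlruns store unless a tracking
# URI is configured. Assuming those defaults, the results can be inspected
# from a shell with, e.g.:
#
#   tensorboard --logdir <run directory printed above>
#   mlflow ui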
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--address", required=False, type=str, help="the address to use for Ray") parser.add_argument( "--num-workers", "-n", type=int, default=2, help="Sets number of workers for training.", ) parser.add_argument("--use-gpu", action="store_true", default=False, help="Enables GPU training") args = parser.parse_args() ray.init(address=args.address) callbacks = [TorchTensorboardProfilerCallback(), TBXLoggerCallback()] trainer = Trainer(backend="torch", num_workers=args.num_workers, use_gpu=args.use_gpu) trainer.start() trainer.run(train_func, callbacks=callbacks) trainer.shutdown()