def test_restore_checkpoint(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(_train_simple, num_workers=2)
    trainer = trainable_cls(config={"epochs": 1})
    trainer.train()
    checkpoint_dir = trainer.save()
    trainer.restore(checkpoint_dir)
    trainer.stop()
def test_simple_tune(ray_start_4_cpus, enabled_checkpoint):
    trainable_cls = DistributedTrainableCreator(_train_simple, num_workers=2)
    analysis = tune.run(
        trainable_cls,
        config={"enable_checkpoint": enabled_checkpoint},
        num_samples=2,
        stop={"training_iteration": 2})
    assert analysis.trials[0].last_result["training_iteration"] == 2
    assert analysis.trials[0].has_checkpoint() == enabled_checkpoint
def test_save_checkpoint(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(_train_simple, num_workers=2)
    trainer = trainable_cls(config={"epochs": 1})
    trainer.train()
    checkpoint_dir = trainer.save()
    model_state_dict, opt_state_dict = torch.load(
        os.path.join(checkpoint_dir, "checkpoint"))
    trainer.stop()
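# The tests in this section rely on a `_train_simple` helper that is not shown
# here. The following is a hypothetical sketch of such a helper, assuming it
# trains a tiny linear model for config["epochs"] epochs, optionally writes a
# distributed checkpoint when config["enable_checkpoint"] is set, and reports a
# loss via tune.report; the model, data, and defaults are placeholders, not the
# actual test helper.
def _train_simple(config, checkpoint_dir=None):
    import os

    import torch
    import torch.nn as nn
    from torch.nn.parallel import DistributedDataParallel
    from ray import tune
    from ray.tune.integration.torch import distributed_checkpoint_dir

    # Tiny synthetic regression problem so the trainable runs quickly.
    model = nn.Linear(1, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss_fn = nn.MSELoss()
    x, y = torch.randn(8, 1), torch.randn(8, 1)

    # DistributedTrainableCreator initializes the process group before calling
    # this function, so the model can be wrapped directly.
    model = DistributedDataParallel(model)

    # Restore the (wrapped) model and optimizer if a checkpoint is provided.
    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    for epoch in range(config.get("epochs", 10)):
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()

        if config.get("enable_checkpoint", True):
            with distributed_checkpoint_dir(step=epoch) as cur_dir:
                path = os.path.join(cur_dir, "checkpoint")
                torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(mean_loss=loss.item())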
def test_colocated(ray_4_node):  # noqa: F811
    assert ray.available_resources()["CPU"] == 4
    trainable_cls = DistributedTrainableCreator(
        _train_check_global, num_workers=4, num_workers_per_host=1)
    trainable = trainable_cls()
    assert ray.available_resources().get("CPU", 0) == 0
    trainable.train()
    trainable.stop()
def test_colocated_gpu_double(ray_4_node_gpu):  # noqa: F811
    assert ray.available_resources()["GPU"] == 8
    trainable_cls = DistributedTrainableCreator(
        _train_check_global,
        num_workers=8,
        num_gpus_per_worker=1,
        num_workers_per_host=2,
        timeout_s=30)
    trainable = trainable_cls()
    assert ray.available_resources().get("GPU", 0) == 0
    trainable.train()
    trainable.stop()
def run_ddp_tune(num_workers, num_gpus_per_worker, workers_per_node=None):
    trainable_cls = DistributedTrainableCreator(
        train_mnist,
        num_workers=num_workers,
        num_gpus_per_worker=num_gpus_per_worker,
        num_workers_per_host=workers_per_node)

    analysis = tune.run(
        trainable_cls,
        num_samples=4,
        stop={"training_iteration": 10},
        metric="mean_accuracy",
        mode="max")

    print("Best hyperparameters found were: ", analysis.best_config)
help="enables CUDA training") parser.add_argument("--cluster", action="store_true", default=False, help="enables multi-node tuning") parser.add_argument( "--workers-per-node", type=int, help="Forces workers to be colocated on machines if set.") args = parser.parse_args() if args.cluster: options = dict(address="auto") else: options = dict(num_cpus=2) ray.init(**options) trainable_cls = DistributedTrainableCreator( train_mnist, num_workers=args.num_workers, num_gpus_per_worker=args.num_gpus_per_worker, num_workers_per_host=args.workers_per_node) analysis = tune.run(trainable_cls, num_samples=4, stop={"training_iteration": 10}, metric="mean_accuracy", mode="max") print("Best hyperparameters found were: ", analysis.best_config)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    model = DistributedDataParallel(model)

    for epoch in range(400):
        train(model, optimizer, train_loader, device)
        acc = test(model, test_loader, device)

        if epoch % 3 == 0:
            with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(mean_accuracy=acc)


if __name__ == "__main__":
    training_iter = 10
    n_samples = 4
    if os.environ.get("ANYSCALE_HOST", None):
        print("Running on Anyscale")
        training_iter = 20
        n_samples = 10

    ray.init()
    trainable_cls = DistributedTrainableCreator(train_mnist)
    tune.run(
        trainable_cls,
        num_samples=n_samples,
        stop={"training_iteration": training_iter})
"-n", type=int, default=2, help="Sets number of workers for training.") parser.add_argument("--use-gpu", action="store_true", default=False, help="enables CUDA training") parser.add_argument("--cluster", action="store_true", default=False, help="enables multi-node tuning") args = parser.parse_args() if args.cluster: options = dict(address="auto") else: options = dict(num_cpus=2) ray.init(**options) trainable_cls = DistributedTrainableCreator(train_mnist, num_workers=args.num_workers, use_gpu=args.use_gpu) analysis = tune.run(trainable_cls, num_samples=4, stop={"training_iteration": 10}, metric="mean_accuracy", mode="max") print("Best hyperparameters found were: ", analysis.best_config)
def test_validation(ray_start_2_cpus):  # noqa: F811
    def bad_func(a, b, c):
        return 1

    with pytest.raises(ValueError):
        DistributedTrainableCreator(bad_func, num_workers=2)
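# Note: this test shows that DistributedTrainableCreator rejects functions whose
# signature it cannot use. As in the example at the end of this section, an
# accepted trainable takes a config dict plus an optional checkpoint_dir keyword
# argument, e.g.:
#
#     def good_func(config, checkpoint_dir=None):
#         return 1
#
# bad_func above does not match that shape, so construction raises ValueError.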
def test_set_global(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(
        _train_check_global, num_workers=2)
    trainable = trainable_cls()
    result = trainable.train()
    assert result["is_distributed"]
def test_single_step(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(_train_simple, num_workers=2)
    trainer = trainable_cls()
    trainer.train()
    trainer.stop()
def test_step_after_completion(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(_train_simple, num_workers=2)
    trainer = trainable_cls(config={"epochs": 1})

    with pytest.raises(RuntimeError):
        for i in range(10):
            trainer.train()
def test_validate_session(ray_start_2_cpus):
    trainable_cls = DistributedTrainableCreator(_train_validate_session)
    tune.run(trainable_cls)
    loss = torch.Tensor([0.0])
    for epoch in adl.remaining_epochs_until(config.get("epochs", 10)):
        for (x, y) in dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = loss_fn(output, y)
            loss.backward()
            optimizer.step()
        tune.report(mean_loss=loss.item())


ray.init(address="auto", _tracing_startup_hook=None)

trainable_cls = DistributedTrainableCreator(_train_simple)

config_0 = {"epochs": 50}
config_1 = {
    "epochs": 60,
    "H": tune.choice([8, 12]),
    "N": tune.grid_search(list(range(32, 64, 8)))
}

analysis = tune.run(
    trainable_cls,
    num_samples=1,  # total trials will be num_samples x points on the grid
    scheduler=AdaptDLScheduler(),
    config=config_1,
    metric="mean_loss",
    mode="min")
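# With config_1, tune.grid_search(list(range(32, 64, 8))) expands to the four
# values 32, 40, 48, and 56, so num_samples=1 still produces 4 trials
# (num_samples x grid points), as noted in the inline comment above.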
            with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(mean_accuracy=acc)


if __name__ == "__main__":
    training_iter = 50
    n_samples = 30

    # The following line runs from an Anyscale project. It does not appear to
    # work with incoming connections on Anyscale, just like locally.
    # ray.init()

    # The following runs against an existing Ray instance on a laptop. It has
    # also worked when running pushed code in the Anyscale session.
    ray.init(address="auto", _redis_password="******")

    # The following is for Anyscale Connect verification. Issues with the
    # current nightly are preventing it from working at the moment.
    # import anyscale
    # anyscale \
    #     .require("./requirements.txt") \
    #     .session(session_name="new-sbox") \
    #     .connect()

    print("Starting Training")
    trainable_cls = DistributedTrainableCreator(train_mnist, num_workers=2)
    tune.run(
        trainable_cls,
        num_samples=n_samples,
        stop={"training_iteration": training_iter})
        acc = test(model, test_loader, device)

        if epoch % 3 == 0:
            with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(mean_accuracy=acc)


if __name__ == "__main__":
    training_iter = 10
    n_samples = 4
    n_workers = 0
    if os.environ.get("ANYSCALE_HOST", None):
        training_iter = 20
        n_samples = 10
        n_workers = 1

    ray.init()

    gpu_resources = 0
    if "GPU" in ray.available_resources():
        print("GPU ENABLED")
        gpu_resources = 0.1

    trainable_cls = DistributedTrainableCreator(
        train_mnist, n_workers, 1, gpu_resources)
    tune.run(
        trainable_cls,
        num_samples=n_samples,
        stop={"training_iteration": training_iter},
    )
import ray
from ray import tune
from ray.tune.integration.torch import DistributedTrainableCreator
from ray.util.sgd.torch import is_distributed_trainable

ray.init()


def my_trainable(config, checkpoint_dir=None):
    if is_distributed_trainable():
        pass


trainable = DistributedTrainableCreator(
    my_trainable,
    use_gpu=True,
    num_workers=4,
    num_cpus_per_worker=1,
)

config = {}

tune.run(
    trainable,
    resources_per_trial=None,
    config=config,
)