# Imports needed by this test; `Net`, `hvd_train_func`, and
# `run_image_prediction` are helpers defined elsewhere in the test module.
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

from ray.air import ScalingConfig
from ray.train.horovod import HorovodTrainer
from ray.train.torch import TorchPredictor


def test_horovod_state_dict(ray_start_4_cpus):
    def train_func(config):
        result = hvd_train_func(config)
        assert len(result) == epochs
        assert result[-1] < result[0]

    num_workers = 2
    epochs = 10
    scaling_config = ScalingConfig(num_workers=num_workers)
    config = {"num_epochs": epochs, "save_model_as_dict": True}
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    result = trainer.fit()
    predictor = TorchPredictor.from_checkpoint(result.checkpoint, model=Net())

    # Find some test data to run on.
    test_set = datasets.MNIST(
        "./data",
        train=False,
        download=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )

    test_dataloader = DataLoader(test_set, batch_size=10)
    test_dataloader_iter = iter(test_dataloader)
    images, labels = next(
        test_dataloader_iter
    )  # only running a batch inference of 10 images
    predicted_labels = run_image_prediction(predictor.model, images)
    assert torch.equal(predicted_labels, labels)
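# `run_image_prediction` is not shown in this snippet; one plausible
# implementation (an assumption, not necessarily the test module's exact code)
# runs a forward pass in eval mode and takes the argmax over class logits.
def run_image_prediction(
    model: torch.nn.Module, images: torch.Tensor
) -> torch.Tensor:
    model.eval()
    with torch.no_grad():
        return torch.argmax(model(images), dim=1)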
def main(num_workers, use_gpu, kwargs):
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=kwargs,
        scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=num_workers),
    )
    results = trainer.fit()
    print(results.metrics)
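# A minimal sketch of the `train_func` these entry points assume: each worker
# initializes Horovod, wraps its optimizer with hvd.DistributedOptimizer,
# broadcasts initial parameters from rank 0, and reports metrics back to Ray.
# The model and data here are placeholder assumptions, not the example's code.
import horovod.torch as hvd
import torch
from ray.air import session


def train_func(config):
    hvd.init()
    model = torch.nn.Linear(1, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=config.get("lr", 0.1))
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters()
    )
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for epoch in range(config.get("num_epochs", 1)):
        x = torch.randn(8, 1)
        loss = torch.nn.functional.mse_loss(model(x), torch.zeros(8, 1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        session.report({"loss": loss.item(), "epoch": epoch})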
def main(num_workers, use_gpu, kwargs):
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={
            "num_epochs": kwargs["num_epochs"],
            "log_interval": kwargs["log_interval"],
            "use_cuda": kwargs["use_cuda"],
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(result)
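# A hedged sketch of how these `main` entry points are typically invoked from
# the command line; the flag names are assumptions, not the original script's.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--num-workers", type=int, default=2)
    parser.add_argument("--use-gpu", action="store_true")
    parser.add_argument("--num-epochs", type=int, default=5)
    parser.add_argument("--log-interval", type=int, default=10)
    args = parser.parse_args()

    main(
        num_workers=args.num_workers,
        use_gpu=args.use_gpu,
        kwargs={
            "num_epochs": args.num_epochs,
            "log_interval": args.log_interval,
            "use_cuda": args.use_gpu,
        },
    )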
def tune_horovod(num_workers, num_samples, use_gpu, mode="square", x_max=1.0):
    horovod_trainer = HorovodTrainer(
        train_loop_per_worker=train_loop_per_worker,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        train_loop_config={"mode": mode, "x_max": x_max},
    )

    tuner = Tuner(
        horovod_trainer,
        param_space={"train_loop_config": {"lr": tune.uniform(0.1, 1)}},
        tune_config=TuneConfig(mode="min", metric="loss", num_samples=num_samples),
        _tuner_kwargs={"fail_fast": True},
    )

    result_grid = tuner.fit()

    print("Best hyperparameters found were: ", result_grid.get_best_result().config)
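# A minimal sketch of the `train_loop_per_worker` the snippet above assumes:
# each worker initializes Horovod and reports a "loss" metric matching
# TuneConfig(metric="loss"). The quadratic objective is an illustrative
# assumption, not the original example's exact code.
import horovod.torch as hvd
from ray.air import session


def train_loop_per_worker(config):
    hvd.init()
    x = config.get("x_max", 1.0)
    lr = config["lr"]
    for _ in range(20):
        x = x - lr * 2 * x  # gradient step on f(x) = x**2
        session.report({"loss": x * x})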
transform_train = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR10_STATS["mean"], CIFAR10_STATS["std"]),
    ]
)  # meanstd transformation

dataset = torchvision.datasets.CIFAR10(
    root="/tmp/data_cifar", train=True, download=True, transform=transform_train
)

horovod_trainer = HorovodTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(
        use_gpu=False if args.smoke_test else True,
        num_workers=2 if args.smoke_test else 4,
    ),
    train_loop_config={"batch_size": 64, "data": ray.put(dataset)},
)

# ensure that checkpointing works.
pbt = create_scheduler(
    "pbt",
    perturbation_interval=2,
    hyperparam_mutations={
        "train_loop_config": {"lr": tune.uniform(0.001, 0.1)},
    },
)
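# A minimal sketch of completing the Tuner call above: attach the PBT scheduler
# through TuneConfig and run the sweep. The param_space seed value and
# num_samples are illustrative assumptions, not the original script's settings.
tuner = Tuner(
    horovod_trainer,
    param_space={"train_loop_config": {"lr": 0.1}},  # initial value; PBT mutates "lr"
    tune_config=TuneConfig(num_samples=4, metric="loss", mode="min", scheduler=pbt),
)
result_grid = tuner.fit()
print("Best result:", result_grid.get_best_result())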