def test_train_client(tmpdir, start_ray_client_server_2_cpus, seed, num_slots):
    """Tests that training over a Ray client connection updates the model."""
    # The fixture must have established a live client connection.
    assert ray.util.client.ray.is_connected()
    boring_model = BoringModel()
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=False)
    train_test(get_trainer(tmpdir, plugins=[horovod_plugin]), boring_model)
def test_distributed_sampler(tmpdir, ray_start_2_cpus):
    """Tests if distributed sampler is properly set."""
    model = BoringModel()
    # Before training starts, the dataloader must NOT already be distributed.
    assert not isinstance(model.train_dataloader().sampler, DistributedSampler)

    class DistributedSamplerCallback(Callback):
        def _verify(self, sampler, trainer, shuffled):
            # Shared check: distributed, correct world size/rank, and the
            # expected shuffle setting for this stage.
            assert isinstance(sampler, DistributedSampler)
            assert bool(sampler.shuffle) == shuffled
            assert sampler.num_replicas == 2
            assert sampler.rank == trainer.global_rank

        def on_train_start(self, trainer, pl_module):
            self._verify(trainer.train_dataloader.sampler, trainer, True)

        def on_validation_start(self, trainer, pl_module):
            self._verify(trainer.val_dataloaders[0].sampler, trainer, False)

        def on_test_start(self, trainer, pl_module):
            self._verify(trainer.test_dataloaders[0].sampler, trainer, False)

    trainer = get_trainer(
        tmpdir,
        plugins=[RayPlugin(num_workers=2)],
        callbacks=[DistributedSamplerCallback()])
    trainer.fit(model)
def test_multi_node(tmpdir):
    """Tests if multi-node GPU training works.

    Connects to an already-running Ray cluster (``address="auto"``) and
    launches one training worker per available GPU.
    """
    ray.init("auto")
    # ``ray.available_resources()`` reports resource counts as floats
    # (e.g. 2.0); the plugin expects an integer worker count.
    num_gpus = int(ray.available_resources()["GPU"])
    model = BoringModel()
    plugin = RayPlugin(num_workers=num_gpus, use_gpu=True)
    trainer = get_trainer(tmpdir, plugins=[plugin], use_gpu=True)
    train_test(trainer, model)
def test_multi_node(tmpdir):
    """Tests if multi-node GPU training works.

    Accelerator-API variant: connects to an already-running Ray cluster
    and launches one training worker per available GPU.
    """
    ray.init("auto")
    # ``ray.available_resources()`` reports resource counts as floats
    # (e.g. 2.0); the accelerator expects an integer worker count.
    num_gpus = int(ray.available_resources()["GPU"])
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=num_gpus, use_gpu=True)
    trainer = get_trainer(tmpdir, accelerator=accelerator, use_gpu=True)
    train_test(trainer, model)
def _inner_train(config):
    """Train a fresh ``BoringModel`` with the given hyperparameter ``config``.

    NOTE(review): ``dir``, ``use_gpu``, ``callbacks`` and ``accelerator`` are
    free variables captured from an enclosing scope not visible here —
    presumably a Ray Tune trainable wrapper; confirm against the caller.
    """
    model = BoringModel()
    # ``config`` entries are forwarded as extra Trainer kwargs.
    trainer = get_trainer(
        dir,
        use_gpu=use_gpu,
        callbacks=callbacks,
        accelerator=accelerator,
        **config)
    trainer.fit(model)
def _inner_train(config):
    """Train a fresh ``BoringModel`` with the given hyperparameter ``config``.

    NOTE(review): ``dir``, ``use_gpu``, ``callbacks`` and ``plugin`` are free
    variables captured from an enclosing scope not visible here — presumably
    a Ray Tune trainable wrapper; confirm against the caller.
    """
    model = BoringModel()
    # Checkpointing is disabled; ``config`` entries become Trainer kwargs.
    trainer = get_trainer(
        dir,
        use_gpu=use_gpu,
        callbacks=callbacks,
        plugins=[plugin],
        checkpoint_callback=False,
        **config)
    trainer.fit(model)
def test_unused_parameters(tmpdir, ray_start_2_cpus):
    """Tests if find_unused_parameters is properly passed to model."""
    boring_model = BoringModel()
    ddp_plugin = RayPlugin(
        num_workers=2, use_gpu=False, find_unused_parameters=False)

    class UnusedParameterCallback(Callback):
        def on_train_start(self, trainer, pl_module):
            # The flag must have propagated to the wrapped DDP model.
            assert trainer.model.find_unused_parameters is False

    trainer = get_trainer(
        tmpdir,
        plugins=[ddp_plugin],
        callbacks=[UnusedParameterCallback()])
    trainer.fit(boring_model)
def test_actor_creation(tmpdir, ray_start_2_cpus, num_workers):
    """Tests whether the appropriate number of training actors are created."""
    boring_model = BoringModel()

    def assert_worker_count():
        # Runs while training is live: one Ray actor per worker expected.
        assert len(ray.actors()) == num_workers

    boring_model.on_epoch_end = assert_worker_count
    trainer = get_trainer(
        tmpdir, plugins=[RayPlugin(num_workers=num_workers)])
    trainer.fit(boring_model)
    # After fit() returns, every training actor must have been torn down.
    for actor_info in ray.actors().values():
        assert actor_info["State"] == ray.gcs_utils.ActorTableData.DEAD
def test_model_to_gpu(tmpdir, ray_start_2_gpus):
    """Tests if model is placed on CUDA device."""
    boring_model = BoringModel()

    class CheckGPUCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            # Any parameter being on CUDA implies the module was moved.
            assert next(pl_module.parameters()).is_cuda

    trainer = get_trainer(
        tmpdir,
        plugins=[RayPlugin(num_workers=2, use_gpu=True)],
        use_gpu=True,
        callbacks=[CheckGPUCallback()])
    trainer.fit(boring_model)
def test_predict(tmpdir, ray_start_2_cpus, seed, num_workers):
    """Tests if trained model has high accuracy on test set."""
    hparams = {
        "layer_1": 32,
        "layer_2": 32,
        "lr": 1e-2,
        "batch_size": 32,
    }
    classifier = LightningMNISTClassifier(hparams, tmpdir)
    data_module = MNISTDataModule(
        data_dir=tmpdir, num_workers=1, batch_size=hparams["batch_size"])
    trainer = get_trainer(
        tmpdir,
        limit_train_batches=20,
        max_epochs=1,
        plugins=[RayPlugin(num_workers=num_workers, use_gpu=False)])
    predict_test(trainer, classifier, data_module)
def test_early_stop(tmpdir, ray_start_2_cpus):
    """Tests if early stopping callback works correctly."""
    boring_model = BoringModel()
    ray_accelerator = RayAccelerator(num_workers=1, use_gpu=False)
    stopper = EarlyStopping(monitor="val_loss", patience=2, verbose=True)
    # max_epochs is deliberately huge; early stopping must cut it short.
    trainer = get_trainer(
        tmpdir,
        max_epochs=500,
        accelerator=ray_accelerator,
        callbacks=[stopper],
        limit_train_batches=1.0,
        limit_val_batches=1.0,
        progress_bar_refresh_rate=1)
    trainer.fit(boring_model)
    best_path = trainer.checkpoint_callback.best_model_path
    restored = BoringModel.load_from_checkpoint(best_path)
    # With patience=2 the run should have stopped after val epoch 2.
    assert restored.val_epoch == 2, restored.val_epoch
def test_correct_devices(tmpdir, ray_start_2_gpus):
    """Tests if GPU devices are correctly set.

    Each worker should see exactly one visible CUDA device matching its
    local rank, and both Lightning and the model should agree on it.
    """
    model = BoringModel()

    class CheckDevicesCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            assert trainer.root_gpu == 0
            # Each worker is pinned to a single GPU via CUDA_VISIBLE_DEVICES.
            assert int(os.environ["CUDA_VISIBLE_DEVICES"]) == \
                trainer.local_rank
            assert trainer.root_gpu == pl_module.device.index
            assert torch.cuda.current_device() == trainer.root_gpu

    plugin = RayPlugin(num_workers=2, use_gpu=True)
    # Pass the plugin in a list for consistency with the other tests
    # (the original passed the bare plugin object).
    trainer = get_trainer(
        tmpdir,
        plugins=[plugin],
        use_gpu=True,
        callbacks=[CheckDevicesCallback()])
    trainer.fit(model)
def test_predict_client(tmpdir, start_ray_client_server_2_cpus, seed,
                        num_slots):
    """Tests test-set accuracy when training over a Ray client connection."""
    assert ray.util.client.ray.is_connected()
    hparams = {
        "layer_1": 32,
        "layer_2": 32,
        "lr": 1e-2,
        "batch_size": 32,
    }
    classifier = LightningMNISTClassifier(hparams, tmpdir)
    data_module = MNISTDataModule(
        data_dir=tmpdir, num_workers=1, batch_size=hparams["batch_size"])
    trainer = get_trainer(
        tmpdir,
        limit_train_batches=20,
        max_epochs=1,
        plugins=[HorovodRayPlugin(num_slots=num_slots, use_gpu=False)])
    predict_test(trainer, classifier, data_module)
def test_predict_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if trained model has high accuracy on test set."""
    hparams = {
        "layer_1": 32,
        "layer_2": 32,
        "lr": 1e-2,
        "batch_size": 32,
    }
    classifier = LightningMNISTClassifier(hparams, tmpdir)
    data_module = MNISTDataModule(
        data_dir=tmpdir, num_workers=1, batch_size=hparams["batch_size"])
    horovod_accelerator = HorovodRayAccelerator(
        num_slots=num_slots, use_gpu=True)
    trainer = get_trainer(
        tmpdir,
        limit_train_batches=10,
        max_epochs=1,
        accelerator=horovod_accelerator,
        use_gpu=True)
    predict_test(trainer, classifier, data_module)
def test_train_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if training modifies model weights."""
    boring_model = BoringModel()
    horovod_accelerator = HorovodRayAccelerator(
        num_slots=num_slots, use_gpu=True)
    train_test(
        get_trainer(tmpdir, accelerator=horovod_accelerator, use_gpu=True),
        boring_model)
def test_load(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if model checkpoint can be loaded."""
    boring_model = BoringModel()
    ray_accelerator = RayAccelerator(num_workers=num_workers, use_gpu=False)
    load_test(get_trainer(tmpdir, accelerator=ray_accelerator), boring_model)
def test_train(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if training modifies model weights."""
    boring_model = BoringModel()
    ray_accelerator = RayAccelerator(num_workers=num_workers)
    train_test(get_trainer(tmpdir, accelerator=ray_accelerator), boring_model)
def test_train_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if training modifies model weights."""
    boring_model = BoringModel()
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=True)
    train_test(
        get_trainer(tmpdir, plugins=[horovod_plugin], use_gpu=True),
        boring_model)
def test_train(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if training modifies model weights."""
    boring_model = BoringModel()
    ray_plugin = RayPlugin(num_workers=num_workers)
    train_test(get_trainer(tmpdir, plugins=[ray_plugin]), boring_model)
def test_load(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if model checkpoint can be loaded."""
    boring_model = BoringModel()
    ray_plugin = RayPlugin(num_workers=num_workers, use_gpu=False)
    load_test(get_trainer(tmpdir, plugins=[ray_plugin]), boring_model)
def test_train_client(tmpdir, start_ray_client_server_2_cpus, num_workers):
    """Tests that training over a Ray client connection updates the model."""
    # The fixture must have established a live client connection.
    assert ray.util.client.ray.is_connected()
    boring_model = BoringModel()
    ray_plugin = RayPlugin(num_workers=num_workers)
    train_test(get_trainer(tmpdir, plugins=[ray_plugin]), boring_model)
def test_load_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if model checkpoint can be loaded."""
    boring_model = BoringModel()
    horovod_plugin = HorovodRayPlugin(num_slots=num_slots, use_gpu=True)
    load_test(
        get_trainer(tmpdir, plugins=[horovod_plugin], use_gpu=True),
        boring_model)
def test_load_gpu(tmpdir, ray_start_2_gpus, seed, num_slots):
    """Tests if model checkpoint can be loaded."""
    boring_model = BoringModel()
    horovod_accelerator = HorovodRayAccelerator(
        num_slots=num_slots, use_gpu=True)
    load_test(
        get_trainer(tmpdir, accelerator=horovod_accelerator, use_gpu=True),
        boring_model)