import os

import pytest
import torch
from torch.utils.data.distributed import DistributedSampler

import pytorch_lightning as pl
from pytorch_lightning import Callback
from pytorch_lightning.callbacks import EarlyStopping

import ray

# BoringModel, RayAccelerator, the MNIST helpers, and the shared
# `ray_start_*` fixtures and `*_test` utilities are assumed to be provided
# by this project's test helpers; hedged sketches of the most heavily used
# ones appear below.


def test_distributed_sampler(tmpdir, ray_start_2_cpus):
    """Tests if distributed sampler is properly set."""
    model = BoringModel()
    train_dataloader = model.train_dataloader()
    initial_sampler = train_dataloader.sampler
    assert not isinstance(initial_sampler, DistributedSampler)

    class DistributedSamplerCallback(Callback):
        def on_train_start(self, trainer, pl_module):
            train_sampler = trainer.train_dataloader.sampler
            assert isinstance(train_sampler, DistributedSampler)
            assert train_sampler.shuffle
            assert train_sampler.num_replicas == 2
            assert train_sampler.rank == trainer.global_rank

        def on_validation_start(self, trainer, pl_module):
            val_sampler = trainer.val_dataloaders[0].sampler
            assert isinstance(val_sampler, DistributedSampler)
            assert not val_sampler.shuffle
            assert val_sampler.num_replicas == 2
            assert val_sampler.rank == trainer.global_rank

        def on_test_start(self, trainer, pl_module):
            test_sampler = trainer.test_dataloaders[0].sampler
            assert isinstance(test_sampler, DistributedSampler)
            assert not test_sampler.shuffle
            assert test_sampler.num_replicas == 2
            assert test_sampler.rank == trainer.global_rank

    accelerator = RayAccelerator(num_workers=2)
    trainer = get_trainer(
        tmpdir,
        accelerator=accelerator,
        callbacks=[DistributedSamplerCallback()])
    trainer.fit(model)

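# A minimal sketch of the shared fixture and trainer factory these tests
# rely on. The exact defaults (batch limits, progress bar setting) are
# assumptions, not the canonical implementations.
@pytest.fixture
def ray_start_2_cpus():
    # Stand up a local 2-CPU Ray cluster for the duration of one test.
    address_info = ray.init(num_cpus=2)
    yield address_info
    ray.shutdown()


def get_trainer(dir,
                accelerator,
                use_gpu=False,
                max_epochs=1,
                limit_train_batches=10,
                limit_val_batches=10,
                progress_bar_refresh_rate=0,
                callbacks=None):
    # Build a short-running Trainer wired to the given Ray accelerator.
    callbacks = callbacks or []
    return pl.Trainer(
        default_root_dir=dir,
        callbacks=callbacks,
        accelerator=accelerator,
        gpus=int(use_gpu),
        max_epochs=max_epochs,
        limit_train_batches=limit_train_batches,
        limit_val_batches=limit_val_batches,
        progress_bar_refresh_rate=progress_bar_refresh_rate)
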
def test_multi_node(tmpdir):
    """Tests if multi-node GPU training works."""
    ray.init("auto")
    # `available_resources` reports floats; the worker count must be an int.
    num_gpus = int(ray.available_resources()["GPU"])
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=num_gpus, use_gpu=True)
    trainer = get_trainer(tmpdir, accelerator=accelerator, use_gpu=True)
    train_test(trainer, model)

# The parametrization below is assumed (pytest must supply `num_workers`);
# [1, 2] fits the 2-CPU fixture.
@pytest.mark.parametrize("num_workers", [1, 2])
def test_actor_creation(tmpdir, ray_start_2_cpus, num_workers):
    """Tests whether the appropriate number of training actors are created."""
    model = BoringModel()

    def check_num_actor():
        assert len(ray.actors()) == num_workers

    model.on_epoch_end = check_num_actor
    accelerator = RayAccelerator(num_workers=num_workers)
    trainer = get_trainer(tmpdir, accelerator=accelerator)
    trainer.fit(model)
    # All training actors should be dead once training completes.
    assert all(actor["State"] == ray.gcs_utils.ActorTableData.DEAD
               for actor in ray.actors().values())

def test_model_to_gpu(tmpdir, ray_start_2_gpus):
    """Tests if model is placed on CUDA device."""
    model = BoringModel()

    class CheckGPUCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            assert next(pl_module.parameters()).is_cuda

    accelerator = RayAccelerator(num_workers=2, use_gpu=True)
    trainer = get_trainer(
        tmpdir,
        accelerator=accelerator,
        use_gpu=True,
        callbacks=[CheckGPUCallback()])
    trainer.fit(model)

def test_early_stop(tmpdir, ray_start_2_cpus):
    """Tests if early stopping callback works correctly."""
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=1, use_gpu=False)
    early_stop = EarlyStopping(monitor="val_loss", patience=2, verbose=True)
    trainer = get_trainer(
        tmpdir,
        max_epochs=500,
        accelerator=accelerator,
        callbacks=[early_stop],
        limit_train_batches=1.0,
        limit_val_batches=1.0,
        progress_bar_refresh_rate=1)
    trainer.fit(model)
    trained_model = BoringModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)
    assert trained_model.val_epoch == 2, trained_model.val_epoch

def train_mnist(config,
                data_dir=None,
                num_epochs=10,
                num_workers=1,
                use_gpu=False,
                callbacks=None):
    model = MNISTClassifier(config, data_dir)
    callbacks = callbacks or []
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=int(use_gpu),
        callbacks=callbacks,
        accelerator=RayAccelerator(num_workers=num_workers, use_gpu=use_gpu))
    trainer.fit(model)

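# Illustrative invocation of `train_mnist`; the config keys mirror those
# used in `test_predict` below and are assumptions about what
# MNISTClassifier expects:
#
#     config = {"layer_1": 32, "layer_2": 64, "lr": 1e-3, "batch_size": 32}
#     train_mnist(config, data_dir="./mnist", num_epochs=2, num_workers=2)
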
@pytest.mark.parametrize("num_workers", [1, 2])  # assumed, as above
def test_predict(tmpdir, ray_start_2_cpus, seed, num_workers):
    """Tests if trained model has high accuracy on test set."""
    config = {
        "layer_1": 32,
        "layer_2": 32,
        "lr": 1e-2,
        "batch_size": 32,
    }
    model = LightningMNISTClassifier(config, tmpdir)
    dm = MNISTDataModule(
        data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
    accelerator = RayAccelerator(num_workers=num_workers, use_gpu=False)
    trainer = get_trainer(
        tmpdir, limit_train_batches=10, max_epochs=1, accelerator=accelerator)
    predict_test(trainer, model, dm)

def test_correct_devices(tmpdir, ray_start_2_gpus):
    """Tests if GPU devices are correctly set."""
    model = BoringModel()

    class CheckDevicesCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            assert trainer.root_gpu == 0
            assert int(os.environ["CUDA_VISIBLE_DEVICES"]) == \
                trainer.local_rank
            assert trainer.root_gpu == pl_module.device.index
            assert torch.cuda.current_device() == trainer.root_gpu

    accelerator = RayAccelerator(num_workers=2, use_gpu=True)
    trainer = get_trainer(
        tmpdir,
        accelerator=accelerator,
        use_gpu=True,
        callbacks=[CheckDevicesCallback()])
    trainer.fit(model)

@pytest.mark.parametrize("num_workers", [1, 2])  # assumed, as above
def test_load(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if model checkpoint can be loaded."""
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=num_workers, use_gpu=False)
    trainer = get_trainer(tmpdir, accelerator=accelerator)
    load_test(trainer, model)

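# A minimal sketch of the shared `load_test` helper, assuming it verifies
# that a checkpoint written during training can be restored:
def load_test(trainer, model):
    trainer.fit(model)
    # Reload the best checkpoint and make sure restoration succeeds.
    trained_model = BoringModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)
    assert trained_model is not None
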
@pytest.mark.parametrize("num_workers", [1, 2])  # assumed, as above
def test_train(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if training modifies model weights."""
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=num_workers)
    trainer = get_trainer(tmpdir, accelerator=accelerator)
    train_test(trainer, model)

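# A minimal sketch of the shared `train_test` helper, assuming it verifies
# that fitting actually moves the weights (the 0.1 threshold is an
# assumption, not the canonical value):
def train_test(trainer, model):
    initial_values = torch.tensor(
        [torch.sum(torch.abs(x)) for x in model.parameters()])
    trainer.fit(model)
    post_train_values = torch.tensor(
        [torch.sum(torch.abs(x)) for x in model.parameters()])
    # Training should change the parameters by a nontrivial amount.
    assert torch.norm(initial_values - post_train_values) > 0.1
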
def test_checkpoint_ddp(tmpdir, ray_start_4_cpus):
    """Tests if Tune checkpointing works with RayAccelerator."""
    accelerator = RayAccelerator(num_workers=2, use_gpu=False)
    checkpoint_test(tmpdir, accelerator)

def test_tune_iteration_ddp(tmpdir, ray_start_4_cpus):
    """Tests if each RayAccelerator worker runs the correct number of
    iterations when driven by Ray Tune."""
    accelerator = RayAccelerator(num_workers=2, use_gpu=False)
    tune_test(tmpdir, accelerator)