def test_distributed_sampler(tmpdir, ray_start_2_cpus):
    """Tests if distributed sampler is properly set."""
    model = BoringModel()
    train_dataloader = model.train_dataloader()
    initial_sampler = train_dataloader.sampler
    assert not isinstance(initial_sampler, DistributedSampler)

    class DistributedSamplerCallback(Callback):
        def on_train_start(self, trainer, pl_module):
            train_sampler = trainer.train_dataloader.sampler
            assert isinstance(train_sampler, DistributedSampler)
            assert train_sampler.shuffle
            assert train_sampler.num_replicas == 2
            assert train_sampler.rank == trainer.global_rank

        def on_validation_start(self, trainer, pl_module):
            val_sampler = trainer.val_dataloaders[0].sampler
            assert isinstance(val_sampler, DistributedSampler)
            assert not val_sampler.shuffle
            assert val_sampler.num_replicas == 2
            assert val_sampler.rank == trainer.global_rank

        def on_test_start(self, trainer, pl_module):
            test_sampler = trainer.test_dataloaders[0].sampler
            assert isinstance(test_sampler, DistributedSampler)
            assert not test_sampler.shuffle
            assert test_sampler.num_replicas == 2
            assert test_sampler.rank == trainer.global_rank

    accelerator = RayAccelerator(num_workers=2)
    trainer = get_trainer(tmpdir,
                          accelerator=accelerator,
                          callbacks=[DistributedSamplerCallback()])
    trainer.fit(model)
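
# Illustration (not from the original test suite): the attributes asserted in the
# callback above are plain fields on torch's DistributedSampler, shown here on a
# tiny in-memory dataset without any distributed setup.
def _distributed_sampler_sketch():
    import torch
    from torch.utils.data import TensorDataset, DistributedSampler

    dataset = TensorDataset(torch.randn(8, 2))
    sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
    assert sampler.num_replicas == 2
    assert sampler.rank == 0
    assert sampler.shuffle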
def test_multi_node(tmpdir):
    """Tests if multi-node GPU training works."""
    ray.init("auto")
    # Ray reports resource counts as floats; cast so num_workers gets an int.
    num_gpus = int(ray.available_resources()["GPU"])
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=num_gpus, use_gpu=True)
    trainer = get_trainer(tmpdir, accelerator=accelerator, use_gpu=True)
    train_test(trainer, model)
def test_actor_creation(tmpdir, ray_start_2_cpus, num_workers):
    """Tests whether the appropriate number of training actors are created."""
    model = BoringModel()

    def check_num_actor():
        assert len(ray.actors()) == num_workers

    model.on_epoch_end = check_num_actor
    accelerator = RayAccelerator(num_workers=num_workers)
    trainer = get_trainer(tmpdir, accelerator=accelerator)
    trainer.fit(model)
    assert all(actor["State"] == ray.gcs_utils.ActorTableData.DEAD
               for actor in list(ray.actors().values()))
def test_model_to_gpu(tmpdir, ray_start_2_gpus):
    """Tests if model is placed on CUDA device."""
    model = BoringModel()

    class CheckGPUCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            assert next(pl_module.parameters()).is_cuda

    accelerator = RayAccelerator(num_workers=2, use_gpu=True)
    trainer = get_trainer(tmpdir,
                          accelerator=accelerator,
                          use_gpu=True,
                          callbacks=[CheckGPUCallback()])
    trainer.fit(model)
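
# Illustration (not from the original test suite): the .is_cuda flag checked by
# CheckGPUCallback is set on a module's parameters once the module is moved to a
# CUDA device; guarded here so the sketch also runs on CPU-only machines.
def _is_cuda_sketch():
    import torch

    layer = torch.nn.Linear(2, 2)
    assert not next(layer.parameters()).is_cuda
    if torch.cuda.is_available():
        layer = layer.cuda()
        assert next(layer.parameters()).is_cuda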
def test_early_stop(tmpdir, ray_start_2_cpus):
    """Tests if early stopping callback works correctly."""
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=1, use_gpu=False)
    early_stop = EarlyStopping(monitor="val_loss", patience=2, verbose=True)
    trainer = get_trainer(tmpdir,
                          max_epochs=500,
                          accelerator=accelerator,
                          callbacks=[early_stop],
                          limit_train_batches=1.0,
                          limit_val_batches=1.0,
                          progress_bar_refresh_rate=1)
    trainer.fit(model)
    trained_model = BoringModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)
    assert trained_model.val_epoch == 2, trained_model.val_epoch
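
# Sketch (assumption, not from the original tests): EarlyStopping can only act on
# a metric the LightningModule logs, so BoringModel is expected to log "val_loss"
# in its validation loop, roughly like this hypothetical fragment:
#
#     def validation_step(self, batch, batch_idx):
#         loss = self.step(batch)        # hypothetical loss computation
#         self.log("val_loss", loss)     # key matches EarlyStopping(monitor="val_loss")
#         return loss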
def train_mnist(config,
                data_dir=None,
                num_epochs=10,
                num_workers=1,
                use_gpu=False,
                callbacks=None):
    model = MNISTClassifier(config, data_dir)

    callbacks = callbacks or []

    trainer = pl.Trainer(max_epochs=num_epochs,
                         gpus=int(use_gpu),
                         callbacks=callbacks,
                         accelerator=RayAccelerator(num_workers=num_workers,
                                                    use_gpu=use_gpu))
    trainer.fit(model)
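
# Usage sketch (assumption; the config keys mirror those used in test_predict
# below, and the data_dir/num_epochs values are placeholders):
#
#     config = {"layer_1": 32, "layer_2": 64, "lr": 1e-3, "batch_size": 32}
#     train_mnist(config, data_dir="./mnist_data", num_epochs=5, num_workers=2)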
def test_predict(tmpdir, ray_start_2_cpus, seed, num_workers):
    """Tests if trained model has high accuracy on test set."""
    config = {
        "layer_1": 32,
        "layer_2": 32,
        "lr": 1e-2,
        "batch_size": 32,
    }
    model = LightningMNISTClassifier(config, tmpdir)
    dm = MNISTDataModule(data_dir=tmpdir,
                         num_workers=1,
                         batch_size=config["batch_size"])
    accelerator = RayAccelerator(num_workers=num_workers, use_gpu=False)
    trainer = get_trainer(tmpdir,
                          limit_train_batches=10,
                          max_epochs=1,
                          accelerator=accelerator)
    predict_test(trainer, model, dm)
def test_correct_devices(tmpdir, ray_start_2_gpus):
    """Tests if GPU devices are correctly set."""
    model = BoringModel()

    class CheckDevicesCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            assert trainer.root_gpu == 0
            assert int(os.environ["CUDA_VISIBLE_DEVICES"]) == \
                trainer.local_rank
            assert trainer.root_gpu == pl_module.device.index
            assert torch.cuda.current_device() == trainer.root_gpu

    accelerator = RayAccelerator(num_workers=2, use_gpu=True)
    trainer = get_trainer(tmpdir,
                          accelerator=accelerator,
                          use_gpu=True,
                          callbacks=[CheckDevicesCallback()])
    trainer.fit(model)
def test_load(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if model checkpoint can be loaded."""
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=num_workers, use_gpu=False)
    trainer = get_trainer(tmpdir, accelerator=accelerator)
    load_test(trainer, model)
def test_train(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if training modifies model weights."""
    model = BoringModel()
    accelerator = RayAccelerator(num_workers=num_workers)
    trainer = get_trainer(tmpdir, accelerator=accelerator)
    train_test(trainer, model)
def test_checkpoint_ddp(tmpdir, ray_start_4_cpus):
    """Tests if Tune checkpointing works with RayAccelerator."""
    accelerator = RayAccelerator(num_workers=2, use_gpu=False)
    checkpoint_test(tmpdir, accelerator)
def test_tune_iteration_ddp(tmpdir, ray_start_4_cpus):
    """Tests if each RayAccelerator runs the correct number of iterations."""
    """Tests whether RayAccelerator works with Ray Tune."""
    accelerator = RayAccelerator(num_workers=2, use_gpu=False)
    tune_test(tmpdir, accelerator)
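
# Tune usage sketch (assumption, not from the original tests): the train_mnist
# entry point above can also be driven by Ray Tune, with TuneReportCallback
# reporting the logged "val_loss" back to Tune after each validation epoch.
#
#     from ray import tune
#     from ray.tune.integration.pytorch_lightning import TuneReportCallback
#
#     def tune_mnist(num_samples=4):
#         config = {
#             "layer_1": tune.choice([32, 64]),
#             "layer_2": tune.choice([32, 64]),
#             "lr": tune.loguniform(1e-4, 1e-1),
#             "batch_size": tune.choice([32, 64]),
#         }
#         callbacks = [TuneReportCallback({"loss": "val_loss"}, on="validation_end")]
#         tune.run(
#             tune.with_parameters(train_mnist,
#                                  num_epochs=5,
#                                  num_workers=1,
#                                  callbacks=callbacks),
#             config=config,
#             num_samples=num_samples)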