def test_multi_gpu_none_backend(tmpdir):
    """Make sure when using multiple GPUs the user can't use `distributed_backend = None`."""
    tutils.set_random_master_port()
    trainer_options = dict(
        default_root_dir=tmpdir,
        distributed_backend=None,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        limit_train_batches=0.2,
        limit_val_batches=0.2,
        gpus=2
    )

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model)
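
All of the tests in this listing call `tutils.set_random_master_port()` before exercising a distributed backend. The helper itself is not shown here; a minimal sketch of what it is assumed to do (ask the OS for a free TCP port and export it as `MASTER_PORT`, so concurrently running tests do not collide) could look like this. `find_free_network_port` is part of the sketch, not necessarily the real helper's name:

import os
import socket


def find_free_network_port() -> int:
    # Binding to port 0 makes the OS pick an unused port for us.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('', 0))
        return s.getsockname()[1]


def set_random_master_port():
    # The distributed init method reads MASTER_PORT from the environment.
    os.environ['MASTER_PORT'] = str(find_free_network_port())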
Example #2
def test_running_test_pretrained_model_distrib_dp(tmpdir):
    """Verify `test()` on pretrained model."""
    tutils.set_random_master_port()

    model = EvalModelTemplate()

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    trainer_options = dict(
        progress_bar_refresh_rate=0,
        max_epochs=2,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
        checkpoint_callback=checkpoint,
        logger=logger,
        gpus=[0, 1],
        distributed_backend='dp',
        default_root_dir=tmpdir,
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    # correct result and ok accuracy
    assert result == 1, 'training failed to complete'
    pretrained_model = EvalModelTemplate.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)

    # run test set
    new_trainer = Trainer(**trainer_options)
    results = new_trainer.test(pretrained_model)
    pretrained_model.cpu()

    # test we have good test accuracy
    acc = results[0]['test_acc']
    assert acc > 0.5, f"Model failed to get expected 0.5 accuracy. test_acc = {acc}"

    dataloaders = model.test_dataloader()
    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    for dataloader in dataloaders:
        tpipes.run_prediction(dataloader, pretrained_model)
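
In the source test suite these GPU tests carry pytest guards so they are skipped on machines without enough CUDA devices; the decorators were lost in this listing. A typical guard (plain pytest; the exact markers in the original repository may differ) looks like:

import pytest
import torch


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="test requires multi-GPU machine")
def test_running_test_pretrained_model_distrib_dp(tmpdir):
    ...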
Example #3
def test_sync_batchnorm_ddp(tmpdir):
    seed_everything(234)
    set_random_master_port()

    # define datamodule and dataloader
    dm = MNISTDataModule()
    dm.prepare_data()
    dm.setup(stage=None)

    train_dataloader = dm.train_dataloader()
    model = SyncBNModule()

    bn_outputs = []

    # shuffle is false by default
    for batch_idx, batch in enumerate(train_dataloader):
        x, _ = batch

        _, out_bn = model.forward(x, batch_idx)
        bn_outputs.append(out_bn)

        # get 3 steps
        if batch_idx == 2:
            break

    bn_outputs = [x.cuda() for x in bn_outputs]

    # reset datamodule
    # batch-size = 16 because 2 GPUs in DDP
    dm = MNISTDataModule(batch_size=16, dist_sampler=True)
    dm.prepare_data()
    dm.setup(stage=None)

    model = SyncBNModule(gpu_count=2, bn_targets=bn_outputs)

    trainer = Trainer(
        default_root_dir=tmpdir,
        gpus=2,
        num_nodes=1,
        distributed_backend='ddp_spawn',
        max_epochs=1,
        max_steps=3,
        sync_batchnorm=True,
        num_sanity_val_steps=0,
        replace_sampler_ddp=False,
    )

    result = trainer.fit(model, dm)
    assert result == 1, "Sync batchnorm failing with DDP"
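
Passing `sync_batchnorm=True` tells Lightning to convert the model's `BatchNorm` layers so that their statistics are reduced across the DDP processes. The manual equivalent, using the plain PyTorch API this wraps, is:

import torch

model = SyncBNModule()
# Swap every torch.nn.BatchNorm*d layer for a SyncBatchNorm layer;
# running statistics are then computed over the whole process group.
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)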
Example #4
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    """Make sure DDP works. with early stopping"""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        callbacks=[EarlyStopping()],
        max_epochs=50,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        distributed_backend='ddp_spawn',
    )

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model)
Example #5
def test_result_reduce_horovod(enable_pl_optimizer, tmpdir):
    """Make sure result logging works with Horovod.

    This test mirrors tests/core/test_results.py::_ddp_test_fn
    """
    tutils.reset_seed()
    tutils.set_random_master_port()

    def hvd_test_fn():
        path_here = os.path.abspath(os.path.dirname(__file__))
        path_root = os.path.abspath(os.path.join(path_here, '..', '..'))
        sys.path.insert(0, os.path.abspath(path_root))

        class TestModel(BoringModel):
            def training_step(self, batch, batch_idx):
                self.training_step_called = True

                tensor = torch.tensor([1.0])
                self.log("test_tensor", tensor, sync_dist=True, sync_dist_op='sum',
                         on_step=True, on_epoch=True)

                res = self._results

                # Check that `tensor` is summed across all ranks automatically
                assert res["test_tensor"].item() == hvd.size(), \
                    "Result-Log does not work properly with Horovod and Tensors"

            def training_epoch_end(self, outputs) -> None:
                assert len(outputs) == 0

        model = TestModel()
        model.val_dataloader = None

        trainer = Trainer(
            default_root_dir=tmpdir,
            limit_train_batches=2,
            limit_val_batches=2,
            max_epochs=1,
            log_every_n_steps=1,
            weights_summary=None,
            enable_pl_optimizer=enable_pl_optimizer,
        )

        trainer.fit(model)

    horovod.run(hvd_test_fn, np=2)
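
`sync_dist=True, sync_dist_op='sum'` asks Lightning to reduce the logged tensor across workers before storing it, which is why the assertion expects `hvd.size()` rather than `1.0`. Under Horovod the underlying reduction is roughly the following (depending on the Horovod version, the op may instead be selected with `average=False`):

import torch
import horovod.torch as hvd

hvd.init()
tensor = torch.tensor([1.0])
# Sum the value from every worker; with two workers each holding 1.0,
# every rank receives 2.0 == hvd.size().
summed = hvd.allreduce(tensor, op=hvd.Sum)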
Example #6
def test_multi_gpu_model_ddp_spawn(tmpdir):
    tutils.set_random_master_port()

    trainer_options = dict(default_root_dir=tmpdir,
                           max_epochs=1,
                           limit_train_batches=10,
                           limit_val_batches=10,
                           gpus=[0, 1],
                           distributed_backend='ddp_spawn',
                           progress_bar_refresh_rate=0)

    model = EvalModelTemplate()

    tpipes.run_model_test(trainer_options, model)

    # test memory helper functions
    memory.get_memory_profile('min_max')
Example #7
def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
    """Make sure DDP works with dataloaders passed to fit()"""
    tutils.set_random_master_port()

    model = EvalModelTemplate()
    fit_options = dict(train_dataloader=model.train_dataloader(),
                       val_dataloaders=model.val_dataloader())

    trainer = Trainer(default_root_dir=tmpdir,
                      progress_bar_refresh_rate=0,
                      max_epochs=1,
                      limit_train_batches=0.2,
                      limit_val_batches=0.2,
                      gpus=[0, 1],
                      distributed_backend='ddp_spawn')
    result = trainer.fit(model, **fit_options)
    assert result == 1, "DDP doesn't work with dataloaders passed to fit()."
Example #8
def run_test_from_config(trainer_options):
    """Trains the default model with the given config."""
    set_random_master_port()

    ckpt_path = trainer_options['default_root_dir']
    trainer_options.update(checkpoint_callback=ModelCheckpoint(ckpt_path))

    model = EvalModelTemplate()
    run_model_test(trainer_options, model, on_gpu=args.on_gpu, version=0, with_hpc=False)

    # Horovod should be initialized following training. If not, this will raise an exception.
    assert hvd.size() == 2

    if args.on_gpu:
        trainer = Trainer(gpus=1, distributed_backend='horovod', max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()
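
The `run_test_from_config` variants in this listing reference a module-level `args` object that is never defined here. In the original Horovod driver script it comes from argparse; a plausible sketch (the option names are assumptions) is:

import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument('--trainer-options', required=True,
                    help='JSON-encoded dict of Trainer keyword arguments')
parser.add_argument('--on-gpu', action='store_true', default=False)

if __name__ == "__main__":
    args = parser.parse_args()
    run_test_from_config(json.loads(args.trainer_options))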
Example #9
def test_multi_cpu_model_ddp(tmpdir):
    """Make sure DDP works."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
        gpus=None,
        num_processes=2,
        distributed_backend='ddp_cpu',
    )

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model, on_gpu=False)
Example #10
def test_single_gpu_test(tmpdir):
    tutils.set_random_master_port()

    model = EvalModelTemplate()
    trainer = pl.Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0],
    )
    trainer.fit(model)
    assert 'ckpt' in trainer.checkpoint_callback.best_model_path
    results = trainer.test()
    assert 'test_acc' in results[0]

    results = trainer.test(model)
    assert 'test_acc' in results[0]
Example #11
def run_test_from_config(trainer_options):
    """Trains the default model with the given config."""
    set_random_master_port()
    reset_seed()

    ckpt_path = trainer_options['weights_save_path']
    trainer_options.update(checkpoint_callback=ModelCheckpoint(ckpt_path))

    model = EvalModelTemplate()

    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result == 1

    # Horovod should be initialized following training. If not, this will raise an exception.
    assert hvd.size() == 2

    if trainer.global_rank > 0:
        # on higher ranks the checkpoint location is unknown
        # we want to test checkpointing on rank 0 only
        assert not hasattr(trainer, 'ckpt_path')
        assert not trainer.checkpoint_callback.best_model_path
        return

    # test model loading
    pretrained_model = EvalModelTemplate.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        run_prediction(dataloader, pretrained_model)

    # test HPC loading / saving
    trainer.hpc_save(ckpt_path, trainer.logger)
    trainer.hpc_load(ckpt_path, on_gpu=args.on_gpu)

    if args.on_gpu:
        trainer = Trainer(gpus=1, distributed_backend='horovod', max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()
Example #12
def test_amp_multi_gpu(tmpdir, backend):
    """Make sure DP/DDP + AMP work."""
    tutils.set_random_master_port()

    model = EvalModelTemplate()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        # gpus=2,
        gpus='0, 1',  # test init with gpu string
        distributed_backend=backend,
        precision=16,
    )

    # tutils.run_model_test(trainer_options, model)
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result
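
The `gpus='0, 1'` string above exercises the Trainer's flexible device specification; the same two-GPU setup can be requested in any of these equivalent forms:

from pytorch_lightning import Trainer

Trainer(gpus=2)        # first two visible GPUs
Trainer(gpus=[0, 1])   # explicit device indices
Trainer(gpus='0, 1')   # comma-separated string, as in the test above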
Example #13
def run_test_from_config(trainer_options):
    """Trains the default model with the given config."""
    set_random_master_port()
    reset_seed()

    ckpt_path = trainer_options['weights_save_path']
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])

    model = EvalModelTemplate()

    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result == 1

    # Horovod should be initialized following training. If not, this will raise an exception.
    assert hvd.size() == 2

    if trainer.global_rank > 0:
        return

    # test model loading
    pretrained_model = EvalModelTemplate.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        run_prediction(dataloader, pretrained_model)

    # test HPC saving
    trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger)
    # test HPC loading
    checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(
        ckpt_path)
    trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=args.on_gpu)

    if args.on_gpu:
        trainer = Trainer(gpus=1, accelerator='horovod', max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()
Example #14
def test_model_saves_on_multi_gpu(tmpdir):
    """Test that ONNX model saves on a distributed backend"""
    tutils.set_random_master_port()

    trainer_options = dict(default_root_dir=tmpdir,
                           max_epochs=1,
                           limit_train_batches=10,
                           limit_val_batches=10,
                           gpus=[0, 1],
                           distributed_backend='ddp_spawn',
                           progress_bar_refresh_rate=0)

    model = EvalModelTemplate()

    tpipes.run_model_test(trainer_options, model)

    file_path = os.path.join(tmpdir, "model.onnx")
    model.to_onnx(file_path)
    assert os.path.exists(file_path)
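
A quick way to confirm the exported file really is a loadable ONNX graph (a follow-up check that is not part of the original test; it assumes `onnxruntime` is installed):

import onnxruntime

# Building an inference session parses and validates the model in one step.
session = onnxruntime.InferenceSession(file_path)
print([inp.name for inp in session.get_inputs()])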
Example #15
def test_multi_gpu_early_stop(tmpdir, backend):
    """Make sure DDP works with early stopping."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        early_stop_callback=True,
        max_epochs=50,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        distributed_backend=backend,
    )

    model = EvalModelTemplate()
    # tutils.run_model_test(trainer_options, model)
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result
Example #16
def test_multi_gpu_wandb(tmpdir, backend):
    """Make sure DP/DDP + AMP work."""
    from pytorch_lightning.loggers import WandbLogger
    tutils.set_random_master_port()

    model = EvalModelTemplate()
    logger = WandbLogger(name='utest')

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=2,
        distributed_backend=backend,
        precision=16,
        logger=logger,
    )
    # tutils.run_model_test(trainer_options, model)
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result
    trainer.test(model)
Example #17
def test_model_pickable(tmpdir, metric: Metric):
    """Make sure that metrics are pickable by including into a model and running in multi-gpu mode"""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=10,
        gpus=[0, 1],
        distributed_backend="ddp_spawn",
    )

    model = EvalModelTemplate()
    model.metric = metric()
    model.training_step = model.training_step__using_metrics

    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    # correct result and ok accuracy
    assert result == 1, "ddp model failed to complete"
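
`ddp_spawn` sends a pickled copy of the whole model to each worker process, which is what makes this a picklability test. The same property can be checked directly with a round trip (a standalone sketch, not part of the original test):

import pickle

model = EvalModelTemplate()
model.metric = metric()
# Each ddp_spawn worker receives a pickled copy of the model, so the
# attached metric must survive a full pickle round trip.
restored = pickle.loads(pickle.dumps(model))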
Example #18
def test_multi_gpu_model(tmpdir, backend):
    """Make sure DDP works."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
        gpus=[0, 1],
        distributed_backend=backend,
    )

    model = EvalModelTemplate()
    # tutils.run_model_test(trainer_options, model)
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result

    # test memory helper functions
    memory.get_memory_profile('min_max')
Example #19
def test_amp_gpu_ddp_slurm_managed(tmpdir):
    """Make sure DDP + AMP work."""
    # simulate setting slurm flags
    tutils.set_random_master_port()
    os.environ['SLURM_LOCALID'] = str(0)

    model = EvalModelTemplate()

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    # fit model
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=[0],
        distributed_backend='ddp_spawn',
        precision=16,
        checkpoint_callback=checkpoint,
        logger=logger,
    )
    trainer.is_slurm_managing_tasks = True
    result = trainer.fit(model)

    # correct result and ok accuracy
    assert result == 1, 'amp + ddp model failed to complete'

    # test root model address
    assert trainer.accelerator_connector.resolve_root_node_address(
        'abc') == 'abc'
    assert trainer.accelerator_connector.resolve_root_node_address(
        'abc[23]') == 'abc23'
    assert trainer.accelerator_connector.resolve_root_node_address(
        'abc[23-24]') == 'abc23'
    assert trainer.accelerator_connector.resolve_root_node_address(
        'abc[23-24, 45-40, 40]') == 'abc23'
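
The assertions above pin down the node-list resolution rule for SLURM hostnames: keep the host prefix and the first number of the first range. A minimal re-implementation consistent with those assertions (a sketch, not Lightning's actual code) is:

import re


def resolve_root_node_address(root_node: str) -> str:
    # 'abc'                   -> 'abc'
    # 'abc[23-24, 45-40, 40]' -> 'abc23'
    if '[' not in root_node:
        return root_node
    name, node_ids = root_node.rstrip(']').split('[')
    # Keep only the first id of the first range.
    first_id = re.split(r'[,-]', node_ids)[0].strip()
    return name + first_id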
Example #20
def test_multi_gpu_wandb_ddp_spawn(tmpdir):
    """Make sure DP/DDP + AMP work."""
    from pytorch_lightning.loggers import WandbLogger
    tutils.set_random_master_port()

    model = EvalModelTemplate()

    wandb.run = MagicMock()
    wandb.init(name='name', project='project')

    logger = WandbLogger(name='name', offline=True)
    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=2,
        distributed_backend='ddp_spawn',
        precision=16,
        logger=logger,
    )
    # tutils.run_model_test(trainer_options, model)
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result
    trainer.test(model)
Example #21
def test_numpy_metric_ddp():
    tutils.reset_seed()
    tutils.set_random_master_port()
    world_size = 2
    mp.spawn(_ddp_test_numpy_metric, args=(world_size,), nprocs=world_size)
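
`_ddp_test_numpy_metric` itself is not shown in this listing. A worker spawned by `mp.spawn` receives its process index as the first positional argument and typically initializes the default process group before exercising the metric; a hypothetical shape for it:

import os
import torch.distributed as dist


def _ddp_test_numpy_metric(rank, world_size):
    # MASTER_PORT was already exported by set_random_master_port() above.
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    dist.init_process_group('gloo', rank=rank, world_size=world_size)
    # ... build the numpy-backed metric and assert it reduces across ranks
    dist.destroy_process_group()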