def run_checkpoint_test(tmpdir: str,
                        save_full_weights: bool,
                        automatic_optimization: bool = True,
                        accumulate_grad_batches: int = 2):
    seed_everything(1)
    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc",
                         mode="max",
                         save_last=True,
                         save_top_k=-1)
    trainer = Trainer(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=10,
        plugins=[
            DeepSpeedPlugin(stage=3, save_full_weights=save_full_weights)
        ],
        gpus=2,
        precision=16,
        accumulate_grad_batches=accumulate_grad_batches,
        callbacks=[ck],
    )
    trainer.fit(model, datamodule=dm)

    results = trainer.test(model, datamodule=dm)
    assert results[0]["test_acc"] > 0.7

    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]["test_acc"] > 0.7
    assert saved_results == results

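    # re-create the trainer, resuming from the best checkpoint, and re-run evaluation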
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=10,
        plugins=[
            DeepSpeedPlugin(stage=3, save_full_weights=save_full_weights)
        ],
        gpus=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[ck],
        resume_from_checkpoint=ck.best_model_path,
    )
    results = trainer.test(model, datamodule=dm)
    assert results[0]["test_acc"] > 0.7

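    # reuse the test split for prediction; the final predict output is checked against the accuracy bar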
    dm.predict_dataloader = dm.test_dataloader
    results = trainer.predict(datamodule=dm)
    assert results[-1] > 0.7
Example #2
def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
    """Verify `test()` on pretrained model."""
    tutils.set_random_master_port()
    dm = ClassifDataModule()
    model = ClassificationModel()

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    trainer_options = dict(
        progress_bar_refresh_rate=0,
        max_epochs=2,
        limit_train_batches=2,
        limit_val_batches=2,
        callbacks=[checkpoint],
        logger=logger,
        gpus=[0, 1],
        accelerator='ddp_spawn',
        default_root_dir=tmpdir,
    )

    # fit model
    trainer = Trainer(**trainer_options)
    trainer.fit(model, datamodule=dm)

    log.info(os.listdir(tutils.get_data_path(logger, path_dir=tmpdir)))

    # correct result and ok accuracy
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
    pretrained_model = ClassificationModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)

    # run test set
    new_trainer = Trainer(**trainer_options)
    new_trainer.test(pretrained_model)
    pretrained_model.cpu()

    dataloaders = dm.test_dataloader()
    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    for dataloader in dataloaders:
        tpipes.run_prediction_eval_model_template(pretrained_model,
                                                  dataloader,
                                                  min_acc=0.1)
Example #3
def test_lr_monitor_param_groups(tmpdir):
    """Test that learning rates are extracted and logged for single lr scheduler."""
    tutils.reset_seed()

    class CustomClassificationModel(ClassificationModel):
        def configure_optimizers(self):
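            # two parameter groups with different learning rates; the monitor should log one rate per group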
            param_groups = [
                {"params": list(self.parameters())[:2], "lr": self.lr * 0.1},
                {"params": list(self.parameters())[2:], "lr": self.lr},
            ]

            optimizer = optim.Adam(param_groups)
            lr_scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
            return [optimizer], [lr_scheduler]

    model = CustomClassificationModel()
    dm = ClassifDataModule()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir, max_epochs=2, limit_val_batches=0.1, limit_train_batches=0.5, callbacks=[lr_monitor]
    )
    trainer.fit(model, datamodule=dm)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert lr_monitor.lrs, "No learning rates logged"
    assert len(lr_monitor.lrs) == 2 * len(
        trainer.lr_schedulers
    ), "Number of learning rates logged does not match number of param groups"
    assert lr_monitor.lr_sch_names == ["lr-Adam"]
    assert list(lr_monitor.lrs.keys()) == ["lr-Adam/pg1", "lr-Adam/pg2"], "Names of learning rates not set correctly"
Example #4
def main():
    seed_everything(4321)

    parser = ArgumentParser(add_help=False)
    parser = Trainer.add_argparse_args(parser)
    parser.add_argument("--trainer_method", default="fit")
    parser.add_argument("--tmpdir")
    parser.add_argument("--workdir")
    parser.set_defaults(gpus=2)
    parser.set_defaults(accelerator="ddp")
    args = parser.parse_args()

    dm = ClassifDataModule()
    model = ClassificationModel()
    trainer = Trainer.from_argparse_args(args)

    if args.trainer_method == "fit":
        trainer.fit(model, datamodule=dm)
        result = None
    elif args.trainer_method == "test":
        result = trainer.test(model, datamodule=dm)
    elif args.trainer_method == "fit_test":
        trainer.fit(model, datamodule=dm)
        result = trainer.test(model, datamodule=dm)
    else:
        raise ValueError(f"Unsupported: {args.trainer_method}")

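    # save the result to disk so the test harness that launched this script can check it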
    result_ext = {
        "status": "complete",
        "method": args.trainer_method,
        "result": result
    }
    file_path = os.path.join(args.tmpdir, "ddp.result")
    torch.save(result_ext, file_path)
Example #5
def test_callbacks_references_resume_from_checkpoint(tmpdir):
    """ Test that resuming from a checkpoint sets references as expected. """
    dm = ClassifDataModule()
    model = ClassificationModel()
    args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False}

    # initial training
    checkpoint = ModelCheckpoint(dirpath=tmpdir,
                                 monitor="val_loss",
                                 save_last=True)
    trainer = Trainer(**args, callbacks=[checkpoint])
    assert checkpoint is trainer.callbacks[-1] is trainer.checkpoint_callback
    trainer.fit(model, datamodule=dm)

    # resumed training
    new_checkpoint = ModelCheckpoint(dirpath=tmpdir,
                                     monitor="val_loss",
                                     save_last=True)
    # pass in a new checkpoint object, which should take
    # precedence over the one in the last.ckpt file
    trainer = Trainer(**args,
                      callbacks=[new_checkpoint],
                      resume_from_checkpoint=str(tmpdir / "last.ckpt"))
    assert checkpoint is not new_checkpoint
    assert new_checkpoint is trainer.callbacks[-1] is trainer.checkpoint_callback
    trainer.fit(model, datamodule=dm)
Example #6
def test_optimization(tmpdir):
    seed_everything(42)

    dm = ClassifDataModule(length=1024)
    model = IPUClassificationModel()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="ipu", devices=2)

    # fit model
    trainer.fit(model, dm)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert dm.trainer is not None

    # validate
    result = trainer.validate(datamodule=dm)
    assert dm.trainer is not None
    assert result[0]["val_acc"] > 0.7

    # test
    result = trainer.test(model, datamodule=dm)
    assert dm.trainer is not None
    test_result = result[0]["test_acc"]
    assert test_result > 0.6

    # test saved model
    model_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(model_path)

    model = IPUClassificationModel.load_from_checkpoint(model_path)

    trainer = Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=2)

    result = trainer.test(model, datamodule=dm)
    saved_result = result[0]["test_acc"]
    assert saved_result == test_result
Example #7
def test_dp_test(tmpdir):
    tutils.set_random_master_port()

    dm = ClassifDataModule()
    model = CustomClassificationModelDP()
    trainer = pl.Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        accelerator='dp',
    )
    trainer.fit(model, datamodule=dm)
    assert 'ckpt' in trainer.checkpoint_callback.best_model_path
    results = trainer.test(datamodule=dm)
    assert 'test_acc' in results[0]

    old_weights = model.layer_0.weight.clone().detach().cpu()

    results = trainer.test(model, datamodule=dm)
    assert 'test_acc' in results[0]

    # make sure weights didn't change
    new_weights = model.layer_0.weight.clone().detach().cpu()

    assert torch.all(torch.eq(old_weights, new_weights))
Example #8
def main():
    seed_everything(1234)

    parser = ArgumentParser(add_help=False)
    parser = Trainer.add_argparse_args(parser)
    parser.add_argument('--trainer_method', default='fit')
    parser.add_argument('--tmpdir')
    parser.add_argument('--workdir')
    parser.set_defaults(gpus=2)
    parser.set_defaults(accelerator="ddp")
    args = parser.parse_args()

    dm = ClassifDataModule()
    model = ClassificationModel()
    trainer = Trainer.from_argparse_args(args)

    if args.trainer_method == 'fit':
        trainer.fit(model, datamodule=dm)
        result = None
    elif args.trainer_method == 'test':
        result = trainer.test(model, datamodule=dm)
    elif args.trainer_method == 'fit_test':
        trainer.fit(model, datamodule=dm)
        result = trainer.test(model, datamodule=dm)
    else:
        raise ValueError(f'Unsupported: {args.trainer_method}')

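    # write the outcome to disk for the process that spawned this script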
    result_ext = {
        'status': 'complete',
        'method': args.trainer_method,
        'result': result,
    }
    file_path = os.path.join(args.tmpdir, 'ddp.result')
    torch.save(result_ext, file_path)
Example #9
def test_full_loop(tmpdir):
    reset_seed()

    dm = ClassifDataModule()
    model = ClassificationModel()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
        deterministic=True,
    )

    # fit model
    trainer.fit(model, dm)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert dm.trainer is not None

    # validate
    result = trainer.validate(datamodule=dm)
    assert dm.trainer is not None
    assert result[0]['val_acc'] > 0.7

    # test
    result = trainer.test(datamodule=dm)
    assert dm.trainer is not None
    assert result[0]['test_acc'] > 0.6
Example #10
def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir):
    """Test to ensure with Stage 3 and multiple GPUs that we can resume from training, throwing a warning that the
    optimizer state and scheduler states cannot be restored."""
    dm = ClassifDataModule()
    model = BoringModel()
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
    trainer.fit(model)
    trainer.save_checkpoint(checkpoint_path)

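    # resuming from a single consolidated checkpoint should warn that DeepSpeed optimizer/scheduler states cannot be restored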
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        strategy=DeepSpeedStrategy(stage=3, load_full_weights=True),
        accelerator="gpu",
        devices=1,
        precision=16,
    )
    with pytest.warns(
            UserWarning,
            match=
            "A single checkpoint file has been given. This means optimizer states cannot be restored. "
            "If you'd like to restore these states, you must "
            "provide a path to the originally saved DeepSpeed checkpoint.",
    ):
        trainer.fit(model, datamodule=dm, ckpt_path=checkpoint_path)
Example #11
def test_callbacks_state_resume_from_checkpoint(tmpdir):
    """ Test that resuming from a checkpoint restores callbacks that persist state. """
    dm = ClassifDataModule()
    model = ClassificationModel()
    callback_capture = CaptureCallbacksBeforeTraining()

    def get_trainer_args():
        checkpoint = ModelCheckpoint(dirpath=tmpdir,
                                     monitor="val_loss",
                                     save_last=True)
        trainer_args = dict(default_root_dir=tmpdir,
                            max_steps=1,
                            logger=False,
                            callbacks=[checkpoint, callback_capture])
        assert checkpoint.best_model_path == ""
        assert checkpoint.best_model_score is None
        return trainer_args

    # initial training
    trainer = Trainer(**get_trainer_args())
    trainer.fit(model, datamodule=dm)
    callbacks_before_resume = deepcopy(trainer.callbacks)

    # resumed training
    trainer = Trainer(**get_trainer_args(),
                      resume_from_checkpoint=str(tmpdir / "last.ckpt"))
    trainer.fit(model, datamodule=dm)

    assert len(callbacks_before_resume) == len(callback_capture.callbacks)

    for before, after in zip(callbacks_before_resume,
                             callback_capture.callbacks):
        if isinstance(before, ModelCheckpoint):
            assert before.best_model_path == after.best_model_path
            assert before.best_model_score == after.best_model_score
Example #12
def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(
        tmpdir, offload_optimizer):
    """
    Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works.
    """
    seed_everything(42)

    class VerificationCallback(Callback):
        def on_train_batch_start(self, trainer, pl_module: LightningModule,
                                 batch: Any, batch_idx: int,
                                 dataloader_idx: int) -> None:
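            # Lightning's global step should stay in sync with DeepSpeed's internal step counter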
            deepspeed_engine = trainer.training_type_plugin.model
            assert trainer.global_step == deepspeed_engine.global_steps

    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    trainer = Trainer(default_root_dir=tmpdir,
                      progress_bar_refresh_rate=0,
                      max_epochs=5,
                      plugins=[
                          DeepSpeedPlugin(stage=2,
                                          offload_optimizer=offload_optimizer)
                      ],
                      gpus=2,
                      limit_val_batches=2,
                      precision=16,
                      accumulate_grad_batches=2,
                      callbacks=[VerificationCallback()])
    trainer.fit(model, datamodule=dm)
Example #13
def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
    """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works."""
    seed_everything(42)

    class VerificationCallback(Callback):
        def __init__(self):
            self.on_train_batch_start_called = False

        def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int) -> None:
            deepspeed_engine = trainer.strategy.model
            assert trainer.global_step == deepspeed_engine.global_steps
            self.on_train_batch_start_called = True

    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    verification_callback = VerificationCallback()
    trainer = Trainer(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        # TODO: this test fails with max_epochs >1 as there are leftover batches per epoch.
        # there's divergence in how Lightning handles the last batch of the epoch with how DeepSpeed does it.
        # we step the optimizers on the last batch but DeepSpeed keeps the accumulation for the next epoch
        max_epochs=1,
        strategy=DeepSpeedStrategy(stage=2, offload_optimizer=offload_optimizer),
        gpus=2,
        limit_train_batches=5,
        limit_val_batches=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[verification_callback],
    )
    assert trainer.limit_train_batches % trainer.accumulate_grad_batches != 0, "leftover batches should be tested"
    trainer.fit(model, datamodule=dm)
    assert verification_callback.on_train_batch_start_called
Example #14
def test_deepspeed_multigpu_stage_3_resume_training(tmpdir):
    """Test to ensure with Stage 3 and single GPU that we can resume training."""
    initial_model = ModelParallelClassificationModel()
    dm = ClassifDataModule()

    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    initial_trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        strategy=DeepSpeedStrategy(stage=3),
        gpus=1,
        precision=16,
        callbacks=[ck],
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    initial_trainer.fit(initial_model, datamodule=dm)

    class TestCallback(Callback):
        def on_train_batch_start(
            self, trainer: Trainer, pl_module: LightningModule, batch: Any, batch_idx: int
        ) -> None:
            original_deepspeed_strategy = initial_trainer.strategy
            current_deepspeed_strategy = trainer.strategy

            assert isinstance(original_deepspeed_strategy, DeepSpeedStrategy)
            assert isinstance(current_deepspeed_strategy, DeepSpeedStrategy)
            # assert optimizer states are correctly loaded
            original_optimizer_dict = original_deepspeed_strategy.deepspeed_engine.optimizer.state_dict()
            current_optimizer_dict = current_deepspeed_strategy.deepspeed_engine.optimizer.state_dict()
            for orig_tensor, current_tensor in zip(
                original_optimizer_dict["fp32_flat_groups"], current_optimizer_dict["fp32_flat_groups"]
            ):
                assert torch.all(orig_tensor.eq(current_tensor))
            # assert model state is loaded correctly
            for current_param, initial_param in zip(pl_module.parameters(), initial_model.parameters()):
                assert torch.equal(current_param.cpu(), initial_param.cpu())
            # assert epoch has correctly been restored
            assert trainer.current_epoch == 1

            # assert lr-scheduler states are loaded correctly
            original_lr_scheduler = initial_trainer.lr_scheduler_configs[0].scheduler
            current_lr_scheduler = trainer.lr_scheduler_configs[0].scheduler
            assert original_lr_scheduler.state_dict() == current_lr_scheduler.state_dict()

    model = ModelParallelClassificationModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        strategy=DeepSpeedStrategy(stage=3),
        gpus=1,
        precision=16,
        callbacks=TestCallback(),
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model, datamodule=dm, ckpt_path=ck.best_model_path)
Example #15
def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches):
    seed_everything(1)
    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=10,
        strategy=DeepSpeedStrategy(stage=3),
        gpus=2,
        precision=16,
        accumulate_grad_batches=accumulate_grad_batches,
        callbacks=[ck],
    )
    trainer.fit(model, datamodule=dm)

    results = trainer.test(datamodule=dm)
    assert results[0]["test_acc"] > 0.7
    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]["test_acc"] > 0.7
    assert saved_results == results

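    # load the best checkpoint into a fresh model and confirm the accuracy bar is still met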
    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()
    trainer = Trainer(default_root_dir=tmpdir, gpus=2, strategy=DeepSpeedStrategy(stage=3), precision=16)

    results = trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
    assert results[0]["test_acc"] > 0.7
Example #16
def test_evaluate(tmpdir, trainer_kwargs):
    tutils.set_random_master_port()

    dm = ClassifDataModule()
    model = CustomClassificationModelDP()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=10,
        limit_val_batches=10,
        deterministic=True,
        **trainer_kwargs
    )

    trainer.fit(model, datamodule=dm)
    assert "ckpt" in trainer.checkpoint_callback.best_model_path

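    # snapshot the weights so the checks below can confirm evaluation leaves them untouched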
    old_weights = model.layer_0.weight.clone().detach().cpu()

    result = trainer.validate(datamodule=dm)
    assert result[0]["val_acc"] > 0.55

    result = trainer.test(datamodule=dm)
    assert result[0]["test_acc"] > 0.55

    # make sure weights didn't change
    new_weights = model.layer_0.weight.clone().detach().cpu()
    torch.testing.assert_allclose(old_weights, new_weights)
Example #17
def test_running_test_pretrained_model_distrib_dp(tmpdir):
    """Verify `test()` on pretrained model."""

    tutils.set_random_main_port()

    dm = ClassifDataModule()
    model = CustomClassificationModelDP(lr=0.1)

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    trainer_options = dict(
        enable_progress_bar=False,
        max_epochs=2,
        limit_train_batches=5,
        limit_val_batches=5,
        callbacks=[checkpoint],
        logger=logger,
        accelerator="gpu",
        devices=[0, 1],
        strategy="dp",
        default_root_dir=tmpdir,
    )

    # fit model
    trainer = Trainer(**trainer_options)
    trainer.fit(model, datamodule=dm)

    # correct result and ok accuracy
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    pretrained_model = CustomClassificationModelDP.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)

    # run test set
    new_trainer = Trainer(**trainer_options)
    new_trainer.test(pretrained_model, datamodule=dm)
    pretrained_model.cpu()

    dataloaders = dm.test_dataloader()
    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    for dataloader in dataloaders:
        tpipes.run_model_prediction(pretrained_model, dataloader)
Example #18
def test_deepspeed_multigpu_stage_3_resume_training(tmpdir):
    """Test to ensure with Stage 3 and multiple GPUs that we can resume training."""
    initial_model = ModelParallelClassificationModel()
    dm = ClassifDataModule()

    ck = ModelCheckpoint(monitor="val_acc",
                         mode="max",
                         save_last=True,
                         save_top_k=-1)
    initial_trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        plugins=DeepSpeedPlugin(stage=3),
        gpus=1,
        precision=16,
        callbacks=[ck],
    )
    initial_trainer.fit(initial_model, datamodule=dm)

    class TestCallback(Callback):
        def on_train_batch_start(self, trainer: Trainer,
                                 pl_module: LightningModule, batch: Any,
                                 batch_idx: int, dataloader_idx: int) -> None:
            original_deepspeed_plugin = initial_trainer.accelerator.training_type_plugin
            current_deepspeed_plugin = trainer.accelerator.training_type_plugin

            assert isinstance(original_deepspeed_plugin, DeepSpeedPlugin)
            assert isinstance(current_deepspeed_plugin, DeepSpeedPlugin)
            # assert optimizer states are correctly loaded
            original_optimizer_dict = original_deepspeed_plugin.deepspeed_engine.optimizer.state_dict()
            current_optimizer_dict = current_deepspeed_plugin.deepspeed_engine.optimizer.state_dict()
            for orig_tensor, current_tensor in zip(
                    original_optimizer_dict["fp32_flat_groups"],
                    current_optimizer_dict["fp32_flat_groups"]):
                assert torch.all(orig_tensor.eq(current_tensor))
            # assert model state is loaded correctly
            for current_param, initial_param in zip(
                    pl_module.parameters(), initial_model.parameters()):
                assert torch.equal(current_param.cpu(), initial_param.cpu())
            # assert epoch has correctly been restored
            assert trainer.current_epoch == 1

    model = ModelParallelClassificationModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        plugins=DeepSpeedPlugin(stage=3),
        gpus=1,
        precision=16,
        resume_from_checkpoint=ck.best_model_path,
        callbacks=TestCallback(),
    )
    trainer.fit(model, datamodule=dm)
Example #19
def run_checkpoint_test(tmpdir, save_full_weights):
    seed_everything(42)
    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc",
                         mode="max",
                         save_last=True,
                         save_top_k=-1)
    trainer = Trainer(max_epochs=10,
                      plugins=[
                          DeepSpeedPlugin(stage=3,
                                          save_full_weights=save_full_weights)
                      ],
                      default_root_dir=tmpdir,
                      gpus=2,
                      precision=16,
                      accumulate_grad_batches=2,
                      callbacks=[ck])
    trainer.fit(model, datamodule=dm)

    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7

    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]['test_acc'] > 0.7
    assert saved_results == results

    trainer = Trainer(max_epochs=10,
                      plugins=[
                          DeepSpeedPlugin(stage=3,
                                          save_full_weights=save_full_weights)
                      ],
                      default_root_dir=tmpdir,
                      gpus=2,
                      precision=16,
                      accumulate_grad_batches=2,
                      callbacks=[ck],
                      resume_from_checkpoint=ck.best_model_path)
    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7

    dm.predict_dataloader = dm.test_dataloader
    results = trainer.predict(datamodule=dm)
    assert results[-1] > 0.7
Example #20
def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir):
    """
    Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint,
    and see convergence.
    """
    seed_everything(42)
    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc",
                         mode="max",
                         save_last=True,
                         save_top_k=-1)
    trainer = Trainer(max_epochs=10,
                      plugins=[DeepSpeedPlugin(stage=3)],
                      default_root_dir=tmpdir,
                      gpus=2,
                      precision=16,
                      accumulate_grad_batches=2,
                      callbacks=[ck])
    trainer.fit(model, datamodule=dm)

    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7

    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]['test_acc'] > 0.7
    assert saved_results == results

    trainer = Trainer(max_epochs=10,
                      plugins=[DeepSpeedPlugin(stage=3)],
                      default_root_dir=tmpdir,
                      gpus=2,
                      precision=16,
                      accumulate_grad_batches=2,
                      callbacks=[ck],
                      resume_from_checkpoint=ck.best_model_path)
    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7

    dm.predict_dataloader = dm.test_dataloader
    results = trainer.predict(datamodule=dm)
    assert results[-1] > 0.7
Example #21
def test_callbacks_state_fit_ckpt_path(tmpdir):
    """Test that resuming from a checkpoint restores callbacks that persist state."""
    dm = ClassifDataModule()
    model = ClassificationModel()
    callback_capture = CaptureCallbacksBeforeTraining()

    def get_trainer_args():
        checkpoint = ModelCheckpoint(dirpath=tmpdir,
                                     monitor="val_loss",
                                     save_last=True)
        trainer_args = dict(
            default_root_dir=tmpdir,
            limit_train_batches=1,
            limit_val_batches=2,
            max_epochs=1,
            logger=False,
            callbacks=[checkpoint, callback_capture],
        )
        assert checkpoint.best_model_path == ""
        assert checkpoint.best_model_score is None
        return trainer_args

    # initial training
    trainer = Trainer(**get_trainer_args())
    with pytest.deprecated_call(
            match=
            "`Callback.on_pretrain_routine_end` hook has been deprecated in v1.6"
    ):
        trainer.fit(model, datamodule=dm)

    callbacks_before_resume = deepcopy(trainer.callbacks)

    # resumed training
    trainer = Trainer(**get_trainer_args())
    with pytest.deprecated_call(
            match=
            "`Callback.on_pretrain_routine_end` hook has been deprecated in v1.6"
    ):
        trainer.fit(model, datamodule=dm, ckpt_path=str(tmpdir / "last.ckpt"))

    assert len(callbacks_before_resume) == len(callback_capture.callbacks)

    for before, after in zip(callbacks_before_resume,
                             callback_capture.callbacks):
        if isinstance(before, ModelCheckpoint):
            for attribute in (
                    "best_model_path",
                    "best_model_score",
                    "best_k_models",
                    "kth_best_model_path",
                    "kth_value",
                    "last_model_path",
            ):
                assert getattr(before, attribute) == getattr(after, attribute)
Example #22
def test_fit_csv_logger(tmpdir):
    dm = ClassifDataModule()
    model = ClassificationModel()
    logger = CSVLogger(save_dir=tmpdir)
    trainer = Trainer(default_root_dir=tmpdir,
                      max_steps=10,
                      logger=logger,
                      log_every_n_steps=1)
    trainer.fit(model, datamodule=dm)
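    # the CSVLogger writes metrics to a fixed filename inside its log directory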
    metrics_file = os.path.join(logger.log_dir,
                                ExperimentWriter.NAME_METRICS_FILE)
    assert os.path.isfile(metrics_file)
Example #23
def test_suggestion_parameters_work(tmpdir):
    """Test that default skipping does not alter results in basic case."""

    dm = ClassifDataModule()
    model = ClassificationModel()

    # logger file to get meta
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=3)

    lrfinder = trainer.tuner.lr_find(model, datamodule=dm)
    lr1 = lrfinder.suggestion(skip_begin=10)  # default
    lr2 = lrfinder.suggestion(skip_begin=150)  # way too high, should have an impact

    assert lr1 != lr2, "Skipping parameter did not influence learning rate"
Example #24
def test_early_stopping_no_val_step(tmpdir):
    """Test that early stopping callback falls back to training metrics when no validation defined."""

    model = ClassificationModel()
    dm = ClassifDataModule()
    model.validation_step = None
    model.val_dataloader = None
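    # with validation_step and val_dataloader removed, early stopping falls back to the training metric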

    stopping = EarlyStopping(monitor="train_loss", min_delta=0.1, patience=0, check_on_train_epoch_end=True)
    trainer = Trainer(default_root_dir=tmpdir, callbacks=[stopping], overfit_batches=0.20, max_epochs=10)
    trainer.fit(model, datamodule=dm)

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch < trainer.max_epochs - 1
Example #25
def test_multi_gpu_none_backend(tmpdir):
    """Make sure when using multiple GPUs the user can't use `accelerator = None`."""
    tutils.set_random_main_port()
    trainer_options = dict(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=1,
        limit_train_batches=0.2,
        limit_val_batches=0.2,
        gpus=2,
    )

    dm = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(trainer_options, model, dm)
Example #26
def test_multi_gpu_none_backend(tmpdir):
    """Make sure when using multiple GPUs the user can't use `distributed_backend = None`."""
    tutils.set_random_master_port()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        limit_train_batches=0.2,
        limit_val_batches=0.2,
        gpus=2,
    )

    dm = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(trainer_options, model, dm)
Example #27
def test_datamodule_parameter(tmpdir):
    """Test that the datamodule parameter works"""
    seed_everything(1)

    dm = ClassifDataModule()
    model = ClassificationModel()

    before_lr = model.lr
    # logger file to get meta
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=2)

    lrfinder = trainer.tuner.lr_find(model, datamodule=dm)
    after_lr = lrfinder.suggestion()
    model.lr = after_lr

    assert before_lr != after_lr, "Learning rate was not altered after running learning rate finder"
Example #28
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    tutils.set_random_main_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        callbacks=[EarlyStopping(monitor="train_acc")],
        max_epochs=50,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        strategy="ddp_spawn",
    )

    dm = ClassifDataModule()
    model = ClassificationModel()
    tpipes.run_model_test(trainer_options, model, dm)
Example #29
def test_resume_early_stopping_from_checkpoint(tmpdir):
    """Prevent regressions to bugs:

    https://github.com/PyTorchLightning/pytorch-lightning/issues/1464
    https://github.com/PyTorchLightning/pytorch-lightning/issues/1463
    """
    seed_everything(42)
    model = ClassificationModel()
    dm = ClassifDataModule()
    checkpoint_callback = ModelCheckpoint(dirpath=tmpdir,
                                          monitor="train_loss",
                                          save_top_k=1)
    early_stop_callback = EarlyStoppingTestRestore(None, monitor="train_loss")
    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[early_stop_callback, checkpoint_callback],
        num_sanity_val_steps=0,
        max_epochs=4,
    )
    trainer.fit(model, datamodule=dm)

    assert len(early_stop_callback.saved_states) == 4

    checkpoint_filepath = checkpoint_callback.kth_best_model_path
    # ensure state is persisted properly
    checkpoint = torch.load(checkpoint_filepath)
    # the checkpoint saves "epoch + 1"
    early_stop_callback_state = early_stop_callback.saved_states[checkpoint["epoch"] - 1]
    assert 4 == len(early_stop_callback.saved_states)
    es_name = "EarlyStoppingTestRestore{'monitor': 'train_loss', 'mode': 'min'}"
    assert checkpoint["callbacks"][es_name] == early_stop_callback_state

    # ensure state is reloaded properly (assertion in the callback)
    early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state,
                                                   monitor="train_loss")
    new_trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        resume_from_checkpoint=checkpoint_filepath,
        callbacks=[early_stop_callback],
    )

    with pytest.raises(MisconfigurationException,
                       match=r"You restored a checkpoint with current_epoch"):
        new_trainer.fit(model)
Example #30
def test_full_loop_dp(tmpdir):
    set_random_master_port()

    class CustomClassificationModelDP(ClassificationModel):
        def _step(self, batch, batch_idx):
            x, y = batch
            logits = self(x)
            return {'logits': logits, 'y': y}

        def training_step(self, batch, batch_idx):
            _, y = batch
            out = self._step(batch, batch_idx)
            loss = F.cross_entropy(out['logits'], y)
            return loss

        def validation_step(self, batch, batch_idx):
            return self._step(batch, batch_idx)

        def test_step(self, batch, batch_idx):
            return self._step(batch, batch_idx)

        def test_step_end(self, outputs):
            self.log('test_acc', self.test_acc(outputs['logits'],
                                               outputs['y']))

    dm = ClassifDataModule()
    model = CustomClassificationModelDP()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
        accelerator='dp',
        gpus=2,
        deterministic=True,
    )

    # fit model
    result = trainer.fit(model, datamodule=dm)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
    assert result

    # test
    result = trainer.test(datamodule=dm)
    assert result[0]['test_acc'] > 0.6