Example #1
0
def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server,
                                       url_ckpt):
    """Tests use case where trainer saves the model, and user loads it from tags independently."""
    # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
    monkeypatch.setenv('TORCH_HOME', tmpdir)

    model = BoringModel()
    # Extra layer
    model.c_d3 = torch.nn.Linear(32, 32)

    # logger file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # fit model
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        logger=logger,
        callbacks=[ModelCheckpoint(dirpath=tmpdir)],
    )
    trainer.fit(model)

    # traning complete
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # save model
    new_weights_path = os.path.join(tmpdir, 'save_test.ckpt')
    trainer.save_checkpoint(new_weights_path)

    # load new model
    hparams_path = os.path.join(tutils.get_data_path(logger, path_dir=tmpdir),
                                'hparams.yaml')
    hparams_url = f'http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}'
    ckpt_path = hparams_url if url_ckpt else new_weights_path

    BoringModel.load_from_checkpoint(
        checkpoint_path=ckpt_path,
        hparams_file=hparams_path,
        strict=False,
    )

    with pytest.raises(
            RuntimeError,
            match=
            r'Unexpected key\(s\) in state_dict: "c_d3.weight", "c_d3.bias"'):
        BoringModel.load_from_checkpoint(
            checkpoint_path=ckpt_path,
            hparams_file=hparams_path,
            strict=True,
        )
def run_test_from_config(trainer_options, on_gpu, check_size=True):
    """Trains the default model with the given config."""
    set_random_master_port()
    reset_seed()

    ckpt_path = trainer_options["weights_save_path"]
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])

    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            expected_device = torch.device("cuda", self.trainer.local_rank) if on_gpu else torch.device("cpu")
            assert self.device == expected_device

        def training_epoch_end(self, outputs) -> None:
            res = self.trainer.training_type_plugin.reduce(torch.tensor(1.0, device=self.device), reduce_op="sum")
            assert res.sum() == self.trainer.training_type_plugin.world_size

    model = TestModel()
    trainer = Trainer(**trainer_options)

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    trainer.test(model)

    assert model.device == torch.device("cpu")

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    if trainer.global_rank > 0:
        return

    # test model loading
    pretrained_model = BoringModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        batch = next(iter(dataloader))
        pretrained_model(batch)

    # test HPC saving
    trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger)
    # test HPC loading
    checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(ckpt_path)
    trainer.checkpoint_connector.restore(checkpoint_path)

    if on_gpu:
        trainer = Trainer(gpus=1, accelerator="horovod", max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()
Example #3
0
def test_model_saving_loading(tmpdir):
    """Tests use case where trainer saves the model, and user loads it from tags independently."""
    model = BoringModel()

    # logger file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # fit model
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        logger=logger,
        callbacks=[ModelCheckpoint(dirpath=tmpdir)],
        default_root_dir=tmpdir,
    )
    trainer.fit(model)

    # traning complete
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # make a prediction
    dataloaders = model.test_dataloader()
    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    batch = next(iter(dataloaders[0]))

    # generate preds before saving model
    model.eval()
    pred_before_saving = model(batch)

    # save model
    new_weights_path = os.path.join(tmpdir, 'save_test.ckpt')
    trainer.save_checkpoint(new_weights_path)

    # load new model
    hparams_path = tutils.get_data_path(logger, path_dir=tmpdir)
    hparams_path = os.path.join(hparams_path, 'hparams.yaml')
    model_2 = BoringModel.load_from_checkpoint(
        checkpoint_path=new_weights_path,
        hparams_file=hparams_path,
    )
    model_2.eval()

    # make prediction
    # assert that both predictions are the same
    new_pred = model_2(batch)
    assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1
Example #4
0
def run_test_from_config(trainer_options, on_gpu, check_size=True):
    """Trains the default model with the given config."""
    set_random_master_port()
    reset_seed()

    ckpt_path = trainer_options['weights_save_path']
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])

    model = BoringModel()

    trainer = Trainer(**trainer_options)
    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    if trainer.global_rank > 0:
        return

    # test model loading
    pretrained_model = BoringModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        batch = next(iter(dataloader))
        pretrained_model(batch)

    # test HPC saving
    trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger)
    # test HPC loading
    checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(
        ckpt_path)
    trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu)

    if on_gpu:
        trainer = Trainer(gpus=1, accelerator='horovod', max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()