Example #1
def run_model_test(
    trainer_options,
    model: LightningModule,
    data: LightningDataModule = None,
    on_gpu: bool = True,
    version=None,
    with_hpc: bool = True,
    min_acc: float = 0.25,
):
    reset_seed()
    save_dir = trainer_options["default_root_dir"]

    # logger file to get meta
    logger = get_default_logger(save_dir, version=version)
    trainer_options.update(logger=logger)
    trainer = Trainer(**trainer_options)
    initial_values = torch.tensor(
        [torch.sum(torch.abs(x)) for x in model.parameters()])
    trainer.fit(model, datamodule=data)
    post_train_values = torch.tensor(
        [torch.sum(torch.abs(x)) for x in model.parameters()])

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    # Check that the model is actually changed post-training
    change_ratio = torch.norm(initial_values - post_train_values)
    assert change_ratio > 0.1, f"the model did not change significantly; change ratio {change_ratio}"

    # test model loading
    pretrained_model = load_model_from_checkpoint(
        logger, trainer.checkpoint_callback.best_model_path, type(model))

    # test new model accuracy
    test_loaders = model.test_dataloader() if not data else data.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    if not isinstance(model, BoringModel):
        for dataloader in test_loaders:
            run_prediction_eval_model_template(model, dataloader, min_acc=min_acc)

    if with_hpc:
        if trainer._distrib_type in (DistributedType.DDP,
                                     DistributedType.DDP_SPAWN,
                                     DistributedType.DDP2):
            # on hpc this would work fine... but need to hack it for the purpose of the test
            trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = trainer.init_optimizers(
                pretrained_model)

        # test HPC saving
        trainer.checkpoint_connector.hpc_save(save_dir, logger)
        # test HPC loading
        checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(
            save_dir)
        trainer.checkpoint_connector.restore(checkpoint_path)
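
Note: the load_model_from_checkpoint helper called above is not part of this listing. A minimal sketch of what it plausibly does, assuming it simply wraps LightningModule.load_from_checkpoint (the same API used directly later in this listing):

def load_model_from_checkpoint(logger, ckpt_path, model_cls):
    # Hypothetical reconstruction, not the actual test utility: restore a
    # model of the given class from the checkpoint written during fit.
    # The logger argument is accepted for signature parity but unused here.
    model = model_cls.load_from_checkpoint(ckpt_path)
    model.eval()
    return model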
Example #2
def test_dataloaders_passed_to_fit(tmpdir):
    """Test if dataloaders passed to trainer works on TPU"""

    tutils.reset_seed()
    model = BoringModel()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, tpu_cores=8)
    trainer.fit(model, train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader())
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
def test_lr_monitor_no_logger(tmpdir):
    tutils.reset_seed()

    model = BoringModel()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, callbacks=[lr_monitor], logger=False)

    with pytest.raises(MisconfigurationException, match="`Trainer` that has no logger"):
        trainer.fit(model)
def run_test_from_config(trainer_options, on_gpu, check_size=True):
    """Trains the default model with the given config."""
    set_random_master_port()
    reset_seed()

    ckpt_path = trainer_options["weights_save_path"]
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])

    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            expected_device = torch.device("cuda", self.trainer.local_rank) if on_gpu else torch.device("cpu")
            assert self.device == expected_device

        def training_epoch_end(self, outputs) -> None:
            res = self.trainer.training_type_plugin.reduce(torch.tensor(1.0, device=self.device), reduce_op="sum")
            assert res.sum() == self.trainer.training_type_plugin.world_size

    model = TestModel()
    trainer = Trainer(**trainer_options)

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    trainer.test(model)

    assert model.device == torch.device("cpu")

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    if trainer.global_rank > 0:
        return

    # test model loading
    pretrained_model = BoringModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        batch = next(iter(dataloader))
        pretrained_model(batch)

    # test HPC saving
    trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger)
    # test HPC loading
    checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(ckpt_path)
    trainer.checkpoint_connector.restore(checkpoint_path)

    if on_gpu:
        trainer = Trainer(gpus=1, accelerator="horovod", max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()
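
Example #24 below drives this function through a standalone script (TEST_SCRIPT) launched via horovodrun. A minimal sketch of the glue such a script presumably contains (hypothetical; the argument names are taken from Example #24's command line):

if __name__ == "__main__":
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument("--trainer-options", required=True)
    parser.add_argument("--on-gpu", action="store_true", default=False)
    args = parser.parse_args()
    # Deserialize the trainer config passed on the command line and run the test.
    run_test_from_config(json.loads(args.trainer_options), args.on_gpu)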
Example #5
def test_dm_checkpoint_save_and_load(tmpdir):
    class CustomBoringModel(BoringModel):
        def validation_step(self, batch, batch_idx):
            out = super().validation_step(batch, batch_idx)
            self.log("early_stop_on", out["x"])
            return out

    class CustomBoringDataModule(BoringDataModule):
        def state_dict(self) -> Dict[str, Any]:
            return {"my": "state_dict"}

        def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
            self.my_state_dict = state_dict

        def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
            checkpoint[self.__class__.__qualname__].update(
                {"on_save": "update"})

        def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
            self.checkpoint_state = checkpoint.get(
                self.__class__.__qualname__).copy()
            checkpoint[self.__class__.__qualname__].pop("on_save")

    reset_seed()
    dm = CustomBoringDataModule()
    model = CustomBoringModel()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=1,
        enable_model_summary=False,
        callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on")],
    )

    # fit model
    with pytest.deprecated_call(
            match="`LightningDataModule.on_save_checkpoint` was deprecated in"
            " v1.6 and will be removed in v1.8. Use `state_dict` instead."):
        trainer.fit(model, datamodule=dm)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    checkpoint_path = list(trainer.checkpoint_callback.best_k_models.keys())[0]
    checkpoint = torch.load(checkpoint_path)
    assert dm.__class__.__qualname__ in checkpoint
    assert checkpoint[dm.__class__.__qualname__] == {
        "my": "state_dict",
        "on_save": "update"
    }

    for trainer_fn in TrainerFn:
        trainer.state.fn = trainer_fn
        trainer._restore_modules_and_callbacks(checkpoint_path)
        assert dm.checkpoint_state == {"my": "state_dict", "on_save": "update"}
        assert dm.my_state_dict == {"my": "state_dict"}
Example #6
def test_model_checkpoint_path(tmpdir, logger_version: Union[None, int, str], expected: str):
    """Test that "version_" prefix is only added when logger's version is an integer."""
    tutils.reset_seed()
    model = LogInTwoMethods()
    logger = TensorBoardLogger(str(tmpdir), version=logger_version)

    trainer = Trainer(default_root_dir=tmpdir, overfit_batches=0.2, max_epochs=2, logger=logger)
    trainer.fit(model)

    ckpt_version = Path(trainer.checkpoint_callback.dirpath).parent.name
    assert ckpt_version == expected
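
The logger_version and expected arguments imply a pytest.mark.parametrize decorator that this listing dropped. A plausible reconstruction of the decorator that would have sat directly above the def (the exact cases are an assumption, not taken from the listing):

# Hypothetical parametrization: integer versions get a "version_" prefix,
# string versions are used verbatim. Assumes pytest is imported.
@pytest.mark.parametrize(
    "logger_version,expected",
    [(None, "version_0"), (1, "version_1"), ("awesome", "awesome")],
)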
Example #7
def test_load_model_from_checkpoint(tmpdir, model_template):
    """Verify test() on pretrained model."""
    tutils.reset_seed()
    model = model_template()

    trainer_options = dict(
        progress_bar_refresh_rate=0,
        max_epochs=2,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        callbacks=[
            ModelCheckpoint(dirpath=tmpdir, monitor='val_loss', save_top_k=-1)
        ],
        default_root_dir=tmpdir,
    )

    # fit model
    trainer = Trainer(**trainer_options)
    trainer.fit(model)
    trainer.test(ckpt_path=None)

    # correct result and ok accuracy
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # load last checkpoint
    ckpt_dir = trainer.checkpoint_callback.dirpath
    last_checkpoint = sorted(glob.glob(os.path.join(ckpt_dir, "*.ckpt")))[-1]

    # Since `BoringModel` has `_save_hparams = True` by default, check that ckpt has hparams
    ckpt = torch.load(last_checkpoint)
    assert model_template.CHECKPOINT_HYPER_PARAMS_KEY in ckpt, 'hyper_parameters missing from checkpoints'

    # Ensure that model can be correctly restored from checkpoint
    pretrained_model = model_template.load_from_checkpoint(last_checkpoint)

    # test that hparams loaded correctly
    for k, v in model.hparams.items():
        assert getattr(pretrained_model.hparams, k) == v

    # assert weights are the same
    for (old_name, old_p), (new_name, new_p) in zip(model.named_parameters(), pretrained_model.named_parameters()):
        assert torch.all(torch.eq(old_p, new_p)), 'loaded weights are not the same as the saved weights'

    # Check `test` on pretrained model:
    new_trainer = Trainer(**trainer_options)
    new_trainer.test(pretrained_model)
def test_test_loop_only(tmpdir):
    reset_seed()

    dm = BoringDataModule()
    model = BoringModel()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.test(model, datamodule=dm)
Example #9
def test_amp_gpus(tmpdir, strategy, precision, gpus):
    """Make sure combinations of AMP and training types work if supported."""
    tutils.reset_seed()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, gpus=gpus, strategy=strategy, precision=precision)

    model = AMPTestModel()
    trainer.fit(model)
    trainer.test(model)
    trainer.predict(model, DataLoader(RandomDataset(32, 64)))

    assert trainer.state.finished, f"Training failed with {trainer.state}"
Example #10
def test_dataloaders_passed_to_fit(tmpdir):
    """Test if dataloaders passed to trainer works on TPU."""
    tutils.reset_seed()
    model = BoringModel()

    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=1,
                      accelerator="tpu",
                      devices=8)
    trainer.fit(model,
                train_dataloaders=model.train_dataloader(),
                val_dataloaders=model.val_dataloader())
    assert trainer.state.finished, f"Training failed with {trainer.state}"
Example #11
def test_result_reduce_horovod(tmpdir):
    """Make sure result logging works with Horovod.

    This test mirrors tests/core/test_results.py::_ddp_test_fn
    """
    tutils.reset_seed()
    tutils.set_random_main_port()

    def hvd_test_fn():
        path_here = os.path.abspath(os.path.dirname(__file__))
        path_root = os.path.abspath(os.path.join(path_here, "..", ".."))
        sys.path.insert(0, os.path.abspath(path_root))

        class TestModel(BoringModel):
            def training_step(self, batch, batch_idx):
                self.training_step_called = True

                tensor = torch.tensor([1.0])
                self.log("test_tensor",
                         tensor,
                         sync_dist=True,
                         reduce_fx="sum",
                         on_step=True,
                         on_epoch=True)

                res = self._results

                # Check that `tensor` is summed across all ranks automatically
                assert (
                    res["test_tensor"].item() == hvd.size()
                ), "Result-Log does not work properly with Horovod and Tensors"

            def training_epoch_end(self, outputs) -> None:
                assert len(outputs) == 0

        model = TestModel()
        model.val_dataloader = None

        trainer = Trainer(
            default_root_dir=tmpdir,
            limit_train_batches=2,
            limit_val_batches=2,
            max_epochs=1,
            log_every_n_steps=1,
            enable_model_summary=False,
            logger=False,
        )

        trainer.fit(model)

    horovod.run(hvd_test_fn, np=2)
Example #12
def test_tpu_clip_grad_by_value(tmpdir):
    """Test if clip_gradients by value works on TPU"""
    tutils.reset_seed()
    trainer_options = dict(default_root_dir=tmpdir,
                           progress_bar_refresh_rate=0,
                           max_epochs=4,
                           tpu_cores=1,
                           limit_train_batches=10,
                           limit_val_batches=10,
                           gradient_clip_val=0.5,
                           gradient_clip_algorithm='value')

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
Example #13
def test_model_tpu_cores_1(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=2,
        tpu_cores=1,
        limit_train_batches=4,
        limit_val_batches=4,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
def test_auto_scale_batch_size_trainer_arg(tmpdir, scale_arg):
    """Test possible values for 'batch size auto scaling' Trainer argument."""
    tutils.reset_seed()
    before_batch_size = 2
    model = BatchSizeModel(batch_size=before_batch_size)
    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=1,
                      auto_scale_batch_size=scale_arg,
                      gpus=1)
    trainer.tune(model)
    after_batch_size = model.batch_size
    assert before_batch_size != after_batch_size, "Batch size was not altered after running auto scaling of batch size"

    assert not os.path.exists(tmpdir / "scale_batch_size_temp_model.ckpt")
def test_lr_monitor_multi_lrs(tmpdir, logging_interval: str):
    """Test that learning rates are extracted and logged for multi lr schedulers."""
    tutils.reset_seed()

    class CustomBoringModel(BoringModel):
        def training_step(self, batch, batch_idx, optimizer_idx):
            return super().training_step(batch, batch_idx)

        def configure_optimizers(self):
            optimizer1 = optim.Adam(self.parameters(), lr=1e-2)
            optimizer2 = optim.Adam(self.parameters(), lr=1e-2)
            lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1)
            lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1)

            return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2]

    model = CustomBoringModel()
    model.training_epoch_end = None

    lr_monitor = LearningRateMonitor(logging_interval=logging_interval)
    log_every_n_steps = 2

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        log_every_n_steps=log_every_n_steps,
        limit_train_batches=7,
        limit_val_batches=0.1,
        callbacks=[lr_monitor],
    )
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert lr_monitor.lrs, "No learning rates logged"
    assert len(lr_monitor.lrs) == len(
        trainer.lr_schedulers
    ), "Number of learning rates logged does not match number of lr schedulers"
    assert lr_monitor.lr_sch_names == [
        "lr-Adam", "lr-Adam-1"
    ], "Names of learning rates not set correctly"

    if logging_interval == "step":
        expected_number_logged = trainer.global_step // log_every_n_steps
    if logging_interval == "epoch":
        expected_number_logged = trainer.max_epochs

    assert all(
        len(lr) == expected_number_logged for lr in lr_monitor.lrs.values()
    ), "Number of logged learning-rate values does not match the expected count"
Example #16
def test_amp_multi_gpu_dp(tmpdir):
    """Make sure DP/DDP + AMP work."""
    tutils.reset_seed()

    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=1,
                      gpus=2,
                      accelerator="dp",
                      precision=16)

    model = AMPTestModel()
    # tutils.run_model_test(trainer_options, model)
    trainer.fit(model)

    assert trainer.state.finished, f"Training failed with {trainer.state}"
Example #17
def test_amp_single_gpu_ddp_spawn(tmpdir):
    """Make sure DP/DDP + AMP work."""
    tutils.reset_seed()
    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=1,
                      gpus=1,
                      accelerator="ddp_spawn",
                      precision=16)

    model = AMPTestModel()
    # tutils.run_model_test(trainer_options, model)
    trainer.fit(model)
    trainer.test(model)
    trainer.predict(model, DataLoader(RandomDataset(32, 64)))
    assert trainer.state.finished, f"Training failed with {trainer.state}"
Example #18
def test_model_tpu_cores_8(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        tpu_cores=8,
        limit_train_batches=0.4,
        limit_val_batches=0.4,
    )

    # 8 cores needs a big dataset
    model = SerialLoaderBoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False, min_acc=0.05)
Example #19
def test_amp_cpus(tmpdir, accelerator, precision, num_processes):
    """Make sure combinations of AMP and training types work if supported."""
    tutils.reset_seed()

    trainer = Trainer(
        default_root_dir=tmpdir, num_processes=num_processes, max_epochs=1, accelerator=accelerator, precision=precision
    )

    model = AMPTestModel()
    # tutils.run_model_test(trainer_options, model)
    trainer.fit(model)
    trainer.test(model)
    trainer.predict(model, DataLoader(RandomDataset(32, 64)))

    assert trainer.state.finished, f"Training failed with {trainer.state}"
Example #20
def test_model_checkpoint_to_yaml(tmpdir, save_top_k):
    """ Test that None in checkpoint callback is valid and that chkp_path is set correctly """
    tutils.reset_seed()
    model = LogInTwoMethods()

    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on', save_top_k=save_top_k)

    trainer = Trainer(default_root_dir=tmpdir, callbacks=[checkpoint], overfit_batches=0.20, max_epochs=2)
    trainer.fit(model)

    path_yaml = os.path.join(tmpdir, 'best_k_models.yaml')
    checkpoint.to_yaml(path_yaml)
    with open(path_yaml) as fp:
        d = yaml.full_load(fp)
    best_k = dict(checkpoint.best_k_models)
    assert d == best_k
Example #21
def test_model_16bit_tpu_cores_1(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        precision=16,
        enable_progress_bar=False,
        max_epochs=2,
        tpu_cores=1,
        limit_train_batches=8,
        limit_val_batches=2,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False)
Example #22
def test_amp_single_gpu_ddp_spawn(tmpdir):
    """Make sure DP/DDP + AMP work."""
    tutils.reset_seed()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=1,
        accelerator='ddp_spawn',
        precision=16,
    )

    model = AMPTestModel()
    # tutils.run_model_test(trainer_options, model)
    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
Example #23
def test_model_tpu_devices_1(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=2,
        accelerator="tpu",
        devices=1,
        limit_train_batches=4,
        limit_val_batches=4,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
Example #24
def _run_horovod(trainer_options, on_gpu=False):
    """Execute the training script across multiple workers in parallel."""
    num_processes = trainer_options.get('gpus', 2)
    # for Horovod, we interpret `gpus` to be set per worker
    trainer_options.update(gpus=1 if on_gpu else None)
    tutils.reset_seed()
    cmdline = [
        'horovodrun', '-np',
        str(num_processes), sys.executable, TEST_SCRIPT, '--trainer-options',
        shlex.quote(json.dumps(trainer_options))
    ]
    if on_gpu:
        cmdline += ['--on-gpu']
    exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy())
    assert exit_code == 0
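
An illustrative caller for _run_horovod (a hypothetical test, not from this listing; it supplies the weights_save_path key that run_test_from_config reads):

def test_horovod_cpu_example(tmpdir):
    # Hypothetical usage sketch: train for one short epoch across the
    # default of 2 Horovod processes on CPU.
    trainer_options = dict(
        default_root_dir=str(tmpdir),
        weights_save_path=str(tmpdir),
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
    )
    _run_horovod(trainer_options)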
Example #25
def test_tpu_grad_norm(tmpdir):
    """Test if grad_norm works on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        tpu_cores=1,
        limit_train_batches=0.4,
        limit_val_batches=0.4,
        gradient_clip_val=0.1,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
Example #26
def test_model_tpu_index(tmpdir, tpu_core):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=2,
        tpu_cores=[tpu_core],
        limit_train_batches=4,
        limit_val_batches=4,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
    assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}'
Example #27
def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k: int):
    """Test that dirpath=None in checkpoint callback is valid and that ckpt_path is set correctly."""
    tutils.reset_seed()
    model = LogInTwoMethods()

    checkpoint = ModelCheckpoint(monitor="early_stop_on", dirpath=None, filename="{epoch}", save_top_k=save_top_k)
    max_epochs = 2
    trainer = Trainer(default_root_dir=tmpdir, callbacks=[checkpoint], overfit_batches=0.20, max_epochs=max_epochs)
    trainer.fit(model)
    assert checkpoint.dirpath == tmpdir / trainer.logger.name / "version_0" / "checkpoints"

    if save_top_k == -1:
        ckpt_files = os.listdir(checkpoint.dirpath)
        expected_ckpt_files = [f"epoch={i}.ckpt" for i in range(max_epochs)]
        assert len(ckpt_files) == len(expected_ckpt_files) == max_epochs
        assert set(ckpt_files) == set(expected_ckpt_files)
Example #28
def test_tpu_grad_norm(tmpdir):
    """Test if grad_norm works on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=4,
        accelerator="tpu",
        devices=1,
        limit_train_batches=0.4,
        limit_val_batches=0.4,
        gradient_clip_val=0.5,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
Example #29
def test_model_16bit_tpu_cores_1(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        precision=16,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        tpu_cores=1,
        limit_train_batches=4,
        limit_val_batches=4,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False)
    assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"
Example #30
def run_model_test(
    trainer_options,
    model: LightningModule,
    data: LightningDataModule = None,
    on_gpu: bool = True,
    version=None,
    with_hpc: bool = True,
    min_acc: float = 0.25,
):
    reset_seed()
    save_dir = trainer_options["default_root_dir"]

    # logger file to get meta
    logger = get_default_logger(save_dir, version=version)
    trainer_options.update(logger=logger)
    trainer = Trainer(**trainer_options)
    initial_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])
    trainer.fit(model, datamodule=data)
    post_train_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    # Check that the model is actually changed post-training
    change_ratio = torch.norm(initial_values - post_train_values)
    assert change_ratio > 0.03, f"the model did not change significantly; change ratio {change_ratio}"

    # test model loading
    _ = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path, type(model))

    # test new model accuracy
    test_loaders = model.test_dataloader() if not data else data.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    if not isinstance(model, BoringModel):
        for dataloader in test_loaders:
            run_model_prediction(model, dataloader, min_acc=min_acc)

    if with_hpc:
        # test HPC saving
        # save logger to make sure we get all the metrics
        if logger:
            logger.finalize("finished")
        hpc_save_path = trainer.checkpoint_connector.hpc_save_path(save_dir)
        trainer.save_checkpoint(hpc_save_path)
        # test HPC loading
        checkpoint_path = trainer.checkpoint_connector._CheckpointConnector__get_max_ckpt_path_from_folder(save_dir)
        trainer.checkpoint_connector.restore(checkpoint_path)
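
For reference, an illustrative call of this newer run_model_test in the style of the TPU examples above (assumed values only, not taken from the listing):

# Illustrative usage sketch mirroring the tpipes.run_model_test calls above.
trainer_options = dict(default_root_dir=tmpdir, max_epochs=1, limit_train_batches=2, limit_val_batches=2)
run_model_test(trainer_options, BoringModel(), on_gpu=False, with_hpc=False)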