def run_model_test(
    trainer_options,
    model: LightningModule,
    data: LightningDataModule = None,
    on_gpu: bool = True,
    version=None,
    with_hpc: bool = True,
    min_acc: float = 0.25,
):
    reset_seed()
    save_dir = trainer_options["default_root_dir"]

    # logger file to get meta
    logger = get_default_logger(save_dir, version=version)
    trainer_options.update(logger=logger)
    trainer = Trainer(**trainer_options)
    initial_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])
    trainer.fit(model, datamodule=data)
    post_train_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])

    assert trainer.state.finished, f"Training failed with {trainer.state}"

    # Check that the model has actually changed post-training
    change_ratio = torch.norm(initial_values - post_train_values)
    assert change_ratio > 0.1, f"the model parameters changed by only {change_ratio}"

    # test model loading
    pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path, type(model))

    # test new model accuracy
    test_loaders = model.test_dataloader() if not data else data.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    if not isinstance(model, BoringModel):
        for dataloader in test_loaders:
            run_prediction_eval_model_template(model, dataloader, min_acc=min_acc)

    if with_hpc:
        if trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2):
            # on hpc this would work fine... but need to hack it for the purpose of the test
            trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = trainer.init_optimizers(
                pretrained_model
            )

        # test HPC saving
        trainer.checkpoint_connector.hpc_save(save_dir, logger)
        # test HPC loading
        checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(save_dir)
        trainer.checkpoint_connector.restore(checkpoint_path)

def test_dataloaders_passed_to_fit(tmpdir):
    """Test that dataloaders passed to the trainer work on TPU."""
    tutils.reset_seed()
    model = BoringModel()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, tpu_cores=8)
    trainer.fit(model, train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader())
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

def test_lr_monitor_no_logger(tmpdir):
    tutils.reset_seed()

    model = BoringModel()
    lr_monitor = LearningRateMonitor()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, callbacks=[lr_monitor], logger=False)

    with pytest.raises(MisconfigurationException, match="`Trainer` that has no logger"):
        trainer.fit(model)

def run_test_from_config(trainer_options, on_gpu, check_size=True):
    """Trains the default model with the given config."""
    set_random_master_port()
    reset_seed()

    ckpt_path = trainer_options["weights_save_path"]
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])

    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            expected_device = torch.device("cuda", self.trainer.local_rank) if on_gpu else torch.device("cpu")
            assert self.device == expected_device

        def training_epoch_end(self, outputs) -> None:
            res = self.trainer.training_type_plugin.reduce(torch.tensor(1.0, device=self.device), reduce_op="sum")
            assert res.sum() == self.trainer.training_type_plugin.world_size

    model = TestModel()
    trainer = Trainer(**trainer_options)
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    trainer.test(model)

    assert model.device == torch.device("cpu")

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    if trainer.global_rank > 0:
        return

    # test model loading
    pretrained_model = BoringModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        batch = next(iter(dataloader))
        pretrained_model(batch)

    # test HPC saving
    trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger)
    # test HPC loading
    checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(ckpt_path)
    trainer.checkpoint_connector.restore(checkpoint_path)

    if on_gpu:
        trainer = Trainer(gpus=1, accelerator="horovod", max_epochs=1)
        # Test the root_gpu property
        assert trainer.root_gpu == hvd.local_rank()

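# A minimal sketch of the TEST_SCRIPT entry point that `_run_horovod` (further below) launches
# via `horovodrun`. This harness is an assumption for illustration, not the verbatim script:
# it parses the JSON-encoded `--trainer-options` and the `--on-gpu` flag, then delegates to
# `run_test_from_config` above.
if __name__ == "__main__":
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument("--trainer-options", required=True, help="JSON-encoded dict of Trainer kwargs")
    parser.add_argument("--on-gpu", action="store_true", default=False)
    args = parser.parse_args()

    run_test_from_config(json.loads(args.trainer_options), on_gpu=args.on_gpu)
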
def test_dm_checkpoint_save_and_load(tmpdir):
    class CustomBoringModel(BoringModel):
        def validation_step(self, batch, batch_idx):
            out = super().validation_step(batch, batch_idx)
            self.log("early_stop_on", out["x"])
            return out

    class CustomBoringDataModule(BoringDataModule):
        def state_dict(self) -> Dict[str, Any]:
            return {"my": "state_dict"}

        def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
            self.my_state_dict = state_dict

        def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
            checkpoint[self.__class__.__qualname__].update({"on_save": "update"})

        def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
            self.checkpoint_state = checkpoint.get(self.__class__.__qualname__).copy()
            checkpoint[self.__class__.__qualname__].pop("on_save")

    reset_seed()
    dm = CustomBoringDataModule()
    model = CustomBoringModel()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=1,
        enable_model_summary=False,
        callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on")],
    )

    # fit model
    with pytest.deprecated_call(
        match="`LightningDataModule.on_save_checkpoint` was deprecated in"
        " v1.6 and will be removed in v1.8. Use `state_dict` instead."
    ):
        trainer.fit(model, datamodule=dm)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    checkpoint_path = list(trainer.checkpoint_callback.best_k_models.keys())[0]
    checkpoint = torch.load(checkpoint_path)
    assert dm.__class__.__qualname__ in checkpoint
    assert checkpoint[dm.__class__.__qualname__] == {"my": "state_dict", "on_save": "update"}

    for trainer_fn in TrainerFn:
        trainer.state.fn = trainer_fn
        trainer._restore_modules_and_callbacks(checkpoint_path)
        assert dm.checkpoint_state == {"my": "state_dict", "on_save": "update"}
        assert dm.my_state_dict == {"my": "state_dict"}

def test_model_checkpoint_path(tmpdir, logger_version: Union[None, int, str], expected: str):
    """Test that "version_" prefix is only added when logger's version is an integer."""
    tutils.reset_seed()
    model = LogInTwoMethods()
    logger = TensorBoardLogger(str(tmpdir), version=logger_version)

    trainer = Trainer(default_root_dir=tmpdir, overfit_batches=0.2, max_epochs=2, logger=logger)
    trainer.fit(model)

    ckpt_version = Path(trainer.checkpoint_callback.dirpath).parent.name
    assert ckpt_version == expected

def test_load_model_from_checkpoint(tmpdir, model_template):
    """Verify test() on pretrained model."""
    tutils.reset_seed()
    model = model_template()

    trainer_options = dict(
        progress_bar_refresh_rate=0,
        max_epochs=2,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='val_loss', save_top_k=-1)],
        default_root_dir=tmpdir,
    )

    # fit model
    trainer = Trainer(**trainer_options)
    trainer.fit(model)
    trainer.test(ckpt_path=None)

    # correct result and ok accuracy
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # load last checkpoint
    last_checkpoint = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, "*.ckpt")))[-1]

    # Since `BoringModel` has `_save_hparams = True` by default, check that ckpt has hparams
    ckpt = torch.load(last_checkpoint)
    assert model_template.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), 'hyper_parameters missing from checkpoints'

    # Ensure that model can be correctly restored from checkpoint
    pretrained_model = model_template.load_from_checkpoint(last_checkpoint)

    # test that hparams loaded correctly
    for k, v in model.hparams.items():
        assert getattr(pretrained_model.hparams, k) == v

    # assert weights are the same
    for (old_name, old_p), (new_name, new_p) in zip(model.named_parameters(), pretrained_model.named_parameters()):
        assert torch.all(torch.eq(old_p, new_p)), 'loaded weights are not the same as the saved weights'

    # Check `test` on pretrained model:
    new_trainer = Trainer(**trainer_options)
    new_trainer.test(pretrained_model)

def test_test_loop_only(tmpdir):
    reset_seed()

    dm = BoringDataModule()
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.test(model, datamodule=dm)

def test_amp_gpus(tmpdir, strategy, precision, gpus):
    """Make sure combinations of AMP and training types work if supported."""
    tutils.reset_seed()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, gpus=gpus, strategy=strategy, precision=precision)

    model = AMPTestModel()
    trainer.fit(model)
    trainer.test(model)
    trainer.predict(model, DataLoader(RandomDataset(32, 64)))

    assert trainer.state.finished, f"Training failed with {trainer.state}"

def test_dataloaders_passed_to_fit(tmpdir):
    """Test that dataloaders passed to the trainer work on TPU."""
    tutils.reset_seed()
    model = BoringModel()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="tpu", devices=8)
    trainer.fit(model, train_dataloaders=model.train_dataloader(), val_dataloaders=model.val_dataloader())
    assert trainer.state.finished, f"Training failed with {trainer.state}"

def test_result_reduce_horovod(tmpdir):
    """Make sure result logging works with Horovod.

    This test mirrors tests/core/test_results.py::_ddp_test_fn
    """
    tutils.reset_seed()
    tutils.set_random_main_port()

    def hvd_test_fn():
        path_here = os.path.abspath(os.path.dirname(__file__))
        path_root = os.path.abspath(os.path.join(path_here, "..", ".."))
        sys.path.insert(0, os.path.abspath(path_root))

        class TestModel(BoringModel):
            def training_step(self, batch, batch_idx):
                self.training_step_called = True

                tensor = torch.tensor([1.0])
                self.log("test_tensor", tensor, sync_dist=True, reduce_fx="sum", on_step=True, on_epoch=True)

                res = self._results

                # Check that `tensor` is summed across all ranks automatically
                assert (
                    res["test_tensor"].item() == hvd.size()
                ), "Result-Log does not work properly with Horovod and Tensors"

            def training_epoch_end(self, outputs) -> None:
                assert len(outputs) == 0

        model = TestModel()
        model.val_dataloader = None

        trainer = Trainer(
            default_root_dir=tmpdir,
            limit_train_batches=2,
            limit_val_batches=2,
            max_epochs=1,
            log_every_n_steps=1,
            enable_model_summary=False,
            logger=False,
        )

        trainer.fit(model)

    horovod.run(hvd_test_fn, np=2)

def test_tpu_clip_grad_by_value(tmpdir):
    """Test if clip_gradients by value works on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=4,
        tpu_cores=1,
        limit_train_batches=10,
        limit_val_batches=10,
        gradient_clip_val=0.5,
        gradient_clip_algorithm='value',
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)

def test_model_tpu_cores_1(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=2,
        tpu_cores=1,
        limit_train_batches=4,
        limit_val_batches=4,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)

def test_auto_scale_batch_size_trainer_arg(tmpdir, scale_arg):
    """Test possible values for the `auto_scale_batch_size` Trainer argument."""
    tutils.reset_seed()
    before_batch_size = 2
    model = BatchSizeModel(batch_size=before_batch_size)
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=scale_arg, gpus=1)
    trainer.tune(model)
    after_batch_size = model.batch_size
    assert before_batch_size != after_batch_size, "Batch size was not altered after running auto scaling of batch size"

    assert not os.path.exists(tmpdir / "scale_batch_size_temp_model.ckpt")

def test_lr_monitor_multi_lrs(tmpdir, logging_interval: str):
    """Test that learning rates are extracted and logged for multiple lr schedulers."""
    tutils.reset_seed()

    class CustomBoringModel(BoringModel):
        def training_step(self, batch, batch_idx, optimizer_idx):
            return super().training_step(batch, batch_idx)

        def configure_optimizers(self):
            optimizer1 = optim.Adam(self.parameters(), lr=1e-2)
            optimizer2 = optim.Adam(self.parameters(), lr=1e-2)
            lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1)
            lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1)
            return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2]

    model = CustomBoringModel()
    model.training_epoch_end = None

    lr_monitor = LearningRateMonitor(logging_interval=logging_interval)
    log_every_n_steps = 2

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        log_every_n_steps=log_every_n_steps,
        limit_train_batches=7,
        limit_val_batches=0.1,
        callbacks=[lr_monitor],
    )
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert lr_monitor.lrs, "No learning rates logged"
    assert len(lr_monitor.lrs) == len(
        trainer.lr_schedulers
    ), "Number of learning rates logged does not match number of lr schedulers"
    assert lr_monitor.lr_sch_names == ["lr-Adam", "lr-Adam-1"], "Names of learning rates not set correctly"

    if logging_interval == "step":
        expected_number_logged = trainer.global_step // log_every_n_steps
    if logging_interval == "epoch":
        expected_number_logged = trainer.max_epochs
    assert all(
        len(lr) == expected_number_logged for lr in lr_monitor.lrs.values()
    ), "Length of logged learning rates does not match the expected number"

def test_amp_multi_gpu_dp(tmpdir):
    """Make sure DP + AMP work on multiple GPUs."""
    tutils.reset_seed()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, gpus=2, accelerator="dp", precision=16)

    model = AMPTestModel()
    # tutils.run_model_test(trainer_options, model)
    trainer.fit(model)

    assert trainer.state.finished, f"Training failed with {trainer.state}"

def test_amp_single_gpu_ddp_spawn(tmpdir):
    """Make sure DDP spawn + AMP work on a single GPU."""
    tutils.reset_seed()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, gpus=1, accelerator="ddp_spawn", precision=16)

    model = AMPTestModel()
    # tutils.run_model_test(trainer_options, model)
    trainer.fit(model)
    trainer.test(model)
    trainer.predict(model, DataLoader(RandomDataset(32, 64)))

    assert trainer.state.finished, f"Training failed with {trainer.state}"

def test_model_tpu_cores_8(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        tpu_cores=8,
        limit_train_batches=0.4,
        limit_val_batches=0.4,
    )

    # 8 cores needs a big dataset
    model = SerialLoaderBoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False, min_acc=0.05)

def test_amp_cpus(tmpdir, accelerator, precision, num_processes):
    """Make sure combinations of AMP and training types work if supported."""
    tutils.reset_seed()

    trainer = Trainer(
        default_root_dir=tmpdir,
        num_processes=num_processes,
        max_epochs=1,
        accelerator=accelerator,
        precision=precision,
    )

    model = AMPTestModel()
    # tutils.run_model_test(trainer_options, model)
    trainer.fit(model)
    trainer.test(model)
    trainer.predict(model, DataLoader(RandomDataset(32, 64)))

    assert trainer.state.finished, f"Training failed with {trainer.state}"

def test_model_checkpoint_to_yaml(tmpdir, save_top_k):
    """Test that None in checkpoint callback is valid and that ckpt_path is set correctly."""
    tutils.reset_seed()
    model = LogInTwoMethods()

    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on', save_top_k=save_top_k)

    trainer = Trainer(default_root_dir=tmpdir, callbacks=[checkpoint], overfit_batches=0.20, max_epochs=2)
    trainer.fit(model)

    path_yaml = os.path.join(tmpdir, 'best_k_models.yaml')
    checkpoint.to_yaml(path_yaml)
    with open(path_yaml, 'r') as fo:
        d = yaml.full_load(fo)
    best_k = dict(checkpoint.best_k_models)
    assert d == best_k

def test_model_16bit_tpu_cores_1(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        precision=16,
        enable_progress_bar=False,
        max_epochs=2,
        tpu_cores=1,
        limit_train_batches=8,
        limit_val_batches=2,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False)

def test_amp_single_gpu_ddp_spawn(tmpdir):
    """Make sure DDP spawn + AMP work on a single GPU."""
    tutils.reset_seed()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=1,
        accelerator='ddp_spawn',
        precision=16,
    )

    model = AMPTestModel()
    # tutils.run_model_test(trainer_options, model)
    trainer.fit(model)

    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

def test_model_tpu_devices_1(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=2,
        accelerator="tpu",
        devices=1,
        limit_train_batches=4,
        limit_val_batches=4,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)

def _run_horovod(trainer_options, on_gpu=False):
    """Execute the training script across multiple workers in parallel."""
    num_processes = trainer_options.get('gpus', 2)
    # for Horovod, we interpret `gpus` to be set per worker
    trainer_options.update(gpus=1 if on_gpu else None)
    tutils.reset_seed()
    cmdline = [
        'horovodrun', '-np', str(num_processes),
        sys.executable, TEST_SCRIPT,
        '--trainer-options', shlex.quote(json.dumps(trainer_options)),
    ]
    if on_gpu:
        cmdline += ['--on-gpu']
    exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy())
    assert exit_code == 0

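# A minimal usage sketch for `_run_horovod` (hypothetical option values, following the pattern
# of the other tests in this file; the exact accelerator flag depends on the Lightning version):
def test_horovod_cpu_sketch(tmpdir):
    trainer_options = dict(
        default_root_dir=str(tmpdir),
        weights_save_path=str(tmpdir),  # consumed by `run_test_from_config` in each worker
        enable_progress_bar=False,
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
        accelerator="horovod",
    )
    _run_horovod(trainer_options)
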
def test_tpu_grad_norm(tmpdir):
    """Test that gradient clipping by norm works on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        tpu_cores=1,
        limit_train_batches=0.4,
        limit_val_batches=0.4,
        gradient_clip_val=0.1,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)

def test_model_tpu_index(tmpdir, tpu_core):
    """Make sure model trains on the specified TPU core."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=2,
        tpu_cores=[tpu_core],
        limit_train_batches=4,
        limit_val_batches=4,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
    assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}'

def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k: int):
    """Test that dirpath=None in checkpoint callback is valid and that ckpt_path is set correctly."""
    tutils.reset_seed()
    model = LogInTwoMethods()

    checkpoint = ModelCheckpoint(monitor="early_stop_on", dirpath=None, filename="{epoch}", save_top_k=save_top_k)
    max_epochs = 2
    trainer = Trainer(default_root_dir=tmpdir, callbacks=[checkpoint], overfit_batches=0.20, max_epochs=max_epochs)
    trainer.fit(model)
    assert checkpoint.dirpath == tmpdir / trainer.logger.name / "version_0" / "checkpoints"

    if save_top_k == -1:
        ckpt_files = os.listdir(checkpoint.dirpath)
        expected_ckpt_files = [f"epoch={i}.ckpt" for i in range(max_epochs)]
        assert len(ckpt_files) == len(expected_ckpt_files) == max_epochs
        assert set(ckpt_files) == set(expected_ckpt_files)

def test_tpu_grad_norm(tmpdir):
    """Test that gradient clipping by norm works on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=4,
        accelerator="tpu",
        devices=1,
        limit_train_batches=0.4,
        limit_val_batches=0.4,
        gradient_clip_val=0.5,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)

def test_model_16bit_tpu_cores_1(tmpdir):
    """Make sure model trains on TPU."""
    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        precision=16,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        tpu_cores=1,
        limit_train_batches=4,
        limit_val_batches=4,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False)
    assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"

def run_model_test(
    trainer_options,
    model: LightningModule,
    data: LightningDataModule = None,
    on_gpu: bool = True,
    version=None,
    with_hpc: bool = True,
    min_acc: float = 0.25,
):
    reset_seed()
    save_dir = trainer_options["default_root_dir"]

    # logger file to get meta
    logger = get_default_logger(save_dir, version=version)
    trainer_options.update(logger=logger)
    trainer = Trainer(**trainer_options)
    initial_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])
    trainer.fit(model, datamodule=data)
    post_train_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])

    assert trainer.state.finished, f"Training failed with {trainer.state}"

    # Check that the model has actually changed post-training
    change_ratio = torch.norm(initial_values - post_train_values)
    assert change_ratio > 0.03, f"the model parameters changed by only {change_ratio}"

    # test model loading
    _ = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path, type(model))

    # test new model accuracy
    test_loaders = model.test_dataloader() if not data else data.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    if not isinstance(model, BoringModel):
        for dataloader in test_loaders:
            run_model_prediction(model, dataloader, min_acc=min_acc)

    if with_hpc:
        # test HPC saving
        # finalize the logger to make sure we get all the metrics
        if logger:
            logger.finalize("finished")
        hpc_save_path = trainer.checkpoint_connector.hpc_save_path(save_dir)
        trainer.save_checkpoint(hpc_save_path)
        # test HPC loading
        checkpoint_path = trainer.checkpoint_connector._CheckpointConnector__get_max_ckpt_path_from_folder(save_dir)
        trainer.checkpoint_connector.restore(checkpoint_path)

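# A minimal sketch of how a test might drive `run_model_test` (hypothetical option values,
# mirroring the TPU tests above):
def test_cpu_model_sketch(tmpdir):
    trainer_options = dict(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=1,
        limit_train_batches=4,
        limit_val_batches=4,
    )
    model = BoringModel()
    run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)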