def test_ddp_sharded_strategy_finetune(tmpdir):
    """Check that a sharded run can be checkpointed and then trained further.

    A ``BoringModel`` is fitted with the ``ddp_sharded_spawn`` strategy on two
    GPUs and saved to ``model.pt`` under ``tmpdir``. The file is then loaded
    back and fitted a second time with a plain trainer, which stands in for a
    fine-tuning run.
    """
    ckpt_file = os.path.join(tmpdir, "model.pt")

    # Stage 1: initial sharded training, then persist the result.
    base_model = BoringModel()
    sharded_trainer = Trainer(gpus=2, strategy="ddp_sharded_spawn", fast_dev_run=True)
    sharded_trainer.fit(base_model)
    sharded_trainer.save_checkpoint(ckpt_file)

    # Stage 2: restart from the checkpoint with a default trainer.
    restored_model = BoringModel.load_from_checkpoint(ckpt_file)
    finetune_trainer = Trainer(fast_dev_run=True)
    finetune_trainer.fit(restored_model)
def _assert_save_model_is_equal(model, tmpdir, trainer):
    """Save a checkpoint through ``trainer`` and verify it reproduces ``model``.

    The checkpoint is written to ``model.pt`` inside ``tmpdir``. On global
    rank 0 the file is loaded back as a ``BoringModel`` and its parameters are
    compared pairwise with those of ``model`` using ``torch.equal``. Other
    ranks only take part in saving.

    Args:
        model: Model whose parameters serve as the reference.
        tmpdir: Directory that receives the checkpoint file.
        trainer: Trainer used for saving; its ``global_rank`` gates the check.
    """
    ckpt_file = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(ckpt_file)

    # Only rank 0 performs the comparison; every other rank is finished here.
    if trainer.global_rank != 0:
        return

    reloaded = BoringModel.load_from_checkpoint(ckpt_file)
    if model.dtype == torch.half:
        # A loaded model is float32 by default, so cast it to float16 to match.
        reloaded = reloaded.half()
    reference = model.cpu()

    # Every parameter must be identical after the save/load round trip.
    param_pairs = zip(reference.parameters(), reloaded.parameters())
    for expected, restored in param_pairs:
        assert torch.equal(expected, restored)
def _assert_save_model_is_equal(model, tmpdir, trainer):
    """Save a checkpoint through ``trainer`` and verify it reproduces ``model``.

    The checkpoint is written to ``model.pt`` inside ``tmpdir``. On global
    rank 0 the file is loaded back as a ``BoringModel``; both the loaded model
    and ``model`` are cast to float, ``model`` is moved to the CPU, and their
    parameters are compared pairwise with ``torch.equal``. Other ranks only
    take part in saving.

    Args:
        model: Model whose parameters serve as the reference.
        tmpdir: Directory that receives the checkpoint file.
        trainer: Trainer used for saving; its ``global_rank`` gates the check.
    """
    ckpt_file = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(ckpt_file)

    # Only rank 0 performs the comparison; every other rank is finished here.
    if trainer.global_rank != 0:
        return

    # Bring both models to float so the comparison is done in a common dtype.
    reloaded = BoringModel.load_from_checkpoint(ckpt_file).float()
    reference = model.float().cpu()

    # Every parameter must be identical after the save/load round trip.
    param_pairs = zip(reference.parameters(), reloaded.parameters())
    for expected, restored in param_pairs:
        assert torch.equal(expected, restored)
def test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir):
    """Check that a checkpoint written after multi-GPU sharded training is faithful.

    A ``BoringModel`` is fitted with the ``ddp_sharded_spawn`` strategy on two
    GPUs and saved to ``model.pt`` under ``tmpdir``. The reloaded model must
    hold exactly the same parameter values as the trained one.
    """
    ckpt_file = os.path.join(tmpdir, "model.pt")

    trained = BoringModel()
    sharded_trainer = Trainer(gpus=2, strategy="ddp_sharded_spawn", fast_dev_run=True)
    sharded_trainer.fit(trained)
    sharded_trainer.save_checkpoint(ckpt_file)

    reloaded = BoringModel.load_from_checkpoint(ckpt_file)

    # Compare on the CPU: the trained parameter is moved there first.
    param_pairs = zip(trained.parameters(), reloaded.parameters())
    for trained_param, reloaded_param in param_pairs:
        trained_on_cpu = trained_param.to("cpu")
        assert torch.equal(trained_on_cpu, reloaded_param)
def test_ddp_sharded_plugin_checkpoint_cpu(tmpdir):
    """Check that a checkpoint written after two-process sharded training is faithful.

    A ``BoringModel`` is fitted with ``accelerator="ddp_sharded_spawn"`` and
    two processes, then saved to ``model.pt`` under ``tmpdir``. The reloaded
    model must hold exactly the same parameter values as the trained one.
    """
    ckpt_file = os.path.join(tmpdir, "model.pt")

    trained = BoringModel()
    sharded_trainer = Trainer(accelerator="ddp_sharded_spawn", num_processes=2, fast_dev_run=True)
    sharded_trainer.fit(trained)
    sharded_trainer.save_checkpoint(ckpt_file)

    reloaded = BoringModel.load_from_checkpoint(ckpt_file)

    # Compare on the CPU: the trained parameter is moved there first.
    param_pairs = zip(trained.parameters(), reloaded.parameters())
    for trained_param, reloaded_param in param_pairs:
        trained_on_cpu = trained_param.to("cpu")
        assert torch.equal(trained_on_cpu, reloaded_param)
def test_ddp_sharded_plugin_finetune(tmpdir):
    """Check that a sharded run can be checkpointed and then trained further.

    A ``BoringModel`` is fitted with ``accelerator='ddp_sharded_spawn'`` on two
    GPUs and saved to ``model.pt`` under ``tmpdir``. The file is then loaded
    back and fitted a second time with a plain trainer, which stands in for a
    fine-tuning run.
    """
    ckpt_file = os.path.join(tmpdir, 'model.pt')

    # Stage 1: initial sharded training, then persist the result.
    base_model = BoringModel()
    sharded_trainer = Trainer(gpus=2, accelerator='ddp_sharded_spawn', fast_dev_run=True)
    sharded_trainer.fit(base_model)
    sharded_trainer.save_checkpoint(ckpt_file)

    # Stage 2: restart from the checkpoint with a default trainer.
    restored_model = BoringModel.load_from_checkpoint(ckpt_file)
    finetune_trainer = Trainer(fast_dev_run=True)
    finetune_trainer.fit(restored_model)
def test_ddp_sharded_plugin_checkpoint_multi_gpu(tmpdir):
    """Test to ensure that checkpoint is saved correctly when using multiple GPUs.

    A ``BoringModel`` is fitted on two GPUs with ``accelerator='ddp_spawn'``
    and a ``DDPShardedPlugin``, then saved to ``model.pt`` under ``tmpdir``.
    The model loaded back from that file must hold exactly the same parameter
    values as the trained one.
    """
    model = BoringModel()
    trainer = Trainer(
        gpus=2,
        accelerator='ddp_spawn',
        plugins=[DDPShardedPlugin()],
        fast_dev_run=True,
    )

    trainer.fit(model)

    checkpoint_path = os.path.join(tmpdir, 'model.pt')
    trainer.save_checkpoint(checkpoint_path)
    saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

    # Assert model parameters are identical after loading.
    for ddp_param, shard_param in zip(model.parameters(), saved_model.parameters()):
        # Move the trained parameter to the CPU before comparing, as the
        # sibling checkpoint tests do: torch.equal cannot compare tensors that
        # live on different devices. This is a no-op if it is already on CPU.
        assert torch.equal(ddp_param.to("cpu"), shard_param)