def test_deepspeed_multigpu_single_file(tmpdir):
    """Test to ensure that DeepSpeed loads from a single file checkpoint."""
    model = BoringModel()
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
    trainer.fit(model)
    trainer.save_checkpoint(checkpoint_path)

    trainer = Trainer(
        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=1, fast_dev_run=True, precision=16
    )
    plugin = trainer.training_type_plugin
    assert isinstance(plugin, DeepSpeedPlugin)
    assert not plugin.load_full_weights
    with pytest.raises(MisconfigurationException, match="DeepSpeed was unable to load the checkpoint."):
        trainer.test(model, ckpt_path=checkpoint_path)

    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=[DeepSpeedPlugin(stage=3, load_full_weights=True)],
        gpus=1,
        fast_dev_run=True,
        precision=16,
    )
    plugin = trainer.training_type_plugin
    assert isinstance(plugin, DeepSpeedPlugin)
    assert plugin.load_full_weights
    trainer.test(model, ckpt_path=checkpoint_path)

def run_checkpoint_test(tmpdir: str, automatic_optimization: bool = True, accumulate_grad_batches: int = 2):
    seed_everything(1)
    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=10,
        plugins=[DeepSpeedPlugin(stage=3)],
        gpus=2,
        precision=16,
        accumulate_grad_batches=accumulate_grad_batches,
        callbacks=[ck],
    )
    trainer.fit(model, datamodule=dm)

    results = trainer.test(datamodule=dm)
    assert results[0]["test_acc"] > 0.7
    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]["test_acc"] > 0.7
    assert saved_results == results

    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()
    trainer = Trainer(default_root_dir=tmpdir, gpus=2, plugins=[DeepSpeedPlugin(stage=3)], precision=16)
    results = trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
    assert results[0]["test_acc"] > 0.7

def test_deepspeed_multigpu_stage_3_resume_training(tmpdir):
    """Test to ensure with Stage 3 and multiple GPUs that we can resume training."""
    initial_model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    initial_trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        plugins=DeepSpeedPlugin(stage=3),
        gpus=1,
        precision=16,
        callbacks=[ck],
    )
    initial_trainer.fit(initial_model, datamodule=dm)

    class TestCallback(Callback):
        def on_train_batch_start(
            self, trainer: Trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int
        ) -> None:
            original_deepspeed_plugin = initial_trainer.accelerator.training_type_plugin
            current_deepspeed_plugin = trainer.accelerator.training_type_plugin

            assert isinstance(original_deepspeed_plugin, DeepSpeedPlugin)
            assert isinstance(current_deepspeed_plugin, DeepSpeedPlugin)
            # assert the optimizer states are loaded correctly
            original_optimizer_dict = original_deepspeed_plugin.deepspeed_engine.optimizer.state_dict()
            current_optimizer_dict = current_deepspeed_plugin.deepspeed_engine.optimizer.state_dict()
            for orig_tensor, current_tensor in zip(
                original_optimizer_dict["fp32_flat_groups"], current_optimizer_dict["fp32_flat_groups"]
            ):
                assert torch.all(orig_tensor.eq(current_tensor))
            # assert the model state is loaded correctly
            for current_param, initial_param in zip(pl_module.parameters(), initial_model.parameters()):
                assert torch.equal(current_param.cpu(), initial_param.cpu())
            # assert the epoch has been restored correctly
            assert trainer.current_epoch == 1

    model = ModelParallelClassificationModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        plugins=DeepSpeedPlugin(stage=3),
        gpus=1,
        precision=16,
        resume_from_checkpoint=ck.best_model_path,
        callbacks=TestCallback(),
    )
    trainer.fit(model, datamodule=dm)

def run_checkpoint_test(
    tmpdir: str, save_full_weights: bool, automatic_optimization: bool = True, accumulate_grad_batches: int = 2
):
    seed_everything(1)
    if automatic_optimization:
        model = ModelParallelClassificationModel()
    else:
        model = ManualModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    trainer = Trainer(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=10,
        plugins=[DeepSpeedPlugin(stage=3, save_full_weights=save_full_weights)],
        gpus=2,
        precision=16,
        accumulate_grad_batches=accumulate_grad_batches,
        callbacks=[ck],
    )
    trainer.fit(model, datamodule=dm)

    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7
    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results[0]['test_acc'] > 0.7
    assert saved_results == results

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=10,
        plugins=[DeepSpeedPlugin(stage=3, save_full_weights=save_full_weights)],
        gpus=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[ck],
        resume_from_checkpoint=ck.best_model_path,
    )
    results = trainer.test(model, datamodule=dm)
    assert results[0]['test_acc'] > 0.7

    dm.predict_dataloader = dm.test_dataloader
    results = trainer.predict(datamodule=dm)
    assert results[-1] > 0.7

def test_deepspeed_with_invalid_config_path(tmpdir):
    """Test to ensure if we pass an invalid config path we throw an exception."""
    with pytest.raises(
        MisconfigurationException, match="You passed in a path to a DeepSpeed config but the path does not exist"
    ):
        DeepSpeedPlugin(config="invalid_path.json")

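# For contrast with the failure case above, a minimal sketch of passing a *valid* config
# path. The function name and config contents are illustrative assumptions, not a fixture
# from this file; `DeepSpeedPlugin` parses the JSON from disk when the path exists (see
# `test_deepspeed_with_env_path` below for the same behavior via an env variable).
def example_valid_config_path(tmpdir):
    import json
    import os

    config = {"zero_optimization": {"stage": 2}, "fp16": {"enabled": True}}
    config_path = os.path.join(tmpdir, "ds_config.json")
    with open(config_path, "w") as f:
        json.dump(config, f)
    # the parsed file is exposed on the plugin's `config` attribute
    plugin = DeepSpeedPlugin(config=config_path)
    assert plugin.config["zero_optimization"]["stage"] == 2
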
def test_deepspeed_setup_train_dataloader(tmpdir):
    """Test that DeepSpeed works when the DataModule requires ``setup`` to be called before its dataloaders."""

    class TestSetupIsCalledDataModule(LightningDataModule):
        def __init__(self):
            super().__init__()
            self._setup = False

        def setup(self, stage: Optional[str] = None) -> None:
            self._setup = True

        def train_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

        def val_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

        def test_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedPlugin(logging_level=logging.INFO),
        gpus=1,
        fast_dev_run=True,
    )
    dm = TestSetupIsCalledDataModule()
    with mock.patch("deepspeed.utils.logging.logger.warning", autospec=True) as mock_object:
        trainer.fit(model, datamodule=dm)
    assert any("Tried to infer the batch size" in str(arg) for arg in mock_object.call_args_list)

def _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
    """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works."""
    seed_everything(42)

    class VerificationCallback(Callback):
        def __init__(self):
            self.on_train_batch_start_called = False

        def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int) -> None:
            deepspeed_engine = trainer.training_type_plugin.model
            assert trainer.global_step == deepspeed_engine.global_steps
            self.on_train_batch_start_called = True

    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    verification_callback = VerificationCallback()
    trainer = Trainer(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        # TODO: this test fails with max_epochs > 1 as there are leftover batches per epoch.
        # Lightning and DeepSpeed diverge in how they handle the last batch of the epoch:
        # we step the optimizers on the last batch, but DeepSpeed keeps the accumulation for the next epoch.
        max_epochs=1,
        strategy=DeepSpeedPlugin(stage=2, offload_optimizer=offload_optimizer),
        gpus=2,
        limit_train_batches=5,
        limit_val_batches=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[verification_callback],
    )
    assert trainer.limit_train_batches % trainer.accumulate_grad_batches != 0, "leftover batches should be tested"
    trainer.fit(model, datamodule=dm)
    assert verification_callback.on_train_batch_start_called

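# Several tests below reference `deepspeed_config` and `deepspeed_zero_config` fixtures
# that are not defined in this section. A minimal sketch of what they would need to
# contain, given the assertions in those tests (an SGD optimizer, a WarmupLR scheduler,
# ZeRO stage 2); the exact parameter values are assumptions, not the canonical fixtures.
@pytest.fixture
def deepspeed_config():
    return {
        "optimizer": {"type": "SGD", "params": {"lr": 3e-5}},
        "scheduler": {
            "type": "WarmupLR",
            "params": {"last_batch_iteration": -1, "warmup_min_lr": 0, "warmup_max_lr": 3e-5, "warmup_num_steps": 100},
        },
    }


@pytest.fixture
def deepspeed_zero_config(deepspeed_config):
    return {**deepspeed_config, "zero_allow_untested_optimizer": True, "zero_optimization": {"stage": 2}}
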
def test_deepspeed_config(tmpdir, deepspeed_zero_config):
    """Test to ensure deepspeed works correctly when passed a DeepSpeed config object including
    optimizers/schedulers and saves the model weights to load correctly."""

    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            from deepspeed.runtime.lr_schedules import WarmupLR
            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer

            assert isinstance(trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert isinstance(trainer.lr_schedulers[0]["scheduler"], WarmupLR)

    model = BoringModel()
    trainer = Trainer(
        strategy=DeepSpeedPlugin(config=deepspeed_zero_config),
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
        precision=16,
        callbacks=[TestCB()],
    )
    trainer.fit(model)
    trainer.test(model)

def test_deepspeed_run_configure_optimizers(tmpdir):
    """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), whilst using
    configure_optimizers for optimizers and schedulers."""

    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer

            assert isinstance(trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert isinstance(trainer.lr_schedulers[0]["scheduler"], torch.optim.lr_scheduler.StepLR)
            # check that the lr_scheduler config was preserved
            assert trainer.lr_schedulers[0]["name"] == "Sean"

    class TestModel(BoringModel):
        def configure_optimizers(self):
            [optimizer], [scheduler] = super().configure_optimizers()
            return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "name": "Sean"}}

    model = TestModel()
    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        strategy=DeepSpeedPlugin(),  # disable ZeRO so our optimizers are not wrapped
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
        precision=16,
        callbacks=[TestCB(), lr_monitor],
    )
    trainer.fit(model)
    assert lr_monitor.lrs == {"Sean": [0.1]}

    _assert_save_model_is_equal(model, tmpdir, trainer)

def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
    """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works."""
    seed_everything(42)

    class VerificationCallback(Callback):
        def on_train_batch_start(
            self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int
        ) -> None:
            deepspeed_engine = trainer.training_type_plugin.model
            assert trainer.global_step == deepspeed_engine.global_steps

    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    trainer = Trainer(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=5,
        plugins=[DeepSpeedPlugin(stage=2, offload_optimizer=offload_optimizer)],
        gpus=2,
        limit_val_batches=2,
        precision=16,
        accumulate_grad_batches=2,
        callbacks=[VerificationCallback()],
    )
    trainer.fit(model, datamodule=dm)

def test_deepspeed_multigpu_test(tmpdir, deepspeed_config):
    """Test to ensure we can use DeepSpeed with just test using ZeRO Stage 3."""
    model = ModelParallelBoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
    )
    trainer.test(model)

def test_deepspeed_custom_precision_params(tmpdir):
    """Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these
    changes."""

    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            assert self.trainer.training_type_plugin.config['fp16']['loss_scale'] == 10
            assert self.trainer.training_type_plugin.config['fp16']['initial_scale_power'] == 10
            assert self.trainer.training_type_plugin.config['fp16']['loss_scale_window'] == 10
            assert self.trainer.training_type_plugin.config['fp16']['hysteresis'] == 10
            assert self.trainer.training_type_plugin.config['fp16']['min_loss_scale'] == 10
            raise SystemExit()

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,  # added so the test does not write to the working directory
        plugins=[
            DeepSpeedPlugin(
                loss_scale=10, initial_scale_power=10, loss_scale_window=10, hysteresis=10, min_loss_scale=10
            )
        ],
        precision=16,
        gpus=1,
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_deepspeed_setup_train_dataloader(tmpdir):
    """Test that DeepSpeed works when the DataModule requires ``setup`` and the user passes the batch size
    manually."""

    class TestSetupIsCalledDataModule(LightningDataModule):
        def __init__(self):
            super().__init__()
            self._setup = False

        def setup(self, stage: Optional[str] = None) -> None:
            self._setup = True

        def train_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

        def val_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

        def test_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        plugins=[DeepSpeedPlugin(logging_batch_size_per_gpu=32)],
        gpus=1,
        fast_dev_run=True,
    )
    trainer.fit(model, datamodule=TestSetupIsCalledDataModule())
    trainer.test(model)

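# The following test relies on pytest parametrization for `dataset_cls` and `value`,
# which is not shown in this section. A plausible sketch of the decorator (the exact
# combinations are an assumption; `RandomIterableDataset` would need to be imported
# from the test helpers alongside `RandomDataset`):
@pytest.mark.parametrize(
    ("dataset_cls", "value"),
    [(RandomDataset, "auto"), (RandomDataset, 10), (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)],
)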
def test_deepspeed_auto_batch_size_config_select(tmpdir, dataset_cls, value):
    """Test to ensure that the batch size is correctly set as expected for deepspeed logging purposes."""

    class TestModel(BoringModel):
        def train_dataloader(self):
            return DataLoader(dataset_cls(32, 64))

    class AssertCallback(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            assert isinstance(trainer.accelerator.training_type_plugin, DeepSpeedPlugin)
            config = trainer.accelerator.training_type_plugin.config

            # int value overrides auto mode
            expected_value = value if isinstance(value, int) else 1
            if dataset_cls == RandomDataset:
                expected_value = pl_module.train_dataloader().batch_size if value == "auto" else value

            assert config['train_micro_batch_size_per_gpu'] == expected_value
            raise SystemExit

    ck = AssertCallback()
    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        callbacks=ck,
        gpus=1,
        plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=value, zero_optimization=False),
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_deepspeed_config(tmpdir, deepspeed_zero_config):
    """Test to ensure deepspeed works correctly when passed a DeepSpeed config object including
    optimizers/schedulers and saves the model weights to load correctly."""

    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            from deepspeed.runtime.lr_schedules import WarmupLR
            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer

            assert isinstance(trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert trainer.lr_schedulers == []  # DeepSpeed manages the LR scheduler internally
            # Ensure the DeepSpeed engine has initialized with our optimizer/lr_scheduler
            assert isinstance(trainer.model.lr_scheduler, WarmupLR)

    model = BoringModel()
    trainer = Trainer(
        plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)],
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
        precision=16,
        callbacks=[TestCB()],
    )
    trainer.fit(model)
    trainer.test(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)

def test_deepspeed_defaults(tmpdir):
    """Ensure that defaults are correctly set as a config for DeepSpeed if no arguments are passed."""
    plugin = DeepSpeedPlugin()
    assert plugin.config is not None
    assert isinstance(plugin.config["zero_optimization"], dict)

def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir):
    """Test to ensure with Stage 3 and multiple GPUs that we can resume from training, throwing a warning that the
    optimizer state and scheduler states cannot be restored."""
    dm = ClassifDataModule()
    model = BoringModel()
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
    trainer.fit(model)
    trainer.save_checkpoint(checkpoint_path)

    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        plugins=DeepSpeedPlugin(stage=3, load_full_weights=True),
        gpus=1,
        precision=16,
        resume_from_checkpoint=checkpoint_path,
    )
    with pytest.warns(
        UserWarning,
        match="A single checkpoint file has been given. This means optimizer states and "
        "scheduler states can not be restored. If you'd like to restore these states, you must "
        "provide a path to the originally saved DeepSpeed checkpoint.",
    ):
        trainer.fit(model, datamodule=dm)

def test_deepspeed_custom_precision_params(tmpdir):
    """Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these
    changes."""

    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            assert trainer.training_type_plugin.config['fp16']['loss_scale'] == 10
            assert trainer.training_type_plugin.config['fp16']['initial_scale_power'] == 10
            assert trainer.training_type_plugin.config['fp16']['loss_scale_window'] == 10
            assert trainer.training_type_plugin.config['fp16']['hysteresis'] == 10
            assert trainer.training_type_plugin.config['fp16']['min_loss_scale'] == 10
            raise SystemExit()

    model = BoringModel()
    ds = DeepSpeedPlugin(loss_scale=10, initial_scale_power=10, loss_scale_window=10, hysteresis=10, min_loss_scale=10)
    trainer = Trainer(default_root_dir=tmpdir, plugins=[ds], precision=16, gpus=1, callbacks=[TestCB()])
    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_deepspeed_run_configure_optimizers(tmpdir):
    """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), whilst using
    configure_optimizers for optimizers and schedulers."""

    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            assert isinstance(self.trainer.optimizers[0], torch.optim.SGD)
            assert self.trainer.lr_schedulers == []  # DeepSpeed manages the LR scheduler internally
            # Ensure the DeepSpeed engine has initialized with our optimizer/lr_scheduler
            assert isinstance(self.trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR)

    model = TestModel()
    trainer = Trainer(
        plugins=DeepSpeedPlugin(zero_optimization=False),
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
    )
    trainer.fit(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)

def test_deepspeed_config(tmpdir, deepspeed_config):
    """Test to ensure deepspeed works correctly when passed a DeepSpeed config object including
    optimizers/schedulers and saves the model weights to load correctly."""

    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            import deepspeed

            assert isinstance(self.trainer.optimizers[0], torch.optim.SGD)
            assert self.trainer.lr_schedulers == []  # DeepSpeed manages the LR scheduler internally
            assert isinstance(self.trainer.model.optimizer, torch.optim.SGD)
            assert isinstance(self.trainer.model.lr_scheduler, deepspeed.runtime.lr_schedules.WarmupLR)

    model = TestModel()
    trainer = Trainer(
        plugins=[DeepSpeedPlugin(config=deepspeed_config)],
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
    )
    trainer.fit(model)
    trainer.test(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)

def test_deepspeed_plugin_env_variables(mock_deepspeed_distributed, tmpdir, platform):
    """Test to ensure that we set up distributed communication correctly.

    When using Windows, the rank environment variables should not be set, and DeepSpeed should handle this.
    """
    trainer = Trainer(default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)])
    plugin = trainer.training_type_plugin
    assert isinstance(plugin, DeepSpeedPlugin)
    with mock.patch("platform.system", return_value=platform) as mock_platform:
        plugin.init_ddp_connection()
    mock_deepspeed_distributed.assert_called()
    mock_platform.assert_called()
    if platform == "Windows":
        # assert no env variables have been set within the DeepSpeedPlugin
        assert all(k not in os.environ for k in ("MASTER_PORT", "MASTER_ADDR", "RANK", "WORLD_SIZE", "LOCAL_RANK"))
    else:
        assert os.environ["MASTER_ADDR"] == str(trainer.training_type_plugin.cluster_environment.master_address())
        assert os.environ["MASTER_PORT"] == str(trainer.training_type_plugin.cluster_environment.master_port())
        assert os.environ["RANK"] == str(trainer.training_type_plugin.global_rank)
        assert os.environ["WORLD_SIZE"] == str(trainer.training_type_plugin.world_size)
        assert os.environ["LOCAL_RANK"] == str(trainer.training_type_plugin.local_rank)

def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config):
    """Ensure if we use a config and turn off cpu_offload, that this is set to False within the config."""
    deepspeed_zero_config["zero_optimization"]["cpu_offload"] = False

    class TestCallback(Callback):
        def on_before_accelerator_backend_setup(self, trainer, pl_module) -> None:
            assert trainer.training_type_plugin.config["zero_optimization"]["cpu_offload"] is False
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)],
        precision=16,
        gpus=1,
        callbacks=[TestCallback()],
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)

def test_deepspeed_multigpu_partial_partition_parameters(tmpdir):
    """Test to ensure that a module that defines a layer inside the ``__init__`` and ``configure_sharded_model``
    correctly converts all parameters to float16 when ``precision=16`` and runs successfully."""

    class TestModel(ModelParallelBoringModel):
        def __init__(self):
            super().__init__()
            self.layer_2 = torch.nn.Linear(32, 32)

        def configure_sharded_model(self) -> None:
            self.layer = torch.nn.Linear(32, 2)

        def forward(self, x):
            x = self.layer_2(x)
            return self.layer(x)

        def on_train_epoch_start(self) -> None:
            assert all(x.dtype == torch.float16 for x in self.parameters())

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedPlugin(stage=3),
        gpus=1,
        fast_dev_run=True,
        precision=16,
    )
    trainer.fit(model)

def test_deepspeed_run_configure_optimizers(tmpdir):
    """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), whilst using
    configure_optimizers for optimizers and schedulers."""

    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer

            assert isinstance(trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert trainer.lr_schedulers == []  # DeepSpeed manages the LR scheduler internally
            # Ensure the DeepSpeed engine has initialized with our optimizer/lr_scheduler
            assert isinstance(trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR)

    model = BoringModel()
    trainer = Trainer(
        plugins=DeepSpeedPlugin(),  # disable ZeRO so our optimizers are not wrapped
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
        precision=16,
        callbacks=[TestCB()],
    )
    trainer.fit(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)

def select_training_type_plugin(self) -> TrainingTypePlugin:
    if self.use_ddp2:
        plugin = DDP2Plugin(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment)
    elif self.use_ddp and self.use_deepspeed:
        plugin = DeepSpeedPlugin(
            num_nodes=self.num_nodes,
            cluster_environment=self.select_cluster_environment(),
            parallel_devices=self.parallel_devices,
        )
    elif self.use_ddp:
        use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
        use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic
        use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
        use_ddp_cpu_spawn = self.use_ddp and self.on_cpu
        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
        use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED
        use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN

        # TODO: decouple from TE
        # ddp script mode uses the same flags as TE
        if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
            use_torchelastic_ddp = False

        if self.on_tpu:
            ddp_plugin_cls = TPUSpawnPlugin
        elif use_ddp_sharded:
            ddp_plugin_cls = DDPShardedPlugin
        elif use_ddp_sharded_spawn:
            ddp_plugin_cls = DDPSpawnShardedPlugin
        elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp:
            ddp_plugin_cls = DDPPlugin
        elif use_ddp_spawn or use_ddp_cpu_spawn:
            ddp_plugin_cls = DDPSpawnPlugin
        else:
            ddp_plugin_cls = DDPPlugin

        plugin = ddp_plugin_cls(
            parallel_devices=self.parallel_devices,
            num_nodes=self.num_nodes,
            cluster_environment=self.cluster_environment,
            sync_batchnorm=self.sync_batchnorm,
        )
    elif self.use_dp:
        plugin = DataParallelPlugin(parallel_devices=self.parallel_devices)
    elif self.use_horovod:
        plugin = HorovodPlugin(parallel_devices=self.parallel_devices)
    elif self.on_tpu:
        if isinstance(self.tpu_cores, list):
            plugin = SingleTPUPlugin(self.tpu_id)
        else:
            plugin = TPUSpawnPlugin(parallel_devices=list(range(self.tpu_cores)))
    else:
        single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
        plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu"))
    return plugin

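# For orientation, a rough sketch of how Trainer flags map to the selection logic above.
# The exact flag spellings depend on the Lightning version; treat these as illustrative
# assumptions rather than an exhaustive table:
#
#   Trainer(gpus=2, accelerator="ddp")        -> DDPPlugin
#   Trainer(gpus=2, accelerator="ddp_spawn")  -> DDPSpawnPlugin
#   Trainer(gpus=2, plugins="deepspeed")      -> DeepSpeedPlugin
#   Trainer(gpus=2, accelerator="dp")         -> DataParallelPlugin
#   Trainer(tpu_cores=8)                      -> TPUSpawnPlugin
#   Trainer(gpus=1)                           -> SingleDevicePlugin (cuda)
#   Trainer()                                 -> SingleDevicePlugin (cpu)
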
def test_deepspeed_with_env_path(tmpdir, monkeypatch, deepspeed_config):
    """Test to ensure if we pass an env variable, we load the config from the path."""
    config_path = os.path.join(tmpdir, "temp.json")
    with open(config_path, "w") as f:
        f.write(json.dumps(deepspeed_config))
    monkeypatch.setenv("PL_DEEPSPEED_CONFIG_PATH", config_path)
    plugin = DeepSpeedPlugin()
    assert plugin.config == deepspeed_config

def test_deepspeed_multigpu_no_schedulers(tmpdir):
    """Test to ensure ZeRO Stage 3 works with a parallel model and no schedulers."""
    model = ModelParallelBoringModelNoSchedulers()
    trainer = Trainer(
        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
    )
    trainer.fit(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)

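# `_assert_save_model_is_equal` is called by several tests above but is not defined in
# this section. A minimal sketch of what it would need to do, assuming a single-file
# checkpoint and fp16 training; the rank guard and dtype handling are assumptions, not
# the canonical helper.
def _assert_save_model_is_equal(model, tmpdir, trainer):
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(checkpoint_path)
    # carry out the comparison only on rank 0
    if trainer.global_rank == 0:
        saved_model = type(model).load_from_checkpoint(checkpoint_path)
        if model.dtype == torch.half:
            saved_model = saved_model.half()  # the checkpoint is loaded in float32 by default
        model = model.cpu()
        # the parameters should be identical after a save/load round trip
        for orig_param, saved_param in zip(model.parameters(), saved_model.parameters()):
            assert torch.equal(orig_param, saved_param)
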
def test_deepspeed_skip_backward_raises(tmpdir):
    """Test that returning ``None`` from ``training_step`` to skip backward raises with DeepSpeed."""

    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            return None

    model = TestModel()
    trainer = Trainer(default_root_dir=tmpdir, plugins=[DeepSpeedPlugin()], gpus=1, fast_dev_run=True, precision=16)
    with pytest.raises(MisconfigurationException, match="returning `None` .* is not supported"):
        trainer.fit(model)

def test_deepspeed_with_meta_device(tmpdir):
    """Test that a model instantiated on the meta device is materialized by DeepSpeed Stage 3."""
    with init_meta_context():
        model = BoringModel()
    assert model.layer.weight.device.type == "meta"
    trainer = Trainer(
        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
    )
    trainer.fit(model)
    assert model.layer.weight.device.type == "cpu"

def test_lightning_model():
    """Test that DeepSpeed works with a simple LightningModule."""
    model = BoringModel()
    trainer = Trainer(strategy=DeepSpeedPlugin(), max_epochs=1, precision=16, gpus=1)
    trainer.fit(model)