def test_sync_dist(_):
    sync = _Sync(TPUSpawnPlugin().reduce, should=True, op=torch.distributed.ReduceOp.SUM)
    value = torch.tensor([1.0])
    value = sync(value)
    assert value.item() == 8
def test_sync_dist(_):
    value = LightningModule._LightningModule__sync(
        torch.tensor([1.0]),
        sync_fn=TPUSpawnPlugin().reduce,
        sync_dist=True,
        sync_dist_op=torch.distributed.ReduceOp.SUM,
    )
    assert value.item() == 8
def select_training_type_plugin(self) -> TrainingTypePlugin:
    if self.use_ddp2:
        plugin = DDP2Plugin(
            parallel_devices=self.parallel_devices,
            cluster_environment=self.cluster_environment,
        )
    elif self.use_ddp and self.use_deepspeed:
        plugin = DeepSpeedPlugin(
            num_nodes=self.num_nodes,
            cluster_environment=self.select_cluster_environment(),
            parallel_devices=self.parallel_devices,
        )
    elif self.use_ddp:
        use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
        use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic
        use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
        use_ddp_cpu_spawn = self.use_ddp and self.on_cpu
        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
        use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED
        use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN

        # TODO: decouple from TE
        # ddp script mode uses the same flags as TE
        if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
            use_torchelastic_ddp = False

        if self.on_tpu:
            ddp_plugin_cls = TPUSpawnPlugin
        elif use_ddp_sharded:
            ddp_plugin_cls = DDPShardedPlugin
        elif use_ddp_sharded_spawn:
            ddp_plugin_cls = DDPSpawnShardedPlugin
        elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp:
            ddp_plugin_cls = DDPPlugin
        elif use_ddp_spawn or use_ddp_cpu_spawn:
            ddp_plugin_cls = DDPSpawnPlugin
        else:
            ddp_plugin_cls = DDPPlugin

        plugin = ddp_plugin_cls(
            parallel_devices=self.parallel_devices,
            num_nodes=self.num_nodes,
            cluster_environment=self.cluster_environment,
            sync_batchnorm=self.sync_batchnorm,
        )
    elif self.use_dp:
        plugin = DataParallelPlugin(parallel_devices=self.parallel_devices)
    elif self.use_horovod:
        plugin = HorovodPlugin(parallel_devices=self.parallel_devices)
    elif self.on_tpu:
        if isinstance(self.tpu_cores, list):
            plugin = SingleTPUPlugin(self.tpu_id)
        else:
            plugin = TPUSpawnPlugin(parallel_devices=list(range(self.tpu_cores)))
    else:
        single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
        plugin = SingleDevicePlugin(
            device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu")
        )
    return plugin
def test_tpu_invalid_raises():
    accelerator = TPUAccelerator(object(), TPUSpawnPlugin())
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"):
        accelerator.setup(object())

    accelerator = TPUAccelerator(TPUPrecisionPlugin(), object())
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugi"):
        accelerator.setup(object())
def test_sync_dist(rank):
    tensor = torch.tensor([1.0])
    training_type_plugin = TPUSpawnPlugin()

    res = Result()
    res.log(
        "test_tensor",
        tensor,
        sync_fn=training_type_plugin.reduce,
        sync_dist=True,
        sync_dist_op=torch.distributed.ReduceOp.SUM,
    )

    assert res["test_tensor"].item() == 8, "Result-Log does not work properly with TPU Spawn and Tensors"
def test_tpu_debug_mode(tmpdir):
    """Test if debug mode works on TPU."""

    class DebugModel(BoringModel):
        def on_train_start(self):
            assert os.environ.get("PT_XLA_DEBUG") == str(1), "PT_XLA_DEBUG was not set in environment variables"

        def teardown(self, stage):
            assert "PT_XLA_DEBUG" not in os.environ

    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=4,
        tpu_cores=8,
        limit_train_batches=0.4,
        limit_val_batches=0.4,
        plugins=TPUSpawnPlugin(debug=True),
    )

    model = DebugModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
def test_mp_device_dataloader_attribute(_):
    dataset = RandomDataset(32, 64)
    dataloader = TPUSpawnPlugin().process_dataloader(DataLoader(dataset))
    assert dataloader.dataset == dataset
def test_strategy_choice_tpu_plugin(tmpdir):
    trainer = Trainer(strategy=TPUSpawnPlugin(), accelerator="tpu", devices=8)
    assert isinstance(trainer.training_type_plugin, TPUSpawnPlugin)
def test_device_type_when_training_plugin_tpu_passed(tmpdir):
    trainer = Trainer(strategy=TPUSpawnPlugin(), tpu_cores=8)
    assert isinstance(trainer.training_type_plugin, TPUSpawnPlugin)
    assert trainer._device_type == DeviceType.TPU
    assert isinstance(trainer.accelerator, TPUAccelerator)