def test_sync_dist(_):
    sync = _Sync(TPUSpawnPlugin().reduce,
                 should=True,
                 op=torch.distributed.ReduceOp.SUM)
    value = torch.tensor([1.0])
    value = (sync(value), )
    # each of the 8 spawned TPU processes contributes 1.0, so the SUM reduce yields 8
    assert value[0].item() == 8
Example #2
def test_sync_dist(_):
    value = LightningModule._LightningModule__sync(
        torch.tensor([1.0]),
        sync_fn=TPUSpawnPlugin().reduce,
        sync_dist=True,
        sync_dist_op=torch.distributed.ReduceOp.SUM)
    assert value.item() == 8
Example #3
    def select_training_type_plugin(self) -> TrainingTypePlugin:
        if self.use_ddp2:
            plugin = DDP2Plugin(parallel_devices=self.parallel_devices,
                                cluster_environment=self.cluster_environment)
        elif self.use_ddp and self.use_deepspeed:
            plugin = DeepSpeedPlugin(
                num_nodes=self.num_nodes,
                cluster_environment=self.select_cluster_environment(),
                parallel_devices=self.parallel_devices)
        elif self.use_ddp:
            use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
            use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic
            use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
            use_ddp_cpu_spawn = self.use_ddp and self.on_cpu
            use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic
            use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
            use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED
            use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN

            # TODO: decouple from TE
            # ddp script mode uses the same flags as TE
            if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
                use_torchelastic_ddp = False

            if self.on_tpu:
                ddp_plugin_cls = TPUSpawnPlugin
            elif use_ddp_sharded:
                ddp_plugin_cls = DDPShardedPlugin
            elif use_ddp_sharded_spawn:
                ddp_plugin_cls = DDPSpawnShardedPlugin
            elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp:
                ddp_plugin_cls = DDPPlugin
            elif use_ddp_spawn or use_ddp_cpu_spawn:
                ddp_plugin_cls = DDPSpawnPlugin
            else:
                # fall back to standard DDP when no launcher- or sharding-specific mode matched
                ddp_plugin_cls = DDPPlugin

            plugin = ddp_plugin_cls(
                parallel_devices=self.parallel_devices,
                num_nodes=self.num_nodes,
                cluster_environment=self.cluster_environment,
                sync_batchnorm=self.sync_batchnorm,
            )
        elif self.use_dp:
            plugin = DataParallelPlugin(parallel_devices=self.parallel_devices)
        elif self.use_horovod:
            plugin = HorovodPlugin(parallel_devices=self.parallel_devices)
        elif self.on_tpu:
            if isinstance(self.tpu_cores, list):
                plugin = SingleTPUPlugin(self.tpu_id)
            else:
                plugin = TPUSpawnPlugin(
                    parallel_devices=list(range(self.tpu_cores)))
        else:
            single_gpu_ordinal = device_parser.determine_root_gpu_device(
                self.parallel_device_ids)
            plugin = SingleDevicePlugin(device=torch.device(
                f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu"))
        return plugin
Example #4
def test_tpu_invalid_raises():
    accelerator = TPUAccelerator(object(), TPUSpawnPlugin())
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"):
        accelerator.setup(object())

    accelerator = TPUAccelerator(TPUPrecisionPlugin(), object())
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugi"):
        accelerator.setup(object())
Example #5
    def test_sync_dist(rank):
        tensor = torch.tensor([1.0])
        training_type_plugin = TPUSpawnPlugin()

        res = Result()
        res.log("test_tensor",
                tensor,
                sync_fn=training_type_plugin.reduce,
                sync_dist=True,
                sync_dist_op=torch.distributed.ReduceOp.SUM)

        assert res["test_tensor"].item(
        ) == 8, "Result-Log does not work properly with TPU Spawn and Tensors"
Example #6
def test_tpu_debug_mode(tmpdir):
    """Test if debug mode works on TPU."""
    class DebugModel(BoringModel):
        def on_train_start(self):
            assert os.environ.get("PT_XLA_DEBUG") == str(1), \
                "PT_XLA_DEBUG was not set in environment variables"

        def teardown(self, stage):
            assert "PT_XLA_DEBUG" not in os.environ

    tutils.reset_seed()
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=4,
        tpu_cores=8,
        limit_train_batches=0.4,
        limit_val_batches=0.4,
        plugins=TPUSpawnPlugin(debug=True),
    )

    model = DebugModel()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
Example #7
def test_mp_device_dataloader_attribute(_):
    dataset = RandomDataset(32, 64)
    dataloader = TPUSpawnPlugin().process_dataloader(DataLoader(dataset))
    assert dataloader.dataset == dataset
Example #8
def test_strategy_choice_tpu_plugin(tmpdir):
    trainer = Trainer(strategy=TPUSpawnPlugin(), accelerator="tpu", devices=8)
    assert isinstance(trainer.training_type_plugin, TPUSpawnPlugin)
Example #9
def test_device_type_when_training_plugin_tpu_passed(tmpdir):
    trainer = Trainer(strategy=TPUSpawnPlugin(), tpu_cores=8)
    assert isinstance(trainer.training_type_plugin, TPUSpawnPlugin)
    assert trainer._device_type == DeviceType.TPU
    assert isinstance(trainer.accelerator, TPUAccelerator)