Code example #1
    def select_accelerator(self):
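        # reuse the accelerator backend if the trainer already has one attached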
        if self.trainer.accelerator_backend is not None:
            return self.trainer.accelerator_backend

        # SLURM ddp
        use_slurm_ddp = self.trainer.use_ddp and self.trainer.is_slurm_managing_tasks

        # torchelastic or general non_slurm ddp
        te_flags_passed = 'WORLD_SIZE' in os.environ and (
            'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
        use_torchelastic_ddp = self.trainer.use_ddp and te_flags_passed

        use_ddp_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_spawn"
        use_ddp_cpu_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_cpu"

        # choose the appropriate accelerator backend
        if self.trainer.use_ddp2:
            accelerator_backend = accelerators.DDP2Backend(self.trainer)

        elif use_slurm_ddp:
            accelerator_backend = accelerators.DDPBackend(self.trainer,
                                                          mode='slurm_ddp')

        elif use_torchelastic_ddp:
            accelerator_backend = accelerators.DDPBackend(
                self.trainer, mode='torchelastic_ddp')

        elif use_ddp_spawn:
            accelerator_backend = accelerators.DDPSpawnBackend(
                self.trainer, nprocs=self.trainer.num_processes)

        elif use_ddp_cpu_spawn:
            accelerator_backend = accelerators.DDPCPUSpawnBackend(
                self.trainer, nprocs=self.trainer.num_processes)

        elif self.trainer.distributed_backend == "ddp":
            accelerator_backend = accelerators.DDPBackend(self.trainer,
                                                          mode='ddp')

        elif self.trainer.use_dp:
            accelerator_backend = accelerators.DataParallelBackend(
                self.trainer)

        elif self.trainer.use_horovod:
            accelerator_backend = accelerators.HorovodBackend(self.trainer)

        elif self.trainer.use_single_gpu:
            accelerator_backend = accelerators.GPUBackend(self.trainer)

        elif self.trainer.use_tpu:
            accelerator_backend = accelerators.TPUBackend(self.trainer)

        elif self.trainer.distributed_backend is None:
            accelerator_backend = accelerators.CPUBackend(self.trainer)
        else:
            raise MisconfigurationException(
                f'Trainer(distributed_backend={self.trainer.distributed_backend}) is not a supported backend'
            )

        return accelerator_backend
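
For reference, a minimal sketch of the torchelastic detection used above. This is hypothetical standalone code, not part of the original: a torchelastic launcher exports WORLD_SIZE and GROUP_RANK into the environment, which is exactly what te_flags_passed keys on.

import os

# Simulate the environment variables a torchelastic launcher would export.
os.environ['WORLD_SIZE'] = '4'
os.environ['GROUP_RANK'] = '0'

te_flags_passed = 'WORLD_SIZE' in os.environ and (
    'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
print(te_flags_passed)  # True; combined with use_ddp this routes to the torchelastic DDP backend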
Code example #2
    def select_accelerator(self):
        # SLURM ddp
        use_slurm_ddp = self.trainer.use_ddp and self.trainer.is_slurm_managing_tasks

        # torchelastic or general non_slurm ddp
        te_flags_passed = 'WORLD_SIZE' in os.environ and (
            'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
        use_torchelastic_ddp = self.trainer.use_ddp and te_flags_passed

        use_ddp_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == 'ddp_spawn'
        use_ddp_cpu_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == 'ddp_cpu'

        # choose the appropriate accelerator backend
        if self.trainer.use_ddp2:
            accelerator_backend = accelerators.DDP2Backend(self.trainer)

        elif use_slurm_ddp:
            accelerator_backend = accelerators.DDPBackend(self.trainer,
                                                          mode='slurm_ddp')

        elif use_torchelastic_ddp:
            accelerator_backend = accelerators.DDPBackend(
                self.trainer, mode='torchelastic_ddp')

        elif use_ddp_spawn:
            accelerator_backend = accelerators.DDPSpawnBackend(
                self.trainer, nprocs=self.trainer.num_processes)

        elif use_ddp_cpu_spawn:
            accelerator_backend = accelerators.DDPCPUSpawnBackend(
                self.trainer, nprocs=self.trainer.num_processes)

        elif self.trainer.distributed_backend == 'ddp':
            accelerator_backend = accelerators.DDPBackend(self.trainer,
                                                          mode='ddp')

        elif self.trainer.use_dp:
            accelerator_backend = accelerators.DataParallelBackend(
                self.trainer)

        elif self.trainer.use_horovod:
            accelerator_backend = accelerators.HorovodBackend(self.trainer)

        elif self.trainer.use_single_gpu:
            accelerator_backend = accelerators.GPUBackend(self.trainer)

        elif self.trainer.use_tpu:
            accelerator_backend = accelerators.TPUBackend(self.trainer)

        else:
            accelerator_backend = accelerators.CPUBackend(self.trainer)

        return accelerator_backend
Code example #3
    def select_accelerator(self):
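        # reuse the accelerator backend if the trainer already has one attached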
        if self.trainer.accelerator_backend is not None:
            return self.trainer.accelerator_backend

        # SLURM ddp
        use_slurm_ddp = self.trainer.use_ddp and self.trainer.is_slurm_managing_tasks

        # torchelastic or general non_slurm ddp
        te_flags_passed = 'WORLD_SIZE' in os.environ and (
            'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
        use_torchelastic_ddp = self.trainer.use_ddp and te_flags_passed

        use_ddp_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_spawn"
        use_ddp_cpu_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_cpu"

        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self._is_using_torchelastic()
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.trainer.is_slurm_managing_tasks

        # ddp script mode uses the same flags as TE
        # TODO: decouple from TE
        if os.environ.get('PL_DDP_PID', False):
            use_torchelastic_ddp = False

        # choose the appropriate accelerator backend
        if self.trainer.use_ddp2:
            accelerator_backend = accelerators.DDP2Backend(self.trainer)

        elif use_ddp_cpu_slurm:
            accelerator_backend = accelerators.DDPCPUSLURMBackend(self.trainer)

        elif use_slurm_ddp:
            accelerator_backend = accelerators.DDPSLURMBackend(self.trainer)

        elif use_ddp_cpu_torch_elastic:
            accelerator_backend = accelerators.DDPCPUTorchElasticBackend(
                self.trainer)

        elif use_torchelastic_ddp:
            accelerator_backend = accelerators.DDPTorchElasticBackend(
                self.trainer)

        elif use_ddp_spawn:
            accelerator_backend = accelerators.DDPSpawnBackend(
                self.trainer, nprocs=self.trainer.num_processes)

        elif use_ddp_cpu_spawn:
            accelerator_backend = accelerators.DDPCPUSpawnBackend(
                self.trainer, nprocs=self.trainer.num_processes)

        elif self.trainer.distributed_backend == "ddp":
            accelerator_backend = accelerators.DDPBackend(self.trainer)

        elif self.trainer.use_dp:
            accelerator_backend = accelerators.DataParallelBackend(
                self.trainer)

        elif self.trainer.use_horovod:
            accelerator_backend = accelerators.HorovodBackend(self.trainer)

        elif self.trainer.use_single_gpu:
            accelerator_backend = accelerators.GPUBackend(self.trainer)

        elif self.trainer.use_tpu:
            accelerator_backend = accelerators.TPUBackend(self.trainer)

        elif self.trainer.distributed_backend is None:
            accelerator_backend = accelerators.CPUBackend(self.trainer)
        else:
            raise MisconfigurationException(
                f'Trainer(distributed_backend={self.trainer.distributed_backend}) is not a supported backend'
            )

        return accelerator_backend
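
As a usage sketch (hypothetical, assuming a pytorch_lightning version from this era in which Trainer still accepts the distributed_backend argument), this is roughly how the flags that drive the selection get set:

import pytorch_lightning as pl

# Two CPU processes via spawn: inside the trainer this sets use_ddp and
# distributed_backend == 'ddp_cpu', so select_accelerator() would return
# DDPCPUSpawnBackend in examples #1 and #2, or one of the DDP-CPU backends
# in example #3, depending on the SLURM/torchelastic environment.
trainer = pl.Trainer(distributed_backend='ddp_cpu', num_processes=2, max_epochs=1)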