def select_accelerator(self):
    if self.trainer.accelerator_backend is not None:
        return self.trainer.accelerator_backend

    # SLURM ddp
    use_slurm_ddp = self.trainer.use_ddp and self.trainer.is_slurm_managing_tasks

    # torchelastic or general non_slurm ddp
    te_flags_passed = 'WORLD_SIZE' in os.environ and (
        'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
    use_torchelastic_ddp = self.trainer.use_ddp and te_flags_passed

    use_ddp_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_spawn"
    use_ddp_cpu_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_cpu"

    # choose the appropriate accelerator backend
    if self.trainer.use_ddp2:
        accelerator_backend = accelerators.DDP2Backend(self.trainer)

    elif use_slurm_ddp:
        accelerator_backend = accelerators.DDPBackend(self.trainer, mode='slurm_ddp')

    elif use_torchelastic_ddp:
        accelerator_backend = accelerators.DDPBackend(self.trainer, mode='torchelastic_ddp')

    elif use_ddp_spawn:
        accelerator_backend = accelerators.DDPSpawnBackend(self.trainer, nprocs=self.trainer.num_processes)

    elif use_ddp_cpu_spawn:
        accelerator_backend = accelerators.DDPCPUSpawnBackend(self.trainer, nprocs=self.trainer.num_processes)

    elif self.trainer.distributed_backend == "ddp":
        accelerator_backend = accelerators.DDPBackend(self.trainer, mode='ddp')

    elif self.trainer.use_dp:
        accelerator_backend = accelerators.DataParallelBackend(self.trainer)

    elif self.trainer.use_horovod:
        accelerator_backend = accelerators.HorovodBackend(self.trainer)

    elif self.trainer.use_single_gpu:
        accelerator_backend = accelerators.GPUBackend(self.trainer)

    elif self.trainer.use_tpu:
        accelerator_backend = accelerators.TPUBackend(self.trainer)

    elif self.trainer.distributed_backend is None:
        accelerator_backend = accelerators.CPUBackend(self.trainer)

    else:
        raise MisconfigurationException(
            f'Trainer(distributed_backend={self.trainer.distributed_backend}) is not a supported backend'
        )

    return accelerator_backend
def select_accelerator(self):
    # SLURM ddp
    use_slurm_ddp = self.trainer.use_ddp and self.trainer.is_slurm_managing_tasks

    # torchelastic or general non_slurm ddp
    te_flags_passed = 'WORLD_SIZE' in os.environ and (
        'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
    use_torchelastic_ddp = self.trainer.use_ddp and te_flags_passed

    use_ddp_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == 'ddp_spawn'
    use_ddp_cpu_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == 'ddp_cpu'

    # choose the appropriate accelerator backend
    if self.trainer.use_ddp2:
        accelerator_backend = accelerators.DDP2Backend(self.trainer)

    elif use_slurm_ddp:
        accelerator_backend = accelerators.DDPBackend(self.trainer, mode='slurm_ddp')

    elif use_torchelastic_ddp:
        accelerator_backend = accelerators.DDPBackend(self.trainer, mode='torchelastic_ddp')

    elif use_ddp_spawn:
        accelerator_backend = accelerators.DDPSpawnBackend(self.trainer, nprocs=self.trainer.num_processes)

    elif use_ddp_cpu_spawn:
        accelerator_backend = accelerators.DDPCPUSpawnBackend(self.trainer, nprocs=self.trainer.num_processes)

    elif self.trainer.distributed_backend == 'ddp':
        accelerator_backend = accelerators.DDPBackend(self.trainer, mode='ddp')

    elif self.trainer.use_dp:
        accelerator_backend = accelerators.DataParallelBackend(self.trainer)

    elif self.trainer.use_horovod:
        accelerator_backend = accelerators.HorovodBackend(self.trainer)

    elif self.trainer.use_single_gpu:
        accelerator_backend = accelerators.GPUBackend(self.trainer)

    elif self.trainer.use_tpu:
        accelerator_backend = accelerators.TPUBackend(self.trainer)

    else:
        accelerator_backend = accelerators.CPUBackend(self.trainer)

    return accelerator_backend
def select_accelerator(self):
    if self.trainer.accelerator_backend is not None:
        return self.trainer.accelerator_backend

    # SLURM ddp
    use_slurm_ddp = self.trainer.use_ddp and self.trainer.is_slurm_managing_tasks

    # torchelastic or general non_slurm ddp
    te_flags_passed = 'WORLD_SIZE' in os.environ and (
        'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
    use_torchelastic_ddp = self.trainer.use_ddp and te_flags_passed

    use_ddp_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_spawn"
    use_ddp_cpu_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_cpu"
    use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self._is_using_torchelastic()
    use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.trainer.is_slurm_managing_tasks

    # ddp script mode uses the same flags as TE
    # TODO: decouple from TE
    if os.environ.get('PL_DDP_PID', False):
        use_torchelastic_ddp = False

    # choose the appropriate accelerator backend
    if self.trainer.use_ddp2:
        accelerator_backend = accelerators.DDP2Backend(self.trainer)

    elif use_ddp_cpu_slurm:
        accelerator_backend = accelerators.DDPCPUSLURMBackend(self.trainer)

    elif use_slurm_ddp:
        accelerator_backend = accelerators.DDPSLURMBackend(self.trainer)

    elif use_ddp_cpu_torch_elastic:
        accelerator_backend = accelerators.DDPCPUTorchElasticBackend(self.trainer)

    elif use_torchelastic_ddp:
        accelerator_backend = accelerators.DDPTorchElasticBackend(self.trainer)

    elif use_ddp_spawn:
        accelerator_backend = accelerators.DDPSpawnBackend(self.trainer, nprocs=self.trainer.num_processes)

    elif use_ddp_cpu_spawn:
        accelerator_backend = accelerators.DDPCPUSpawnBackend(self.trainer, nprocs=self.trainer.num_processes)

    elif self.trainer.distributed_backend == "ddp":
        accelerator_backend = accelerators.DDPBackend(self.trainer)

    elif self.trainer.use_dp:
        accelerator_backend = accelerators.DataParallelBackend(self.trainer)

    elif self.trainer.use_horovod:
        accelerator_backend = accelerators.HorovodBackend(self.trainer)

    elif self.trainer.use_single_gpu:
        accelerator_backend = accelerators.GPUBackend(self.trainer)

    elif self.trainer.use_tpu:
        accelerator_backend = accelerators.TPUBackend(self.trainer)

    elif self.trainer.distributed_backend is None:
        accelerator_backend = accelerators.CPUBackend(self.trainer)

    else:
        raise MisconfigurationException(
            f'Trainer(distributed_backend={self.trainer.distributed_backend}) is not a supported backend'
        )

    return accelerator_backend
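All three versions detect a torchelastic launch from environment variables before choosing a DDP backend. A minimal, standalone sketch of that check follows; it assumes only the standard library, and the helper name and the demo under __main__ are illustrative, not part of the library code above.

import os


def is_using_torchelastic() -> bool:
    # Same flag combination the selector checks: WORLD_SIZE must be set,
    # together with either GROUP_RANK or NODE_RANK.
    return 'WORLD_SIZE' in os.environ and (
        'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)


if __name__ == '__main__':
    # Illustrative only: simulate a torchelastic-style environment.
    os.environ.update({'WORLD_SIZE': '4', 'NODE_RANK': '0'})
    print(is_using_torchelastic())  # -> True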