def select_training_type_plugin(self) -> TrainingTypePlugin:
    if self.use_ddp2:
        plugin = DDP2Plugin(
            parallel_devices=self.parallel_devices,
            num_nodes=self.num_nodes,
            cluster_environment=self.cluster_environment,
            sync_batchnorm=self.sync_batchnorm,
        )
    elif self.use_ddp and self.use_deepspeed:
        plugin = DeepSpeedPlugin(
            num_nodes=self.num_nodes,
            cluster_environment=self.select_cluster_environment(),
            parallel_devices=self.parallel_devices,
        )
    elif self.use_ddp:
        use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
        use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.is_using_torchelastic()
        use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
        use_ddp_cpu_spawn = self.use_ddp and self.on_cpu
        use_tpu_spawn = self.on_tpu and self._distrib_type == DistributedType.TPU_SPAWN
        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.is_using_torchelastic()
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
        use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED
        use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN

        # TODO: decouple from TE
        # ddp script mode uses the same flags as TE
        if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
            use_torchelastic_ddp = False

        if use_tpu_spawn:
            ddp_plugin_cls = TPUSpawnPlugin
        elif use_ddp_sharded:
            ddp_plugin_cls = DDPShardedPlugin
        elif use_ddp_sharded_spawn:
            ddp_plugin_cls = DDPSpawnShardedPlugin
        elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp:
            ddp_plugin_cls = DDPPlugin
        elif use_ddp_spawn or use_ddp_cpu_spawn:
            ddp_plugin_cls = DDPSpawnPlugin
        else:
            ddp_plugin_cls = DDPPlugin

        plugin = ddp_plugin_cls(
            parallel_devices=self.parallel_devices,
            num_nodes=self.num_nodes,
            cluster_environment=self.cluster_environment,
            sync_batchnorm=self.sync_batchnorm,
        )
    elif self.use_dp:
        plugin = DataParallelPlugin(parallel_devices=self.parallel_devices)
    elif self.use_horovod:
        plugin = HorovodPlugin(parallel_devices=self.parallel_devices)
    elif self.on_tpu and isinstance(self.tpu_cores, list):
        plugin = SingleTPUPlugin(self.tpu_id)
    else:
        single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
        plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu"))
    return plugin
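# Note on the ``PL_IN_DDP_SUBPROCESS`` guard above: ``os.environ.get`` returns the raw
# string value when the variable is set, so any non-empty string (including "0") is
# truthy. The minimal sketch below only illustrates that Python behaviour; the variable
# name is taken from the snippet above, and the sketch is not part of the Lightning API.
import os

os.environ["PL_IN_DDP_SUBPROCESS"] = "0"
assert bool(os.environ.get("PL_IN_DDP_SUBPROCESS", False))  # "0" is a non-empty string, hence truthy

del os.environ["PL_IN_DDP_SUBPROCESS"]
assert not os.environ.get("PL_IN_DDP_SUBPROCESS", False)  # unset -> falls back to False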
def select_cluster_environment(self) -> ClusterEnvironment:
    if self._cluster_environment is not None:
        return self._cluster_environment
    if self.is_slurm_managing_tasks:
        env = SLURMEnvironment()
    elif TorchElasticEnvironment.is_using_torchelastic():
        env = TorchElasticEnvironment()
    else:
        env = LightningEnvironment()
    return env
def is_using_torchelastic(self) -> bool:
    """
    .. deprecated:: v1.3
        Will be removed in v1.5.0.

    Returns:
        ``True`` if the current process was launched using the torchelastic command.
    """
    rank_zero_deprecation(
        "The property `AcceleratorConnector.is_using_torchelastic` was deprecated in v1.3"
        " and will be removed in 1.5. Use `TorchElasticEnvironment.is_using_torchelastic()` instead.",
    )
    return TorchElasticEnvironment.is_using_torchelastic()
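# The property above is a thin deprecation shim: it warns once per rank and forwards to
# the replacement API. A minimal standalone sketch of the same pattern, using
# ``warnings.warn`` in place of Lightning's ``rank_zero_deprecation`` helper
# (``OldInterface`` and ``new_lookup`` are hypothetical names used only for illustration):
import warnings


def new_lookup() -> bool:
    """Stand-in for the replacement API (e.g. a static environment check)."""
    return False


class OldInterface:
    @property
    def is_using_torchelastic(self) -> bool:
        warnings.warn(
            "`OldInterface.is_using_torchelastic` is deprecated; use `new_lookup()` instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return new_lookup()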
def select_cluster_environment(self) -> ClusterEnvironment:
    if self._cluster_environment is not None:
        return self._cluster_environment
    if self._is_slurm_managing_tasks():
        env = SLURMEnvironment()
        rank_zero_info("Multiprocessing is handled by SLURM.")
    elif TorchElasticEnvironment.is_using_torchelastic():
        env = TorchElasticEnvironment()
    elif KubeflowEnvironment.is_using_kubeflow():
        env = KubeflowEnvironment()
    elif LSFEnvironment.is_using_lsf():
        env = LSFEnvironment()
    else:
        env = LightningEnvironment()
    return env
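# The selector above is a first-match-wins cascade: an explicitly passed environment
# wins, then SLURM, TorchElastic, Kubeflow, LSF, and finally the single-machine
# default. A generic sketch of that precedence pattern, under the assumption that each
# environment exposes a cheap "am I active?" check; ``pick_environment`` and the
# detector/factory pairs are hypothetical helpers, not Lightning API:
from typing import Callable, List, Optional, Tuple


def pick_environment(
    override: Optional[object],
    detectors: List[Tuple[Callable[[], bool], Callable[[], object]]],
    default: Callable[[], object],
) -> object:
    """Return the override if given, otherwise the first environment whose detector fires."""
    if override is not None:
        return override
    for is_active, make_env in detectors:
        if is_active():
            return make_env()
    return default()


# Example: nothing detected, so the default environment factory is used.
env = pick_environment(None, [(lambda: False, dict)], default=dict)
assert env == {}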
def parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[int]]:
    """
    Parses the GPU ids given in the format as accepted by the
    :class:`~pytorch_lightning.trainer.Trainer`.

    Args:
        gpus: An int -1 or string '-1' indicates that all available GPUs should be used.
            A list of ints or a string containing a list of comma-separated integers
            indicates specific GPUs to use.
            An int 0 means that no GPUs should be used.
            Any int N > 0 indicates that GPUs [0..N) should be used.

    Returns:
        A list of GPU ids to be used, or ``None`` if no GPUs were requested.

    Raises:
        MisconfigurationException:
            If no GPUs are available but the value of ``gpus`` indicates a request for GPUs.
    """
    # Check that the gpus param is None, an int, a str or a list of ints
    _check_data_type(gpus)

    # Handle the case when no GPUs are requested
    if gpus is None or (isinstance(gpus, int) and gpus == 0):
        return None

    if _compare_version("pytorch_lightning", operator.ge, "1.5") and isinstance(gpus, str) and gpus.strip() == "0":
        # TODO: in v1.5 combine this with the above if statement
        return None

    # We know the user requested GPUs, therefore if some of the
    # requested GPUs are not available an exception is thrown.
    gpus = _normalize_parse_gpu_string_input(gpus)
    gpus = _normalize_parse_gpu_input_to_list(gpus)
    if not gpus:
        raise MisconfigurationException("GPUs requested but none are available.")

    if TorchElasticEnvironment.is_using_torchelastic() and len(gpus) != 1 and len(_get_all_available_gpus()) == 1:
        # Omit the sanity check under torchelastic, which by default exposes one visible GPU per process
        return gpus

    return _sanitize_gpu_ids(gpus)
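# Rough usage sketch for parse_gpu_ids, following the docstring above. The commented
# outputs assume a machine where the requested GPU indices are actually visible; on a
# box with fewer GPUs the same calls raise MisconfigurationException instead. This is a
# hedged illustration, not a test of the library:
parse_gpu_ids(None)    # -> None (no GPUs requested)
parse_gpu_ids(0)       # -> None (explicitly zero GPUs)
parse_gpu_ids(3)       # -> [0, 1, 2]  (first three GPUs)
parse_gpu_ids([1, 3])  # -> [1, 3]     (specific GPU ids)
parse_gpu_ids("1, 3")  # -> [1, 3]     (comma-separated string form)
parse_gpu_ids(-1)      # -> every visible GPU id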
def select_training_type_plugin(self) -> TrainingTypePlugin:
    if (
        isinstance(self.distributed_backend, Accelerator)
        and self.distributed_backend.training_type_plugin is not None
    ):
        plugin = self.distributed_backend.training_type_plugin
    elif self.use_ddp2:
        plugin = DDP2Plugin(
            parallel_devices=self.parallel_devices,
            cluster_environment=self.cluster_environment,
        )
    elif self.use_ddp and self.use_deepspeed:
        plugin = DeepSpeedPlugin(
            cluster_environment=self.select_cluster_environment(),
            parallel_devices=self.parallel_devices,
        )
    elif self.use_ddp:
        use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
        use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.is_using_torchelastic()
        use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.is_using_kubeflow()
        use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
        use_ddp_cpu_spawn = use_ddp_spawn and self.use_cpu
        use_tpu_spawn = self.use_tpu and self._distrib_type == DistributedType.TPU_SPAWN
        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.is_using_torchelastic()
        use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.is_using_kubeflow()
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
        use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED
        use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN
        use_ddp_fully_sharded = self._distrib_type == DistributedType.DDP_FULLY_SHARDED

        if use_tpu_spawn:
            ddp_plugin_cls = TPUSpawnPlugin
        elif use_ddp_sharded:
            ddp_plugin_cls = DDPShardedPlugin
        elif use_ddp_sharded_spawn:
            ddp_plugin_cls = DDPSpawnShardedPlugin
        elif (
            use_ddp_cpu_slurm
            or use_slurm_ddp
            or use_ddp_cpu_torch_elastic
            or use_torchelastic_ddp
            or use_kubeflow_ddp
            or use_ddp_cpu_kubeflow
        ):
            ddp_plugin_cls = DDPPlugin
        elif use_ddp_spawn or use_ddp_cpu_spawn:
            ddp_plugin_cls = DDPSpawnPlugin
        elif use_ddp_fully_sharded:
            ddp_plugin_cls = DDPFullyShardedPlugin
        else:
            ddp_plugin_cls = DDPPlugin

        plugin = ddp_plugin_cls(
            parallel_devices=self.parallel_devices,
            cluster_environment=self.cluster_environment,
        )
    elif self.use_dp:
        plugin = DataParallelPlugin(parallel_devices=self.parallel_devices)
    elif self.use_horovod:
        plugin = HorovodPlugin(parallel_devices=self.parallel_devices)
    elif self.use_tpu and isinstance(self.tpu_cores, list):
        plugin = SingleTPUPlugin(self.tpu_id)
    elif self.use_ipu:
        plugin = IPUPlugin(parallel_devices=self.parallel_devices)
    else:
        single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
        plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.use_gpu else "cpu"))
    return plugin
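# The final branch above falls back to a single-device plugin: the root GPU ordinal when
# a GPU is in use, otherwise the CPU. A tiny standalone sketch of that device
# construction; the ordinal value here is made up for the example, and in the real
# selector it comes from device_parser.determine_root_gpu_device:
import torch

single_gpu_ordinal = 0
use_gpu = torch.cuda.is_available()
device = torch.device(f"cuda:{single_gpu_ordinal}" if use_gpu else "cpu")
print(device)  # cuda:0 on a GPU machine, cpu otherwise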