def select_strategy(self) -> Strategy:
    """Select and instantiate the training :class:`Strategy` from the configured flags.

    The branches are checked in priority order: an explicitly passed strategy on the
    accelerator wins, then DDP2, DeepSpeed, the various DDP flavours, DP, Horovod,
    single-TPU, IPU, and finally single-device (GPU or CPU) training.

    Returns:
        The strategy instance matching the distributed mode and environment.
    """
    if isinstance(self.distributed_backend, Accelerator) and self.distributed_backend.strategy is not None:
        # An Accelerator object with an attached strategy was passed in directly: use it as-is.
        plugin = self.distributed_backend.strategy
    elif self.use_ddp2:
        plugin = DDP2Strategy(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment)
    elif self.use_ddp and self.use_deepspeed:
        plugin = DeepSpeedStrategy(
            cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
        )
    elif self.use_ddp:
        # Collect the flags that disambiguate between the many DDP flavours.
        use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks()
        use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.detect()
        use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.detect()
        use_ddp_spawn = self._strategy_type == _StrategyType.DDP_SPAWN
        use_ddp_cpu_spawn = use_ddp_spawn and self.use_cpu
        use_tpu_spawn = self.use_tpu and self._strategy_type == _StrategyType.TPU_SPAWN
        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.detect()
        use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.detect()
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self._is_slurm_managing_tasks()
        use_ddp_sharded = self._strategy_type == _StrategyType.DDP_SHARDED
        use_ddp_sharded_spawn = self._strategy_type == _StrategyType.DDP_SHARDED_SPAWN
        use_ddp_fully_sharded = self._strategy_type == _StrategyType.DDP_FULLY_SHARDED

        # Resolve the concrete strategy class; the order of these checks matters
        # (e.g. TPU spawn and the sharded variants take precedence over plain DDP).
        if use_tpu_spawn:
            ddp_strategy_cls = TPUSpawnStrategy
        elif use_ddp_sharded:
            ddp_strategy_cls = DDPShardedStrategy
        elif use_ddp_sharded_spawn:
            ddp_strategy_cls = DDPSpawnShardedStrategy
        elif (
            use_ddp_cpu_slurm
            or use_slurm_ddp
            or use_ddp_cpu_torch_elastic
            or use_torchelastic_ddp
            or use_kubeflow_ddp
            or use_ddp_cpu_kubeflow
        ):
            # Under a cluster manager (SLURM / TorchElastic / Kubeflow) processes are
            # launched externally, so the non-spawning DDP strategy is used.
            ddp_strategy_cls = DDPStrategy
        elif use_ddp_spawn or use_ddp_cpu_spawn:
            ddp_strategy_cls = DDPSpawnStrategy
        elif use_ddp_fully_sharded:
            ddp_strategy_cls = DDPFullyShardedStrategy
        else:
            ddp_strategy_cls = DDPStrategy

        plugin = ddp_strategy_cls(
            parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment
        )
    elif self.use_dp:
        plugin = DataParallelStrategy(parallel_devices=self.parallel_devices)
    elif self.use_horovod:
        plugin = HorovodStrategy(parallel_devices=self.parallel_devices)
    elif self.use_tpu and isinstance(self.tpu_cores, list):
        plugin = SingleTPUStrategy(self.tpu_id)
    elif self.use_ipu:
        plugin = IPUStrategy(parallel_devices=self.parallel_devices)
    else:
        # Fall back to single-device training on the root GPU, or on CPU when no GPU is used.
        single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
        plugin = SingleDeviceStrategy(device=single_gpu_ordinal if self.use_gpu else "cpu")
    return plugin
def select_cluster_environment(self) -> ClusterEnvironment:
    """Return the cluster environment to use.

    An explicitly provided environment takes precedence; otherwise the known cluster
    managers (SLURM, TorchElastic, Kubeflow) are probed in order, falling back to the
    default ``LightningEnvironment``.
    """
    if self._cluster_environment is not None:
        return self._cluster_environment
    if self.is_slurm_managing_tasks:
        return SLURMEnvironment()
    if TorchElasticEnvironment.is_using_torchelastic():
        return TorchElasticEnvironment()
    if KubeflowEnvironment.is_using_kubeflow():
        return KubeflowEnvironment()
    return LightningEnvironment()
def select_cluster_environment(self) -> ClusterEnvironment:
    """Return the cluster environment to use.

    An explicitly provided environment takes precedence; otherwise the known cluster
    managers (SLURM, TorchElastic, Kubeflow, LSF) are probed in order, falling back to
    the default ``LightningEnvironment``.
    """
    if self._cluster_environment is not None:
        return self._cluster_environment
    if self._is_slurm_managing_tasks():
        slurm_env = SLURMEnvironment()
        rank_zero_info("Multiprocessing is handled by SLURM.")
        return slurm_env
    if TorchElasticEnvironment.is_using_torchelastic():
        return TorchElasticEnvironment()
    if KubeflowEnvironment.is_using_kubeflow():
        return KubeflowEnvironment()
    if LSFEnvironment.is_using_lsf():
        return LSFEnvironment()
    return LightningEnvironment()
def _check_strategy_and_fallback(self) -> None:
    """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
    different choice depending on other parameters or the environment."""
    # Only string selections are inspected below; a Strategy object is left untouched.
    # TODO this logic should apply to both str and object config
    strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

    spawn_aliases = ("ddp_spawn", "ddp_spawn_find_unused_parameters_false")
    if strategy_flag in spawn_aliases and (
        TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
    ):
        # A cluster manager launches the processes, so spawning is not needed.
        strategy_flag = "ddp"

    if strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu":
        rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
        strategy_flag = "ddp"

    wants_fsdp_native = strategy_flag in DDPFullyShardedNativeStrategy.get_registered_strategies() or isinstance(
        self._strategy_flag, DDPFullyShardedNativeStrategy
    )
    if wants_fsdp_native and self._accelerator_flag != "gpu":
        raise MisconfigurationException(
            f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
            "but GPU accelerator is not used."
        )

    if strategy_flag:
        self._strategy_flag = strategy_flag
def _check_strategy_and_fallback(self) -> None:
    """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
    different choice depending on other parameters or the environment.

    Raises:
        MisconfigurationException: if the native fully-sharded strategy was selected without a GPU accelerator.
        ValueError: if a fork-based strategy was selected on a platform without fork support.
    """
    # current fallback and check logic only apply to user pass in str config and object config
    # TODO this logic should apply to both str and object config
    strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

    if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
        TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
    ):
        # A cluster manager launches the processes externally, so spawning is not needed.
        strategy_flag = "ddp"
    if strategy_flag == "dp" and self._accelerator_flag == "cpu":
        rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
        strategy_flag = "ddp"
    if (
        strategy_flag in DDPFullyShardedNativeStrategy.get_registered_strategies()
        or isinstance(self._strategy_flag, DDPFullyShardedNativeStrategy)
    ) and self._accelerator_flag not in ("cuda", "gpu"):
        raise MisconfigurationException(
            f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
            "but GPU accelerator is not used."
        )
    if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods():
        # BUGFIX: corrected typo "recommed" -> "recommend" in the user-facing error message.
        raise ValueError(
            f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this"
            f" platform. We recommend `Trainer(strategy='ddp_spawn')` instead."
        )
    if strategy_flag:
        self._strategy_flag = strategy_flag
def test_default_attributes():
    """Test the default attributes when no environment variables are set."""
    env = KubeflowEnvironment()
    assert env.creates_processes_externally
    # Each accessor requires its corresponding variable (MASTER_ADDR, MASTER_PORT,
    # WORLD_SIZE, RANK) to be present in the environment.
    for accessor in (
        lambda: env.main_address,
        lambda: env.main_port,
        env.world_size,
        env.global_rank,
    ):
        with pytest.raises(KeyError):
            accessor()
    assert env.local_rank() == 0
def _check_strategy_and_fallback(self) -> None:
    """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
    different choice depending on other parameters or the environment."""
    # Only string selections are inspected below; a Strategy object is left untouched.
    # TODO this logic should apply to both str and object config
    strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

    if strategy_flag == "ddp_cpu":
        if _TPU_AVAILABLE:
            raise MisconfigurationException(
                "`accelerator='ddp_cpu'` is not supported on TPU machines. "
                "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810"
            )
        if self._accelerator_flag == "gpu":
            rank_zero_warn(
                "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs."
            )
            self._accelerator_flag = "cpu"
            self.accelerator = CPUAccelerator()
        # One device per node across multiple nodes means external launching; otherwise spawn.
        single_device_multi_node = self._devices_flag == 1 and self._num_nodes_flag > 1
        strategy_flag = DDPStrategy.strategy_name if single_device_multi_node else "ddp_spawn"

    spawn_aliases = ("ddp_spawn", "ddp_spawn_find_unused_parameters_false")
    if strategy_flag in spawn_aliases and (
        TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
    ):
        # A cluster manager launches the processes, so spawning is not needed.
        strategy_flag = "ddp"

    if strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu":
        rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
        strategy_flag = "ddp"

    if strategy_flag:
        self._strategy_flag = strategy_flag
def select_training_type_plugin(self) -> TrainingTypePlugin:
    """Select and instantiate the :class:`TrainingTypePlugin` from the configured flags.

    The branches are checked in priority order: an explicitly passed plugin on the
    accelerator wins, then DDP2, DeepSpeed, the various DDP flavours, DP, Horovod,
    single-TPU, IPU, and finally single-device (GPU or CPU) training.

    Returns:
        The plugin instance matching the distributed mode and environment.
    """
    if isinstance(
        self.distributed_backend, Accelerator
    ) and self.distributed_backend.training_type_plugin is not None:
        # An Accelerator object with an attached plugin was passed in directly: use it as-is.
        plugin = self.distributed_backend.training_type_plugin
    elif self.use_ddp2:
        plugin = DDP2Plugin(
            parallel_devices=self.parallel_devices,
            cluster_environment=self.cluster_environment,
        )
    elif self.use_ddp and self.use_deepspeed:
        plugin = DeepSpeedPlugin(
            cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
        )
    elif self.use_ddp:
        # Collect the flags that disambiguate between the many DDP flavours.
        use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
        use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.is_using_torchelastic()
        use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.is_using_kubeflow()
        use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
        use_ddp_cpu_spawn = self.use_ddp and self.on_cpu
        use_tpu_spawn = self.on_tpu and self._distrib_type == DistributedType.TPU_SPAWN
        use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.is_using_torchelastic()
        use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.is_using_kubeflow()
        use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
        use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED
        use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN
        use_ddp_fully_sharded = self._distrib_type == DistributedType.DDP_FULLY_SHARDED

        # TODO: decouple from TE
        # ddp script mode uses the same flags as TE
        if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
            use_torchelastic_ddp = False

        # Resolve the concrete plugin class; the order of these checks matters
        # (e.g. TPU spawn and the sharded variants take precedence over plain DDP).
        if use_tpu_spawn:
            ddp_plugin_cls = TPUSpawnPlugin
        elif use_ddp_sharded:
            ddp_plugin_cls = DDPShardedPlugin
        elif use_ddp_sharded_spawn:
            ddp_plugin_cls = DDPSpawnShardedPlugin
        elif (
            use_ddp_cpu_slurm
            or use_slurm_ddp
            or use_ddp_cpu_torch_elastic
            or use_torchelastic_ddp
            or use_kubeflow_ddp
            or use_ddp_cpu_kubeflow
        ):
            # Under a cluster manager (SLURM / TorchElastic / Kubeflow) processes are
            # launched externally, so the non-spawning DDP plugin is used.
            ddp_plugin_cls = DDPPlugin
        elif use_ddp_spawn or use_ddp_cpu_spawn:
            ddp_plugin_cls = DDPSpawnPlugin
        elif use_ddp_fully_sharded:
            ddp_plugin_cls = DDPFullyShardedPlugin
        else:
            ddp_plugin_cls = DDPPlugin

        plugin = ddp_plugin_cls(
            parallel_devices=self.parallel_devices,
            cluster_environment=self.cluster_environment,
        )
    elif self.use_dp:
        plugin = DataParallelPlugin(parallel_devices=self.parallel_devices)
    elif self.use_horovod:
        plugin = HorovodPlugin(parallel_devices=self.parallel_devices)
    elif self.on_tpu and isinstance(self.tpu_cores, list):
        plugin = SingleTPUPlugin(self.tpu_id)
    elif self.on_ipu:
        plugin = IPUPlugin(parallel_devices=self.parallel_devices)
    else:
        # Fall back to single-device training on the root GPU, or on CPU when no GPU is used.
        single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
        plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu"))
    return plugin
def test_detect_kubeflow():
    """The Kubeflow environment should be detected in this configuration."""
    detected = KubeflowEnvironment.detect()
    assert detected
def test_attributes_from_environment_variables(caplog):
    """Test that the Kubeflow cluster environment takes the attributes from the environment variables."""
    env = KubeflowEnvironment()
    assert env.main_address == "1.2.3.4"
    assert env.main_port == 500
    assert env.world_size() == 20
    assert env.global_rank() == 1
    assert env.local_rank() == 0
    assert env.node_rank() == 1
    # the setters must be no-ops that only log why the value was ignored
    for setter, getter, expected, message in (
        (env.set_global_rank, env.global_rank, 1, "setting global rank is not allowed"),
        (env.set_world_size, env.world_size, 20, "setting world size is not allowed"),
    ):
        caplog.clear()
        with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"):
            setter(100)
        assert getter() == expected
        assert message in caplog.text
def test_detect_torchelastic_over_kubeflow():
    """Kubeflow detection must not trigger when TorchElastic is managing the job."""
    detected = KubeflowEnvironment.detect()
    assert not detected
def test_is_using_kubeflow():
    """The Kubeflow environment should report itself as active in this configuration."""
    using_kubeflow = KubeflowEnvironment.is_using_kubeflow()
    assert using_kubeflow
def test_is_using_kubeflow_torchelastic():
    """Kubeflow must not report itself as active when TorchElastic is managing the job."""
    using_kubeflow = KubeflowEnvironment.is_using_kubeflow()
    assert not using_kubeflow