Code example #1
    def select_strategy(self) -> Strategy:
        if isinstance(self.distributed_backend, Accelerator) and self.distributed_backend.strategy is not None:
            plugin = self.distributed_backend.strategy
        elif self.use_ddp2:
            plugin = DDP2Strategy(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment)
        elif self.use_ddp and self.use_deepspeed:
            plugin = DeepSpeedStrategy(
                cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
            )
        elif self.use_ddp:
            use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks()
            use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.detect()
            use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.detect()
            use_ddp_spawn = self._strategy_type == _StrategyType.DDP_SPAWN
            use_ddp_cpu_spawn = use_ddp_spawn and self.use_cpu
            use_tpu_spawn = self.use_tpu and self._strategy_type == _StrategyType.TPU_SPAWN
            use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.detect()
            use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.detect()
            use_ddp_cpu_slurm = use_ddp_cpu_spawn and self._is_slurm_managing_tasks()
            use_ddp_sharded = self._strategy_type == _StrategyType.DDP_SHARDED
            use_ddp_sharded_spawn = self._strategy_type == _StrategyType.DDP_SHARDED_SPAWN
            use_ddp_fully_sharded = self._strategy_type == _StrategyType.DDP_FULLY_SHARDED

            if use_tpu_spawn:
                ddp_strategy_cls = TPUSpawnStrategy
            elif use_ddp_sharded:
                ddp_strategy_cls = DDPShardedStrategy
            elif use_ddp_sharded_spawn:
                ddp_strategy_cls = DDPSpawnShardedStrategy
            elif (
                use_ddp_cpu_slurm
                or use_slurm_ddp
                or use_ddp_cpu_torch_elastic
                or use_torchelastic_ddp
                or use_kubeflow_ddp
                or use_ddp_cpu_kubeflow
            ):
                ddp_strategy_cls = DDPStrategy
            elif use_ddp_spawn or use_ddp_cpu_spawn:
                ddp_strategy_cls = DDPSpawnStrategy
            elif use_ddp_fully_sharded:
                ddp_strategy_cls = DDPFullyShardedStrategy
            else:
                ddp_strategy_cls = DDPStrategy

            plugin = ddp_strategy_cls(
                parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment
            )
        elif self.use_dp:
            plugin = DataParallelStrategy(parallel_devices=self.parallel_devices)
        elif self.use_horovod:
            plugin = HorovodStrategy(parallel_devices=self.parallel_devices)
        elif self.use_tpu and isinstance(self.tpu_cores, list):
            plugin = SingleTPUStrategy(self.tpu_id)
        elif self.use_ipu:
            plugin = IPUStrategy(parallel_devices=self.parallel_devices)
        else:
            single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids)
            plugin = SingleDeviceStrategy(device=single_gpu_ordinal if self.use_gpu else "cpu")
        return plugin
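
The method above is a priority-ordered dispatch over the connector's `use_*` flags. The sketch below is a simplified, self-contained illustration of that ordering; the function name, the flag dictionary, and the returned labels are hypothetical and not part of the Lightning API.

# Hypothetical illustration of the priority order encoded by the if/elif chain above.
def pick_strategy_name(flags: dict) -> str:
    order = [
        ("explicit", "strategy attached to a user-provided Accelerator"),
        ("ddp2", "DDP2Strategy"),
        ("deepspeed", "DeepSpeedStrategy"),
        ("ddp", "a DDP* strategy, refined further by spawn/sharded/cluster-environment flags"),
        ("dp", "DataParallelStrategy"),
        ("horovod", "HorovodStrategy"),
        ("single_tpu", "SingleTPUStrategy"),
        ("ipu", "IPUStrategy"),
    ]
    for flag, strategy in order:
        if flags.get(flag):
            return strategy
    return "SingleDeviceStrategy (root GPU or CPU)"

assert pick_strategy_name({"deepspeed": True, "ddp": True}) == "DeepSpeedStrategy"
assert pick_strategy_name({}) == "SingleDeviceStrategy (root GPU or CPU)"
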
Code example #2
def test_detect_after_1_9_1():
    """Test the detection of a torchelastic environment configuration after 1.9.1."""
    with mock.patch.dict(os.environ, {}):
        assert not TorchElasticEnvironment.detect()

    with mock.patch.dict(
        os.environ,
        {
            "TORCHELASTIC_RUN_ID": "",
        },
    ):
        assert TorchElasticEnvironment.detect()
Code example #3
def test_detect_before_1_9_1():
    """Test the detection of a torchelastic environment configuration before 1.9.1."""
    with mock.patch.dict(os.environ, {}):
        assert not TorchElasticEnvironment.detect()

    with mock.patch.dict(
        os.environ,
        {
            "RANK": "",
            "GROUP_RANK": "",
            "LOCAL_RANK": "",
            "LOCAL_WORLD_SIZE": "",
        },
    ):
        assert TorchElasticEnvironment.detect()
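
Code examples #2 and #3 only pin down the observable behaviour: detection succeeds once the relevant TorchElastic environment variables are present. A detector consistent with those two tests could look like the minimal sketch below; the function name and the exact variable set are inferred from the tests, not taken from the library source.

import os

def detect_torchelastic() -> bool:
    # Launchers after 1.9.1 set TORCHELASTIC_RUN_ID; older launchers set the per-worker rank/size variables.
    legacy_vars = ("RANK", "GROUP_RANK", "LOCAL_RANK", "LOCAL_WORLD_SIZE")
    return "TORCHELASTIC_RUN_ID" in os.environ or all(v in os.environ for v in legacy_vars)
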
Code example #4
    def _check_strategy_and_fallback(self) -> None:
        """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
        different choice depending on other parameters or the environment."""
        # the current fallback and check logic only applies to user-passed str config and object config
        # TODO this logic should apply to both str and object config
        strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

        if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
            TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()
        ):
            strategy_flag = "ddp"
        if strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu":
            rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.")
            strategy_flag = "ddp"
        if (
            strategy_flag in DDPFullyShardedNativeStrategy.get_registered_strategies()
            or isinstance(self._strategy_flag, DDPFullyShardedNativeStrategy)
        ) and self._accelerator_flag != "gpu":
            raise MisconfigurationException(
                f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
                "but GPU accelerator is not used."
            )

        if strategy_flag:
            self._strategy_flag = strategy_flag
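
The two string-level fallbacks in code example #4 can be condensed into the standalone sketch below; `resolve_strategy` and its arguments are hypothetical stand-ins for the connector's internal state, not Lightning API.

def resolve_strategy(strategy: str, accelerator: str, cluster_launched: bool) -> str:
    # Hypothetical condensation of the rules in _check_strategy_and_fallback above.
    if strategy in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and cluster_launched:
        return "ddp"  # an external launcher (torchelastic, Kubeflow, SLURM) already created the processes
    if strategy in ("dp", "ddp2") and accelerator == "cpu":
        return "ddp"  # dp/ddp2 require GPUs, so fall back to ddp on CPU (the real code also warns)
    return strategy

assert resolve_strategy("ddp_spawn", "gpu", cluster_launched=True) == "ddp"
assert resolve_strategy("dp", "cpu", cluster_launched=False) == "ddp"
assert resolve_strategy("deepspeed", "gpu", cluster_launched=False) == "deepspeed"
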
Code example #5
    def _check_strategy_and_fallback(self) -> None:
        """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
        different choice depending on other parameters or the environment."""
        # the current fallback and check logic only applies to user-passed str config and object config
        # TODO this logic should apply to both str and object config
        strategy_flag = "" if isinstance(self._strategy_flag,
                                         Strategy) else self._strategy_flag

        if strategy_flag in ("ddp_spawn",
                             "ddp_spawn_find_unused_parameters_false") and (
                                 TorchElasticEnvironment.detect()
                                 or KubeflowEnvironment.detect()
                                 or self._is_slurm_managing_tasks()):
            strategy_flag = "ddp"
        if strategy_flag == "dp" and self._accelerator_flag == "cpu":
            rank_zero_warn(
                f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`."
            )
            strategy_flag = "ddp"
        if (strategy_flag
                in DDPFullyShardedNativeStrategy.get_registered_strategies() or
                isinstance(self._strategy_flag, DDPFullyShardedNativeStrategy)
            ) and self._accelerator_flag not in ("cuda", "gpu"):
            raise MisconfigurationException(
                f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
                "but GPU accelerator is not used.")
        if (
            strategy_flag in _DDP_FORK_ALIASES
            and "fork" not in torch.multiprocessing.get_all_start_methods()
        ):
            raise ValueError(
                f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this"
                f" platform. We recommed `Trainer(strategy='ddp_spawn')` instead."
            )
        if strategy_flag:
            self._strategy_flag = strategy_flag
Code example #6
def parse_gpu_ids(
        gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[int]]:
    """
    Parses the GPU ids given in the format accepted by the
    :class:`~pytorch_lightning.trainer.Trainer`.

    Args:
        gpus: An int -1 or the string '-1' indicates that all available GPUs should be used.
            A list of unique ints, or a string containing a list of comma-separated unique integers,
            indicates specific GPUs to use.
            An int 0 means that no GPUs should be used.
            Any int N > 0 indicates that GPUs [0..N) should be used.

    Returns:
        A list of GPU ids to be used, or ``None`` if no GPUs were requested.

    If no GPUs are available but the value of the ``gpus`` variable indicates a request for GPUs,
    then a ``MisconfigurationException`` is raised.
    """
    # Check that gpus param is None, Int, String or List
    _check_data_type(gpus)

    # Handle the case when no gpus are requested
    if gpus is None or (isinstance(gpus, int)
                        and gpus == 0) or str(gpus).strip() in ("0", "[]"):
        return None

    # We know the user requested GPUs, so if some of the
    # requested GPUs are not available an exception is thrown.
    gpus = _normalize_parse_gpu_string_input(gpus)
    gpus = _normalize_parse_gpu_input_to_list(gpus)
    if not gpus:
        raise MisconfigurationException(
            "GPUs requested but none are available.")
    if TorchElasticEnvironment.detect() and len(gpus) != 1 and len(
            _get_all_available_gpus()) == 1:
        # omit the sanity check on torchelastic, since by default it shows one visible GPU per process
        return gpus

    # Check that gpus are unique. Duplicate gpus are not supported by the backend.
    _check_unique(gpus)

    return _sanitize_gpu_ids(gpus)
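
A few illustrative calls implied by the docstring of `parse_gpu_ids`, assuming a machine with four visible GPUs; these are hypothetical usage examples, not tests shipped with the library.

# Illustrative behaviour only; assumes 4 visible GPUs and that the module's helpers are importable.
assert parse_gpu_ids(None) is None          # no GPUs requested
assert parse_gpu_ids(0) is None             # explicitly zero GPUs
assert parse_gpu_ids(-1) == [0, 1, 2, 3]    # -1 (or "-1") selects all available GPUs
assert parse_gpu_ids(3) == [0, 1, 2]        # an int N > 0 selects GPUs [0..N)
assert parse_gpu_ids("1,3") == [1, 3]       # a comma-separated string selects specific GPU ids
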
Code example #7
    def _check_strategy_and_fallback(self) -> None:
        """Checks edge cases when the strategy selection was a string input, and we need to fall back to a
        different choice depending on other parameters or the environment."""
        # the current fallback and check logic only applies to user-passed str config and object config
        # TODO this logic should apply to both str and object config
        strategy_flag = "" if isinstance(self._strategy_flag,
                                         Strategy) else self._strategy_flag

        if strategy_flag == "ddp_cpu":
            if _TPU_AVAILABLE:
                raise MisconfigurationException(
                    "`accelerator='ddp_cpu'` is not supported on TPU machines. "
                    "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810"
                )
            if self._devices_flag == 1 and self._num_nodes_flag > 1:
                strategy_flag = DDPStrategy.strategy_name
            else:
                strategy_flag = "ddp_spawn"
            if self._accelerator_flag == "gpu":
                rank_zero_warn(
                    "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs."
                )
                self._accelerator_flag = "cpu"
                self.accelerator = CPUAccelerator()
        if strategy_flag in ("ddp_spawn",
                             "ddp_spawn_find_unused_parameters_false") and (
                                 TorchElasticEnvironment.detect()
                                 or KubeflowEnvironment.detect()
                                 or self._is_slurm_managing_tasks()):
            strategy_flag = "ddp"
        if strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu":
            rank_zero_warn(
                f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`."
            )
            strategy_flag = "ddp"

        if strategy_flag:
            self._strategy_flag = strategy_flag
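
Code example #7 additionally resolves the legacy "ddp_cpu" string. That branch can be summarised by the small sketch below, where the helper name and its arguments are hypothetical.

def resolve_ddp_cpu(devices: int, num_nodes: int) -> str:
    # Hypothetical condensation of the "ddp_cpu" handling above: a single device per node across
    # multiple nodes keeps plain DDP; otherwise fall back to spawning processes locally.
    return "ddp" if devices == 1 and num_nodes > 1 else "ddp_spawn"

assert resolve_ddp_cpu(devices=1, num_nodes=2) == "ddp"
assert resolve_ddp_cpu(devices=4, num_nodes=1) == "ddp_spawn"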