Example #1
def test_pick_multiple_gpus(nb, expected_gpu_idxs, expected_error):
    if expected_error:
        with pytest.raises(
            expected_error,
            match=re.escape(
                "auto_select_gpus=True, gpus=0 is not a valid configuration."
                " Please select a valid number of GPU resources when using auto_select_gpus."
            ),
        ):
            pick_multiple_gpus(nb)
    else:
        assert expected_gpu_idxs == pick_multiple_gpus(nb)
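As written, the test expects nb, expected_gpu_idxs, and expected_error to be supplied by a pytest.mark.parametrize decorator that is not shown above. A minimal sketch of such a decorator, with illustrative values only (the gpus=0 case lines up with the error message the test asserts):

import pytest

from pytorch_lightning.utilities.exceptions import MisconfigurationException

# Illustrative parametrization; the original decorator is not part of the example.
@pytest.mark.parametrize(
    ["nb", "expected_gpu_idxs", "expected_error"],
    [
        (0, [], MisconfigurationException),  # requesting 0 GPUs is a misconfiguration
        (1, [0], None),                      # assumes a machine (or mock) exposing one GPU
    ],
)
def test_pick_multiple_gpus(nb, expected_gpu_idxs, expected_error):
    ...  # body as in Example #1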
Example #2
from typing import List, Optional, Tuple, Union

# Assumed import paths (PyTorch Lightning 1.x layout):
from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
from pytorch_lightning.utilities.device_parser import parse_gpu_ids, parse_tpu_cores


def _parse_devices(
    gpus: Optional[Union[List[int], str, int]],
    auto_select_gpus: bool,
    tpu_cores: Optional[Union[List[int], str, int]],
) -> Tuple[Optional[List[int]], Optional[Union[List[int], int]]]:
    if auto_select_gpus and isinstance(gpus, int):
        gpus = pick_multiple_gpus(gpus)

    # TODO (@seannaren, @kaushikb11): Include IPU parsing logic here
    gpu_ids = parse_gpu_ids(gpus)
    tpu_cores = parse_tpu_cores(tpu_cores)
    return gpu_ids, tpu_cores
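A quick sketch of the two branches (assuming the imports above and a machine with at least two visible GPUs):

# auto_select_gpus=False: the int is passed straight through to parse_gpu_ids
gpu_ids, tpu_cores = _parse_devices(gpus=2, auto_select_gpus=False, tpu_cores=None)
# -> gpu_ids == [0, 1], tpu_cores is None

# auto_select_gpus=True: the int is first expanded to concrete, preferably
# unoccupied, device indices by pick_multiple_gpus
gpu_ids, _ = _parse_devices(gpus=2, auto_select_gpus=True, tpu_cores=None)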
Example #3
    def __init__(
        self,
        num_processes,
        tpu_cores,
        ipus,
        distributed_backend,
        auto_select_gpus,
        gpus,
        num_nodes,
        sync_batchnorm,
        benchmark,
        replace_sampler_ddp,
        deterministic,
        precision,
        amp_type,
        amp_level,
        plugins,
    ):
        # initialization
        self._device_type = DeviceType.CPU
        self._distrib_type = None

        self.num_processes = num_processes
        self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores)
        self.ipus = ipus
        self.distributed_backend = distributed_backend
        self.auto_select_gpus = auto_select_gpus
        self.gpus = gpus
        self.num_nodes = num_nodes
        self.sync_batchnorm = sync_batchnorm
        self.benchmark = benchmark
        self.replace_sampler_ddp = replace_sampler_ddp
        self.deterministic = deterministic
        self.precision = precision
        self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None
        self.amp_level = amp_level
        self.is_slurm_managing_tasks = False

        self._precision_plugin: Optional[PrecisionPlugin] = None
        self._training_type_plugin: Optional[TrainingTypePlugin] = None
        self._cluster_environment: Optional[ClusterEnvironment] = None

        plugins = plugins if plugins is not None else []

        if isinstance(plugins, str):
            plugins = [plugins]

        if not isinstance(plugins, Sequence):
            plugins = [plugins]

        self.plugins = plugins

        # for gpus allow int, string and gpu list
        if auto_select_gpus and isinstance(gpus, int):
            self.gpus = pick_multiple_gpus(gpus)

        self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus)

        self.set_distributed_mode()
        self.configure_slurm_ddp()

        self.handle_given_plugins()

        self._training_type_plugin_resolved = False
        self.accelerator = self.select_accelerator()

        # override dist backend when using tpus
        if self.on_tpu:
            self.distributed_backend = "tpu"

        # init flags for SLURM+DDP to work
        self.world_size = 1
        self.interactive_ddp_procs = []
        self.global_rank = 0

        # benchmarking
        # TODO: should this be moved to GPU accelerator?
        torch.backends.cudnn.benchmark = self.benchmark

        # determinism for cudnn
        # TODO: should this be moved to GPU accelerator?
        torch.backends.cudnn.deterministic = deterministic
        if deterministic:
            # fixing non-deterministic part of horovod
            # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
            os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)

        self.replace_sampler_ddp = replace_sampler_ddp  # redundant: already assigned above
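This constructor is not user-facing; the Trainer builds the connector internally from its own arguments. A hypothetical instantiation, purely to show the argument mapping (the class name AcceleratorConnector and all values here are illustrative assumptions):

# hypothetical call; in practice the Trainer constructs this internally
connector = AcceleratorConnector(
    num_processes=1, tpu_cores=None, ipus=None, distributed_backend=None,
    auto_select_gpus=True, gpus=2, num_nodes=1, sync_batchnorm=False,
    benchmark=False, replace_sampler_ddp=True, deterministic=False,
    precision=32, amp_type="native", amp_level=None, plugins=None,
)
# with auto_select_gpus=True and gpus=2, __init__ replaces self.gpus with two
# concrete device indices via pick_multiple_gpus before parse_gpu_ids runs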
Example #4
    def __init__(
        self,
        num_processes,
        tpu_cores,
        distributed_backend,
        auto_select_gpus,
        gpus,
        num_nodes,
        sync_batchnorm,
        benchmark,
        replace_sampler_ddp,
        deterministic,
        precision,
        amp_type,
        amp_level,
        cluster_environment,
    ):
        # initialization
        self._device_type = DeviceType.CPU
        self._distrib_type = None

        self.num_processes = num_processes
        self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores)
        self.distributed_backend = distributed_backend
        self.auto_select_gpus = auto_select_gpus
        self.gpus = gpus
        self.num_nodes = num_nodes
        self.sync_batchnorm = sync_batchnorm
        self.benchmark = benchmark
        self.replace_sampler_ddp = replace_sampler_ddp
        self.deterministic = deterministic
        self.precision = precision
        self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None
        self.amp_level = amp_level
        self.cluster_environment = cluster_environment
        self.is_slurm_managing_tasks = False

        # init the default rank if exists
        # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks
        # this way we only show it on rank 0
        if "LOCAL_RANK" in os.environ:
            rank_zero_only.rank = int(os.environ["LOCAL_RANK"])

        # for gpus allow int, string and gpu list
        if auto_select_gpus and isinstance(gpus, int):
            self.gpus = pick_multiple_gpus(gpus)

        self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus)
        self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_device_ids)

        self.set_distributed_mode()
        self.configure_slurm_ddp()

        self.accelerator = self.select_accelerator()

        # override dist backend when using tpus
        if self.on_tpu:
            self.distributed_backend = "tpu"
            self.use_tpu = True

        # init flags for SLURM+DDP to work
        self.world_size = 1
        self.interactive_ddp_procs = []
        self.global_rank = 0

        # NVIDIA setup
        # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids)

        # benchmarking
        # TODO: should this be moved to GPU accelerator?
        torch.backends.cudnn.benchmark = self.benchmark

        # determinism for cudnn
        # TODO: should this be moved to GPU accelerator?
        torch.backends.cudnn.deterministic = deterministic
        if deterministic:
            # fixing non-deterministic part of horovod
            # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
            os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)

        # TODO: move this to TPU accelerator/plugin
        self.on_colab_kaggle = os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE")

        self.replace_sampler_ddp = replace_sampler_ddp  # redundant: already assigned above
Example #5
    def pick_multiple_gpus(self, num_gpus: int):
        # thin wrapper: delegate to the module-level pick_multiple_gpus helper
        return pick_multiple_gpus(num_gpus)
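The wrapper suggests a tuner-style facade class that exposes GPU auto-selection without making callers import the utility directly. A hypothetical call site (the tuner name is an assumption):

# hypothetical: `tuner` is whatever object carries this wrapper method
gpu_idxs = tuner.pick_multiple_gpus(2)  # e.g. [0, 1] when two GPUs are free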
Example #6
def test_pick_multiple_gpus_more_than_available(*_):
    # `*_` absorbs arguments injected by mock.patch decorators (not shown here)
    with pytest.raises(
            MisconfigurationException,
            match="You requested 3 GPUs but your machine only has 1 GPUs"):
        pick_multiple_gpus(3)
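A sketch of the decorators that plausibly accompany this test (an assumption, not reproduced from the source): they force CUDA to report a single GPU so that requesting three raises the exception asserted above.

from unittest import mock

from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus  # assumed path

# Assumed mocks: make torch report exactly one available GPU.
@mock.patch("torch.cuda.is_available", return_value=True)
@mock.patch("torch.cuda.device_count", return_value=1)
def test_pick_multiple_gpus_more_than_available(*_):
    ...  # body as in Example #6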
Example #7
    def _set_devices_flag_if_auto_select_gpus_passed(self) -> None:
        if self._auto_select_gpus and isinstance(self._gpus, int) and isinstance(self.accelerator, GPUAccelerator):
            self._devices_flag = pick_multiple_gpus(self._gpus)
            log.info(f"Auto select gpus: {self._devices_flag}")
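For context, the flag consumed here comes from the Trainer constructor. A minimal end-to-end sketch, assuming the PyTorch Lightning 1.x-era API in which auto_select_gpus was still a Trainer argument:

from pytorch_lightning import Trainer

# Ask Lightning to pick 2 concrete GPU indices (preferring unoccupied devices)
# instead of hard-coding gpus=[0, 1].
trainer = Trainer(auto_select_gpus=True, gpus=2)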