コード例 #1
0
def _parse_and_validate_remote_device(pg, remote_device):

    worker_name = remote_device.worker_name()
    rank = remote_device.rank()
    device = remote_device.device()

    # Validate rank, skip validation if rank is not part of process group.
    if not distributed_c10d._rank_not_in_group(pg):
        if rank is not None and (rank < 0 or
                                 rank >= distributed_c10d.get_world_size(pg)):
            raise ValueError(f'Invalid rank: {rank}')

    if worker_name is not None:
        if not rpc._is_current_rpc_agent_set():
            raise RuntimeError(
                f'RPC framework needs to be initialized for using worker names: {worker_name}'
            )

        workers = rpc._get_current_rpc_agent().get_worker_infos()
        for worker in workers:
            if worker.name == worker_name:
                return worker.id, device

        raise ValueError(f'Invalid worker name: {worker_name}')

    return rank, device
コード例 #2
0
ファイル: remote_module.py プロジェクト: xkszltl/pytorch
    def _prepare_init(self, remote_device_str: str) -> bool:
        """
        Prepares the initializaiton and returns whether to enable automatically moving CPU tensors to CUDA devices.
        """
        # Sanity check.
        assert rpc._is_current_rpc_agent_set(
        ), "RemoteModule only works in RPC."

        remote_device = _remote_device(remote_device_str)
        self.on = remote_device.worker_name() if remote_device.worker_name(
        ) is not None else remote_device.rank()
        self.device = str(remote_device.device())
        agent = rpc._get_current_rpc_agent()
        # If the device map of the remote worker is set,
        # then enable moving any input CPU tensors to the same cuda device.
        self.is_device_map_set = bool(
            agent._get_device_map(agent.get_worker_info(
                self.on))  # type: ignore[arg-type]
        )
        # ``enable_moving_cpu_tensors_to_cuda`` is less strict than ``is_device_map_set``:
        # If ``enable_moving_cpu_tensors_to_cuda`` is true, but the device map is not set,
        # then any CPU tensors can still be moved to a cuda device to run forward,
        # but the output must be moved back to CPU before being sent over the wire.
        enable_moving_cpu_tensors_to_cuda = torch.device(
            self.device).type == "cuda"
        return enable_moving_cpu_tensors_to_cuda
コード例 #3
0
ファイル: api.py プロジェクト: ydcjeff/pytorch
    def _init_rpc(self):
        self._rpc_initialized = True
        self._remote_shards = {}

        # Gather all the sharded tensor ids.
        world_size = dist.get_world_size(self._process_group)
        worker_infos = rpc._get_current_rpc_agent().get_worker_infos()
        rank_to_name = {}
        name_to_rank = {}

        for worker_info in worker_infos:
            rank_to_name[worker_info.id] = worker_info.name
            name_to_rank[worker_info.name] = worker_info.id

        rpc_workers = set()
        for rank in range(world_size):
            if self._process_group == distributed_c10d._get_default_group():
                global_rank = rank
            else:
                global_rank = distributed_c10d._get_global_rank(
                    self._process_group, rank)
            rpc_workers.add(rank_to_name[global_rank])

        all_tensor_ids = rpc.api._all_gather(self._sharded_tensor_id,
                                             rpc_workers)

        # Share the local shards to the entire world.
        futs = []
        rpc_rank = rpc.get_worker_info().id
        for rank in range(world_size):
            # Skip self.
            if rank == dist.get_rank(self._process_group):
                continue

            if self._process_group == distributed_c10d._get_default_group():
                global_rank = rank
            else:
                global_rank = distributed_c10d._get_global_rank(
                    self._process_group, rank)

            if len(self.local_shards()) != 0:
                rrefs: List[rpc.RRef[Shard]] = [
                    rpc.RRef(shard) for shard in self.local_shards()
                ]
                fut = rpc.rpc_async(
                    global_rank,
                    _register_remote_shards,
                    args=(all_tensor_ids[rank_to_name[global_rank]], rrefs,
                          rpc_rank))
                futs.append(fut)

        torch.futures.wait_all(futs)

        # Barrier for all RPCs to finish on all ranks.
        rpc.api._barrier(rpc_workers)
コード例 #4
0
    def __init__(self,
                 name: str,
                 rank: int = -1,
                 world_size: int = None,
                 init_method: str = "tcp://localhost:9100",
                 rpc_timeout: float = 60,
                 rpc_threads: int = 8):
        """
        Args:
            name: A unique name to identify current process.
            rank: A unique rank of the current process. You do not need to specify
                it if you are using `torch.distributed.launch` or `torchelastic`
            world_size:   Size of the distributed world. You do not need to specify
                it if you are using `torch.distributed.launch` or `torchelastic`
            init_method:  Backend initialization method.
            rpc_timeout:  Global rpc call timeout in seconds.
            rpc_threads:  Rpc recv/send thread num.
        """
        self.world_size = world_size
        self.rank = rank
        self.name = name
        self.groups = {}
        self.group_create_signals = {}

        # "<rank-number>" is used as the unique name.
        rpc.init_rpc(self.name,
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                         init_method=init_method,
                         num_send_recv_threads=rpc_threads,
                         rpc_timeout=timedelta(seconds=rpc_timeout)
                     ))

        # get rank-name mapping
        self.rank_name_map = {}
        for wi in rpc._get_current_rpc_agent().get_worker_infos():
            self.rank_name_map[wi.id] = wi.name

        # Start role dispatching.
        self.started = True
        self.rpc_timeout = rpc_timeout

        # map for paired values and registered services
        self.value_lut = {}
        self.service_lut = {}
        self.lut_lock = Lock()
        self.lut_manager = self.rank_name_map[0]
コード例 #5
0
    def _init_rpc(self):
        # Validate PG and RPC ranks match.
        pg_rank = dist.get_rank()
        rpc_rank = rpc.get_worker_info().id
        if pg_rank != rpc_rank:
            raise ValueError(
                f'Default ProcessGroup and RPC ranks must be '
                f'the same for ShardedTensor, found process group rank: '
                f'{pg_rank} and RPC rank: {rpc_rank}')

        self._remote_shards = {}

        # Gather all the sharded tensor ids.
        world_size = dist.get_world_size(self._process_group)
        worker_infos = rpc._get_current_rpc_agent().get_worker_infos()
        rank_to_name = {}
        name_to_rank = {}

        for worker_info in worker_infos:
            rank_to_name[worker_info.id] = worker_info.name
            name_to_rank[worker_info.name] = worker_info.id

        all_tensor_ids = rpc.api._all_gather(self._sharded_tensor_id)

        # Share the local shards to the entire world.
        futs = []
        rpc_rank = rpc.get_worker_info().id
        for rank in range(dist.get_world_size()):
            # Skip self.
            if rank == dist.get_rank():
                continue

            if len(self.local_shards()) != 0:
                rrefs: List[rpc.RRef[Shard]] = [
                    rpc.RRef(shard) for shard in self.local_shards()
                ]
                fut = rpc.rpc_async(rank,
                                    _register_remote_shards,
                                    args=(all_tensor_ids[rank_to_name[rank]],
                                          rrefs, rpc_rank))
                futs.append(fut)

        torch.futures.wait_all(futs)

        # Barrier for all RPCs to finish on all ranks.
        rpc.api._all_gather(None)
コード例 #6
0
    def _parse_and_validate_remote_device(self, device):

        on, local_device = _parse_remote_device(device)  # type: ignore[arg-type]

        # Validate rank.
        if isinstance(on, int) and (on < 0 or on >= dist.get_world_size(self._process_group)):
            raise ValueError(f'Invalid rank: {on}')

        if isinstance(on, str):
            if not rpc._is_current_rpc_agent_set():
                raise RuntimeError(f'RPC framework needs to be initialized for using worker names: {on}')

            workers = rpc._get_current_rpc_agent().get_worker_infos()
            for worker in workers:
                if worker.name == on:
                    return worker.id, local_device

            raise ValueError(f'Invalid worker name: {on}')

        return on, local_device
コード例 #7
0
    def __init__(
        self,
        remote_device: str,
        module_cls: nn.Module,
        args: Tuple = None,
        kwargs: Dict[str, Any] = None,
        _module_interface_cls: Any = None,
    ):
        """
        A RemoteModule instance can only be created after RPC initialization.
        It creates a user-specified module on a specified remote node.
        It behaves like a regular ``nn.Module`` except that the ``forward`` method is
        executed on the remote node.
        It takes care of autograd recording to ensure the backward pass propogates
        gradients back to the corresponding remote module.
        It can be shared across processors using `RPC framework <https://pytorch.org/docs/stable/rpc.html>`__,
        without incurring any overheads of copying the actual module,
        which is equivalent to an :class:`~torch.distributed.rpc.RRef`
        pointing to the remote module.

        The arguments of ``forward_async`` and ``forward`` are the same as
        the ``forward`` method of the module returned by the ``module_cls``.

        Apart from ``forward_async`` and ``forward``, no other methods are supported from nn.Module for now.

        Particularly, to create a hybrid model, typically the local modules should be
        created outside of remote modules, rather than as submodules of any remote module (by calling ``add_module``).
        Hybrid Example:
                >>> class HybridModel(nn.Module):
                >>>     def __init__(self):
                >>>         nn.Module.__init__(self)
                >>>         self.remote_embedding = RemoteModule(...)
                >>>         self.local_linear = nn.Linear(...)

        For example, if ``module_cls`` returns an instance of ``nn.Linear``,
        that has ``forward`` method signature, ``def forward(input: Tensor) -> Tensor:``,
        the generated ``RemoteModule`` will have 2 methods in signature of
        ``def forward(input: Tensor) -> Tensor:`` and
        ``def forward_async(input: Tensor) -> Future[Tensor]:``.

        .. note::
            If the remote module is placed on a cuda device,
            any input CPU tensors will be automatically moved to the same cuda device,
            and GPU tensors are returned over the wire according to the device map of the remote worker on TensorPipe RPC backend.

        Args:
            remote_device (str): Device on the destination worker where we'd like to place this module.
                The format should be "<workername>/<device>", where the device field can be parsed as torch.device type.
                E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0".
                In addition, the device field can be optional and the default value is "cpu".
            module_cls (nn.Module): For example,
                >>> class MyModule(nn.Module):
                >>>     def forward(input):
                >>>         return input + 1
                >>>
                >>> module_cls = MyModule
            args (Sequence, optional): args to be passed to ``module_cls``.
            kwargs (Dict, optional): kwargs to be passed to ``module_cls``.
            _module_interface_cls (type, optional): The TorchScript interface type for the module
                to be created. The type object should be decorated by @torch.jit.interface.
                If not provided, the generated RemoteModule is not torchscript-able.
                Warning, this is an experimental API and susceptible to frequent changes.

        Returns:
            A remote module instance which wraps the :class:`~nn.Module` created by the
            user-provided ``module_cls``, it has a blocking ``forward`` method and an
            asynchronous ``forward_async`` method that returns a future of the ``forward`` call
            on the user-provided module on the remote side.

        Example::
            Run the following code in two different processes:

            >>> # On worker 0:
            >>> import torch
            >>> import torch.distributed.rpc as rpc
            >>> from torch import nn, Tensor
            >>> from torch.distributed.nn.api.remote_module import RemoteModule
            >>>
            >>> rpc.init_rpc("worker0", rank=0, world_size=2)
            >>> remote_linear_module = RemoteModule(
            >>>     "worker1/cpu", nn.Linear, args=(20, 30),
            >>> )
            >>> input = torch.randn(128, 20)
            >>> ret_fut = remote_linear_module.forward_async(input)
            >>> ret = ret_fut.wait()
            >>> rpc.shutdown()

            >>> # On worker 1:
            >>> import torch
            >>> import torch.distributed.rpc as rpc
            >>>
            >>> rpc.init_rpc("worker1", rank=1, world_size=2)
            >>> rpc.shutdown()
        """
        super().__init__()

        # NOTE: if a new attribute is added to this class, also need to add it
        # to ``_REMOTE_MODULE_PICKLED_ATTRIBUTES`` for pickling/unpickling.

        # Sanity checks.
        assert rpc._is_current_rpc_agent_set(
        ), "RemoteModule only works in RPC."

        # Default arguments preperation.
        args = args if args is not None else ()
        kwargs = kwargs if kwargs is not None else {}

        self.on, self.device = _parse_remote_device(remote_device)
        agent = rpc._get_current_rpc_agent()
        # If the device map of the remote worker is set,
        # then enable moving any input CPU tensors to the same cuda device.
        self.is_device_map_set = bool(
            agent._get_device_map(agent.get_worker_info(self.on)))
        # ``enable_moving_cpu_tensors_to_cuda`` is less strict than ``is_device_map_set``:
        # If ``enable_moving_cpu_tensors_to_cuda`` is true, but the device map is not set,
        # then any CPU tensors can still be moved to a cuda device to run forward,
        # but the output must be moved back to CPU before being sent over the wire.
        enable_moving_cpu_tensors_to_cuda = torch.device(
            self.device).type == "cuda"

        if _module_interface_cls is not None:
            # Users reply on this field to know if this generated RemoteModule is TorchScript-able.
            self.is_scriptable = True

            # Instantiate template on remote side.
            fut = rpc.rpc_async(
                self.on,
                _instantiate_template,
                (_module_interface_cls, enable_moving_cpu_tensors_to_cuda),
            )

            # Instantiate template on local side.
            generated_module = (
                instantiator.instantiate_scriptable_remote_module_template(
                    _module_interface_cls, enable_moving_cpu_tensors_to_cuda))
            self.generated_methods = generated_module._generated_methods

            # Create the module on the remote side.
            fut.wait()  # Ensure remote_module_cls is available on remote side.

            # TODO: We need to change this to rpc.remote, and make it async (see the else branch below).
            # For that we need to be able to apply _module_interface_cls to the RRef returned by rpc.remote
            # See https://github.com/pytorch/pytorch/issues/58098 for more context.
            self.module_rref = rpc.rpc_sync(
                self.on,
                _create_module_with_interface,
                (module_cls, args, kwargs, self.device, _module_interface_cls),
            )
        else:
            self.is_scriptable = False
            self.generated_methods = (
                _NON_SCRIPTABLE_REMOTE_MODULE_MODULE._generated_methods)
            # Create the module on the remote side.
            self.module_rref = rpc.remote(
                self.on,
                _create_module,
                (module_cls, args, kwargs, self.device),
            )

        # Install generated methods.
        for method in self.generated_methods:
            method_name = method.__name__
            method = torch.jit.export(method)
            setattr(self, method_name, types.MethodType(method, self))

        # Sanity check: whether to be pickled must be explicitly defined for every attribute.
        for k in self.__dict__.keys():
            if (k not in _REMOTE_MODULE_PICKLED_ATTRIBUTES and k
                    not in _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING):
                raise AttributeError(
                    "Attribute {} must be either in ``_REMOTE_MODULE_PICKLED_ATTRIBUTES`` or "
                    "``_REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING``.".
                    format(k))
コード例 #8
0
    def __init__(
        self,
        name: str,
        rank: int = -1,
        world_size: int = -1,
        init_dist: bool = True,
        init_rpc: bool = True,
        dist_backend: str = "gloo",
        dist_init_method: str = "tcp://localhost:9100",
        rpc_init_method: str = "tcp://localhost:9101",
        dist_timeout: float = 60,
        rpc_timeout: float = 60,
    ):
        """
        Args:
            name: A unique name to identify current process.
            rank: A unique rank of the current process. You do not need to specify
                it if you are using `torch.distributed.launch` or `torchelastic`
            world_size:   Size of the distributed world. You do not need to specify
                it if you are using `torch.distributed.launch` or `torchelastic`
            dist_timeout: Distributed package timeout in seconds.
            rpc_timeout:  Global rpc call timeout in seconds.
        """
        self.world_size = world_size
        self.rank = rank
        self.name = name
        self.groups = {}
        self.group_create_signals = {}

        if init_dist:
            dist.init_process_group(
                backend=dist_backend,
                init_method=dist_init_method,
                timeout=timedelta(seconds=dist_timeout),
                rank=rank,
                world_size=world_size,
            )
        if init_rpc:
            rpc.init_rpc(
                self.name,
                rank=rank,
                world_size=world_size,
                backend=rpc.BackendType.TENSORPIPE,
                rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
                    init_method=rpc_init_method, rpc_timeout=rpc_timeout
                ),
            )

        # get rank-name mapping
        self.rank_name_map = {}
        for wi in rpc._get_current_rpc_agent().get_worker_infos():
            self.rank_name_map[wi.id] = wi.name

        # Start role dispatching.
        self.started = True
        self.rpc_timeout = rpc_timeout

        # map for paired values and registered services
        self.value_lut = {}
        self.service_lut = {}
        self.lut_lock = Lock()
        self.lut_manager = self.rank_name_map[0]