Ejemplo n.º 1
0
    def _parse_and_validate_remote_device(self, device):
        """Split ``device`` into ``(rank, local_device)`` and check the rank.

        Raises:
            ValueError: if the parsed rank is not an int inside
                ``[0, world_size)`` for ``self._process_group``.
        """
        rank, local_device = _parse_remote_device(device)  # type: ignore[arg-type]

        # The rank must be an integer within the process group's world size.
        world_size = dist.get_world_size(self._process_group)
        rank_is_valid = isinstance(rank, int) and 0 <= rank < world_size
        if not rank_is_valid:
            raise ValueError(f'Invalid rank: {rank}')

        return rank, local_device
Ejemplo n.º 2
0
def is_valid_device(device):
    """
    Return ``True`` if ``device`` is a valid local or remote device.

    A device is considered valid when either the ``torch.device``
    constructor or the remote-device parser accepts it.
    """
    # Local device: accepted by the torch.device constructor.
    try:
        torch.device(device)
    except Exception:
        # Not a local device; fall back to the remote-device format.
        try:
            _parse_remote_device(device)
        except Exception:
            return False
    return True
Ejemplo n.º 3
0
    def _parse_and_validate_remote_device(self, device):
        """Parse ``device`` into ``(on, local_device)``.

        ``on`` is either a rank (int), validated against the process group's
        world size, or a worker name (str), resolved to that worker's id via
        the current RPC agent.

        Raises:
            ValueError: on an out-of-range rank or an unknown worker name.
            RuntimeError: when a worker name is used but RPC is uninitialized.
        """
        on, local_device = _parse_remote_device(device)  # type: ignore[arg-type]

        if isinstance(on, int):
            # Rank-based placement: must fall inside the world size.
            if not 0 <= on < dist.get_world_size(self._process_group):
                raise ValueError(f'Invalid rank: {on}')
            return on, local_device

        if isinstance(on, str):
            # Name-based placement: resolve the name through the RPC agent.
            if not rpc._is_current_rpc_agent_set():
                raise RuntimeError(f'RPC framework needs to be initialized for using worker names: {on}')

            for worker in rpc._get_current_rpc_agent().get_worker_infos():
                if worker.name == on:
                    return worker.id, local_device

            raise ValueError(f'Invalid worker name: {on}')

        return on, local_device
Ejemplo n.º 4
0
    def _prepare_init(self, remote_device: str) -> bool:  # type: ignore[return]
        """
        Prepares the initialization and returns whether to enable automatically moving CPU tensors to CUDA devices.
        """
        # Sanity check.
        assert rpc._is_current_rpc_agent_set(), "RemoteModule only works in RPC."

        self.on, self.device = _parse_remote_device(remote_device)
        agent = rpc._get_current_rpc_agent()
        # A non-empty device map on the destination worker means any input CPU
        # tensors can be moved to the same cuda device automatically.
        self.is_device_map_set = bool(
            agent._get_device_map(agent.get_worker_info(self.on))
        )
        # The returned flag is less strict than ``is_device_map_set``: even
        # when the device map is not set, CPU tensors may still be moved to a
        # cuda device to run forward, but the output must then be moved back
        # to CPU before being sent over the wire.
        return torch.device(self.device).type == "cuda"
Ejemplo n.º 5
0
    def __init__(
        self,
        remote_device: str,
        module_cls: Type[nn.Module] = None,
        args: Tuple = None,
        kwargs: Dict[str, Any] = None,
        module_rref: rpc.RRef[nn.Module] = None,
        _module_interface_cls: Any = None,
    ):
        """
        A RemoteModule instance can only be created after RPC initialization.
        It creates a user-specified module on a specified remote node.
        It behaves like a regular ``nn.Module`` except that the ``forward`` method is
        executed on the remote node.
        It takes care of autograd recording to ensure the backward pass propagates
        gradients back to the corresponding remote module.
        It can be shared across processors using `RPC framework <https://pytorch.org/docs/stable/rpc.html>`__,
        without incurring any overheads of copying the actual module,
        which is equivalent to an :class:`~torch.distributed.rpc.RRef`
        pointing to the remote module.

        The arguments of ``forward_async`` and ``forward`` are the same as
        the ``forward`` method of the module returned by the ``module_cls``.

        Apart from ``forward_async`` and ``forward``, no other methods are supported from nn.Module for now.

        Particularly, to create a hybrid model, typically the local modules should be
        created outside of remote modules, rather than as submodules of any remote module (by calling ``add_module``).
        Hybrid Example:
                >>> class HybridModel(nn.Module):
                >>>     def __init__(self):
                >>>         nn.Module.__init__(self)
                >>>         self.remote_embedding = RemoteModule(...)
                >>>         self.local_linear = nn.Linear(...)

        For example, if ``module_cls`` returns an instance of ``nn.Linear``,
        that has ``forward`` method signature, ``def forward(input: Tensor) -> Tensor:``,
        the generated ``RemoteModule`` will have 2 methods in signature of
        ``def forward(input: Tensor) -> Tensor:`` and
        ``def forward_async(input: Tensor) -> Future[Tensor]:``.

        .. note::
            If the remote module is placed on a cuda device,
            any input CPU tensors will be automatically moved to the same cuda device,
            and GPU tensors are returned over the wire according to the device map of the remote worker on TensorPipe RPC backend.

        Args:
            remote_device (str): Device on the destination worker where we'd like to place this module.
                The device can be a local device or a remote device specified by one of the following remote
                formats:

                    1. "rank:<rank>/<device>" (ex: "rank:0/cuda:0").
                    2. "<worker_name>/<device>" (ex: "trainer0/cuda:0").

                In addition, the device field can be optional and the default value is "cpu".
            module_cls (nn.Module): For example,
                >>> class MyModule(nn.Module):
                >>>     def forward(input):
                >>>         return input + 1
                >>>
                >>> module_cls = MyModule
            args (Sequence, optional): args to be passed to ``module_cls``.
            kwargs (Dict, optional): kwargs to be passed to ``module_cls``.
            module_rref (RRef[nn.Module], optional): If provided, no new module will be actually created,
                and only reuses a module reference possibly shared by another remote module.
                This alternate construction can help save memory footprint by not creating an underlying module.
                For this case, the other 3 args ``module_cls``, ``args``, and ``kwargs`` will be disregarded.
            _module_interface_cls (type, optional): The TorchScript interface type for the module
                to be created. The type object should be decorated by @torch.jit.interface.
                If not provided, the generated RemoteModule is not torchscript-able.
                Warning, this is an experimental API and susceptible to frequent changes.

        Returns:
            A remote module instance which wraps the :class:`~nn.Module` created by the
            user-provided ``module_cls`` or ``module_rref``, it has a blocking ``forward`` method and an
            asynchronous ``forward_async`` method that returns a future of the ``forward`` call
            on the user-provided module on the remote side.

        Example::
            Run the following code in two different processes:

            >>> # On worker 0:
            >>> import torch
            >>> import torch.distributed.rpc as rpc
            >>> from torch import nn, Tensor
            >>> from torch.distributed.nn.api.remote_module import RemoteModule
            >>>
            >>> rpc.init_rpc("worker0", rank=0, world_size=2)
            >>> remote_linear_module = RemoteModule(
            >>>     "worker1/cpu", nn.Linear, args=(20, 30),
            >>> )
            >>> input = torch.randn(128, 20)
            >>> ret_fut = remote_linear_module.forward_async(input)
            >>> ret = ret_fut.wait()
            >>> rpc.shutdown()

            >>> # On worker 1:
            >>> import torch
            >>> import torch.distributed.rpc as rpc
            >>>
            >>> rpc.init_rpc("worker1", rank=1, world_size=2)
            >>> rpc.shutdown()
        """
        super().__init__()

        assert (module_cls is not None or module_rref
                is not None), "module_cls and module_rref cannot be both None."

        # NOTE: if a new attribute is added to this class, also need to add it
        # to ``_REMOTE_MODULE_PICKLED_ATTRIBUTES`` for pickling/unpickling.

        # Sanity checks.
        assert rpc._is_current_rpc_agent_set(
        ), "RemoteModule only works in RPC."

        # Default arguments preparation.
        args = args if args is not None else ()
        kwargs = kwargs if kwargs is not None else {}

        self.on, self.device = _parse_remote_device(remote_device)
        agent = rpc._get_current_rpc_agent()
        # If the device map of the remote worker is set,
        # then enable moving any input CPU tensors to the same cuda device.
        self.is_device_map_set = bool(
            agent._get_device_map(agent.get_worker_info(self.on)))
        # ``enable_moving_cpu_tensors_to_cuda`` is less strict than ``is_device_map_set``:
        # If ``enable_moving_cpu_tensors_to_cuda`` is true, but the device map is not set,
        # then any CPU tensors can still be moved to a cuda device to run forward,
        # but the output must be moved back to CPU before being sent over the wire.
        enable_moving_cpu_tensors_to_cuda = torch.device(
            self.device).type == "cuda"

        if _module_interface_cls is not None:
            # Users rely on this field to know if this generated RemoteModule is TorchScript-able.
            self.is_scriptable = True

            # BUG FIX: the remote template instantiation used to be launched
            # twice (the same ``rpc.rpc_async`` block appeared both before and
            # after the local instantiation), issuing a redundant RPC per
            # construction. Launch it exactly once, before the local
            # instantiation so the two overlap, and wait on it below.
            if module_rref is None:
                # Instantiate template on remote side.
                fut = rpc.rpc_async(
                    self.on,
                    _instantiate_template,
                    (_module_interface_cls, enable_moving_cpu_tensors_to_cuda),
                )

            # Instantiate template on local side.
            generated_module = (
                instantiator.instantiate_scriptable_remote_module_template(
                    _module_interface_cls, enable_moving_cpu_tensors_to_cuda))
            self.generated_methods = generated_module._generated_methods

            if module_rref is None:
                # Create the module on the remote side.
                fut.wait(
                )  # Ensure remote_module_cls is available on remote side.

                # TODO: We need to change this to rpc.remote, and make it async (see the else branch below).
                # For that we need to be able to apply _module_interface_cls to the RRef returned by rpc.remote
                # See https://github.com/pytorch/pytorch/issues/58098 for more context.
                self.module_rref = rpc.rpc_sync(
                    self.on,
                    _create_module_with_interface,
                    (module_cls, args, kwargs, self.device,
                     _module_interface_cls),
                )
            else:
                self.module_rref = module_rref
        else:
            self.is_scriptable = False
            self.generated_methods = (
                _NON_SCRIPTABLE_REMOTE_MODULE_MODULE._generated_methods)
            if module_rref is None:
                # Create the module on the remote side.
                self.module_rref = rpc.remote(
                    self.on,
                    _create_module,
                    (module_cls, args, kwargs, self.device),
                )
            else:
                self.module_rref = module_rref

        # Install generated methods.
        for method in self.generated_methods:
            method_name = method.__name__
            method = torch.jit.export(method)
            setattr(self, method_name, types.MethodType(method, self))

        # Sanity check: whether to be pickled must be explicitly defined for every attribute.
        for k in self.__dict__.keys():
            if (k not in _REMOTE_MODULE_PICKLED_ATTRIBUTES and k
                    not in _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING):
                raise AttributeError(
                    "Attribute {} must be either in ``_REMOTE_MODULE_PICKLED_ATTRIBUTES`` or "
                    "``_REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING``.".
                    format(k))
Ejemplo n.º 6
0
    def _init_chunked(
        self,
        sharding_spec: ChunkShardingSpec,
        dims,
        dtype,
        layout,
        requires_grad,
        pin_memory,
        memory_format,
        process_group,
    ):
        """Build shard metadata for a chunk-wise sharding spec and allocate
        the local shard(s) owned by the current rank.

        Appends one ``ShardMetadata`` per non-empty chunk to
        ``self._sharding_metadata`` and, for chunks placed on the current
        rank, an empty tensor ``Shard`` to ``self._local_shards``.

        Raises:
            ValueError: on a non-integer or out-of-range sharding dim, an
                invalid placement device, or an out-of-range rank.
        """
        current_rank = dist.get_rank(process_group)
        sharding_dim = sharding_spec.dim

        # Validate the sharding spec.
        if not isinstance(sharding_dim, int):
            raise ValueError(
                f"Sharding dim needs to be an integer, found: {sharding_dim}"
            )
        if sharding_dim >= len(dims) or sharding_dim < -len(dims):
            raise ValueError(f"Invalid sharding dim: {sharding_dim}")

        dim_size = dims[sharding_dim]
        placements = sharding_spec.placements
        # Ceil-divide the sharded dimension across placements, matching the
        # split sizes produced by 'torch.chunk'.
        split_size = (dim_size + len(placements) - 1) // len(placements)
        world_size = dist.get_world_size(process_group)

        for idx, device in enumerate(placements):
            if not is_valid_device(device):
                raise ValueError(f"{device} is not a valid device")

            rank, local_device = _parse_remote_device(device)  # type: ignore[arg-type]

            # Validate rank.
            if not isinstance(rank, int) or not (0 <= rank < world_size):
                raise ValueError(f'Invalid rank: {rank}')

            # Size of this placement's chunk along the sharding dim; the
            # final chunks may be empty when dim_size < split_size * idx.
            offset = split_size * idx
            sharded_dim_size = min(dim_size, split_size * (idx + 1)) - offset
            if sharded_dim_size <= 0:
                continue

            # Build sharding_metadata for this non-empty chunk.
            rank_dims = dims.copy()  # shallow copy is enough: plain ints
            rank_dims[sharding_dim] = sharded_dim_size
            rank_offsets = [0] * len(dims)
            rank_offsets[sharding_dim] = offset

            shard_metadata = ShardMetadata(rank_offsets, rank_dims, device)
            self._sharding_metadata.append(shard_metadata)

            # Allocate the local shard only if the current rank owns it.
            if current_rank == rank:
                local_shard = torch.empty(
                    *rank_dims,
                    dtype=dtype,
                    layout=layout,
                    device=local_device,
                    requires_grad=requires_grad,
                    memory_format=memory_format,
                    pin_memory=pin_memory,
                )
                self._local_shards.append(Shard(local_shard, shard_metadata))