Example #1
def init_rpc(
    name: str,
    # pyre-fixme[11]: Annotation `BackendType` is not defined as a type.
    backend: torch_rpc.backend_registry.BackendType,
    # pyre-fixme[11]: Annotation `RpcBackendOptions` is not defined as a type.
    backend_options: torch_rpc.RpcBackendOptions,
    store,
):
    if not backend_options:
        # Construct a default set of RPC backend options.
        backend_options = backend_registry.construct_rpc_backend_options(
            backend)
    rank = int(utils.get_env_variable_or_raise("RANK"))
    world_size = int(utils.get_env_variable_or_raise("WORLD_SIZE"))

    # Initialize autograd before RPC since _init_rpc_backend guarantees all
    # processes sync via the store. If we initialize autograd after RPC,
    # there could be a race where some nodes might have initialized autograd
    # and others might not have. As a result, a node calling
    # torch.distributed.autograd.backward() would run into errors since
    # other nodes might not have been initialized.
    # pyre-fixme[16]: Module `dist_autograd` has no attribute `_init`.
    dist_autograd._init(rank)

    # Initialize RPC.
    api._init_rpc_backend(backend, store, name, rank, world_size,
                          backend_options)
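The ``utils.get_env_variable_or_raise`` helper called above is not shown in this
snippet. A minimal sketch of what such a helper might look like (the name comes
from the call above, but the body is an assumption, not the actual
implementation):

import os

def get_env_variable_or_raise(name: str) -> str:
    # Hypothetical helper: fail fast when RANK or WORLD_SIZE is missing
    # instead of silently falling back to a default.
    value = os.environ.get(name)
    if value is None:
        raise ValueError(
            "Environment variable {} expected, but not set".format(name))
    return value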
Example #2
    def init_rpc(
        name,
        backend=backend_registry.BackendType.PROCESS_GROUP,
        rank=-1,
        world_size=None,
        rpc_backend_options=None,
    ):
        r"""
        Initializes RPC primitives such as the local RPC agent
        and distributed autograd.

        Initializes the local RPC agent, which immediately makes the current
        process ready to send and receive RPCs. This method also properly
        initializes a default process group backend that uses gloo for
        collective communication.

        Arguments:
            backend (Enum): The type of RPC backend implementation. Currently,
                the process group backend is the only available backend
                implementation (default: ``BackendType.PROCESS_GROUP``).
            name (str): a globally unique name of this node (e.g.,
                ``Trainer3``, ``ParameterServer2``, ``Master``, ``Worker1``).
                The name can only contain numbers, letters, underscores, and/or
                dashes, and must be shorter than 128 characters.
            rank (int): a globally unique id/rank of this node.
            world_size (int): The number of workers in the group.
            rpc_backend_options (RpcBackendOptions): The options passed to the
                RpcAgent constructor. It contains RpcAgent-specific
                initialization configurations. By default, it contains
                ``rpc_timeout = timedelta(seconds=60)``,
                ``init_method = "env://"``, and ``num_send_recv_threads = 4``
                for the process group agent. If using the default
                ``rpc_backend_options``, RPC initializes the underlying
                process group backend using ``init_method = "env://"``,
                meaning that the environment variables ``MASTER_ADDR`` and
                ``MASTER_PORT`` need to be set properly.
        """

        if not rpc_backend_options:
            # Construct a default set of RPC backend options.
            rpc_backend_options = backend_registry.construct_rpc_backend_options(
                backend)

        # Rendezvous.
        rendezvous_iterator = torch.distributed.rendezvous(
            rpc_backend_options.init_method, rank=rank, world_size=world_size)
        store, _, _ = next(rendezvous_iterator)

        # Initialize autograd before RPC since _init_rpc_backend guarantees all
        # processes sync via the store. If we initialize autograd after RPC,
        # there could be a race where some nodes might have initialized autograd
        # and others might not have. As a result, a node calling
        # torch.distributed.autograd.backward() would run into errors since
        # other nodes might not have been initialized.
        dist_autograd._init(rank)

        # Initialize RPC.
        api._init_rpc_backend(backend, store, name, rank, world_size,
                              rpc_backend_options)
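Given the ``init_method = "env://"`` default described in the docstring, a
single-process usage sketch for this version of the public
``torch.distributed.rpc`` API might look as follows (the address, port, and
worker name are placeholder values):

import os
import torch.distributed.rpc as rpc

# Placeholder rendezvous settings for the env:// init_method.
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "29500"

# world_size=1 so the sketch can run standalone; a real deployment would start
# one process per worker with matching ranks.
rpc.init_rpc("Worker0", rank=0, world_size=1)
rpc.shutdown()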
Example #3
    def init_rpc(
        name,
        backend=BackendType.PROCESS_GROUP,
        rank=-1,
        world_size=None,
        rpc_backend_options=None,
    ):
        r"""
        Initializes RPC primitives such as the local RPC agent
        and distributed autograd, which immediately makes the current
        process ready to send and receive RPCs.

        Arguments:
            backend (BackendType, optional): The type of RPC backend
                implementation. Supported values include
                ``BackendType.PROCESS_GROUP`` (the default) and
                ``BackendType.TENSORPIPE``. See :ref:`rpc-backends` for more
                information.
            name (str): a globally unique name of this node (e.g.,
                ``Trainer3``, ``ParameterServer2``, ``Master``, ``Worker1``).
                The name can only contain numbers, letters, underscores, colons,
                and/or dashes, and must be shorter than 128 characters.
            rank (int): a globally unique id/rank of this node.
            world_size (int): The number of workers in the group.
            rpc_backend_options (RpcBackendOptions, optional): The options
                passed to the RpcAgent constructor. It must be an agent-specific
                subclass of :class:`~torch.distributed.rpc.RpcBackendOptions`
                and contain agent-specific initialization configurations. By
                default, for all agents, it sets the default timeout to 60
                seconds and performs the rendezvous with an underlying process
                group initialized using ``init_method = "env://"``,
                meaning that the environment variables ``MASTER_ADDR`` and
                ``MASTER_PORT`` need to be set properly. See
                :ref:`rpc-backends` for more information and to find which
                options are available.
        """

        if not rpc_backend_options:
            # Construct a default set of RPC backend options.
            rpc_backend_options = backend_registry.construct_rpc_backend_options(
                backend)

        # Rendezvous.
        # This rendezvous state is sometimes destroyed before all processes
        # finish handshaking. To avoid that issue, we make it global to
        # keep it alive.
        global rendezvous_iterator
        rendezvous_iterator = torch.distributed.rendezvous(
            rpc_backend_options.init_method, rank=rank, world_size=world_size)
        store, _, _ = next(rendezvous_iterator)

        # Use a PrefixStore to distinguish multiple invocations.
        with _init_counter_lock:
            global _init_counter
            store = dist.PrefixStore(
                'rpc_prefix_{}'.format(_init_counter), store)
            _init_counter += 1

        # Initialize autograd before RPC since _init_rpc_backend guarantees all
        # processes sync via the store. If we initialize autograd after RPC,
        # there could be a race where some nodes might have initialized autograd
        # and others might not have. As a result, a node calling
        # torch.distributed.autograd.backward() would run into errors since
        # other nodes might not have been initialized.
        dist_autograd._init(rank)

        _set_profiler_node_id(rank)
        # Initialize RPC.
        api._init_rpc_backend(backend, store, name, rank, world_size,
                              rpc_backend_options)
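The ``PrefixStore`` wrapper introduced in this version namespaces the store keys
per invocation, so that shutting down and re-initializing RPC in the same
process does not collide with keys left over from a previous run. A small
illustration of that isolation, using an in-memory ``HashStore`` as a stand-in
for the rendezvous store:

import torch.distributed as dist

base_store = dist.HashStore()
first = dist.PrefixStore("rpc_prefix_0", base_store)
second = dist.PrefixStore("rpc_prefix_1", base_store)

# The same key written through each wrapper lands under a different prefix.
first.set("agent_name", "Worker0")
second.set("agent_name", "Worker1")
assert first.get("agent_name") == b"Worker0"
assert second.get("agent_name") == b"Worker1"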
Example #4
    def init_rpc(
        name,
        backend=None,
        rank=-1,
        world_size=None,
        rpc_backend_options=None,
    ):
        r"""
        Initializes RPC primitives such as the local RPC agent
        and distributed autograd, which immediately makes the current
        process ready to send and receive RPCs.

        Args:
            backend (BackendType, optional): The type of RPC backend
                implementation. Supported values include
                ``BackendType.TENSORPIPE`` (the default) and
                ``BackendType.PROCESS_GROUP``. See :ref:`rpc-backends` for more
                information.
            name (str): a globally unique name of this node (e.g.,
                ``Trainer3``, ``ParameterServer2``, ``Master``, ``Worker1``).
                The name can only contain numbers, letters, underscores, colons,
                and/or dashes, and must be shorter than 128 characters.
            rank (int): a globally unique id/rank of this node.
            world_size (int): The number of workers in the group.
            rpc_backend_options (RpcBackendOptions, optional): The options
                passed to the RpcAgent constructor. It must be an agent-specific
                subclass of :class:`~torch.distributed.rpc.RpcBackendOptions`
                and contain agent-specific initialization configurations. By
                default, for all agents, it sets the default timeout to 60
                seconds and performs the rendezvous with an underlying process
                group initialized using ``init_method = "env://"``,
                meaning that the environment variables ``MASTER_ADDR`` and
                ``MASTER_PORT`` need to be set properly. See
                :ref:`rpc-backends` for more information and to find which
                options are available.
        """

        if backend is not None and not isinstance(
                backend, backend_registry.BackendType):
            raise TypeError("Argument backend must be a member of BackendType")

        if rpc_backend_options is not None and not isinstance(
                rpc_backend_options, RpcBackendOptions):
            raise TypeError(
                "Argument rpc_backend_options must be an instance of RpcBackendOptions"
            )

        # To avoid breaking users that passed a ProcessGroupRpcBackendOptions
        # without specifying the backend as PROCESS_GROUP when that was the
        # default, we try to detect the backend from the options when only the
        # latter is passed.
        if backend is None and rpc_backend_options is not None:
            for candidate_backend in BackendType:
                if isinstance(
                        rpc_backend_options,
                        type(
                            backend_registry.construct_rpc_backend_options(
                                candidate_backend)),
                ):
                    backend = candidate_backend
                    break
            else:
                raise TypeError(
                    f"Could not infer backend for options {rpc_backend_options}"
                )
            # Ignore type error because mypy doesn't handle dynamically generated type objects (#4865)
            if backend != BackendType.TENSORPIPE:  # type: ignore[attr-defined]
                logger.warning(
                    f"RPC was initialized with no explicit backend but with options "  # type: ignore[attr-defined]
                    f"corresponding to {backend}, hence that backend will be used "
                    f"instead of the default {BackendType.TENSORPIPE}. To silence this "
                    f"warning pass `backend={backend}` explicitly.")

        if backend is None:
            backend = BackendType.TENSORPIPE  # type: ignore[attr-defined]

        if backend == BackendType.PROCESS_GROUP:  # type: ignore[attr-defined]
            logger.warning(
                "RPC was initialized with the PROCESS_GROUP backend which is "
                "deprecated and slated to be removed and superseded by the TENSORPIPE "
                "backend. It is recommended to migrate to the TENSORPIPE backend."
            )

        if rpc_backend_options is None:
            # Construct a default set of RPC backend options.
            rpc_backend_options = backend_registry.construct_rpc_backend_options(
                backend)

        # Rendezvous.
        # This rendezvous state is sometimes destroyed before all processes
        # finish handshaking. To avoid that issue, we make it global to
        # keep it alive.
        global rendezvous_iterator
        rendezvous_iterator = torch.distributed.rendezvous(
            rpc_backend_options.init_method, rank=rank, world_size=world_size)
        store, _, _ = next(rendezvous_iterator)

        # Use a PrefixStore to distinguish multiple invocations.
        with _init_counter_lock:
            global _init_counter
            store = dist.PrefixStore(
                'rpc_prefix_{}'.format(_init_counter), store)
            _init_counter += 1

        # Initialize autograd before RPC since _init_rpc_backend guarantees all
        # processes sync via the store. If we initialize autograd after RPC,
        # there could be a race where some nodes might have initialized autograd
        # and others might not have. As a result, a node calling
        # torch.distributed.autograd.backward() would run into errors since
        # other nodes might not have been initialized.
        dist_autograd._init(rank)

        _set_profiler_node_id(rank)
        # Initialize RPC.
        _init_rpc_backend(backend, store, name, rank, world_size,
                          rpc_backend_options)
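A usage sketch of the backend-inference path added in this version: the caller
passes only a ``TensorPipeRpcBackendOptions`` instance and omits ``backend``, so
the loop over ``BackendType`` above infers ``BackendType.TENSORPIPE`` from the
options type (the address, port, and thread count are placeholder values):

import os
import torch.distributed.rpc as rpc

os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "29500"

# No backend argument: it is inferred from the type of the options object.
options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=8)
rpc.init_rpc("Worker0", rank=0, world_size=1, rpc_backend_options=options)
rpc.shutdown()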