Example #1
0
def init_collective_group(world_size: int,
                          rank: int,
                          backend=types.Backend.NCCL,
                          group_name: str = "default"):
    """Initialize a collective group inside an actor process.

    Args:
        world_size (int): the total number of processes in the group.
        rank (int): the rank of the current process.
        backend: the CCL backend to use, NCCL or GLOO.
        group_name (str): the name of the collective group.

    Returns:
        None
    """
    _check_inside_actor()
    backend = types.Backend(backend)
    _check_backend_availability(backend)
    global _group_mgr
    # TODO(Hao): implement a group auto-counter.
    if not group_name:
        raise ValueError("group_name '{}' needs to be a string."
                         .format(group_name))

    if _group_mgr.is_group_exist(group_name):
        raise RuntimeError("Trying to initialize a group twice.")

    assert (world_size > 0)
    assert (rank >= 0)
    assert (rank < world_size)
    _group_mgr.create_collective_group(backend, world_size, rank, group_name)
Example #2
0
    def create_collective_group(self, backend, world_size, rank, group_name):
        """The entry to create new collective groups in the manager.

        Put the registration and the group information into the manager
        metadata as well.
        """
        backend = types.Backend(backend)
        if backend == types.Backend.MPI:
            raise RuntimeError("Ray does not support MPI.")
        elif backend == types.Backend.GLOO:
            logger.debug("Creating GLOO group: '{}'...".format(group_name))
            g = GLOOGroup(
                world_size,
                rank,
                group_name,
                store_type="ray_internal_kv",
                device_type="tcp",
            )
            self._name_group_map[group_name] = g
            self._group_name_map[g] = group_name
        elif backend == types.Backend.NCCL:
            logger.debug("Creating NCCL group: '{}'...".format(group_name))
            g = NCCLGroup(world_size, rank, group_name)
            self._name_group_map[group_name] = g
            self._group_name_map[g] = group_name
        return self._name_group_map[group_name]
Example #3
0
    def create_collective_group(self, backend, world_size, rank, group_name):
        """The entry to create new collective groups in the manager.

        Put the registration and the group information into the manager
        metadata as well.
        """
        backend = types.Backend(backend)
        if backend == types.Backend.MPI:
            raise NotImplementedError()
        elif backend == types.Backend.NCCL:
            # create the ncclUniqueID
            if rank == 0:
                # availability has been checked before entering here.
                group_uid = nccl_util.get_nccl_unique_id()
                store_name = get_nccl_store_name(group_name)
                # Avoid a potential circular dependency in ray/actor.py
                from ray.util.collective.util import NCCLUniqueIDStore
                store = NCCLUniqueIDStore.options(
                    name=store_name, lifetime="detached").remote(store_name)
                ray.wait([store.set_id.remote(group_uid)])

            logger.debug("creating NCCL group: '{}'".format(group_name))
            g = NCCLGroup(world_size, rank, group_name)
            self._name_group_map[group_name] = g
            self._group_name_map[g] = group_name
        return self._name_group_map[group_name]
Example #4
0
def create_collective_group(
    actors,
    world_size: int,
    ranks: List[int],
    backend=types.Backend.NCCL,
    group_name: str = "default",
):
    """Declare a list of actors as a collective group.

    Note: This function should be called in a driver process.

    Args:
        actors: a list of actors to be set in a collective group.
        world_size: the total number of processes in the group.
        ranks (List[int]): the rank of each actor.
        backend: the CCL backend to use, NCCL or GLOO.
        group_name: the name of the collective group.

    Returns:
        None
    """
    backend = types.Backend(backend)
    _check_backend_availability(backend)

    name = "info_" + group_name
    try:
        ray.get_actor(name)
        raise RuntimeError("Trying to initialize a group twice.")
    except ValueError:
        pass

    if len(ranks) != len(actors):
        raise RuntimeError(
            "Each actor should correspond to one rank. Got '{}' "
            "ranks but '{}' actors".format(len(ranks), len(actors)))

    if set(ranks) != set(range(len(ranks))):
        raise RuntimeError(
            "Ranks must be a permutation from 0 to '{}'. Got '{}'.".format(
                len(ranks), "".join([str(r) for r in ranks])))

    if world_size <= 0:
        raise RuntimeError(
            "World size must be greater than zero. Got '{}'.".format(
                world_size))
    if not all(ranks) >= 0:
        raise RuntimeError("Ranks must be non-negative.")
    if not all(ranks) < world_size:
        raise RuntimeError("Ranks cannot be greater than world_size.")

    # avoid a circular dependency
    from ray.util.collective.util import Info

    # store the information into a NamedActor that can be accessed later.
    name = "info_" + group_name
    actors_id = [a._ray_actor_id for a in actors]
    # TODO (Dacheng): how do we recycle this name actor?
    info = Info.options(name=name, lifetime="detached").remote()
    ray.get([info.set_info.remote(actors_id, world_size, ranks, backend)])
Example #5
0
    def create_collective_group(self, backend, world_size, rank, group_name):
        """The entry to create new collective groups in the manager.

        Put the registration and the group information into the manager
        metadata as well.
        """
        backend = types.Backend(backend)
        if backend == types.Backend.MPI:
            raise NotImplementedError()
        elif backend == types.Backend.NCCL:
            logger.debug("creating NCCL group: '{}'".format(group_name))
            g = NCCLGroup(world_size, rank, group_name)
            self._name_group_map[group_name] = g
            self._group_name_map[g] = group_name
        return self._name_group_map[group_name]
Example #6
0
def declare_collective_group(actors,
                             world_size: int,
                             ranks: List[int],
                             backend=types.Backend.NCCL,
                             group_name: str = "default"):
    """Declare a list of actors as a collective group.

    Note: This function should be called in a driver process.

    Args:
        actors (list): a list of actors to be set in a collective group.
        group_options (dict): a dictionary that contains group_name(str),
                              world_size(int), rank(list of int, e.g. [0,1]
                              means the first actor is rank 0, and the second
                              actor is rank 1), backend(str).
    """
    backend = types.Backend(backend)
    _check_backend_availability(backend)

    name = "info_" + group_name
    try:
        ray.get_actor(name)
        raise RuntimeError("Trying to initialize a group twice.")
    except ValueError:
        pass

    if len(ranks) != len(actors):
        raise RuntimeError(
            "Each actor should correspond to one rank. Got '{}' "
            "ranks but '{}' actors".format(len(ranks), len(actors)))

    if set(ranks) != set(range(len(ranks))):
        raise RuntimeError(
            "Ranks must be a permutation from 0 to '{}'. Got '{}'.".format(
                len(ranks), "".join([str(r) for r in ranks])))

    assert world_size > 0
    assert all(ranks) >= 0 and all(ranks) < world_size

    # avoid a circular dependency
    from ray.util.collective.util import Info
    # store the information into a NamedActor that can be accessed later/
    name = "info_" + group_name
    actors_id = [a._ray_actor_id for a in actors]
    info = Info.options(name=name, lifetime="detached").remote()
    ray.get([info.set_info.remote(actors_id, world_size, ranks, backend)])