def init_collective_group(world_size: int,
                          rank: int,
                          backend=types.Backend.NCCL,
                          group_name: str = "default"):
    """Initialize a collective group inside an actor process.

    Args:
        world_size (int): the total number of processes in the group.
        rank (int): the rank of the current process.
        backend: the CCL backend to use, NCCL or GLOO.
        group_name (str): the name of the collective group.

    Returns:
        None

    Raises:
        ValueError: if ``group_name`` is empty (falsy).
        RuntimeError: if the group manager already knows a group named
            ``group_name``.
        AssertionError: if ``world_size`` is not positive, or ``rank`` is
            not within ``[0, world_size)``.
    """
    _check_inside_actor()
    backend = types.Backend(backend)
    _check_backend_availability(backend)
    # TODO(Hao): implement a group auto-counter.
    if not group_name:
        raise ValueError("group_name '{}' needs to be a string."
                         .format(group_name))

    if _group_mgr.is_group_exist(group_name):
        raise RuntimeError("Trying to initialize a group twice.")

    # These checks are raised explicitly instead of using ``assert`` so that
    # they still run under ``python -O`` (which strips assert statements).
    # AssertionError is kept as the exception type so existing callers that
    # catch it keep working.
    if not world_size > 0:
        raise AssertionError(
            "world_size must be greater than zero. Got '{}'.".format(
                world_size))
    if not rank >= 0:
        raise AssertionError(
            "rank must be non-negative. Got '{}'.".format(rank))
    if not rank < world_size:
        raise AssertionError(
            "rank must be less than world_size. Got rank '{}' and "
            "world_size '{}'.".format(rank, world_size))

    _group_mgr.create_collective_group(backend, world_size, rank, group_name)
def create_collective_group(self, backend, world_size, rank, group_name):
    """Create a collective group and register it in the manager.

    The backend is first normalized through ``types.Backend``. A GLOO or
    NCCL group object is then constructed and recorded in both lookup
    tables: name -> group and group -> name.

    Args:
        backend: the backend identifier, coerced with ``types.Backend``.
        world_size: the total number of processes in the group.
        rank: the rank of the current process.
        group_name: the name under which the group is registered.

    Returns:
        The entry stored under ``group_name`` in the name -> group table.

    Raises:
        RuntimeError: if the backend is MPI.
    """
    backend = types.Backend(backend)

    # MPI is rejected up front; nothing is registered in that case.
    if backend == types.Backend.MPI:
        raise RuntimeError("Ray does not support MPI.")

    if backend == types.Backend.GLOO:
        logger.debug("Creating GLOO group: '{}'...".format(group_name))
        group = GLOOGroup(
            world_size,
            rank,
            group_name,
            store_type="ray_internal_kv",
            device_type="tcp",
        )
    elif backend == types.Backend.NCCL:
        logger.debug("Creating NCCL group: '{}'...".format(group_name))
        group = NCCLGroup(world_size, rank, group_name)
    else:
        # No group is created for any other backend; the lookup below is
        # performed against whatever is already registered (presumably a
        # KeyError if nothing is -- assumes a plain dict, TODO confirm).
        return self._name_group_map[group_name]

    # Both directions of the mapping are kept in sync.
    self._name_group_map[group_name] = group
    self._group_name_map[group] = group_name
    return self._name_group_map[group_name]
def create_collective_group(self, backend, world_size, rank, group_name):
    """Create a collective group and register it in the manager.

    For the NCCL backend, the process with rank 0 first generates an NCCL
    unique id and hands it to a detached, named ``NCCLUniqueIDStore`` actor.
    Every rank then builds an ``NCCLGroup`` and records it in both lookup
    tables: name -> group and group -> name.

    Args:
        backend: the backend identifier, coerced with ``types.Backend``.
        world_size: the total number of processes in the group.
        rank: the rank of the current process.
        group_name: the name under which the group is registered.

    Returns:
        The entry stored under ``group_name`` in the name -> group table.

    Raises:
        NotImplementedError: if the backend is MPI.
    """
    backend = types.Backend(backend)

    if backend == types.Backend.MPI:
        raise NotImplementedError()

    if backend == types.Backend.NCCL:
        if rank == 0:
            # Rank 0 creates the ncclUniqueID. Backend availability has
            # been checked before entering here.
            group_uid = nccl_util.get_nccl_unique_id()
            store_name = get_nccl_store_name(group_name)

            # Imported here to avoid a potential circular dependency in
            # ray/actor.py.
            from ray.util.collective.util import NCCLUniqueIDStore

            store_cls = NCCLUniqueIDStore.options(
                name=store_name, lifetime="detached")
            store = store_cls.remote(store_name)

            # Wait on the set_id call before building the group.
            pending = store.set_id.remote(group_uid)
            ray.wait([pending])

        logger.debug("creating NCCL group: '{}'".format(group_name))
        nccl_group = NCCLGroup(world_size, rank, group_name)
        self._name_group_map[group_name] = nccl_group
        self._group_name_map[nccl_group] = group_name

    # For any other backend nothing was created above; this lookup then
    # relies on an already-registered entry.
    return self._name_group_map[group_name]
def create_collective_group(
        actors,
        world_size: int,
        ranks: List[int],
        backend=types.Backend.NCCL,
        group_name: str = "default",
):
    """Declare a list of actors as a collective group.

    Note: This function should be called in a driver process.

    Args:
        actors: a list of actors to be set in a collective group.
        world_size: the total number of processes in the group.
        ranks (List[int]): the rank of each actor.
        backend: the CCL backend to use, NCCL or GLOO.
        group_name: the name of the collective group.

    Returns:
        None

    Raises:
        RuntimeError: if a named actor ``"info_" + group_name`` already
            exists, if ``ranks`` and ``actors`` differ in length, if
            ``ranks`` is not a permutation of ``0 .. len(ranks) - 1``, if
            ``world_size`` is not positive, or if any rank is negative or
            not smaller than ``world_size``.
    """
    backend = types.Backend(backend)
    _check_backend_availability(backend)

    name = "info_" + group_name
    try:
        ray.get_actor(name)
    except ValueError:
        # A ValueError from the lookup is treated as "no such named actor",
        # i.e. this group has not been declared yet.
        pass
    else:
        raise RuntimeError("Trying to initialize a group twice.")

    if len(ranks) != len(actors):
        raise RuntimeError(
            "Each actor should correspond to one rank. Got '{}' "
            "ranks but '{}' actors".format(len(ranks), len(actors)))

    if set(ranks) != set(range(len(ranks))):
        # Ranks are joined with a separator so multi-digit ranks stay
        # readable, and the reported upper bound is the largest valid rank.
        raise RuntimeError(
            "Ranks must be a permutation from 0 to '{}'. Got '{}'.".format(
                len(ranks) - 1, ", ".join(str(r) for r in ranks)))

    if world_size <= 0:
        raise RuntimeError(
            "World size must be greater than zero. Got '{}'.".format(
                world_size))

    # Each rank is checked individually. Comparing the result of
    # ``all(ranks)`` (a bool) against a number never inspects the ranks.
    if any(r < 0 for r in ranks):
        raise RuntimeError("Ranks must be non-negative.")
    if any(r >= world_size for r in ranks):
        raise RuntimeError("Ranks cannot be greater than world_size.")

    # avoid a circular dependency
    from ray.util.collective.util import Info

    # store the information into a NamedActor that can be accessed later.
    actors_id = [a._ray_actor_id for a in actors]
    # TODO (Dacheng): how do we recycle this name actor?
    info = Info.options(name=name, lifetime="detached").remote()
    ray.get([info.set_info.remote(actors_id, world_size, ranks, backend)])
def create_collective_group(self, backend, world_size, rank, group_name):
    """Create a collective group and register it in the manager.

    The backend is normalized through ``types.Backend``. For NCCL, an
    ``NCCLGroup`` is built and recorded in both lookup tables:
    name -> group and group -> name.

    Args:
        backend: the backend identifier, coerced with ``types.Backend``.
        world_size: the total number of processes in the group.
        rank: the rank of the current process.
        group_name: the name under which the group is registered.

    Returns:
        The entry stored under ``group_name`` in the name -> group table.

    Raises:
        NotImplementedError: if the backend is MPI.
    """
    backend = types.Backend(backend)

    # Guard clause: MPI is not implemented.
    if backend == types.Backend.MPI:
        raise NotImplementedError()

    if backend == types.Backend.NCCL:
        logger.debug("creating NCCL group: '{}'".format(group_name))
        nccl_group = NCCLGroup(world_size, rank, group_name)
        # Keep both directions of the mapping in sync.
        self._name_group_map[group_name] = nccl_group
        self._group_name_map[nccl_group] = group_name

    # For any other backend nothing was created above; this lookup then
    # relies on an already-registered entry.
    return self._name_group_map[group_name]
def declare_collective_group(actors,
                             world_size: int,
                             ranks: List[int],
                             backend=types.Backend.NCCL,
                             group_name: str = "default"):
    """Declare a list of actors as a collective group.

    Note: This function should be called in a driver process.

    Args:
        actors (list): a list of actors to be set in a collective group.
        world_size (int): the total number of processes in the group.
        ranks (List[int]): the rank of each actor, e.g. [0, 1] means the
            first actor is rank 0 and the second actor is rank 1.
        backend: the CCL backend to use; coerced with ``types.Backend``.
        group_name (str): the name of the collective group.

    Returns:
        None

    Raises:
        RuntimeError: if a named actor ``"info_" + group_name`` already
            exists, if ``ranks`` and ``actors`` differ in length, or if
            ``ranks`` is not a permutation of ``0 .. len(ranks) - 1``.
        AssertionError: if ``world_size`` is not positive, or any rank is
            negative or not smaller than ``world_size``.
    """
    backend = types.Backend(backend)
    _check_backend_availability(backend)

    name = "info_" + group_name
    try:
        ray.get_actor(name)
    except ValueError:
        # A ValueError from the lookup is treated as "no such named actor",
        # i.e. this group has not been declared yet.
        pass
    else:
        raise RuntimeError("Trying to initialize a group twice.")

    if len(ranks) != len(actors):
        raise RuntimeError(
            "Each actor should correspond to one rank. Got '{}' "
            "ranks but '{}' actors".format(len(ranks), len(actors)))

    if set(ranks) != set(range(len(ranks))):
        # Ranks are joined with a separator so multi-digit ranks stay
        # readable, and the reported upper bound is the largest valid rank.
        raise RuntimeError(
            "Ranks must be a permutation from 0 to '{}'. Got '{}'.".format(
                len(ranks) - 1, ", ".join(str(r) for r in ranks)))

    # These checks are raised explicitly instead of using ``assert`` so that
    # they still run under ``python -O``. AssertionError is kept as the
    # exception type for backward compatibility.
    if not world_size > 0:
        raise AssertionError(
            "world_size must be greater than zero. Got '{}'.".format(
                world_size))
    # Each rank is checked individually. Comparing the result of
    # ``all(ranks)`` (a bool) against a number never inspects the ranks.
    if not all(0 <= r < world_size for r in ranks):
        raise AssertionError(
            "Every rank must be within [0, world_size). Got ranks '{}' "
            "and world_size '{}'.".format(
                ", ".join(str(r) for r in ranks), world_size))

    # avoid a circular dependency
    from ray.util.collective.util import Info

    # store the information into a NamedActor that can be accessed later.
    actors_id = [a._ray_actor_id for a in actors]
    info = Info.options(name=name, lifetime="detached").remote()
    ray.get([info.set_info.remote(actors_id, world_size, ranks, backend)])