def create_collective_group(self, backend, world_size, rank, group_name):
    """Create a new collective group and register it in the manager.

    The created group is recorded in both lookup tables of the manager
    (``_name_group_map`` and ``_group_name_map``).

    For the NCCL backend, rank 0 additionally generates the NCCL unique
    ID and publishes it through a detached, named ``NCCLUniqueIDStore``
    actor before the group object is constructed.

    Args:
        backend: a value accepted by ``types.Backend``.
        world_size: forwarded to ``NCCLGroup``.
        rank: forwarded to ``NCCLGroup``; rank 0 is the one that creates
            the unique-ID store.
        group_name: the name the group is registered under; also used to
            derive the name of the unique-ID store actor.

    Returns:
        The group registered under ``group_name``.

    Raises:
        NotImplementedError: if the MPI backend is requested.
    """
    backend = types.Backend(backend)
    if backend == types.Backend.MPI:
        raise NotImplementedError()
    elif backend == types.Backend.NCCL:
        # create the ncclUniqueID
        if rank == 0:
            # availability has been checked before entering here.
            group_uid = nccl_util.get_nccl_unique_id()
            store_name = get_nccl_store_name(group_name)
            # Avoid a potential circular dependency in ray/actor.py
            from ray.util.collective.util import NCCLUniqueIDStore
            store = NCCLUniqueIDStore.options(
                name=store_name, lifetime="detached").remote(store_name)
            # ray.get (unlike ray.wait) re-raises an error from the remote
            # set_id call, so a failed publication of the unique ID is
            # reported here instead of being silently ignored.
            ray.get([store.set_id.remote(group_uid)])

        logger.debug("creating NCCL group: '%s'", group_name)
        g = NCCLGroup(world_size, rank, group_name)
        self._name_group_map[group_name] = g
        self._group_name_map[g] = group_name
    return self._name_group_map[group_name]
def _generate_nccl_uid(self, key):
    """Create an NCCL unique ID and publish it through a named store actor.

    A fresh unique ID is obtained from ``nccl_util`` and written into a
    detached Ray named actor (``NCCLUniqueIDStore``) whose name is derived
    from ``key``. The call blocks until the store has accepted the ID.

    Because the store actor is created with a detached lifetime, it needs
    to be garbage collected when the collective group is destroyed.

    Args:
        key (str): the key from which the store actor's name is derived
            (it is passed to ``get_store_name``).

    Returns:
        The NCCL unique ID that was generated and stored.
    """
    unique_id = nccl_util.get_nccl_unique_id()
    actor_name = get_store_name(key)

    # Avoid a potential circular dependency in ray/actor.py
    from ray.util.collective.util import NCCLUniqueIDStore

    detached_store = NCCLUniqueIDStore.options(
        name=actor_name,
        lifetime="detached",
    )
    store_actor = detached_store.remote(actor_name)

    # Block until the ID has been stored; ray.get surfaces remote errors.
    pending_write = store_actor.set_id.remote(unique_id)
    ray.get([pending_write])
    return unique_id