def create_store(self, store_type):
    """Create the rendezvous store and keep it in ``self._store``.

    Args:
        store_type (str): one of "redis", "file" or "hash".

    Raises:
        NotImplementedError: if ``store_type`` is "hash".
        RuntimeError: if ``store_type`` is not a recognized value.
    """
    if store_type == "redis":
        redis_store = pygloo.rendezvous.RedisStore(
            self._ip_address, int(self._redis_port))
        redis_store.authorize(ray_constants.REDIS_DEFAULT_PASSWORD)
        self._store = redis_store
    elif store_type == "file":
        store_name = get_store_name(self._group_name)
        store_path = gloo_util.get_gloo_store_path(store_name)
        if self._context.rank == 0:
            # Rank 0 owns the directory: create it, or recreate it when it
            # still holds entries.
            if not os.path.exists(store_path):
                os.makedirs(store_path)
            elif os.listdir(store_path):
                # A single listing is enough; the original checked twice.
                shutil.rmtree(store_path)
                os.makedirs(store_path)
        else:
            # Other ranks poll until the directory shows up.
            while not os.path.exists(store_path):
                time.sleep(0.1)
        # Note: multi-machines needs a shared NFS.
        file_store = pygloo.rendezvous.FileStore(store_path)
        self._store = pygloo.rendezvous.PrefixStore(self._group_name,
                                                    file_store)
    elif store_type == "hash":
        raise NotImplementedError("No implementation for hash store.")
    else:
        raise RuntimeError(
            "Unrecognized store type: {}.".format(store_type))
def meet(self, timeout_s=180):
    """Look up the named store actor, retrying until a timeout expires.

    The actor handle is stored in ``self._store``. A failed lookup
    (``ValueError`` from ``ray.get_actor``) is retried after one second.

    Args:
        timeout_s (int): timeout in seconds; must be positive.

    Returns:
        None

    Raises:
        ValueError: if ``timeout_s`` is not positive.
        RuntimeError: if ``self._store`` is still unset after the retries.
    """
    if timeout_s <= 0:
        raise ValueError("The 'timeout' argument must be positive. "
                         "Got '{}'.".format(timeout_s))

    self._store_name = get_store_name(self._store_key)

    budget = datetime.timedelta(seconds=timeout_s)
    waited = datetime.timedelta(seconds=0)
    began_at = datetime.datetime.now()
    while waited < budget:
        try:
            logger.debug("Trying to meet at the store '{}'".format(
                self._store_name))
            self._store = ray.get_actor(self._store_name)
        except ValueError:
            # The actor is not registered yet; wait and retry.
            logger.debug("Failed to meet at the store '{}'."
                         "Trying again...".format(self._store_name))
            time.sleep(1)
            waited = datetime.datetime.now() - began_at
        else:
            logger.debug("Successful rendezvous!")
            break

    if not self._store:
        raise RuntimeError("Unable to meet other processes "
                           "at the rendezvous store. If you are using "
                           "P2P communication, please check if tensors "
                           "are put in the correct GPU. ")
def clean_up():
    """Kill the named store actors for a fixed list of candidate groups.

    Builds communicator keys (collective and point-to-point) for a
    hard-coded set of group names, then kills every store actor that
    ``ray.get_actor`` can find under the matching store name.
    """
    candidate_groups = ["default", "test", "123?34!", "default2", "random"]
    candidate_groups += [str(idx) for idx in range(10)]
    max_world_size = 4

    group_keys = []
    for group in candidate_groups:
        suffix = "@" + group
        for device_ids in ([0], [0, 1], [1, 0]):
            group_keys.append(
                _get_comm_key_from_devices(device_ids) + suffix)
        # Every ordered rank pair (src < dst), always on GPU index 0.
        for src in range(max_world_size):
            for dst in range(src + 1, max_world_size):
                group_keys.append(
                    _get_comm_key_send_recv(src, 0, dst, 0) + suffix)

    for group_key in group_keys:
        store_name = get_store_name(group_key)
        try:
            handle = ray.get_actor(store_name)
        except ValueError:
            # No actor is registered under this name.
            continue
        if not handle:
            continue
        logger.debug(
            "Killing actor with group_key: '{}' and store: '{}'.".format(
                group_key, store_name))
        ray.kill(handle)
def destroy_group(self):
    """Tear down the group: drop the GLOO context and the file store.

    If a GLOO context exists, a barrier is issued on it before it is
    released. Rank 0 removes the file-store directory when the rendezvous
    uses a "file" store. The parent class teardown runs last.
    """
    context = self._gloo_context
    if context is not None:
        pygloo.barrier(context)
        # Dropping the reference destroys the communicator.
        self._gloo_context = None

    owns_file_store = (self.rank == 0
                       and self._rendezvous.store_type == "file")
    if owns_file_store:
        store_dir = gloo_util.get_gloo_store_path(
            get_store_name(self._group_name))
        if os.path.exists(store_dir):
            shutil.rmtree(store_dir)

    super(GLOOGroup, self).destroy_group()
def _destroy_store(group_key):
    """Kill the Ray named actor that serves as the KV store for a group.

    Args:
        group_key (str): the unique key used to derive the store name.

    Returns:
        None
    """
    ray.kill(ray.get_actor(get_store_name(group_key)))
def _generate_nccl_uid(self, key):
    """Create an NCCL unique ID and publish it through a named actor store.

    A detached ``NCCLUniqueIDStore`` actor is created under the store name
    derived from ``key``, and the new ID is written into it. The store
    needs to be garbage collected when destroying the collective group.

    Args:
        key (str): the key from which the store name is derived.

    Returns:
        NCCLUniqueID (str): NCCL unique ID.
    """
    nccl_uid = nccl_util.get_nccl_unique_id()
    actor_name = get_store_name(key)

    # Imported here to avoid a potential circular dependency in ray/actor.py
    from ray.util.collective.util import NCCLUniqueIDStore

    uid_store = NCCLUniqueIDStore.options(
        name=actor_name, lifetime="detached").remote(actor_name)
    # Block until the ID has been written to the store.
    ray.get([uid_store.set_id.remote(nccl_uid)])
    return nccl_uid