Ejemplo n.º 1
0
 def create_store(self, store_type):
     """Create the rendezvous store and keep it on ``self._store``.

     Args:
         store_type (str): the kind of store to create. "redis" and "file"
             are supported; "hash" is recognized but not implemented.

     Raises:
         NotImplementedError: if ``store_type`` is "hash".
         RuntimeError: if ``store_type`` is not a recognized store type.
     """
     if store_type == "redis":
         redisStore = pygloo.rendezvous.RedisStore(self._ip_address,
                                                   int(self._redis_port))
         redis_password = ray_constants.REDIS_DEFAULT_PASSWORD
         redisStore.authorize(redis_password)
         self._store = redisStore
     elif store_type == "file":
         store_name = get_store_name(self._group_name)
         store_path = gloo_util.get_gloo_store_path(store_name)
         if self._context.rank == 0:
             # Rank 0 prepares the directory: create it when missing, or
             # wipe and recreate it when it already holds entries.
             if not os.path.exists(store_path):
                 os.makedirs(store_path)
             elif os.listdir(store_path):
                 shutil.rmtree(store_path)
                 os.makedirs(store_path)
         else:
             # Every other rank polls until the directory exists.
             while not os.path.exists(store_path):
                 time.sleep(0.1)
         # Note: multi-machines needs a shared NFS.
         fileStore = pygloo.rendezvous.FileStore(store_path)
         self._store = pygloo.rendezvous.PrefixStore(
             self._group_name, fileStore)
     elif store_type == "hash":
         raise NotImplementedError("No implementation for hash store.")
     else:
         raise RuntimeError(
             "Unrecognized store type: {}.".format(store_type))
Ejemplo n.º 2
0
    def meet(self, timeout_s=180):
        """Meet at the named actor store.

        Looks up the named actor for this store, retrying once per second
        whenever the lookup raises ``ValueError``, until the lookup succeeds
        or ``timeout_s`` seconds have elapsed.

        Args:
            timeout_s (int): timeout in seconds; must be positive.

        Returns:
            None

        Raises:
            ValueError: if ``timeout_s`` is not positive.
            RuntimeError: if ``self._store`` is still unset once the wait
                is over.
        """
        if timeout_s <= 0:
            raise ValueError("The 'timeout' argument must be positive. "
                             "Got '{}'.".format(timeout_s))
        self._store_name = get_store_name(self._store_key)
        # Measure the wait on the monotonic clock so that wall-clock
        # adjustments cannot shorten or prolong the timeout.
        start_time = time.monotonic()
        elapsed = 0
        while elapsed < timeout_s:
            try:
                logger.debug("Trying to meet at the store '{}'".format(
                    self._store_name))
                self._store = ray.get_actor(self._store_name)
            except ValueError:
                logger.debug("Failed to meet at the store '{}'."
                             "Trying again...".format(self._store_name))
                time.sleep(1)
                elapsed = time.monotonic() - start_time
                continue
            logger.debug("Successful rendezvous!")
            break
        if not self._store:
            raise RuntimeError("Unable to meet other processes "
                               "at the rendezvous store. If you are using "
                               "P2P communication, please check if tensors "
                               "are put in the correct GPU. ")
Ejemplo n.º 3
0
def clean_up():
    """Kill every named store actor that matches a known group key.

    Builds the communicator keys for a fixed list of group names (both the
    collective keys for a few device lists and the send/recv keys for every
    ordered rank pair ``i < j`` below the maximum world size), resolves each
    key to a store name, and kills the actor registered under that name when
    one exists.
    """
    world_size_limit = 4
    names = ["default", "test", "123?34!", "default2", "random"]
    names += [str(n) for n in range(10)]

    keys = []
    for group_name in names:
        suffix = "@" + group_name
        # Collective communicator keys.
        for device_list in ([0], [0, 1], [1, 0]):
            keys.append(_get_comm_key_from_devices(device_list) + suffix)
        # Point-to-point communicator keys for each pair with src < dst.
        for src in range(world_size_limit):
            for dst in range(src + 1, world_size_limit):
                keys.append(_get_comm_key_send_recv(src, 0, dst, 0) + suffix)

    for group_key in keys:
        store_name = get_store_name(group_key)
        try:
            handle = ray.get_actor(store_name)
        except ValueError:
            # No actor is registered under this name; nothing to kill.
            continue
        if not handle:
            continue
        logger.debug(
            "Killing actor with group_key: '{}' and store: '{}'.".format(
                group_key, store_name))
        ray.kill(handle)
Ejemplo n.º 4
0
    def destroy_group(self):
        """Tear down this group: drop the GLOO context and the file store.

        When a GLOO context is held, a barrier is run on it before the
        reference is dropped. Rank 0 additionally removes the file-store
        directory when the rendezvous uses the "file" store type. The parent
        class's ``destroy_group`` is called last.
        """
        if self._gloo_context is not None:
            # Run a barrier on the context, then drop the communicator.
            pygloo.barrier(self._gloo_context)
            self._gloo_context = None

        owns_file_store = (self.rank == 0
                           and self._rendezvous.store_type == "file")
        if owns_file_store:
            path = gloo_util.get_gloo_store_path(
                get_store_name(self._group_name))
            if os.path.exists(path):
                shutil.rmtree(path)

        super(GLOOGroup, self).destroy_group()
Ejemplo n.º 5
0
    def _destroy_store(group_key):
        """Kill the Ray named actor that serves as the KV store.

        Args:
            group_key (str): the unique key used to look up the KV store.

        Returns:
            None
        """
        actor_name = get_store_name(group_key)
        actor_handle = ray.get_actor(actor_name)
        ray.kill(actor_handle)
Ejemplo n.º 6
0
    def _generate_nccl_uid(self, key):
        """Generate an NCCL unique ID and publish it in a named KV store.

        A detached Ray named actor is created to act as the KV store and the
        freshly generated ID is written into it. The store needs to be
        garbage collected when destroying the collective group.

        Args:
            key (str): the key from which the store's actor name is derived.

        Returns:
            The NCCL unique ID obtained from
            ``nccl_util.get_nccl_unique_id()``.
        """
        nccl_uid = nccl_util.get_nccl_unique_id()
        actor_name = get_store_name(key)
        # Imported locally to avoid a potential circular dependency in
        # ray/actor.py
        from ray.util.collective.util import NCCLUniqueIDStore
        detached_store = NCCLUniqueIDStore.options(
            name=actor_name, lifetime="detached")
        uid_store = detached_store.remote(actor_name)
        # Block until the ID has been written to the store.
        pending_write = uid_store.set_id.remote(nccl_uid)
        ray.get([pending_write])
        return nccl_uid