Exemple #1
0
def clean_up():
    """Kill the named store actors for a fixed set of communicator keys.

    For every known group name, a key is built for each device list and for
    each ordered rank pair ``(i, j)`` with ``i < j`` below the maximum world
    size. Each key is suffixed with ``"@" + name``. Every resulting store
    name is looked up with ``ray.get_actor``; actors that exist are killed,
    names that do not resolve are skipped.
    """
    world_size_limit = 4
    names = ["default", "test", "123?34!", "default2", "random"]
    names += [str(n) for n in range(10)]

    keys = []
    for group in names:
        # Collective communicator keys, one per device list.
        keys.extend(
            _get_comm_key_from_devices(dev) + "@" + group
            for dev in ([0], [0, 1], [1, 0]))
        # Point-to-point keys, one per rank pair with src < dst.
        keys.extend(
            _get_comm_key_send_recv(src, 0, dst, 0) + "@" + group
            for src in range(world_size_limit)
            for dst in range(src + 1, world_size_limit))

    message = "Killing actor with group_key: '{}' and store: '{}'."
    for key in keys:
        store = get_nccl_store_name(key)
        try:
            handle = ray.get_actor(store)
        except ValueError:
            # No actor is registered under this name; nothing to kill.
            continue
        if not handle:
            continue
        logger.debug(message.format(key, store))
        ray.kill(handle)
Exemple #2
0
    def meet(self, timeout_s=180):
        """Meet at the named actor store.

        Resolves the store name from ``self._group_name`` and polls
        ``ray.get_actor`` once per second until the actor is found or
        ``timeout_s`` seconds have elapsed. On success the actor handle is
        saved in ``self._store``.

        Args:
            timeout_s: timeout in seconds; must be positive.

        Return:
            None

        Raises:
            ValueError: if ``timeout_s`` is not positive.
            RuntimeError: if the store could not be found before the timeout.
        """
        if timeout_s <= 0:
            raise ValueError("The 'timeout' argument must be positive. "
                             "Got '{}'.".format(timeout_s))
        self._store_name = get_nccl_store_name(self._group_name)
        # Use a monotonic clock so that wall-clock adjustments can neither
        # shorten nor extend the timeout.
        start_time = time.monotonic()
        elapsed = 0.0
        # Track this call's result locally: relying on ``self._store`` would
        # let a handle left over from an earlier call hide a timeout.
        store = None
        while elapsed < timeout_s:
            try:
                logger.debug("Trying to meet at the store '{}'".format(
                    self._store_name))
                store = ray.get_actor(self._store_name)
            except ValueError:
                logger.debug("Failed to meet at the store '{}'. "
                             "Trying again...".format(self._store_name))
                time.sleep(1)
                elapsed = time.monotonic() - start_time
                continue
            logger.debug("Successful rendezvous!")
            break
        if not store:
            raise RuntimeError("Unable to meet other processes "
                               "at the rendezvous store.")
        self._store = store
Exemple #3
0
    def create_collective_group(self, backend, world_size, rank, group_name):
        """The entry to create new collective groups in the manager.

        Put the registration and the group information into the manager
        metadata as well.

        For the NCCL backend, rank 0 first generates an NCCL unique ID and
        stores it in a detached named actor (``NCCLUniqueIDStore``) whose
        name is derived from ``group_name``. Every rank then creates an
        ``NCCLGroup`` and records it in both lookup maps.

        Args:
            backend: a value accepted by ``types.Backend``.
            world_size: passed through to ``NCCLGroup``.
            rank: rank of the caller; rank 0 also creates the ID store.
            group_name: name under which the group is registered.

        Returns:
            The group registered under ``group_name``.

        Raises:
            NotImplementedError: if the backend is MPI.
        """
        backend = types.Backend(backend)
        if backend == types.Backend.MPI:
            raise NotImplementedError()
        elif backend == types.Backend.NCCL:
            # create the ncclUniqueID
            if rank == 0:
                # availability has been checked before entering here.
                group_uid = nccl_util.get_nccl_unique_id()
                store_name = get_nccl_store_name(group_name)
                # Avoid a potential circular dependency in ray/actor.py
                from ray.util.collective.util import NCCLUniqueIDStore
                store = NCCLUniqueIDStore.options(
                    name=store_name, lifetime="detached").remote(store_name)
                # ray.get (unlike ray.wait) re-raises an error from set_id,
                # so a failed publish is reported here instead of leaving
                # the store without an ID.
                ray.get([store.set_id.remote(group_uid)])

            logger.debug("creating NCCL group: '{}'".format(group_name))
            g = NCCLGroup(world_size, rank, group_name)
            self._name_group_map[group_name] = g
            self._group_name_map[g] = group_name
        return self._name_group_map[group_name]
Exemple #4
0
def clean_up():
    """Kill the named store actor of every known group name, if it exists.

    A ``ValueError`` raised while resolving the store name or looking up the
    actor means there is nothing to kill for that group name.
    """
    names = ["default", "test", "123?34!", "default2", "random"]
    names += [str(n) for n in range(10)]
    for name in names:
        try:
            handle = ray.get_actor(get_nccl_store_name(name))
        except ValueError:
            continue
        if handle:
            ray.kill(handle)
Exemple #5
0
    def _destroy_store(group_key):
        """Kill the Ray named actor that backs the KV store of a group.

        Args:
            group_key (str): the unique key from which the store name is
                derived.

        Returns:
            None
        """
        # The actor is force-killed with ray.kill; no __ray_terminate__
        # call is made first.
        ray.kill(ray.get_actor(get_nccl_store_name(group_key)))
Exemple #6
0
def clean_up():
    """Kill the named store actors of all known groups and p2p groups.

    The candidate names are the base group names followed by one
    ``<name>_<i>_<j>`` entry for every base name and every rank pair with
    ``i <= j`` below the maximum world size. Names that do not resolve to
    an actor are skipped.
    """
    world_size_limit = 4
    base_names = ["default", "test", "123?34!", "default2", "random"]
    base_names += [str(n) for n in range(10)]
    pair_names = [
        "{}_{}_{}".format(base, low, high)
        for base in base_names
        for low in range(world_size_limit)
        for high in range(low, world_size_limit)
    ]
    for name in base_names + pair_names:
        store = get_nccl_store_name(name)
        try:
            handle = ray.get_actor(store)
        except ValueError:
            # No actor is registered under this store name.
            continue
        if handle:
            ray.kill(handle)
Exemple #7
0
    def _generate_nccl_uid(self, key):
        """Create an NCCL unique ID and publish it through a named actor.

        A detached ``NCCLUniqueIDStore`` actor is started under the store
        name derived from ``key`` and the new ID is written to it. This
        method blocks until the write has completed. The store is not
        removed here; it has to be cleaned up when the collective group is
        destroyed.

        Args:
            key (str): the key from which the store name is derived.

        Returns:
            The NCCL unique ID returned by ``nccl_util.get_nccl_unique_id``.
        """
        # Imported locally to avoid a potential circular dependency in
        # ray/actor.py.
        from ray.util.collective.util import NCCLUniqueIDStore

        uid = nccl_util.get_nccl_unique_id()
        name = get_nccl_store_name(key)
        store_options = NCCLUniqueIDStore.options(
            name=name, lifetime="detached")
        id_store = store_options.remote(name)
        pending = id_store.set_id.remote(uid)
        ray.get([pending])
        return uid
Exemple #8
0
    def destroy_collective_group(self, group_name):
        """Group destructor.

        Removes the group from the manager's lookup maps. For an NCCL group
        on rank 0 it also terminates and kills the named store actor. The
        group's communicator resources are then released.

        If releasing the named actor raises (for example ``ray.get_actor``
        cannot find it), the exception still propagates, but the
        communicator resources are released first. The group has already
        been removed from the maps by then, so no later call could release
        them.

        Args:
            group_name: name of the group to destroy. A warning is logged
                and nothing else happens if no such group exists.

        Returns:
            None
        """
        if not self.is_group_exist(group_name):
            logger.warning("The group '{}' does not exist.".format(group_name))
            return

        # release the collective group resource
        g = self._name_group_map[group_name]
        rank = g.rank
        backend = g.backend()

        # clean up the dicts
        del self._group_name_map[g]
        del self._name_group_map[group_name]
        try:
            if backend == types.Backend.NCCL and rank == 0:
                # release the named actor
                store_name = get_nccl_store_name(group_name)
                store = ray.get_actor(store_name)
                ray.wait([store.__ray_terminate__.remote()])
                ray.kill(store)
        finally:
            # Release the communicator resources even when the store
            # cleanup above fails.
            g.destroy_group()