def _get_nccl_p2p_communicator(self, comm_key, my_gpu_idx, peer_rank,
                               peer_gpu_idx):
    """Create or retrieve an NCCL communicator for p2p tasks.

    Note(Hao): this function is not thread-safe now.

    Args:
        comm_key (str): communicator key.
        my_gpu_idx (int): the gpu index on the current process.
        peer_rank (int): the rank of the destination process.
        peer_gpu_idx (int): the gpu index on the peer process.

    Returns:
        communicator
    """
    if not comm_key:
        raise RuntimeError("Got empty communicator key.")

    # TODO(Hao): lock the _dev_comm_map here.
    if comm_key in self._dev_comm_map:
        return self._dev_comm_map[comm_key]

    # Note(Hao): this is a bit complex, so a short note here.
    # We need to consider three cases:
    # Case 1: src_rank != dst_rank, hence the send and recv happen on
    # different processes (actors/tasks); each process makes independent
    # collective calls and manages the corresponding communicators.
    # Case 2: src_rank == dst_rank and src_gpu_idx == dst_gpu_idx; in
    # this case, we simply throw a RuntimeError.
    # Case 3: src_rank == dst_rank and src_gpu_idx != dst_gpu_idx, which
    # means the send and recv will be called on the same process. We
    # DO NOT support this case for now. We would need to properly scope
    # (1) communicator creation and (2) send/recv calls using
    # groupStart() and groupEnd() to avoid deadlocks.
    if self.rank < peer_rank:
        my_p2p_rank = 0
    elif self.rank > peer_rank:
        my_p2p_rank = 1
    else:
        raise RuntimeError(
            "Send and recv happen on the same process! "
            "ray.util.collective does not support this case as of now. "
            "Alternatively, consider doing GPU to GPU memcpy?")

    group_key = self._generate_group_key(comm_key)
    if my_p2p_rank == 0:
        nccl_uid = self._generate_nccl_uid(group_key)
    else:
        rendezvous = Rendezvous(group_key)
        rendezvous.meet()
        nccl_uid = rendezvous.get_nccl_id()

    # create the p2p communicator
    with nccl_util.Device(my_gpu_idx):
        comm = nccl_util.create_nccl_communicator(2, nccl_uid, my_p2p_rank)
    stream = cupy.cuda.Stream.null  # Stream(non_blocking=True)
    self._dev_comm_map[comm_key] = [comm]
    self._dev_streams_map[comm_key] = [stream]
    return [comm]
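# Illustrative call pattern (a sketch, not an API of this module): both ends
# of a send/recv pair are expected to call _get_nccl_p2p_communicator with the
# same comm_key (e.g. one derived symmetrically from the two ranks and GPU
# indices -- an assumption here), so that the lower-rank process acts as p2p
# rank 0 and generates the NCCL UID while the higher-rank process fetches it
# through the Rendezvous:
#
#   # on the process with rank 0, sending from its GPU 0 to rank 1's GPU 1:
#   comms = self._get_nccl_p2p_communicator(comm_key, my_gpu_idx=0,
#                                            peer_rank=1, peer_gpu_idx=1)
#   # on the process with rank 1, using the *same* comm_key:
#   comms = self._get_nccl_p2p_communicator(comm_key, my_gpu_idx=1,
#                                            peer_rank=0, peer_gpu_idx=0)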
def _get_nccl_communicator(self):
    """Create or use a cached NCCL communicator for the collective task."""
    # TODO(Hao): later change this to use device keys and query from cache.
    # TODO(Hao): implement a thin wrapper.
    if not self._nccl_comm:
        self._nccl_comm = nccl_util.create_nccl_communicator(
            self.world_size, self.nccl_uid, self.rank)
    return self._nccl_comm
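# A minimal usage sketch (assumed usage, not part of this class's public
# surface): the communicator is created lazily on the first call and cached on
# the instance, so repeated collective calls reuse the same NCCL handle.
#
#   comm = self._get_nccl_communicator()          # first call: create + cache
#   assert comm is self._get_nccl_communicator()  # later calls: cache hit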
def _get_nccl_collective_communicator(self, comm_key, device_list):
    """Create or retrieve an NCCL communicator from cache.

    If the communicator is found in cache, return the communicator. If not,
    a communicator and a stream will be created and put in cache.

    TODO(Hao): this function is not thread-safe now.

    Args:
        comm_key (str): the key to query the communicator cache.
        device_list (List): a list of GPU devices of the current process
            that participate in the collective.

    Returns:
        communicator: the NCCL communicator corresponding to the devices.
    """
    if not comm_key:
        raise RuntimeError("Got empty communicator key.")
    for d in device_list:
        self._used_gpu_indices.add(d)

    # TODO(Hao): lock the _dev_comm_map here.
    if comm_key in self._dev_comm_map:
        return self._dev_comm_map[comm_key]

    group_key = self._generate_group_key(comm_key)
    if self.rank == 0:
        nccl_uid = self._generate_nccl_uid(group_key)
    else:
        rendezvous = Rendezvous(group_key)
        rendezvous.meet()
        nccl_uid = rendezvous.get_nccl_id()

    # Now create the communicators
    actual_world_size = len(device_list) * self.world_size
    comms = [None] * len(device_list)
    streams = [None] * len(device_list)
    events = [None] * len(device_list)
    nccl_util.groupStart()
    for i, device in enumerate(device_list):
        actual_rank = self.rank * len(device_list) + i
        with nccl_util.Device(device):
            comms[i] = nccl_util.create_nccl_communicator(
                actual_world_size, nccl_uid, actual_rank)
            # request a stream from the pool;
            # note the device_idx is an absolute index.
            streams[i] = get_stream_pool(device).get_stream()
            # TODO(Fu): double check the parameters
            events[i] = cupy.cuda.Event()
    nccl_util.groupEnd()
    # TODO(Fu): lock
    self._dev_comm_map[comm_key] = comms
    self._dev_streams_map[comm_key] = streams
    self._dev_event_map[comm_key] = events
    return comms
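# Illustrative rank layout (a sketch restating the arithmetic above with
# assumed numbers, not actual configuration): with world_size = 2 processes
# each owning device_list = [0, 1], the flat NCCL communicator has
# actual_world_size = 2 * 2 = 4 ranks, and the i-th local GPU of process
# `rank` gets actual_rank = rank * len(device_list) + i:
#
#   process rank 0: GPU 0 -> actual_rank 0, GPU 1 -> actual_rank 1
#   process rank 1: GPU 0 -> actual_rank 2, GPU 1 -> actual_rank 3
#
# Wrapping the per-device initializations in groupStart()/groupEnd() lets a
# single process drive multiple devices of the same communicator without
# deadlocking during setup.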