def __init__(self, world_size, rank, group_name):
    """Init an NCCL collective group.

    Args:
        world_size: total number of processes in this collective group.
        rank: rank of the current process within the group.
        group_name: unique name identifying this group.

    Raises:
        RuntimeError: if the NCCL build version is older than 2.0.
    """
    # Validate NCCL versions *before* touching any state so we fail fast
    # and never leave a partially constructed group behind. These checks
    # depend only on nccl_util, so running them first is safe.
    if nccl_util.get_nccl_build_version() < 2000:
        raise RuntimeError("NCCL in Ray requires NCCL >= 2.0.")
    if nccl_util.get_nccl_runtime_version() < 2704:
        # Older runtimes still work for collectives, but point-to-point
        # send/recv needs >= 2.7.4, so warn rather than raise.
        logger.warning("NCCL send/recv calls require NCCL>=2.7.4")

    super(NCCLGroup, self).__init__(world_size, rank, group_name)

    # Communicator and stream caches, keyed per device set.
    # TODO (Hao): we need a lock here...
    self._dev_comm_map = {}
    self._dev_streams_map = {}

    # Record the GPU indices this group has used so far.
    self._used_gpu_indices = set()
def __init__(self, world_size, rank, group_name):
    """Init an NCCL collective group.

    Args:
        world_size: total number of processes in this collective group.
        rank: rank of the current process within the group.
        group_name: unique name identifying this group.

    Raises:
        RuntimeError: if the NCCL build version is older than 2.0.
    """
    super(NCCLGroup, self).__init__(world_size, rank, group_name)
    # NCCL unique id shared by all ranks; populated below by
    # _init_nccl_unique_id().
    self._nccl_uid = None

    # Single cached communicator for this group.
    # TODO(Hao): change this to a be a cache
    self._nccl_comm = None

    # NCCL >= 2.0 is a hard requirement; send/recv additionally needs
    # >= 2.7.4, which only triggers a warning.
    if nccl_util.get_nccl_build_version() < 2000:
        raise RuntimeError("NCCL in Ray requires NCCL >= 2.0.")
    # TODO(Hao): check version here
    if nccl_util.get_nccl_runtime_version() < 2704:
        logger.warning("NCCL send/recv calls requires NCCL>=2.7.4")

    # Rendezvous with the other members of this group; meet() presumably
    # blocks until all ranks have checked in — confirm against Rendezvous.
    self._rendezvous = Rendezvous(self.group_name)
    self._rendezvous.meet()

    # Setup the nccl uid using the store
    self._init_nccl_unique_id()

    # Setup a tensor for barrier calls
    self._barrier_tensor = cupy.array([1])