def call(self, env, group):
    if _runtime.runtimeGetVersion() < 11000:
        raise RuntimeError("not supported in CUDA < 11.0")
    if not isinstance(group.ctype, _ThreadGroup):
        raise ValueError("group must be a valid cooperative group")
    _check_include(env, 'cg')
    return _Data(f'cg::sync({group.code})', _cuda_types.void)
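# --- Usage sketch (not part of the implementation): the builtin above is
# exposed as cupyx.jit.cg.sync() and lowers to cg::sync(group). A minimal
# example, assuming CuPy's public cupyx.jit API on a CUDA 11+ toolkit:
import cupy
from cupyx import jit

@jit.rawkernel()
def reversed_copy(x, y):
    smem = jit.shared_memory(cupy.int32, 32)
    block = jit.cg.this_thread_block()
    i = jit.threadIdx.x
    smem[i] = x[i]
    jit.cg.sync(block)  # generates cg::sync(<group>)
    y[i] = smem[31 - i]

x = cupy.arange(32, dtype=cupy.int32)
y = cupy.empty_like(x)
reversed_copy[1, 32](x, y)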
def call(self, env, group, step):
    if _runtime.runtimeGetVersion() < 11000:
        raise RuntimeError("not supported in CUDA < 11.0")
    _check_include(env, 'cg')
    if not isinstance(step, _Constant):
        raise ValueError('step must be a compile-time constant')
    return _Data(f'cg::wait_prior<{step.obj}>({group.code})',
                 _cuda_types.void)
def thread_index(self, env):
    """
    thread_index()

    3-Dimensional index of the thread within the launched block.
    """
    _check_include(env, 'cg')
    return _Data('thread_index()', _cuda_types.dim3)
def group_index(self, env):
    """
    group_index()

    3-Dimensional index of the block within the launched grid.
    """
    _check_include(env, 'cg')
    return _Data('group_index()', _cuda_types.dim3)
def is_valid(self, env):
    """
    is_valid()

    Returns whether the grid_group can synchronize.
    """
    _check_include(env, 'cg')
    return _Data('is_valid()', _cuda_types.bool_)
def thread_rank(self, env):
    """
    thread_rank()

    Rank of the calling thread within ``[0, num_threads)``.
    """
    _check_include(env, 'cg')
    return _Data('thread_rank()', _cuda_types.uint64)
def group_dim(self, env):
    """
    group_dim()

    Dimensions of the launched block in units of threads.
    """
    # although this is an alias of dim_threads, we need it for earlier 11.x
    _check_include(env, 'cg')
    return _Data('group_dim()', _cuda_types.dim3)
def size(self, env):
    """
    size()

    Total number of threads in the group.
    """
    # although this is an alias of num_threads, we need it for earlier 11.x
    _check_include(env, 'cg')
    return _Data('size()', _cuda_types.uint32)
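# --- Usage sketch (not part of the implementation): the accessors above
# become zero-argument device methods on the group object inside a JIT
# kernel. Names assume the public cupyx.jit.cg interface; a minimal
# example:
import cupy
from cupyx import jit

@jit.rawkernel()
def fill_ranks(out):
    block = jit.cg.this_thread_block()
    rank = block.thread_rank()  # emits thread_rank(), uint64
    n = block.size()            # emits size(), uint32
    if rank < n:
        out[rank] = rank

out = cupy.zeros(64, dtype=cupy.uint64)
fill_ranks[1, 64](out)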
def group_index(self, env):
    """
    group_index()

    3-Dimensional index of the block within the launched grid.
    """
    from cupyx.jit._interface import _Dim3  # avoid circular import
    self._check_cg_include(env)
    return _Data('group_index()', _Dim3())
def thread_index(self, env):
    """
    thread_index()

    3-Dimensional index of the thread within the launched block.
    """
    from cupyx.jit._interface import _Dim3  # avoid circular import
    self._check_cg_include(env)
    return _Data('thread_index()', _Dim3())
def dim_threads(self, env):
    """
    dim_threads()

    Dimensions of the launched block in units of threads.
    """
    if _runtime.runtimeGetVersion() < 11060:
        raise RuntimeError("dim_threads() is supported on CUDA 11.6+")
    _check_include(env, 'cg')
    return _Data('dim_threads()', _cuda_types.dim3)
def block_index(self, env):
    """
    block_index()

    3-Dimensional index of the block within the launched grid.
    """
    if _runtime.runtimeGetVersion() < 11060:
        raise RuntimeError("block_index() is supported on CUDA 11.6+")
    _check_include(env, 'cg')
    return _Data('block_index()', _cuda_types.dim3)
def num_blocks(self, env):
    """
    num_blocks()

    Total number of blocks in the group.
    """
    if _runtime.runtimeGetVersion() < 11060:
        raise RuntimeError("num_blocks() is supported on CUDA 11.6+")
    self._check_cg_include(env)
    return _Data('num_blocks()', _cuda_types.uint64)
def group_dim(self, env):
    """
    group_dim()

    Dimensions of the launched block in units of threads.
    """
    # although this is an alias of dim_threads, we need it for earlier 11.x
    from cupyx.jit._interface import _Dim3  # avoid circular import
    self._check_cg_include(env)
    return _Data('group_dim()', _Dim3())
def num_threads(self, env):
    """
    num_threads()

    Total number of threads in the group.
    """
    if _runtime.runtimeGetVersion() < 11060:
        raise RuntimeError("num_threads() is supported on CUDA 11.6+")
    _check_include(env, 'cg')
    return _Data('num_threads()', _cuda_types.uint32)
def block_rank(self, env):
    """
    block_rank()

    Rank of the calling block within ``[0, num_blocks)``.
    """
    if _runtime.runtimeGetVersion() < 11060:
        raise RuntimeError("block_rank() is supported on CUDA 11.6+")
    _check_include(env, 'cg')
    return _Data('block_rank()', _cuda_types.uint64)
def dim_threads(self, env):
    """
    dim_threads()

    Dimensions of the launched block in units of threads.
    """
    if _runtime.runtimeGetVersion() < 11060:
        raise RuntimeError("dim_threads() is supported on CUDA 11.6+")
    from cupyx.jit._interface import _Dim3  # avoid circular import
    self._check_cg_include(env)
    return _Data('dim_threads()', _Dim3())
def block_index(self, env):
    """
    block_index()

    3-Dimensional index of the block within the launched grid.
    """
    if _runtime.runtimeGetVersion() < 11060:
        raise RuntimeError("block_index() is supported on CUDA 11.6+")
    from cupyx.jit._interface import _Dim3  # avoid circular import
    self._check_cg_include(env)
    return _Data('block_index()', _Dim3())
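# --- Usage sketch (not part of the implementation): the CUDA 11.6+ grid
# members guarded above, read from a grid group. Grid groups must be
# launched as cooperative kernels; how that launch flag is set depends on
# the CuPy version, so only the kernel body is sketched here:
from cupyx import jit

@jit.rawkernel()
def grid_info(out):
    g = jit.cg.this_grid()
    if g.thread_rank() == 0:
        out[0] = g.num_blocks()  # total blocks in the grid (uint64)
        out[1] = g.block_rank()  # rank of the calling block (uint64)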
def call_const(self, env):
    if _runtime.is_hip:
        raise RuntimeError('cooperative group is not supported on HIP')
    if self.group_type == 'grid':
        if _runtime.runtimeGetVersion() < 11000:
            raise RuntimeError(
                "For pre-CUDA 11, the grid group has very limited "
                "functionality (only group.sync() works), and so we "
                "disable the grid group support to prepare the transition "
                "to support CUDA 11+ only.")
        cg_type = _GridGroup()
    elif self.group_type == 'thread_block':
        cg_type = _ThreadBlockGroup()
    else:
        # guard against unknown group types, which would otherwise leave
        # cg_type unbound and raise a NameError below
        raise ValueError(f'unsupported group type: {self.group_type}')
    return _Data(f'cg::this_{self.group_type}()', cg_type)
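# --- Usage sketch (not part of the implementation): call_const() above is
# what makes jit.cg.this_grid() / jit.cg.this_thread_block() usable inside
# a kernel; they lower to cg::this_grid() / cg::this_thread_block(). A
# grid-wide barrier sketch (CUDA 11+ per the guard above; the kernel must
# be launched cooperatively):
from cupyx import jit

@jit.rawkernel()
def two_phase(x, y, size):
    g = jit.cg.this_grid()
    tid = jit.grid(1)  # flattened global thread index
    if tid < size:
        y[tid] = x[tid] * 2
    g.sync()  # grid-wide barrier across all blocks
    if tid < size:
        x[tid] = y[(tid + 1) % size]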
def call(self, env, group, dst, dst_idx, src, src_idx, size, *,
         aligned_size=None):
    if _runtime.runtimeGetVersion() < 11010:
        # the overloaded version of memcpy_async that we use does not yet
        # exist in CUDA 11.0
        raise RuntimeError("not supported in CUDA < 11.1")
    _check_include(env, 'cg')
    _check_include(env, 'cg_memcpy_async')

    dst = _Data.init(dst, env)
    src = _Data.init(src, env)
    for arr in (dst, src):
        if not isinstance(
                arr.ctype, (_cuda_types.CArray, _cuda_types.Ptr)):
            raise TypeError('dst/src must be of array type.')
    dst = _compile._indexing(dst, dst_idx, env)
    src = _compile._indexing(src, src_idx, env)

    size = _compile._astype_scalar(
        # it's very unlikely that the size would exceed 2^32, so we just
        # pick uint32 for simplicity
        size, _cuda_types.uint32, 'same_kind', env)
    size = _Data.init(size, env)
    size_code = f'{size.code}'
    if aligned_size:
        if not isinstance(aligned_size, _Constant):
            raise ValueError(
                'aligned_size must be a compile-time constant')
        _check_include(env, 'cuda_barrier')
        size_code = (f'cuda::aligned_size_t<{aligned_size.obj}>'
                     f'({size_code})')
    return _Data(
        f'cg::memcpy_async({group.code}, &({dst.code}), '
        f'&({src.code}), {size_code})', _cuda_types.void)
def sync(self, env):
    self._check_cg_include(env)
    return _Data('sync()', _cuda_types.void)
def sync(self, env):
    _check_include(env, 'cg')
    return _Data('sync()', _cuda_types.void)
def call(self, env, group):
    if _runtime.runtimeGetVersion() < 11000:
        raise RuntimeError("not supported in CUDA < 11.0")
    _check_include(env, 'cg')
    return _Data(f'cg::wait({group.code})', _cuda_types.void)
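# --- Usage sketch (not part of the implementation): memcpy_async, wait,
# and wait_prior (all above) cooperate to stage data through shared
# memory. Names assume the public cupyx.jit.cg interface on CUDA 11.1+;
# the size argument is forwarded to cg::memcpy_async, which takes bytes
# in the underlying CUDA API (check the docs for your CuPy version):
import cupy
from cupyx import jit

@jit.rawkernel()
def staged_copy(x, y):
    smem = jit.shared_memory(cupy.int32, 64)
    block = jit.cg.this_thread_block()
    i = jit.threadIdx.x
    # collectively stage the whole 64-element int32 tile (256 bytes)
    jit.cg.memcpy_async(block, smem, 0, x, 0, 64 * 4)
    jit.cg.wait(block)  # block until all outstanding copies complete
    y[i] = smem[63 - i]

x = cupy.arange(64, dtype=cupy.int32)
y = cupy.empty_like(x)
staged_copy[1, 64](x, y)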