Example #1
class TestMemPool:
    @pytest.mark.skipif(runtime.is_hip,
                        reason='HIP does not support async allocator')
    @pytest.mark.skipif(driver._is_cuda_python()
                        and runtime.runtimeGetVersion() < 11020,
                        reason='cudaMemPool_t is supported since CUDA 11.2')
    @pytest.mark.skipif(not driver._is_cuda_python()
                        and driver.get_build_version() < 11020,
                        reason='cudaMemPool_t is supported since CUDA 11.2')
    @pytest.mark.skipif(runtime.deviceGetAttribute(
        runtime.cudaDevAttrMemoryPoolsSupported, 0) == 0,
                        reason='cudaMemPool_t is not supported on device 0')
    def test_mallocFromPoolAsync(self):
        # also test create/destroy a pool
        props = runtime.MemPoolProps(runtime.cudaMemAllocationTypePinned,
                                     runtime.cudaMemHandleTypeNone,
                                     runtime.cudaMemLocationTypeDevice,
                                     0)  # on device 0
        pool = runtime.memPoolCreate(props)
        assert pool > 0
        s = cupy.cuda.Stream()
        ptr = runtime.mallocFromPoolAsync(128, pool, s.ptr)
        assert ptr > 0
        runtime.freeAsync(ptr, s.ptr)
        runtime.memPoolDestroy(pool)
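
For context, the high-level way to route CuPy allocations through the same stream-ordered (cudaMallocAsync) pool is the experimental MemoryAsyncPool allocator. A minimal sketch, assuming CUDA 11.2+ and a device that reports cudaDevAttrMemoryPoolsSupported, as in the test above:

import cupy

# Sketch only: switch CuPy's default allocator to the stream-ordered
# (cudaMallocAsync) memory pool.
pool = cupy.cuda.MemoryAsyncPool()
cupy.cuda.set_allocator(pool.malloc)

a = cupy.arange(128)  # now served from the async memory pool
print(a.sum())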
Example #2
File: cg.py Project: takagi/cupy
    def call(self, env, group):
        if _runtime.runtimeGetVersion() < 11000:
            raise RuntimeError("not supported in CUDA < 11.0")
        if not isinstance(group.ctype, _ThreadGroup):
            raise ValueError("group must be a valid cooperative group")
        _check_include(env, 'cg')
        return _Data(f'cg::sync({group.code})', _cuda_types.void)
Example #3
def test_assumed_runtime_version():
    # When CUDA Python is enabled, CuPy calculates the CUDA runtime version
    # from NVRTC version. This test ensures that the assumption is correct
    # by running the same logic in non-CUDA Python environment.
    # When this fails, `runtime.runtimeGetVersion()` logic needs to be fixed.
    (major, minor) = nvrtc.getVersion()
    assert runtime.runtimeGetVersion() == major * 1000 + minor * 10
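
As a quick illustration of the encoding this test asserts, the runtime version integer can be decoded back into (major, minor) with integer arithmetic. decode_cuda_version below is only an illustrative helper, not a CuPy API:

def decode_cuda_version(version):
    # Inverse of major * 1000 + minor * 10 (e.g. 11020 -> (11, 2)).
    major, rest = divmod(version, 1000)
    return major, rest // 10

assert decode_cuda_version(11020) == (11, 2)
assert decode_cuda_version(12040) == (12, 4)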
Example #4
File: cg.py Project: takagi/cupy
    def call(self, env, group, step):
        if _runtime.runtimeGetVersion() < 11000:
            raise RuntimeError("not supported in CUDA < 11.0")
        _check_include(env, 'cg')
        if not isinstance(step, _Constant):
            raise ValueError('step must be a compile-time constant')
        return _Data(f'cg::wait_prior<{step.obj}>({group.code})',
                     _cuda_types.void)
Example #5
File: cg.py Project: takagi/cupy
    def block_rank(self, env):
        """
        block_rank()

        Rank of the calling block within ``[0, num_blocks)``.
        """
        if _runtime.runtimeGetVersion() < 11060:
            raise RuntimeError("block_rank() is supported on CUDA 11.6+")
        _check_include(env, 'cg')
        return _Data('block_rank()', _cuda_types.uint64)
Example #6
File: cg.py Project: takagi/cupy
    def num_threads(self, env):
        """
        num_threads()

        Total number of threads in the group.
        """
        if _runtime.runtimeGetVersion() < 11060:
            raise RuntimeError("num_threads() is supported on CUDA 11.6+")
        _check_include(env, 'cg')
        return _Data('num_threads()', _cuda_types.uint32)
Example #7
File: cg.py Project: takagi/cupy
    def dim_threads(self, env):
        """
        dim_threads()

        Dimensions of the launched block in units of threads.
        """
        if _runtime.runtimeGetVersion() < 11060:
            raise RuntimeError("dim_threads() is supported on CUDA 11.6+")
        _check_include(env, 'cg')
        return _Data('dim_threads()', _cuda_types.dim3)
Example #8
File: cg.py Project: takagi/cupy
    def block_index(self, env):
        """
        block_index()

        3-Dimensional index of the block within the launched grid.
        """
        if _runtime.runtimeGetVersion() < 11060:
            raise RuntimeError("block_index() is supported on CUDA 11.6+")
        _check_include(env, 'cg')
        return _Data('block_index()', _cuda_types.dim3)
Example #9
    def num_blocks(self, env):
        """
        num_blocks()

        Total number of blocks in the group.
        """
        if _runtime.runtimeGetVersion() < 11060:
            raise RuntimeError("num_blocks() is supported on CUDA 11.6+")
        self._check_cg_include(env)
        return _Data('num_blocks()', _cuda_types.uint64)
Example #10
    def dim_threads(self, env):
        """
        dim_threads()

        Dimensions of the launched block in units of threads.
        """
        if _runtime.runtimeGetVersion() < 11060:
            raise RuntimeError("dim_threads() is supported on CUDA 11.6+")
        from cupyx.jit._interface import _Dim3  # avoid circular import
        self._check_cg_include(env)
        return _Data('dim_threads()', _Dim3())
Example #11
    def block_index(self, env):
        """
        block_index()

        3-Dimensional index of the block within the launched grid.
        """
        if _runtime.runtimeGetVersion() < 11060:
            raise RuntimeError("block_index() is supported on CUDA 11.6+")
        from cupyx.jit._interface import _Dim3  # avoid circular import
        self._check_cg_include(env)
        return _Data('block_index()', _Dim3())
Example #12
def check_availability(name):
    if name not in _available_cuda_version:
        msg = 'No available version information specified for {}'.format(name)
        raise ValueError(msg)
    version_added, version_removed = _available_cuda_version[name]
    cuda_version = runtime.runtimeGetVersion()
    if version_added is not None and cuda_version < version_added:
        return False
    if version_removed is not None and cuda_version >= version_removed:
        return False
    return True
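
check_availability expects a module-level table mapping each feature name to the CUDA version range in which it exists. The entries below are purely illustrative (not CuPy's actual table) and only show the expected shape:

# Hypothetical table: name -> (version_added, version_removed), where None
# means "no bound on this side" and versions use the major*1000 + minor*10
# encoding returned by runtime.runtimeGetVersion().
_available_cuda_version = {
    'some_feature': (11020, None),    # added in CUDA 11.2, still present
    'legacy_feature': (None, 12000),  # removed in CUDA 12.0
}

if check_availability('some_feature'):
    pass  # safe to take the CUDA 11.2+ code path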
Example #13
File: cg.py Project: takagi/cupy
    def call_const(self, env):
        if _runtime.is_hip:
            raise RuntimeError('cooperative group is not supported on HIP')
        if self.group_type == 'grid':
            if _runtime.runtimeGetVersion() < 11000:
                raise RuntimeError(
                    "For pre-CUDA 11, the grid group has very limited "
                    "functionality (only group.sync() works), and so we "
                    "disable the grid group support to prepare the transition "
                    "to support CUDA 11+ only.")
            cg_type = _GridGroup()
        elif self.group_type == 'thread_block':
            cg_type = _ThreadBlockGroup()
        return _Data(f'cg::this_{self.group_type}()', cg_type)
Example #14
File: cg.py Project: takagi/cupy
    def call(self,
             env,
             group,
             dst,
             dst_idx,
             src,
             src_idx,
             size,
             *,
             aligned_size=None):
        if _runtime.runtimeGetVersion() < 11010:
            # the overloaded version of memcpy_async that we use does not yet
            # exist in CUDA 11.0
            raise RuntimeError("not supported in CUDA < 11.1")
        _check_include(env, 'cg')
        _check_include(env, 'cg_memcpy_async')

        dst = _Data.init(dst, env)
        src = _Data.init(src, env)
        for arr in (dst, src):
            if not isinstance(arr.ctype,
                              (_cuda_types.CArray, _cuda_types.Ptr)):
                raise TypeError('dst/src must be of array type.')
        dst = _compile._indexing(dst, dst_idx, env)
        src = _compile._indexing(src, src_idx, env)

        size = _compile._astype_scalar(
            # it's very unlikely that the size would exceed 2^32, so we just
            # pick uint32 for simplicity
            size,
            _cuda_types.uint32,
            'same_kind',
            env)
        size = _Data.init(size, env)
        size_code = f'{size.code}'

        if aligned_size:
            if not isinstance(aligned_size, _Constant):
                raise ValueError(
                    'aligned_size must be a compile-time constant')
            _check_include(env, 'cuda_barrier')
            size_code = (f'cuda::aligned_size_t<{aligned_size.obj}>'
                         f'({size_code})')
        return _Data(
            f'cg::memcpy_async({group.code}, &({dst.code}), '
            f'&({src.code}), {size_code})', _cuda_types.void)
Example #15
    def test_atomic_cas(self, dtype):
        if dtype == cupy.uint16:
            if (runtime.is_hip or runtime.runtimeGetVersion() < 10010
                    or int(device.get_compute_capability()) < 70):
                self.skipTest('not supported')

        @jit.rawkernel()
        def f(x, y, out):
            tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x
            if tid < x.size:
                # = y[tid] if out[tid] == x[tid] else out[tid]
                jit.atomic_cas(out, tid, x[tid], y[tid])

        x = cupy.arange(1024, dtype=dtype)
        y = x.copy()
        y[512:] = 0
        out = x.copy()
        out[:512] = 0
        f((32, ), (32, ), (x, y, out))
        expected = cupy.zeros_like(out)
        self._check(out, expected)
Example #16
def _syevd(a, UPLO, with_eigen_vector, overwrite_a=False):
    if UPLO not in ('L', 'U'):
        raise ValueError('UPLO argument must be \'L\' or \'U\'')

    # reject_float16=False for backward compatibility
    dtype, v_dtype = _util.linalg_common_type(a, reject_float16=False)
    real_dtype = dtype.char.lower()
    w_dtype = v_dtype.char.lower()

    # Note that cuSolver assumes fortran array
    v = a.astype(dtype, order='F', copy=not overwrite_a)

    m, lda = a.shape
    w = cupy.empty(m, real_dtype)
    dev_info = cupy.empty((), numpy.int32)
    handle = device.Device().cusolver_handle

    if with_eigen_vector:
        jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR
    else:
        jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR

    if UPLO == 'L':
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:  # UPLO == 'U'
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    global _cuda_runtime_version
    if _cuda_runtime_version < 0:
        _cuda_runtime_version = runtime.runtimeGetVersion()

    if not runtime.is_hip and _cuda_runtime_version >= 11010:
        if dtype.char not in 'fdFD':
            raise RuntimeError('Only float32, float64, complex64, and '
                               'complex128 are supported')
        type_v = _dtype.to_cuda_dtype(dtype)
        type_w = _dtype.to_cuda_dtype(real_dtype)
        params = cusolver.createParams()
        try:
            work_device_size, work_host_size = cusolver.xsyevd_bufferSize(
                handle, params, jobz, uplo, m, type_v, v.data.ptr, lda, type_w,
                w.data.ptr, type_v)
            work_device = cupy.empty(work_device_size, 'b')
            work_host = numpy.empty(work_host_size, 'b')
            cusolver.xsyevd(handle, params, jobz, uplo, m, type_v, v.data.ptr,
                            lda, type_w, w.data.ptr, type_v,
                            work_device.data.ptr, work_device_size,
                            work_host.ctypes.data, work_host_size,
                            dev_info.data.ptr)
        finally:
            cusolver.destroyParams(params)
        cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
            cusolver.xsyevd, dev_info)
    else:
        if dtype == 'f':
            buffer_size = cupy.cuda.cusolver.ssyevd_bufferSize
            syevd = cupy.cuda.cusolver.ssyevd
        elif dtype == 'd':
            buffer_size = cupy.cuda.cusolver.dsyevd_bufferSize
            syevd = cupy.cuda.cusolver.dsyevd
        elif dtype == 'F':
            buffer_size = cupy.cuda.cusolver.cheevd_bufferSize
            syevd = cupy.cuda.cusolver.cheevd
        elif dtype == 'D':
            buffer_size = cupy.cuda.cusolver.zheevd_bufferSize
            syevd = cupy.cuda.cusolver.zheevd
        else:
            raise RuntimeError('Only float32, float64, complex64, and '
                               'complex128 are supported')

        work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda,
                                w.data.ptr)
        work = cupy.empty(work_size, dtype)
        syevd(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr,
              work.data.ptr, work_size, dev_info.data.ptr)
        cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
            syevd, dev_info)

    return w.astype(w_dtype, copy=False), v.astype(v_dtype, copy=False)
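
For reference, the public entry point that builds on a syevd helper like this one is cupy.linalg.eigh; a minimal usage sketch:

import cupy

a = cupy.array([[2.0, 1.0],
                [1.0, 2.0]])              # symmetric input
w, v = cupy.linalg.eigh(a, UPLO='L')      # eigenvalues ascending, eigenvectors in columns
print(w)                                  # [1. 3.]
print(bool(cupy.allclose(a @ v, v * w)))  # True: A @ v_j == w_j * v_j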
Example #17
class TestCooperativeGroups:
    def test_thread_block_group(self):
        @jit.rawkernel()
        def test_thread_block(x):
            g = jit.cg.this_thread_block()
            if g.thread_rank() == 0:
                x[0] += 101
            if g.thread_rank() == 1:
                x[1] = g.size()
            # test dim3
            if g.thread_rank() == 2:
                g_idx = g.group_index()
                x[2], x[3], x[4] = g_idx.x, g_idx.y, g_idx.z
            if g.thread_rank() == 3:
                t_idx = g.thread_index()
                x[5], x[6], x[7] = t_idx.x, t_idx.y, t_idx.z
            if g.thread_rank() == 4:
                g_dim = g.group_dim()
                x[8], x[9], x[10] = g_dim.x, g_dim.y, g_dim.z
            g.sync()

        x = cupy.empty((16, ), dtype=cupy.int64)
        x[:] = -1
        test_thread_block[1, 32](x)
        assert x[0] == 100
        assert x[1] == 32
        assert (x[2], x[3], x[4]) == (0, 0, 0)
        assert (x[5], x[6], x[7]) == (3, 0, 0)
        assert (x[8], x[9], x[10]) == (32, 1, 1)
        assert (x[11:] == -1).all()

    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11060 or
        (cupy.cuda.driver._is_cuda_python() and cupy.cuda.nvrtc.getVersion() <
         (11, 6)),
        reason='not supported until CUDA 11.6')
    def test_thread_block_group_cu116_new_APIs(self):
        @jit.rawkernel()
        def test_thread_block(x):
            g = jit.cg.this_thread_block()
            if g.thread_rank() == 0:
                x[0] = g.num_threads()
            if g.thread_rank() == 1:
                d_th = g.dim_threads()
                x[1], x[2], x[3] = d_th.x, d_th.y, d_th.z
            g.sync()

        x = cupy.empty((16, ), dtype=cupy.int64)
        x[:] = -1
        test_thread_block[1, 32](x)
        assert x[0] == 32
        assert (x[1], x[2], x[3]) == (32, 1, 1)
        assert (x[4:] == -1).all()

    @pytest.mark.skipif(runtime.runtimeGetVersion() < 11000,
                        reason='we do not support it')
    @pytest.mark.skipif(
        runtime.deviceGetAttribute(runtime.cudaDevAttrCooperativeLaunch,
                                   0) == 0,
        reason='cooperative launch is not supported on device 0')
    def test_grid_group(self):
        @jit.rawkernel()
        def test_grid(x):
            g = jit.cg.this_grid()
            if g.thread_rank() == 0:
                x[0] = g.is_valid()
            if g.thread_rank() == 1:
                x[1] = g.size()
            if g.thread_rank() == 32:  # on the 2nd group
                # Note: this is not yet possible...
                # x[2], x[3], x[4] == g.group_dim()
                g_dim = g.group_dim()
                x[2], x[3], x[4] = g_dim.x, g_dim.y, g_dim.z
            g.sync()  # this should just work!

        x = cupy.empty((16, ), dtype=cupy.uint64)
        x[:] = -1  # = 2**64-1
        test_grid[2, 32](x)
        assert x[0] == 1
        assert x[1] == 64
        assert (x[2], x[3], x[4]) == (2, 1, 1)
        assert (x[5:] == 2**64 - 1).all()

    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11060 or
        (cupy.cuda.driver._is_cuda_python() and cupy.cuda.nvrtc.getVersion() <
         (11, 6)),
        reason='not supported until CUDA 11.6')
    @pytest.mark.skipif(
        runtime.deviceGetAttribute(runtime.cudaDevAttrCooperativeLaunch,
                                   0) == 0,
        reason='cooperative launch is not supported on device 0')
    def test_grid_group_cu116_new_APIs(self):
        @jit.rawkernel()
        def test_grid(x):
            g = jit.cg.this_grid()
            if g.thread_rank() == 1:
                x[1] = g.num_threads()
            if g.thread_rank() == 32:
                g_dim = g.dim_blocks()
                x[2], x[3], x[4] = g_dim.x, g_dim.y, g_dim.z
            if g.thread_rank() == 33:  # on the 2nd block
                x[5] = g.block_rank()
            if g.thread_rank() == 2:
                x[6] = g.num_blocks()
            if g.thread_rank() == 34:  # on the 2nd block
                b_idx = g.block_index()
                x[7], x[8], x[9] = b_idx.x, b_idx.y, b_idx.z
            g.sync()  # this should just work!

        x = cupy.empty((16, ), dtype=cupy.uint64)
        x[:] = -1  # = 2**64-1
        test_grid[2, 32](x)
        assert x[1] == 64
        assert (x[2], x[3], x[4]) == (2, 1, 1)
        assert x[5] == 1
        assert x[6] == 2
        assert (x[7], x[8], x[9]) == (1, 0, 0)
        assert (x[10:] == 2**64 - 1).all()
Example #18
class TestCooperativeGroups:

    def test_thread_block_group(self):
        @jit.rawkernel()
        def test_thread_block(x):
            g = jit.cg.this_thread_block()
            if g.thread_rank() == 0:
                x[0] += 101
            if g.thread_rank() == 1:
                x[1] = g.size()
            # test dim3
            if g.thread_rank() == 2:
                g_idx = g.group_index()
                x[2], x[3], x[4] = g_idx.x, g_idx.y, g_idx.z
            if g.thread_rank() == 3:
                t_idx = g.thread_index()
                x[5], x[6], x[7] = t_idx.x, t_idx.y, t_idx.z
            if g.thread_rank() == 4:
                g_dim = g.group_dim()
                x[8], x[9], x[10] = g_dim.x, g_dim.y, g_dim.z
            g.sync()

        x = cupy.empty((16,), dtype=cupy.int64)
        x[:] = -1
        test_thread_block[1, 32](x)
        assert x[0] == 100
        assert x[1] == 32
        assert (x[2], x[3], x[4]) == (0, 0, 0)
        assert (x[5], x[6], x[7]) == (3, 0, 0)
        assert (x[8], x[9], x[10]) == (32, 1, 1)
        assert (x[11:] == -1).all()

    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11060
        or (cupy.cuda.driver._is_cuda_python()
            and cupy.cuda.nvrtc.getVersion() < (11, 6)),
        reason='not supported until CUDA 11.6')
    def test_thread_block_group_cu116_new_APIs(self):
        @jit.rawkernel()
        def test_thread_block(x):
            g = jit.cg.this_thread_block()
            if g.thread_rank() == 0:
                x[0] = g.num_threads()
            if g.thread_rank() == 1:
                d_th = g.dim_threads()
                x[1], x[2], x[3] = d_th.x, d_th.y, d_th.z
            g.sync()

        x = cupy.empty((16,), dtype=cupy.int64)
        x[:] = -1
        test_thread_block[1, 32](x)
        assert x[0] == 32
        assert (x[1], x[2], x[3]) == (32, 1, 1)
        assert (x[4:] == -1).all()

    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11000,
        reason='we do not support it')
    @pytest.mark.skipif(runtime.deviceGetAttribute(
        runtime.cudaDevAttrCooperativeLaunch, 0) == 0,
        reason='cooperative launch is not supported on device 0')
    def test_grid_group(self):
        @jit.rawkernel()
        def test_grid(x):
            g = jit.cg.this_grid()
            if g.thread_rank() == 0:
                x[0] = g.is_valid()
            if g.thread_rank() == 1:
                x[1] = g.size()
            if g.thread_rank() == 32:  # on the 2nd group
                # Note: this is not yet possible...
                # x[2], x[3], x[4] == g.group_dim()
                g_dim = g.group_dim()
                x[2], x[3], x[4] = g_dim.x, g_dim.y, g_dim.z
            g.sync()  # this should just work!

        x = cupy.empty((16,), dtype=cupy.uint64)
        x[:] = -1  # = 2**64-1
        test_grid[2, 32](x)
        assert x[0] == 1
        assert x[1] == 64
        assert (x[2], x[3], x[4]) == (2, 1, 1)
        assert (x[5:] == 2**64-1).all()

    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11060
        or (cupy.cuda.driver._is_cuda_python()
            and cupy.cuda.nvrtc.getVersion() < (11, 6)),
        reason='not supported until CUDA 11.6')
    @pytest.mark.skipif(runtime.deviceGetAttribute(
        runtime.cudaDevAttrCooperativeLaunch, 0) == 0,
        reason='cooperative launch is not supported on device 0')
    def test_grid_group_cu116_new_APIs(self):
        @jit.rawkernel()
        def test_grid(x):
            g = jit.cg.this_grid()
            if g.thread_rank() == 1:
                x[1] = g.num_threads()
            if g.thread_rank() == 32:
                g_dim = g.dim_blocks()
                x[2], x[3], x[4] = g_dim.x, g_dim.y, g_dim.z
            if g.thread_rank() == 33:  # on the 2nd block
                x[5] = g.block_rank()
            if g.thread_rank() == 2:
                x[6] = g.num_blocks()
            if g.thread_rank() == 34:  # on the 2nd block
                b_idx = g.block_index()
                x[7], x[8], x[9] = b_idx.x, b_idx.y, b_idx.z
            g.sync()  # this should just work!

        x = cupy.empty((16,), dtype=cupy.uint64)
        x[:] = -1  # = 2**64-1
        test_grid[2, 32](x)
        assert x[1] == 64
        assert (x[2], x[3], x[4]) == (2, 1, 1)
        assert x[5] == 1
        assert x[6] == 2
        assert (x[7], x[8], x[9]) == (1, 0, 0)
        assert (x[10:] == 2**64-1).all()

    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11000,
        reason='we do not support it')
    @pytest.mark.skipif(runtime.deviceGetAttribute(
        runtime.cudaDevAttrCooperativeLaunch, 0) == 0,
        reason='cooperative launch is not supported on device 0')
    def test_cg_sync(self):
        @jit.rawkernel()
        def test_sync():
            b = jit.cg.this_thread_block()
            g = jit.cg.this_grid()
            jit.cg.sync(b)
            jit.cg.sync(g)

        test_sync[2, 64]()

    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11000,
        reason='not supported until CUDA 11.0')
    @pytest.mark.parametrize(
        'test_aligned', (True, False),
    )
    def test_cg_memcpy_async_wait_for_wait(self, test_aligned):
        @jit.rawkernel()
        def test_copy(x, y):
            # do two batches of copies to test relevant APIs
            if test_aligned:
                smem = jit.shared_memory(cupy.int32, 32*2, alignment=16)
            else:
                smem = jit.shared_memory(cupy.int32, 32*2)
            g = jit.cg.this_thread_block()
            tid = g.thread_rank()
            # int32 is 4 bytes
            if test_aligned:
                # CuPy ensures x is 256B-aligned
                jit.cg.memcpy_async(
                    g, smem, 0, x, 0, 4*32, aligned_size=16)
                jit.cg.memcpy_async(
                    g, smem, 32, x, 32, 4*32, aligned_size=16)
            else:
                jit.cg.memcpy_async(
                    g, smem, 0, x, 0, 4*32)
                jit.cg.memcpy_async(
                    g, smem, 32, x, 32, 4*32)
            jit.cg.wait_prior(g, 1)
            if tid < 32:
                y[tid] = smem[tid]
            jit.cg.wait(g)
            if 32 <= tid and tid < 64:  # can't do "32 <= tid < 64" yet...
                y[tid] = smem[tid]

        x = cupy.arange(64, dtype=cupy.int32)
        y = cupy.zeros(64, dtype=cupy.int32)
        test_copy[2, 64](x, y)
        assert (x == y).all()
Example #19
File: cg.py Project: takagi/cupy
    def call(self, env, group):
        if _runtime.runtimeGetVersion() < 11000:
            raise RuntimeError("not supported in CUDA < 11.0")
        _check_include(env, 'cg')
        return _Data(f'cg::wait({group.code})', _cuda_types.void)