class TestMemPool: @pytest.mark.skipif(runtime.is_hip, reason='HIP does not support async allocator') @pytest.mark.skipif(driver._is_cuda_python() and runtime.runtimeGetVersion() < 11020, reason='cudaMemPool_t is supported since CUDA 11.2') @pytest.mark.skipif(not driver._is_cuda_python() and driver.get_build_version() < 11020, reason='cudaMemPool_t is supported since CUDA 11.2') @pytest.mark.skipif(runtime.deviceGetAttribute( runtime.cudaDevAttrMemoryPoolsSupported, 0) == 0, reason='cudaMemPool_t is not supported on device 0') def test_mallocFromPoolAsync(self): # also test create/destroy a pool props = runtime.MemPoolProps(runtime.cudaMemAllocationTypePinned, runtime.cudaMemHandleTypeNone, runtime.cudaMemLocationTypeDevice, 0) # on device 0 pool = runtime.memPoolCreate(props) assert pool > 0 s = cupy.cuda.Stream() ptr = runtime.mallocFromPoolAsync(128, pool, s.ptr) assert ptr > 0 runtime.freeAsync(ptr, s.ptr) runtime.memPoolDestroy(pool)
def call(self, env, group): if _runtime.runtimeGetVersion() < 11000: raise RuntimeError("not supported in CUDA < 11.0") if not isinstance(group.ctype, _ThreadGroup): raise ValueError("group must be a valid cooperative group") _check_include(env, 'cg') return _Data(f'cg::sync({group.code})', _cuda_types.void)
def test_assumed_runtime_version(): # When CUDA Python is enabled, CuPy calculates the CUDA runtime version # from NVRTC version. This test ensures that the assumption is correct # by running the same logic in non-CUDA Python environment. # When this fails, `runtime.runtimeGetVersion()` logic needs to be fixed. (major, minor) = nvrtc.getVersion() assert runtime.runtimeGetVersion() == major * 1000 + minor * 10
def call(self, env, group, step): if _runtime.runtimeGetVersion() < 11000: raise RuntimeError("not supported in CUDA < 11.0") _check_include(env, 'cg') if not isinstance(step, _Constant): raise ValueError('step must be a compile-time constant') return _Data(f'cg::wait_prior<{step.obj}>({group.code})', _cuda_types.void)
def block_rank(self, env): """ block_rank() Rank of the calling block within ``[0, num_blocks)``. """ if _runtime.runtimeGetVersion() < 11060: raise RuntimeError("block_rank() is supported on CUDA 11.6+") _check_include(env, 'cg') return _Data('block_rank()', _cuda_types.uint64)
def num_threads(self, env): """ num_threads() Total number of threads in the group. """ if _runtime.runtimeGetVersion() < 11060: raise RuntimeError("num_threads() is supported on CUDA 11.6+") _check_include(env, 'cg') return _Data('num_threads()', _cuda_types.uint32)
def dim_threads(self, env): """ dim_threads() Dimensions of the launched block in units of threads. """ if _runtime.runtimeGetVersion() < 11060: raise RuntimeError("dim_threads() is supported on CUDA 11.6+") _check_include(env, 'cg') return _Data('dim_threads()', _cuda_types.dim3)
def block_index(self, env): """ block_index() 3-Dimensional index of the block within the launched grid. """ if _runtime.runtimeGetVersion() < 11060: raise RuntimeError("block_index() is supported on CUDA 11.6+") _check_include(env, 'cg') return _Data('block_index()', _cuda_types.dim3)
def num_blocks(self, env): """ num_blocks() Total number of blocks in the group. """ if _runtime.runtimeGetVersion() < 11060: raise RuntimeError("num_blocks() is supported on CUDA 11.6+") self._check_cg_include(env) return _Data('num_blocks()', _cuda_types.uint64)
def dim_threads(self, env): """ dim_threads() Dimensions of the launched block in units of threads. """ if _runtime.runtimeGetVersion() < 11060: raise RuntimeError("dim_threads() is supported on CUDA 11.6+") from cupyx.jit._interface import _Dim3 # avoid circular import self._check_cg_include(env) return _Data('dim_threads()', _Dim3())
def block_index(self, env): """ block_index() 3-Dimensional index of the block within the launched grid. """ if _runtime.runtimeGetVersion() < 11060: raise RuntimeError("block_index() is supported on CUDA 11.6+") from cupyx.jit._interface import _Dim3 # avoid circular import self._check_cg_include(env) return _Data('block_index()', _Dim3())
def check_availability(name): if name not in _available_cuda_version: msg = 'No available version information specified for {}'.name raise ValueError(msg) version_added, version_removed = _available_cuda_version[name] cuda_version = runtime.runtimeGetVersion() if version_added is not None and cuda_version < version_added: return False if version_removed is not None and cuda_version >= version_removed: return False return True
def call_const(self, env): if _runtime.is_hip: raise RuntimeError('cooperative group is not supported on HIP') if self.group_type == 'grid': if _runtime.runtimeGetVersion() < 11000: raise RuntimeError( "For pre-CUDA 11, the grid group has very limited " "functionality (only group.sync() works), and so we " "disable the grid group support to prepare the transition " "to support CUDA 11+ only.") cg_type = _GridGroup() elif self.group_type == 'thread_block': cg_type = _ThreadBlockGroup() return _Data(f'cg::this_{self.group_type}()', cg_type)
def call(self, env, group, dst, dst_idx, src, src_idx, size, *, aligned_size=None): if _runtime.runtimeGetVersion() < 11010: # the overloaded version of memcpy_async that we use does not yet # exist in CUDA 11.0 raise RuntimeError("not supported in CUDA < 11.1") _check_include(env, 'cg') _check_include(env, 'cg_memcpy_async') dst = _Data.init(dst, env) src = _Data.init(src, env) for arr in (dst, src): if not isinstance(arr.ctype, (_cuda_types.CArray, _cuda_types.Ptr)): raise TypeError('dst/src must be of array type.') dst = _compile._indexing(dst, dst_idx, env) src = _compile._indexing(src, src_idx, env) size = _compile._astype_scalar( # it's very unlikely that the size would exceed 2^32, so we just # pick uint32 for simplicity size, _cuda_types.uint32, 'same_kind', env) size = _Data.init(size, env) size_code = f'{size.code}' if aligned_size: if not isinstance(aligned_size, _Constant): raise ValueError( 'aligned_size must be a compile-time constant') _check_include(env, 'cuda_barrier') size_code = (f'cuda::aligned_size_t<{aligned_size.obj}>' f'({size_code})') return _Data( f'cg::memcpy_async({group.code}, &({dst.code}), ' f'&({src.code}), {size_code})', _cuda_types.void)
def test_atomic_cas(self, dtype): if dtype == cupy.uint16: if (runtime.is_hip or runtime.runtimeGetVersion() < 10010 or int(device.get_compute_capability()) < 70): self.skipTest('not supported') @jit.rawkernel() def f(x, y, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: # = y[tid] if out[tid] == x[tid] else out[tid] jit.atomic_cas(out, tid, x[tid], y[tid]) x = cupy.arange(1024, dtype=dtype) y = x.copy() y[512:] = 0 out = x.copy() out[:512] = 0 f((32, ), (32, ), (x, y, out)) expected = cupy.zeros_like(out) self._check(out, expected)
def _syevd(a, UPLO, with_eigen_vector, overwrite_a=False): if UPLO not in ('L', 'U'): raise ValueError('UPLO argument must be \'L\' or \'U\'') # reject_float16=False for backward compatibility dtype, v_dtype = _util.linalg_common_type(a, reject_float16=False) real_dtype = dtype.char.lower() w_dtype = v_dtype.char.lower() # Note that cuSolver assumes fortran array v = a.astype(dtype, order='F', copy=not overwrite_a) m, lda = a.shape w = cupy.empty(m, real_dtype) dev_info = cupy.empty((), numpy.int32) handle = device.Device().cusolver_handle if with_eigen_vector: jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR else: jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR if UPLO == 'L': uplo = cublas.CUBLAS_FILL_MODE_LOWER else: # UPLO == 'U' uplo = cublas.CUBLAS_FILL_MODE_UPPER global _cuda_runtime_version if _cuda_runtime_version < 0: _cuda_runtime_version = runtime.runtimeGetVersion() if not runtime.is_hip and _cuda_runtime_version >= 11010: if dtype.char not in 'fdFD': raise RuntimeError('Only float32, float64, complex64, and ' 'complex128 are supported') type_v = _dtype.to_cuda_dtype(dtype) type_w = _dtype.to_cuda_dtype(real_dtype) params = cusolver.createParams() try: work_device_size, work_host_sizse = cusolver.xsyevd_bufferSize( handle, params, jobz, uplo, m, type_v, v.data.ptr, lda, type_w, w.data.ptr, type_v) work_device = cupy.empty(work_device_size, 'b') work_host = numpy.empty(work_host_sizse, 'b') cusolver.xsyevd(handle, params, jobz, uplo, m, type_v, v.data.ptr, lda, type_w, w.data.ptr, type_v, work_device.data.ptr, work_device_size, work_host.ctypes.data, work_host_sizse, dev_info.data.ptr) finally: cusolver.destroyParams(params) cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed( cusolver.xsyevd, dev_info) else: if dtype == 'f': buffer_size = cupy.cuda.cusolver.ssyevd_bufferSize syevd = cupy.cuda.cusolver.ssyevd elif dtype == 'd': buffer_size = cupy.cuda.cusolver.dsyevd_bufferSize syevd = cupy.cuda.cusolver.dsyevd elif dtype == 'F': buffer_size = cupy.cuda.cusolver.cheevd_bufferSize syevd = cupy.cuda.cusolver.cheevd elif dtype == 'D': buffer_size = cupy.cuda.cusolver.zheevd_bufferSize syevd = cupy.cuda.cusolver.zheevd else: raise RuntimeError('Only float32, float64, complex64, and ' 'complex128 are supported') work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr) work = cupy.empty(work_size, dtype) syevd(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, work.data.ptr, work_size, dev_info.data.ptr) cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed( syevd, dev_info) return w.astype(w_dtype, copy=False), v.astype(v_dtype, copy=False)
class TestCooperativeGroups: def test_thread_block_group(self): @jit.rawkernel() def test_thread_block(x): g = jit.cg.this_thread_block() if g.thread_rank() == 0: x[0] += 101 if g.thread_rank() == 1: x[1] = g.size() # test dim3 if g.thread_rank() == 2: g_idx = g.group_index() x[2], x[3], x[4] = g_idx.x, g_idx.y, g_idx.z if g.thread_rank() == 3: t_idx = g.thread_index() x[5], x[6], x[7] = t_idx.x, t_idx.y, t_idx.z if g.thread_rank() == 4: g_dim = g.group_dim() x[8], x[9], x[10] = g_dim.x, g_dim.y, g_dim.z g.sync() x = cupy.empty((16, ), dtype=cupy.int64) x[:] = -1 test_thread_block[1, 32](x) assert x[0] == 100 assert x[1] == 32 assert (x[2], x[3], x[4]) == (0, 0, 0) assert (x[5], x[6], x[7]) == (3, 0, 0) assert (x[8], x[9], x[10]) == (32, 1, 1) assert (x[11:] == -1).all() @pytest.mark.skipif( runtime.runtimeGetVersion() < 11060 or (cupy.cuda.driver._is_cuda_python() and cupy.cuda.nvrtc.getVersion() < (11, 6)), reason='not supported until CUDA 11.6') def test_thread_block_group_cu116_new_APIs(self): @jit.rawkernel() def test_thread_block(x): g = jit.cg.this_thread_block() if g.thread_rank() == 0: x[0] = g.num_threads() if g.thread_rank() == 1: d_th = g.dim_threads() x[1], x[2], x[3] = d_th.x, d_th.y, d_th.z g.sync() x = cupy.empty((16, ), dtype=cupy.int64) x[:] = -1 test_thread_block[1, 32](x) assert x[0] == 32 assert (x[1], x[2], x[3]) == (32, 1, 1) assert (x[4:] == -1).all() @pytest.mark.skipif(runtime.runtimeGetVersion() < 11000, reason='we do not support it') @pytest.mark.skipif( runtime.deviceGetAttribute(runtime.cudaDevAttrCooperativeLaunch, 0) == 0, reason='cooperative launch is not supported on device 0') def test_grid_group(self): @jit.rawkernel() def test_grid(x): g = jit.cg.this_grid() if g.thread_rank() == 0: x[0] = g.is_valid() if g.thread_rank() == 1: x[1] = g.size() if g.thread_rank() == 32: # on the 2nd group # Note: this is not yet possible... # x[2], x[3], x[4] == g.group_dim() g_dim = g.group_dim() x[2], x[3], x[4] = g_dim.x, g_dim.y, g_dim.z g.sync() # this should just work! x = cupy.empty((16, ), dtype=cupy.uint64) x[:] = -1 # = 2**64-1 test_grid[2, 32](x) assert x[0] == 1 assert x[1] == 64 assert (x[2], x[3], x[4]) == (2, 1, 1) assert (x[5:] == 2**64 - 1).all() @pytest.mark.skipif( runtime.runtimeGetVersion() < 11060 or (cupy.cuda.driver._is_cuda_python() and cupy.cuda.nvrtc.getVersion() < (11, 6)), reason='not supported until CUDA 11.6') @pytest.mark.skipif( runtime.deviceGetAttribute(runtime.cudaDevAttrCooperativeLaunch, 0) == 0, reason='cooperative launch is not supported on device 0') def test_grid_group_cu116_new_APIs(self): @jit.rawkernel() def test_grid(x): g = jit.cg.this_grid() if g.thread_rank() == 1: x[1] = g.num_threads() if g.thread_rank() == 32: g_dim = g.dim_blocks() x[2], x[3], x[4] = g_dim.x, g_dim.y, g_dim.z if g.thread_rank() == 33: # on the 2nd block x[5] = g.block_rank() if g.thread_rank() == 2: x[6] = g.num_blocks() if g.thread_rank() == 34: # on the 2nd block b_idx = g.block_index() x[7], x[8], x[9] = b_idx.x, b_idx.y, b_idx.z g.sync() # this should just work! x = cupy.empty((16, ), dtype=cupy.uint64) x[:] = -1 # = 2**64-1 test_grid[2, 32](x) assert x[1] == 64 assert (x[2], x[3], x[4]) == (2, 1, 1) assert x[5] == 1 assert x[6] == 2 assert (x[7], x[8], x[9]) == (1, 0, 0) assert (x[10:] == 2**64 - 1).all()
class TestCooperativeGroups: def test_thread_block_group(self): @jit.rawkernel() def test_thread_block(x): g = jit.cg.this_thread_block() if g.thread_rank() == 0: x[0] += 101 if g.thread_rank() == 1: x[1] = g.size() # test dim3 if g.thread_rank() == 2: g_idx = g.group_index() x[2], x[3], x[4] = g_idx.x, g_idx.y, g_idx.z if g.thread_rank() == 3: t_idx = g.thread_index() x[5], x[6], x[7] = t_idx.x, t_idx.y, t_idx.z if g.thread_rank() == 4: g_dim = g.group_dim() x[8], x[9], x[10] = g_dim.x, g_dim.y, g_dim.z g.sync() x = cupy.empty((16,), dtype=cupy.int64) x[:] = -1 test_thread_block[1, 32](x) assert x[0] == 100 assert x[1] == 32 assert (x[2], x[3], x[4]) == (0, 0, 0) assert (x[5], x[6], x[7]) == (3, 0, 0) assert (x[8], x[9], x[10]) == (32, 1, 1) assert (x[11:] == -1).all() @pytest.mark.skipif( runtime.runtimeGetVersion() < 11060 or (cupy.cuda.driver._is_cuda_python() and cupy.cuda.nvrtc.getVersion() < (11, 6)), reason='not supported until CUDA 11.6') def test_thread_block_group_cu116_new_APIs(self): @jit.rawkernel() def test_thread_block(x): g = jit.cg.this_thread_block() if g.thread_rank() == 0: x[0] = g.num_threads() if g.thread_rank() == 1: d_th = g.dim_threads() x[1], x[2], x[3] = d_th.x, d_th.y, d_th.z g.sync() x = cupy.empty((16,), dtype=cupy.int64) x[:] = -1 test_thread_block[1, 32](x) assert x[0] == 32 assert (x[1], x[2], x[3]) == (32, 1, 1) assert (x[4:] == -1).all() @pytest.mark.skipif( runtime.runtimeGetVersion() < 11000, reason='we do not support it') @pytest.mark.skipif(runtime.deviceGetAttribute( runtime.cudaDevAttrCooperativeLaunch, 0) == 0, reason='cooperative launch is not supported on device 0') def test_grid_group(self): @jit.rawkernel() def test_grid(x): g = jit.cg.this_grid() if g.thread_rank() == 0: x[0] = g.is_valid() if g.thread_rank() == 1: x[1] = g.size() if g.thread_rank() == 32: # on the 2nd group # Note: this is not yet possible... # x[2], x[3], x[4] == g.group_dim() g_dim = g.group_dim() x[2], x[3], x[4] = g_dim.x, g_dim.y, g_dim.z g.sync() # this should just work! x = cupy.empty((16,), dtype=cupy.uint64) x[:] = -1 # = 2**64-1 test_grid[2, 32](x) assert x[0] == 1 assert x[1] == 64 assert (x[2], x[3], x[4]) == (2, 1, 1) assert (x[5:] == 2**64-1).all() @pytest.mark.skipif( runtime.runtimeGetVersion() < 11060 or (cupy.cuda.driver._is_cuda_python() and cupy.cuda.nvrtc.getVersion() < (11, 6)), reason='not supported until CUDA 11.6') @pytest.mark.skipif(runtime.deviceGetAttribute( runtime.cudaDevAttrCooperativeLaunch, 0) == 0, reason='cooperative launch is not supported on device 0') def test_grid_group_cu116_new_APIs(self): @jit.rawkernel() def test_grid(x): g = jit.cg.this_grid() if g.thread_rank() == 1: x[1] = g.num_threads() if g.thread_rank() == 32: g_dim = g.dim_blocks() x[2], x[3], x[4] = g_dim.x, g_dim.y, g_dim.z if g.thread_rank() == 33: # on the 2nd block x[5] = g.block_rank() if g.thread_rank() == 2: x[6] = g.num_blocks() if g.thread_rank() == 34: # on the 2nd block b_idx = g.block_index() x[7], x[8], x[9] = b_idx.x, b_idx.y, b_idx.z g.sync() # this should just work! x = cupy.empty((16,), dtype=cupy.uint64) x[:] = -1 # = 2**64-1 test_grid[2, 32](x) assert x[1] == 64 assert (x[2], x[3], x[4]) == (2, 1, 1) assert x[5] == 1 assert x[6] == 2 assert (x[7], x[8], x[9]) == (1, 0, 0) assert (x[10:] == 2**64-1).all() @pytest.mark.skipif( runtime.runtimeGetVersion() < 11000, reason='we do not support it') @pytest.mark.skipif(runtime.deviceGetAttribute( runtime.cudaDevAttrCooperativeLaunch, 0) == 0, reason='cooperative launch is not supported on device 0') def test_cg_sync(self): @jit.rawkernel() def test_sync(): b = jit.cg.this_thread_block() g = jit.cg.this_grid() jit.cg.sync(b) jit.cg.sync(g) test_sync[2, 64]() @pytest.mark.skipif( runtime.runtimeGetVersion() < 11000, reason='not supported until CUDA 11.0') @pytest.mark.parametrize( 'test_aligned', (True, False), ) def test_cg_memcpy_async_wait_for_wait(self, test_aligned): @jit.rawkernel() def test_copy(x, y): # do two batches of copies to test relevant APIs if test_aligned: smem = jit.shared_memory(cupy.int32, 32*2, alignment=16) else: smem = jit.shared_memory(cupy.int32, 32*2) g = jit.cg.this_thread_block() tid = g.thread_rank() # int32 is 4 bytes if test_aligned: # CuPy ensures x is 256B-aligned jit.cg.memcpy_async( g, smem, 0, x, 0, 4*32, aligned_size=16) jit.cg.memcpy_async( g, smem, 32, x, 32, 4*32, aligned_size=16) else: jit.cg.memcpy_async( g, smem, 0, x, 0, 4*32) jit.cg.memcpy_async( g, smem, 32, x, 32, 4*32) jit.cg.wait_prior(g, 1) if tid < 32: y[tid] = smem[tid] jit.cg.wait(g) if 32 <= tid and tid < 64: # can't do "32 <= tid < 64" yet... y[tid] = smem[tid] x = cupy.arange(64, dtype=cupy.int32) y = cupy.zeros(64, dtype=cupy.int32) test_copy[2, 64](x, y) assert (x == y).all()
def call(self, env, group): if _runtime.runtimeGetVersion() < 11000: raise RuntimeError("not supported in CUDA < 11.0") _check_include(env, 'cg') return _Data(f'cg::wait({group.code})', _cuda_types.void)