def test_copy(x, y):
    # Kernel body: stage 64 int32 values from ``x`` into shared memory using
    # two asynchronous 32-element copies, then write them out to ``y`` in two
    # phases, one per copy batch.
    #
    # ``test_aligned`` is not a parameter; it is read from the enclosing scope
    # (not visible in this block) and selects the explicitly aligned variants
    # of both the shared-memory allocation and the copies.

    # do two batches of copies to test relevant APIs
    if test_aligned:
        staging = jit.shared_memory(cupy.int32, 32*2, alignment=16)
    else:
        staging = jit.shared_memory(cupy.int32, 32*2)

    grp = jit.cg.this_thread_block()
    rank = grp.thread_rank()

    # Each batch covers 32 elements; int32 is 4 bytes, hence the 4*32 size.
    if test_aligned:
        # CuPy ensures x is 256B-aligned
        jit.cg.memcpy_async(
            grp, staging, 0, x, 0, 4*32, aligned_size=16)
        jit.cg.memcpy_async(
            grp, staging, 32, x, 32, 4*32, aligned_size=16)
    else:
        jit.cg.memcpy_async(
            grp, staging, 0, x, 0, 4*32)
        jit.cg.memcpy_async(
            grp, staging, 32, x, 32, 4*32)

    # NOTE(review): presumably this returns once at most one copy batch is
    # still outstanding, i.e. the first batch has landed -- confirm against
    # the cooperative-groups documentation.
    jit.cg.wait_prior(grp, 1)
    if rank < 32:
        y[rank] = staging[rank]

    # Wait on the group again before touching the second half of the buffer.
    jit.cg.wait(grp)
    if 32 <= rank and rank < 64:  # can't do "32 <= rank < 64" yet...
        y[rank] = staging[rank]
def f(x, y):
    # Kernel body: every thread stages one element of ``x`` in shared memory,
    # then reads back the element stored by its mirror thread in the same
    # block, so each block-sized chunk of ``x`` lands in ``y`` reversed.
    #
    # The shared buffer holds 32 int32 values.
    # NOTE(review): assumes the launch uses at most 32 threads per block --
    # confirm against the caller.
    tile = jit.shared_memory(numpy.int32, 32)

    lane = jit.threadIdx.x
    width = jit.blockDim.x
    block_id = jit.blockIdx.x

    # Global element index handled by this thread.
    gid = block_id * width + lane

    tile[lane] = x[gid]
    # All writes to the tile must be done before any mirrored read.
    jit.syncthreads()
    y[gid] = tile[width - lane - 1]
def reduction(x, y, size):
    # Kernel body: sum the first ``size`` elements of ``x`` into ``y[0]``.
    #
    # Each thread accumulates a strided partial sum, parks it in shared
    # memory, and thread 0 then adds up the per-thread partials serially.
    # Only ``threadIdx``/``blockDim`` are used for indexing, so the result is
    # a per-block sum written with a plain store.
    # NOTE(review): looks intended for a single-block launch -- confirm.
    #
    # The shared buffer holds 1024 float32 partials, one slot per thread.
    partials = jit.shared_memory(cupy.float32, 1024)

    lane = jit.threadIdx.x
    width = jit.blockDim.x

    # Per-thread partial sum over elements lane, lane + width, ...
    acc = cupy.float32(0)
    for k in range(lane, size, width):
        acc += x[k]

    partials[lane] = acc
    # Every partial must be visible before thread 0 starts combining them.
    jit.syncthreads()

    if lane == cupy.uint32(0):
        acc = cupy.float32(0)
        for k in range(width):
            acc += partials[k]
        y[0] = acc
def reduction(x, y, size):
    # Kernel body: sum the first ``size`` elements of ``x`` into ``y[0]``
    # across the whole launch grid.
    #
    # Each thread accumulates a partial sum with a stride of
    # blockDim.x * gridDim.x, parks it in shared memory, and thread 0 of each
    # block adds its block's partials and folds the block total into ``y[0]``
    # with an atomic add.
    # NOTE(review): since the result is accumulated with atomic_add, ``y[0]``
    # presumably has to be zeroed by the caller before launch -- confirm.
    #
    # The shared buffer holds 1024 float32 partials, one slot per thread.
    partials = jit.shared_memory(cupy.float32, 1024)

    lane = jit.threadIdx.x
    width = jit.blockDim.x

    # Global thread index and total number of threads in the grid.
    gid = jit.blockIdx.x * width + lane
    total_threads = width * jit.gridDim.x

    # Per-thread partial sum over elements gid, gid + total_threads, ...
    acc = cupy.float32(0)
    for k in range(gid, size, total_threads):
        acc += x[k]

    partials[lane] = acc
    # Every partial must be visible before thread 0 starts combining them.
    jit.syncthreads()

    if lane == cupy.uint32(0):
        acc = cupy.float32(0)
        for k in range(width):
            acc += partials[k]
        # Several blocks contribute to the same output slot.
        jit.atomic_add(y, 0, acc)