def coop_syncwarp(res):
    """Tree-reduce the values 0..31 into ``res[0]`` using cooperative
    warp synchronization with progressively narrower masks.

    Each reduction step halves the number of active lanes and syncs only
    the lanes that still participate (mask shrinks 0xFFFFFFFF -> 0xFFFF
    -> 0xFF -> 0xF -> 0x3).

    NOTE(review): the 32-slot shared array and the masks assume this is
    launched with a single 32-thread warp — confirm at the launch site.
    """
    # One shared-memory slot per lane of the warp.
    sm = cuda.shared.array(32, int32)
    i = cuda.grid(1)
    sm[i] = i
    # Default syncwarp: all lanes participate.
    cuda.syncwarp()
    if i < 16:
        sm[i] = sm[i] + sm[i + 16]
    # Only the 16 surviving lanes need to synchronize from here on.
    cuda.syncwarp(0xFFFF)
    if i < 8:
        sm[i] = sm[i] + sm[i + 8]
    cuda.syncwarp(0xFF)
    if i < 4:
        sm[i] = sm[i] + sm[i + 4]
    cuda.syncwarp(0xF)
    if i < 2:
        sm[i] = sm[i] + sm[i + 2]
    cuda.syncwarp(0x3)
    if i == 0:
        # Final pairwise combine; only lane 0 writes the result.
        res[0] = sm[0] + sm[1]
def device_reduce_full_block(arr, partials, sm_partials):
    """
    Partially reduce `arr` into `partials` using `sm_partials` as working
    space.

    The algorithm goes like:

        array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                    block-0:  |   x |     |     |   x |     |
                    block-1:  |     |   x |     |     |   x |
                    block-2:  |     |     |   x |     |     |

    The array is divided into chunks of 128 (size of a threadblock).
    The threadblocks consumes the chunks in roundrobin scheduling.
    First, a threadblock loads a chunk into temp memory.  Then, all
    subsequent chunks are combined into the temp memory.

    Once all chunks are processed.  Inner-block reduction is performed
    on the temp memory.  So that, there will just be one scalar result
    per block.  The result from each block is stored to `partials` at
    the dedicated slot.

    NOTE(review): no bounds check on the first load — assumes the grid
    covers no more than `arr.size` threads ("full block" variant).
    """
    tid = cuda.threadIdx.x
    blkid = cuda.blockIdx.x
    blksz = cuda.blockDim.x
    gridsz = cuda.gridDim.x

    # block strided loop to compute the reduction
    start = tid + blksz * blkid
    stop = arr.size
    step = blksz * gridsz

    # load first value
    tmp = arr[start]
    # loop over all values in block-stride
    for i in range(start + step, stop, step):
        tmp = reduce_op(tmp, arr[i])

    cuda.syncthreads()
    # inner-warp reduction
    inner_warp_reduction(sm_partials, tmp)

    cuda.syncthreads()
    # at this point, only the first slot for each warp in sm_partials
    # is valid.

    # finish up block reduction
    # warning: this is assuming 4 warps.
    # assert numwarps == 4
    if tid < 2:
        # Combine warp 0 with warp 2, warp 1 with warp 3.
        sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
                                        sm_partials[tid + 2, 0])
    cuda.syncwarp()
    if tid == 0:
        # Thread 0 writes this block's scalar result to its slot.
        partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
def inner_warp_reduction(sm_partials, init):
    """
    Compute reduction within a single warp.

    Each lane stores `init` into its slot of this warp's row of
    `sm_partials`, then the row is tree-reduced with `reduce_op`,
    halving the active lane count each step.  On return, only
    ``sm_partials[warpid, 0]`` holds a valid (fully reduced) value.
    """
    tid = cuda.threadIdx.x
    warpid = tid // _WARPSIZE
    laneid = tid % _WARPSIZE

    # This warp's private row of the shared working space.
    sm_this = sm_partials[warpid, :]
    sm_this[laneid] = init
    # Ensure all lanes have published their value before reducing.
    cuda.syncwarp()

    width = _WARPSIZE // 2
    while width:
        if laneid < width:
            old = sm_this[laneid]
            # Fold the upper half of the active range into the lower half.
            sm_this[laneid] = reduce_op(old, sm_this[laneid + width])
        # Sync the whole warp between steps so reads see prior writes.
        cuda.syncwarp()
        width //= 2
def useful_syncwarp(ary):
    """Thread 0 publishes 42 into ``ary[0]``; after a full-mask warp
    barrier, every thread copies that value into its own slot.

    The syncwarp here is load-bearing: without it, threads other than 0
    could read ``ary[0]`` before the write lands.
    """
    tid = cuda.grid(1)
    if tid == 0:
        ary[0] = 42
    # Full-warp mask: every lane participates in the barrier.
    cuda.syncwarp(0xffffffff)
    ary[tid] = ary[0]
def useless_syncwarp_with_mask(ary):
    """Store each thread's global index into its own slot of ``ary``.

    The masked warp barrier beforehand has no effect on the result —
    each thread only touches its own element.
    """
    tid = cuda.grid(1)
    # Half-warp mask (lanes 0-15); the sync does not order anything here.
    cuda.syncwarp(0xFFFF)
    ary[tid] = tid
def useless_syncwarp(ary):
    """Store each thread's global index into its own slot of ``ary``.

    The warp barrier is unnecessary for correctness — no thread reads
    another thread's write.
    """
    tid = cuda.grid(1)
    # Default (full-warp) syncwarp; a no-op as far as the result goes.
    cuda.syncwarp()
    ary[tid] = tid