from math import sqrt

from numba import cuda, float64


def mdist_kernel(Q, S, out):
    """mean-adjusted Euclidean Distance naive kernel: nothing cached"""

    warpDim = cuda.blockDim.x // 32
    warpIdx = cuda.threadIdx.x // 32
    laneIdx = cuda.threadIdx.x % 32

    lower = cuda.blockIdx.x * warpDim + warpIdx
    stride = cuda.gridDim.x * warpDim

    # one warp per alignment position; grid-stride loop over all positions
    for position in range(lower, S.shape[0] - Q.shape[0] + 1, stride):

        # pass 1: mean of the current subsequence (butterfly reduction so
        # every lane ends up with the full sum)
        accum = float64(0)
        for index in range(laneIdx, Q.shape[0], 32):
            accum += S[position + index]

        for delta in [16, 8, 4, 2, 1]:
            accum += cuda.shfl_xor_sync(0xFFFFFFFF, accum, delta)

        mean = accum / Q.shape[0]

        # pass 2: squared distance against the mean-centered subsequence
        accum = float64(0)
        for index in range(laneIdx, Q.shape[0], 32):
            value = Q[index] - S[position + index] + mean
            accum += value * value

        for delta in [16, 8, 4, 2, 1]:
            value = cuda.shfl_down_sync(0xFFFFFFFF, accum, delta)
            accum += value

        # after shfl_down only lane 0 holds the complete sum
        if laneIdx == 0:
            out[position] = accum
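# A host-side launch sketch for the kernel above, assuming mdist_kernel is
# compiled with @cuda.jit (the decorator is not shown in the snippet). Array
# sizes, grid size, and block size below are illustrative; the block size is a
# multiple of 32 so that warpDim is a whole number of warps.
import numpy as np
from numba import cuda

Q = np.random.rand(128)          # query
S = np.random.rand(1 << 16)      # series to scan
d_Q = cuda.to_device(Q)
d_S = cuda.to_device(S)
d_out = cuda.device_array(S.shape[0] - Q.shape[0] + 1)

threads = 256                    # 8 warps per block, one warp per position
blocks = 80                      # the grid-stride loop covers the rest
mdist_kernel[blocks, threads](d_Q, d_S, d_out)
profile = d_out.copy_to_host()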
def CTAReduce(tid: int, x, storage, count: int, R_opid: int):
    """
    CUDA Warp reduction kernel. It is a device kernel to be called by other kernels.

    The data is read from the right segment recursively and reduced (by the R_Op
    operator) onto the left half. This continues while the offset is at least the
    warp size. Below the warp size, warp reduction is performed via
    `shfl_down_sync`, which halves the reduction space and sums the two halves
    at each step.

    Note:
        Efficient warp reduction occurs at input shapes of 2 ^ K.

    References:
        - Warp Primitives [https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/]

    Args:
        tid: CUDA thread index
        x: activation. Single float.
        storage: shared memory of size CTA_REDUCE_SIZE used for reduction in parallel threads.
        count: equivalent to num_rows, which is equivalent to alphabet_size (V+1)
        R_opid: Operator ID for reduction. See R_Op for more information.
    """
    storage[tid] = x
    cuda.syncthreads()

    # Fold the data in half with each pass
    offset = CTA_REDUCE_SIZE // 2
    while offset >= warp_size:
        if (tid + offset) < count and tid < offset:
            # Read from the right half and store to the left half.
            if R_opid == 0:
                x = rnnt_helper.add(x, storage[offset + tid])
            else:
                x = rnnt_helper.maximum(x, storage[offset + tid])

            storage[tid] = x

        cuda.syncthreads()
        offset = offset // 2

    offset = warp_size // 2
    while offset > 0:
        # warp reduction and sync
        shuff = cuda.shfl_down_sync(0xFFFFFFFF, x, offset)

        if (tid + offset < count) and (tid < offset):
            if R_opid == 0:
                x = rnnt_helper.add(x, shuff)
            else:
                x = rnnt_helper.maximum(x, shuff)

        offset = offset // 2

    return x
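# A sketch of a caller for CTAReduce, assuming it is compiled as a device
# function (@cuda.jit(device=True)), that CTA_REDUCE_SIZE equals the block size
# (128 here), and that rnnt_helper.add / rnnt_helper.maximum are importable
# device functions; those names and values belong to the surrounding module and
# are assumptions here. R_opid 0 selects addition.
import numpy as np
from numba import cuda, float64

CTA_REDUCE_SIZE = 128   # assumed block size expected by CTAReduce

@cuda.jit
def block_sum(rows, out):
    tid = cuda.threadIdx.x
    row = cuda.blockIdx.x
    storage = cuda.shared.array(CTA_REDUCE_SIZE, float64)
    x = rows[row, tid] if tid < rows.shape[1] else 0.0
    total = CTAReduce(tid, x, storage, rows.shape[1], 0)   # 0 -> add
    if tid == 0:
        out[row] = total

d_rows = cuda.to_device(np.random.rand(4, CTA_REDUCE_SIZE))
d_out = cuda.device_array(4)
block_sum[4, CTA_REDUCE_SIZE](d_rows, d_out)
# d_out.copy_to_host()[r] should match rows[r].sum() up to rounding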
def run_max(d_in, d_out):
    N = d_in.size

    # assuming that nthreads is >= 64
    # shape=0 declares a dynamic shared memory array; its size is supplied at
    # launch time
    smax = cuda.shared.array(shape=0, dtype=float64)

    gid = cuda.blockIdx.x * cuda.blockDim.x * 2 + cuda.threadIdx.x
    tid = cuda.threadIdx.x

    # each thread loads up to two elements and keeps the larger one;
    # out-of-range threads contribute the sentinel -1.0
    if gid < N:
        mm = d_in[gid]
        if gid + cuda.blockDim.x < N:
            mm = max(mm, d_in[gid + cuda.blockDim.x])
    else:
        mm = -1.0
    smax[tid] = mm
    cuda.syncthreads()

    # tree reduction in shared memory down to a single warp
    if cuda.blockDim.x >= 512 and tid < 256:
        smax[tid] = mm = max(mm, smax[tid + 256])
    cuda.syncthreads()
    if cuda.blockDim.x >= 256 and tid < 128:
        smax[tid] = mm = max(mm, smax[tid + 128])
    cuda.syncthreads()
    if cuda.blockDim.x >= 128 and tid < 64:
        smax[tid] = mm = max(mm, smax[tid + 64])
    cuda.syncthreads()

    if tid < 32:
        if cuda.blockDim.x >= 64:
            smax[tid] = mm = max(mm, smax[tid + 32])
        # finish within the warp; the mask narrows to the 2 * offset lanes
        # still participating
        for offset in [16, 8, 4, 2, 1]:
            mm = max(mm, cuda.shfl_down_sync((1 << 2 * offset) - 1, mm, offset))

    if tid == 0:
        d_out[cuda.blockIdx.x] = mm
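# A launch sketch for run_max, assuming it is decorated with @cuda.jit. Because
# the shared array is declared with shape=0, the dynamic shared memory size in
# bytes is passed in the launch configuration
# (kernel[blocks, threads, stream, shared_mem_bytes]); sizes here are
# illustrative.
import numpy as np
from numba import cuda

N = 1 << 20
threads = 256                                    # must be >= 64 for this kernel
blocks = (N + threads * 2 - 1) // (threads * 2)  # each thread loads two elements

d_in = cuda.to_device(np.random.rand(N))
d_out = cuda.device_array(blocks)
run_max[blocks, threads, 0, threads * 8](d_in, d_out)   # 8 bytes per float64
block_maxima = d_out.copy_to_host()
overall_max = block_maxima.max()   # final pass over per-block maxima on the host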
def zdist_kernel(Q, S, out, epsilon):
    """z-normalized Euclidean Distance naive kernel: nothing cached"""

    warpDim = cuda.blockDim.x // 32
    warpIdx = cuda.threadIdx.x // 32
    laneIdx = cuda.threadIdx.x % 32

    lower = cuda.blockIdx.x * warpDim + warpIdx
    stride = cuda.gridDim.x * warpDim

    for position in range(lower, S.shape[0] - Q.shape[0] + 1, stride):

        accum1 = float64(0)
        accum2 = float64(0)
        for index in range(laneIdx, Q.shape[0], 32):
            value = S[position + index]
            accum1 += value
            accum2 += value * value

        for delta in [16, 8, 4, 2, 1]:
            accum1 += cuda.shfl_xor_sync(0xFFFFFFFF, accum1, delta)
            accum2 += cuda.shfl_xor_sync(0xFFFFFFFF, accum2, delta)

        mean = accum1 / Q.shape[0]
        sigma = accum2 / Q.shape[0] - mean * mean
        sigma = sqrt(sigma) if sigma > 0.0 else epsilon

        accum = float64(0)
        for index in range(laneIdx, Q.shape[0], 32):
            value = Q[index] - (S[position + index] - mean) / sigma
            accum += value * value

        for delta in [16, 8, 4, 2, 1]:
            accum += cuda.shfl_down_sync(0xFFFFFFFF, accum, delta)

        if laneIdx == 0:
            out[position] = accum
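# A plain NumPy reference for checking zdist_kernel on the host: per position p
# it computes sum_i (Q[i] - (S[p+i] - mean) / sigma)**2 over the window, with
# the same epsilon fallback for zero-variance windows. This is a validation
# sketch, not part of the original module.
import numpy as np

def zdist_reference(Q, S, epsilon):
    m = Q.shape[0]
    out = np.empty(S.shape[0] - m + 1)
    for p in range(out.shape[0]):
        window = S[p:p + m]
        mean = window.mean()
        var = window.var()                       # accum2 / m - mean * mean
        sigma = np.sqrt(var) if var > 0.0 else epsilon
        out[p] = np.sum((Q - (window - mean) / sigma) ** 2)
    return out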
def warp_reduction(val):
    # val is a one-element array so the reduced sum can be written back in place
    offset = cuda.warpsize // 2
    while offset > 0:
        val[0] += cuda.shfl_down_sync(0xffffffff, val[0], offset)
        offset //= 2
def use_shfl_sync_down(ary, delta):
    i = cuda.grid(1)
    val = cuda.shfl_down_sync(0xffffffff, i, delta)
    ary[i] = val
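# A launch sketch for the kernel above, assuming it is decorated with
# @cuda.jit. Over a single warp, lane i receives the thread index of lane
# i + delta, while the last `delta` lanes fall outside the warp and keep their
# own value.
import numpy as np
from numba import cuda

d_ary = cuda.to_device(np.zeros(32, dtype=np.int64))
use_shfl_sync_down[1, 32](d_ary, 2)
print(d_ary.copy_to_host())   # [2 3 4 ... 31 30 31]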
def reduce_warp(value, mask):
    offset = 16   # i.e. WARPSIZE // 2
    while offset:
        value += cuda.shfl_down_sync(mask, value, offset)
        offset //= 2
    return value
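# A usage sketch, assuming reduce_warp is compiled with @cuda.jit(device=True):
# each warp sums its own 32 elements and lane 0 writes the warp's total (after
# shfl_down only lane 0 holds the full sum). The warp_sums kernel and the sizes
# below are illustrative.
import numpy as np
from numba import cuda

@cuda.jit
def warp_sums(data, out):
    i = cuda.grid(1)
    lane = cuda.threadIdx.x % 32
    val = data[i] if i < data.shape[0] else 0.0
    total = reduce_warp(val, 0xFFFFFFFF)   # all 32 lanes participate
    if lane == 0:
        out[i // 32] = total

d_data = cuda.to_device(np.ones(1024))
d_out = cuda.device_array(1024 // 32)
warp_sums[4, 256](d_data, d_out)
# every entry of d_out.copy_to_host() should be 32.0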