Esempio n. 1
0
def mdist_kernel(Q, S, out):
    """mean-adjusted Euclidean Distance naive kernel: nothing cached"""

    warps_per_block = cuda.blockDim.x // 32
    warp_id = cuda.threadIdx.x // 32
    lane = cuda.threadIdx.x % 32

    first = cuda.blockIdx.x * warps_per_block + warp_id
    step = cuda.gridDim.x * warps_per_block

    num_windows = S.shape[0] - Q.shape[0] + 1
    for pos in range(first, num_windows, step):

        # each lane accumulates a 32-strided slice of the current window
        total = float64(0)
        for k in range(lane, Q.shape[0], 32):
            total += S[pos + k]

        # xor butterfly reduction: every lane ends up with the full sum
        for shift in (16, 8, 4, 2, 1):
            total += cuda.shfl_xor_sync(0xFFFFFFFF, total, shift)

        window_mean = total / Q.shape[0]

        # squared distance between Q and the mean-adjusted window
        total = float64(0)
        for k in range(lane, Q.shape[0], 32):
            diff = Q[k] - S[pos + k] + window_mean
            total += diff * diff

        # down-shuffle tree reduction: lane 0 collects the final value
        for shift in (16, 8, 4, 2, 1):
            total += cuda.shfl_down_sync(0xFFFFFFFF, total, shift)

        if lane == 0:
            out[pos] = total
Esempio n. 2
0
File: reduce.py Progetto: blisc/NeMo
def CTAReduce(tid: int, x, storage, count: int, R_opid: int):
    """
    CUDA Warp reduction kernel.

    It is a device kernel to be called by other kernels.

    The data will be read from the right segment recursively, and reduced (ROP) onto the left half.
    Operation continues while warp size is larger than a given offset.
    Beyond this offset, warp reduction is performed via `shfl_down_sync`, which halves the reduction
    space and sums the two halves at each call.

    Note:
        Efficient warp occurs at input shapes of 2 ^ K.

    References:
        - Warp Primitives [https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/]

    Args:
        tid: CUDA thread index
        x: activation. Single float.
        storage: shared memory of size CTA_REDUCE_SIZE used for reduction in parallel threads.
        count: equivalent to num_rows, which is equivalent to alphabet_size (V+1)
        R_opid: Operator ID for reduction. See R_Op for more information.

    Returns:
        This thread's reduced value after the warp-shuffle phase
        (thread 0's value is the fully reduced block result).
    """
    # Publish this thread's value so other threads can read it from shared memory.
    storage[tid] = x

    cuda.syncthreads()

    # Fold the data in half with each pass
    offset = CTA_REDUCE_SIZE // 2
    while offset >= warp_size:
        if (tid + offset) < count and tid < offset:
            # Read from the right half and store to the left half.
            if R_opid == 0:
                x = rnnt_helper.add(x, storage[offset + tid])
            else:
                x = rnnt_helper.maximum(x, storage[offset + tid])

            storage[tid] = x

        # All threads must hit this barrier, even those that did not fold.
        cuda.syncthreads()
        offset = offset // 2

    # Remaining values fit in a single warp: switch to register shuffles,
    # which need no shared-memory stores or __syncthreads() between steps.
    offset = warp_size // 2
    while offset > 0:
        # warp reduction and sync
        # NOTE: the shuffle is executed unconditionally so every lane in the
        # full mask (0xFFFFFFFF) participates; only in-range lanes fold the
        # shuffled value into their accumulator.
        shuff = cuda.shfl_down_sync(0xFFFFFFFF, x, offset)

        if (tid + offset < count) and (tid < offset):
            if R_opid == 0:
                x = rnnt_helper.add(x, shuff)
            else:
                x = rnnt_helper.maximum(x, shuff)

        offset = offset // 2

    return x
Esempio n. 3
0
def run_max(d_in, d_out):
    """Per-block max reduction: writes one partial maximum per block to d_out.

    Each thread loads up to two elements (blockDim.x apart), then the block
    reduces in shared memory and finishes the last warp with shuffles.
    d_out must have at least gridDim.x elements; a second pass (or host-side
    max over d_out) produces the global maximum.
    """
    N = d_in.size

    # assuming that nthreads is >= 64
    # Dynamic shared memory (shape=0): its size must be supplied at kernel launch.
    smax = cuda.shared.array(shape=0, dtype=float64)

    gid = cuda.blockIdx.x * cuda.blockDim.x * 2 + cuda.threadIdx.x
    tid = cuda.threadIdx.x

    if gid < N:
        mm = d_in[gid]
        if gid + cuda.blockDim.x < N:
            mm = max(mm, d_in[gid + cuda.blockDim.x])
    else:
        # NOTE(review): -1.0 as the identity element assumes all inputs are
        # >= -1.0 (e.g. non-negative data); for arbitrary data this should
        # be -inf — confirm the caller's input domain.
        mm = -1.0

    smax[tid] = mm
    cuda.syncthreads()

    # Shared-memory tree reduction: halve the active thread range each step.
    if cuda.blockDim.x >= 512 and tid < 256:
        smax[tid] = mm = max(mm, smax[tid + 256])
    cuda.syncthreads()

    if cuda.blockDim.x >= 256 and tid < 128:
        smax[tid] = mm = max(mm, smax[tid + 128])
    cuda.syncthreads()

    if cuda.blockDim.x >= 128 and tid < 64:
        smax[tid] = mm = max(mm, smax[tid + 64])
    cuda.syncthreads()

    if tid < 32:
        if cuda.blockDim.x >= 64:
            smax[tid] = mm = max(mm, smax[tid + 32])

        # Final warp: shuffle-based reduction, no shared memory needed.
        # The mask (1 << 2*offset) - 1 selects lanes [0, 2*offset) — the
        # lanes still holding live partial results at each step.
        for offset in [16, 8, 4, 2, 1]:
            mm = max(mm, cuda.shfl_down_sync((1 << 2 * offset) - 1, mm,
                                             offset))

    if tid == 0:
        d_out[cuda.blockIdx.x] = mm
Esempio n. 4
0
def zdist_kernel(Q, S, out, epsilon):
    """z-normalized Euclidean Distance naive kernel: nothing cached"""

    warps_per_block = cuda.blockDim.x // 32
    warp_id = cuda.threadIdx.x // 32
    lane = cuda.threadIdx.x % 32

    first = cuda.blockIdx.x * warps_per_block + warp_id
    step = cuda.gridDim.x * warps_per_block

    num_windows = S.shape[0] - Q.shape[0] + 1
    for pos in range(first, num_windows, step):

        # per-lane partial sums of the window and of its squares
        sum_s = float64(0)
        sum_s2 = float64(0)
        for k in range(lane, Q.shape[0], 32):
            sample = S[pos + k]
            sum_s += sample
            sum_s2 += sample * sample

        # xor butterfly reduction: all lanes receive both full sums
        for shift in (16, 8, 4, 2, 1):
            sum_s += cuda.shfl_xor_sync(0xFFFFFFFF, sum_s, shift)
            sum_s2 += cuda.shfl_xor_sync(0xFFFFFFFF, sum_s2, shift)

        window_mean = sum_s / Q.shape[0]
        variance = sum_s2 / Q.shape[0] - window_mean * window_mean
        window_std = sqrt(variance) if variance > 0.0 else epsilon

        # squared distance between Q and the z-normalized window
        dist = float64(0)
        for k in range(lane, Q.shape[0], 32):
            diff = Q[k] - (S[pos + k] - window_mean) / window_std
            dist += diff * diff

        # down-shuffle tree reduction: lane 0 collects the final value
        for shift in (16, 8, 4, 2, 1):
            dist += cuda.shfl_down_sync(0xFFFFFFFF, dist, shift)

        if lane == 0:
            out[pos] = dist
Esempio n. 5
0
def warp_reduction(val):
    """Warp-level sum reduction of val[0] via shfl_down_sync.

    After the loop, lane 0 holds the sum of val[0] across the warp. `val`
    is a one-element buffer so the result is visible to the caller.
    """
    # BUG FIX: the original used true division (`/` and `/=`), which makes
    # `offset` a float — the loop steps 16.0, 8.0, ..., 1.0, 0.5, 0.25, ...
    # so `offset > 0` never becomes false and a non-integer delta is passed
    # to shfl_down_sync. Floor division yields the intended 16, 8, 4, 2, 1.
    offset = cuda.warpsize // 2
    while offset > 0:
        val[0] += cuda.shfl_down_sync(0xffffffff, val[0], offset)
        offset //= 2
Esempio n. 6
0
def use_shfl_sync_down(ary, delta):
    """Store each thread's global index shuffled down by `delta` lanes."""
    idx = cuda.grid(1)
    ary[idx] = cuda.shfl_down_sync(0xffffffff, idx, delta)
def reduce_warp(value, mask):
    """Warp sum reduction via shfl_down_sync; lane 0 ends with the full sum."""
    # fold halves together: 16, 8, 4, 2, 1 (i.e. WARPSIZE // 2 downwards)
    for shift in (16, 8, 4, 2, 1):
        value += cuda.shfl_down_sync(mask, value, shift)
    return value