from math import sqrt

from numba import cuda, float64


def mdist_kernel(Q, S, out):
    """mean-adjusted Euclidean Distance naive kernel: nothing cached"""

    warpDim = cuda.blockDim.x // 32
    warpIdx = cuda.threadIdx.x // 32
    laneIdx = cuda.threadIdx.x % 32

    lower = cuda.blockIdx.x * warpDim + warpIdx
    stride = cuda.gridDim.x * warpDim

    # one warp per alignment position; grid-stride loop over all positions
    for position in range(lower, S.shape[0] - Q.shape[0] + 1, stride):

        # pass 1: mean of the current subsequence (butterfly reduction so
        # every lane ends up with the full sum)
        accum = float64(0)
        for index in range(laneIdx, Q.shape[0], 32):
            accum += S[position + index]

        for delta in [16, 8, 4, 2, 1]:
            accum += cuda.shfl_xor_sync(0xFFFFFFFF, accum, delta)

        mean = accum / Q.shape[0]

        # pass 2: squared distance against the mean-centered subsequence
        accum = float64(0)
        for index in range(laneIdx, Q.shape[0], 32):
            value = Q[index] - S[position + index] + mean
            accum += value * value

        for delta in [16, 8, 4, 2, 1]:
            value = cuda.shfl_down_sync(0xFFFFFFFF, accum, delta)
            accum += value

        # after shfl_down only lane 0 holds the complete sum
        if laneIdx == 0:
            out[position] = accum
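# A host-side launch sketch for the kernel above, assuming mdist_kernel is
# compiled with @cuda.jit (the decorator is not shown in the snippet). Array
# sizes, grid size, and block size below are illustrative; the block size is a
# multiple of 32 so that warpDim is a whole number of warps.
import numpy as np
from numba import cuda

Q = np.random.rand(128)          # query
S = np.random.rand(1 << 16)      # series to scan
d_Q = cuda.to_device(Q)
d_S = cuda.to_device(S)
d_out = cuda.device_array(S.shape[0] - Q.shape[0] + 1)

threads = 256                    # 8 warps per block, one warp per position
blocks = 80                      # the grid-stride loop covers the rest
mdist_kernel[blocks, threads](d_Q, d_S, d_out)
profile = d_out.copy_to_host()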
def CTAReduce(tid: int, x, storage, count: int, R_opid: int):
    """
    CUDA Warp reduction kernel. It is a device kernel to be called by other kernels.

    The data is read from the right segment recursively and reduced (by the R_Op
    operator) onto the left half. This continues while the offset is at least the
    warp size. Below the warp size, warp reduction is performed via
    `shfl_down_sync`, which halves the reduction space and sums the two halves
    at each step.

    Note:
        Efficient warp reduction occurs at input shapes of 2 ^ K.

    References:
        - Warp Primitives [https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/]

    Args:
        tid: CUDA thread index
        x: activation. Single float.
        storage: shared memory of size CTA_REDUCE_SIZE used for reduction in parallel threads.
        count: equivalent to num_rows, which is equivalent to alphabet_size (V+1)
        R_opid: Operator ID for reduction. See R_Op for more information.
    """
    storage[tid] = x
    cuda.syncthreads()

    # Fold the data in half with each pass
    offset = CTA_REDUCE_SIZE // 2
    while offset >= warp_size:
        if (tid + offset) < count and tid < offset:
            # Read from the right half and store to the left half.
            if R_opid == 0:
                x = rnnt_helper.add(x, storage[offset + tid])
            else:
                x = rnnt_helper.maximum(x, storage[offset + tid])

            storage[tid] = x

        cuda.syncthreads()
        offset = offset // 2

    offset = warp_size // 2
    while offset > 0:
        # warp reduction and sync
        shuff = cuda.shfl_down_sync(0xFFFFFFFF, x, offset)

        if (tid + offset < count) and (tid < offset):
            if R_opid == 0:
                x = rnnt_helper.add(x, shuff)
            else:
                x = rnnt_helper.maximum(x, shuff)

        offset = offset // 2

    return x
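# A sketch of a caller for CTAReduce, assuming it is compiled as a device
# function (@cuda.jit(device=True)), that CTA_REDUCE_SIZE equals the block size
# (128 here), and that rnnt_helper.add / rnnt_helper.maximum are importable
# device functions; those names and values belong to the surrounding module and
# are assumptions here. R_opid 0 selects addition.
import numpy as np
from numba import cuda, float64

CTA_REDUCE_SIZE = 128   # assumed block size expected by CTAReduce

@cuda.jit
def block_sum(rows, out):
    tid = cuda.threadIdx.x
    row = cuda.blockIdx.x
    storage = cuda.shared.array(CTA_REDUCE_SIZE, float64)
    x = rows[row, tid] if tid < rows.shape[1] else 0.0
    total = CTAReduce(tid, x, storage, rows.shape[1], 0)   # 0 -> add
    if tid == 0:
        out[row] = total

d_rows = cuda.to_device(np.random.rand(4, CTA_REDUCE_SIZE))
d_out = cuda.device_array(4)
block_sum[4, CTA_REDUCE_SIZE](d_rows, d_out)
# d_out.copy_to_host()[r] should match rows[r].sum() up to rounding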
def run_max(d_in, d_out):
    N = d_in.size

    # assuming that nthreads is >= 64
    # shape=0 declares a dynamic shared memory array; its size is supplied at
    # launch time
    smax = cuda.shared.array(shape=0, dtype=float64)

    gid = cuda.blockIdx.x * cuda.blockDim.x * 2 + cuda.threadIdx.x
    tid = cuda.threadIdx.x

    # each thread loads up to two elements and keeps the larger one;
    # out-of-range threads contribute the sentinel -1.0
    if gid < N:
        mm = d_in[gid]
        if gid + cuda.blockDim.x < N:
            mm = max(mm, d_in[gid + cuda.blockDim.x])
    else:
        mm = -1.0
    smax[tid] = mm
    cuda.syncthreads()

    # tree reduction in shared memory down to a single warp
    if cuda.blockDim.x >= 512 and tid < 256:
        smax[tid] = mm = max(mm, smax[tid + 256])
    cuda.syncthreads()
    if cuda.blockDim.x >= 256 and tid < 128:
        smax[tid] = mm = max(mm, smax[tid + 128])
    cuda.syncthreads()
    if cuda.blockDim.x >= 128 and tid < 64:
        smax[tid] = mm = max(mm, smax[tid + 64])
    cuda.syncthreads()

    if tid < 32:
        if cuda.blockDim.x >= 64:
            smax[tid] = mm = max(mm, smax[tid + 32])
        # finish within the warp; the mask narrows to the 2 * offset lanes
        # still participating
        for offset in [16, 8, 4, 2, 1]:
            mm = max(mm, cuda.shfl_down_sync((1 << 2 * offset) - 1, mm, offset))

    if tid == 0:
        d_out[cuda.blockIdx.x] = mm
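# A launch sketch for run_max, assuming it is decorated with @cuda.jit. Because
# the shared array is declared with shape=0, the dynamic shared memory size in
# bytes is passed in the launch configuration
# (kernel[blocks, threads, stream, shared_mem_bytes]); sizes here are
# illustrative.
import numpy as np
from numba import cuda

N = 1 << 20
threads = 256                                    # must be >= 64 for this kernel
blocks = (N + threads * 2 - 1) // (threads * 2)  # each thread loads two elements

d_in = cuda.to_device(np.random.rand(N))
d_out = cuda.device_array(blocks)
run_max[blocks, threads, 0, threads * 8](d_in, d_out)   # 8 bytes per float64
block_maxima = d_out.copy_to_host()
overall_max = block_maxima.max()   # final pass over per-block maxima on the host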
def zdist_kernel(Q, S, out, epsilon):
    """z-normalized Euclidean Distance naive kernel: nothing cached"""

    warpDim = cuda.blockDim.x // 32
    warpIdx = cuda.threadIdx.x // 32
    laneIdx = cuda.threadIdx.x % 32

    lower = cuda.blockIdx.x * warpDim + warpIdx
    stride = cuda.gridDim.x * warpDim

    for position in range(lower, S.shape[0] - Q.shape[0] + 1, stride):

        accum1 = float64(0)
        accum2 = float64(0)
        for index in range(laneIdx, Q.shape[0], 32):
            value = S[position + index]
            accum1 += value
            accum2 += value * value

        for delta in [16, 8, 4, 2, 1]:
            accum1 += cuda.shfl_xor_sync(0xFFFFFFFF, accum1, delta)
            accum2 += cuda.shfl_xor_sync(0xFFFFFFFF, accum2, delta)

        mean = accum1 / Q.shape[0]
        sigma = accum2 / Q.shape[0] - mean * mean
        sigma = sqrt(sigma) if sigma > 0.0 else epsilon

        accum = float64(0)
        for index in range(laneIdx, Q.shape[0], 32):
            value = Q[index] - (S[position + index] - mean) / sigma
            accum += value * value

        for delta in [16, 8, 4, 2, 1]:
            accum += cuda.shfl_down_sync(0xFFFFFFFF, accum, delta)

        if laneIdx == 0:
            out[position] = accum
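# A plain NumPy reference for checking zdist_kernel on the host: per position p
# it computes sum_i (Q[i] - (S[p+i] - mean) / sigma)**2 over the window, with
# the same epsilon fallback for zero-variance windows. This is a validation
# sketch, not part of the original module.
import numpy as np

def zdist_reference(Q, S, epsilon):
    m = Q.shape[0]
    out = np.empty(S.shape[0] - m + 1)
    for p in range(out.shape[0]):
        window = S[p:p + m]
        mean = window.mean()
        var = window.var()                       # accum2 / m - mean * mean
        sigma = np.sqrt(var) if var > 0.0 else epsilon
        out[p] = np.sum((Q - (window - mean) / sigma) ** 2)
    return out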
def warp_reduction(val):
    # val is a one-element array so the reduced sum can be written back in place
    offset = cuda.warpsize // 2
    while offset > 0:
        val[0] += cuda.shfl_down_sync(0xffffffff, val[0], offset)
        offset //= 2
def use_shfl_sync_down(ary, delta):
    i = cuda.grid(1)
    val = cuda.shfl_down_sync(0xffffffff, i, delta)
    ary[i] = val
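# A launch sketch for the kernel above, assuming it is decorated with
# @cuda.jit. Over a single warp, lane i receives the thread index of lane
# i + delta, while the last `delta` lanes fall outside the warp and keep their
# own value.
import numpy as np
from numba import cuda

d_ary = cuda.to_device(np.zeros(32, dtype=np.int64))
use_shfl_sync_down[1, 32](d_ary, 2)
print(d_ary.copy_to_host())   # [2 3 4 ... 31 30 31]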
def reduce_warp(value, mask):
    offset = 16   # i.e. WARPSIZE // 2
    while offset:
        value += cuda.shfl_down_sync(mask, value, offset)
        offset //= 2
    return value
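# A usage sketch, assuming reduce_warp is compiled with @cuda.jit(device=True):
# each warp sums its own 32 elements and lane 0 writes the warp's total (after
# shfl_down only lane 0 holds the full sum). The warp_sums kernel and the sizes
# below are illustrative.
import numpy as np
from numba import cuda

@cuda.jit
def warp_sums(data, out):
    i = cuda.grid(1)
    lane = cuda.threadIdx.x % 32
    val = data[i] if i < data.shape[0] else 0.0
    total = reduce_warp(val, 0xFFFFFFFF)   # all 32 lanes participate
    if lane == 0:
        out[i // 32] = total

d_data = cuda.to_device(np.ones(1024))
d_out = cuda.device_array(1024 // 32)
warp_sums[4, 256](d_data, d_out)
# every entry of d_out.copy_to_host() should be 32.0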