Example #1
        def cu_square_matrix_mul(A, B, C):
            sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
            sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

            tx = cuda.threadIdx.x
            ty = cuda.threadIdx.y
            bx = cuda.blockIdx.x
            by = cuda.blockIdx.y
            bw = cuda.blockDim.x
            bh = cuda.blockDim.y

            x = tx + bx * bw
            y = ty + by * bh

            acc = float32(0)  # forces all the math to be f32
            for i in range(bpg):
                if x < n and y < n:
                    sA[ty, tx] = A[y, tx + i * tpb]
                    sB[ty, tx] = B[ty + i * tpb, x]

                cuda.syncthreads()

                if x < n and y < n:
                    for j in range(tpb):
                        acc += sA[ty, j] * sB[j, tx]

                cuda.syncthreads()

            if x < n and y < n:
                C[y, x] = acc
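The kernel above closes over the module-level constants SM_SIZE, tpb, bpg and n, which Numba freezes at compile time. A host-side sketch with illustrative values, assuming the kernel is defined in the same module and decorated with @cuda.jit:

import numpy as np
from numba import cuda, float32

n = 1024                   # square matrix dimension (illustrative)
tpb = 16                   # threads per block along each axis
bpg = n // tpb             # blocks per grid along each axis
SM_SIZE = (tpb, tpb)       # shared-memory tile shape

A = np.random.rand(n, n).astype(np.float32)
B = np.random.rand(n, n).astype(np.float32)
C = np.zeros((n, n), dtype=np.float32)

cu_square_matrix_mul[(bpg, bpg), (tpb, tpb)](A, B, C)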
Example #2
def jacobi_relax_core(A, Anew, error):
    smem = cuda.shared.array(shape=(32 + 2, 32 + 2), dtype=f8)
    n = A.shape[0]
    m = A.shape[1]

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    j = ty + cuda.blockIdx.y * cuda.blockDim.y
    i = tx + cuda.blockIdx.x * cuda.blockDim.x

    sy = ty + 1
    sx = tx + 1

    smem[sy, sx] = A[j, i]
    if tx == 0 and i >= 1:
        smem[sy, 0] = A[j, i - 1]

    if ty == 0 and j >= 1:
        smem[0, sx] = A[j - 1, i]

    if tx == 31 and i < m - 1:
        smem[sy, 33] = A[j, i + 1]

    if ty == 31 and j < n - 1:
        smem[33, sx] = A[j + 1, i]

    cuda.syncthreads() # ensure smem is visible by all threads in the block

    if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
        Anew[j, i] = 0.25 * ( smem[sy, sx + 1] + smem[sy, sx - 1] \
                            + smem[sy - 1, sx] + smem[sy + 1, sx])
        error[j, i] = Anew[j, i] - A[j, i]
Example #3
def _getOccupancyCUDAkernel(occus, coords, centers, channelsigmas, trunc):
    centeridx = cuda.blockIdx.x
    blockidx = cuda.blockIdx.y
    atomidx = (cuda.threadIdx.x + (cuda.blockDim.x * blockidx))

    if atomidx >= coords.shape[0] or centeridx >= centers.shape[0]:
        return

    # TODO: Can remove this. Barely any speedup
    centcoor = cuda.shared.array(shape=(3), dtype=numba.float32)
    centcoor[0] = centers[centeridx, 0]
    centcoor[1] = centers[centeridx, 1]
    centcoor[2] = centers[centeridx, 2]
    cuda.syncthreads()

    dx = coords[atomidx, 0] - centcoor[0]
    dy = coords[atomidx, 1] - centcoor[1]
    dz = coords[atomidx, 2] - centcoor[2]
    d2 = dx * dx + dy * dy + dz * dz
    if d2 >= trunc:
        return

    d1 = 1 / sqrt(d2)
    for h in range(channelsigmas.shape[1]):
        if channelsigmas[atomidx, h] == 0:
            continue
        x = channelsigmas[atomidx, h] * d1
        value = 1 - exp(-(x ** 12))
        cuda.atomic.max(occus, (centeridx, h), value)
Example #4
def fast_matmul(A, B, C):
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x    # blocks per grid

    if x >= C.shape[0] and y >= C.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return

    # Each thread computes one element in the result matrix.
    # The dot product is chunked into dot products of TPB-long vectors.
    tmp = 0.
    for i in range(bpg):
        # Preload data into shared memory
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]

        # Wait until all threads finish computing
        cuda.syncthreads()

    C[x, y] = tmp
Example #5
def simple_smem(ary):
    sm = cuda.shared.array(N, int32)
    i = cuda.grid(1)
    if i == 0:
        for j in range(N):
            sm[j] = j
    cuda.syncthreads()
    ary[i] = sm[i]
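simple_smem closes over a compile-time constant N; a minimal driver sketch, assuming N = 100 and an @cuda.jit decorator on the function above:

import numpy as np
from numba import cuda, int32

N = 100                          # compile-time constant used by the kernel

out = np.zeros(N, dtype=np.int32)
simple_smem[1, N](out)           # one block of N threads
assert (out == np.arange(N)).all()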
Example #6
def atomic_add(ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, uint32)
    sm[tid] = 0
    cuda.syncthreads()
    bin = ary[tid] % 32
    cuda.atomic.add(sm, bin, 1)
    cuda.syncthreads()
    ary[tid] = sm[tid]
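A driver sketch for the kernel above, assuming it carries an @cuda.jit decorator; it launches a single block of 32 threads to match the 32 hard-coded shared-memory bins:

import numpy as np
from numba import cuda

ary = np.random.randint(0, 32, size=32).astype(np.uint32)
orig = ary.copy()

atomic_add[1, 32](ary)           # one block, 32 threads

# each slot now holds how many of the original values fell into that bin
expected = np.bincount(orig % 32, minlength=32)
np.testing.assert_array_equal(ary, expected)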
Example #7
def atomic_add_float(ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, float32)
    sm[tid] = 0
    cuda.syncthreads()
    bin = int(ary[tid] % 32)
    cuda.atomic.add(sm, bin, 1.0)
    cuda.syncthreads()
    ary[tid] = sm[tid]
Example #8
def atomic_add_double(idx, ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, float64)
    sm[tid] = 0.0
    cuda.syncthreads()
    bin = idx[tid] % 32
    cuda.atomic.add(sm, bin, 1.0)
    cuda.syncthreads()
    ary[tid] = sm[tid]
Example #9
def atomic_add3(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    sm = cuda.shared.array((4, 8), uint32)
    sm[tx, ty] = ary[tx, ty]
    cuda.syncthreads()
    cuda.atomic.add(sm, (tx, uint64(ty)), 1)
    cuda.syncthreads()
    ary[tx, ty] = sm[tx, ty]
Example #10
def atomic_add_float_2(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    sm = cuda.shared.array((4, 8), float32)
    sm[tx, ty] = ary[tx, ty]
    cuda.syncthreads()
    cuda.atomic.add(sm, (tx, ty), 1)
    cuda.syncthreads()
    ary[tx, ty] = sm[tx, ty]
Example #11
def idx_kernel(arr):
    s = cuda.shared.array(shape=maxThread, dtype=int32)

    idx = cuda.grid(1)
    if idx < arr.shape[0]:
        s[cuda.threadIdx.x] = 1

    cuda.syncthreads()

    if idx < arr.shape[0]:
        cuda.atomic.add(arr, s[cuda.threadIdx.x], 1)
Example #12
        def problematic(x, y):
            tid = cuda.threadIdx.x
            ntid = cuda.blockDim.x

            if tid > 12:
                for i in range(ntid):
                    y[i] += x[i] // y[i]

            cuda.syncthreads()
            if tid < 17:
                for i in range(ntid):
                    x[i] += x[i] // y[i]
Example #13
def atomic_max_double_shared(res, ary):
    tid = cuda.threadIdx.x
    smary = cuda.shared.array(32, float64)
    smary[tid] = ary[tid]
    smres = cuda.shared.array(1, float64)
    if tid == 0:
        smres[0] = res[0]
    cuda.syncthreads()
    cuda.atomic.max(smres, 0, smary[tid])
    cuda.syncthreads()
    if tid == 0:
        res[0] = smres[0]
Example #14
    def device_reduce_full_block(arr, partials, sm_partials):
        """
        Partially reduce `arr` into `partials` using `sm_partials` as working
        space.  The algorithm goes like:

            array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                        block-0:  |   x |     |     |   x |     |
                        block-1:  |     |   x |     |     |   x |
                        block-2:  |     |     |   x |     |     |

        The array is divided into chunks of 128 (the size of a threadblock).
        The threadblocks consume the chunks in round-robin order.
        First, a threadblock loads a chunk into temp memory.  Then, all
        subsequent chunks are combined into the temp memory.

        Once all chunks are processed, an intra-block reduction is performed
        on the temp memory, so that there is just one scalar result per
        block.  The result from each block is stored to `partials` at the
        dedicated slot.
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        gridsz = cuda.gridDim.x

        # block strided loop to compute the reduction
        start = tid + blksz * blkid
        stop = arr.size
        step = blksz * gridsz

        # load first value
        tmp = arr[start]
        # loop over all values in block-stride
        for i in range(start + step, stop, step):
            tmp = reduce_op(tmp, arr[i])

        cuda.syncthreads()
        # inner-warp reduction
        inner_warp_reduction(sm_partials, tmp)

        cuda.syncthreads()
        # at this point, only the first slot for each warp in sm_partials
        # is valid.

        # finish up block reduction
        # warning: this is assuming 4 warps.
        # assert numwarps == 4
        if tid < 2:
            sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
                                            sm_partials[tid + 2, 0])
        if tid == 0:
            partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
Example #15
        def oracle(x, y):
            tid = cuda.threadIdx.x
            ntid = cuda.blockDim.x

            if tid > 12:
                for i in range(ntid):
                    if y[i] != 0:
                        y[i] += x[i] // y[i]

            cuda.syncthreads()
            if tid < 17:
                for i in range(ntid):
                    if y[i] != 0:
                        x[i] += x[i] // y[i]
Example #16
        def gpu_unique_k(arr, k, out, outsz_ptr):
            """
            Note: run with small blocks.
            """
            tid = cuda.threadIdx.x
            blksz = cuda.blockDim.x
            base = 0

            # shared memory
            vset_size = 0
            sm_mem_size = MAX_FAST_UNIQUE_K
            vset = cuda.shared.array(sm_mem_size, dtype=nbtype)
            share_vset_size = cuda.shared.array(1, dtype=int32)
            share_loaded = cuda.shared.array(sm_mem_size, dtype=nbtype)
            sm_mem_size = min(k, sm_mem_size)

            while vset_size < sm_mem_size and base < arr.size:
                pos = base + tid
                valid_load = min(blksz, arr.size - base)
                # load
                if tid < valid_load:
                    share_loaded[tid] = arr[pos]
                # wait for load to complete
                cuda.syncthreads()
                # thread-0 inserts
                if tid == 0:
                    for i in range(valid_load):
                        val = share_loaded[i]
                        new_size = gpu_unique_set_insert(vset, vset_size, val)
                        if new_size >= 0:
                            vset_size = new_size
                        else:
                            vset_size = sm_mem_size + 1
                    share_vset_size[0] = vset_size
                # wait until the insert is done
                cuda.syncthreads()
                vset_size = share_vset_size[0]
                # increment
                base += blksz

            # output
            if vset_size <= sm_mem_size:
                for i in range(tid, vset_size, blksz):
                    out[i] = vset[i]
                if tid == 0:
                    outsz_ptr[0] = vset_size
            else:
                outsz_ptr[0] = -1
Example #17
    def kernel(input, output):

        tile = cuda.shared.array(shape=tile_shape, dtype=dt)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        x = by + tx
        y = bx + ty

        if by+ty < input.shape[0] and bx+tx < input.shape[1]:
            tile[ty, tx] = input[by+ty, bx+tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]
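The transpose kernel above (named kernel in this excerpt) closes over tile_shape and dt; a driver sketch assuming a 32x32 float32 tile and an @cuda.jit decorator:

import numpy as np
from numba import cuda

# assume the defining module sets tile_shape = (32, 32) and dt = float32
a = np.arange(64 * 48, dtype=np.float32).reshape(64, 48)
out = np.zeros((48, 64), dtype=np.float32)

blocks = ((a.shape[1] + 31) // 32, (a.shape[0] + 31) // 32)   # (grid x, grid y)
kernel[blocks, (32, 32)](a, out)
np.testing.assert_array_equal(out, a.T)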
Example #18
def argmax_lvl0(ary, reduce_max, reduce_arg):
    """
    This only works for arrays of positive values.
    The dynamic shared memory must be allocated with double the
    block size (one value slot and one argument slot per thread).
    """
    sm_ary = cuda.shared.array(shape = 0, dtype = ary.dtype)

    # each thread will process two elements
    tgid = cuda.grid(1)
    thid = cuda.threadIdx.x

    # pointer to value and argument side of shared memory
    val_pointer = 0
    arg_pointer = sm_ary.size // 2

    # when the global thread id is greater than or equal to the ary size
    # the block is incomplete; in this case we fill the value side with 0
    # (smaller than any element of a positive array) and the argument
    # side with -1
    if tgid < ary.size:
        sm_ary[val_pointer + thid] = ary[tgid]
        sm_ary[arg_pointer + thid] = tgid
    else:
        sm_ary[val_pointer + thid] = 0
        sm_ary[arg_pointer + thid] = -1        


    cuda.syncthreads()

    s = cuda.blockDim.x // 2
    while s > 0:
        if thid < s:
            # only change if the left element is smaller than the right one
            if sm_ary[val_pointer + thid] < sm_ary[val_pointer + thid + s]:
                sm_ary[val_pointer + thid] = sm_ary[val_pointer + thid + s]
                sm_ary[arg_pointer + thid] = sm_ary[arg_pointer + thid + s]

        cuda.syncthreads()
        s //= 2

    if thid == 0:
        reduce_max[cuda.blockIdx.x] = sm_ary[val_pointer]
        reduce_arg[cuda.blockIdx.x] = sm_ary[arg_pointer]
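Because sm_ary is declared with shape = 0, it is a dynamic shared-memory allocation whose size in bytes must be supplied as the fourth element of the launch configuration. A launch sketch with illustrative sizes, assuming argmax_lvl0 is decorated with @cuda.jit:

import numpy as np
from numba import cuda

n = 1 << 20
threads = 128                                   # power-of-two block size
blocks = (n + threads - 1) // threads

ary = np.random.rand(n)                         # positive values only
reduce_max = np.zeros(blocks)                   # per-block maxima
reduce_arg = np.zeros(blocks)                   # per-block argmax positions

# one value slot plus one argument slot per thread
shmem_bytes = 2 * threads * ary.dtype.itemsize

argmax_lvl0[blocks, threads, 0, shmem_bytes](ary, reduce_max, reduce_arg)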
Example #19
def gpu_single_block_sum(arr, out):
    """
    A naive single threadblock sum reduction
    """
    temp = cuda.shared.array(gpu_block_sum_max_blockdim, dtype=float32)
    tid = cuda.threadIdx.x
    blksz = cuda.blockDim.x
    temp[tid] = 0
    # block stride loop to sum-reduce cooperatively
    for i in range(tid, arr.size, blksz):
        temp[tid] += arr[i]
    cuda.syncthreads()
    # naive intra block sum that uses a single thread
    if tid == 0:
        for i in range(1, blksz):
            temp[0] += temp[i]
        # store result
        out[0] = temp[0]
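A single-block driver sketch for the reduction above; the value of gpu_block_sum_max_blockdim and the @cuda.jit decorator are assumptions:

import numpy as np
from numba import cuda

# assume the defining module sets gpu_block_sum_max_blockdim = 512
arr = np.random.rand(10000).astype(np.float32)
out = np.zeros(1, dtype=np.float32)

gpu_single_block_sum[1, 512](arr, out)          # a single block of 512 threads
np.testing.assert_allclose(out[0], arr.sum(), rtol=1e-3)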
Example #20
def max_kernel(a, b):
    "Simple implementation of reduction kernel"
    # Allocate static shared memory of 256.
    # This limits the maximum block size to 256.
    sa = cuda.shared.array(shape=(256,), dtype=int32)
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    i = tx + bx * bw
    if i < a.shape[0]:
        sa[tx] = a[i]
        if tx == 0:
            # Uses the first thread of each block to perform the actual
            # reduction
            m = sa[tx]
            cuda.syncthreads()
            for j in range(1, bw):
                m = mymax(m, sa[j])
            b[bx] = m
Example #21
    def device_reduce_partial_block(arr, partials, sm_partials):
        """
        This computes reduction on `arr`.
        This device function must be used by 1 threadblock only.
        The blocksize must match `arr.size` and must not be greater than 128.
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        warpid = tid // _WARPSIZE
        laneid = tid % _WARPSIZE

        size = arr.size
        # load first value
        tid = cuda.threadIdx.x
        value = arr[tid]
        sm_partials[warpid, laneid] = value

        cuda.syncthreads()

        if (warpid + 1) * _WARPSIZE < size:
            # fully populated warps
            inner_warp_reduction(sm_partials, value)
        else:
            # partially populated warps
            # NOTE: this uses a very inefficient sequential algorithm
            if laneid == 0:
                sm_this = sm_partials[warpid, :]
                base = warpid * _WARPSIZE
                for i in range(1, size - base):
                    sm_this[0] = reduce_op(sm_this[0], sm_this[i])

        cuda.syncthreads()
        # finish up
        if tid == 0:
            num_active_warps = (blksz + _WARPSIZE - 1) // _WARPSIZE

            result = sm_partials[0, 0]
            for i in range(1, num_active_warps):
                result = reduce_op(result, sm_partials[i, 0])

            partials[blkid] = result
Example #22
def experimental_sum_fq(g_odata, g_idata, n):
    _, qx = cuda.grid(2)
    sdata = cuda.shared.array(512, f4)

    tid = cuda.threadIdx.x
    bd = cuda.blockDim.x
    bid = cuda.blockIdx.x
    i = bid * bd * 2 + tid
    gridsize = bd * 2 * cuda.gridDim.x

    sdata[tid] = 0.
    while i < n:
        if i + bd >= len(g_idata):
            sdata[tid] += g_idata[i, qx]
        else:
            sdata[tid] += g_idata[i, qx] + g_idata[i + bd, qx]
        i += gridsize
    cuda.syncthreads()

    if bd >= 512:
        if tid < 256:
            sdata[tid] += sdata[tid + 256]
        cuda.syncthreads()

    if bd >= 256:
        if tid < 128:
            sdata[tid] += sdata[tid + 128]
        cuda.syncthreads()

    if bd >= 128:
        if tid < 64:
            sdata[tid] += sdata[tid + 64]
        cuda.syncthreads()

    if tid < 32:
        if bd >= 64:
            sdata[tid] += sdata[tid + 32]
        if bd >= 32:
            sdata[tid] += sdata[tid + 16]
        if bd >= 16:
            sdata[tid] += sdata[tid + 8]
        if bd >= 8:
            sdata[tid] += sdata[tid + 4]
        if bd >= 4:
            sdata[tid] += sdata[tid + 2]
        if bd >= 2:
            sdata[tid] += sdata[tid + 1]

    if tid == 0:
        g_odata[cuda.blockIdx.x, qx] = sdata[0]
Example #23
def rotate_iou_kernel_eval(N,
                           K,
                           dev_boxes,
                           dev_query_boxes,
                           dev_iou,
                           criterion=-1):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)

    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    if tx < col_size:
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if tx < row_size:
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            offset = (row_start * threadsPerBlock * K +
                      col_start * threadsPerBlock + tx * K + i)
            dev_iou[offset] = devRotateIoUEval(
                block_qboxes[i * 5:i * 5 + 5],
                block_boxes[tx * 5:tx * 5 + 5],
                criterion,
            )
Example #24
def calc_albedo_gpu(SURFALBEDSW, SURFALBEDLW, OCEANMASK, SOILTEMP):

    i, j = cuda.grid(2)

    # ocean
    if OCEANMASK[i,j] == 1:
        SURFALBEDSW[i,j] = 0.05
        #SURFALBEDLW[i,j] = 0.05
        SURFALBEDLW[i,j] = 0.00
    # land
    else:
        SURFALBEDSW[i,j] = 0.2
        #SURFALBEDLW[i,j] = 0.2
        SURFALBEDLW[i,j] = 0.0

    # ice (land and sea)
    if SOILTEMP[i,j,0] <= 273.15:
        SURFALBEDSW[i,j] = 0.5
        #SURFALBEDLW[i,j] = 0.3
        SURFALBEDLW[i,j] = 0.0

    cuda.syncthreads()
Example #25
def fast_matmul(A, B, C):
    """
    Perform matrix multiplication of C = A * B
    Each thread computes one element of the result matrix C
    """

    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)
    
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    
    if x >= C.shape[0] and y >= C.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return

    # Each thread computes one element in the result matrix.
    # The dot product is chunked into dot products of TPB-long vectors.
    tmp = 0.
    for i in range(int(A.shape[1] / TPB)):
        # Preload data into shared memory
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]

        # Wait until all threads finish computing
        cuda.syncthreads()

    C[x, y] = tmp
Example #26
def matmul_shared_memory(A, B, C):
    """
    使用Shared Memory的矩阵乘法 C = A * B
    """
    # 在Shared Memory中定义向量
    # 向量可被整个Block的所有Thread共享
    # 必须声明向量大小和数据类型
    sA = cuda.shared.array(shape=(BLOCK_SIZE, BLOCK_SIZE), dtype=float32)
    sB = cuda.shared.array(shape=(BLOCK_SIZE, BLOCK_SIZE), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    row = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
    col = cuda.threadIdx.y + cuda.blockDim.y * cuda.blockIdx.y

    if row >= C.shape[0] and col >= C.shape[1]:
        # 当(x, y)越界时退出
        return

    tmp = 0.
    # 以一个 BLOCK_SIZE x BLOCK_SIZE 为单位
    for m in range(math.ceil(A.shape[1] / BLOCK_SIZE)):
        sA[tx, ty] = A[row, ty + m * BLOCK_SIZE]
        sB[tx, ty] = B[tx + m * BLOCK_SIZE, col]
        # 线程同步,等待Block中所有Thread预加载结束
        # 该函数会等待所有Thread执行完之后才执行下一步
        cuda.syncthreads()
        # 此时已经将A和B的子矩阵拷贝到了sA和sB

        # 计算Shared Memory中的向量点积
        # 直接从Shard Memory中读取数据的延迟很低
        for n in range(BLOCK_SIZE):
            tmp += sA[tx, n] * sB[n, ty]

        # 线程同步,等待Block中所有Thread计算结束
        cuda.syncthreads()

    # 循环后得到每个BLOCK的点积之和
    C[row, col] = tmp
Example #27
        def copy_strides(arr, n, stride, tpb):
            sm = cuda.shared.array(1, dtype=uint32)
            i = cuda.threadIdx.x
            base = 0
            if i == 0:
                sm[0] = 0

            val = arr[0]
            while base < n:
                idx = base + i
                if idx < n:
                    val = arr[idx * stride]

                cuda.syncthreads()

                if base + i < n:
                    arr[sm[0] + i] = val

                if i == 0:
                    sm[0] += tpb

                base += tpb
Example #28
def k__histogram_shmem(x, xmin, xmax, histogram):
    """
    Recall the two common uses of shared memory, i.e., caching and buffering;
    this example demonstrates usage #2. Here we benefit from putting the
    write-intensive operations into shared memory and then collecting the
    tally once a block has finished its calculation.
    """
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    # though unnecessary in the present case, below is always a good habit
    if start >= x.shape[0]:
        return

    # allocate space in the shared memory whose size must be a constant
    tid = cuda.threadIdx.x
    hist_buffer = cuda.shared.array(nbins, types.int32)

    for i in range(nbins):
        hist_buffer[i] = 0

    cuda.syncthreads()  # this is important

    # real calculations
    bin_width = (xmax - xmin) / nbins

    for idx in range(start, x.shape[0], stride):
        bin_number = np.int32((x[idx] - xmin) / bin_width)

        if 0 <= bin_number < nbins:
            # writing in the shared memory
            cuda.atomic.add(hist_buffer, bin_number, 1)

    cuda.syncthreads()  # this is important

    # move the tallied result back to the output array
    if tid < nbins:  # assuming blockdim >= nbins
        cuda.atomic.add(histogram, tid, hist_buffer[tid])
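A driver sketch for the shared-memory histogram above, assuming the defining module sets nbins = 64 and the kernel is decorated with @cuda.jit; the block size must be at least nbins so the final per-block flush covers every bin:

import numpy as np
from numba import cuda

x = np.random.uniform(0.0, 1.0, 1_000_000).astype(np.float32)
histogram = np.zeros(64, dtype=np.int32)        # one slot per bin (nbins = 64 assumed)

threads = 128                                   # >= nbins
blocks = 64
k__histogram_shmem[blocks, threads](x, np.float32(0.0), np.float32(1.0), histogram)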
Example #29
def fast_matmul(A, B, C):
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x    # blocks per grid


    if x >= C.shape[0] and y >= C.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return

    # Each thread computes one element in the result matrix.
    # The dot product is chunked into dot products of TPB-long vectors.
    tmp = 0.
    for i in range(bpg):
        print(x, ty, i, TPB, tx, i, TPB, y)
        # Preload data into shared memory
        sA[tx, ty] = A[x, ty + i * TPB] # row
        sB[tx, ty] = B[tx + i * TPB, y] # col

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]

        # Wait until all threads finish computing
        cuda.syncthreads()

#    print(x,y, tmp)
    C[x,y] = tmp
Example #30
def kernel_shared(dVARdt, VAR):
    sVAR = cuda.shared.array(shape=(shared_memory_size), dtype=float32)
    i, j, k = cuda.grid(3)
    si = cuda.threadIdx.x + 1
    sj = cuda.threadIdx.y + 1
    sk = cuda.threadIdx.z

    sVAR[si, sj, sk] = VAR[i, j, k]
    #cuda.syncthreads()

    if i_run_x:
        if si == 1:
            sVAR[si - 1, sj, sk] = VAR[i - 1, j, k]
        if si == cuda.blockDim.x:
            sVAR[si + 1, sj, sk] = VAR[i + 1, j, k]
    #cuda.syncthreads()

    if i_run_y:
        if sj == 1:
            sVAR[si, sj - 1, sk] = VAR[i, j - 1, k]
        if sj == cuda.blockDim.y:
            sVAR[si, sj + 1, sk] = VAR[i, j + 1, k]

    cuda.syncthreads()

    if i >= nb and i < nx + nb and j >= nb and j < ny + nb:
        tmp = wp(0.)
        if i_run_x:
            tmp += (sVAR[si + 1, sj, sk] - sVAR[si - 1, sj, sk] -
                    sVAR[si, sj, sk])
        if i_run_y:
            tmp += (sVAR[si, sj + 1, sk] - sVAR[si, sj - 1, sk] -
                    sVAR[si, sj, sk])
        if i_run_z:
            if k >= 1 and k < nz - 1:
                tmp += (sVAR[si, sj, sk + 1] - sVAR[si, sj, sk - 1] +
                        sVAR[si, sj, sk])
        dVARdt[i, j, k] = tmp
Example #31
File: utils.py Project: xKuZz/tfg
def cuda_sum(my_array, my_sums):
    # 1. Declare the shared memory
    shared_mem = cuda.shared.array(shape=128, dtype=numba.float32)

    # 2. Get the indices
    tidx = cuda.threadIdx.x
    idx = cuda.blockDim.x * cuda.blockIdx.x + tidx

    # 3. Initialize to zero
    shared_mem[tidx] = 0

    # 4. Each thread walks the array with a grid-sized stride
    while idx < my_array.size:
        shared_mem[tidx] += my_array[idx]

        idx += cuda.blockDim.x * cuda.gridDim.x
    cuda.syncthreads()

    # 5. Block-level unroll
    # We assume 128 threads per block.
    if tidx < 64:
        shared_mem[tidx] += shared_mem[tidx + 64]

    cuda.syncthreads()

    # 6. Unroll for a warp (saves the syncthreads calls)
    if tidx < 32:
        shared_mem[tidx] += shared_mem[tidx + 32]
        shared_mem[tidx] += shared_mem[tidx + 16]
        shared_mem[tidx] += shared_mem[tidx + 8]
        shared_mem[tidx] += shared_mem[tidx + 4]
        shared_mem[tidx] += shared_mem[tidx + 2]
        shared_mem[tidx] += shared_mem[tidx + 1]

    # The first thread of each block reports its block's sum
    # If the data spans more than one block, the kernel must be applied again afterwards
    if tidx == 0:
        my_sums[cuda.blockIdx.x] = shared_mem[tidx]
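A two-pass driver sketch for the kernel above (sizes are illustrative; the kernel hard-codes 128 threads per block and, as its final comment notes, must be re-applied when the data spans more than one block):

import numpy as np
from numba import cuda

data = np.random.rand(1 << 20).astype(np.float32)
threads = 128                                   # block size the kernel assumes
blocks = 256

partial = np.zeros(blocks, dtype=np.float32)
cuda_sum[blocks, threads](data, partial)        # first pass: one partial sum per block

total = np.zeros(1, dtype=np.float32)
cuda_sum[1, threads](partial, total)            # second pass over the partial sums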
Example #32
def convolutionColumnsGPU(d_Dst, d_Src, c_Kernel, imageW, imageH, pitch):
	COLUMNS_BLOCKDIM_X = 16
	COLUMNS_BLOCKDIM_Y = 8
	COLUMNS_RESULT_STEPS = 8
	COLUMNS_HALO_STEPS = 1
	KERNEL_RADIUS = 8

	#cuda.const.array_like(c_Kernel)
	#s_Data = cuda.shared.array(shape=(COLUMNS_BLOCKDIM_X,(COLUMNS_RESULT_STEPS + 2 * COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + 1), dtype=float32)
	s_Data = cuda.shared.array(shape=(16,81), dtype=float32)
			
	#Offset to the upper halo edge
	baseX = cuda.blockIdx.x * COLUMNS_BLOCKDIM_X + cuda.threadIdx.x 
	baseY = (cuda.blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + cuda.threadIdx.y 
	#d_Src += baseY * pitch + baseX 
	#d_Dst += baseY * pitch + baseX 
	desvio = baseY * pitch + baseX 

	#Main data
	for i in xrange(COLUMNS_HALO_STEPS,COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS):
		s_Data[cuda.threadIdx.x][cuda.threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = d_Src[desvio + (i * COLUMNS_BLOCKDIM_Y * pitch)] 

	#Upper halo
	for i in xrange(COLUMNS_HALO_STEPS):
		s_Data[cuda.threadIdx.x][cuda.threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =  d_Src[desvio + (i * COLUMNS_BLOCKDIM_Y * pitch)] if (baseY >= -i * COLUMNS_BLOCKDIM_Y) else 0 

	#Lower halo
	for i in xrange(COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS,COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS):
		s_Data[cuda.threadIdx.x][cuda.threadIdx.y + i * COLUMNS_BLOCKDIM_Y]= d_Src[desvio + (i * COLUMNS_BLOCKDIM_Y * pitch)] if (imageH - baseY > i * COLUMNS_BLOCKDIM_Y) else 0 

	#Compute and store results
	cuda.syncthreads() 
	for i in xrange(COLUMNS_HALO_STEPS,COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS):
		sum = 0.0
		for j in xrange(-KERNEL_RADIUS,KERNEL_RADIUS+1):
			sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[cuda.threadIdx.x][cuda.threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j] 

		d_Dst[desvio+(i * COLUMNS_BLOCKDIM_Y * pitch)] = sum 
Example #33
def uLocal1D(uvalsMatrix, Plist, Qlist, Rlist, vjInvList, djlist,
             quadCoeffsMatrix, JLvalMatrix, JRvalMatrix, wMatrix, leftComm,
             rightComm, JSizeList):

    i = cuda.blockIdx.x
    j = cuda.threadIdx.x
    JSize = JSizeList[i]
    P = Plist[i]
    Q = Qlist[i]
    R = Rlist[i]
    vjsqrinv = vjInvList[i]
    dj = djlist[i]

    calcLocalJs(uvalsMatrix[i, :], P, Q, R, vjsqrinv, dj,
                quadCoeffsMatrix[i, :], JLvalMatrix[i, :], JRvalMatrix[i, :],
                JSize)

    cuda.syncthreads()
    #wait for all threads to finish

    #this seems like a bad way to do this,
    #if we are on the first thread
    if j == 1:
        index = 0

        while index < JSize:
            if index != 0:
                JLvalMatrix[i][index] = dj * JLvalMatrix[i][
                    index - 1] + JLvalMatrix[i][index]
            if index != JSize - 1:
                reverseIndex = JSize - index - 1
                JRvalMatrix[i][
                    reverseIndex -
                    1] = dj * JRvalMatrix[i][reverseIndex] + JRvalMatrix[i][
                        reverseIndex - 1]
            index += 1

    cuda.syncthreads()

    wMatrix[i, j] = JLvalMatrix[i, j] + JRvalMatrix[i, j]

    cuda.syncthreads()

    if j == 1:
        leftComm[len(uvalsMatrix) - i - 1] = wMatrix[i][0]
        rightComm[i] = wMatrix[i][JSize - 1]

    cuda.syncthreads()
Example #34
def kernel_similarity_shared(objs, sim_matrix):
    """
    Calculates l2 distance between row and column in numba with shared memory.
    """
    shared_objs_row = cuda.shared.array((NTHREADS, NTHREADS), dtype=float32)
    shared_objs_col = cuda.shared.array((NTHREADS, NTHREADS), dtype=float32)
    row, col = cuda.grid(2)
    thread_row = cuda.threadIdx.x  # from 0 to NTHREADS - 1
    thread_col = cuda.threadIdx.y  # from 0 to NTHREADS - 1

    if row < sim_matrix.shape[0] and col < sim_matrix.shape[1]:
        current_sum = 0.0
        for i in range(BLOCKS_PER_GRID):
            shared_objs_row[thread_row,
                            thread_col] = objs[row, thread_col + i * NTHREADS]
            shared_objs_col[thread_row,
                            thread_col] = objs[col, thread_row + i * NTHREADS]
            cuda.syncthreads()
            for j in range(NTHREADS):
                current_sum += (shared_objs_row[thread_row, j] -
                                shared_objs_col[thread_col, j])**2
            cuda.syncthreads()
        sim_matrix[row, col] = current_sum
Example #35
def __GPU_reduce_flex_C(x, out, sz):
    tid = cuda.threadIdx.x
    i = cuda.blockIdx.x * (2 * THREADS) + tid
    step = (THREADS * 2) * cuda.gridDim.x
    end = sz - THREADS

    buf = cuda.shared.array((THREADS, 2), dtype=numba.float32)

    buf[tid, 0] = 0
    buf[tid, 1] = 0
    while i < end:
        buf[tid, 0] += x[i].real + x[i + THREADS].real
        buf[tid, 1] += x[i].imag + x[i + THREADS].imag
        i += step
    if i < sz:
        buf[tid, 0] += x[i].real
        buf[tid, 1] += x[i].imag
    cuda.syncthreads()

    __GPU_reduce_2(buf)
    if tid == 0:
        out[0, cuda.blockIdx.x] = buf[0, 0]
        out[1, cuda.blockIdx.x] = buf[0, 1]
Example #36
def scan_sum(g_data, aux):
    temp = cuda.shared.array(shape = 1, dtype = numba.i4)

    thid = cuda.threadIdx.x # thread id in block
    bid = cuda.blockIdx.x # block id  

    if thid == 0:
        temp[0] = aux[bid]

    tgid = cuda.grid(1) # thread id in grid
    elid = tgid * 2 # each thread processes 2 elements

    n = g_data.size

    if elid >= n:
        return
    
    cuda.syncthreads() # synchronize to make sure the value to sum is loaded in memory

    g_data[elid] += aux[bid] # do the sum

    if elid + 1 < n:
        g_data[elid + 1] += aux[bid]
Example #37
def vec_sum_row(vecs, sums):
    sm = cuda.shared.array(threadsperblock, float64)
    bid = cuda.blockIdx.x
    tid = cuda.threadIdx.x
    bdim = cuda.blockDim.x

    # load shared memory with vector using block-stride loop
    lid = tid
    sm[lid] = 0
    while lid < nCols:
        sm[tid] += vecs[bid, lid]
        lid += bdim
    cuda.syncthreads()

    # perform shared memory sweep reduction
    sweep = bdim // 2
    while sweep > 0:
        if tid < sweep:
            sm[tid] += sm[tid + sweep]
        sweep = sweep // 2
        cuda.syncthreads()
    if tid == 0:
        sums[bid] = sm[0]
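A driver sketch for vec_sum_row, assuming the defining module sets threadsperblock = 128 (a power of two, as the sweep reduction requires) and nCols = 1000, and that the kernel is decorated with @cuda.jit:

import numpy as np
from numba import cuda

vecs = np.random.rand(64, 1000)                 # 64 rows of nCols = 1000 values
sums = np.zeros(64)

vec_sum_row[vecs.shape[0], 128](vecs, sums)     # one block per row
np.testing.assert_allclose(sums, vecs.sum(axis=1))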
Example #38
def fastsumall_impl(a, out):
    tx = int32(cuda.threadIdx.x)
    gtx = tx + cuda.blockIdx.x * 1024
    gsize = 1024 * cuda.gridDim.x
    sz2 = a[0].size
    nc = a[0].shape[1]
    fshared = cuda.shared.array(shape=1024, dtype=float32)
    fidx = 0
    for ai in range(a.shape[0]):
        sumv = float32(0)
        for i in range(gtx,sz2,gsize):
            sumv += a[ai,i//nc,i%nc]
        fshared[tx] = sumv
        cuda.syncthreads()
        sz = int32(512)
        while sz>0:
            if tx<sz:
                fshared[tx] += fshared[tx+sz]
            cuda.syncthreads()
            sz//=2
        if tx==0:
            out[cuda.blockIdx.x + fidx] = fshared[0]
        fidx += cuda.gridDim.x
Example #39
def cu_sums1(nme, member, vel, virial_potential, coll, nblocks):
    sm = cuda.shared.array(256, nb.float32)
    i = cuda.grid(1)
    tx = cuda.threadIdx.x
    temp = nb.float32(0.0)
    if i < nme:
        idx = member[i]
        vi = vel[idx]
        mi = vi[3]
        temp = mi * (vi[0] * vi[0] + vi[1] * vi[1] + vi[2] * vi[2])

    sm[tx] = temp
    cuda.syncthreads()

    offs = cuda.blockDim.x >> nb.int32(1)
    while offs > nb.int32(0):
        if tx < offs:
            sm[tx] += sm[tx + offs]
        offs >>= nb.int32(1)
        cuda.syncthreads()

    if tx == nb.int32(0):
        coll[cuda.blockIdx.x] = sm[0]
Example #40
def calculate_forces(positions, weights, accelerations):
    """
    Calculate accelerations produced on all bodies by mutual gravitational
    forces.
    """
    sh_positions = cuda.shared.array((tile_size, 2), float32)
    sh_weights = cuda.shared.array(tile_size, float32)
    i = cuda.grid(1)
    axi = 0.0
    ayi = 0.0
    xi = positions[i, 0]
    yi = positions[i, 1]
    for j in range(0, len(weights), tile_size):
        index = (j // tile_size) * cuda.blockDim.x + cuda.threadIdx.x
        sh_index = cuda.threadIdx.x
        sh_positions[sh_index, 0] = positions[index, 0]
        sh_positions[sh_index, 1] = positions[index, 1]
        sh_weights[sh_index] = weights[index]
        cuda.syncthreads()
        axi, ayi = tile_calculation(xi, yi, axi, ayi, sh_positions, sh_weights)
        cuda.syncthreads()
    accelerations[i, 0] = axi
    accelerations[i, 1] = ayi
Example #41
def soil_temperature_euler_forward_gpu(dSOILTEMPdt, SOILTEMP, LWFLXNET, SWFLXNET,
                                    SOILCP, SOILRHO, SOILDEPTH, dt):

    nx = SOILTEMP.shape[0]
    ny = SOILTEMP.shape[1]
    nzs = LWFLXNET.shape[2]

    i, j = cuda.grid(2)

    dSOILTEMPdt[i,j,0] = 0.

    if i_radiation > 0:
        dSOILTEMPdt[i,j,0] = (LWFLXNET[i,j,nzs-1] + SWFLXNET[i,j,nzs-1])/ \
                        (SOILCP[i,j] * SOILRHO[i,j] * SOILDEPTH[i,j])

    #if i_microphysics > 0:
    #    dSOILTEMPdt = dSOILTEMPdt - ( MIC.surf_evap_flx * MIC.lh_cond_water ) / \
    #                                (CF.SOILCP * CF.SOILRHO * CF.SOILDEPTH)

    SOILTEMP[i,j,0] = SOILTEMP[i,j,0] + dt * dSOILTEMPdt[i,j,0]


    cuda.syncthreads()
Example #42
def lbp_texture(arry, hist):

    # We have 32*32 threads per block
    A = cuda.shared.array(shape=(32, 32), dtype=int32)

    # H = cuda.shared.array(BIN_COUNT, dtype=int32)
    x, y = cuda.grid(2)

    ty = cuda.threadIdx.x
    tx = cuda.threadIdx.y

    A[ty, tx] = arry[x, y]

    cuda.syncthreads()

    threadCountX = A.shape[0] - 1
    threadCountY = A.shape[1] - 1
    # If within the x range and y range then calculate the LBP descriptor
    # along with the histogram value for the specific bin

    # Otherwise ignore the value
    if (ty > 0 and (threadCountX - ty) > 0) and (tx > 0 and
                                                 (threadCountY - tx) > 0):
        #     # You can do the Processing here. ^_^
        code = 0
        #  We need to make sure that each value is accessible to each thread
        center = A[ty, tx]

        # Compiler optimization: By loop unrolling
        # turns out twice faster than rolled version for over
        # 16*16 window
        code |= (1 if A[ty - 1][tx - 1] > center else 0) << 7
        code |= (1 if A[ty][tx - 1] > center else 0) << 6
        code |= (1 if A[ty + 1][tx - 1] > center else 0) << 5
        code |= (1 if A[ty + 1][tx] > center else 0) << 4
        code |= (1 if A[ty + 1][tx + 1] > center else 0) << 3
        code |= (1 if A[ty][tx + 1] > center else 0) << 2
        code |= (1 if A[ty - 1][tx + 1] > center else 0) << 1
        code |= (1 if A[ty - 1][tx] > center else 0) << 0

        # Since atomic add; adds value to the existing value
        # Need to figure out the fraction to be added in the previous value
        code = (code - center)

        A[ty, tx] = code

        cuda.syncthreads()

        # It's fun to have a visible LBP texture,
        # so we override that with the original value.
        val = A[ty, tx]
        cuda.atomic.add(arry, (x, y), val)
        cuda.syncthreads()

        # This Atomic Operation is equivalent to  hist[code % 256] += 1
        ind = code % BIN_COUNT
        cuda.atomic.add(hist, ind, 1)
Example #43
def kernel_shared(dVARdt, VAR):
    sVAR = cuda.shared.array(shape=(shared_memory_size), dtype=float32)

    k, j, i = cuda.grid(3)
    si = cuda.threadIdx.z + 1
    sj = cuda.threadIdx.y + 1
    sk = cuda.threadIdx.x + 1

    sVAR[sk, sj, si] = VAR[k, j, i]
    #cuda.syncthreads()

    if si == 1:
        sVAR[sk, sj, si - 1] = VAR[k, j, i - 1]
    if si == cuda.blockDim.z:
        sVAR[sk, sj, si + 1] = VAR[k, j, i + 1]
    #cuda.syncthreads()

    if sj == 1:
        sVAR[sk, sj - 1, si] = VAR[k, j - 1, i]
    if sj == cuda.blockDim.y:
        sVAR[sk, sj + 1, si] = VAR[k, j + 1, i]
    #cuda.syncthreads()

    if sk == 1:
        sVAR[sk - 1, sj, si] = VAR[k - 1, j, i]
    if sk == cuda.blockDim.x:
        sVAR[sk + 1, sj, si] = VAR[k + 1, j, i]

    cuda.syncthreads()

    if i >= nb and i < nx + nb and j >= nb and j < ny + nb:
        tmp = wp(0.)
        tmp += (sVAR[sk, sj + 1, si] - sVAR[sk, sj - 1, si])
        tmp += (sVAR[sk, sj, si + 1] - sVAR[sk, sj, si - 1])
        if k >= 1 and k < nz - 1:
            tmp += (sVAR[sk + 1, sj, si] - sVAR[sk - 1, sj, si])
        dVARdt[k, j, i] = tmp
Example #44
def get_pp_col_time_kernel(pp_dt, pos, vel,  mask, N, radius):#, pp_dt_full, a_gpu, b_gpu, c_gpu):
    pp_dt_shr = cuda.shared.array(shape=(pp_brows, pp_bcols), dtype=nb_dtype)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    p = ty + cuda.blockIdx.y * cuda.blockDim.y
    q = tx + cuda.blockIdx.x * cuda.blockDim.x
    if ((p >= N) or (q >= N)):        
        pp_dt_shr[ty,tx] = np.inf
        a = p
        b = q
        c = N
    else:
        a = 0.0
        b = 0.0
        c = 0.0
        for d in range(dim):
            dx = pos[p,d] - pos[q,d]
            dv = vel[p,d] - vel[q,d]
            a += (dv * dv)
            b += (dx * dv * 2)
            c += (dx * dx)
        c -= (radius[p] + radius[q])**2

        if ((mask[0]==p) & (mask[1]==q)):
            masked = True
        elif ((mask[0]==q) & (mask[1]==p)):
            masked = True
        else:
            masked = False

        pp_dt_shr[ty,tx] = solve_quadratic_gpu(a, b, c, masked)

    cuda.syncthreads()

    row_min_gpu(pp_dt_shr)
    pp_dt[p, cuda.blockIdx.x] = pp_dt_shr[ty, 0]
    cuda.syncthreads()
Example #45
def convolutionRowsGPU(d_Dst, d_Src, c_Kernel, imageW, imageH, pitch):
	ROWS_BLOCKDIM_X = 16
	ROWS_BLOCKDIM_Y = 4
	ROWS_RESULT_STEPS = 8
	ROWS_HALO_STEPS = 1
	KERNEL_RADIUS = 8

	#s_Data = cuda.shared.array(shape=(ROWS_BLOCKDIM_Y,(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X),dtype=float32)
	s_Data = cuda.shared.array(shape=(4,160),dtype=float32)
	#Offset to the left halo edge
	baseX = (cuda.blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + cuda.threadIdx.x 
	baseY = cuda.blockIdx.y * ROWS_BLOCKDIM_Y + cuda.threadIdx.y 

	#d_Src += baseY * pitch + baseX 
	#d_Dst += baseY * pitch + baseX
	desvio = baseY * pitch + baseX

	#Load main data
	for i in xrange(ROWS_HALO_STEPS,ROWS_HALO_STEPS + ROWS_RESULT_STEPS):
		s_Data[cuda.threadIdx.y][cuda.threadIdx.x + i * ROWS_BLOCKDIM_X] = d_Src[desvio + (i * ROWS_BLOCKDIM_X)] 

	#Load left halo
	for i in xrange(ROWS_HALO_STEPS):
		s_Data[cuda.threadIdx.y][cuda.threadIdx.x + i * ROWS_BLOCKDIM_X] = d_Src[desvio + (i * ROWS_BLOCKDIM_X)] if (baseX >= -i * ROWS_BLOCKDIM_X ) else 0 

	#Load right halo
	for i in xrange(ROWS_HALO_STEPS + ROWS_RESULT_STEPS,ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS):
		s_Data[cuda.threadIdx.y][cuda.threadIdx.x + i * ROWS_BLOCKDIM_X] = d_Src[desvio  + (i * ROWS_BLOCKDIM_X)] if (imageW - baseX > i * ROWS_BLOCKDIM_X) else 0 

	#Compute and store results
	cuda.syncthreads() 
	for i in xrange(ROWS_HALO_STEPS,ROWS_HALO_STEPS + ROWS_RESULT_STEPS):
		sum = 0.0
		for j in xrange(-KERNEL_RADIUS,KERNEL_RADIUS+1):
			sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[cuda.threadIdx.y][cuda.threadIdx.x + i * ROWS_BLOCKDIM_X + j] 

		d_Dst[desvio + (i * ROWS_BLOCKDIM_X)] = sum 
Example #46
def GridDecompPath(grid, start, goal, parents, h, block):
    x, y = cuda.grid(2)
    glb_x, glb_y = dim
    goal_x, goal_y = goal

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x  # blocks per grid

    if x < grid.shape[0] and y < grid.shape[1]:
        # do the search for as many times as number of tiles in the grid
        if passable(grid, (x, y)) and (x != goal_x or y != goal_y):
            # print(x, y)
            # initialize local arrays
            local_open = cuda.local.array(dim, cp.int32)
            local_closed = cuda.local.array(dim, cp.int32)
            local_cost = cuda.local.array(dim, cp.int32)
            local_g = cuda.local.array(dim, cp.int32)
            local_neighbors = cuda.local.array((8, 2), cp.int32)

            for i in range(glb_x):
                for j in range(glb_y):
                    local_open[i, j] = UNEXPLORED
                    local_closed[i, j] = UNEXPLORED
                    local_cost[i, j] = 0
                    local_g[i, j] = 0
            cuda.syncthreads()

            for i in range(8):
                local_neighbors[i, 0] = 0
                local_neighbors[i, 1] = 0
            cuda.syncthreads()

            # search(x, y, shared_planning_block, (block_x,block_y), goal, local_open, local_closed, parents, local_cost, local_g, shared_h, local_neighbors)
            search(x, y, grid, (x, y), goal, local_open, local_closed,
                   parents[x,
                           y], local_cost, local_g, h, local_neighbors, block)
Example #47
def fast_matmul(a, b, c):
    sa = cuda.shared.array(shape=(TPB, TPB), dtype=float64)
    sb = cuda.shared.array(shape=(TPB, TPB), dtype=float64)

    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    # bpg = cuda.gridDim.x

    if x >= c.shape[0] and y >= c.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return

    tmp = 0
    for i in range(hid):
        sa[tx, ty] = a[x, ty + i * TPB]
        sb[tx, ty] = b[tx + i * TPB, y]
        cuda.syncthreads()

        for j in range(TPB):
            tmp += sa[tx, j] * sb[j, ty]
        cuda.syncthreads()
    c[x, y] = tmp
Example #48
def get_geopotential(PHI, PHIVB, PVTF, PVTFVB, \
                     POTT, HSURF):
    nx = PHIVB.shape[0] - 2
    ny = PHIVB.shape[1] - 2
    nzs = PHIVB.shape[2]
    i, j, ks = cuda.grid(3)
    if i > 0 and i < nx + 1 and j > 0 and j < ny + 1:
        kiter = nzs - 1
        if ks == kiter:
            PHIVB[i, j, ks] = HSURF[i, j] * con_g
        kiter = kiter - 1
        cuda.syncthreads()

        while kiter >= 0:
            if ks == kiter:
                PHI  [i,j,ks] = PHIVB[i,j,ks+1] - con_cp*  \
                                        ( POTT[i,j,ks] * (   PVTF  [i,j,ks  ] \
                                                           - PVTFVB[i,j,ks+1] ) )
                PHIVB[i,j,ks] = PHI  [i,j,ks  ] - con_cp * \
                                        ( POTT[i,j,ks] * (   PVTFVB[i,j,ks  ] \
                                                           - PVTF  [i,j,ks  ] ) )

            kiter = kiter - 1
            cuda.syncthreads()
Example #49
 def kernel_calc_force_rec(q, out, m, S_m_cos_parts, S_m_sin_parts,
                           S_m_cos_sum, S_m_sin_sum, dS_modul_sq, charges,
                           f_rec_prefactor, coeff_S):
     i = cuda.grid(1)
     if i < n_particles:
         for j in range(m.shape[1]):
             for k in range(n_dim):
                 S_m_cos_parts[i, j] += q[i, k] * m[k, j]
                 S_m_sin_parts[i, j] += q[i, k] * m[k, j]
             S_m_cos_parts[i, j] = math.cos(
                 2 * math.pi * S_m_cos_parts[i, j]) * charges[i]
             S_m_sin_parts[i, j] = math.sin(
                 2 * math.pi * S_m_sin_parts[i, j]) * charges[i]
             S_m_cos_sum[j] += S_m_cos_parts[i, j]
             S_m_sin_sum[j] += S_m_sin_parts[i, j]
         cuda.syncthreads()
         for j in range(m.shape[1]):
             dS_modul_sq[i, j] = coeff_S[j] * (
                 2. * S_m_cos_sum[j] * S_m_sin_parts[i, j] -
                 2. * S_m_sin_sum[j] * S_m_cos_parts[i, j])
         for j in range(m.shape[1]):
             for k in range(n_dim):
                 out[i,
                     k] += f_rec_prefactor[i] * dS_modul_sq[i, j] * m[k, j]
Example #50
def mm_shared(a, b, c):
    sum = 0

    # `a_cache` and `b_cache` are already correctly defined
    a_cache = cuda.shared.array(block_size, types.int32)
    b_cache = cuda.shared.array(block_size, types.int32)

    # TODO: use each thread to populate one element each a_cache and b_cache
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x
    TPB = int(N)

    for i in range(a.shape[1] / TPB):
        a_cache[tx, ty] = a[x, ty + i * TPB]
        b_cache[tx, ty] = b[tx + i * TPB, y]

    cuda.syncthreads()
    for j in range(TPB):  #a.shape[1]):
        # TODO: calculate the `sum` value correctly using values from the cache
        sum += a_cache[tx][j] * b_cache[j][ty]
    cuda.syncthreads()
    c[x][y] = sum
Example #51
def rotate_nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask):
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.y
    col_start = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 6,), dtype=numba.float32)
    dev_box_idx = threadsPerBlock * col_start + tx
    if tx < col_size:
        block_boxes[tx * 6 + 0] = dev_boxes[dev_box_idx * 6 + 0]
        block_boxes[tx * 6 + 1] = dev_boxes[dev_box_idx * 6 + 1]
        block_boxes[tx * 6 + 2] = dev_boxes[dev_box_idx * 6 + 2]
        block_boxes[tx * 6 + 3] = dev_boxes[dev_box_idx * 6 + 3]
        block_boxes[tx * 6 + 4] = dev_boxes[dev_box_idx * 6 + 4]
        block_boxes[tx * 6 + 5] = dev_boxes[dev_box_idx * 6 + 5]
    cuda.syncthreads()
    if tx < row_size:
        cur_box_idx = threadsPerBlock * row_start + tx
        # cur_box = dev_boxes + cur_box_idx * 5;
        t = 0
        start = 0
        if row_start == col_start:
            start = tx + 1
        for i in range(start, col_size):
            iou = devRotateIoU(
                dev_boxes[cur_box_idx * 6 : cur_box_idx * 6 + 5],
                block_boxes[i * 6 : i * 6 + 5],
            )
            # print('iou', iou, cur_box_idx, i)
            if iou > nms_overlap_thresh:
                t |= 1 << i
        col_blocks = (n_boxes) // (threadsPerBlock) + (
            (n_boxes) % (threadsPerBlock) > 0
        )
        dev_mask[cur_box_idx * col_blocks + col_start] = t
Example #52
def calculate_forces(positions, weights, accelerations):
    """
    Calculate accelerations produced on all bodies by mutual gravitational
    forces.
    """
    sh_positions = cuda.shared.array((tile_size, 2), float32)
    sh_weights = cuda.shared.array(tile_size, float32)
    i = cuda.grid(1)
    axi = 0.0
    ayi = 0.0
    xi = positions[i,0]
    yi = positions[i,1]
    for j in range(0, len(weights), tile_size):
        index = (j // tile_size) * cuda.blockDim.x + cuda.threadIdx.x
        sh_index = cuda.threadIdx.x
        sh_positions[sh_index,0] = positions[index,0]
        sh_positions[sh_index,1] = positions[index,1]
        sh_weights[sh_index] = weights[index]
        cuda.syncthreads()
        axi, ayi = tile_calculation(xi, yi, axi, ayi,
                                    sh_positions, sh_weights)
        cuda.syncthreads()
    accelerations[i,0] = axi
    accelerations[i,1] = ayi
Example #53
def BusquedaLocal_CUDA(nuevo_individuos, probabilidades, Aristmono,
                       MatrizAdjacencia, H, numNodos, tam_pobla, rng_states):
    bx = cuda.blockIdx.x
    thx = cuda.threadIdx.x
    id = bx * H + thx
    '''individuo = cuda.local.array(shape= (4,52), dtype= int32)
    probabilidad = cuda.local.array(shape = 4, dtype = float32)
    MatrizAdjCuda = cuda.local.array(shape=(52,52), dtype=int32)

    if id < tam_pobla:
        for j in range(tam_pobla):
            individuo = nuevo_individuos[id]
            probabilidad = probabilidades[id]
            Aristas = Aristmono[id]
        MatrizAdjCuda= MatrizAdjacencia
        
        metropolis_gpu(MatrizAdjCuda, individuo, probabilidad,Aristas,numColores,numNodos,rng_states,id)  # returns the individual after performing the local search
        cuda.syncthreads()
    '''
    metropolis_gpu(
        MatrizAdjacencia, nuevo_individuos[id], probabilidades[id],
        Aristmono[id], numColores, numNodos, rng_states,
        id)  # returns the individual after performing the local search
    cuda.syncthreads()
Example #54
def scan_sum(g_data, aux):
    temp = cuda.shared.array(shape=1, dtype=numba.i4)

    thid = cuda.threadIdx.x  # thread id in block
    bid = cuda.blockIdx.x  # block id

    if thid == 0:
        temp[0] = aux[bid]

    tgid = cuda.grid(1)  # thread id in grid
    elid = tgid * 2  # each thread processes 2 elements

    n = g_data.size

    if elid >= n:
        return

    # synchronize to make sure the value to sum is loaded in memory
    cuda.syncthreads()

    g_data[elid] += aux[bid]  # do the sum

    if elid + 1 < n:
        g_data[elid + 1] += aux[bid]
Example #55
    def gaussian_lu_decomposition(A, L, size, i):
        """ Performs Gaussian LU elimination.

        @param A Coefficient matrix A.
        @param L Matrix in which to store the multipliers.
        @param size Size of the coefficient matrix.
        @param i Integer representing the current column in which all threads
        are performing row operations.
        @return None
        """
        idx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        idy = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
        index = idx * size + idy

        if idx < size and idy < size:
            if idx > i:
                mul = A[idx * size + i] / A[i * size + i]
                if idy >= i:
                    A[index] -= A[i * size + idy] * mul
                    if idy == i:
                        L[index] = mul
            elif idx == idy:
                L[index] = 1
            cuda.syncthreads()
Example #56
    def reducer(inp, out, nelem, ostride):
        tid = cuda.threadIdx.x
        i = cuda.blockIdx.x * (blocksize * 2) + tid
        gridSize = blocksize * 2 * cuda.gridDim.x
        sdata = cuda.shared.array(blocksize, dtype=typ)

        while i < nelem:
            sdata[tid] = binop(inp[i], inp[i + blocksize])
            i += gridSize

        cuda.syncthreads()

        if blocksize >= 512:
            if tid < 256:
                sdata[tid] = binop(sdata[tid], sdata[tid + 256])
                cuda.syncthreads()

        if blocksize >= 256:
            if tid < 128:
                sdata[tid] = binop(sdata[tid], sdata[tid + 128])
                cuda.syncthreads()

        if blocksize >= 128:
            if tid < 64:
                sdata[tid] = binop(sdata[tid], sdata[tid + 64])
                cuda.syncthreads()

        if tid < 32:
            if blocksize >= 64:
                sdata[tid] = binop(sdata[tid], sdata[tid + 32])
            if blocksize >= 32:
                sdata[tid] = binop(sdata[tid], sdata[tid + 16])
            if blocksize >= 16:
                sdata[tid] = binop(sdata[tid], sdata[tid + 8])
            if blocksize >= 8:
                sdata[tid] = binop(sdata[tid], sdata[tid + 4])
            if blocksize >= 4:
                sdata[tid] = binop(sdata[tid], sdata[tid + 2])
            if blocksize >= 2:
                sdata[tid] = binop(sdata[tid], sdata[tid + 1])

        if tid == 0:
            out[cuda.blockIdx.x * ostride] = sdata[0]
Example #57
def jocabi_relax_core(A, Anew, error):
    err_sm = cuda.shared.array((tpb, tpb), dtype=f8)

    ty = cuda.threadIdx.x
    tx = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y

    n = A.shape[0]
    m = A.shape[1]

    i, j = cuda.grid(2)

    err_sm[ty, tx] = 0
    if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
        Anew[j, i] = 0.25 * ( A[j, i + 1] + A[j, i - 1] \
                            + A[j - 1, i] + A[j + 1, i])
        err_sm[ty, tx] = Anew[j, i] - A[j, i]

    cuda.syncthreads()

    # max-reduce err_sm vertically
    t = tpb // 2
    while t > 0:
        if ty < t:
            err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty + t, tx])
        t //= 2
        cuda.syncthreads()

    # max-reduce err_sm horizontally
    t = tpb // 2
    while t > 0:
        if tx < t and ty == 0:
            err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty, tx + t])
        t //= 2
        cuda.syncthreads()


    if tx == 0 and ty == 0:
        error[by, bx] = err_sm[0, 0]
Example #58
def last_scan(g_data, aux, auxidx, elb, start_idx):
    """
    Performs the Blelloch scan on the last block, whose size may be variable.
    g_data : array to perform the scan on
    aux : where to store the sum
    auxidx : where to store the sum in the aux array; if auxidx == -1 this is not part of
             a large array scan and the sums should not be stored
    elb : number of elements in the last block
    """
    temp = cuda.shared.array(shape = 0, dtype = int32)

    thid = cuda.threadIdx.x # thread id in block
    tgid = cuda.grid(1) # thread id in grid
    bid = cuda.blockIdx.x # block id

    bsize =  cuda.blockDim.x

    # load input into shared memory
    # if index is above number of elements in last block,
    # shared memory should be 0
    idx1 = 2 * thid
    idx2 = 2 * thid +1

    if idx1 < elb:
        temp[idx1] = g_data[start_idx + idx1]
    else:
        temp[idx1] = 0

    if idx2 < elb:
        temp[idx2] = g_data[start_idx + idx2]
    else:
        temp[idx2] = 0

    offset = 1

    # build sum in place up the tree
    d = bsize # bsize is half the number of elements to process
    while d > 0:
        # if thid == 0:
        #     from pdb import set_trace; set_trace()
        cuda.syncthreads()
        
        if thid < d:
            ai = offset * (2 * thid + 1) - 1
            bi = offset * (2 * thid + 2) - 1

            temp[bi] += temp[ai]
        offset <<= 1 # multiply by 2
        d >>= 1 # divide by 2

    # clear the last element
    if thid == 0:
        
        # the last element processed by this block is the size
        # of the block multiplied by 2
        last_elem_id = bsize * 2 - 1

        if auxidx != -1:
            #aux[auxidx] = temp[last_elem_id]
            aux[auxidx] = temp[last_elem_id]

        temp[last_elem_id] = 0
        
    # traverse down tree and build scan
    d = 1
    while d < bsize << 1: # same thing as before
        offset >>= 1
        cuda.syncthreads()
        
        if thid < d:
            ai = offset * (2 * thid + 1) - 1
            bi = offset * (2 * thid + 2) - 1
            
            t = temp[ai]
            temp[ai] = temp[bi]
            temp[bi] += t
            
        d <<= 1
        
    cuda.syncthreads()
    
    # write results to device memory, in global IDs
    if idx1 < elb:
        g_data[start_idx + idx1] = temp[idx1]
    if idx2 < elb:
        g_data[start_idx + idx2] = temp[idx2]
Example #59
def advanced_scan(g_odata, g_idata, n, aux):
    """
    Blelloch algorithm.
    Receives an auxiliary array in which to store the whole sum.
    Only works for arrays of max size 1024.
    Adapted to Numba CUDA from
        [1] M. Harris, S. Sengupta, and J. D. Owens, "Parallel Prefix Sum
        (Scan) with CUDA," GPU Gems 3, pp. 1-24, 2007.
    """
    temp = cuda.shared.array(shape = 0, dtype = numba.i4)

    thid = cuda.threadIdx.x # thread id in block
    tgid = cuda.grid(1) # thread id in grid
    bid = cuda.blockIdx.x # block id
    
    
    
    # load input into shared memory
    temp[2 * thid] = g_idata[2 * thid]
    temp[2 * thid + 1] = g_idata[2 * thid + 1]
    
    offset = 1

    # build sum in place up the tree
    d = n // 2
    while d > 0:
        cuda.syncthreads()
        
        if thid < d:
            ai = offset * (2 * thid + 1) - 1
            bi = offset * (2 * thid + 2) - 1

            temp[bi] += temp[ai]
        offset <<= 1 # multiply by 2
        d >>= 1 # divide by 2
    
    # clear the last element
    if thid == 0:
        temp[n - 1] = 0
        
    # traverse down tree and build scan
    d = 1
    while d < n:
        offset >>= 1
        cuda.syncthreads()
        
        if thid < d:
            ai = offset * (2 * thid + 1) - 1
            bi = offset * (2 * thid + 2) - 1
            
            t = temp[ai]
            temp[ai] = temp[bi]
            temp[bi] += t
            
        d *= 2
        
    cuda.syncthreads()
    
    # write results to device memory
    g_odata[2 * thid] = temp[2 * thid]
    g_odata[2 * thid + 1] = temp[2 * thid + 1]
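Since temp is declared with shape = 0, the scratch space is dynamic shared memory sized at launch time. A single-block launch sketch, assuming advanced_scan carries an @cuda.jit decorator:

import numpy as np
from numba import cuda

n = 1024                                  # maximum supported size (one block)
data = np.arange(n, dtype=np.int32)
out = np.zeros(n, dtype=np.int32)
aux = np.zeros(1, dtype=np.int32)

# each thread handles two elements, so launch n // 2 threads and pass
# n * 4 bytes of dynamic shared memory for the int32 temp array
advanced_scan[1, n // 2, 0, n * 4](out, data, n, aux)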
Example #60
    def reducer(inp, out, nelem, ostride):
        tid = cuda.threadIdx.x
        i = cuda.blockIdx.x * (blocksize * 2) + tid
        gridSize = blocksize * 2 * cuda.gridDim.x

        # Blocks perform most of the reduction within shared memory, in the
        # sdata array
        sdata = cuda.shared.array(sdatasize, dtype=typ)

        # The first reduction operation is performed during the process of
        # loading the data from global memory, in order to reduce the number of
        # idle threads (See "Reduction #4: First Add During Load")
        while i < nelem:
            sdata[tid] = binop(inp[i], inp[i + blocksize])
            i += gridSize

        # The following reduction steps rely on all values being loaded into
        # sdata; we need to synchronize in order to meet this condition
        cuda.syncthreads()

        # The following lines implement an unrolled loop that repeatedly reduces
        # the number of values by two (by performing the reduction operation)
        # until only a single value is left. This is done to reduce instruction
        # overhead (See the section "Instruction Bottleneck")
        if blocksize >= 512:
            if tid < 256:
                sdata[tid] = binop(sdata[tid], sdata[tid + 256])
                cuda.syncthreads()

        if blocksize >= 256:
            if tid < 128:
                sdata[tid] = binop(sdata[tid], sdata[tid + 128])
                cuda.syncthreads()

        if blocksize >= 128:
            if tid < 64:
                sdata[tid] = binop(sdata[tid], sdata[tid + 64])
                cuda.syncthreads()

        # At this point only the first warp has any work to do - we perform a
        # check on the thread ID here so that we can avoid calling syncthreads
        # (operations are synchronous within a warp) and also to avoid checking
        # the thread ID at each iteration (See the section "Unrolling the Last
        # Warp")
        if tid < 32:
            if blocksize >= 64:
                sdata[tid] = binop(sdata[tid], sdata[tid + 32])
            if blocksize >= 32:
                sdata[tid] = binop(sdata[tid], sdata[tid + 16])
            if blocksize >= 16:
                sdata[tid] = binop(sdata[tid], sdata[tid + 8])
            if blocksize >= 8:
                sdata[tid] = binop(sdata[tid], sdata[tid + 4])
            if blocksize >= 4:
                sdata[tid] = binop(sdata[tid], sdata[tid + 2])
            if blocksize >= 2:
                sdata[tid] = binop(sdata[tid], sdata[tid + 1])

        # Write this block's partially reduced value into the vector of all
        # partially-reduced values.
        if tid == 0:
            out[cuda.blockIdx.x * ostride] = sdata[0]