def coop_syncwarp(res):
    """Tree-reduce the values 0..31 into ``res[0]`` using cooperative
    warp synchronization with progressively narrower masks.

    Each reduction step halves the number of active lanes and syncs only
    the lanes that still participate (mask shrinks 0xFFFFFFFF -> 0xFFFF
    -> 0xFF -> 0xF -> 0x3).

    NOTE(review): the 32-slot shared array and the masks assume this is
    launched with a single 32-thread warp — confirm at the launch site.
    """
    # One shared-memory slot per lane of the warp.
    sm = cuda.shared.array(32, int32)
    i = cuda.grid(1)
    sm[i] = i
    # Default syncwarp: all lanes participate.
    cuda.syncwarp()
    if i < 16:
        sm[i] = sm[i] + sm[i + 16]
    # Only the 16 surviving lanes need to synchronize from here on.
    cuda.syncwarp(0xFFFF)
    if i < 8:
        sm[i] = sm[i] + sm[i + 8]
    cuda.syncwarp(0xFF)
    if i < 4:
        sm[i] = sm[i] + sm[i + 4]
    cuda.syncwarp(0xF)
    if i < 2:
        sm[i] = sm[i] + sm[i + 2]
    cuda.syncwarp(0x3)
    if i == 0:
        # Final pairwise combine; only lane 0 writes the result.
        res[0] = sm[0] + sm[1]
def device_reduce_full_block(arr, partials, sm_partials):
    """
    Partially reduce `arr` into `partials` using `sm_partials` as working
    space.

    The algorithm goes like:

        array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                    block-0:  |   x |     |     |   x |     |
                    block-1:  |     |   x |     |     |   x |
                    block-2:  |     |     |   x |     |     |

    The array is divided into chunks of 128 (size of a threadblock).
    The threadblocks consumes the chunks in roundrobin scheduling.
    First, a threadblock loads a chunk into temp memory.  Then, all
    subsequent chunks are combined into the temp memory.

    Once all chunks are processed.  Inner-block reduction is performed
    on the temp memory.  So that, there will just be one scalar result
    per block.  The result from each block is stored to `partials` at
    the dedicated slot.

    NOTE(review): no bounds check on the first load — assumes the grid
    covers no more than `arr.size` threads ("full block" variant).
    """
    tid = cuda.threadIdx.x
    blkid = cuda.blockIdx.x
    blksz = cuda.blockDim.x
    gridsz = cuda.gridDim.x

    # block strided loop to compute the reduction
    start = tid + blksz * blkid
    stop = arr.size
    step = blksz * gridsz

    # load first value
    tmp = arr[start]
    # loop over all values in block-stride
    for i in range(start + step, stop, step):
        tmp = reduce_op(tmp, arr[i])

    cuda.syncthreads()
    # inner-warp reduction
    inner_warp_reduction(sm_partials, tmp)

    cuda.syncthreads()
    # at this point, only the first slot for each warp in sm_partials
    # is valid.

    # finish up block reduction
    # warning: this is assuming 4 warps.
    # assert numwarps == 4
    if tid < 2:
        # Combine warp 0 with warp 2, warp 1 with warp 3.
        sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
                                        sm_partials[tid + 2, 0])
    cuda.syncwarp()
    if tid == 0:
        # Thread 0 writes this block's scalar result to its slot.
        partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
def inner_warp_reduction(sm_partials, init):
    """
    Compute reduction within a single warp.

    Each lane stores `init` into its slot of this warp's row of
    `sm_partials`, then the row is tree-reduced with `reduce_op`,
    halving the active lane count each step.  On return, only
    ``sm_partials[warpid, 0]`` holds a valid (fully reduced) value.
    """
    tid = cuda.threadIdx.x
    warpid = tid // _WARPSIZE
    laneid = tid % _WARPSIZE

    # This warp's private row of the shared working space.
    sm_this = sm_partials[warpid, :]
    sm_this[laneid] = init
    # Ensure all lanes have published their value before reducing.
    cuda.syncwarp()

    width = _WARPSIZE // 2
    while width:
        if laneid < width:
            old = sm_this[laneid]
            # Fold the upper half of the active range into the lower half.
            sm_this[laneid] = reduce_op(old, sm_this[laneid + width])
        # Sync the whole warp between steps so reads see prior writes.
        cuda.syncwarp()
        width //= 2
def useful_syncwarp(ary):
    """Thread 0 publishes 42 into ``ary[0]``; after a full-mask warp
    barrier, every thread copies that value into its own slot.

    The syncwarp here is load-bearing: without it, threads other than 0
    could read ``ary[0]`` before the write lands.
    """
    tid = cuda.grid(1)
    if tid == 0:
        ary[0] = 42
    # Full-warp mask: every lane participates in the barrier.
    cuda.syncwarp(0xffffffff)
    ary[tid] = ary[0]
def useless_syncwarp_with_mask(ary):
    """Store each thread's global index into its own slot of ``ary``.

    The masked warp barrier beforehand has no effect on the result —
    each thread only touches its own element.
    """
    tid = cuda.grid(1)
    # Half-warp mask (lanes 0-15); the sync does not order anything here.
    cuda.syncwarp(0xFFFF)
    ary[tid] = tid
def useless_syncwarp(ary):
    """Store each thread's global index into its own slot of ``ary``.

    The warp barrier is unnecessary for correctness — no thread reads
    another thread's write.
    """
    tid = cuda.grid(1)
    # Default (full-warp) syncwarp; a no-op as far as the result goes.
    cuda.syncwarp()
    ary[tid] = tid