Esempio n. 1
0
        def f(x, y):
            # Kernel body: every block stages its slice of `x` in shared
            # memory and writes that slice to `y` in reversed order within
            # the block.
            local_id = jit.threadIdx.x
            block_size = jit.blockDim.x
            block_id = jit.blockIdx.x
            # Global element index handled by this thread.
            global_id = local_id + block_id * block_size

            # Per-block scratch buffer with 32 int32 slots.
            # NOTE(review): assumes blockDim.x <= 32, otherwise the store
            # below runs past the buffer -- confirm against the launch config.
            scratch = jit.shared_memory(numpy.int32, 32)
            scratch[local_id] = x[global_id]
            # Barrier: all stores to `scratch` must be done before any thread
            # reads a slot written by another thread.
            jit.syncthreads()
            # Read the mirrored slot of the block.
            y[global_id] = scratch[block_size - local_id - 1]
Esempio n. 2
0
def reduction(x, y, size):
    # Sums the first `size` elements of `x` into y[0], accumulating in
    # float32. Only threadIdx.x / blockDim.x are used for indexing, so every
    # launched block would walk the same elements and store to y[0].
    # NOTE(review): presumably launched with a single block -- confirm.
    stride = jit.blockDim.x
    lane = jit.threadIdx.x

    # Phase 1: each thread accumulates elements lane, lane + stride, ...
    acc = cupy.float32(0)
    for k in range(lane, size, stride):
        acc += x[k]

    # Phase 2: publish the per-thread partial sum in shared memory.
    # NOTE(review): the buffer has 1024 slots, so blockDim.x <= 1024 is
    # assumed -- confirm against the launch config.
    partials = jit.shared_memory(cupy.float32, 1024)
    partials[lane] = acc

    # Barrier: every partial must be stored before thread 0 reads them.
    jit.syncthreads()

    # Phase 3: thread 0 adds up the partials serially and writes the result.
    if lane == cupy.uint32(0):
        acc = cupy.float32(0)
        for k in range(stride):
            acc += partials[k]
        y[0] = acc
Esempio n. 3
0
def reduction(x, y, size):
    # Multi-block sum of the first `size` elements of `x`, accumulated in
    # float32. Each block reduces its share and atomically adds the block
    # total onto y[0].
    # NOTE(review): y[0] is only added to, never reset here -- presumably the
    # caller zero-initializes it; confirm.
    lane = jit.threadIdx.x
    block_size = jit.blockDim.x
    first = jit.blockIdx.x * block_size + lane
    step = block_size * jit.gridDim.x

    # Phase 1: grid-stride loop -- this thread handles first, first + step, ...
    acc = cupy.float32(0)
    for k in range(first, size, step):
        acc += x[k]

    # Phase 2: publish the per-thread partial sum in shared memory.
    # NOTE(review): the buffer has 1024 slots, so blockDim.x <= 1024 is
    # assumed -- confirm against the launch config.
    partials = jit.shared_memory(cupy.float32, 1024)
    partials[lane] = acc

    # Barrier: every partial must be stored before thread 0 reads them.
    jit.syncthreads()

    # Phase 3: thread 0 of each block sums the block's partials serially and
    # contributes the block total with an atomic add.
    if lane == cupy.uint32(0):
        acc = cupy.float32(0)
        for k in range(block_size):
            acc += partials[k]
        jit.atomic_add(y, 0, acc)
Esempio n. 4
0
 def f(x, y, buf):
     # Copies `x` to `y` through `buf`: the first pass stores `x` reversed
     # into `buf`, the second pass reads `buf` reversed again. Threads are
     # linearized over the x and y dimensions of the block; blockIdx is not
     # used, so the indexing is per block only.
     width = jit.blockDim.x
     count = width * jit.blockDim.y
     me = jit.threadIdx.x + jit.threadIdx.y * width
     # Index of the element mirrored across the block.
     mirror = count - me - 1
     buf[me] = x[mirror]
     # Barrier: the read below touches a slot written by another thread.
     jit.syncthreads()
     y[me] = buf[mirror]
Esempio n. 5
0
 def f(x, y):
     # Each thread copies the element at its threadIdx.x from `x` to `y`,
     # waits at a barrier, then invokes g(1)(y).
     tid = jit.threadIdx.x
     y[tid] = x[tid]
     # Barrier placed between the copy above and the call below.
     jit.syncthreads()
     # NOTE(review): `g` is not defined in this snippet. The expression calls
     # g(1) and then calls its result with `y` -- confirm what `g` is and
     # whether this call form is meant to be valid inside the kernel.
     g(1)(y)