Beispiel #1
0
    def group_reduce_min(val):
        """
        First thread of first wave get the result
        """
        tid = roc.get_local_id(0)
        blksz = roc.get_local_size(0)
        wid = tid >> WAVEBITS
        lane = tid & (WAVESIZE - 1)

        sm_partials = roc.shared.array(WAVESIZE, dtype=dtype)

        val = wave_reduce_min(val)

        if lane == 0:
            sm_partials[wid] = val

        roc.barrier()

        val = sm_partials[lane] if tid < (blksz //
                                          WAVESIZE) else dtype(POS_INF)

        if wid == 0:
            val = wave_reduce_min(val)

        return val
Beispiel #2
0
 def twice(A):
     i = roc.get_global_id(0)
     d = A[i]
     # no argument defaults to global mem fence
     # which is the same for local in hsail
     roc.barrier()
     A[i] = d * 2
Beispiel #3
0
        def matmulfast(A, B, C):
            x = roc.get_global_id(0)
            y = roc.get_global_id(1)

            tx = roc.get_local_id(0)
            ty = roc.get_local_id(1)

            sA = roc.shared.array(shape=(blocksize, blocksize), dtype=float32)
            sB = roc.shared.array(shape=(blocksize, blocksize), dtype=float32)

            if x >= C.shape[0] or y >= C.shape[1]:
                return

            tmp = 0

            for i in range(gridsize):
                # preload
                sA[tx, ty] = A[x, ty + i * blocksize]
                sB[tx, ty] = B[tx + i * blocksize, y]
                # wait for preload to end
                roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
                # compute loop
                for j in range(blocksize):
                    tmp += sA[tx, j] * sB[j, ty]
                # wait for compute to end
                roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

            C[x, y] = tmp
 def twice(A):
     i = roc.get_global_id(0)
     d = A[i]
     # no argument defaults to global mem fence
     # which is the same for local in hsail
     roc.barrier()
     A[i] = d * 2
Beispiel #5
0
        def matmulfast(A, B, C):
            x = roc.get_global_id(0)
            y = roc.get_global_id(1)

            tx = roc.get_local_id(0)
            ty = roc.get_local_id(1)

            sA = roc.shared.array(shape=(blocksize, blocksize), dtype=float32)
            sB = roc.shared.array(shape=(blocksize, blocksize), dtype=float32)

            if x >= C.shape[0] or y >= C.shape[1]:
                return

            tmp = 0

            for i in range(gridsize):
                # preload
                sA[tx, ty] = A[x, ty + i * blocksize]
                sB[tx, ty] = B[tx + i * blocksize, y]
                # wait for preload to end
                roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
                # compute loop
                for j in range(blocksize):
                    tmp += sA[tx, j] * sB[j, ty]
                # wait for compute to end
                roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

            C[x, y] = tmp
Beispiel #6
0
def atomic_add(ary):
    tid = roc.get_local_id(0)
    sm = roc.shared.array(32, numba.uint32)
    sm[tid] = 0
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    bin = ary[tid] % 32
    roc.atomic.add(sm, bin, 1)
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    ary[tid] = sm[tid]
Beispiel #7
0
def atomic_add3(ary):
    tx = roc.get_local_id(0)
    ty = roc.get_local_id(1)
    sm = roc.shared.array((4, 8), numba.uint32)
    sm[tx, ty] = ary[tx, ty]
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    roc.atomic.add(sm, (tx, numba.uint64(ty)), 1)
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    ary[tx, ty] = sm[tx, ty]
Beispiel #8
0
def atomic_add3(ary):
    tx = roc.get_local_id(0)
    ty = roc.get_local_id(1)
    sm = roc.shared.array((4, 8), numba.uint32)
    sm[tx, ty] = ary[tx, ty]
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    roc.atomic.add(sm, (tx, numba.uint64(ty)), 1)
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    ary[tx, ty] = sm[tx, ty]
Beispiel #9
0
def atomic_add(ary):
    tid = roc.get_local_id(0)
    sm = roc.shared.array(32, numba.uint32)
    sm[tid] = 0
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    bin = ary[tid] % 32
    roc.atomic.add(sm, bin, 1)
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    ary[tid] = sm[tid]
        def reverse_array(A):
            sm = roc.shared.array(shape=blocksize, dtype=float32)
            i = roc.get_global_id(0)

            # preload
            sm[i] = A[i]
            # barrier
            roc.barrier(roc.CLK_LOCAL_MEM_FENCE)  # local mem fence
            # write
            A[i] += sm[blocksize - 1 - i]
Beispiel #11
0
        def reverse_array(A):
            sm = roc.shared.array(shape=blocksize, dtype=float32)
            i = roc.get_global_id(0)

            # preload
            sm[i] = A[i]
            # barrier
            roc.barrier(roc.CLK_LOCAL_MEM_FENCE)  # local mem fence
            # write
            A[i] += sm[blocksize - 1 - i]
Beispiel #12
0
def local_inclusive_scan_shuf(tid, value, nelem, temp):
    """
    * temp: shared array
        Size of the array must be at least the number of active wave

    Note: This function must be called by all threads in the block
    """
    roc.barrier()
    roc.wavebarrier()
    res = shuf_device_inclusive_scan(value, temp)
    roc.barrier()
    return res
Beispiel #13
0
def blockwise_prefixsum(value, temp, nelem):
    tid = roc.get_local_id(0)

    # inc_val = local_inclusive_scan_shuf(tid, value, nelem, data)
    inc_val = local_inclusive_scan(tid, value, nelem, temp)

    roc.barrier()
    if tid + 1 == nelem:
        # the last value stores the sum at index 0
        temp[0] = inc_val
    else:
        # the other value stores at the next slot for exclusive scan value
        temp[tid + 1] = inc_val

    roc.barrier()

    # Read the sum
    the_sum = temp[0]

    roc.barrier()

    # Reset first slot to zero
    if tid == 0:
        temp[0] = 0

    roc.barrier()
    return the_sum
Beispiel #14
0
def blockwise_prefixsum_naive(data, nelem):
    last = data[nelem - 1]
    roc.barrier()

    tid = roc.get_local_id(0)

    if tid == 0:
        psum = 0
        for i in range(nelem):
            cur = data[i]
            data[i] = psum
            psum += cur

    roc.barrier()

    return last + data[nelem - 1]
Beispiel #15
0
    def roc_multi_kde(support, samples, bandwidths, pdf):
        """
        Expects 2d arrays for samples and support: (num_observations,
        num_variables)
        """
        nvar = support.shape[1]
        i = roc.get_global_id(0)
        tid = roc.get_local_id(0)
        valid = i < support.shape[0]

        sum = 0

        sm_samples = roc.shared.array(SAMPLES_SIZE, dtype=float64)
        sm_bandwidths = roc.shared.array(MAX_NDIM, dtype=float64)
        sm_support = roc.shared.array(SAMPLES_SIZE, dtype=float64)

        if valid:
            for k in range(nvar):
                sm_support[k, tid] = support[i, k]

        if tid < nvar:
            sm_bandwidths[tid] = bandwidths[tid]

        for base in range(0, samples.shape[0], BLOCKSIZE):
            loadcount = min(samples.shape[0] - base, BLOCKSIZE)

            roc.barrier()

            # Preload samples tile
            if tid < loadcount:
                for k in range(nvar):
                    sm_samples[k, tid] = samples[base + tid, k]

            roc.barrier()

            # Compute on the tile
            if valid:
                for j in range(loadcount):
                    prod = 1
                    for k in range(nvar):
                        bw = sm_bandwidths[k]
                        diff = sm_samples[k, j] - sm_support[k, tid]
                        prod *= kernel(diff / bw) / bw
                    sum += prod

        if valid:
            pdf[i] = sum / samples.shape[0]
Beispiel #16
0
def device_scan_generic(tid, data):
    """Inclusive prefix sum within a single block

    Requires tid should have range [0, data.size) and data.size must be
    power of 2.
    """
    n = data.size

    # Upsweep
    offset = 1
    d = n // 2
    while d > 0:
        roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
        if tid < d:
            ai = offset * (2 * tid + 1) - 1
            bi = offset * (2 * tid + 2) - 1
            data[bi] += data[ai]

        offset *= 2
        d //= 2

    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    prefixsum = data[n - 1]
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    if tid == 0:
        data[n - 1] = 0

    # Downsweep
    d = 1
    offset = n
    while d < n:
        offset //= 2
        roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
        if tid < d:
            ai = offset * (2 * tid + 1) - 1
            bi = offset * (2 * tid + 2) - 1

            tmp = data[ai]
            data[ai] = data[bi]
            data[bi] += tmp

        d *= 2

    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    return prefixsum
Beispiel #17
0
def device_scan_generic(tid, data):
    """Inclusive prefix sum within a single block

    Requires tid should have range [0, data.size) and data.size must be
    power of 2.
    """
    n = data.size

    # Upsweep
    offset = 1
    d = n // 2
    while d > 0:
        roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
        if tid < d:
            ai = offset * (2 * tid + 1) - 1
            bi = offset * (2 * tid + 2) - 1
            data[bi] += data[ai]

        offset *= 2
        d //= 2

    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    prefixsum = data[n - 1]
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    if tid == 0:
        data[n - 1] = 0

    # Downsweep
    d = 1
    offset = n
    while d < n:
        offset //= 2
        roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
        if tid < d:
            ai = offset * (2 * tid + 1) - 1
            bi = offset * (2 * tid + 2) - 1

            tmp = data[ai]
            data[ai] = data[bi]
            data[bi] += tmp

        d *= 2

    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    return prefixsum
Beispiel #18
0
        def kernel_local_shuffle(data, size, shift, blocksum, localscan,
                                 shuffled, indices, store_indices):
            tid = roc.get_local_id(0)
            blkid = roc.get_group_id(0)
            blksz = localscan.shape[1]

            sm_mask = roc.shared.array(shape=mask_shape, dtype=int32)
            sm_blocksum = roc.shared.array(shape=4, dtype=int32)
            sm_shuffled = roc.shared.array(shape=block_size, dtype=uintp)
            sm_indices = roc.shared.array(shape=block_size, dtype=uintp)
            sm_localscan = roc.shared.array(shape=block_size, dtype=int32)
            sm_localscan[tid] = -1

            dataid = blkid * blksz + tid
            valid = dataid < size and tid < blksz
            curdata = uintp(data[dataid] if valid else uintp(0))
            processed_data = uintp((curdata >> uintp(shift))
                                   & uintp(RADIX_MINUS_1))

            chunk_offset, scanval = four_way_scan(processed_data, sm_mask,
                                                  sm_blocksum, blksz, valid)

            if tid < RADIX:
                blocksum[tid, blkid] = sm_blocksum[tid]

            if tid < blksz:
                # Store local scan value
                where = chunk_offset + scanval
                # Store shuffled value and indices
                shuffled[blkid, where] = curdata
                if store_indices and valid:
                    sm_indices[where] = indices[dataid]
                sm_localscan[where] = scanval

            # Cleanup
            roc.barrier()
            if tid < blksz:
                # shuffled[blkid, tid] = sm_shuffled[tid]
                if store_indices and valid:
                    indices[dataid] = sm_indices[tid]
                localscan[blkid, tid] = sm_localscan[tid]
Beispiel #19
0
def local_shuffle(tid, value, mask, temp):
    """
    * temp: shared array
        Size of the array must be at least the number of threads

    Note: This function must be called by all threads in the block
    """
    roc.barrier(0)
    temp[tid] = value
    roc.barrier(0)
    output = temp[mask]
    roc.barrier(0)
    return output
Beispiel #20
0
def shuf_device_inclusive_scan(data, temp):
    """
    Args
    ----
    data: scalar
        input for tid
    temp: shared memory for temporary work, requires at least
    threadcount/wavesize storage
    """
    tid = roc.get_local_id(0)
    lane = tid & (_WARPSIZE - 1)
    warpid = tid >> 6

    roc.barrier()

    # Scan warps in parallel
    warp_scan_res = shuf_wave_inclusive_scan(data)

    roc.barrier()

    # Store partial sum into shared memory
    if lane == (_WARPSIZE - 1):
        temp[warpid] = warp_scan_res

    roc.barrier()

    # Scan the partial sum by first wave
    if warpid == 0:
        temp[lane] = shuf_wave_inclusive_scan(temp[lane])

    roc.barrier()

    # Get block sum for each wave
    blocksum = 0  # first wave is 0
    if warpid > 0:
        blocksum = temp[warpid - 1]

    return warp_scan_res + blocksum
Beispiel #21
0
def shuf_device_inclusive_scan(data, temp):
    """
    Args
    ----
    data: scalar
        input for tid
    temp: shared memory for temporary work, requires at least
    threadcount/wavesize storage
    """
    tid = roc.get_local_id(0)
    lane = tid & (_WARPSIZE - 1)
    warpid = tid >> 6

    # Scan warps in parallel
    warp_scan_res = shuf_wave_inclusive_scan_int32(data)

    roc.barrier()

    # Store partial sum into shared memory
    if lane == (_WARPSIZE - 1):
        temp[warpid] = warp_scan_res

    roc.barrier()

    # Scan the partial sum by first wave
    if warpid == 0:
        shuf_wave_inclusive_scan_int32(temp[lane])

    roc.barrier()

    # Get block sum for each wave
    blocksum = 0    # first wave is 0
    if warpid > 0:
        blocksum = temp[warpid - 1]

    return warp_scan_res + blocksum
Beispiel #22
0
        def four_way_scan(data, sm_masks, sm_blocksum, blksz, valid):
            sm_chunkoffset = roc.shared.array(4, dtype=int32)

            tid = roc.get_local_id(0)

            laneid = tid & (_WARPSIZE - 1)
            warpid = tid >> 6

            my_digit = -1

            for digit in range(RADIX):
                sm_masks[digit, tid] = 0
                if valid and data == digit:
                    sm_masks[digit, tid] = 1
                    my_digit = digit

            roc.barrier()

            offset = 0
            base = 0
            while offset < blksz:
                # Exclusive scan
                if warpid < RADIX:
                    val = intp(sm_masks[warpid, offset + laneid])
                    cur, psum = shuf_wave_exclusive_scan(val)
                    sm_masks[warpid, offset + laneid] = cur + base
                    base += psum

                roc.barrier()
                offset += _WARPSIZE

            roc.barrier()

            # Store blocksum from the exclusive scan
            if warpid < RADIX and laneid == 0:
                sm_blocksum[warpid] = base

            roc.barrier()
            # Calc chunk offset (a short exclusive scan)
            if tid == 0:
                sm_chunkoffset[0] = 0
                sm_chunkoffset[1] = sm_blocksum[0]
                sm_chunkoffset[2] = sm_chunkoffset[1] + sm_blocksum[1]
                sm_chunkoffset[3] = sm_chunkoffset[2] + sm_blocksum[2]

            roc.barrier()
            # Prepare output
            chunk_offset = -1
            scanval = -1

            if my_digit != -1:
                chunk_offset = sm_chunkoffset[my_digit]
                scanval = sm_masks[my_digit, tid]

            roc.wavebarrier()
            roc.barrier()

            return chunk_offset, scanval
Beispiel #23
0
 def twice(A):
     i = roc.get_global_id(0)
     d = A[i]
     roc.barrier(roc.CLK_LOCAL_MEM_FENCE)  # local mem fence
     A[i] = d * 2
Beispiel #24
0
def device_scan(tid, data, temp, inclusive):
    """
    Args
    ----
    tid:
        thread id
    data: scalar
        input for tid
    temp: shared memory for temporary work
    """
    lane = tid & (_WARPSIZE - 1)
    warpid = tid >> 6

    # Preload
    temp[tid] = data
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Scan warps in parallel
    warp_scan_res = warp_scan(tid, temp, inclusive)
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Get parital result
    if lane == (_WARPSIZE - 1):
        temp[warpid] = temp[tid]
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Scan the partial results
    if warpid == 0:
        warp_scan(tid, temp, True)
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Accumlate scanned partial results
    if warpid > 0:
        warp_scan_res += temp[warpid - 1]
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Output
    if tid == temp.size - 1:
        # Last thread computes prefix sum
        if inclusive:
            temp[0] = warp_scan_res
        else:
            temp[0] = warp_scan_res + data

    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Load prefixsum
    prefixsum = temp[0]
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    return warp_scan_res, prefixsum
Beispiel #25
0
def device_scan(tid, data, temp, inclusive):
    """
    Args
    ----
    tid:
        thread id
    data: scalar
        input for tid
    temp: shared memory for temporary work
    """
    lane = tid & (_WARPSIZE - 1)
    warpid = tid >> 6

    # Preload
    temp[tid] = data
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Scan warps in parallel
    warp_scan_res = warp_scan(tid, temp, inclusive)
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Get parital result
    if lane == (_WARPSIZE - 1):
        temp[warpid] = temp[tid]
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Scan the partial results
    if warpid == 0:
        warp_scan(tid, temp, True)
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Accumlate scanned partial results
    if warpid > 0:
        warp_scan_res += temp[warpid - 1]
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Output
    if tid == temp.size - 1:
        # Last thread computes prefix sum
        if inclusive:
            temp[0] = warp_scan_res
        else:
            temp[0] = warp_scan_res + data

    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Load prefixsum
    prefixsum = temp[0]
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    return warp_scan_res, prefixsum
 def twice(A):
     i = roc.get_global_id(0)
     d = A[i]
     roc.barrier(roc.CLK_LOCAL_MEM_FENCE)  # local mem fence
     A[i] = d * 2