Exemple #1
0
        def matmulfast(A, B, C):
            x = hsa.get_global_id(0)
            y = hsa.get_global_id(1)

            tx = hsa.get_local_id(0)
            ty = hsa.get_local_id(1)

            sA = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32)
            sB = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32)

            if x >= C.shape[0] or y >= C.shape[1]:
                return

            tmp = 0

            for i in range(gridsize):
                # preload
                sA[tx, ty] = A[x, ty + i * blocksize]
                sB[tx, ty] = B[tx + i * blocksize, y]
                # wait for preload to end
                hsa.barrier(1)
                # compute loop
                for j in range(blocksize):
                    tmp += sA[tx, j] * sB[j, ty]
                # wait for compute to end
                hsa.barrier(1)

            C[x, y] = tmp
Exemple #2
0
        def matmulfast(A, B, C):
            x = hsa.get_global_id(0)
            y = hsa.get_global_id(1)

            tx = hsa.get_local_id(0)
            ty = hsa.get_local_id(1)

            sA = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32)
            sB = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32)

            if x >= C.shape[0] or y >= C.shape[1]:
                return

            tmp = 0

            for i in range(gridsize):
                # preload
                sA[tx, ty] = A[x, ty + i * blocksize]
                sB[tx, ty] = B[tx + i * blocksize, y]
                # wait for preload to end
                hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
                # compute loop
                for j in range(blocksize):
                    tmp += sA[tx, j] * sB[j, ty]
                # wait for compute to end
                hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)

            C[x, y] = tmp
Exemple #3
0
def atomic_add3(ary):
    tx = hsa.get_local_id(0)
    ty = hsa.get_local_id(1)
    sm = hsa.shared.array((4, 8), numba.uint32)
    sm[tx, ty] = ary[tx, ty]
    hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
    hsa.atomic.add(sm, (tx, numba.uint64(ty)), 1)
    hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
    ary[tx, ty] = sm[tx, ty]
Exemple #4
0
def atomic_add2(ary):
    tx = hsa.get_local_id(0)
    ty = hsa.get_local_id(1)
    sm = hsa.shared.array((4, 8), numba.uint32)
    sm[tx, ty] = ary[tx, ty]
    hsa.barrier(1)
    hsa.atomic.add(sm, (tx, ty), 1)
    hsa.barrier(1)
    ary[tx, ty] = sm[tx, ty]
Exemple #5
0
def atomic_add3(ary):
    tx = hsa.get_local_id(0)
    ty = hsa.get_local_id(1)
    sm = hsa.shared.array((4, 8), numba.uint32)
    sm[tx, ty] = ary[tx, ty]
    hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
    hsa.atomic.add(sm, (tx, numba.uint64(ty)), 1)
    hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
    ary[tx, ty] = sm[tx, ty]
Exemple #6
0
    def group_reduce_min(val):
        """
        First thread of first wave get the result
        """
        tid = hsa.get_local_id(0)
        blksz = hsa.get_local_size(0)
        wid = tid >> WAVEBITS
        lane = tid & (WAVESIZE - 1)

        sm_partials = hsa.shared.array(WAVESIZE, dtype=dtype)

        val = wave_reduce_min(val)

        if lane == 0:
            sm_partials[wid] = val

        hsa.barrier()

        val = sm_partials[lane] if tid < (blksz //
                                          WAVESIZE) else dtype(POS_INF)

        if wid == 0:
            val = wave_reduce_min(val)

        return val
def kernel_shuffle(arr, masks):
    tid = hsa.get_local_id(0)
    temp = hsa.shared.array(256, dtype=intp)
    val = arr[tid]
    mask = masks[tid]
    out = local_shuffle(tid, val, mask, temp)
    arr[tid] = out
Exemple #8
0
def blockwise_prefixsum(value, temp, nelem):
    tid = hsa.get_local_id(0)

    # inc_val = local_inclusive_scan_shuf(tid, value, nelem, data)
    inc_val = local_inclusive_scan(tid, value, nelem, temp)

    hsa.barrier()
    if tid + 1 == nelem:
        # the last value stores the sum at index 0
        temp[0] = inc_val
    else:
        # the other value stores at the next slot for exclusive scan value
        temp[tid + 1] = inc_val

    hsa.barrier()

    # Read the sum
    the_sum = temp[0]

    hsa.barrier()

    # Reset first slot to zero
    if tid == 0:
        temp[0] = 0

    hsa.barrier()
    return the_sum
def kernel_scan(values):
    tid = hsa.get_local_id(0)
    nelem = values.size
    temp = hsa.shared.array(256, dtype=intp)
    value = values[tid]
    out = local_inclusive_scan(tid, value, nelem, temp)
    values[tid] = out
Exemple #10
0
        def four_way_scan(data, sm_masks, sm_blocksum, blksz, valid):
            sm_chunkoffset = hsa.shared.array(4, dtype=int32)

            tid = hsa.get_local_id(0)

            laneid = tid & (_WARPSIZE - 1)
            warpid = tid >> 6

            my_digit = -1

            for digit in range(RADIX):
                sm_masks[digit, tid] = 0
                if valid and data == digit:
                    sm_masks[digit, tid] = 1
                    my_digit = digit

            hsa.barrier()

            offset = 0
            base = 0
            while offset < blksz:
                # Exclusive scan
                if warpid < RADIX:
                    val = intp(sm_masks[warpid, offset + laneid])
                    cur, psum = shuf_wave_exclusive_scan(val)
                    sm_masks[warpid, offset + laneid] = cur + base
                    base += psum

                hsa.barrier()
                offset += _WARPSIZE

            hsa.barrier()

            # Store blocksum from the exclusive scan
            if warpid < RADIX and laneid == 0:
                sm_blocksum[warpid] = base

            hsa.barrier()
            # Calc chunk offset (a short exclusive scan)
            if tid == 0:
                sm_chunkoffset[0] = 0
                sm_chunkoffset[1] = sm_blocksum[0]
                sm_chunkoffset[2] = sm_chunkoffset[1] + sm_blocksum[1]
                sm_chunkoffset[3] = sm_chunkoffset[2] + sm_blocksum[2]

            hsa.barrier()
            # Prepare output
            chunk_offset = -1
            scanval = -1

            if my_digit != -1:
                chunk_offset = sm_chunkoffset[my_digit]
                scanval = sm_masks[my_digit, tid]

            hsa.wavebarrier()
            hsa.barrier()

            return chunk_offset, scanval
Exemple #11
0
 def scan_block(data, sums):
     sm_data = hsa.shared.array(128, dtype=intp)
     tid = hsa.get_local_id(0)
     gid = hsa.get_global_id(0)
     blkid = hsa.get_group_id(0)
     sm_data[tid] = data[gid]
     prefixsum = device_scan_generic(tid, sm_data)
     data[gid] = sm_data[tid]
     sums[blkid, tid] = prefixsum
Exemple #12
0
def atomic_add(ary):
    tid = hsa.get_local_id(0)
    sm = hsa.shared.array(32, numba.uint32)
    sm[tid] = 0
    hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
    bin = ary[tid] % 32
    hsa.atomic.add(sm, bin, 1)
    hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
    ary[tid] = sm[tid]
Exemple #13
0
 def scan_block(data, sums):
     sm_data = hsa.shared.array(128, dtype=intp)
     tid = hsa.get_local_id(0)
     gid = hsa.get_global_id(0)
     blkid = hsa.get_group_id(0)
     sm_data[tid] = data[gid]
     prefixsum = device_scan_generic(tid, sm_data)
     data[gid] = sm_data[tid]
     sums[blkid, tid] = prefixsum
Exemple #14
0
def atomic_add(ary):
    tid = hsa.get_local_id(0)
    sm = hsa.shared.array(32, numba.uint32)
    sm[tid] = 0
    hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
    bin = ary[tid] % 32
    hsa.atomic.add(sm, bin, 1)
    hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
    ary[tid] = sm[tid]
def shuf_wave_exclusive_scan(val):
    tid = hsa.get_local_id(0)
    lane = tid & (_WARPSIZE - 1)

    incl = shuf_wave_inclusive_scan(val)
    excl = shuffle_up(incl, 1)
    the_sum = broadcast(excl, 0)
    if lane == 0:
        excl = 0
    return excl, the_sum
Exemple #16
0
        def scan_block(data, sums):
            sm_data = hsa.shared.array(128, dtype=intp)
            tid = hsa.get_local_id(0)
            gid = hsa.get_global_id(0)
            blkid = hsa.get_group_id(0)

            scanval, prefixsum = device_scan(tid, data[gid], sm_data, False)

            data[gid] = scanval
            sums[blkid, tid] = prefixsum
Exemple #17
0
        def scan_block(data, sums):
            sm_data = hsa.shared.array(128, dtype=intp)
            tid = hsa.get_local_id(0)
            gid = hsa.get_global_id(0)
            blkid = hsa.get_group_id(0)

            scanval, prefixsum = device_scan(tid, data[gid], sm_data,
                                             False)

            data[gid] = scanval
            sums[blkid, tid] = prefixsum
    def kernel_reduce_min(inp, out, nelem):
        tid = hsa.get_local_id(0)
        blkid = hsa.get_group_id(0)
        blksz = hsa.get_local_size(0)
        numgroup = hsa.get_num_groups(0)

        i = blkid * blksz + tid

        accum = dtype(POS_INF)
        while i < nelem:
            accum = min(accum, inp[i])
            i += blksz * numgroup

        accum = group_reducer(accum)
        if tid == 0:
            out[blkid] = accum
Exemple #19
0
def blockwise_prefixsum_naive(data, nelem):
    last = data[nelem - 1]
    hsa.barrier()

    tid = hsa.get_local_id(0)

    if tid == 0:
        psum = 0
        for i in range(nelem):
            cur = data[i]
            data[i] = psum
            psum += cur

    hsa.barrier()

    return last + data[nelem - 1]
Exemple #20
0
        def udt(output):
            global_id = hsa.get_global_id(0)
            global_size = hsa.get_global_size(0)
            local_id = hsa.get_local_id(0)
            group_id = hsa.get_group_id(0)
            num_groups = hsa.get_num_groups(0)
            workdim = hsa.get_work_dim()
            local_size = hsa.get_local_size(0)

            output[0, group_id, local_id] = global_id
            output[1, group_id, local_id] = global_size
            output[2, group_id, local_id] = local_id
            output[3, group_id, local_id] = local_size
            output[4, group_id, local_id] = group_id
            output[5, group_id, local_id] = num_groups
            output[6, group_id, local_id] = workdim
Exemple #21
0
        def kernel_scatter(size, shift, shuffled, scanblocksum, localscan,
                           shuffled_sorted, indices, indices_sorted,
                           store_indices):
            tid = hsa.get_local_id(0)
            blkid = hsa.get_group_id(0)
            gid = hsa.get_global_id(0)

            if gid < size:
                curdata = uintp(shuffled[blkid, tid])
                data_radix = uintp((curdata >> uintp(shift)) &
                                   uintp(RADIX_MINUS_1))
                pos = scanblocksum[data_radix, blkid] + localscan[blkid, tid]
                shuffled_sorted[pos] = curdata

                if store_indices:
                    indices_sorted[pos] = indices[gid]
Exemple #22
0
    def kernel_reduce_min(inp, out, nelem):
        tid = hsa.get_local_id(0)
        blkid = hsa.get_group_id(0)
        blksz = hsa.get_local_size(0)
        numgroup = hsa.get_num_groups(0)

        i = blkid * blksz + tid

        accum = dtype(POS_INF)
        while i < nelem:
            accum = min(accum, inp[i])
            i += blksz * numgroup

        accum = group_reducer(accum)
        if tid == 0:
            out[blkid] = accum
Exemple #23
0
        def udt(output):
            global_id = hsa.get_global_id(0)
            global_size = hsa.get_global_size(0)
            local_id = hsa.get_local_id(0)
            group_id = hsa.get_group_id(0)
            num_groups = hsa.get_num_groups(0)
            workdim = hsa.get_work_dim()
            local_size = hsa.get_local_size(0)

            output[0, group_id, local_id] = global_id
            output[1, group_id, local_id] = global_size
            output[2, group_id, local_id] = local_id
            output[3, group_id, local_id] = local_size
            output[4, group_id, local_id] = group_id
            output[5, group_id, local_id] = num_groups
            output[6, group_id, local_id] = workdim
    def hsa_multi_kde(support, samples, bandwidths, pdf):
        """
        Expects 2d arrays for samples and support: (num_observations,
        num_variables)
        """
        nvar = support.shape[1]
        i = hsa.get_global_id(0)
        tid = hsa.get_local_id(0)
        valid = i < support.shape[0]

        sum = 0

        sm_samples = hsa.shared.array(SAMPLES_SIZE, dtype=float64)
        sm_bandwidths = hsa.shared.array(MAX_NDIM, dtype=float64)
        sm_support = hsa.shared.array(SAMPLES_SIZE, dtype=float64)

        if valid:
            for k in range(nvar):
                sm_support[k, tid] = support[i, k]

        if tid < nvar:
            sm_bandwidths[tid] = bandwidths[tid]

        for base in range(0, samples.shape[0], BLOCKSIZE):
            loadcount = min(samples.shape[0] - base, BLOCKSIZE)

            hsa.barrier()

            # Preload samples tile
            if tid < loadcount:
                for k in range(nvar):
                    sm_samples[k, tid] = samples[base + tid, k]

            hsa.barrier()

            # Compute on the tile
            if valid:
                for j in range(loadcount):
                    prod = 1
                    for k in range(nvar):
                        bw = sm_bandwidths[k]
                        diff = sm_samples[k, j] - sm_support[k, tid]
                        prod *= kernel(diff / bw) / bw
                    sum += prod

        if valid:
            pdf[i] = sum / samples.shape[0]
    def hsa_multi_kde(support, samples, bandwidths, pdf):
        """
        Expects 2d arrays for samples and support: (num_observations,
        num_variables)
        """
        nvar = support.shape[1]
        i = hsa.get_global_id(0)
        tid = hsa.get_local_id(0)
        valid = i < support.shape[0]

        sum = 0

        sm_samples = hsa.shared.array(SAMPLES_SIZE, dtype=float64)
        sm_bandwidths = hsa.shared.array(MAX_NDIM, dtype=float64)
        sm_support = hsa.shared.array(SAMPLES_SIZE, dtype=float64)

        if valid:
            for k in range(nvar):
                sm_support[k, tid] = support[i, k]

        if tid < nvar:
            sm_bandwidths[tid] = bandwidths[tid]

        for base in range(0, samples.shape[0], BLOCKSIZE):
            loadcount = min(samples.shape[0] - base, BLOCKSIZE)

            hsa.barrier()

            # Preload samples tile
            if tid < loadcount:
                for k in range(nvar):
                    sm_samples[k, tid] = samples[base + tid, k]

            hsa.barrier()

            # Compute on the tile
            if valid:
                for j in range(loadcount):
                    prod = 1
                    for k in range(nvar):
                        bw = sm_bandwidths[k]
                        diff = sm_samples[k, j] - sm_support[k, tid]
                        prod *= kernel(diff / bw) / bw
                    sum += prod

        if valid:
            pdf[i] = sum / samples.shape[0]
    def hsa_uni_kde(support, samples, bandwidth, pdf):
        gid = hsa.get_group_id(0)
        tid = hsa.get_local_id(0)
        tsz = hsa.get_local_size(0)

        supp = support[gid]

        # all local threads cooperatively computes the energy for a support
        energy = 0
        for base in range(0, samples.size, tsz):
            idx = tid + base
            if idx < samples.size:
                energy += kernel((samples[idx] - supp) / bandwidth) / bandwidth

        # reduce energy
        total = group_reduce_sum_float64(energy)
        if tid == 0:
            pdf[gid] = total / samples.size
Exemple #27
0
def wave_reduce(val):
    tmp = val
    tid = hsa.get_local_id(0)
    laneid = tid & (WAVESIZE - 1)

    width = WAVESIZE // 2

    while width > 0:
        hsa.wavebarrier()
        other = hsa.activelanepermute_wavewidth(tmp, laneid + width, 0, False)
        if laneid < width:
            tmp += other

        width //= 2

    # First thread has the result
    hsa.wavebarrier()
    return hsa.activelanepermute_wavewidth(tmp, 0, 0, False)
    def hsa_uni_kde(support, samples, bandwidth, pdf):
        gid = hsa.get_group_id(0)
        tid = hsa.get_local_id(0)
        tsz = hsa.get_local_size(0)

        supp = support[gid]

        # all local threads cooperatively computes the energy for a support
        energy = 0
        for base in range(0, samples.size, tsz):
            idx = tid + base
            if idx < samples.size:
                energy += kernel((samples[idx] - supp) / bandwidth) / bandwidth

        # reduce energy
        total = group_reduce_sum_float64(energy)
        if tid == 0:
            pdf[gid] = total / samples.size
Exemple #29
0
def wave_reduce(val):
    tmp = val
    tid = hsa.get_local_id(0)
    laneid = tid & (WAVESIZE - 1)

    width = WAVESIZE // 2

    while width > 0:
        hsa.wavebarrier()
        other = hsa.activelanepermute_wavewidth(tmp, laneid + width, 0, False)
        if laneid < width:
            tmp += other

        width //= 2

    # First thread has the result
    hsa.wavebarrier()
    return hsa.activelanepermute_wavewidth(tmp, 0, 0, False)
Exemple #30
0
        def kernel_local_shuffle(data, size, shift, blocksum, localscan,
                                 shuffled, indices, store_indices):
            tid = hsa.get_local_id(0)
            blkid = hsa.get_group_id(0)
            blksz = localscan.shape[1]

            sm_mask = hsa.shared.array(shape=mask_shape, dtype=int32)
            sm_blocksum = hsa.shared.array(shape=4, dtype=int32)
            sm_shuffled = hsa.shared.array(shape=block_size, dtype=uintp)
            sm_indices = hsa.shared.array(shape=block_size, dtype=uintp)
            sm_localscan = hsa.shared.array(shape=block_size, dtype=int32)
            sm_localscan[tid] = -1

            dataid = blkid * blksz + tid
            valid = dataid < size and tid < blksz
            curdata = uintp(data[dataid] if valid else uintp(0))
            processed_data = uintp((curdata >> uintp(shift)) &
                                   uintp(RADIX_MINUS_1))

            chunk_offset, scanval = four_way_scan(processed_data, sm_mask,
                                                  sm_blocksum, blksz, valid)

            if tid < RADIX:
                blocksum[tid, blkid] = sm_blocksum[tid]

            if tid < blksz:
                # Store local scan value
                where = chunk_offset + scanval
                # Store shuffled value and indices
                shuffled[blkid, where] = curdata
                if store_indices and valid:
                    sm_indices[where] = indices[dataid]
                sm_localscan[where] = scanval

            # Cleanup
            hsa.barrier()
            if tid < blksz:
                # shuffled[blkid, tid] = sm_shuffled[tid]
                if store_indices and valid:
                    indices[dataid] = sm_indices[tid]
                localscan[blkid, tid] = sm_localscan[tid]
Exemple #31
0
def foo(out):
    sm = hsa.shared.array(2, dtype=intp)
    tid = hsa.get_local_id(0)
    sm[tid] = 666

    hsa.barrier(0)

    if tid == 0:
        sm[0] = 123
        sm[1] = 321

    hsa.barrier(0)
    # Uncomment the following lines to prevent the optimization bug
    # hsa.barrier(0)

    if tid == 1:
        val = sm[tid]
    else:
        val = 456

    hsa.barrier(0)
    out[tid] = val
def shuf_device_inclusive_scan(data, temp):
    """
    Args
    ----
    data: scalar
        input for tid
    temp: shared memory for temporary work, requires at least
    threadcount/wavesize storage
    """
    tid = hsa.get_local_id(0)
    lane = tid & (_WARPSIZE - 1)
    warpid = tid >> 6

    hsa.barrier()

    # Scan warps in parallel
    warp_scan_res = shuf_wave_inclusive_scan(data)

    hsa.barrier()

    # Store partial sum into shared memory
    if lane == (_WARPSIZE - 1):
        temp[warpid] = warp_scan_res

    hsa.barrier()

    # Scan the partial sum by first wave
    if warpid == 0:
        temp[lane] = shuf_wave_inclusive_scan(temp[lane])

    hsa.barrier()

    # Get block sum for each wave
    blocksum = 0  # first wave is 0
    if warpid > 0:
        blocksum = temp[warpid - 1]

    return warp_scan_res + blocksum
    def group_reduce_min(val):
        """
        First thread of first wave get the result
        """
        tid = hsa.get_local_id(0)
        blksz = hsa.get_local_size(0)
        wid = tid >> WAVEBITS
        lane = tid & (WAVESIZE - 1)

        sm_partials = hsa.shared.array(WAVESIZE, dtype=dtype)

        val = wave_reduce_min(val)

        if lane == 0:
            sm_partials[wid] = val

        hsa.barrier()

        val = sm_partials[lane] if tid < (blksz // WAVESIZE) else dtype(POS_INF)

        if wid == 0:
            val = wave_reduce_min(val)

        return val
def shuf_wave_inclusive_scan(val):
    tid = hsa.get_local_id(0)
    lane = tid & (_WARPSIZE - 1)

    hsa.wavebarrier()
    shuf = shuffle_up(val, 1)
    if lane >= 1:
        val += shuf

    hsa.wavebarrier()
    shuf = shuffle_up(val, 2)
    if lane >= 2:
        val += shuf

    hsa.wavebarrier()
    shuf = shuffle_up(val, 4)
    if lane >= 4:
        val += shuf

    hsa.wavebarrier()
    shuf = shuffle_up(val, 8)
    if lane >= 8:
        val += shuf

    hsa.wavebarrier()
    shuf = shuffle_up(val, 16)
    if lane >= 16:
        val += shuf

    hsa.wavebarrier()
    shuf = shuffle_up(val, 32)
    if lane >= 32:
        val += shuf

    hsa.wavebarrier()
    return val
Exemple #35
0
def shuf_wave_inclusive_scan(val):
    tid = hsa.get_local_id(0)
    lane = tid & (_WARPSIZE - 1)

    hsa.wavebarrier()
    shuf = shuffle_up(val, 1)
    if lane >= 1:
        val += shuf

    hsa.wavebarrier()
    shuf = shuffle_up(val, 2)
    if lane >= 2:
        val += shuf

    hsa.wavebarrier()
    shuf = shuffle_up(val, 4)
    if lane >= 4:
        val += shuf

    hsa.wavebarrier()
    shuf = shuffle_up(val, 8)
    if lane >= 8:
        val += shuf

    hsa.wavebarrier()
    shuf = shuffle_up(val, 16)
    if lane >= 16:
        val += shuf

    hsa.wavebarrier()
    shuf = shuffle_up(val, 32)
    if lane >= 32:
        val += shuf

    hsa.wavebarrier()
    return val
Exemple #36
0
 def foo(inp, mask, out):
     tid = hsa.get_local_id(0)
     out[tid] = hsa.activelanepermute_wavewidth(inp[tid], mask[tid], 0,
                                                False)
Exemple #37
0
 def foo(inp, mask, out):
     tid = hsa.get_local_id(0)
     out[tid] = hsa.activelanepermute_wavewidth(inp[tid], mask[tid], 0,
                                                False)
Exemple #38
0
 def test_wave_reduce(inp, out):
     tid = hsa.get_local_id(0)
     val = inp[tid]
     out[tid] = wave_reduce_sum_all(val)
Exemple #39
0
def shuffle_up(val, width):
    tid = hsa.get_local_id(0)
    hsa.wavebarrier()
    res = hsa.activelanepermute_wavewidth(val, tid - width, 0, False)
    return res
def kernel_wave_excl_scan_shuf(values, out, psum):
    tid = hsa.get_local_id(0)
    val = values[tid]
    out[tid], psum[tid] = shuf_wave_exclusive_scan(val)
 def test_wave_reduce(inp, out):
     tid = hsa.get_local_id(0)
     val = inp[tid]
     out[tid] = wave_reduce_sum_all(val)
def shuffle_up(val, width):
    tid = hsa.get_local_id(0)
    hsa.wavebarrier()
    res = hsa.activelanepermute_wavewidth(val, tid - width, 0, False)
    return res
Exemple #43
0
 def udt(output):
     global_id = hsa.get_global_id(0)
     local_id = hsa.get_local_id(0)
     output[global_id] = local_id