def twice(A): i = hsa.get_global_id(0) d = A[i] # no argument defaults to global mem fence # which is the same for local in hsail hsa.barrier() A[i] = d * 2
def group_reduce_min(val): """ First thread of first wave get the result """ tid = hsa.get_local_id(0) blksz = hsa.get_local_size(0) wid = tid >> WAVEBITS lane = tid & (WAVESIZE - 1) sm_partials = hsa.shared.array(WAVESIZE, dtype=dtype) val = wave_reduce_min(val) if lane == 0: sm_partials[wid] = val hsa.barrier() val = sm_partials[lane] if tid < (blksz // WAVESIZE) else dtype(POS_INF) if wid == 0: val = wave_reduce_min(val) return val
def matmulfast(A, B, C): x = hsa.get_global_id(0) y = hsa.get_global_id(1) tx = hsa.get_local_id(0) ty = hsa.get_local_id(1) sA = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32) sB = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32) if x >= C.shape[0] or y >= C.shape[1]: return tmp = 0 for i in range(gridsize): # preload sA[tx, ty] = A[x, ty + i * blocksize] sB[tx, ty] = B[tx + i * blocksize, y] # wait for preload to end hsa.barrier(1) # compute loop for j in range(blocksize): tmp += sA[tx, j] * sB[j, ty] # wait for compute to end hsa.barrier(1) C[x, y] = tmp
def matmulfast(A, B, C): x = hsa.get_global_id(0) y = hsa.get_global_id(1) tx = hsa.get_local_id(0) ty = hsa.get_local_id(1) sA = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32) sB = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32) if x >= C.shape[0] or y >= C.shape[1]: return tmp = 0 for i in range(gridsize): # preload sA[tx, ty] = A[x, ty + i * blocksize] sB[tx, ty] = B[tx + i * blocksize, y] # wait for preload to end hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) # compute loop for j in range(blocksize): tmp += sA[tx, j] * sB[j, ty] # wait for compute to end hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) C[x, y] = tmp
def atomic_add(ary): tid = hsa.get_local_id(0) sm = hsa.shared.array(32, numba.uint32) sm[tid] = 0 hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) bin = ary[tid] % 32 hsa.atomic.add(sm, bin, 1) hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) ary[tid] = sm[tid]
def atomic_add3(ary): tx = hsa.get_local_id(0) ty = hsa.get_local_id(1) sm = hsa.shared.array((4, 8), numba.uint32) sm[tx, ty] = ary[tx, ty] hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) hsa.atomic.add(sm, (tx, numba.uint64(ty)), 1) hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) ary[tx, ty] = sm[tx, ty]
def atomic_add2(ary): tx = hsa.get_local_id(0) ty = hsa.get_local_id(1) sm = hsa.shared.array((4, 8), numba.uint32) sm[tx, ty] = ary[tx, ty] hsa.barrier(1) hsa.atomic.add(sm, (tx, ty), 1) hsa.barrier(1) ary[tx, ty] = sm[tx, ty]
def reverse_array(A): sm = hsa.shared.array(shape=blocksize, dtype=float32) i = hsa.get_global_id(0) # preload sm[i] = A[i] # barrier hsa.barrier(1) # local mem fence # write A[i] += sm[blocksize - 1 - i]
def reverse_array(A): sm = hsa.shared.array(shape=blocksize, dtype=float32) i = hsa.get_global_id(0) # preload sm[i] = A[i] # barrier hsa.barrier(hsa.CLK_LOCAL_MEM_FENCE) # local mem fence # write A[i] += sm[blocksize - 1 - i]
def local_inclusive_scan_shuf(tid, value, nelem, temp): """ * temp: shared array Size of the array must be at least the number of active wave Note: This function must be called by all threads in the block """ hsa.barrier() hsa.wavebarrier() res = shuf_device_inclusive_scan(value, temp) hsa.barrier() return res
def blockwise_prefixsum(value, temp, nelem): tid = hsa.get_local_id(0) # inc_val = local_inclusive_scan_shuf(tid, value, nelem, data) inc_val = local_inclusive_scan(tid, value, nelem, temp) hsa.barrier() if tid + 1 == nelem: # the last value stores the sum at index 0 temp[0] = inc_val else: # the other value stores at the next slot for exclusive scan value temp[tid + 1] = inc_val hsa.barrier() # Read the sum the_sum = temp[0] hsa.barrier() # Reset first slot to zero if tid == 0: temp[0] = 0 hsa.barrier() return the_sum
def blockwise_prefixsum_naive(data, nelem): last = data[nelem - 1] hsa.barrier() tid = hsa.get_local_id(0) if tid == 0: psum = 0 for i in range(nelem): cur = data[i] data[i] = psum psum += cur hsa.barrier() return last + data[nelem - 1]
def hsa_multi_kde(support, samples, bandwidths, pdf): """ Expects 2d arrays for samples and support: (num_observations, num_variables) """ nvar = support.shape[1] i = hsa.get_global_id(0) tid = hsa.get_local_id(0) valid = i < support.shape[0] sum = 0 sm_samples = hsa.shared.array(SAMPLES_SIZE, dtype=float64) sm_bandwidths = hsa.shared.array(MAX_NDIM, dtype=float64) sm_support = hsa.shared.array(SAMPLES_SIZE, dtype=float64) if valid: for k in range(nvar): sm_support[k, tid] = support[i, k] if tid < nvar: sm_bandwidths[tid] = bandwidths[tid] for base in range(0, samples.shape[0], BLOCKSIZE): loadcount = min(samples.shape[0] - base, BLOCKSIZE) hsa.barrier() # Preload samples tile if tid < loadcount: for k in range(nvar): sm_samples[k, tid] = samples[base + tid, k] hsa.barrier() # Compute on the tile if valid: for j in range(loadcount): prod = 1 for k in range(nvar): bw = sm_bandwidths[k] diff = sm_samples[k, j] - sm_support[k, tid] prod *= kernel(diff / bw) / bw sum += prod if valid: pdf[i] = sum / samples.shape[0]
def device_scan_generic(tid, data): """Inclusive prefix sum within a single block Requires tid should have range [0, data.size) and data.size must be power of 2. """ n = data.size # Upsweep offset = 1 d = n // 2 while d > 0: hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) if tid < d: ai = offset * (2 * tid + 1) - 1 bi = offset * (2 * tid + 2) - 1 data[bi] += data[ai] offset *= 2 d //= 2 hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) prefixsum = data[n - 1] hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) if tid == 0: data[n - 1] = 0 # Downsweep d = 1 offset = n while d < n: offset //= 2 hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) if tid < d: ai = offset * (2 * tid + 1) - 1 bi = offset * (2 * tid + 2) - 1 tmp = data[ai] data[ai] = data[bi] data[bi] += tmp d *= 2 hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) return prefixsum
def device_scan_generic(tid, data): """Inclusive prefix sum within a single block Requires tid should have range [0, data.size) and data.size must be power of 2. """ n = data.size # Upsweep offset = 1 d = n // 2 while d > 0: hsa.barrier(1) if tid < d: ai = offset * (2 * tid + 1) - 1 bi = offset * (2 * tid + 2) - 1 data[bi] += data[ai] offset *= 2 d //= 2 hsa.barrier(1) prefixsum = data[n - 1] hsa.barrier(1) if tid == 0: data[n - 1] = 0 # Downsweep d = 1 offset = n while d < n: offset //= 2 hsa.barrier(1) if tid < d: ai = offset * (2 * tid + 1) - 1 bi = offset * (2 * tid + 2) - 1 tmp = data[ai] data[ai] = data[bi] data[bi] += tmp d *= 2 hsa.barrier(1) return prefixsum
def kernel_local_shuffle(data, size, shift, blocksum, localscan, shuffled, indices, store_indices): tid = hsa.get_local_id(0) blkid = hsa.get_group_id(0) blksz = localscan.shape[1] sm_mask = hsa.shared.array(shape=mask_shape, dtype=int32) sm_blocksum = hsa.shared.array(shape=4, dtype=int32) sm_shuffled = hsa.shared.array(shape=block_size, dtype=uintp) sm_indices = hsa.shared.array(shape=block_size, dtype=uintp) sm_localscan = hsa.shared.array(shape=block_size, dtype=int32) sm_localscan[tid] = -1 dataid = blkid * blksz + tid valid = dataid < size and tid < blksz curdata = uintp(data[dataid] if valid else uintp(0)) processed_data = uintp((curdata >> uintp(shift)) & uintp(RADIX_MINUS_1)) chunk_offset, scanval = four_way_scan(processed_data, sm_mask, sm_blocksum, blksz, valid) if tid < RADIX: blocksum[tid, blkid] = sm_blocksum[tid] if tid < blksz: # Store local scan value where = chunk_offset + scanval # Store shuffled value and indices shuffled[blkid, where] = curdata if store_indices and valid: sm_indices[where] = indices[dataid] sm_localscan[where] = scanval # Cleanup hsa.barrier() if tid < blksz: # shuffled[blkid, tid] = sm_shuffled[tid] if store_indices and valid: indices[dataid] = sm_indices[tid] localscan[blkid, tid] = sm_localscan[tid]
def local_shuffle(tid, value, mask, temp): """ * temp: shared array Size of the array must be at least the number of threads Note: This function must be called by all threads in the block """ hsa.barrier(0) temp[tid] = value hsa.barrier(0) output = temp[mask] hsa.barrier(0) return output
def shuf_device_inclusive_scan(data, temp): """ Args ---- data: scalar input for tid temp: shared memory for temporary work, requires at least threadcount/wavesize storage """ tid = hsa.get_local_id(0) lane = tid & (_WARPSIZE - 1) warpid = tid >> 6 hsa.barrier() # Scan warps in parallel warp_scan_res = shuf_wave_inclusive_scan(data) hsa.barrier() # Store partial sum into shared memory if lane == (_WARPSIZE - 1): temp[warpid] = warp_scan_res hsa.barrier() # Scan the partial sum by first wave if warpid == 0: temp[lane] = shuf_wave_inclusive_scan(temp[lane]) hsa.barrier() # Get block sum for each wave blocksum = 0 # first wave is 0 if warpid > 0: blocksum = temp[warpid - 1] return warp_scan_res + blocksum
def foo(out): sm = hsa.shared.array(2, dtype=intp) tid = hsa.get_local_id(0) sm[tid] = 666 hsa.barrier(0) if tid == 0: sm[0] = 123 sm[1] = 321 hsa.barrier(0) # Uncomment the following lines to prevent the optimization bug # hsa.barrier(0) if tid == 1: val = sm[tid] else: val = 456 hsa.barrier(0) out[tid] = val
def twice(A): i = hsa.get_global_id(0) d = A[i] hsa.barrier(hsa.CLK_LOCAL_MEM_FENCE) # local mem fence A[i] = d * 2
def device_scan(tid, data, temp, inclusive): """ Args ---- tid: thread id data: scalar input for tid temp: shared memory for temporary work """ lane = tid & (_WARPSIZE - 1) warpid = tid >> 6 # Preload temp[tid] = data hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) # Scan warps in parallel warp_scan_res = warp_scan(tid, temp, inclusive) hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) # Get parital result if lane == (_WARPSIZE - 1): temp[warpid] = temp[tid] hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) # Scan the partial results if warpid == 0: warp_scan(tid, temp, True) hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) # Accumlate scanned partial results if warpid > 0: warp_scan_res += temp[warpid - 1] hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) # Output if tid == temp.size - 1: # Last thread computes prefix sum if inclusive: temp[0] = warp_scan_res else: temp[0] = warp_scan_res + data hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) # Load prefixsum prefixsum = temp[0] hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) return warp_scan_res, prefixsum
def device_scan(tid, data, temp, inclusive): """ Args ---- tid: thread id data: scalar input for tid temp: shared memory for temporary work """ lane = tid & (_WARPSIZE - 1) warpid = tid >> 6 # Preload temp[tid] = data hsa.barrier(1) # Scan warps in parallel warp_scan_res = warp_scan(tid, temp, inclusive) hsa.barrier(1) # Get parital result if lane == (_WARPSIZE - 1): temp[warpid] = temp[tid] hsa.barrier(1) # Scan the partial results if warpid == 0: warp_scan(tid, temp, True) hsa.barrier(1) # Accumlate scanned partial results if warpid > 0: warp_scan_res += temp[warpid - 1] hsa.barrier(1) # Output if tid == temp.size - 1: # Last thread computes prefix sum if inclusive: temp[0] = warp_scan_res else: temp[0] = warp_scan_res + data hsa.barrier(1) # Load prefixsum prefixsum = temp[0] hsa.barrier(1) return warp_scan_res, prefixsum
def twice(A): i = hsa.get_global_id(0) d = A[i] hsa.barrier(1) # local mem fence A[i] = d * 2
def four_way_scan(data, sm_masks, sm_blocksum, blksz, valid): sm_chunkoffset = hsa.shared.array(4, dtype=int32) tid = hsa.get_local_id(0) laneid = tid & (_WARPSIZE - 1) warpid = tid >> 6 my_digit = -1 for digit in range(RADIX): sm_masks[digit, tid] = 0 if valid and data == digit: sm_masks[digit, tid] = 1 my_digit = digit hsa.barrier() offset = 0 base = 0 while offset < blksz: # Exclusive scan if warpid < RADIX: val = intp(sm_masks[warpid, offset + laneid]) cur, psum = shuf_wave_exclusive_scan(val) sm_masks[warpid, offset + laneid] = cur + base base += psum hsa.barrier() offset += _WARPSIZE hsa.barrier() # Store blocksum from the exclusive scan if warpid < RADIX and laneid == 0: sm_blocksum[warpid] = base hsa.barrier() # Calc chunk offset (a short exclusive scan) if tid == 0: sm_chunkoffset[0] = 0 sm_chunkoffset[1] = sm_blocksum[0] sm_chunkoffset[2] = sm_chunkoffset[1] + sm_blocksum[1] sm_chunkoffset[3] = sm_chunkoffset[2] + sm_blocksum[2] hsa.barrier() # Prepare output chunk_offset = -1 scanval = -1 if my_digit != -1: chunk_offset = sm_chunkoffset[my_digit] scanval = sm_masks[my_digit, tid] hsa.wavebarrier() hsa.barrier() return chunk_offset, scanval