def scan_block(data, sums): sm_data = roc.shared.array(128, dtype=intp) tid = roc.get_local_id(0) gid = roc.get_global_id(0) blkid = roc.get_group_id(0) sm_data[tid] = data[gid] prefixsum = device_scan_generic(tid, sm_data) data[gid] = sm_data[tid] sums[blkid, tid] = prefixsum
def scan_block(data, sums): sm_data = roc.shared.array(128, dtype=intp) tid = roc.get_local_id(0) gid = roc.get_global_id(0) blkid = roc.get_group_id(0) scanval, prefixsum = device_scan(tid, data[gid], sm_data, False) data[gid] = scanval sums[blkid, tid] = prefixsum
def udt(output): global_id = roc.get_global_id(0) global_size = roc.get_global_size(0) local_id = roc.get_local_id(0) group_id = roc.get_group_id(0) num_groups = roc.get_num_groups(0) workdim = roc.get_work_dim() local_size = roc.get_local_size(0) output[0, group_id, local_id] = global_id output[1, group_id, local_id] = global_size output[2, group_id, local_id] = local_id output[3, group_id, local_id] = local_size output[4, group_id, local_id] = group_id output[5, group_id, local_id] = num_groups output[6, group_id, local_id] = workdim
def kernel_reduce_min(inp, out, nelem): tid = roc.get_local_id(0) blkid = roc.get_group_id(0) blksz = roc.get_local_size(0) numgroup = roc.get_num_groups(0) i = blkid * blksz + tid accum = dtype(POS_INF) while i < nelem: accum = min(accum, inp[i]) i += blksz * numgroup accum = group_reducer(accum) if tid == 0: out[blkid] = accum
def kernel_scatter(size, shift, shuffled, scanblocksum, localscan, shuffled_sorted, indices, indices_sorted, store_indices): tid = roc.get_local_id(0) blkid = roc.get_group_id(0) gid = roc.get_global_id(0) if gid < size: curdata = uintp(shuffled[blkid, tid]) data_radix = uintp((curdata >> uintp(shift)) & uintp(RADIX_MINUS_1)) pos = scanblocksum[data_radix, blkid] + localscan[blkid, tid] shuffled_sorted[pos] = curdata if store_indices: indices_sorted[pos] = indices[gid]
def roc_uni_kde(support, samples, bandwidth, pdf): gid = roc.get_group_id(0) tid = roc.get_local_id(0) tsz = roc.get_local_size(0) supp = support[gid] # all local threads cooperatively computes the energy for a support energy = 0 for base in range(0, samples.size, tsz): idx = tid + base if idx < samples.size: energy += kernel((samples[idx] - supp) / bandwidth) / bandwidth # reduce energy total = group_reduce_sum_float64(energy) if tid == 0: pdf[gid] = total / samples.size
def kernel_local_shuffle(data, size, shift, blocksum, localscan, shuffled, indices, store_indices): tid = roc.get_local_id(0) blkid = roc.get_group_id(0) blksz = localscan.shape[1] sm_mask = roc.shared.array(shape=mask_shape, dtype=int32) sm_blocksum = roc.shared.array(shape=4, dtype=int32) sm_shuffled = roc.shared.array(shape=block_size, dtype=uintp) sm_indices = roc.shared.array(shape=block_size, dtype=uintp) sm_localscan = roc.shared.array(shape=block_size, dtype=int32) sm_localscan[tid] = -1 dataid = blkid * blksz + tid valid = dataid < size and tid < blksz curdata = uintp(data[dataid] if valid else uintp(0)) processed_data = uintp((curdata >> uintp(shift)) & uintp(RADIX_MINUS_1)) chunk_offset, scanval = four_way_scan(processed_data, sm_mask, sm_blocksum, blksz, valid) if tid < RADIX: blocksum[tid, blkid] = sm_blocksum[tid] if tid < blksz: # Store local scan value where = chunk_offset + scanval # Store shuffled value and indices shuffled[blkid, where] = curdata if store_indices and valid: sm_indices[where] = indices[dataid] sm_localscan[where] = scanval # Cleanup roc.barrier() if tid < blksz: # shuffled[blkid, tid] = sm_shuffled[tid] if store_indices and valid: indices[dataid] = sm_indices[tid] localscan[blkid, tid] = sm_localscan[tid]
def udt(output): global_id = roc.get_global_id(0) group_id = roc.get_group_id(0) output[global_id] = group_id + 1
def kernel_warp_reduce(inp, out): idx = roc.get_group_id(0) val = inp[idx] out[idx] = wave_reduce(val)