def group_reduce_min(val): """ First thread of first wave get the result """ tid = hsa.get_local_id(0) blksz = hsa.get_local_size(0) wid = tid >> WAVEBITS lane = tid & (WAVESIZE - 1) sm_partials = hsa.shared.array(WAVESIZE, dtype=dtype) val = wave_reduce_min(val) if lane == 0: sm_partials[wid] = val hsa.barrier() val = sm_partials[lane] if tid < (blksz // WAVESIZE) else dtype(POS_INF) if wid == 0: val = wave_reduce_min(val) return val
def udt(output): global_id = hsa.get_global_id(0) global_size = hsa.get_global_size(0) local_id = hsa.get_local_id(0) group_id = hsa.get_group_id(0) num_groups = hsa.get_num_groups(0) workdim = hsa.get_work_dim() local_size = hsa.get_local_size(0) output[0, group_id, local_id] = global_id output[1, group_id, local_id] = global_size output[2, group_id, local_id] = local_id output[3, group_id, local_id] = local_size output[4, group_id, local_id] = group_id output[5, group_id, local_id] = num_groups output[6, group_id, local_id] = workdim
def kernel_reduce_min(inp, out, nelem): tid = hsa.get_local_id(0) blkid = hsa.get_group_id(0) blksz = hsa.get_local_size(0) numgroup = hsa.get_num_groups(0) i = blkid * blksz + tid accum = dtype(POS_INF) while i < nelem: accum = min(accum, inp[i]) i += blksz * numgroup accum = group_reducer(accum) if tid == 0: out[blkid] = accum
def hsa_uni_kde(support, samples, bandwidth, pdf): gid = hsa.get_group_id(0) tid = hsa.get_local_id(0) tsz = hsa.get_local_size(0) supp = support[gid] # all local threads cooperatively computes the energy for a support energy = 0 for base in range(0, samples.size, tsz): idx = tid + base if idx < samples.size: energy += kernel((samples[idx] - supp) / bandwidth) / bandwidth # reduce energy total = group_reduce_sum_float64(energy) if tid == 0: pdf[gid] = total / samples.size