def udt(output):
    """Record this work-item's dimension-0 index/size queries into ``output``.

    ``output`` is indexed as ``output[slot, group_id, local_id]``; every
    work-item fills the column addressed by its own group and local ids.
    Slot layout along the first axis:

        0: global id      1: global size    2: local id     3: local size
        4: group id       5: number of groups               6: work dim
    """
    # The group and local ids double as the destination coordinates, so
    # they are the only queries kept in named locals.
    grp = roc.get_group_id(0)
    lid = roc.get_local_id(0)

    output[0, grp, lid] = roc.get_global_id(0)
    output[1, grp, lid] = roc.get_global_size(0)
    output[2, grp, lid] = lid
    output[3, grp, lid] = roc.get_local_size(0)
    output[4, grp, lid] = grp
    output[5, grp, lid] = roc.get_num_groups(0)
    output[6, grp, lid] = roc.get_work_dim()
def kernel_reduce_min(inp, out, nelem):
    """Write one minimum per work-group of ``inp[0:nelem]`` into ``out``.

    Each work-item folds ``min`` over the elements it visits, starting at
    its own flat index and advancing by ``local_size * num_groups`` until
    the index reaches ``nelem``.  The per-work-item value is then passed
    through ``group_reducer`` and the work-item with local id 0 stores the
    result at ``out[group_id]``.

    ``dtype``, ``POS_INF`` and ``group_reducer`` are resolved from the
    enclosing scope.
    NOTE(review): presumably ``group_reducer`` combines the values of all
    work-items in the group with ``min`` - confirm against its definition.
    """
    lane = roc.get_local_id(0)
    grp = roc.get_group_id(0)
    grp_size = roc.get_local_size(0)
    n_groups = roc.get_num_groups(0)

    # Distance between consecutive elements handled by the same work-item.
    stride = grp_size * n_groups

    # Start from POS_INF (cast to dtype) so a work-item that visits no
    # element still carries a value into the reducer.
    best = dtype(POS_INF)
    idx = grp * grp_size + lane
    while idx < nelem:
        candidate = inp[idx]
        best = min(best, candidate)
        idx += stride

    best = group_reducer(best)

    # Only one work-item per group writes that group's output slot.
    if lane == 0:
        out[grp] = best