def wave_reduce(val):
    """Sum ``val`` across the lanes of a wavefront and return the total.

    The stride starts at ``WAVESIZE // 2`` and is halved every round.  In
    each round every lane fetches a partner value through
    ``hsa.activelanepermute_wavewidth`` using index ``lane + stride``, and
    only lanes whose id is below the stride add it to their accumulator.
    After the last round the value permuted from index 0 is returned to the
    caller.

    NOTE(review): the ``& (WAVESIZE - 1)`` mask yields a lane id only if
    ``WAVESIZE`` is a power of two -- confirm where it is defined.
    NOTE(review): the trailing ``0, False`` arguments are presumably the
    identity value and the "use identity" flag of the intrinsic -- confirm
    against its documentation.
    """
    lane = hsa.get_local_id(0) & (WAVESIZE - 1)
    acc = val

    stride = WAVESIZE // 2
    while stride > 0:
        # Barrier before every exchange so the permute sees the accumulators
        # produced by the previous round.
        hsa.wavebarrier()
        partner = hsa.activelanepermute_wavewidth(acc, lane + stride, 0, False)
        # Every lane takes part in the permute; only the lower lanes of the
        # current stride accumulate.
        if stride > lane:
            acc += partner
        stride = stride // 2

    # First thread has the result; hand it to every caller.
    hsa.wavebarrier()
    total = hsa.activelanepermute_wavewidth(acc, 0, 0, False)
    return total
def foo(inp, mask, out):
    """Permute ``inp`` by ``mask`` and store the result in ``out``.

    Each work-item reads ``inp`` and ``mask`` at its own local id, passes the
    two values to ``hsa.activelanepermute_wavewidth`` and writes whatever the
    intrinsic returns to ``out`` at the same index.

    NOTE(review): ``mask[idx]`` is presumably the lane to read from, and the
    trailing ``0, False`` the identity value / use-identity flag -- confirm
    against the intrinsic's documentation.
    """
    idx = hsa.get_local_id(0)
    value = inp[idx]
    lane = mask[idx]
    out[idx] = hsa.activelanepermute_wavewidth(value, lane, 0, False)
def shuffle_up(val, width):
    """Return the permute of ``val`` taken at index ``local_id - width``.

    A wave barrier is issued before the exchange.

    NOTE(review): the index is built from the raw local id (not masked to a
    lane id) and is negative for work-items whose id is below ``width``; how
    the intrinsic treats such indices is not visible here -- confirm.
    NOTE(review): the trailing ``0, False`` arguments are presumably the
    identity value and use-identity flag -- confirm.
    """
    source = hsa.get_local_id(0) - width
    hsa.wavebarrier()
    return hsa.activelanepermute_wavewidth(val, source, 0, False)
def broadcast(val, src):
    """Return the permute of ``val`` taken at index ``src``.

    A wave barrier is issued first, then ``val`` and ``src`` are handed to
    ``hsa.activelanepermute_wavewidth`` and its result is returned.

    NOTE(review): presumably every lane receives lane ``src``'s ``val``; the
    trailing ``0, False`` arguments look like the identity value and
    use-identity flag -- confirm against the intrinsic's documentation.
    """
    hsa.wavebarrier()
    result = hsa.activelanepermute_wavewidth(val, src, 0, False)
    return result