def atomicAggInc(counter):
    """Aggregate the increments of the currently active threads into one atomic.

    The active threads vote with a ballot, a single elected thread adds the
    number of voters to ``counter[0]`` atomically, and the value returned by
    that atomic is shuffled back to every participant. Each thread then adds
    the count of ballot bits below its own rank, so every caller gets its own
    value.

    Presumably ``cuda`` is ``numba.cuda`` and ``cuda.atomic.add`` returns the
    value stored before the addition -- confirm against the file's imports.

    Args:
        counter: Indexable passed to ``cuda.atomic.add``; only index 0 is
            updated.

    Returns:
        The value broadcast from the elected thread plus the number of ballot
        bits set below this thread's rank.
    """
    group = cuda.cg.coalesced_threads()
    votes = group.ballot(True)
    rank = group.thread_rank

    # The lowest set bit of the ballot elects the thread that touches memory.
    # NOTE(review): `elected` is a bit position in the ballot mask, yet it is
    # compared with (and shuffled by) the group's thread_rank -- confirm both
    # use the same numbering.
    elected = cuda.ffs(votes) - 1

    # Only the elected thread performs the atomic, adding one per voter.
    if rank == elected:
        base = cuda.atomic.add(counter, 0, cuda.popc(votes))

    # Every participant picks up the elected thread's result.
    base = group.shfl(base, elected)

    # Offset of this thread: how many voters have a bit below its rank.
    lower_votes = votes & ((1 << rank) - 1)
    return base + cuda.popc(lower_votes)
def simple_popc(ary, c):
    """Write the result of ``cuda.popc(c)`` into ``ary[0]``.

    Args:
        ary: Indexable output; only element 0 is written.
        c: Value handed to ``cuda.popc``.
    """
    bit_count = cuda.popc(c)
    ary[0] = bit_count