def atomicAggInc(counter):
    """Increment ``counter[0]`` once on behalf of a whole group of threads.

    A single elected thread issues one ``cuda.atomic.add`` for the entire
    group, adding the number of bits set in the group's ballot mask. The
    value returned by that add is then shared with every thread, and each
    thread offsets it by the count of mask bits below its own rank.

    Args:
        counter: Indexable target of ``cuda.atomic.add``; only index 0 is
            updated.

    Returns:
        The value returned by the leader's atomic add, plus the number of
        set bits in the ballot mask below this thread's ``thread_rank``.
    """
    # NOTE(review): presumably the group of threads currently executing this
    # call together -- confirm against the ``cuda.cg`` API in use.
    active = cuda.cg.coalesced_threads()

    # Every participating thread votes True, so the mask has one bit per
    # participant.
    # NOTE(review): the logic below assumes bit ``i`` of the mask corresponds
    # to ``thread_rank == i`` -- confirm for this ballot implementation.
    mask = active.ballot(True)

    # Select the leader: the lowest set bit of the mask. ``ffs`` is treated
    # as 1-based here, hence the ``- 1``.
    leader = cuda.ffs(mask) - 1

    # Bind ``res`` on every thread before it is read below. Previously only
    # the leader assigned it, so all other threads passed an unbound local
    # to ``shfl``. The placeholder is overwritten by the broadcast.
    # NOTE(review): assumes an integer counter, so the literal 0 unifies
    # with the atomic add's result type -- confirm.
    res = 0

    # Leader does the update: one atomic add for the whole group.
    if active.thread_rank == leader:
        res = cuda.atomic.add(counter, 0, cuda.popc(mask))

    # Broadcast the leader's result to every thread in the group.
    res = active.shfl(res, leader)

    # Each thread computes its own value: base + number of participants
    # whose mask bit is below this thread's rank.
    return res + cuda.popc(mask & ((1 << active.thread_rank) - 1))
# Example no. 2
# 0
def simple_ffs(ary, c):
    """Store the result of ``cuda.ffs(c)`` in ``ary[0]``.

    Args:
        ary: Indexable output; only index 0 is written.
        c: Value passed to ``cuda.ffs``.
    """
    first_set_bit = cuda.ffs(c)
    ary[0] = first_set_bit
# Example no. 3
# 0
def simple_ffs(ary, c):
    """Write ``cuda.ffs(c)`` into the first slot of ``ary``.

    Args:
        ary: Indexable output; only index 0 is written.
        c: Value passed to ``cuda.ffs``.
    """
    ffs_result = cuda.ffs(c)
    ary[0] = ffs_result