Example #1
0
    def group_reduce_min(val):
        """
        First thread of first wave get the result
        """
        tid = roc.get_local_id(0)
        blksz = roc.get_local_size(0)
        wid = tid >> WAVEBITS
        lane = tid & (WAVESIZE - 1)

        sm_partials = roc.shared.array(WAVESIZE, dtype=dtype)

        val = wave_reduce_min(val)

        if lane == 0:
            sm_partials[wid] = val

        roc.barrier()

        val = sm_partials[lane] if tid < (blksz //
                                          WAVESIZE) else dtype(POS_INF)

        if wid == 0:
            val = wave_reduce_min(val)

        return val
Example #2
0
        def udt(output):
            global_id = roc.get_global_id(0)
            global_size = roc.get_global_size(0)
            local_id = roc.get_local_id(0)
            group_id = roc.get_group_id(0)
            num_groups = roc.get_num_groups(0)
            workdim = roc.get_work_dim()
            local_size = roc.get_local_size(0)

            output[0, group_id, local_id] = global_id
            output[1, group_id, local_id] = global_size
            output[2, group_id, local_id] = local_id
            output[3, group_id, local_id] = local_size
            output[4, group_id, local_id] = group_id
            output[5, group_id, local_id] = num_groups
            output[6, group_id, local_id] = workdim
Example #3
0
    def kernel_reduce_min(inp, out, nelem):
        tid = roc.get_local_id(0)
        blkid = roc.get_group_id(0)
        blksz = roc.get_local_size(0)
        numgroup = roc.get_num_groups(0)

        i = blkid * blksz + tid

        accum = dtype(POS_INF)
        while i < nelem:
            accum = min(accum, inp[i])
            i += blksz * numgroup

        accum = group_reducer(accum)
        if tid == 0:
            out[blkid] = accum
Example #4
0
    def roc_uni_kde(support, samples, bandwidth, pdf):
        gid = roc.get_group_id(0)
        tid = roc.get_local_id(0)
        tsz = roc.get_local_size(0)

        supp = support[gid]

        # all local threads cooperatively computes the energy for a support
        energy = 0
        for base in range(0, samples.size, tsz):
            idx = tid + base
            if idx < samples.size:
                energy += kernel((samples[idx] - supp) / bandwidth) / bandwidth

        # reduce energy
        total = group_reduce_sum_float64(energy)
        if tid == 0:
            pdf[gid] = total / samples.size