Example #1
0
def double(inp, out):
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x

    idx = tx + bx * bw
    assert idx == cuda.grid(1)  # 1 dimension
    assert bw == cuda.gridsize(1)
    while idx < len(inp):
        out[idx] = inp[idx] * 2
        idx += cuda.gridsize(1)
Example #2
0
def min_max(x, min_max_array):
    """
    description:
        cuda jit to calculate the min and max values for the ndarray
    input:
        - x: ndarray
        - min_max_array: cuda.to_device(np.array([dtype_max, dtype_min],
        dtype=np.float32))
    """
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    # Array already seeded with starting values appropriate for x's dtype
    # Not a problem if this array has already been updated
    local_min = min_max_array[0]
    local_max = min_max_array[1]

    for i in range(start, x.shape[0], stride):
        element = x[i]
        local_min = min(element, local_min)
        local_max = max(element, local_max)

    # Now combine each thread local min and max
    cuda.atomic.min(min_max_array, 0, local_min)
    cuda.atomic.max(min_max_array, 1, local_max)
def compute_mandel(min_x, max_x, min_y, max_y, image, iters):
    '''
    A GPU version of calculating the mandel value for each element in the 
    image array. The real and imag variables contain a 
    value for each element of the complex space defined 
    by the X and Y boundaries (min_x, max_x) and 
    (min_y, max_y).

    Step 1:  define the absolute thread id (y, x) in (1024, 1536) and 
    Step 2:  define task size (e.g (1,12)) of each thread such to assign (1024 1536) tasks into (1024, 128) threads
    Step 3:  finish tasks in each thread
    '''
    grid_y, grid_x = cuda.gridsize(2) #(1024, 128) 
    y, x = cuda.grid(2) # (y, x) where y is in [0, 1023] and x is in [0, 127]
    height, width = image.shape # 1024, 1536

    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height
    
    # get the partition index of the y and x
    block_y = height // grid_y # 1
    block_x = width // grid_x # 12

    # every thread in (1024, 128) should handle (1, 12) tasks such that the totally (1024, 1536) tasks are evenly assigned

    for i in range(block_x):
        thread_x = x * block_x + i
        real = min_x + thread_x * pixel_size_x
        for j in range(block_y):
            thread_y = y * block_y+ j
            imag = min_y + thread_y * pixel_size_y
            if thread_y < height and thread_x < width:
                image[thread_y, thread_x] = mandel(real, imag, iters)
Example #4
0
def compute_inv_mass_cudakernel(offsets, pts, etas, phis, masses, mask_events,
                                mask_objects, out_inv_mass, out_pt_total):
    xi = cuda.grid(1)
    xstride = cuda.gridsize(1)
    for iev in range(xi, offsets.shape[0] - 1, xstride):
        if mask_events[iev]:
            start = np.uint64(offsets[iev])
            end = np.uint64(offsets[iev + 1])

            px_total = np.float32(0.0)
            py_total = np.float32(0.0)
            pz_total = np.float32(0.0)
            e_total = np.float32(0.0)

            for iobj in range(start, end):
                if mask_objects[iobj]:
                    pt = pts[iobj]
                    eta = etas[iobj]
                    phi = phis[iobj]
                    mass = masses[iobj]

                    px, py, pz, e = spherical_to_cartesian_devfunc(
                        pt, eta, phi, mass)

                    px_total += px
                    py_total += py
                    pz_total += pz
                    e_total += e

            inv_mass = math.sqrt(-(px_total**2 + py_total**2 + pz_total**2 -
                                   e_total**2))
            pt_total = math.sqrt(px_total**2 + py_total**2)
            out_inv_mass[iev] = inv_mass
            out_pt_total[iev] = pt_total
Example #5
0
def sort_in_offsets_kernel(content, offsets, index_to_get, mask_rows,
                           mask_content, out):

    xi = cuda.grid(1)
    xstride = cuda.gridsize(1)

    for i in range(xi, len(data), xstride):
        if not mask_rows[iev]:
            continue

        start = offsets[iev]
        end = offsets[iev + 1]
        event_content = content[start:end]

        merge_list = enumerate(event_content)

        while index_to_get < len(event_content):
            for j in range(len(event_content)):
                for i in range(1, len(event_content) - j):
                    if event_content[i - 1] > event_content[i]:
                        event_content[i - 1], event_content[i] = event_content[
                            i], event_content[i - 1]
                        merge_list[
                            i -
                            1], merge_list[i] = merge_list[i], merge_list[i -
                                                                          1]

            idx = merge_list[index_to_get][1]
            while not mask_content[idx + start]:
                idx += 1
            out[iev] += content[ielem]
Example #6
0
def count_weighted_pairs_3d_cuda_transpose_noncuml(ptswts1, ptswts2,
                                                   _rbins_squared, result):
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    n1 = ptswts1.shape[0] // 4
    n2 = ptswts1.shape[0] // 4
    nbins = _rbins_squared.shape[0] - 1
    dlogr = math.log(_rbins_squared[1] / _rbins_squared[0]) / 2
    logminr = math.log(_rbins_squared[0]) / 2

    smem = cuda.shared.array(128, numba.float32)
    if cuda.threadIdx.x == 0:
        for i in range(128):
            smem[i] = 0
    cuda.syncthreads()

    for i in range(start, n1, stride):
        loci = 4 * i
        for j in range(n2):
            locj = 4 * j
            dx = ptswts1[loci] - ptswts2[locj]
            dy = ptswts1[loci + 1] - ptswts2[locj + 1]
            dz = ptswts1[loci + 2] - ptswts2[locj + 2]
            dsq = cuda.fma(dx, dx, cuda.fma(dy, dy, dz * dz))

            k = int((math.log(dsq) / 2 - logminr) / dlogr)
            if k >= 0 and k < nbins:
                cuda.atomic.add(smem, k, ptswts1[loci + 3] * ptswts2[locj + 3])

    cuda.syncthreads()
    if cuda.threadIdx.x == 0:
        for k in range(nbins):
            cuda.atomic.add(result, k, smem[k])
Example #7
0
def calc_cumsum_data_tile(x, arr1):
    """
    description:
        cuda jit function calculate the data tile with cumulative sums
        for a 2-col ndarray
    input:
        - x: ndarray(3-cols)  (col1,col2, count_col2)
        - arr1: result array: type is int as the cells contain just
        frequencies
        (cuda.to_device(np.zeros(shape=(min_s,max_s)).astype(np.int32)))
        [X X X]
        [X X X]
        [X X X]
    """
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    for i in range(start, x.shape[0], stride):
        col1_i = int(round(x[i][0]))
        col2_i = int(round(x[i][1]))
        freq_i = x[i][2]

        if int(x[i][0]) != -1:
            if arr1[col2_i][col1_i] == float(0):
                arr1[col2_i][col1_i] = freq_i
            else:
                arr1[col2_i][col1_i] = (arr1[col2_i][col1_i] + freq_i) / 2
Example #8
0
def count_weighted_pairs_3d_cuda_smem_noncuml(x1, y1, z1, w1, x2, y2, z2, w2,
                                              _rbins_squared, result):
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    n1 = x1.shape[0]
    n2 = x2.shape[0]
    nbins = _rbins_squared.shape[0] - 1
    dlogr = math.log(_rbins_squared[1] / _rbins_squared[0]) / 2
    logminr = math.log(_rbins_squared[0]) / 2

    smem = cuda.shared.array(128, numba.float32)
    if cuda.threadIdx.x == 0:
        for i in range(128):
            smem[i] = 0
    cuda.syncthreads()

    for i in range(start, n1, stride):
        for j in range(n2):
            dx = x1[i] - x2[j]
            dy = y1[i] - y2[j]
            dz = z1[i] - z2[j]
            dsq = cuda.fma(dx, dx, cuda.fma(dy, dy, dz * dz))

            k = int((math.log(dsq) / 2 - logminr) / dlogr)
            if k >= 0 and k < nbins:
                cuda.atomic.add(smem, k, w1[i] * w2[j])

    cuda.syncthreads()
    if cuda.threadIdx.x == 0:
        for k in range(nbins):
            cuda.atomic.add(result, k, smem[k])
def numba_kde_cuda2(eval_points, samples, bandwidths, out):
    '''

    Parameters
    ----------
    eval_points : ndarray, shape (n_eval, n_bandwidths)
    samples : ndarray, shape (n_samples, n_bandwidths)
    out : ndarray, shape (n_eval,)

    '''
    thread_id1, thread_id2 = cuda.grid(2)
    stride1, stride2 = cuda.gridsize(2)

    (n_eval, n_bandwidths), n_samples = eval_points.shape, samples.shape[0]

    for eval_ind in range(thread_id1, n_eval, stride1):
        for sample_ind in range(thread_id2, n_samples, stride2):
            product_kernel = 1.0
            for bandwidth_ind in range(n_bandwidths):
                product_kernel *= (
                    gaussian_pdf(eval_points[eval_ind, bandwidth_ind],
                                 samples[sample_ind, bandwidth_ind],
                                 bandwidths[bandwidth_ind]))
            product_kernel /= n_samples
            cuda.atomic.add(out, eval_ind, product_kernel)
    def count_weighted_pairs_3d_cuda(x1, y1, z1, w1, x2, y2, z2, w2,
                                     rbins_squared, result):
        start = cuda.grid(1)
        stride = cuda.gridsize(1)

        n1 = x1.shape[0]
        n2 = x2.shape[0]
        nbins = rbins_squared.shape[0]

        for i in range(start, n1, stride):
            px = x1[i]
            py = y1[i]
            pz = z1[i]
            pw = w1[i]
            for j in range(n2):
                qx = x2[j]
                qy = y2[j]
                qz = z2[j]
                qw = w2[j]
                dx = px - qx
                dy = py - qy
                dz = pz - qz
                wprod = pw * qw
                dsq = dx * dx + dy * dy + dz * dz

                k = nbins - 1
                while dsq <= rbins_squared[k]:
                    cuda.atomic.add(result, k - 1, wprod)
                    k -= 1
                    if k <= 0:
                        break
Example #11
0
def select_individual(index, pop_d, individual):
    threadId_row, threadId_col = cuda.grid(2)
    stride_x, stride_y = cuda.gridsize(2)

    for row in range(threadId_row, pop_d.shape[0], stride_x):
        if row == index and threadId_col < pop_d.shape[1]:
            pop_d[row, threadId_col] = individual[row, threadId_col]
Example #12
0
def update_pop(count, parent_d_1, parent_d_2, child_d_1, child_d_2, pop_d):    
    threadId_row, threadId_col = cuda.grid(2)
    stride_x, stride_y = cuda.gridsize(2)

    for row in range(threadId_row, pop_d.shape[0], stride_x):    
        for col in range(threadId_col, pop_d.shape[1], stride_y):
            if child_d_1[row, -1] <= parent_d_1[row, -1] and \
            child_d_1[row, -1] <= parent_d_2[row, -1] and \
            child_d_1[row, -1] <= child_d_2[row, -1]:

                pop_d[row, col] = child_d_1[row, col]
                pop_d[row, 0] = count

            elif child_d_2[row, -1] <= parent_d_1[row, -1] and \
            child_d_2[row, -1] <= parent_d_2[row, -1] and \
            child_d_2[row, -1] <= child_d_1[row, -1]:

                pop_d[row, col] = child_d_2[row, col]
                pop_d[row, 0] = count

            elif parent_d_1[row, -1] <= parent_d_2[row, -1] and \
            parent_d_1[row, -1] <= child_d_1[row, -1] and \
            parent_d_1[row, -1] <= child_d_2[row, -1]:

                pop_d[row, col] = parent_d_1[row, col]
                pop_d[row, 0] = count

            elif parent_d_2[row, -1] <= parent_d_1[row, -1] and \
            parent_d_2[row, -1] <= child_d_1[row, -1] and \
            parent_d_2[row, -1] <= child_d_2[row, -1]:

                pop_d[row, col] = parent_d_2[row, col]
                pop_d[row, 0] = count
                
    cuda.syncthreads()
Example #13
0
def number_cut_points(candid_d_1, candid_d_2, candid_d_3, candid_d_4, parent_d_1, parent_d_2, count, min_n, max_n):
    
    threadId_row, threadId_col = cuda.grid(2)
    stride_x, stride_y = cuda.gridsize(2)

    for row in range(threadId_row, candid_d_1.shape[0], stride_x):
        for col in range(threadId_col, candid_d_1.shape[1], stride_y):
            candid_d_1[row, col] = 1
            candid_d_2[row, col] = 1
            candid_d_3[row, col] = 1
            candid_d_4[row, col] = 1

        # Calculate the actual length of parents
        if threadId_col == 15:
            for i in range(0, candid_d_1.shape[1]-2):
                if not (parent_d_1[row, i] == 1 and parent_d_1[row, i+1] == 1):
                    candid_d_1[row, 2] += 1
                    
                if not (parent_d_2[row, i] == 1 and parent_d_2[row, i+1] == 1):
                    candid_d_2[row, 2] += 1

            # Minimum length of the two parents
            candid_d_1[row, 3] = \
            min(candid_d_1[row, 2], candid_d_2[row, 2]) 

            # Number of cutting points = (n/5 - 2)
            # candid_d_1[row, 4] = candid_d_1[row, 3]//20 - 2
            n_points = max(min_n, (count%(max_n*4000))//4000) # the n_points increases one every 5000 iterations till 20 then resets to 2 and so on
            candid_d_1[row, 4] = n_points
    
    cuda.syncthreads()
Example #14
0
def cross_over_gpu(candid_d_1, candid_d_2, child_d_1, child_d_2, parent_d_1, parent_d_2):
    threadId_row, threadId_col = cuda.grid(2)
    stride_x, stride_y = cuda.gridsize(2)

    for row in range(threadId_row, candid_d_1.shape[0], stride_x):
        for col in range(threadId_col, candid_d_1.shape[1], stride_y):
            if col > 1:
                child_d_1[row, col] = parent_d_1[row, col]
                child_d_2[row, col] = parent_d_2[row, col]

                # Perform the crossover:
                no_cuts = candid_d_1[row, 4]
                if col < candid_d_2[row, 2]: # Swap from first element to first cut point
                    child_d_1[row, col], child_d_2[row, col] =\
                    child_d_2[row, col], child_d_1[row, col]

                if no_cuts%2 == 0: # For even number of cuts, swap from the last cut point to the end
                    if col > candid_d_2[row, no_cuts+1] and col < child_d_1.shape[1]-1:
                        child_d_1[row, col], child_d_2[row, col] =\
                        child_d_2[row, col], child_d_1[row, col]

                for j in range(2, no_cuts+1):
                    cut_idx = candid_d_2[row, j]
                    if no_cuts%2 == 0:
                        if j%2==1 and col >= cut_idx and col < candid_d_2[row, j+1]:
                            child_d_1[row, col], child_d_2[row, col] =\
                            child_d_2[row, col], child_d_1[row, col]
                    
                    elif no_cuts%2 == 1:
                        if j%2==1 and col>=cut_idx and col < candid_d_2[row, j+1]:
                            child_d_1[row, col], child_d_2[row, col] =\
                            child_d_2[row, col], child_d_1[row, col]

    cuda.syncthreads()
Example #15
0
def cap_adjust(r_flag, vrp_capacity, data_d, pop):    
    threadId_row, threadId_col = cuda.grid(2)
    stride_x, stride_y = cuda.gridsize(2)

    for row in range(threadId_row, pop.shape[0], stride_x):    
        if threadId_col == 15:
            reqcap = 0.0        # required capacity
            
            # Accumulate capacity:
            i = 1
            while pop[row, i] != r_flag:
                i += 1  
                if pop[row,i] == r_flag:
                    break
            
                if pop[row, i] != 1:
                    reqcap += data_d[pop[row, i]-1, 1] # index starts from 0 while individuals start from 1                
                    if reqcap > vrp_capacity:
                        reqcap = 0
                        # Insert '1' and shift right:
                        new_val = 1
                        rep_val = pop[row, i]
                        for j in range(i, pop.shape[1]-2):
                            pop[row, j] = new_val
                            new_val = rep_val
                            rep_val = pop[row, j+1]
                else:
                    reqcap = 0.0
    cuda.syncthreads()
def filter_arr(dst, nres, src, n):
    tid = cuda.grid(1)
    step = cuda.gridsize(1)

    for i in range(tid, n, step):
        if src[i] > 0:
            dst[atomicAggInc(nres)] = src[i]
Example #17
0
def gpu_expand_mask_bits(bits, out):
    """Expand each bits in bitmask *bits* into an element in out.
    This is a flexible kernel that can be launch with any number of blocks
    and threads.
    """
    for i in range(cuda.grid(1), out.size, cuda.gridsize(1)):
        out[i] = mask_get(bits, i)
Example #18
0
def gpu_expand_mask_bits(bits, out):
    """Expand each bits in bitmask *bits* into an element in out.
    This is a flexible kernel that can be launch with any number of blocks
    and threads.
    """
    for i in range(cuda.grid(1), out.size, cuda.gridsize(1)):
        out[i] = mask_get(bits, i)
Example #19
0
def count_weighted_pairs_3d_cuda_transpose2d(pt1, pt2, rbins_squared, result):
    """Naively count Npairs(<r), the total number of pairs that are separated
    by a distance less than r, for each r**2 in the input rbins_squared.
    """
    n1 = pt1.shape[0]
    n2 = pt2.shape[0]
    nbins = rbins_squared.shape[0]

    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    for i in range(start, n1, stride):
        px, py, pz, pw = pt1[i]

        for j in range(n2):
            qx, qy, qz, qw = pt2[j]

            dx = px - qx
            dy = py - qy
            dz = pz - qz
            wprod = pw * qw
            dsq = dx * dx + dy * dy + dz * dz

            k = nbins - 1
            while dsq <= rbins_squared[k]:
                cuda.atomic.add(result, k - 1, wprod)
                k = k - 1
                if k <= 0:
                    break
Example #20
0
def count_weighted_pairs_3d_cuda_noncuml(x1, y1, z1, w1, x2, y2, z2, w2,
                                         rbins_squared, result):
    """Naively count Npairs(<r), the total number of pairs that are separated
    by a distance less than r, for each r**2 in the input rbins_squared.
    """
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    n1 = x1.shape[0]
    n2 = x2.shape[0]
    nbins = rbins_squared.shape[0] - 1

    dlogr = math.log(rbins_squared[1] / rbins_squared[0]) / 2
    minlogr = math.log(rbins_squared[0]) / 2

    for i in range(start, n1, stride):
        px = x1[i]
        py = y1[i]
        pz = z1[i]
        pw = w1[i]
        for j in range(n2):
            qx = x2[j]
            qy = y2[j]
            qz = z2[j]
            qw = w2[j]
            dx = px - qx
            dy = py - qy
            dz = pz - qz
            wprod = pw * qw
            dsq = dx * dx + dy * dy + dz * dz

            k = int((math.log(dsq) / 2 - minlogr) / dlogr)
            if k >= 0 and k < nbins:
                cuda.atomic.add(result, k, wprod)
Example #21
0
def mask_overlappingAK4_cudakernel(etas1, phis1, mask1, offsets1, etas2, phis2,
                                   mask2, offsets2, tau32, tau21, dr2,
                                   tau32cut, tau21cut, mask_out):
    xi = cuda.grid(1)
    xstride = cuda.gridsize(1)

    for iev in range(xi, len(offsets1) - 1, xstride):
        a1 = offsets1[iev]
        b1 = offsets1[iev + 1]

        a2 = offsets2[iev]
        b2 = offsets2[iev + 1]

        for idx1 in range(a1, b1):
            if not mask1[idx1]:
                continue

            eta1 = etas1[idx1]
            phi1 = phis1[idx1]
            for idx2 in range(a2, b2):
                if not mask2[idx2]:
                    continue
                eta2 = etas2[idx2]
                phi2 = phis2[idx2]

                deta = abs(eta1 - eta2)
                dphi = (phi1 - phi2 + math.pi) % (2 * math.pi) - math.pi

                #if first object is closer than dr2, mask element will be *disabled*
                passdr = ((deta**2 + dphi**2) < dr2)
                if passdr:
                    passtau32 = (tau32[idx2] < tau32cut)
                    passtau21 = (tau21[idx2] < tau21cut)
                    mask_out[idx1] = (passtau32 or passtau21)
Example #22
0
def mask_deltar_first_cudakernel(etas1, phis1, mask1, offsets1, etas2, phis2,
                                 mask2, offsets2, dr2, mask_out):
    xi = cuda.grid(1)
    xstride = cuda.gridsize(1)

    for iev in range(xi, len(offsets1) - 1, xstride):
        a1 = offsets1[iev]
        b1 = offsets1[iev + 1]

        a2 = offsets2[iev]
        b2 = offsets2[iev + 1]

        for idx1 in range(a1, b1):
            if not mask1[idx1]:
                continue

            eta1 = etas1[idx1]
            phi1 = phis1[idx1]
            for idx2 in range(a2, b2):
                if not mask2[idx2]:
                    continue
                eta2 = etas2[idx2]
                phi2 = phis2[idx2]

                deta = abs(eta1 - eta2)
                dphi = (phi1 - phi2 + math.pi) % (2 * math.pi) - math.pi

                #if first object is closer than dr2, mask element will be *disabled*
                passdr = ((deta**2 + dphi**2) < dr2)
                mask_out[idx1] = mask_out[idx1] | passdr
Example #23
0
def select_opposite_sign_muons_cudakernel(muon_charges_content,
                                          muon_charges_offsets,
                                          content_mask_in, content_mask_out):
    xi = cuda.grid(1)
    xstride = cuda.gridsize(1)

    for iev in range(xi, muon_charges_offsets.shape[0] - 1, xstride):
        start = muon_charges_offsets[iev]
        end = muon_charges_offsets[iev + 1]

        ch1 = 0
        idx1 = -1
        ch2 = 0
        idx2 = -1

        for imuon in range(start, end):
            if not content_mask_in[imuon]:
                continue

            if idx1 == -1:
                ch1 = muon_charges_content[imuon]
                idx1 = imuon
                continue
            else:
                ch2 = muon_charges_content[imuon]
                if (ch2 != ch1):
                    idx2 = imuon
                    content_mask_out[idx1] = 1
                    content_mask_out[idx2] = 1
                    break
    return
Example #24
0
def mask_deltar_first_cudakernel(etas1, phis1, mask1, offsets1, etas2, phis2,
                                 mask2, offsets2, dr2, mask_out):
    xi = cuda.grid(1)
    xstride = cuda.gridsize(1)

    for iev in range(xi, len(offsets1) - 1, xstride):
        a1 = np.uint64(offsets1[iev])
        b1 = np.uint64(offsets1[iev + 1])

        a2 = np.uint64(offsets2[iev])
        b2 = np.uint64(offsets2[iev + 1])

        for idx1 in range(a1, b1):
            if not mask1[idx1]:
                continue

            eta1 = np.float32(etas1[idx1])
            phi1 = np.float32(phis1[idx1])
            for idx2 in range(a2, b2):
                if not mask2[idx2]:
                    continue
                eta2 = np.float32(etas2[idx2])
                phi2 = np.float32(phis2[idx2])

                deta = abs(eta1 - eta2)
                dphi = deltaphi_devfunc(phi1, phi2)

                # if first object is closer than dr2, mask element will be *disabled*
                passdr = (deta**2 + dphi**2) < dr2
                mask_out[idx1] = mask_out[idx1] | passdr
def fillFull_traingle(dist_out, subarr, n):
    """Fill a sub-triangle under and over the diagonal of array with distance data.

    Parameters
    ----------
    dist_out: array
        The part of the array that will be used in the complete array.
    subarr: array
        Look-up table the with the distances.
    n: int
        Length of the the trigle-matrx.
            
    Returns
    -------
    Void
    """
    x, y = cuda.grid(2)
    dx, dy = cuda.gridsize(2)
    if x >= subarr.shape[0] and y >= subarr.shape[1]:
        return
    for i in range(x, n, dx):
        for k in range(y, i, dy):
            ix = k * n - k * (k + 1) / 2 + (i - 1 - k)
            subarr[int(i), int(k)] = dist_out[int(ix)]
            subarr[int(k), int(i)] = dist_out[int(ix)]
Example #26
0
def fill_histogram_several(data, weights, mask, bins, nbins, nbins_sum, out_w,
                           out_w2):
    xi = cuda.grid(1)
    xstride = cuda.gridsize(1)

    bi = cuda.blockIdx.x
    bd = cuda.blockDim.x
    ti = cuda.threadIdx.x

    # number of histograms to fill
    ndatavec = data.shape[0]

    for iev in range(xi, data.shape[1], xstride):
        if mask[iev]:
            for ivec in range(ndatavec):
                bin_idx = np.int32(
                    searchsorted_devfunc_right(
                        bins[nbins_sum[ivec]:nbins_sum[ivec + 1]], data[ivec,
                                                                        iev]) -
                    1)
                if bin_idx >= nbins[ivec]:
                    bin_idx = nbins[ivec] - 1
                bin_idx_histo = (ivec, bi, bin_idx)

                if bin_idx >= 0 and bin_idx < nbins[ivec]:
                    wi = weights[iev]
                    cuda.atomic.add(out_w, bin_idx_histo, wi)
                    cuda.atomic.add(out_w2, bin_idx_histo, wi**2)
Example #27
0
def select_opposite_sign_cudakernel(charges_offsets, charges_content,
                                    content_mask_in, content_mask_out):
    xi = cuda.grid(1)
    xstride = cuda.gridsize(1)

    for iev in range(xi, charges_offsets.shape[0] - 1, xstride):
        start = np.uint64(charges_offsets[iev])
        end = np.uint64(charges_offsets[iev + 1])

        ch1 = np.int32(0)
        idx1 = np.uint64(0)
        ch2 = np.int32(0)
        idx2 = np.uint64(0)

        for imuon in range(start, end):
            if not content_mask_in[imuon]:
                continue

            if idx1 == 0 and idx2 == 0:
                ch1 = charges_content[imuon]
                idx1 = imuon
                continue
            else:
                ch2 = charges_content[imuon]
                if ch2 != ch1:
                    idx2 = imuon
                    content_mask_out[idx1] = 1
                    content_mask_out[idx2] = 1
                    break
    return
Example #28
0
def searchsorted_kernel_left(vals, arr, inds_out):
    xi = cuda.grid(1)
    xstride = cuda.gridsize(1)
    assert len(vals) == len(inds_out)

    for i in range(xi, len(vals), xstride):
        inds_out[i] = searchsorted_devfunc_left(arr, vals[i])
Example #29
0
def _update_PI_kernel(i, D, ignore_trivial, profile, indices):
    """
    A Numba CUDA kernel to update the matrix profile and matrix profile indices

    Parameters
    ----------
    D : ndarray
        The distance array
    zone_start : int
        The start of the exclusion zone (inclusive)
    zone_stop : int
        The end of the exclusion zone (exclusive)

    Returns
    -------
    None
    """
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    for j in range(start, D.shape[0], stride):
        if D[j] < profile[j, 0]:
            profile[j, 0] = D[j]
            indices[j, 0] = i
        if ignore_trivial:
            if D[j] < profile[j, 1] and i < j:
                profile[j, 1] = D[j]
                indices[j, 1] = i
            if D[j] < profile[j, 2] and i > j:
                profile[j, 2] = D[j]
                indices[j, 2] = i
Example #30
0
def _ignore_trivial_kernel(D, zone_start, zone_stop):
    """
    A Numba CUDA GPU kernel to set distances to `np.inf` within the exclusion zone
    in the range `[zone_start, zone_stop]`

    Parameters
    ----------
    D : ndarray
        The distance array
    zone_start : int
        The start of the exclusion zone (inclusive)
    zone_stop : int
        The end of the exclusion zone (exclusive)

    Returns
    -------
    None
    """

    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    for j in range(start, D.shape[0], stride):
        if j >= zone_start and j < zone_stop:
            D[j] = np.inf
def count_weighted_pairs_3d_cuda(
        x1, y1, z1, w1, x2, y2, z2, w2, rbins_squared, result):
    """Naively count Npairs(<r), the total number of pairs that are separated
    by a distance less than r, for each r**2 in the input rbins_squared.
    """
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    n1 = x1.shape[0]
    n2 = x2.shape[0]
    nbins = rbins_squared.shape[0]

    for i in range(start, n1, stride):
        px = x1[i]
        py = y1[i]
        pz = z1[i]
        pw = w1[i]
        for j in range(n2):
            qx = x2[j]
            qy = y2[j]
            qz = z2[j]
            qw = w2[j]
            dx = px-qx
            dy = py-qy
            dz = pz-qz
            wprod = pw*qw
            dsq = dx*dx + dy*dy + dz*dz

            k = nbins-1
            while dsq <= rbins_squared[k]:
                cuda.atomic.add(result, k-1, wprod)
                k = k-1
                if k <= 0:
                    break
Example #32
0
def histogram(x, xmin, xmax, histogram_out):
    nbins = histogram_out.shape[0]
    bin_width = (xmax - xmin) / nbins

    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    for i in range(start, x.shape[0], stride):
        bin_number = np.int32((x[i] - xmin)/bin_width)
        if bin_number >= 0 and bin_number < histogram_out.shape[0]:
            histogram_out[bin_number] += 1
Example #33
0
 def foo(out):
     x, y, z = cuda.grid(3)
     a, b, c = cuda.gridsize(3)
     grid_is_right = (
         x == cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x and
         y == cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y and
         z == cuda.threadIdx.z + cuda.blockIdx.z * cuda.blockDim.z
     )
     gridsize_is_right = (a == cuda.blockDim.x * cuda.gridDim.x and
                          b == cuda.blockDim.y * cuda.gridDim.y and
                          c == cuda.blockDim.z * cuda.gridDim.z)
     out[x, y, z] = grid_is_right and gridsize_is_right
Example #34
0
def histogram(x, xmin, xmax, histogram_out):
    nbins = histogram_out.shape[0]
    bin_width = (xmax - xmin) / nbins

    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    for i in range(start, x.shape[0], stride):
        bin_number = np.int32((x[i] + xmin)/bin_width)

        if bin_number >= 0 or bin_number < histogram_out.shape[0]:
            cuda.atomic.add(histogram_out, bin_number, 1)
Example #35
0
def histogram(x, xmin, xmax, histogram_out):
    nbins = histogram_out.shape[0]
    bin_width = (xmax - xmin) / nbins

    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    ### DEBUG FIRST THREAD
    if start == 0:
        from pdb import set_trace; set_trace()
    ###

    for i in range(start, x.shape[0], stride):
        bin_number = np.int32((x[i] + xmin)/bin_width)

        if bin_number >= 0 and bin_number < histogram_out.shape[0]:
            cuda.atomic.add(histogram_out, bin_number, 1)
Example #36
0
 def foo(out):
     x, y, z = cuda.grid(3)
     a, b, c = cuda.gridsize(3)
     out[x, y, z] = a * b * c
Example #37
0
 def foo(start, end, delta):
     for i in range(cuda.grid(1), delta.size, cuda.gridsize(1)):
         delta[i] = end[i] - start[i]
Example #38
0
 def foo(dates, target, delta, matches, outdelta):
     for i in range(cuda.grid(1), matches.size, cuda.gridsize(1)):
         matches[i] = dates[i] == target
         outdelta[i] = dates[i] - delta
Example #39
0
def simple_gridsize1d(ary):
    i = cuda.grid(1)
    x = cuda.gridsize(1)
    if i == 0:
        ary[0] = x
Example #40
0
def simple_gridsize2d(ary):
    i, j = cuda.grid(2)
    x, y = cuda.gridsize(2)
    if i == 0 and j == 0:
        ary[0] = x
        ary[1] = y