def double(inp, out): tx = cuda.threadIdx.x bx = cuda.blockIdx.x bw = cuda.blockDim.x idx = tx + bx * bw assert idx == cuda.grid(1) # 1 dimension assert bw == cuda.gridsize(1) while idx < len(inp): out[idx] = inp[idx] * 2 idx += cuda.gridsize(1)
def min_max(x, min_max_array): """ description: cuda jit to calculate the min and max values for the ndarray input: - x: ndarray - min_max_array: cuda.to_device(np.array([dtype_max, dtype_min], dtype=np.float32)) """ start = cuda.grid(1) stride = cuda.gridsize(1) # Array already seeded with starting values appropriate for x's dtype # Not a problem if this array has already been updated local_min = min_max_array[0] local_max = min_max_array[1] for i in range(start, x.shape[0], stride): element = x[i] local_min = min(element, local_min) local_max = max(element, local_max) # Now combine each thread local min and max cuda.atomic.min(min_max_array, 0, local_min) cuda.atomic.max(min_max_array, 1, local_max)
def compute_mandel(min_x, max_x, min_y, max_y, image, iters): ''' A GPU version of calculating the mandel value for each element in the image array. The real and imag variables contain a value for each element of the complex space defined by the X and Y boundaries (min_x, max_x) and (min_y, max_y). Step 1: define the absolute thread id (y, x) in (1024, 1536) and Step 2: define task size (e.g (1,12)) of each thread such to assign (1024 1536) tasks into (1024, 128) threads Step 3: finish tasks in each thread ''' grid_y, grid_x = cuda.gridsize(2) #(1024, 128) y, x = cuda.grid(2) # (y, x) where y is in [0, 1023] and x is in [0, 127] height, width = image.shape # 1024, 1536 pixel_size_x = (max_x - min_x) / width pixel_size_y = (max_y - min_y) / height # get the partition index of the y and x block_y = height // grid_y # 1 block_x = width // grid_x # 12 # every thread in (1024, 128) should handle (1, 12) tasks such that the totally (1024, 1536) tasks are evenly assigned for i in range(block_x): thread_x = x * block_x + i real = min_x + thread_x * pixel_size_x for j in range(block_y): thread_y = y * block_y+ j imag = min_y + thread_y * pixel_size_y if thread_y < height and thread_x < width: image[thread_y, thread_x] = mandel(real, imag, iters)
def compute_inv_mass_cudakernel(offsets, pts, etas, phis, masses, mask_events, mask_objects, out_inv_mass, out_pt_total): xi = cuda.grid(1) xstride = cuda.gridsize(1) for iev in range(xi, offsets.shape[0] - 1, xstride): if mask_events[iev]: start = np.uint64(offsets[iev]) end = np.uint64(offsets[iev + 1]) px_total = np.float32(0.0) py_total = np.float32(0.0) pz_total = np.float32(0.0) e_total = np.float32(0.0) for iobj in range(start, end): if mask_objects[iobj]: pt = pts[iobj] eta = etas[iobj] phi = phis[iobj] mass = masses[iobj] px, py, pz, e = spherical_to_cartesian_devfunc( pt, eta, phi, mass) px_total += px py_total += py pz_total += pz e_total += e inv_mass = math.sqrt(-(px_total**2 + py_total**2 + pz_total**2 - e_total**2)) pt_total = math.sqrt(px_total**2 + py_total**2) out_inv_mass[iev] = inv_mass out_pt_total[iev] = pt_total
def sort_in_offsets_kernel(content, offsets, index_to_get, mask_rows, mask_content, out): xi = cuda.grid(1) xstride = cuda.gridsize(1) for i in range(xi, len(data), xstride): if not mask_rows[iev]: continue start = offsets[iev] end = offsets[iev + 1] event_content = content[start:end] merge_list = enumerate(event_content) while index_to_get < len(event_content): for j in range(len(event_content)): for i in range(1, len(event_content) - j): if event_content[i - 1] > event_content[i]: event_content[i - 1], event_content[i] = event_content[ i], event_content[i - 1] merge_list[ i - 1], merge_list[i] = merge_list[i], merge_list[i - 1] idx = merge_list[index_to_get][1] while not mask_content[idx + start]: idx += 1 out[iev] += content[ielem]
def count_weighted_pairs_3d_cuda_transpose_noncuml(ptswts1, ptswts2, _rbins_squared, result): start = cuda.grid(1) stride = cuda.gridsize(1) n1 = ptswts1.shape[0] // 4 n2 = ptswts1.shape[0] // 4 nbins = _rbins_squared.shape[0] - 1 dlogr = math.log(_rbins_squared[1] / _rbins_squared[0]) / 2 logminr = math.log(_rbins_squared[0]) / 2 smem = cuda.shared.array(128, numba.float32) if cuda.threadIdx.x == 0: for i in range(128): smem[i] = 0 cuda.syncthreads() for i in range(start, n1, stride): loci = 4 * i for j in range(n2): locj = 4 * j dx = ptswts1[loci] - ptswts2[locj] dy = ptswts1[loci + 1] - ptswts2[locj + 1] dz = ptswts1[loci + 2] - ptswts2[locj + 2] dsq = cuda.fma(dx, dx, cuda.fma(dy, dy, dz * dz)) k = int((math.log(dsq) / 2 - logminr) / dlogr) if k >= 0 and k < nbins: cuda.atomic.add(smem, k, ptswts1[loci + 3] * ptswts2[locj + 3]) cuda.syncthreads() if cuda.threadIdx.x == 0: for k in range(nbins): cuda.atomic.add(result, k, smem[k])
def calc_cumsum_data_tile(x, arr1): """ description: cuda jit function calculate the data tile with cumulative sums for a 2-col ndarray input: - x: ndarray(3-cols) (col1,col2, count_col2) - arr1: result array: type is int as the cells contain just frequencies (cuda.to_device(np.zeros(shape=(min_s,max_s)).astype(np.int32))) [X X X] [X X X] [X X X] """ start = cuda.grid(1) stride = cuda.gridsize(1) for i in range(start, x.shape[0], stride): col1_i = int(round(x[i][0])) col2_i = int(round(x[i][1])) freq_i = x[i][2] if int(x[i][0]) != -1: if arr1[col2_i][col1_i] == float(0): arr1[col2_i][col1_i] = freq_i else: arr1[col2_i][col1_i] = (arr1[col2_i][col1_i] + freq_i) / 2
def count_weighted_pairs_3d_cuda_smem_noncuml(x1, y1, z1, w1, x2, y2, z2, w2, _rbins_squared, result): start = cuda.grid(1) stride = cuda.gridsize(1) n1 = x1.shape[0] n2 = x2.shape[0] nbins = _rbins_squared.shape[0] - 1 dlogr = math.log(_rbins_squared[1] / _rbins_squared[0]) / 2 logminr = math.log(_rbins_squared[0]) / 2 smem = cuda.shared.array(128, numba.float32) if cuda.threadIdx.x == 0: for i in range(128): smem[i] = 0 cuda.syncthreads() for i in range(start, n1, stride): for j in range(n2): dx = x1[i] - x2[j] dy = y1[i] - y2[j] dz = z1[i] - z2[j] dsq = cuda.fma(dx, dx, cuda.fma(dy, dy, dz * dz)) k = int((math.log(dsq) / 2 - logminr) / dlogr) if k >= 0 and k < nbins: cuda.atomic.add(smem, k, w1[i] * w2[j]) cuda.syncthreads() if cuda.threadIdx.x == 0: for k in range(nbins): cuda.atomic.add(result, k, smem[k])
def numba_kde_cuda2(eval_points, samples, bandwidths, out): ''' Parameters ---------- eval_points : ndarray, shape (n_eval, n_bandwidths) samples : ndarray, shape (n_samples, n_bandwidths) out : ndarray, shape (n_eval,) ''' thread_id1, thread_id2 = cuda.grid(2) stride1, stride2 = cuda.gridsize(2) (n_eval, n_bandwidths), n_samples = eval_points.shape, samples.shape[0] for eval_ind in range(thread_id1, n_eval, stride1): for sample_ind in range(thread_id2, n_samples, stride2): product_kernel = 1.0 for bandwidth_ind in range(n_bandwidths): product_kernel *= ( gaussian_pdf(eval_points[eval_ind, bandwidth_ind], samples[sample_ind, bandwidth_ind], bandwidths[bandwidth_ind])) product_kernel /= n_samples cuda.atomic.add(out, eval_ind, product_kernel)
def count_weighted_pairs_3d_cuda(x1, y1, z1, w1, x2, y2, z2, w2, rbins_squared, result): start = cuda.grid(1) stride = cuda.gridsize(1) n1 = x1.shape[0] n2 = x2.shape[0] nbins = rbins_squared.shape[0] for i in range(start, n1, stride): px = x1[i] py = y1[i] pz = z1[i] pw = w1[i] for j in range(n2): qx = x2[j] qy = y2[j] qz = z2[j] qw = w2[j] dx = px - qx dy = py - qy dz = pz - qz wprod = pw * qw dsq = dx * dx + dy * dy + dz * dz k = nbins - 1 while dsq <= rbins_squared[k]: cuda.atomic.add(result, k - 1, wprod) k -= 1 if k <= 0: break
def select_individual(index, pop_d, individual): threadId_row, threadId_col = cuda.grid(2) stride_x, stride_y = cuda.gridsize(2) for row in range(threadId_row, pop_d.shape[0], stride_x): if row == index and threadId_col < pop_d.shape[1]: pop_d[row, threadId_col] = individual[row, threadId_col]
def update_pop(count, parent_d_1, parent_d_2, child_d_1, child_d_2, pop_d): threadId_row, threadId_col = cuda.grid(2) stride_x, stride_y = cuda.gridsize(2) for row in range(threadId_row, pop_d.shape[0], stride_x): for col in range(threadId_col, pop_d.shape[1], stride_y): if child_d_1[row, -1] <= parent_d_1[row, -1] and \ child_d_1[row, -1] <= parent_d_2[row, -1] and \ child_d_1[row, -1] <= child_d_2[row, -1]: pop_d[row, col] = child_d_1[row, col] pop_d[row, 0] = count elif child_d_2[row, -1] <= parent_d_1[row, -1] and \ child_d_2[row, -1] <= parent_d_2[row, -1] and \ child_d_2[row, -1] <= child_d_1[row, -1]: pop_d[row, col] = child_d_2[row, col] pop_d[row, 0] = count elif parent_d_1[row, -1] <= parent_d_2[row, -1] and \ parent_d_1[row, -1] <= child_d_1[row, -1] and \ parent_d_1[row, -1] <= child_d_2[row, -1]: pop_d[row, col] = parent_d_1[row, col] pop_d[row, 0] = count elif parent_d_2[row, -1] <= parent_d_1[row, -1] and \ parent_d_2[row, -1] <= child_d_1[row, -1] and \ parent_d_2[row, -1] <= child_d_2[row, -1]: pop_d[row, col] = parent_d_2[row, col] pop_d[row, 0] = count cuda.syncthreads()
def number_cut_points(candid_d_1, candid_d_2, candid_d_3, candid_d_4, parent_d_1, parent_d_2, count, min_n, max_n): threadId_row, threadId_col = cuda.grid(2) stride_x, stride_y = cuda.gridsize(2) for row in range(threadId_row, candid_d_1.shape[0], stride_x): for col in range(threadId_col, candid_d_1.shape[1], stride_y): candid_d_1[row, col] = 1 candid_d_2[row, col] = 1 candid_d_3[row, col] = 1 candid_d_4[row, col] = 1 # Calculate the actual length of parents if threadId_col == 15: for i in range(0, candid_d_1.shape[1]-2): if not (parent_d_1[row, i] == 1 and parent_d_1[row, i+1] == 1): candid_d_1[row, 2] += 1 if not (parent_d_2[row, i] == 1 and parent_d_2[row, i+1] == 1): candid_d_2[row, 2] += 1 # Minimum length of the two parents candid_d_1[row, 3] = \ min(candid_d_1[row, 2], candid_d_2[row, 2]) # Number of cutting points = (n/5 - 2) # candid_d_1[row, 4] = candid_d_1[row, 3]//20 - 2 n_points = max(min_n, (count%(max_n*4000))//4000) # the n_points increases one every 5000 iterations till 20 then resets to 2 and so on candid_d_1[row, 4] = n_points cuda.syncthreads()
def cross_over_gpu(candid_d_1, candid_d_2, child_d_1, child_d_2, parent_d_1, parent_d_2): threadId_row, threadId_col = cuda.grid(2) stride_x, stride_y = cuda.gridsize(2) for row in range(threadId_row, candid_d_1.shape[0], stride_x): for col in range(threadId_col, candid_d_1.shape[1], stride_y): if col > 1: child_d_1[row, col] = parent_d_1[row, col] child_d_2[row, col] = parent_d_2[row, col] # Perform the crossover: no_cuts = candid_d_1[row, 4] if col < candid_d_2[row, 2]: # Swap from first element to first cut point child_d_1[row, col], child_d_2[row, col] =\ child_d_2[row, col], child_d_1[row, col] if no_cuts%2 == 0: # For even number of cuts, swap from the last cut point to the end if col > candid_d_2[row, no_cuts+1] and col < child_d_1.shape[1]-1: child_d_1[row, col], child_d_2[row, col] =\ child_d_2[row, col], child_d_1[row, col] for j in range(2, no_cuts+1): cut_idx = candid_d_2[row, j] if no_cuts%2 == 0: if j%2==1 and col >= cut_idx and col < candid_d_2[row, j+1]: child_d_1[row, col], child_d_2[row, col] =\ child_d_2[row, col], child_d_1[row, col] elif no_cuts%2 == 1: if j%2==1 and col>=cut_idx and col < candid_d_2[row, j+1]: child_d_1[row, col], child_d_2[row, col] =\ child_d_2[row, col], child_d_1[row, col] cuda.syncthreads()
def cap_adjust(r_flag, vrp_capacity, data_d, pop): threadId_row, threadId_col = cuda.grid(2) stride_x, stride_y = cuda.gridsize(2) for row in range(threadId_row, pop.shape[0], stride_x): if threadId_col == 15: reqcap = 0.0 # required capacity # Accumulate capacity: i = 1 while pop[row, i] != r_flag: i += 1 if pop[row,i] == r_flag: break if pop[row, i] != 1: reqcap += data_d[pop[row, i]-1, 1] # index starts from 0 while individuals start from 1 if reqcap > vrp_capacity: reqcap = 0 # Insert '1' and shift right: new_val = 1 rep_val = pop[row, i] for j in range(i, pop.shape[1]-2): pop[row, j] = new_val new_val = rep_val rep_val = pop[row, j+1] else: reqcap = 0.0 cuda.syncthreads()
def filter_arr(dst, nres, src, n): tid = cuda.grid(1) step = cuda.gridsize(1) for i in range(tid, n, step): if src[i] > 0: dst[atomicAggInc(nres)] = src[i]
def gpu_expand_mask_bits(bits, out): """Expand each bits in bitmask *bits* into an element in out. This is a flexible kernel that can be launch with any number of blocks and threads. """ for i in range(cuda.grid(1), out.size, cuda.gridsize(1)): out[i] = mask_get(bits, i)
def count_weighted_pairs_3d_cuda_transpose2d(pt1, pt2, rbins_squared, result): """Naively count Npairs(<r), the total number of pairs that are separated by a distance less than r, for each r**2 in the input rbins_squared. """ n1 = pt1.shape[0] n2 = pt2.shape[0] nbins = rbins_squared.shape[0] start = cuda.grid(1) stride = cuda.gridsize(1) for i in range(start, n1, stride): px, py, pz, pw = pt1[i] for j in range(n2): qx, qy, qz, qw = pt2[j] dx = px - qx dy = py - qy dz = pz - qz wprod = pw * qw dsq = dx * dx + dy * dy + dz * dz k = nbins - 1 while dsq <= rbins_squared[k]: cuda.atomic.add(result, k - 1, wprod) k = k - 1 if k <= 0: break
def count_weighted_pairs_3d_cuda_noncuml(x1, y1, z1, w1, x2, y2, z2, w2, rbins_squared, result): """Naively count Npairs(<r), the total number of pairs that are separated by a distance less than r, for each r**2 in the input rbins_squared. """ start = cuda.grid(1) stride = cuda.gridsize(1) n1 = x1.shape[0] n2 = x2.shape[0] nbins = rbins_squared.shape[0] - 1 dlogr = math.log(rbins_squared[1] / rbins_squared[0]) / 2 minlogr = math.log(rbins_squared[0]) / 2 for i in range(start, n1, stride): px = x1[i] py = y1[i] pz = z1[i] pw = w1[i] for j in range(n2): qx = x2[j] qy = y2[j] qz = z2[j] qw = w2[j] dx = px - qx dy = py - qy dz = pz - qz wprod = pw * qw dsq = dx * dx + dy * dy + dz * dz k = int((math.log(dsq) / 2 - minlogr) / dlogr) if k >= 0 and k < nbins: cuda.atomic.add(result, k, wprod)
def mask_overlappingAK4_cudakernel(etas1, phis1, mask1, offsets1, etas2, phis2, mask2, offsets2, tau32, tau21, dr2, tau32cut, tau21cut, mask_out): xi = cuda.grid(1) xstride = cuda.gridsize(1) for iev in range(xi, len(offsets1) - 1, xstride): a1 = offsets1[iev] b1 = offsets1[iev + 1] a2 = offsets2[iev] b2 = offsets2[iev + 1] for idx1 in range(a1, b1): if not mask1[idx1]: continue eta1 = etas1[idx1] phi1 = phis1[idx1] for idx2 in range(a2, b2): if not mask2[idx2]: continue eta2 = etas2[idx2] phi2 = phis2[idx2] deta = abs(eta1 - eta2) dphi = (phi1 - phi2 + math.pi) % (2 * math.pi) - math.pi #if first object is closer than dr2, mask element will be *disabled* passdr = ((deta**2 + dphi**2) < dr2) if passdr: passtau32 = (tau32[idx2] < tau32cut) passtau21 = (tau21[idx2] < tau21cut) mask_out[idx1] = (passtau32 or passtau21)
def mask_deltar_first_cudakernel(etas1, phis1, mask1, offsets1, etas2, phis2, mask2, offsets2, dr2, mask_out): xi = cuda.grid(1) xstride = cuda.gridsize(1) for iev in range(xi, len(offsets1) - 1, xstride): a1 = offsets1[iev] b1 = offsets1[iev + 1] a2 = offsets2[iev] b2 = offsets2[iev + 1] for idx1 in range(a1, b1): if not mask1[idx1]: continue eta1 = etas1[idx1] phi1 = phis1[idx1] for idx2 in range(a2, b2): if not mask2[idx2]: continue eta2 = etas2[idx2] phi2 = phis2[idx2] deta = abs(eta1 - eta2) dphi = (phi1 - phi2 + math.pi) % (2 * math.pi) - math.pi #if first object is closer than dr2, mask element will be *disabled* passdr = ((deta**2 + dphi**2) < dr2) mask_out[idx1] = mask_out[idx1] | passdr
def select_opposite_sign_muons_cudakernel(muon_charges_content, muon_charges_offsets, content_mask_in, content_mask_out): xi = cuda.grid(1) xstride = cuda.gridsize(1) for iev in range(xi, muon_charges_offsets.shape[0] - 1, xstride): start = muon_charges_offsets[iev] end = muon_charges_offsets[iev + 1] ch1 = 0 idx1 = -1 ch2 = 0 idx2 = -1 for imuon in range(start, end): if not content_mask_in[imuon]: continue if idx1 == -1: ch1 = muon_charges_content[imuon] idx1 = imuon continue else: ch2 = muon_charges_content[imuon] if (ch2 != ch1): idx2 = imuon content_mask_out[idx1] = 1 content_mask_out[idx2] = 1 break return
def mask_deltar_first_cudakernel(etas1, phis1, mask1, offsets1, etas2, phis2, mask2, offsets2, dr2, mask_out): xi = cuda.grid(1) xstride = cuda.gridsize(1) for iev in range(xi, len(offsets1) - 1, xstride): a1 = np.uint64(offsets1[iev]) b1 = np.uint64(offsets1[iev + 1]) a2 = np.uint64(offsets2[iev]) b2 = np.uint64(offsets2[iev + 1]) for idx1 in range(a1, b1): if not mask1[idx1]: continue eta1 = np.float32(etas1[idx1]) phi1 = np.float32(phis1[idx1]) for idx2 in range(a2, b2): if not mask2[idx2]: continue eta2 = np.float32(etas2[idx2]) phi2 = np.float32(phis2[idx2]) deta = abs(eta1 - eta2) dphi = deltaphi_devfunc(phi1, phi2) # if first object is closer than dr2, mask element will be *disabled* passdr = (deta**2 + dphi**2) < dr2 mask_out[idx1] = mask_out[idx1] | passdr
def fillFull_traingle(dist_out, subarr, n): """Fill a sub-triangle under and over the diagonal of array with distance data. Parameters ---------- dist_out: array The part of the array that will be used in the complete array. subarr: array Look-up table the with the distances. n: int Length of the the trigle-matrx. Returns ------- Void """ x, y = cuda.grid(2) dx, dy = cuda.gridsize(2) if x >= subarr.shape[0] and y >= subarr.shape[1]: return for i in range(x, n, dx): for k in range(y, i, dy): ix = k * n - k * (k + 1) / 2 + (i - 1 - k) subarr[int(i), int(k)] = dist_out[int(ix)] subarr[int(k), int(i)] = dist_out[int(ix)]
def fill_histogram_several(data, weights, mask, bins, nbins, nbins_sum, out_w, out_w2): xi = cuda.grid(1) xstride = cuda.gridsize(1) bi = cuda.blockIdx.x bd = cuda.blockDim.x ti = cuda.threadIdx.x # number of histograms to fill ndatavec = data.shape[0] for iev in range(xi, data.shape[1], xstride): if mask[iev]: for ivec in range(ndatavec): bin_idx = np.int32( searchsorted_devfunc_right( bins[nbins_sum[ivec]:nbins_sum[ivec + 1]], data[ivec, iev]) - 1) if bin_idx >= nbins[ivec]: bin_idx = nbins[ivec] - 1 bin_idx_histo = (ivec, bi, bin_idx) if bin_idx >= 0 and bin_idx < nbins[ivec]: wi = weights[iev] cuda.atomic.add(out_w, bin_idx_histo, wi) cuda.atomic.add(out_w2, bin_idx_histo, wi**2)
def select_opposite_sign_cudakernel(charges_offsets, charges_content, content_mask_in, content_mask_out): xi = cuda.grid(1) xstride = cuda.gridsize(1) for iev in range(xi, charges_offsets.shape[0] - 1, xstride): start = np.uint64(charges_offsets[iev]) end = np.uint64(charges_offsets[iev + 1]) ch1 = np.int32(0) idx1 = np.uint64(0) ch2 = np.int32(0) idx2 = np.uint64(0) for imuon in range(start, end): if not content_mask_in[imuon]: continue if idx1 == 0 and idx2 == 0: ch1 = charges_content[imuon] idx1 = imuon continue else: ch2 = charges_content[imuon] if ch2 != ch1: idx2 = imuon content_mask_out[idx1] = 1 content_mask_out[idx2] = 1 break return
def searchsorted_kernel_left(vals, arr, inds_out): xi = cuda.grid(1) xstride = cuda.gridsize(1) assert len(vals) == len(inds_out) for i in range(xi, len(vals), xstride): inds_out[i] = searchsorted_devfunc_left(arr, vals[i])
def _update_PI_kernel(i, D, ignore_trivial, profile, indices): """ A Numba CUDA kernel to update the matrix profile and matrix profile indices Parameters ---------- D : ndarray The distance array zone_start : int The start of the exclusion zone (inclusive) zone_stop : int The end of the exclusion zone (exclusive) Returns ------- None """ start = cuda.grid(1) stride = cuda.gridsize(1) for j in range(start, D.shape[0], stride): if D[j] < profile[j, 0]: profile[j, 0] = D[j] indices[j, 0] = i if ignore_trivial: if D[j] < profile[j, 1] and i < j: profile[j, 1] = D[j] indices[j, 1] = i if D[j] < profile[j, 2] and i > j: profile[j, 2] = D[j] indices[j, 2] = i
def _ignore_trivial_kernel(D, zone_start, zone_stop): """ A Numba CUDA GPU kernel to set distances to `np.inf` within the exclusion zone in the range `[zone_start, zone_stop]` Parameters ---------- D : ndarray The distance array zone_start : int The start of the exclusion zone (inclusive) zone_stop : int The end of the exclusion zone (exclusive) Returns ------- None """ start = cuda.grid(1) stride = cuda.gridsize(1) for j in range(start, D.shape[0], stride): if j >= zone_start and j < zone_stop: D[j] = np.inf
def count_weighted_pairs_3d_cuda( x1, y1, z1, w1, x2, y2, z2, w2, rbins_squared, result): """Naively count Npairs(<r), the total number of pairs that are separated by a distance less than r, for each r**2 in the input rbins_squared. """ start = cuda.grid(1) stride = cuda.gridsize(1) n1 = x1.shape[0] n2 = x2.shape[0] nbins = rbins_squared.shape[0] for i in range(start, n1, stride): px = x1[i] py = y1[i] pz = z1[i] pw = w1[i] for j in range(n2): qx = x2[j] qy = y2[j] qz = z2[j] qw = w2[j] dx = px-qx dy = py-qy dz = pz-qz wprod = pw*qw dsq = dx*dx + dy*dy + dz*dz k = nbins-1 while dsq <= rbins_squared[k]: cuda.atomic.add(result, k-1, wprod) k = k-1 if k <= 0: break
def histogram(x, xmin, xmax, histogram_out): nbins = histogram_out.shape[0] bin_width = (xmax - xmin) / nbins start = cuda.grid(1) stride = cuda.gridsize(1) for i in range(start, x.shape[0], stride): bin_number = np.int32((x[i] - xmin)/bin_width) if bin_number >= 0 and bin_number < histogram_out.shape[0]: histogram_out[bin_number] += 1
def foo(out): x, y, z = cuda.grid(3) a, b, c = cuda.gridsize(3) grid_is_right = ( x == cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x and y == cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y and z == cuda.threadIdx.z + cuda.blockIdx.z * cuda.blockDim.z ) gridsize_is_right = (a == cuda.blockDim.x * cuda.gridDim.x and b == cuda.blockDim.y * cuda.gridDim.y and c == cuda.blockDim.z * cuda.gridDim.z) out[x, y, z] = grid_is_right and gridsize_is_right
def histogram(x, xmin, xmax, histogram_out): nbins = histogram_out.shape[0] bin_width = (xmax - xmin) / nbins start = cuda.grid(1) stride = cuda.gridsize(1) for i in range(start, x.shape[0], stride): bin_number = np.int32((x[i] + xmin)/bin_width) if bin_number >= 0 or bin_number < histogram_out.shape[0]: cuda.atomic.add(histogram_out, bin_number, 1)
def histogram(x, xmin, xmax, histogram_out): nbins = histogram_out.shape[0] bin_width = (xmax - xmin) / nbins start = cuda.grid(1) stride = cuda.gridsize(1) ### DEBUG FIRST THREAD if start == 0: from pdb import set_trace; set_trace() ### for i in range(start, x.shape[0], stride): bin_number = np.int32((x[i] + xmin)/bin_width) if bin_number >= 0 and bin_number < histogram_out.shape[0]: cuda.atomic.add(histogram_out, bin_number, 1)
def foo(out): x, y, z = cuda.grid(3) a, b, c = cuda.gridsize(3) out[x, y, z] = a * b * c
def foo(start, end, delta): for i in range(cuda.grid(1), delta.size, cuda.gridsize(1)): delta[i] = end[i] - start[i]
def foo(dates, target, delta, matches, outdelta): for i in range(cuda.grid(1), matches.size, cuda.gridsize(1)): matches[i] = dates[i] == target outdelta[i] = dates[i] - delta
def simple_gridsize1d(ary): i = cuda.grid(1) x = cuda.gridsize(1) if i == 0: ary[0] = x
def simple_gridsize2d(ary): i, j = cuda.grid(2) x, y = cuda.gridsize(2) if i == 0 and j == 0: ary[0] = x ary[1] = y