def kernel(dst, src): '''A simple kernel that adds 1 to every item ''' i = cuda.grid(1) if i >= dst.shape[0]: return dst[i] = src[i] + 1
def removeEdges(edgeList, sortedArgs, n_discarded): """ inputs: edgeList : list of edges sortedArgs : argument list of the sorted weight list n_discarded : number of edges to be discarded specified in sortedArgs Remove discarded edges form the edge list. Each edge discarded is replaced by -1. Discard edges specified by the last n_discarded arguments in the sortedArgs list. """ tgid = cuda.grid(1) # one thread per edge that must be discarded # total number of edges to be discarded is the difference # between the between the total number of edges and the # number of edges to be considered + the number edges # to be discarded if tgid >= n_discarded: return # remove not considered edges elif tgid < n_considered_edges: maxIdx = edgeList.size - 1 # maximum index of sortedArgs index = maxIdx - tgid # index of edgeList[index] = -1
def compute_lifetimes_CUDA(nweight, lifetimes): edge = cuda.grid(1) if edge >= lifetimes.size: return lifetimes[edge] = nweight[edge + 1] - nweight[edge]
def getWeightsOfEdges_gpu(edges, n_edges, weights, nweights): """ This function will take a list of edges (edges), the number of edges to consider (n_edges, the weights of all the possible edges (weights) and the array for the weights of the list of edges and put the weight of each edge in the list of edges in the nweights, in the same position. The kernel will also discard not considered edges, i.e. edges whose argument >= n_edges. Discarding an edge is done by replacing the edge by -1. """ # n_edges_sm = cuda.shared.array(1, dtype = int32) edge = cuda.grid(1) if edge >= edges.size: return # if edge == 0: # n_edges_sm[0] = n_edges[0] # cuda.syncthreads() # if edge >= n_edges_sm[0]: if edge >= n_edges[0]: edges[edge] = -1 else: myEdgeID = edges[edge] nweights[edge] = weights[myEdgeID]
def addEdges(edges, n_edges, dest, weight, fe, od, top_edge, ndest, nweight): n_edges_sm = cuda.shared.array(0, dtype = int32) edge = cuda.grid(1) # if edge == 0: # n_edges_sm[0] = n_edges[0] key = edges[edge] # if edge is -1 it was marked for removal if key == -1: return o_v = dest[key] i_v = binaryOriginVertexSearch_CUDA(key, dest, fe, od) # get and increment pointers for each vertex i_ptr = cuda.atomic.add(top_edge, i_v, 1) o_ptr =cuda.atomic.add(top_edge, o_v, 1) # add edges to destination array ndest[i_ptr] = o_v ndest[o_ptr] = i_v # add weight to edges edge_w = weight[key] nweight[i_ptr] = edge_w nweight[o_ptr] = edge_w
def get_grad_omega(grad_omega, omega, r, d, qbin): """ Get the gradient of the Debye sum with respect to atomic positions Parameters ---------- grad_omega: kx3xQ array The gradient omega: kxQ array Debye sum r: k array The pair distance array d: kx3 array The pair displacements qbin: float The qbin size """ kmax, _, qmax_bin = grad_omega.shape k, qx = cuda.grid(2) if k >= kmax or qx >= qmax_bin: return sv = f4(qx) * qbin rk = r[k] a = (sv * math.cos(sv * rk)) - omega[k, qx] a /= rk * rk for w in range(i4(3)): grad_omega[k, w, qx] = a * d[k, w]
def builtin_max(A, B, C): i = cuda.grid(1) if i >= len(C): return C[i] = float64(max(A[i], B[i]))
def gpu_expand_mask_bits(bits, out): """Expand each bits in bitmask *bits* into an element in out. This is a flexible kernel that can be launch with any number of blocks and threads. """ for i in range(cuda.grid(1), out.size, cuda.gridsize(1)): out[i] = mask_get(bits, i)
def d2_to_d1_sum(d1, d2): qx = cuda.grid(1) if qx >= len(d1): return tmp = d2[:, qx].sum() d1[qx] = tmp
def gpu_gather(data, index, out): i = cuda.grid(1) if i < index.size: idx = index[i] # Only do it if the index is in range if 0 <= idx < data.size: out[i] = data[idx]
def cufftShift_2D_kernel(data, N): """ adopted CUDA FFT shift code from: https://github.com/marwan-abdellah/cufftShift (GNU Lesser Public License) """ # // 2D Slice & 1D Line sLine = N sSlice = N * N # // Transformations Equations sEq1 = int((sSlice + sLine) / 2) sEq2 = int((sSlice - sLine) / 2) x, y = cuda.grid(2) # // Thread Index Converted into 1D Index index = (y * N) + x if x < N / 2: if y < N / 2: # // First Quad temp = data[index] data[index] = data[index + sEq1] # // Third Quad data[index + sEq1] = temp else: if y < N / 2: # // Second Quad temp = data[index] data[index] = data[index + sEq2] data[index + sEq2] = temp
def lateral_inh(S, V, K_inh): idx, idy, idz = cuda.grid(3) if idx > V.shape[0] - 1: return if idy > V.shape[1] - 1: return if idz > V.shape[2] - 1: return # if neuron has not fired terminate the thread if S[idx, idy, idz] != 1: return # if a neuron in this position has fired before do not fire again if K_inh[idx, idy] == 0: S[idx, idy, idz] = 0 return # neuron at this position but in other input map for k in range(V.shape[2]): if S[idx, idy, k] == 1 and V[idx, idy, idz] < V[idx, idy, k]: S[idx, idy, idz] = 0 return K_inh[idx, idy] = 0
def vec_add_ilp_x4(a, b, c): # read i = cuda.grid(1) ai = a[i] bi = b[i] bw = cuda.blockDim.x gw = cuda.gridDim.x stride = gw * bw j = i + stride aj = a[j] bj = b[j] k = j + stride ak = a[k] bk = b[k] l = k + stride al = a[l] bl = b[l] # compute ci = core(ai, bi) cj = core(aj, bj) ck = core(ak, bk) cl = core(al, bl) # write c[i] = ci c[j] = cj c[k] = ck c[l] = cl
def fast_matmul(A, B, C): # Define an array in the shared memory # The size and type of the arrays must be known at compile time sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32) sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32) x, y = cuda.grid(2) tx = cuda.threadIdx.x ty = cuda.threadIdx.y bpg = cuda.gridDim.x # blocks per grid if x >= C.shape[0] and y >= C.shape[1]: # Quit if (x, y) is outside of valid C boundary return # Each thread computes one element in the result matrix. # The dot product is chunked into dot products of TPB-long vectors. tmp = 0. for i in range(bpg): # Preload data into shared memory sA[tx, ty] = A[x, ty + i * TPB] sB[tx, ty] = B[tx + i * TPB, y] # Wait until all threads finish preloading cuda.syncthreads() # Computes partial product on the shared memory for j in range(TPB): tmp += sA[tx, j] * sB[j, ty] # Wait until all threads finish computing cuda.syncthreads() C[x, y] = tmp
def vec_add_ilp_x8(a, b, c): # read i = cuda.grid(1) ai = a[i] bi = b[i] bw = cuda.blockDim.x gw = cuda.gridDim.x stride = gw * bw j = i + stride aj = a[j] bj = b[j] k = j + stride ak = a[k] bk = b[k] l = k + stride al = a[l] bl = b[l] m = l + stride am = a[m] bm = b[m] n = m + stride an = a[n] bn = b[n] o = n + stride ao = a[o] bo = b[o] p = o + stride ap = a[o] bp = b[o] # compute ci = core(ai, bi) cj = core(aj, bj) ck = core(ak, bk) cl = core(al, bl) cm = core(am, bm) cn = core(an, bn) co = core(ao, bo) cp = core(ap, bp) # write c[i] = ci c[j] = cj c[k] = ck c[l] = cl c[m] = cm c[n] = cn c[o] = co c[p] = cp
def function_with_lots_of_registers(x, a, b, c, d, e, f): a1 = 1.0 a2 = 1.0 a3 = 1.0 a4 = 1.0 a5 = 1.0 b1 = 1.0 b2 = 1.0 b3 = 1.0 b4 = 1.0 b5 = 1.0 c1 = 1.0 c2 = 1.0 c3 = 1.0 c4 = 1.0 c5 = 1.0 d1 = 10 d2 = 10 d3 = 10 d4 = 10 d5 = 10 for i in range(a): a1 += b a2 += c a3 += d a4 += e a5 += f b1 *= b b2 *= c b3 *= d b4 *= e b5 *= f c1 /= b c2 /= c c3 /= d c4 /= e c5 /= f d1 <<= b d2 <<= c d3 <<= d d4 <<= e d5 <<= f x[cuda.grid(1)] = a1 + a2 + a3 + a4 + a5 x[cuda.grid(1)] += b1 + b2 + b3 + b4 + b5 x[cuda.grid(1)] += c1 + c2 + c3 + c4 + c5 x[cuda.grid(1)] += d1 + d2 + d3 + d4 + d5
def gpu_compact_mask_bytes(bools, bits): tid = cuda.grid(1) base = tid * mask_bitsize for i in range(base, base + mask_bitsize): if i >= bools.size: break if bools[i]: mask_set(bits, i)
def gpu_insert_if_masked(arr, mask, out_idx, out_queue): i = cuda.grid(1) if i < arr.size: diff = mask[i] if diff: wridx = cuda.atomic.add(out_idx, 0, 1) if wridx < out_queue.size: out_queue[wridx] = arr[i]
def cu_mat_power_binop(A, power, power_A): y, x = cuda.grid(2) m, n = power_A.shape if x >= n or y >= m: return power_A[y, x] = A[y, x] ** power
def cu_mat_power(A, power, power_A): y, x = cuda.grid(2) m, n = power_A.shape if x >= n or y >= m: return power_A[y, x] = math.pow(A[y, x], int32(power))
def cuconstRecAlign(A, B, C, D, E): Z = cuda.const.array_like(CONST_RECORD_ALIGN) i = cuda.grid(1) A[i] = Z[i]['a'] B[i] = Z[i]['b'] C[i] = Z[i]['x'] D[i] = Z[i]['y'] E[i] = Z[i]['z']
def simple_smem(ary): sm = cuda.shared.array(N, int32) i = cuda.grid(1) if i == 0: for j in range(N): sm[j] = j cuda.syncthreads() ary[i] = sm[i]
def rng_kernel_float64(states, out, count, distribution): thread_id = cuda.grid(1) for i in range(count): if distribution == UNIFORM: out[thread_id * count + i] = xoroshiro128p_uniform_float64(states, thread_id) elif distribution == NORMAL: out[thread_id * count + i] = xoroshiro128p_normal_float64(states, thread_id)
def saxpy(a, x, y, out): # Short for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x i = cuda.grid(1) # Map i to array elements if i >= out.size: # Out of range? return # Do actual work out[i] = a * x[i] + y[i]
def experimental_sum_grad_fq1(new_grad, grad, k_cov): k, qx = cuda.grid(2) if k >= len(grad) or qx >= grad.shape[2]: return i, j = cuda_k_to_ij(i4(k + k_cov)) for tz in range(3): a = grad[k, tz, qx] cuda.atomic.add(new_grad, (j, tz, qx), a) cuda.atomic.add(new_grad, (i, tz, qx), f4(-1.) * a)
def matmul(A, B, C): """Perform square matrix multiplication of C = A * B """ i, j = cuda.grid(2) if i < C.shape[0] and j < C.shape[1]: tmp = 0. for k in range(A.shape[1]): tmp += A[i, k] * B[k, j] C[i, j] = tmp
def intrinsic_forloop_step(c): startX, startY = cuda.grid(2) gridX = cuda.gridDim.x * cuda.blockDim.x gridY = cuda.gridDim.y * cuda.blockDim.y height, width = c.shape for x in range(startX, width, gridX): for y in range(startY, height, gridY): c[y, x] = x + y
def preCalc(y, yA, yB, numDataPoints): i = cuda.grid(1) k = i % numDataPoints ans = float32(1.001 * float32(i)) y[i] = ans yA[i] = ans * 1.0 yB[i] = ans / 1.0
def cuda_all_euc_dists_inner(coords_arr, out_arr): x, y, m = cuda.grid(3) num_models, num_beads, num_dims = coords_arr.shape if x < num_beads and y < num_beads and m < num_models: acc = 0.0 for d in range(num_dims): acc += (coords_arr[m, x, d] - coords_arr[m, y, d]) ** 2 out_arr[x, y, m] = math.sqrt(acc)
def diagproduct(c, a, b): startX, startY = cuda.grid(2) gridX = cuda.gridDim.x * cuda.blockDim.x gridY = cuda.gridDim.y * cuda.blockDim.y height = c.shape[0] width = c.shape[1] for x in range(startX, width, (gridX)): for y in range(startY, height, (gridY)): c[y, x] = a[y, x] * b[x]
def math_sqrt(A, B): i = cuda.grid(1) B[i] = math.sqrt(A[i])
def math_acosh(A, B): i = cuda.grid(1) B[i] = math.acosh(A[i])
def useless_syncwarp(ary): i = cuda.grid(1) cuda.syncwarp() ary[i] = i
def useless_syncthreads(ary): i = cuda.grid(1) cuda.syncthreads() ary[i] = i
def dyn_shared_memory(ary): i = cuda.grid(1) sm = cuda.shared.array(0, float32) sm[i] = i * 2 cuda.syncthreads() ary[i] = sm[i]
def math_lgamma(A, B): i = cuda.grid(1) B[i] = math.lgamma(A[i])
def math_atanh(A, B): i = cuda.grid(1) B[i] = math.atanh(A[i])
def math_ceil(A, B): i = cuda.grid(1) B[i] = math.ceil(A[i])
def math_degrees(A, B): i = cuda.grid(1) B[i] = math.degrees(A[i])
def math_radians(A, B): i = cuda.grid(1) B[i] = math.radians(A[i])
def math_pow_binop(A, B, C): i = cuda.grid(1) C[i] = A[i] ** B[i]
def atomic_compare_and_swap(res, old, ary): gid = cuda.grid(1) if gid < res.size: out = cuda.atomic.compare_and_swap(res[gid:], -99, ary[gid]) old[gid] = out
def math_sin(A, B): i = cuda.grid(1) B[i] = math.sin(A[i])
def math_cos(A, B): i = cuda.grid(1) B[i] = math.cos(A[i])
def math_pow(A, B, C): i = cuda.grid(1) C[i] = math.pow(A[i], B[i])
def math_asinh(A, B): i = cuda.grid(1) B[i] = math.asinh(A[i])
def math_hypot(A, B, C): i = cuda.grid(1) C[i] = math.hypot(A[i], B[i])
def math_isfinite(A, B): i = cuda.grid(1) B[i] = math.isfinite(A[i])
def math_log1p(A, B): i = cuda.grid(1) B[i] = math.log1p(A[i])
def math_isinf(A, B): i = cuda.grid(1) B[i] = math.isinf(A[i])
def math_isnan(A, B): i = cuda.grid(1) B[i] = math.isnan(A[i])
def math_modf(A, B, C): i = cuda.grid(1) B[i], C[i] = math.modf(A[i])
def use_syncthreads_count(ary_in, ary_out): i = cuda.grid(1) ary_out[i] = cuda.syncthreads_count(ary_in[i])
def math_fmod(A, B, C): i = cuda.grid(1) C[i] = math.fmod(A[i], B[i])
def coop_smem2d(ary): i, j = cuda.grid(2) sm = cuda.shared.array((10, 20), float32) sm[i, j] = (i + 1) / (j + 1) cuda.syncthreads() ary[i, j] = sm[i, j]
def math_copysign(A, B, C): i = cuda.grid(1) C[i] = math.copysign(A[i], B[i])
def useless_syncwarp_with_mask(ary): i = cuda.grid(1) cuda.syncwarp(0xFFFF) ary[i] = i
def math_floor(A, B): i = cuda.grid(1) B[i] = math.floor(A[i])
def copykernel(x, y): i = cuda.grid(1) if i < x.shape[0]: x[i] = i y[i] = i
def math_mod_binop(A, B, C): i = cuda.grid(1) C[i] = A[i] % B[i]