def compute_sample_kernel(factors, longest_wavelet, offsets_per_wavelength, output, num_rows):
    """Accumulate ``get_value_gpu`` results over ``num_rows`` rows into ``output``.

    For every ``row_index`` in ``range(num_rows)`` the value returned by
    ``get_value_gpu`` is added to ``output[cuda.grid(1)]``; the last entry of
    ``factors`` is added as well.

    NOTE(review): the original source had lost its indentation; the final
    ``+= factors[-1]`` is assumed to run once, after the loop -- confirm.
    """
    num_wavelengths = longest_wavelet - 2
    # NOTE(review): this zeroes output[gridDim.x] (index = number of blocks),
    # while the accumulation below targets output[cuda.grid(1)] -- confirm
    # that the element being accumulated is initialised elsewhere.
    output[cuda.gridDim.x] = 0.0
    for row_index in range(num_rows):
        # NOTE(review): gridDim.x (not the thread index) is forwarded to
        # get_value_gpu as its third argument -- verify this is intended.
        output[cuda.grid(1)] += get_value_gpu(factors, row_index, cuda.gridDim.x, longest_wavelet, num_wavelengths, offsets_per_wavelength)
    output[cuda.grid(1)] += factors[-1]
def vec_add_ilp_x4(a, b, c):
    """Apply ``core`` to four element pairs of ``a`` and ``b`` per thread.

    The four indices start at ``cuda.grid(1)`` and are spaced by
    ``gridDim.x * blockDim.x``.  All loads are issued first, then all
    ``core`` calls, then all stores into ``c``.  No bounds checks are made.
    """
    # read
    idx0 = cuda.grid(1)
    step = cuda.gridDim.x * cuda.blockDim.x
    idx1 = idx0 + step
    idx2 = idx1 + step
    idx3 = idx2 + step

    a0 = a[idx0]
    b0 = b[idx0]
    a1 = a[idx1]
    b1 = b[idx1]
    a2 = a[idx2]
    b2 = b[idx2]
    a3 = a[idx3]
    b3 = b[idx3]

    # compute
    r0 = core(a0, b0)
    r1 = core(a1, b1)
    r2 = core(a2, b2)
    r3 = core(a3, b3)

    # write
    c[idx0] = r0
    c[idx1] = r1
    c[idx2] = r2
    c[idx3] = r3
def pruneGPU(input_d, num_elements, min_sup):
    """Zero every entry of ``input_d`` that is smaller than ``min_sup``.

    Each thread handles the element at its ``cuda.grid(1)`` index; threads
    whose index is ``>= num_elements`` do nothing.  ``input_d`` is modified
    in place.
    """
    index = cuda.grid(1)
    # Short-circuit keeps the array read behind the bounds check.
    if index < num_elements and input_d[index] < min_sup:
        input_d[index] = 0
def c_distribute(rands, low, high):
    """Replace ``rands[i]`` in place by ``(1 - rands[i]) * low + rands[i] * high``.

    Threads whose ``cuda.grid(1)`` index is outside ``rands.shape[0]`` do
    nothing.
    """
    pos = cuda.grid(1)
    if pos < rands.shape[0]:
        rands[pos] = (1.0 - rands[pos]) * low + rands[pos] * high
def abs_m(a, out):
    """Write ``fabs(a[i, j])`` into ``out[i, j]`` for each in-range thread.

    The valid range is taken from ``out.shape``; the thread coordinates come
    from ``cuda.grid(2)``.
    """
    row, col = cuda.grid(2)
    if row >= out.shape[0] or col >= out.shape[1]:
        return
    out[row, col] = fabs(a[row, col])
def to_convert_to_cuda(lx, ly, numdrops, x, obst_x, y, obst_y, obst_r, fIn, gIn, rho2_0, tNS, cuNS2, jx2, jy2, rho1_0, cuNS1, jx1, jy1):
    """Fill ``fIn``/``gIn`` at grid point ``(j, k)`` depending on obstacle distance.

    For each of the ``numdrops`` obstacles the squared distance from
    ``(x[j, k], y[j, k])`` to ``(obst_x[z, 0], obst_y[z, 0])`` is compared with
    the squared radius ``obst_r[z, 0]``:

    * inside or on the circle: ``fIn`` is zeroed and ``gIn`` gets the
      ``rho2_0`` expression;
    * outside, and only while the counter ``t`` is still 0: ``fIn`` gets the
      ``rho1_0`` expression and ``gIn`` is zeroed.

    NOTE(review): ``i`` is not defined anywhere in this function; it must be
    supplied from an enclosing scope or this is unfinished code (the function
    name suggests the latter) -- confirm.
    NOTE(review): the original source had lost its indentation; each
    ``t = t + 1`` is assumed to belong to the branch that precedes it.
    """
    j, k = cuda.grid(2)
    if j < lx + 1 and k < ly + 1:
        t = 0  # counts how many branch bodies have executed for this point
        for z in xrange(numdrops):
            # The next three expression statements have no effect.
            obst_x[z, 0]
            obst_y[z, 0]
            obst_r[z, 0]
            # Squared distance to the obstacle centre and squared radius.
            val = (x[j, k] - obst_x[z, 0]) * (x[j, k] - obst_x[z, 0]) + (
                y[j, k] - obst_y[z, 0]) * (y[j, k] - obst_y[z, 0])
            ref = obst_r[z, 0] * obst_r[z, 0]
            if val <= ref:
                fIn[i, j, k] = 0
                gIn[i, j, k] = rho2_0 * tNS[0, i] * (
                    1 + cuNS2[j, k] + (1. / 2) * (cuNS2[j, k] * cuNS2[j, k]) -
                    (3. / 2) * (jx2[j, k] * jx2[j, k] + jy2[j, k] * jy2[j, k]))
                t = t + 1
            elif val > ref and (t < 1):
                fIn[i, j, k] = rho1_0 * tNS[0, i] * (
                    1 + cuNS1[j, k] + 1. / 2 * (cuNS1[j, k] * cuNS1[j, k]) -
                    (3. / 2) * (jx1[j, k] * jx1[j, k] + jy1[j, k] * jy1[j, k]))
                gIn[i, j, k] = 0
                t = t + 1
def saxpy(a, x, y, out):
    """Store ``a * x[i] + y[i]`` in ``out[i]`` for every in-range thread index."""
    # cuda.grid(1) is short for threadIdx.x + blockIdx.x * blockDim.x
    pos = cuda.grid(1)
    if pos >= out.size:
        return
    out[pos] = a * x[pos] + y[pos]
def mmultiply_pointwise(a,b,out):
    """Element-wise product: ``out[i, j] = a[i, j] * b[i, j]``.

    Threads whose ``cuda.grid(2)`` coordinates fall outside ``a.shape`` do
    nothing.
    """
    row, col = cuda.grid(2)
    if row >= a.shape[0] or col >= a.shape[1]:
        return
    out[row, col] = a[row, col] * b[row, col]
def cu_matmul_sm(A, B, C, n, tpb, bpg):
    """Tiled matrix product ``C = A . B`` using two shared-memory tiles.

    The kernel walks over ``bpg`` tiles of width ``tpb``.  For each tile every
    thread copies one element of ``A`` and one of ``B`` into shared arrays of
    shape ``block_dim``, the block synchronises, and threads with
    ``x < n and y < n`` add the partial dot product to their accumulator.
    The accumulated value is finally written to ``C[x, y]``.
    """
    # Declare the shared-memory tiles.
    tile_a = cuda.shared.array(shape=block_dim, dtype=float32)
    tile_b = cuda.shared.array(shape=block_dim, dtype=float32)

    # Global position of the thread and its position inside the block.
    row, col = cuda.grid(2)
    local_row = cuda.threadIdx.x
    local_col = cuda.threadIdx.y
    in_range = row < n and col < n

    # Partial products are summed tile by tile.
    total = 0.0
    for tile in range(bpg):
        shift = tile * tpb

        # Stage 1: fill shared memory with the current tile of A and of B.
        tile_a[local_row, local_col] = A[row, local_col + shift]
        tile_b[local_row, local_col] = B[local_row + shift, col]

        # Wait until the whole tile has been loaded.
        cuda.syncthreads()

        # Stage 2: partial dot product for this tile.
        if in_range:
            for k in range(tpb):
                total += tile_a[local_row, k] * tile_b[k, local_col]

        # Wait until every thread is done before the tiles are overwritten.
        cuda.syncthreads()

    # Store the accumulated dot product.
    if in_range:
        C[row, col] = total
def _gaussian_cuda32(fac, n_rep, t, n_t, a_facGo, b_facGo, c_facGo):
    """Fill ``fac[i, j]`` with a Gaussian evaluated at ``t[j]``.

    The value written is
    ``a_facGo[i] * exp(-(t[j] - b_facGo[i])**2 / (2 * c_facGo[i]**2))``.
    Threads with ``i >= n_rep`` or ``j >= n_t`` do nothing.
    """
    i, j = cuda.grid(2)
    if i < n_rep and j < n_t:
        # Fill in the 2D fac data structure
        offset = t[j] - b_facGo[i]
        fac[i, j] = a_facGo[i] * exp(-offset**2 / (2 * c_facGo[i]**2))
def produce_chId_lit_gpu(rid, literal, chunk_id, length):
    """Derive ``chunk_id[i]`` and set two bits in ``literal[i]`` from ``rid[i]``.

    * ``chunk_id[i]`` becomes ``rid[i] // 31``.
    * Bit 31 of ``literal[i]`` is set.
    * Bit ``30 - rid[i] % 31`` of ``literal[i]`` is set.

    Threads whose ``cuda.grid(1)`` index is ``>= length`` do nothing.
    """
    i = cuda.grid(1)
    if i < length:
        # Floor division keeps the chunk id integral under both Python 2 and
        # Python 3 division semantics (plain "/" is true division on Python 3).
        chunk_id[i] = rid[i] // 31
        literal[i] = (literal[i] | 1 << 31)  # the left bit set to 1
        off_set = 30 - rid[i] % 31
        literal[i] = (literal[i] | 1 << off_set)
def get_reduced(literal, start_pos, reduced_length, reduced_literal, input_data, chunk_id, reduced_input_data, reduced_chunk_id):
    """OR together one segment of ``literal`` and copy the segment's first keys.

    Segment ``i`` spans ``literal[start_pos[i]:start_pos[i+1]]``.  Every value
    in it is OR-ed into ``reduced_literal[i]``; ``reduced_input_data[i]`` and
    ``reduced_chunk_id[i]`` are taken from position ``start_pos[i]``.
    Threads with index ``>= reduced_length`` do nothing.
    """
    seg = cuda.grid(1)
    if seg >= reduced_length:
        return
    for word in literal[start_pos[seg]:start_pos[seg+1]]:
        reduced_literal[seg] |= word
    reduced_input_data[seg] = input_data[start_pos[seg]]
    reduced_chunk_id[seg] = chunk_id[start_pos[seg]]
def maxPoly(x0, coef, tol, nParam, argMax):
    """Run Newton steps from ``x0`` and store the result in ``argMax[i]``.

    Each step uses first derivative ``2*coef[i]*x + 2.3`` and second
    derivative ``2*coef[i]``.  Iteration stops once the step size is no
    longer greater than ``tol``.  Threads with ``i >= nParam`` do nothing.
    """
    # Thread ID
    i = cuda.grid(1)

    # Only threads below nParam do any work.
    if i < nParam:
        current = x0
        step = tol+1
        # Iterate to convergence
        while step > tol:
            slope = 2*coef[i]*current + 2.3
            curvature = 2*coef[i]
            # Newton step
            candidate = current - slope/curvature
            # Step size drives the convergence check.
            step = abs(candidate-current)
            current = candidate
        # Function output
        argMax[i] = current
def maxPoly(x0, coef, tol, nParam, argMax):
    """Newton iteration per thread; the converged point goes to ``argMax[i]``.

    The update is ``x - (2*coef[i]*x + 2.3) / (2*coef[i])`` and it is repeated
    while the absolute change exceeds ``tol``.  Threads whose index is
    ``>= nParam`` return immediately.
    """
    # Thread ID
    i = cuda.grid(1)

    # The kernel should only execute if i < nParam
    if i < nParam:
        guess = x0
        change = tol + 1
        # Iterate to convergence
        while change > tol:
            first = 2 * coef[i] * guess + 2.3
            second = 2 * coef[i]
            # Newton step
            updated = guess - first / second
            # Difference for the convergence check, then update
            change = abs(updated - guess)
            guess = updated
        # Function output
        argMax[i] = guess
def reduce_by_key_gpu(literal, flag, is_finish, hop, length):
    """One hop of a flag-delimited OR-reduction over ``literal``.

    For ``i < length - hop``: if ``is_finish[i]`` is unset and ``flag[i+hop]``
    is unset, ``literal[i+hop]`` is OR-ed into ``literal[i]``; otherwise
    ``is_finish[i]`` is set to 1.
    """
    i = cuda.grid(1)
    if i >= length-hop:
        return
    if is_finish[i] or flag[i+hop]:
        is_finish[i] = 1
    else:
        literal[i] |= literal[i+hop]
def kernel(spheres, bitmap):
    """Shade pixel ``(x, y)`` with the colour of the sphere with the largest ``hit`` value.

    For each of the ``SPHERES`` entries, ``hit`` is evaluated at the pixel
    position shifted by ``DIM / 2``.  Whenever the result exceeds the best
    value seen so far, the sphere's ``r``/``g``/``b`` scaled by
    ``(t - z) / sqrt(radius * radius)`` becomes the pixel colour.  The result
    is written as four channels to ``bitmap[x, y, :]`` with alpha 255.
    """
    # cuda.grid(2) is an alias for
    #   threadIdx.x + blockIdx.x * blockDim.x,
    #   threadIdx.y + blockIdx.y * blockDim.y
    px, py = cuda.grid(2)

    # shift the grid to [-DIM/2, DIM/2]
    ox = px - DIM / 2
    oy = py - DIM / 2

    red = 0.
    green = 0.
    blue = 0.
    best = -INF

    for idx in range(SPHERES):
        depth = hit(ox, oy, spheres[idx])
        radius = spheres[idx].radius
        if depth > best:
            # depth = dz + z; inverting the hit() result
            dz = depth - spheres[idx].z
            # darker as we recede from the edge of the cube
            # circumscribing the sphere
            fscale = dz / sqrt(radius * radius)
            red = spheres[idx].r * fscale
            green = spheres[idx].g * fscale
            blue = spheres[idx].b * fscale
            best = depth

    # Save the RGBA value for this particular pixel
    bitmap[px, py, 0] = int(red * 255.)
    bitmap[px, py, 1] = int(green * 255.)
    bitmap[px, py, 2] = int(blue * 255.)
    bitmap[px, py, 3] = 255
def produce_fill_gpu(d_head, d_reduced_chunk_id, reduced_chunk_id, reduced_length):
    """Store the chunk-id gap to the previous entry where ``d_head[i]`` is unset.

    For such ``i`` the value written to ``d_reduced_chunk_id[i]`` is
    ``reduced_chunk_id[i] - reduced_chunk_id[i - 1] - 1``.  Threads with
    ``i >= reduced_length`` do nothing.
    """
    i = cuda.grid(1)
    if i < reduced_length and not d_head[i]:
        gap = reduced_chunk_id[i] - reduced_chunk_id[i - 1]
        d_reduced_chunk_id[i] = gap - 1
def getIdx_gpu(fill_word, reduced_literal, index, compact_flag, length):
    """Interleave ``fill_word`` and ``reduced_literal`` into ``index``.

    ``index[2*i]`` receives ``fill_word[i]`` and ``index[2*i + 1]`` receives
    ``reduced_literal[i]``.  When ``fill_word[i]`` is falsy,
    ``compact_flag[2*i]`` is cleared.  Threads with ``i >= length`` do nothing.
    """
    i = cuda.grid(1)
    if i >= length:
        return
    even = i * 2
    index[even] = fill_word[i]
    index[even + 1] = reduced_literal[i]
    if not fill_word[i]:
        compact_flag[even] = 0
def induced_velocity4(x, xvort, gam, vel):
    """Sum the velocity induced at ``x[i]`` by all vortices, via shared-memory tiles.

    The vortices are processed in tiles of ``blksize``.  For each tile every
    thread of the block loads one vortex position (``xvort[j, 0:2]``) and
    strength (``gam[j]``) into shared memory -- zeros when ``j`` is past the
    end -- the block synchronises, and each thread with a valid ``i`` adds the
    tile's contributions:

        rsq   = (x0 - vx)**2 + (x1 - vy)**2 + eps**2
        xvel +=  g * (x1 - vy) / rsq
        yvel += -g * (x0 - vx) / rsq

    The totals are written to ``vel[i, 0]`` and ``vel[i, 1]``.

    Every thread of the block takes part in the loads and in both
    ``cuda.syncthreads()`` calls, including threads whose ``i`` is outside
    ``x``; only the arithmetic and the final store are guarded.  ``blksize``,
    ``eps`` and ``f8`` are read from the enclosing scope.
    """
    smem = cuda.shared.array((blksize, 3), dtype=f8)
    t = cuda.threadIdx.x
    i = cuda.grid(1)
    nvort = xvort.shape[0]
    active = i < x.shape[0]

    # Give the coordinates a defined value even for out-of-range threads so
    # that no variable is left uninitialised.
    x0 = 0.0
    x1 = 0.0
    if active:
        x0 = x[i, 0]
        x1 = x[i, 1]
    xvel = 0.0
    yvel = 0.0

    for blk in range((nvort - 1) // blksize + 1):
        # load vortex positions and strengths into shared memory
        j = blk * blksize + t
        if j < nvort:
            smem[t, 0] = xvort[j, 0]
            smem[t, 1] = xvort[j, 1]
            smem[t, 2] = gam[j]
        else:
            # Zero strength: padded entries contribute nothing.
            smem[t, 0] = 0
            smem[t, 1] = 0
            smem[t, 2] = 0
        cuda.syncthreads()

        # compute the contributions to the velocity
        if active:
            for k in range(blksize):
                rsq = (x0 - smem[k, 0])**2 + (x1 - smem[k, 1])**2 + eps**2
                xvel += smem[k, 2] * (x1 - smem[k, 1]) / rsq
                yvel += -smem[k, 2] * (x0 - smem[k, 0]) / rsq
        # Keep the tile intact until every thread has finished reading it.
        cuda.syncthreads()

    if active:
        vel[i, 0] = xvel
        vel[i, 1] = yvel
def produce_chId_lit_gpu(rid, literal, chunk_id, length):
    """Compute the chunk id of ``rid[i]`` and mark its bit in ``literal[i]``.

    * ``chunk_id[i] = rid[i] // 31``
    * bit 31 of ``literal[i]`` is set
    * bit ``30 - rid[i] % 31`` of ``literal[i]`` is set

    Threads whose ``cuda.grid(1)`` index is ``>= length`` do nothing.
    """
    i = cuda.grid(1)
    if i < length:
        # "//" instead of "/": integer result regardless of whether the
        # interpreter uses Python 2 or Python 3 division semantics.
        chunk_id[i] = rid[i] // 31
        literal[i] = (literal[i] | 1 << 31)  # the left bit set to 1
        off_set = 30 - rid[i] % 31
        literal[i] = (literal[i] | 1 << off_set)
def getIdx_gpu(fill_word, reduced_literal, index, compact_flag, length):
    """Write ``fill_word[i]`` / ``reduced_literal[i]`` to consecutive ``index`` slots.

    Slot ``2*i`` gets the fill word, slot ``2*i+1`` the literal.  If the fill
    word is falsy, ``compact_flag[2*i]`` is set to 0.  Threads with
    ``i >= length`` do nothing.
    """
    i = cuda.grid(1)
    if i >= length:
        return
    slot = i*2
    index[slot] = fill_word[i]
    index[slot+1] = reduced_literal[i]
    if not fill_word[i]:
        compact_flag[slot] = 0
def const_m(out, const):
    """Set every in-range element ``out[i, j]`` to ``const``."""
    row, col = cuda.grid(2)
    if row >= out.shape[0] or col >= out.shape[1]:
        return
    out[row, col] = const
def exp_m(a, out):
    """Element-wise exponential: ``out[i, j] = exp(a[i, j])``.

    Bounds are taken from ``out.shape``.
    """
    row, col = cuda.grid(2)
    if row >= out.shape[0] or col >= out.shape[1]:
        return
    out[row, col] = exp(a[row, col])
def d_ifog_activate(i,f,o,g):
    """Apply activations in place to the four arrays.

    ``i``, ``f`` and ``o`` go through ``d_sigmoid``; ``g`` goes through
    ``math.tanh``.  Bounds are taken from ``i.shape``.
    """
    row, col = cuda.grid(2)
    if row >= i.shape[0] or col >= i.shape[1]:
        return
    i[row, col] = d_sigmoid(i[row, col])
    f[row, col] = d_sigmoid(f[row, col])
    o[row, col] = d_sigmoid(o[row, col])
    g[row, col] = math.tanh(g[row, col])
def d_ifog_build(ifog,i,f,o,g):
    """Pack ``i``, ``f``, ``o``, ``g`` side by side along axis 1 of ``ifog``.

    With ``w = i.shape[1]`` the element ``[x, y]`` of each input lands in
    column ``y``, ``y + w``, ``y + 2w`` and ``y + 3w`` respectively.
    """
    row, col = cuda.grid(2)
    width = i.shape[1]
    if row < i.shape[0] and col < width:
        ifog[row, col] = i[row, col]
        ifog[row, col + width] = f[row, col]
        ifog[row, col + (width * 2)] = o[row, col]
        ifog[row, col + (width * 3)] = g[row, col]
def kernel(dst, src):
    '''A simple kernel that writes ``src[i] + 1`` into ``dst[i]``.

    Threads whose index is outside ``dst.shape[0]`` do nothing.
    '''
    pos = cuda.grid(1)
    if pos < dst.shape[0]:
        dst[pos] = src[pos] + 1
def m_mn_sadd_pointwise(a,b,alpha,beta,out):
    """Scaled add with column 0 of ``b`` reused across columns.

    ``out[i, j] = alpha * a[i, j] + beta * b[i, 0]`` for coordinates inside
    ``a.shape``.
    """
    row, col = cuda.grid(2)
    if row >= a.shape[0] or col >= a.shape[1]:
        return
    out[row, col] = alpha*a[row, col]+beta*b[row, 0]
def stateUpdate(currentGrid, nextGrid, cluster, clusterSize, nClust, clusterOnes, enterP, activateP, choiceP, diffuseP, xis, eta):
    """Compute ``nextGrid[x, y]`` by calling ``cellUpdate`` for one cell.

    The 2-D coordinates come from ``cuda.grid(2)`` and an additional index
    from ``cuda.grid(1)``; both are forwarded to ``cellUpdate`` together with
    the fixed settings below.  Cells outside ``currentGrid.shape`` are skipped.
    """
    cell_x, cell_y = cuda.grid(2)
    linear = cuda.grid(1)
    width, height = currentGrid.shape

    # settings
    pe = 0.0001
    pd = 0.05
    ph = 0.0485/1.5
    A = 1.8
    a = 2*A
    h = 0

    if cell_x >= width or cell_y >= height:
        return
    nextGrid[cell_x, cell_y] = cellUpdate(
        currentGrid, cluster, clusterSize, nClust, clusterOnes,
        cell_x, cell_y, linear, pe, pd, ph, A, a, h,
        enterP, activateP, choiceP, diffuseP, xis, eta)
def reduce_by_key_gpu(literal, flag, is_finish, hop, length):
    """OR ``literal[i + hop]`` into ``literal[i]`` unless the pair is blocked.

    The merge is skipped -- and ``is_finish[i]`` set to 1 -- when
    ``is_finish[i]`` is already set or ``flag[i + hop]`` is set.  Only
    threads with ``i < length - hop`` take part.
    """
    i = cuda.grid(1)
    if i >= length - hop:
        return
    blocked = is_finish[i] or flag[i + hop]
    if blocked:
        is_finish[i] = 1
    else:
        literal[i] |= literal[i + hop]
def m_mn_add_pointwise(a,b,out):
    """Add column 0 of ``b`` to every column of ``a``.

    ``out[i, j] = a[i, j] + b[i, 0]`` for coordinates inside ``a.shape``.
    """
    row, col = cuda.grid(2)
    if row >= a.shape[0] or col >= a.shape[1]:
        return
    out[row, col] = a[row, col]+b[row, 0]
def tanh_m(a, out):
    """Element-wise hyperbolic tangent: ``out[i, j] = tanh(a[i, j])``.

    Bounds are taken from ``out.shape``.
    """
    row, col = cuda.grid(2)
    if row >= out.shape[0] or col >= out.shape[1]:
        return
    out[row, col] = tanh(a[row, col])
def d_mexpsum(a,b):
    """Per-block reduction of ``exp(a[x, :])`` along axis 1 into ``b[x, blockIdx.y]``.

    Each block reduces the ``total`` columns it covers: the first step adds
    pairs ``exp(a[x, y]) + exp(a[x, y+s])`` into shared memory, later steps
    keep halving ``s`` and adding ``sA[xidx, yidx+s]`` into ``sA[xidx, yidx]``.
    The ``elif`` branches fold a leftover element into column 0.  Thread
    ``yidx == 0`` writes the block result.

    NOTE(review): the original source had lost its indentation; the nesting
    below is a reconstruction -- confirm against the upstream file.
    NOTE(review): ``cuda.syncthreads()`` inside the ``elif`` branches is only
    reached by some threads of the block -- verify this is safe.
    NOTE(review): ``s = total/2`` and ``s = s/2`` rely on integer division
    (Python 2 semantics); under true division ``s`` becomes a float and is
    then used in index arithmetic -- confirm the target interpreter.
    """
    # Shared scratch area; thread-block dimensions must fit inside (100, 100).
    sA = cuda.shared.array(shape=(100,100),dtype=float32)
    xidx,yidx = cuda.threadIdx.x,cuda.threadIdx.y
    x,y = cuda.grid(2)
    # Number of valid columns handled by this block (the last block may be short).
    total = min(cuda.blockDim.y, a.shape[1] - (cuda.blockIdx.y*cuda.blockDim.y))
    s = total/2
    if yidx<s:
        sA[xidx,yidx] = math.exp(a[x,y]) + math.exp(a[x,y+s])
    elif yidx+s==total-1:
        cuda.syncthreads()
        sA[xidx,0] += math.exp(a[x,y+s])
    cuda.syncthreads()
    last_s = s
    s = s/2
    while s>0:
        if yidx < s:
            sA[xidx,yidx] += sA[xidx,yidx+s]
        elif yidx+s==last_s-1:
            cuda.syncthreads()
            sA[xidx,0] += sA[xidx,yidx+s]
        cuda.syncthreads()
        last_s = s
        s=s/2
    cuda.syncthreads()
    if yidx == 0:
        b[x,cuda.blockIdx.y] = sA[xidx,yidx]
def _gaussian_cuda32(fac, fac_Biman, n_rep, t, n_t, a_facGo, b_facGo, c_facGo):
    """Add a Gaussian term to ``fac_Biman`` and store the sum in ``fac``.

    ``fac[i, j] = fac_Biman[i, j]
                  + a_facGo[i] * exp(-(t[j] - b_facGo[i])**2 / (2 * c_facGo[i]**2))``

    Threads with ``i >= n_rep`` or ``j >= n_t`` do nothing.
    """
    i, j = cuda.grid(2)
    if i < n_rep and j < n_t:
        # Fill in the 2D fac data structure: the two curves are added
        # together to give the new fac curve.
        offset = t[j] - b_facGo[i]
        bump = a_facGo[i] * exp(-offset**2 / (2 * c_facGo[i]**2))
        fac[i, j] = fac_Biman[i, j] + bump
def log_m(a, out):
    """Element-wise logarithm: ``out[i, j] = log(a[i, j])``.

    Bounds are taken from ``out.shape``.
    """
    row, col = cuda.grid(2)
    if row >= out.shape[0] or col >= out.shape[1]:
        return
    out[row, col] = log(a[row, col])
def d_mclip(a):
    """Clamp every element of ``a`` in place to the interval ``[-0.9, 0.9]``."""
    row, col = cuda.grid(2)
    if row >= a.shape[0] or col >= a.shape[1]:
        return
    if a[row, col] > 0.9:
        a[row, col] = 0.9
    elif a[row, col] < -0.9:
        a[row, col] = -0.9
def d_msum(a,b):
    """Per-block sum of ``a[x, :]`` along axis 1, written to ``b[x, blockIdx.y]``.

    The first step adds pairs ``a[x, y] + a[x, y+s]`` into shared memory;
    subsequent steps halve ``s`` and add ``sA[xidx, yidx+s]`` into
    ``sA[xidx, yidx]``.  The ``yidx == ... - 1 and not yidx < s`` branches
    fold a leftover element into column 0.  Thread ``yidx == 0`` writes the
    block result.

    NOTE(review): the original source had lost its indentation; the nesting
    below is a reconstruction -- confirm against the upstream file.
    NOTE(review): ``last_s`` is assigned once before the loop and not updated
    inside it (unlike the sibling ``d_mexpsum``) -- verify this is intended.
    NOTE(review): ``s = total/2`` and ``s=s/2`` rely on integer division
    (Python 2 semantics) -- confirm the target interpreter.
    """
    # Shared scratch area; thread-block dimensions must fit inside (32, 32).
    sA = cuda.shared.array(shape=(32,32),dtype=float32)
    xidx,yidx = cuda.threadIdx.x,cuda.threadIdx.y
    x,y = cuda.grid(2)
    # Number of valid columns handled by this block (the last block may be short).
    total = min(cuda.blockDim.y, a.shape[1] - (cuda.blockIdx.y*cuda.blockDim.y))
    s = total/2
    if y+s < a.shape[1]:
        if yidx<s:
            sA[xidx,yidx] = a[x,y] + a[x,y+s]
    cuda.syncthreads()
    if yidx == total-1 and not yidx < s:
        sA[xidx,0] += a[x,y+s]
    cuda.syncthreads()
    last_s = s
    s=s/2
    while s>0:
        if yidx<s:
            sA[xidx,yidx] += sA[xidx,yidx+s]
        cuda.syncthreads()
        if yidx == last_s-1 and not yidx < s:
            sA[xidx,0] += sA[xidx,yidx+s]
        cuda.syncthreads()
        s=s/2
    if yidx==0:
        b[x,cuda.blockIdx.y] = sA[xidx,yidx]
def vec_add_ilp_x8(a, b, c):
    """Apply ``core`` to eight element pairs of ``a`` and ``b`` per thread.

    The eight indices start at ``cuda.grid(1)`` and are spaced by
    ``gridDim.x * blockDim.x``.  All loads are issued first, then all
    ``core`` calls, then all stores into ``c``.  No bounds checks are made.
    """
    # read
    i = cuda.grid(1)
    ai = a[i]
    bi = b[i]

    bw = cuda.blockDim.x
    gw = cuda.gridDim.x
    stride = gw * bw

    j = i + stride
    aj = a[j]
    bj = b[j]

    k = j + stride
    ak = a[k]
    bk = b[k]

    l = k + stride
    al = a[l]
    bl = b[l]

    m = l + stride
    am = a[m]
    bm = b[m]

    n = m + stride
    an = a[n]
    bn = b[n]

    p = n + stride
    ap = a[p]
    bp = b[p]

    # The eighth index continues from p.  Deriving it from n (as the code
    # did before) made q equal to p, so one element was processed twice
    # and the eighth was never touched.
    q = p + stride
    aq = a[q]
    bq = b[q]

    # compute
    ci = core(ai, bi)
    cj = core(aj, bj)
    ck = core(ak, bk)
    cl = core(al, bl)
    cm = core(am, bm)
    cn = core(an, bn)
    cp = core(ap, bp)
    cq = core(aq, bq)

    # write
    c[i] = ci
    c[j] = cj
    c[k] = ck
    c[l] = cl
    c[m] = cm
    c[n] = cn
    c[p] = cp
    c[q] = cq
def downsweep_phase(zero_list, one_list, hop, base):
    """One step of an in-place swap-and-add pass over both lists.

    Threads whose index ``i`` satisfies ``i % (2*hop) == 2*hop - 1`` move the
    value at ``i`` to ``i - hop`` and store the sum of the two old values at
    ``i``, for ``zero_list`` and ``one_list`` alike.  When ``hop == 1``,
    ``base`` is added to ``one_list[i]`` after a block-level barrier.

    NOTE(review): the original source had lost its indentation; the barrier
    and the ``hop == 1`` step are assumed to sit outside the first ``if`` so
    that every thread executes them -- confirm against the upstream file.
    NOTE(review): no bounds check on ``i`` -- presumably the launch size
    matches the list length; verify against the caller.
    """
    i = cuda.grid(1)
    if i%(2*hop) == (2*hop-1):
        # Tuple assignment: both right-hand sides are read before either store.
        zero_list[i-hop], zero_list[i] = zero_list[i], zero_list[i-hop]+zero_list[i]
        one_list[i-hop], one_list[i] = one_list[i], one_list[i-hop]+one_list[i]
    cuda.syncthreads()
    if hop==1:
        one_list[i] += base
def binit(dictionary, stimuli, b):
    """Accumulate ``stimuli[i, :] . dictionary[j, :]`` into ``b[i, j]``.

    ``b`` is added to, not overwritten, so it must be initialised by the
    caller.  Threads whose ``cuda.grid(2)`` coordinates fall outside
    ``(stimuli.shape[0], dictionary.shape[0])`` do nothing.
    """
    n = stimuli.shape[0]
    m = dictionary.shape[0]
    k = dictionary.shape[1]
    i, j = cuda.grid(2)
    # Guard against surplus threads when the launch grid is larger than the
    # output; without it they would index past the ends of the arrays.
    if i < n and j < m:
        for r in range(k):
            b[i, j] += stimuli[i, r] * dictionary[j, r]
def d_distances(a,b,val):
    """Store in ``val[0, x]`` the square root of the summed ``d_dist`` terms.

    For row ``x`` of ``b`` the terms ``d_dist(a[0, y], b[x, y])`` are summed
    over all columns ``y``.  Threads with ``x >= b.shape[0]`` do nothing.
    """
    row = cuda.grid(1)
    if row >= b.shape[0]:
        return
    total = 0.
    for col in range(b.shape[1]):
        total += d_dist(a[0, col], b[row, col])
    val[0, row] = math.sqrt(total)
def get_list(arr, length, iter_order, zero_list, one_list):
    """Split bit ``iter_order`` of ``arr[i]`` into two indicator lists.

    For ``i < length``: ``one_list[i]`` is the selected bit and
    ``zero_list[i]`` is ``1 - one_list[i]``.  For ``i >= length`` both
    entries are set to 0.
    """
    i = cuda.grid(1)
    if i >= length:
        one_list[i] = 0
        zero_list[i] = 0
        return
    one_list[i] = (arr[i] >> iter_order) % 2
    zero_list[i] = 1 - one_list[i]
def cuda_match(a, b, c):
    """Set ``c[i]`` to the first element of ``b`` equal to ``a[i]``, else 0.

    ``b`` is scanned from the start; every non-matching element resets
    ``c[i]`` to 0 and the scan stops at the first match.
    """
    i = cuda.grid(1)
    for j in range(len(b)):
        if a[i] != b[j]:
            c[i] = 0
            continue
        c[i] = b[j]
        break
def cinit(dictionary, c):
    """Accumulate ``dictionary[i, :] . dictionary[j, :]`` into ``c[i, j]`` for ``i != j``.

    Diagonal entries are left untouched.  ``c`` is added to, not overwritten,
    so it must be initialised by the caller.  Threads whose ``cuda.grid(2)``
    coordinates are not both below ``dictionary.shape[0]`` do nothing.
    """
    n = dictionary.shape[0]
    m = dictionary.shape[1]
    i, j = cuda.grid(2)
    # Both indices select rows of ``dictionary``; guard against surplus
    # threads when the launch grid is larger than n x n.
    if i < n and j < n and i != j:
        for k in range(m):
            c[i, j] += dictionary[i, k] * dictionary[j, k]
def produce_flag(input_data, chunk_id, length, flag):
    """Mark positions where a new ``(input_data, chunk_id)`` run begins.

    ``flag`` is expected to be initialised to 0 by the caller; ``flag[i]`` is
    set to 1 when ``i == 0`` or when either key differs from the previous
    position.  Threads with ``i >= length`` do nothing.
    """
    i = cuda.grid(1)
    if i >= length:
        return
    if i == 0:
        flag[i] = 1
    elif input_data[i] != input_data[i - 1] or chunk_id[i] != chunk_id[i - 1]:
        flag[i] = 1
def get_reduced(literal, start_pos, reduced_length, reduced_literal, input_data, chunk_id, reduced_input_data, reduced_chunk_id):
    """Collapse segment ``i`` of ``literal`` into one OR-ed word.

    The segment is ``literal[start_pos[i]:start_pos[i + 1]]``; its words are
    OR-ed into ``reduced_literal[i]``.  The ``input_data`` and ``chunk_id``
    values at ``start_pos[i]`` are copied to the reduced arrays.  Threads
    with index ``>= reduced_length`` do nothing.
    """
    seg = cuda.grid(1)
    if seg >= reduced_length:
        return
    for word in literal[start_pos[seg]:start_pos[seg + 1]]:
        reduced_literal[seg] |= word
    reduced_input_data[seg] = input_data[start_pos[seg]]
    reduced_chunk_id[seg] = chunk_id[start_pos[seg]]
def array_adjust(arr, d_arr,rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length):
    """Scatter ``d_arr[i]`` / ``d_rid[i]`` to the slot chosen by the bit lists.

    The destination is ``d_zero_list[i]`` when ``zero_list[i] == 1`` and
    ``d_one_list[i]`` otherwise.  Threads with ``i >= length`` do nothing.
    """
    i = cuda.grid(1)
    if i >= length:
        return
    if zero_list[i] != 1:
        arr[d_one_list[i]] = d_arr[i]
        rid[d_one_list[i]] = d_rid[i]
    else:
        arr[d_zero_list[i]] = d_arr[i]
        rid[d_zero_list[i]] = d_rid[i]
def reduce_phase(zero_list, one_list, hop, thread_num):
    """One in-place pairwise-add step over both lists.

    Threads with ``i % (2*hop) == 2*hop - 1`` add the element ``hop``
    positions earlier to their own.  On the step where
    ``hop == thread_num/2`` the thread ``i == thread_num - 1`` then resets its
    entry in both lists to 0.
    """
    i = cuda.grid(1)
    span = 2*hop
    if i%span == (span-1):
        zero_list[i] += zero_list[i-hop]
        one_list[i] += one_list[i-hop]
    if hop == thread_num/2 and i == thread_num-1:
        one_list[i] = 0
        zero_list[i] = 0
def saxpy(a, x, y, out):
    """Compute ``out[i] = a * x[i] + y[i]`` for each thread index inside ``out``."""
    # cuda.grid(1) is short for threadIdx.x + blockIdx.x * blockDim.x
    pos = cuda.grid(1)
    # Only indices that map to an array element do any work.
    if pos < out.size:
        out[pos] = a * x[pos] + y[pos]
def division(a, b, c, size):
    """
    Kernel computing the element-wise ratio of two device arrays.

    :param a: (device array) numerators
    :param b: (device array) denominators
    :param c: (device array) receives the values of a/b
    :param size: (int) size of device arrays
    """
    pos = cuda.grid(1)
    if pos >= size:
        return
    c[pos] = a[pos] / b[pos]
def batch_norm(A, aInorm, K):
    """Store in ``aInorm[tid]`` the Euclidean norm of the first ``K`` entries of column ``tid``.

    Threads with ``tid >= A.shape[1]`` do nothing.
    """
    tid = cuda.grid(1)
    if tid < A.shape[1]:
        squares = float_type(0.0)
        for row in range(K):
            entry = A[row, tid]
            squares += entry * entry
        aInorm[tid] = math.sqrt(squares)
def gabs(x, y):
    """Write the absolute value of ``x[i]`` into ``y[i]``.

    Threads whose index is outside either array do nothing.
    """
    i = cuda.grid(1)
    if i < x.size and i < y.size:
        if x[i] >= 0:
            y[i] = x[i]
        else:
            y[i] = -x[i]
    return
def norm_random_nums(C, d):
    """Divide the first ``d`` entries of column ``i`` of ``C`` by their Euclidean norm.

    The column is modified in place.  Threads with ``i >= C.shape[1]`` do
    nothing.
    """
    i = cuda.grid(1)
    if i < C.shape[1]:
        column = C[:, i]
        squares = float_type(0.0)
        for row in range(d):
            entry = column[row]
            squares += entry * entry
        length = math.sqrt(squares)
        for row in range(d):
            column[row] /= length
def induced_velocity3(x, xvort, gam, vel):
    """Accumulate into ``vel[i]`` the velocity induced at ``x[i]`` by every vortex.

    ``vel[i, :]`` is first zeroed, then for each vortex ``j``:

        rsq        = (x[i,0] - xvort[j,0])**2 + (x[i,1] - xvort[j,1])**2 + eps**2
        vel[i, 0] +=  gam[j] * (x[i,1] - xvort[j,1]) / rsq
        vel[i, 1] += -gam[j] * (x[i,0] - xvort[j,0]) / rsq

    ``eps`` is read from the enclosing scope.  Threads with
    ``i >= x.shape[0]`` do nothing.
    """
    i = cuda.grid(1)
    if i >= x.shape[0]:
        return
    vel[i, 0] = float32(0.)
    vel[i, 1] = float32(0.)
    for j in range(xvort.shape[0]):
        rsq = (x[i, 0] - xvort[j, 0])**2 + (x[i, 1] - xvort[j, 1])**2 + eps**2
        vel[i, 0] += gam[j] * (x[i, 1] - xvort[j, 1]) / rsq
        vel[i, 1] += -gam[j] * (x[i, 0] - xvort[j, 0]) / rsq
def maxCoefsABS(curCoefs, coefs, coefsd, winners, k):
    """Find the largest-magnitude entry of row ``i`` of ``curCoefs`` and record it.

    The column of the first strictly largest ``|curCoefs[i, jj]|`` is stored
    in ``winners[k, i]``, and the coefficient itself is copied to the same
    position of ``coefs`` and ``coefsd``.

    The running maximum starts at -1. and the location at ``-i``.
    NOTE(review): ``-i`` as the initial location looks like it may have been
    meant as ``-1`` -- confirm.
    """
    i = cuda.grid(1)
    best_val = -1.
    best_col = -i
    for jj in xrange(curCoefs.shape[1]):
        magnitude = math.fabs(curCoefs[i, jj])
        if magnitude > best_val:
            best_val = magnitude
            best_col = jj
    winners[k, i] = best_col
    coefs[i, best_col] = curCoefs[i, best_col]
    coefsd[i, best_col] = curCoefs[i, best_col]
def cu_monte_carlo_pricer(paths, dt, c0, c1, normdist, seed):
    """Advance path ``i`` through every time step using table-driven noise.

    Starting from ``seed[i]``, each step picks ``normdist[|randnum % len|]``
    as the noise, sets
    ``paths[i, j] = paths[i, j - 1] * exp(c0 * dt + c1 * noise)`` and updates
    the generator state with ``randnum * A + C`` (``A`` and ``C`` come from
    the enclosing scope).  Threads with ``i >= paths.shape[0]`` do nothing.
    """
    # cuda.grid(1) is short for threadIdx.x + blockIdx.x * blockDim.x
    i = cuda.grid(1)
    if i < paths.shape[0]:
        state = seed[i]
        for step in range(1, paths.shape[1]):  # for each time step
            slot = state % normdist.shape[0]
            if slot < 0:
                slot = -slot
            paths[i, step] = paths[i, step - 1] * np.exp(c0 * dt + c1 * normdist[slot])
            # generate next random number
            state = state * A + C