def or_reduction(literal, tmp_out, length): bw = cuda.blockDim.x bx = cuda.blockIdx.x tid = cuda.threadIdx.x shared_list = cuda.shared.array(shape = (tpb), dtype = uint32) i = bx*bw + tid shared_list[tid] = 0x00000000 if i<length: shared_list[tid] = literal[i] cuda.syncthreads() hop = bw/2 while hop > 0: if tid < hop: shared_list[tid] = shared_list[tid] | shared_list[tid+hop] #if i <length: # print shared_list[tid] cuda.syncthreads() hop /= 2 if tid == 0: tmp_out[bx] = shared_list[0] print tmp_out[0]
def histogramGPU(input_d, bins_d, num_elements): private_bin = cuda.shared.array(SM_MAX_SIZE, int32) tx = cuda.threadIdx.x index = tx + cuda.blockDim.x * cuda.blockIdx.x stride = cuda.blockDim.x * cuda.gridDim.x location_x = 0 for i in range(0, ceil(SM_MAX_SIZE / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE if location_x < MAX_UNIQUE_ITEMS and location_x < SM_MAX_SIZE: private_bin[location_x] = 0 cuda.syncthreads() element = 0 while index < num_elements: element = input_d[index] if element < SM_MAX_SIZE: cuda.atomic.add(private_bin, element, 1) else: cuda.atomic.add(bins_d, element, 1) index += stride cuda.syncthreads() for i in range(0, ceil(SM_MAX_SIZE / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE if location_x < MAX_UNIQUE_ITEMS and location_x < SM_MAX_SIZE: cuda.atomic.add(bins_d, location_x, private_bin[location_x])
def makeFlistGPU(d_offsets, d_transactions, d_flist, num_transactions, all_items_in_transactions): private_items = cuda.shared.array(MAX_SM_ITEMS, uint32) tx = cuda.threadIdx.x index = tx + cuda.blockDim.x * cuda.blockIdx.x location_x = 0 for i in range(0, ceil(MAX_SM_ITEMS / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE if location_x < MAX_SM_ITEMS: private_items[location_x] = 0 cuda.syncthreads() item_ends = 0 if index == (num_transactions - 1): item_ends = all_items_in_transactions elif index < (num_transactions - 1): item_ends = d_offsets[index + 1] else: item_ends = 0 for i in range(d_offsets[index], item_ends): if d_transactions[i] >= 0 and d_transactions[i] < MAX_SM_ITEMS: cuda.atomic.add(private_items, d_transactions[i], 1) elif d_transactions[i] >= MAX_SM_ITEMS and d_transactions[ i] < MAX_UNIQUE_ITEMS: cuda.atomic.add(d_flist, d_transactions[i], 1) cuda.syncthreads() for i in range(0, ceil(MAX_SM_ITEMS / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE if location_x < MAX_SM_ITEMS: cuda.atomic.add(d_flist, location_x, private_items[location_x])
def induced_velocity4(x, xvort, gam, vel): smem = cuda.shared.array((blksize, 3), dtype=f8) t = cuda.threadIdx.x i = cuda.grid(1) # eps = 1.e-2 nvort = xvort.shape[0] nx = x.shape[0] if i < nx: x0 = x[i, 0] x1 = x[i, 1] xvel = 0 yvel = 0 nvort = xvort.shape[0] for blk in range((nvort - 1) // blksize + 1): # load vortex positions and strengths into shared memory j = blk * blksize + t if j < nvort: smem[t, 0] = xvort[j, 0] smem[t, 1] = xvort[j, 1] smem[t, 2] = gam[j] else: smem[t, 0] = 0 smem[t, 1] = 0 smem[t, 2] = 0 cuda.syncthreads() # compute the contributions to the velocity for k in range(blksize): rsq = (x0 - smem[k, 0])**2 + (x1 - smem[k, 1])**2 + eps**2 xvel += smem[k, 2] * (x1 - smem[k, 1]) / rsq yvel += -smem[k, 2] * (x0 - smem[k, 0]) / rsq cuda.syncthreads() if i < nx: vel[i, 0] = xvel vel[i, 1] = yvel
def cu_square_matrix_mul(A, B, C): sA = cuda.shared.array(shape=(tpb, tpb), dtype=f4) sB = cuda.shared.array(shape=(tpb, tpb), dtype=f4) tx = cuda.threadIdx.x ty = cuda.threadIdx.y bx = cuda.blockIdx.x by = cuda.blockIdx.y bw = cuda.blockDim.x bh = cuda.blockDim.y x = tx + bx * bw y = ty + by * bh acc = 0. for i in range(bpg): if x < n and y < n: sA[ty, tx] = A[y, tx + i * tpb] sB[ty, tx] = B[ty + i * tpb, x] cuda.syncthreads() if x < n and y < n: for j in range(tpb): acc += sA[ty, j] * sB[j, tx] cuda.syncthreads() if x < n and y < n: C[y, x] = acc
def d_mexpsum(a,b): sA = cuda.shared.array(shape=(100,100),dtype=float32) xidx,yidx = cuda.threadIdx.x,cuda.threadIdx.y x,y = cuda.grid(2) total = min(cuda.blockDim.y, a.shape[1] - (cuda.blockIdx.y*cuda.blockDim.y)) s = total/2 if yidx<s: sA[xidx,yidx] = math.exp(a[x,y]) + math.exp(a[x,y+s]) elif yidx+s==total-1: cuda.syncthreads() sA[xidx,0] += math.exp(a[x,y+s]) cuda.syncthreads() last_s = s s = s/2 while s>0: if yidx < s: sA[xidx,yidx] += sA[xidx,yidx+s] elif yidx+s==last_s-1: cuda.syncthreads() sA[xidx,0] += sA[xidx,yidx+s] cuda.syncthreads() last_s = s s=s/2 cuda.syncthreads() if yidx == 0: b[x,cuda.blockIdx.y] = sA[xidx,yidx]
def cu_compute_error(X, Y, Ex, Ey, theta_x, theta_y): # Compute error for each element and store in the shared-memory Exsm = cuda.shared.array((1024, ), dtype=f8) Eysm = cuda.shared.array((1024, ), dtype=f8) tid = cuda.threadIdx.x base = cuda.blockIdx.x * cuda.blockDim.x i = base + tid x = X[i] y = Y[i] predict = theta_x + theta_y * x Exsm[tid] = predict - y Eysm[tid] = (predict - y) * x # Sum-reduce errors in the shared-memory n = cuda.blockDim.x while n > 1: cuda.syncthreads() half = n // 2 if tid < half: Exsm[tid] += Exsm[tid + half] Eysm[tid] += Eysm[tid + half] n = half if tid == 0: # First of a block? # Store result Ex[cuda.blockIdx.x] = Exsm[0] Ey[cuda.blockIdx.x] = Eysm[0]
def cu_compute_error(X, Y, Ex, Ey, theta_x, theta_y): # Compute error for each element and store in the shared-memory Exsm = cuda.shared.array((1024,), dtype=f8) Eysm = cuda.shared.array((1024,), dtype=f8) tid = cuda.threadIdx.x base = cuda.blockIdx.x * cuda.blockDim.x i = base + tid x = X[i] y = Y[i] predict = theta_x + theta_y * x Exsm[tid] = predict - y Eysm[tid] = (predict - y) * x # Sum-reduce errors in the shared-memory n = cuda.blockDim.x while n > 1: cuda.syncthreads() half = n // 2 if tid < half: Exsm[tid] += Exsm[tid + half] Eysm[tid] += Eysm[tid + half] n = half if tid == 0: # First of a block? # Store result Ex[cuda.blockIdx.x] = Exsm[0] Ey[cuda.blockIdx.x] = Eysm[0]
def jocabi_relax_core(A, Anew, error): smem = cuda.shared.array(shape=(32 + 2, 32 + 2), dtype=f4) n = A.shape[0] m = A.shape[1] tx = cuda.threadIdx.x ty = cuda.threadIdx.y j = ty + cuda.blockIdx.y * cuda.blockDim.y i = tx + cuda.blockIdx.x * cuda.blockDim.x sy = ty + 1 sx = tx + 1 smem[sy, sx] = A[j, i] if tx == 0 and i >= 1: smem[sy, 0] = A[j, i - 1] if ty == 0 and j < m - 1: smem[0, sx] = A[j - 1, i] if tx == 31 and j >= 1: smem[sy, 33] = A[j, i + 1] if ty == 31 and j < n - 1: smem[33, sx] = A[j + 1, i] cuda.syncthreads() # ensure smem is visible by all threads in the block if j >= 1 and j < n - 1 and i >= 1 and i < m - 1: Anew[j, i] = 0.25 * ( smem[sy, sx + 1] + smem[sy, sx - 1] \ + smem[sy - 1, sx] + smem[sy + 1, sx]) error[j, i] = Anew[j, i] - A[j, i]
def makeFlistGPU(d_offsets, d_transactions, d_flist, num_transactions, all_items_in_transactions): private_items = cuda.shared.array(MAX_SM_ITEMS, uint32) tx = cuda.threadIdx.x index = tx + cuda.blockDim.x * cuda.blockIdx.x location_x = 0 for i in range(0, ceil(MAX_SM_ITEMS / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE if location_x < MAX_SM_ITEMS: private_items[location_x] = 0 cuda.syncthreads() item_ends = 0 if index == (num_transactions - 1): item_ends = all_items_in_transactions elif index < (num_transactions - 1): item_ends = d_offsets[index + 1] else: item_ends = 0 for i in range(d_offsets[index], item_ends): if d_transactions[i] >= 0 and d_transactions[i] < MAX_SM_ITEMS: cuda.atomic.add(private_items, d_transactions[i], 1) elif d_transactions[i] >= MAX_SM_ITEMS and d_transactions[i] < MAX_UNIQUE_ITEMS: cuda.atomic.add(d_flist, d_transactions[i], 1) cuda.syncthreads() for i in range(0, ceil(MAX_SM_ITEMS / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE if location_x < MAX_SM_ITEMS: cuda.atomic.add(d_flist, location_x, private_items[location_x])
def cu_matmul_sm(A, B, C, n, tpb, bpg): # decalre shared memory sA = cuda.shared.array(shape=block_dim, dtype=float32) sB = cuda.shared.array(shape=block_dim, dtype=float32) # we now need the thread ID within a block as well as the global thread ID tx = cuda.threadIdx.x ty = cuda.threadIdx.y x, y = cuda.grid(2) # pefort partial operations in block-szied tiles # saving intermediate values in an accumulator variable acc = 0.0 for i in range(bpg): # Stage 1: Prefil shared memory with current block from matrix A and matrix B sA[tx, ty] = A[x, ty + i * tpb] sB[tx, ty] = B[tx + i * tpb, y] # Block calculations till shared mmeory is filled cuda.syncthreads() # Stage 2: Compute partial dot product and add to accumulator if x < n and y < n: for j in range(tpb): acc += sA[tx, j] * sB[j, ty] # Blcok until all threads have completed calcuaiton before next loop iteration cuda.syncthreads() # Put accumulated dot product into output matrix if x < n and y < n: C[x, y] = acc
def or_reduction(literal, tmp_out, length): bw = cuda.blockDim.x bx = cuda.blockIdx.x tid = cuda.threadIdx.x shared_list = cuda.shared.array(shape=(tpb), dtype=uint32) i = bx * bw + tid shared_list[tid] = 0x00000000 if i < length: shared_list[tid] = literal[i] cuda.syncthreads() hop = bw / 2 while hop > 0: if tid < hop: shared_list[tid] = shared_list[tid] | shared_list[tid + hop] #if i <length: # print shared_list[tid] cuda.syncthreads() hop /= 2 if tid == 0: tmp_out[bx] = shared_list[0] print tmp_out[0]
def downsweep_phase(zero_list, one_list, hop, base): i = cuda.grid(1) if i%(2*hop) == (2*hop-1): zero_list[i-hop], zero_list[i] = zero_list[i], zero_list[i-hop]+zero_list[i] one_list[i-hop], one_list[i] = one_list[i], one_list[i-hop]+one_list[i] cuda.syncthreads() if hop==1: one_list[i] += base
def IndexDefineGPU(in_d, rev_bit_d, in_size, last_input): tx = cuda.threadIdx.x index = tx + cuda.blockIdx.x * cuda.blockDim.x total_falses = in_d[in_size - 1] + last_input cuda.syncthreads() if index < in_size: if rev_bit_d[index] == 0: in_d[index] = index + 1 - in_d[index] + total_falses
def findFrequencyGPU(d_transactions, d_offsets, num_transactions, num_elements, dkeyIndex, dMask, num_patterns): Ts = cuda.shared.array(SM_SHAPE, int32) tx = cuda.threadIdx.x index = tx + cuda.blockDim.x * cuda.blockIdx.x trans_index = cuda.blockIdx.x * MAX_TRANSACTIONS_PER_SM for i in range(0, MAX_TRANSACTIONS_PER_SM): if tx < MAX_ITEMS_PER_TRANSACTIONS: Ts[i, tx] = -1 cuda.syncthreads() for i in range(0, MAX_TRANSACTIONS_PER_SM): item_ends = num_elements if (trans_index + i + 1) == num_transactions: item_ends = num_elements elif (trans_index + i + 1) < num_transactions: item_ends = d_offsets[trans_index + i + 1] else: continue if (tx + d_offsets[trans_index + i]) < item_ends and tx < MAX_ITEMS_PER_TRANSACTIONS: Ts[i, tx] = d_transactions[d_offsets[trans_index + i] + tx] #d_transactions[d_offsets[trans_index + i] + tx] += 1 cuda.syncthreads() for mask_id in range(0, int(ceil(num_patterns / 1.0 * cuda.blockDim.x))): loop_tx = cuda.threadIdx.x + mask_id * cuda.blockDim.x if loop_tx >= num_patterns: continue for last_seen in range(0, num_patterns): if dMask[loop_tx * num_patterns + last_seen] < 0: #last_seen += 1 continue item1 = dkeyIndex[loop_tx] item2 = dkeyIndex[last_seen] for tid in range(0, MAX_TRANSACTIONS_PER_SM): flag1 = False flag2 = False for titem in range(0, MAX_ITEMS_PER_TRANSACTIONS): if Ts[tid, titem] == item1: flag1 = True elif Ts[tid, titem] == item2: flag2 = True present_flag = flag1 and flag2 if present_flag: cuda.atomic.add(dMask, loop_tx * num_patterns + last_seen, 1)
def d_msum(a,b): sA = cuda.shared.array(shape=(32,32),dtype=float32) xidx,yidx = cuda.threadIdx.x,cuda.threadIdx.y x,y = cuda.grid(2) total = min(cuda.blockDim.y, a.shape[1] - (cuda.blockIdx.y*cuda.blockDim.y)) s = total/2 if y+s < a.shape[1]: if yidx<s: sA[xidx,yidx] = a[x,y] + a[x,y+s] cuda.syncthreads() if yidx == total-1 and not yidx < s: sA[xidx,0] += a[x,y+s] cuda.syncthreads() last_s = s s=s/2 while s>0: if yidx<s: sA[xidx,yidx] += sA[xidx,yidx+s] cuda.syncthreads() if yidx == last_s-1 and not yidx < s: sA[xidx,0] += sA[xidx,yidx+s] cuda.syncthreads() s=s/2 if yidx==0: b[x,cuda.blockIdx.y] = sA[xidx,yidx]
def cuda_prefixsum_base2(masks, indices, init, nelem): """ Args ---- nelem: Must be power of 2. Note ---- Launch 2*nelem threads. Support 1 block/grid. """ sm = cuda.shared.array((1024,), dtype=numba.int64) tid = cuda.threadIdx.x # Preload if 2 * tid + 1 < nelem: sm[2 * tid] = masks[2 * tid] sm[2 * tid + 1] = masks[2 * tid + 1] # Up phase limit = nelem >> 1 step = 1 idx = tid * 2 two_d = 1 for d in range(3): offset = two_d - 1 if tid < limit: sm[offset + idx + step] += sm[offset + idx] limit >>= 1 idx <<= 1 step <<= 1 two_d <<= 1 cuda.syncthreads() # Down phase if tid == 0: sm[nelem - 1] = 0 cuda.syncthreads() # Writeback if 2 * tid + 1 < nelem: indices[2 * tid] = sm[2 * tid] indices[2 * tid + 1] = sm[2 * tid + 1]
def cuda_prefixsum_base2(masks, indices, init, nelem): """ Args ---- nelem: Must be power of 2. Note ---- Launch 2*nelem threads. Support 1 block/grid. """ sm = cuda.shared.array((1024, ), dtype=numba.int64) tid = cuda.threadIdx.x # Preload if 2 * tid + 1 < nelem: sm[2 * tid] = masks[2 * tid] sm[2 * tid + 1] = masks[2 * tid + 1] # Up phase limit = nelem >> 1 step = 1 idx = tid * 2 two_d = 1 for d in range(3): offset = two_d - 1 if tid < limit: sm[offset + idx + step] += sm[offset + idx] limit >>= 1 idx <<= 1 step <<= 1 two_d <<= 1 cuda.syncthreads() # Down phase if tid == 0: sm[nelem - 1] = 0 cuda.syncthreads() # Writeback if 2 * tid + 1 < nelem: indices[2 * tid] = sm[2 * tid] indices[2 * tid + 1] = sm[2 * tid + 1]
def sum_reduction(zero_list, tmp_out): bw = cuda.blockDim.x bx = cuda.blockIdx.x tid = cuda.threadIdx.x shared_list = cuda.shared.array(shape = (TPB_MAX), dtype = int64) i = bx*bw + tid shared_list[tid] = zero_list[i] cuda.syncthreads() hop = bw/2 while hop > 0: if tid < hop: shared_list[tid] = shared_list[tid] + shared_list[tid+hop] cuda.syncthreads() hop /= 2 if tid == 0: tmp_out[bx] = shared_list[0]
def Blowfish_encipherG(Text, s, p): sS = cuda.shared.array(shape=(1024), dtype=uint32) sP = cuda.shared.array(shape=(18), dtype=uint32) tid = cuda.threadIdx.x gid = cuda.grid(1) interval = 1024/tpb for i in range (tid*interval, (tid+1)*interval, 1): sS[i] = s[i] cuda.syncthreads() if tid<N+2: sP[tid] = p[tid] cuda.syncthreads() if gid*2+1<len(Text)/4 and tid<cuda.blockDim.x: xl = Text[gid*2] xr = Text[gid*2+1] for j in range (0, N, 1): xl = xl ^ sP[j] x = xl d = x & 0x00FF x >>= 8 c = x & 0x00FF x >>= 8 b = x & 0x00FF x >>= 8 a = x & 0x00FF y = sS[a] + sS[256+b] y = y ^ sS[512+c] y = y + sS[768+d] xr = y ^ xr temp = xl xl = xr xr = temp temp = xl xl = xr xr = temp xr = xr ^ sP[N] xl = xl ^ sP[N + 1] cuda.syncthreads() Text[gid*2] = xl Text[gid*2+1] = xr cuda.syncthreads()
def max_kernel(a, b): "Simple implementation of reduction kernel" # Allocate static shared memory of 256. # This limits the maximum block size to 256. sa = cuda.shared.array(shape=(256,), dtype=int32) tx = cuda.threadIdx.x bx = cuda.blockIdx.x bw = cuda.blockDim.x i = tx + bx * bw if i < a.shape[0]: sa[tx] = a[i] if tx == 0: # Uses the first thread of each block to perform the actual # reduction m = sa[tx] cuda.syncthreads() for j in range(1, bw): m = mymax(m, sa[j]) b[bx] = m
def max_kernel(a, b): "Simple implementation of reduction kernel" # Allocate static shared memory of 256. # This limits the maximum block size to 256. sa = cuda.shared.array(shape=(256, ), dtype=int32) tx = cuda.threadIdx.x bx = cuda.blockIdx.x bw = cuda.blockDim.x i = tx + bx * bw if i < a.shape[0]: sa[tx] = a[i] if tx == 0: # Uses the first thread of each block to perform the actual # reduction m = sa[tx] cuda.syncthreads() for j in range(1, bw): m = mymax(m, sa[j]) b[bx] = m
def exclusiveScanGPU(aux_d, out_d, in_d, size): private_shared_in = cuda.shared.array(SM_SIZE, uint32) start = 2 * cuda.blockDim.x * cuda.blockIdx.x tx = cuda.threadIdx.x index = tx + start ############### Put 2 values per each thread into shared memory ############## if index < size: private_shared_in[tx] = in_d[index] else: private_shared_in[tx] = 0.0 if (index + BLOCK_SIZE) < size: private_shared_in[tx + BLOCK_SIZE] = in_d[index + BLOCK_SIZE] else: private_shared_in[tx + BLOCK_SIZE] = 0.0 cuda.syncthreads() ########################### Do the first scan ############################## d = 1 while d <= BLOCK_SIZE: tk = 2 * d * (tx + 1) - 1 if tk < (2 * BLOCK_SIZE): private_shared_in[tk] += private_shared_in[tk - d] d *= 2 cuda.syncthreads() ############################ Do the second scan ############################# d = BLOCK_SIZE / 2 while d > 0: tk = 2 * d * (tx + 1) - 1 if (tk + d) < (2 * BLOCK_SIZE): private_shared_in[tk + d] += private_shared_in[tk] d /= 2 cuda.syncthreads() ############################################################################# index += 1 if index < size: out_d[index] = private_shared_in[tx] if (index + BLOCK_SIZE) < size and (tx + BLOCK_SIZE) != (2 * BLOCK_SIZE - 1): out_d[index + BLOCK_SIZE] = private_shared_in[tx + BLOCK_SIZE] cuda.syncthreads() aux_d[cuda.blockIdx.x] = private_shared_in[2 * BLOCK_SIZE - 1] out_d[start] = 0.0
def histogramGPU(input_d, bins_d, num_elements): private_bin = cuda.shared.array(BIN_SIZE, uint32) tx = cuda.threadIdx.x index = cuda.grid(1) #tx + cuda.blockDim.x * cuda.blockIdx.x location_x = 0 for i in range(0, ceil(BIN_SIZE / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE if location_x < BIN_SIZE: private_bin[location_x] = 0 cuda.syncthreads() if index < num_elements and input_d[index] < BIN_SIZE: cuda.atomic.add(private_bin, input_d[index], 1) #cuda.atomic.add(bins_d, input_d[index], 1) cuda.syncthreads() for i in range(0, ceil(BIN_SIZE / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE if location_x < BIN_SIZE: cuda.atomic.add(bins_d, location_x, private_bin[location_x])
def d_margmax(a,val,idx): x = cuda.grid(1) tidx = cuda.threadIdx.x bidx = cuda.blockIdx.x total = min(cuda.blockDim.x, a.shape[1] - (cuda.blockIdx.x*cuda.blockDim.x)) if (x < a.shape[1]): sVal = cuda.shared.array((1024),dtype=float32) sIdx = cuda.shared.array((1024),dtype=float32) s = total/2 if x+s < total: if a[0,x] > a[0,x+s]: sVal[tidx] = a[0,x] sIdx[tidx] = x else: sVal[tidx] = a[0,x+s] sIdx[tidx] = x+s cuda.syncthreads() if total%2 == 1: if tidx == total-1: if a[0,x] > a[0,x-tidx]: sVal[0] = a[0,x] sIdx[0] = x s = s/2 cuda.syncthreads() while s > 0: if tidx+s < total: if sVal[tidx] < sVal[tidx+s]: sVal[tidx] = sVal[tidx+s] sIdx[tidx] = sIdx[tidx+s] cuda.syncthreads() if s%2 == 1 and s > 1: if tidx == s-1: if sVal[0] < sVal[tidx]: sVal[0] = sVal[tidx] sIdx[0] = sIdx[tidx] s = s/2 cuda.syncthreads() if tidx == 0: val[0,bidx] = sVal[tidx] idx[0,bidx] = sIdx[tidx]
def jocabi_relax_core(A, Anew, error): err_sm = cuda.shared.array((tpb, tpb), dtype=f8) ty = cuda.threadIdx.x tx = cuda.threadIdx.y bx = cuda.blockIdx.x by = cuda.blockIdx.y n = A.shape[0] m = A.shape[1] i, j = cuda.grid(2) err_sm[ty, tx] = 0 if j >= 1 and j < n - 1 and i >= 1 and i < m - 1: Anew[j, i] = 0.25 * ( A[j, i + 1] + A[j, i - 1] \ + A[j - 1, i] + A[j + 1, i]) err_sm[ty, tx] = Anew[j, i] - A[j, i] cuda.syncthreads() # max-reduce err_sm vertically t = tpb // 2 while t > 0: if ty < t: err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty + t, tx]) t //= 2 cuda.syncthreads() # max-reduce err_sm horizontally t = tpb // 2 while t > 0: if tx < t and ty == 0: err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty, tx + t]) t //= 2 cuda.syncthreads() if tx == 0 and ty == 0: error[by, bx] = err_sm[0, 0]
def findHigherPatternFrequencyGPU(d_transactions, d_offsets, num_transactions, num_elements, dkeyIndex, dMask, num_patterns, api_d, iil_d, power, size_api_d, size_iil_d): Ts = cuda.shared.array(SM_SHAPE, int32) tx = cuda.threadIdx.x index = tx + cuda.blockDim.x * cuda.blockIdx.x trans_index = cuda.blockIdx.x * MAX_TRANSACTIONS_PER_SM for i in range(0, MAX_TRANSACTIONS_PER_SM): if tx < MAX_ITEMS_PER_TRANSACTIONS: Ts[i, tx] = -1 cuda.syncthreads() for i in range(0, MAX_TRANSACTIONS_PER_SM): item_ends = num_elements if (trans_index + i + 1) == num_transactions: item_ends = num_elements elif (trans_index + i + 1) < num_transactions: item_ends = d_offsets[trans_index + i + 1] else: continue if (tx + d_offsets[trans_index + i]) < item_ends and tx < MAX_ITEMS_PER_TRANSACTIONS: Ts[i, tx] = d_transactions[d_offsets[trans_index + i] + tx] #d_transactions[d_offsets[trans_index + i] + tx] += 1 cuda.syncthreads() for mask_id in range(0, int(ceil(num_patterns / (1.0 * BLOCK_SIZE)))): loop_tx = tx + mask_id * BLOCK_SIZE if loop_tx >= num_patterns: continue for last_seen in range(0, num_patterns): if dMask[loop_tx * num_patterns + last_seen] < 0: continue hp1 = dkeyIndex[loop_tx] hp2 = dkeyIndex[last_seen] vitem1 = hp1 % (10 ** power) vitem2 = hp2 % (10 ** power) if ((vitem1 - 1) * 3 + 1) < size_iil_d and ((vitem2 - 1) * 3 + 1) < size_iil_d: index_item1 = iil_d[(vitem1 - 1) * 3 + 1] index_item2 = iil_d[(vitem2 - 1) * 3 + 1] if index_item1 < size_api_d and index_item2 < size_api_d: item1 = api_d[index_item1] item2 = api_d[index_item2] else: continue else: continue vcommon_pattern = hp1 / (10 ** power) if ((vcommon_pattern - 1) * 3 + 1) < size_iil_d: index_vpat1 = iil_d[(vcommon_pattern - 1) * 3 + 1] if index_vpat1 < size_api_d: vpat1 = api_d[index_vpat1]#array of max patterns else: continue else: continue for tid in range(0, MAX_TRANSACTIONS_PER_SM): flag1 = False flag2 = False fpat1 = False for titem in range(0, MAX_ITEMS_PER_TRANSACTIONS): if Ts[tid, titem] == item1: flag1 = True elif Ts[tid, titem] == item2: flag2 = True elif Ts[tid, titem] == vpat1: fpat1 = True present_flag = flag1 and flag2 and fpat1 if present_flag: cuda.atomic.add(dMask, loop_tx * num_patterns + last_seen, 1)
def findHigherPatternFrequencyGPU(d_transactions, d_offsets, num_transactions, num_elements, dkeyIndex, dMask, num_patterns, api_d, iil_d, power, size_api_d, size_iil_d): Ts = cuda.shared.array(SM_SHAPE, int32) tx = cuda.threadIdx.x index = tx + cuda.blockDim.x * cuda.blockIdx.x trans_index = cuda.blockIdx.x * MAX_TRANSACTIONS_PER_SM for i in range(0, MAX_TRANSACTIONS_PER_SM): if tx < MAX_ITEMS_PER_TRANSACTIONS: Ts[i, tx] = -1 cuda.syncthreads() for i in range(0, MAX_TRANSACTIONS_PER_SM): item_ends = num_elements if (trans_index + i + 1) == num_transactions: item_ends = num_elements elif (trans_index + i + 1) < num_transactions: item_ends = d_offsets[trans_index + i + 1] else: continue if (tx + d_offsets[trans_index + i]) < item_ends and tx < MAX_ITEMS_PER_TRANSACTIONS: Ts[i, tx] = d_transactions[d_offsets[trans_index + i] + tx] #d_transactions[d_offsets[trans_index + i] + tx] += 1 cuda.syncthreads() for mask_id in range(0, int(ceil(num_patterns / (1.0 * BLOCK_SIZE)))): loop_tx = tx + mask_id * BLOCK_SIZE if loop_tx >= num_patterns: continue for last_seen in range(0, num_patterns): if dMask[loop_tx * num_patterns + last_seen] < 0: continue hp1 = dkeyIndex[loop_tx] hp2 = dkeyIndex[last_seen] vitem1 = hp1 % (10**power) vitem2 = hp2 % (10**power) if ((vitem1 - 1) * 3 + 1) < size_iil_d and ( (vitem2 - 1) * 3 + 1) < size_iil_d: index_item1 = iil_d[(vitem1 - 1) * 3 + 1] index_item2 = iil_d[(vitem2 - 1) * 3 + 1] if index_item1 < size_api_d and index_item2 < size_api_d: item1 = api_d[index_item1] item2 = api_d[index_item2] else: continue else: continue vcommon_pattern = hp1 / (10**power) if ((vcommon_pattern - 1) * 3 + 1) < size_iil_d: index_vpat1 = iil_d[(vcommon_pattern - 1) * 3 + 1] if index_vpat1 < size_api_d: vpat1 = api_d[index_vpat1] #array of max patterns else: continue else: continue for tid in range(0, MAX_TRANSACTIONS_PER_SM): flag1 = False flag2 = False fpat1 = False for titem in range(0, MAX_ITEMS_PER_TRANSACTIONS): if Ts[tid, titem] == item1: flag1 = True elif Ts[tid, titem] == item2: flag2 = True elif Ts[tid, titem] == vpat1: fpat1 = True present_flag = flag1 and flag2 and fpat1 if present_flag: cuda.atomic.add(dMask, loop_tx * num_patterns + last_seen, 1)
def batch_k_selection(A, I, k): """QuickSelect """ sampleIdx = cuda.blockIdx.x tid = cuda.threadIdx.x # XXX: hardcoded array size for maximum capability values = cuda.shared.array(shape=1000, dtype=float64) indices = cuda.shared.array(shape=1000, dtype=int16) storeidx = cuda.shared.array(shape=1, dtype=int32) rightidx = cuda.shared.array(shape=1, dtype=int32) # Prefill cache values[tid] = A[tid, sampleIdx] indices[tid] = tid cuda.syncthreads() st = 0 rst = 0 n = A.shape[0] left = 0 right = n - 1 val = 0.0 ind = 0 while left < right: # for _ in range(1): st = -1 rst = -1 pivot = right #(right + left + 1) // 2 storeidx[0] = left rightidx[0] = 0 pval = values[pivot] # Move pivot to the end # if tid == 0: # print(7777777) # print(left + 0) # print(right + 0) # print(pivot + 0) # swapf(values, right, pivot) # swapi(indices, right, pivot) cuda.syncthreads() # Compare if tid >= left and tid < right: val = values[tid] ind = indices[tid] if val < pval: st = cuda.atomic.add(storeidx, 0, 1) else: rst = cuda.atomic.add(rightidx, 0, 1) cuda.syncthreads() finalpivot = storeidx[0] if rst != -1: # Assign right partition index st = finalpivot + rst # Swap if st != -1 and st != tid: values[st] = val indices[st] = ind cuda.syncthreads() # Move pivot to final destination if tid == 0: swapf(values, finalpivot, right) swapi(indices, finalpivot, right) cuda.syncthreads() # Adjust range or done remain = n - finalpivot if remain == k: break elif remain > k: left = finalpivot + 1 else: right = finalpivot - 1 if tid < k: I[tid, sampleIdx] = indices[n - tid - 1]
def selfJoinGPU(input_d, output_d, num_elements, power): tx = cuda.threadIdx.x #index = tx + cuda.blockIdx.x * cuda.blockDim.x start = cuda.blockIdx.x * MAX_ITEM_PER_SM sm1 = cuda.shared.array(MAX_ITEM_PER_SM, int32) sm2 = cuda.shared.array(MAX_ITEM_PER_SM, int32) actual_items_per_sm = num_elements - start if actual_items_per_sm >= MAX_ITEM_PER_SM: actual_items_per_sm = MAX_ITEM_PER_SM for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE if location_x < actual_items_per_sm and (start + location_x) < num_elements: sm1[location_x] = input_d[start + location_x] else: sm1[location_x] = 0 if cuda.blockIdx.x == 2 and sm1[location_x] == 405: print -3 print location_x print -3 cuda.syncthreads() for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))): loop_tx = tx + i * BLOCK_SIZE if loop_tx < actual_items_per_sm: for j in range(loop_tx + 1, actual_items_per_sm): if (sm1[loop_tx] / (10**power)) == (sm1[j] / (10**power)): output_d[(start + loop_tx) * num_elements + (start + j)] = 0 # else: # output_d[(start + loop_tx) * num_elements + (start + j)] = -1 cuda.syncthreads() # if (cuda.blockIdx.x + 1) < ceil(num_elements / (1.0 * MAX_ITEM_PER_SM)): # pass current_smid = 0 for smid in range(cuda.blockIdx.x + 1, ceil(num_elements / (1.0 * MAX_ITEM_PER_SM))): actual_items_per_secondary_sm = num_elements - current_smid * MAX_ITEM_PER_SM - start - MAX_ITEM_PER_SM if actual_items_per_secondary_sm > MAX_ITEM_PER_SM: actual_items_per_secondary_sm = MAX_ITEM_PER_SM for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE temp = sm1[3] if location_x < actual_items_per_secondary_sm and ( current_smid * MAX_ITEM_PER_SM + start + location_x) < num_elements: if cuda.blockIdx.x == 2 and tx == 0: print 99 print sm1[3] print 99 sm2[location_x] = input_d[(current_smid + 1) * MAX_ITEM_PER_SM + start + location_x] else: sm2[location_x] = 0 if cuda.blockIdx.x == 2 and tx == 0: print 100 print sm1[3] print 100 if cuda.blockIdx.x == 2 and sm2[location_x] == 406: print -2 print location_x print sm2[location_x] print sm1[0] print sm1[1] print sm1[2] print sm1[3] print -2 cuda.syncthreads() if cuda.blockIdx.x == 2: sm1[3] = 405 cuda.syncthreads() for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))): loop_tx = tx + i * BLOCK_SIZE if sm1[loop_tx] == 405: print -1 print sm2[0] print -1 if loop_tx < actual_items_per_sm: j = 0 while j < actual_items_per_secondary_sm: if (sm1[loop_tx] / (10**power)) == (sm2[j] / (10**power)): output_d[(start + loop_tx) * num_elements + (current_smid + 1) * MAX_ITEM_PER_SM + start + j] = 0 # else: # output_d[(start + loop_tx) * num_elements + smid * MAX_ITEM_PER_SM + start + j] = -1 j += 1 current_smid += 1
def RadixGPU(in_d, out_d, in_size): private_shared_in = cuda.shared.array(SM_SIZE, uint32) private_split = cuda.shared.array(SM_SIZE, uint32) private_scan = cuda.shared.array(SM_SIZE, uint32) start = 2 * cuda.blockDim.x * cuda.blockIdx.x tx = cuda.threadIdx.x index = tx + start ############### Put 2 values per each thread into shared memory ############## if index < in_size: private_shared_in[tx] = in_d[index] else: private_shared_in[tx] = 2 ** (DATA_TYPE - 1) #0xffffffff if (index + BLOCK_SIZE) < in_size: private_shared_in[tx + BLOCK_SIZE] = in_d[index + BLOCK_SIZE] else: private_shared_in[tx + BLOCK_SIZE] = 2 ** (DATA_TYPE - 1) #0xffffffff cuda.syncthreads() total_falses = 0.0 t = 0 f = 0 bit = 0 d = 1 for bit_shift in range(0, DATA_TYPE): bit = private_shared_in[tx] & (1 << bit_shift) if bit > 0: bit = 1 private_split[tx] = 1 - bit private_scan[tx] = 1 - bit bit = private_shared_in[tx + BLOCK_SIZE] & (1 << bit_shift) if bit > 0: bit = 1 private_split[tx + BLOCK_SIZE] = 1 - bit private_scan[tx + BLOCK_SIZE] = 1 - bit cuda.syncthreads() ########################### Do the first scan ############################## d = 1 while d <= BLOCK_SIZE: tk = 2 * d * (tx + 1) - 1 if tk < (2 * BLOCK_SIZE): private_scan[tk] += private_scan[tk - d] d *= 2 cuda.syncthreads() ############################ Do the second scan ############################# d = BLOCK_SIZE / 2 while d > 0: tk = 2 * d * (tx + 1) - 1 if (tk + d) < (2 * BLOCK_SIZE): private_scan[tk + d] += private_scan[tk] d /= 2 cuda.syncthreads() ############################################################################# # temp_index = tx + 1 # if index < in_size: # private_split_ex[temp_index] = private_split[tx] # if (index + BLOCK_SIZE) < in_size and (tx + BLOCK_SIZE) != (2 * BLOCK_SIZE - 1): # private_split_ex[temp_index + BLOCK_SIZE] = private_split[tx + BLOCK_SIZE] # total_falses = private_split[2 * BLOCK_SIZE - 1] # private_split_ex[start] = 0.0 total_falses = private_scan[SM_SIZE - 1] t = total_falses f = 0 if tx != 0: t = tx - private_scan[tx - 1] + total_falses f = private_scan[tx - 1] if private_split[tx] == 1: private_split[tx] = f else: private_split[tx] = t t = (tx + BLOCK_SIZE) - private_scan[tx + BLOCK_SIZE - 1] + total_falses f = private_scan[tx + BLOCK_SIZE - 1] if private_split[tx + BLOCK_SIZE] == 1: private_split[tx + BLOCK_SIZE] = f else: private_split[tx + BLOCK_SIZE] = t cuda.syncthreads() private_scan[private_split[tx]] = private_shared_in[tx] private_scan[private_split[tx + BLOCK_SIZE]] = private_shared_in[tx + BLOCK_SIZE] cuda.syncthreads() private_shared_in[tx] = private_scan[tx] private_shared_in[tx + BLOCK_SIZE] = private_scan[tx + BLOCK_SIZE] cuda.syncthreads() if index < in_size: out_d[index] = private_shared_in[tx] if (index + BLOCK_SIZE) < in_size: out_d[index + BLOCK_SIZE] = private_shared_in[tx + BLOCK_SIZE]
def selfJoinGPU(input_d, output_d, num_elements, power): tx = cuda.threadIdx.x #index = tx + cuda.blockIdx.x * cuda.blockDim.x start = cuda.blockIdx.x * MAX_ITEM_PER_SM sm1 = cuda.shared.array(MAX_ITEM_PER_SM, int32) sm2 = cuda.shared.array(MAX_ITEM_PER_SM, int32) actual_items_per_sm = num_elements - start if actual_items_per_sm >= MAX_ITEM_PER_SM: actual_items_per_sm = MAX_ITEM_PER_SM for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE if location_x < actual_items_per_sm and (start + location_x) < num_elements: sm1[location_x] = input_d[start + location_x] else: sm1[location_x] = 0 if cuda.blockIdx.x == 2 and sm1[location_x] == 405: print -3 print location_x print -3 cuda.syncthreads() for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))): loop_tx = tx + i * BLOCK_SIZE if loop_tx < actual_items_per_sm: for j in range(loop_tx + 1, actual_items_per_sm): if (sm1[loop_tx] / (10 ** power)) == (sm1[j] / (10 ** power)): output_d[(start + loop_tx) * num_elements + (start + j)] = 0 # else: # output_d[(start + loop_tx) * num_elements + (start + j)] = -1 cuda.syncthreads() # if (cuda.blockIdx.x + 1) < ceil(num_elements / (1.0 * MAX_ITEM_PER_SM)): # pass current_smid = 0 for smid in range(cuda.blockIdx.x + 1, ceil(num_elements / (1.0 * MAX_ITEM_PER_SM))): actual_items_per_secondary_sm = num_elements - current_smid * MAX_ITEM_PER_SM - start - MAX_ITEM_PER_SM if actual_items_per_secondary_sm > MAX_ITEM_PER_SM: actual_items_per_secondary_sm = MAX_ITEM_PER_SM for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))): location_x = tx + i * BLOCK_SIZE temp = sm1[3] if location_x < actual_items_per_secondary_sm and (current_smid * MAX_ITEM_PER_SM + start + location_x) < num_elements: if cuda.blockIdx.x == 2 and tx == 0: print 99 print sm1[3] print 99 sm2[location_x] = input_d[(current_smid + 1) * MAX_ITEM_PER_SM + start + location_x] else: sm2[location_x] = 0 if cuda.blockIdx.x == 2 and tx == 0: print 100 print sm1[3] print 100 if cuda.blockIdx.x == 2 and sm2[location_x] == 406: print -2 print location_x print sm2[location_x] print sm1[0] print sm1[1] print sm1[2] print sm1[3] print -2 cuda.syncthreads() if cuda.blockIdx.x == 2: sm1[3] = 405 cuda.syncthreads() for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))): loop_tx = tx + i * BLOCK_SIZE if sm1[loop_tx] == 405: print -1 print sm2[0] print -1 if loop_tx < actual_items_per_sm: j = 0 while j < actual_items_per_secondary_sm: if (sm1[loop_tx] / (10 ** power)) == (sm2[j] / (10 ** power)): output_d[(start + loop_tx) * num_elements + (current_smid + 1) * MAX_ITEM_PER_SM + start + j] = 0 # else: # output_d[(start + loop_tx) * num_elements + smid * MAX_ITEM_PER_SM + start + j] = -1 j += 1 current_smid += 1
def RadixGPU(in_d, out_d, in_size): private_shared_in = cuda.shared.array(SM_SIZE, uint32) private_split = cuda.shared.array(SM_SIZE, uint32) private_scan = cuda.shared.array(SM_SIZE, uint32) start = 2 * cuda.blockDim.x * cuda.blockIdx.x tx = cuda.threadIdx.x index = tx + start ############### Put 2 values per each thread into shared memory ############## if index < in_size: private_shared_in[tx] = in_d[index] else: private_shared_in[tx] = 2**(DATA_TYPE - 1) #0xffffffff if (index + BLOCK_SIZE) < in_size: private_shared_in[tx + BLOCK_SIZE] = in_d[index + BLOCK_SIZE] else: private_shared_in[tx + BLOCK_SIZE] = 2**(DATA_TYPE - 1) #0xffffffff cuda.syncthreads() total_falses = 0.0 t = 0 f = 0 bit = 0 d = 1 for bit_shift in range(0, DATA_TYPE): bit = private_shared_in[tx] & (1 << bit_shift) if bit > 0: bit = 1 private_split[tx] = 1 - bit private_scan[tx] = 1 - bit bit = private_shared_in[tx + BLOCK_SIZE] & (1 << bit_shift) if bit > 0: bit = 1 private_split[tx + BLOCK_SIZE] = 1 - bit private_scan[tx + BLOCK_SIZE] = 1 - bit cuda.syncthreads() ########################### Do the first scan ############################## d = 1 while d <= BLOCK_SIZE: tk = 2 * d * (tx + 1) - 1 if tk < (2 * BLOCK_SIZE): private_scan[tk] += private_scan[tk - d] d *= 2 cuda.syncthreads() ############################ Do the second scan ############################# d = BLOCK_SIZE / 2 while d > 0: tk = 2 * d * (tx + 1) - 1 if (tk + d) < (2 * BLOCK_SIZE): private_scan[tk + d] += private_scan[tk] d /= 2 cuda.syncthreads() ############################################################################# # temp_index = tx + 1 # if index < in_size: # private_split_ex[temp_index] = private_split[tx] # if (index + BLOCK_SIZE) < in_size and (tx + BLOCK_SIZE) != (2 * BLOCK_SIZE - 1): # private_split_ex[temp_index + BLOCK_SIZE] = private_split[tx + BLOCK_SIZE] # total_falses = private_split[2 * BLOCK_SIZE - 1] # private_split_ex[start] = 0.0 total_falses = private_scan[SM_SIZE - 1] t = total_falses f = 0 if tx != 0: t = tx - private_scan[tx - 1] + total_falses f = private_scan[tx - 1] if private_split[tx] == 1: private_split[tx] = f else: private_split[tx] = t t = (tx + BLOCK_SIZE) - private_scan[tx + BLOCK_SIZE - 1] + total_falses f = private_scan[tx + BLOCK_SIZE - 1] if private_split[tx + BLOCK_SIZE] == 1: private_split[tx + BLOCK_SIZE] = f else: private_split[tx + BLOCK_SIZE] = t cuda.syncthreads() private_scan[private_split[tx]] = private_shared_in[tx] private_scan[private_split[tx + BLOCK_SIZE]] = private_shared_in[tx + BLOCK_SIZE] cuda.syncthreads() private_shared_in[tx] = private_scan[tx] private_shared_in[tx + BLOCK_SIZE] = private_scan[tx + BLOCK_SIZE] cuda.syncthreads() if index < in_size: out_d[index] = private_shared_in[tx] if (index + BLOCK_SIZE) < in_size: out_d[index + BLOCK_SIZE] = private_shared_in[tx + BLOCK_SIZE]