def cu_square_matrix_mul(A, B, C):
    sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
    sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    bw = cuda.blockDim.x
    bh = cuda.blockDim.y

    x = tx + bx * bw
    y = ty + by * bh

    acc = float32(0)  # forces all the math to be f32
    for i in range(bpg):
        if x < n and y < n:
            sA[ty, tx] = A[y, tx + i * tpb]
            sB[ty, tx] = B[ty + i * tpb, x]

        cuda.syncthreads()

        if x < n and y < n:
            for j in range(tpb):
                acc += sA[ty, j] * sB[j, tx]

        cuda.syncthreads()

    if x < n and y < n:
        C[y, x] = acc
def jacobi_relax_core(A, Anew, error):
    smem = cuda.shared.array(shape=(32 + 2, 32 + 2), dtype=f8)

    n = A.shape[0]
    m = A.shape[1]

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    j = ty + cuda.blockIdx.y * cuda.blockDim.y
    i = tx + cuda.blockIdx.x * cuda.blockDim.x

    sy = ty + 1
    sx = tx + 1

    smem[sy, sx] = A[j, i]
    if tx == 0 and i >= 1:
        smem[sy, 0] = A[j, i - 1]
    if ty == 0 and j < m - 1:
        smem[0, sx] = A[j - 1, i]
    if tx == 31 and j >= 1:
        smem[sy, 33] = A[j, i + 1]
    if ty == 31 and j < n - 1:
        smem[33, sx] = A[j + 1, i]

    cuda.syncthreads()  # ensure smem is visible by all threads in the block

    if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
        Anew[j, i] = 0.25 * (smem[sy, sx + 1] + smem[sy, sx - 1]
                             + smem[sy - 1, sx] + smem[sy + 1, sx])
        error[j, i] = Anew[j, i] - A[j, i]
def _getOccupancyCUDAkernel(occus, coords, centers, channelsigmas, trunc):
    centeridx = cuda.blockIdx.x
    blockidx = cuda.blockIdx.y
    atomidx = cuda.threadIdx.x + (cuda.blockDim.x * blockidx)

    if atomidx >= coords.shape[0] or centeridx >= centers.shape[0]:
        return

    # TODO: Can remove this. Barely any speedup
    centcoor = cuda.shared.array(shape=(3), dtype=numba.float32)
    centcoor[0] = centers[centeridx, 0]
    centcoor[1] = centers[centeridx, 1]
    centcoor[2] = centers[centeridx, 2]
    cuda.syncthreads()

    dx = coords[atomidx, 0] - centcoor[0]
    dy = coords[atomidx, 1] - centcoor[1]
    dz = coords[atomidx, 2] - centcoor[2]
    d2 = dx * dx + dy * dy + dz * dz
    if d2 >= trunc:
        return

    d1 = 1 / sqrt(d2)
    for h in range(channelsigmas.shape[1]):
        if channelsigmas[atomidx, h] == 0:
            continue
        x = channelsigmas[atomidx, h] * d1
        value = 1 - exp(-(x ** 12))
        cuda.atomic.max(occus, (centeridx, h), value)
def fast_matmul(A, B, C):
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x  # blocks per grid

    if x >= C.shape[0] and y >= C.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return

    # Each thread computes one element in the result matrix.
    # The dot product is chunked into dot products of TPB-long vectors.
    tmp = 0.
    for i in range(bpg):
        # Preload data into shared memory
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]

        # Wait until all threads finish computing
        cuda.syncthreads()

    C[x, y] = tmp
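# A minimal host-side launch sketch for the shared-memory matmul kernel above.
# It is illustrative only: the value of TPB, the square float32 inputs whose
# dimensions are multiples of TPB, and the helper name `launch_fast_matmul` are
# assumptions rather than part of the original snippet, and `fast_matmul` is
# assumed to be decorated with @cuda.jit at module level.
import numpy as np
from numba import cuda

TPB = 16  # threads per block along each axis (assumed)

def launch_fast_matmul(A, B):
    # Allocate the output and move all operands to the device.
    C = np.zeros((A.shape[0], B.shape[1]), dtype=np.float32)
    d_A, d_B, d_C = cuda.to_device(A), cuda.to_device(B), cuda.to_device(C)
    # One thread per output element, TPB x TPB threads per block.
    threads = (TPB, TPB)
    blocks = ((C.shape[0] + TPB - 1) // TPB, (C.shape[1] + TPB - 1) // TPB)
    fast_matmul[blocks, threads](d_A, d_B, d_C)
    return d_C.copy_to_host()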
def simple_smem(ary):
    sm = cuda.shared.array(N, int32)
    i = cuda.grid(1)
    if i == 0:
        for j in range(N):
            sm[j] = j
    cuda.syncthreads()
    ary[i] = sm[i]
def atomic_add(ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, uint32)
    sm[tid] = 0
    cuda.syncthreads()
    bin = ary[tid] % 32
    cuda.atomic.add(sm, bin, 1)
    cuda.syncthreads()
    ary[tid] = sm[tid]
def atomic_add_float(ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, float32)
    sm[tid] = 0
    cuda.syncthreads()
    bin = int(ary[tid] % 32)
    cuda.atomic.add(sm, bin, 1.0)
    cuda.syncthreads()
    ary[tid] = sm[tid]
def atomic_add_double(idx, ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, float64)
    sm[tid] = 0.0
    cuda.syncthreads()
    bin = idx[tid] % 32
    cuda.atomic.add(sm, bin, 1.0)
    cuda.syncthreads()
    ary[tid] = sm[tid]
def atomic_add3(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    sm = cuda.shared.array((4, 8), uint32)
    sm[tx, ty] = ary[tx, ty]
    cuda.syncthreads()
    cuda.atomic.add(sm, (tx, uint64(ty)), 1)
    cuda.syncthreads()
    ary[tx, ty] = sm[tx, ty]
def atomic_add_float_2(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    sm = cuda.shared.array((4, 8), float32)
    sm[tx, ty] = ary[tx, ty]
    cuda.syncthreads()
    cuda.atomic.add(sm, (tx, ty), 1)
    cuda.syncthreads()
    ary[tx, ty] = sm[tx, ty]
def idx_kernel(arr):
    s = cuda.shared.array(shape=maxThread, dtype=int32)

    idx = cuda.grid(1)

    if idx < arr.shape[0]:
        s[cuda.threadIdx.x] = 1
    cuda.syncthreads()

    if idx < arr.shape[0]:
        cuda.atomic.add(arr, s[cuda.threadIdx.x], 1)
def problematic(x, y):
    tid = cuda.threadIdx.x
    ntid = cuda.blockDim.x

    if tid > 12:
        for i in range(ntid):
            y[i] += x[i] // y[i]

    cuda.syncthreads()

    if tid < 17:
        for i in range(ntid):
            x[i] += x[i] // y[i]
def atomic_max_double_shared(res, ary):
    tid = cuda.threadIdx.x
    smary = cuda.shared.array(32, float64)
    smary[tid] = ary[tid]
    smres = cuda.shared.array(1, float64)
    if tid == 0:
        smres[0] = res[0]
    cuda.syncthreads()
    cuda.atomic.max(smres, 0, smary[tid])
    cuda.syncthreads()
    if tid == 0:
        res[0] = smres[0]
def device_reduce_full_block(arr, partials, sm_partials): """ Partially reduce `arr` into `partials` using `sm_partials` as working space. The algorithm goes like: array chunks of 128: | 0 | 128 | 256 | 384 | 512 | block-0: | x | | | x | | block-1: | | x | | | x | block-2: | | | x | | | The array is divided into chunks of 128 (size of a threadblock). The threadblocks consumes the chunks in roundrobin scheduling. First, a threadblock loads a chunk into temp memory. Then, all subsequent chunks are combined into the temp memory. Once all chunks are processed. Inner-block reduction is performed on the temp memory. So that, there will just be one scalar result per block. The result from each block is stored to `partials` at the dedicated slot. """ tid = cuda.threadIdx.x blkid = cuda.blockIdx.x blksz = cuda.blockDim.x gridsz = cuda.gridDim.x # block strided loop to compute the reduction start = tid + blksz * blkid stop = arr.size step = blksz * gridsz # load first value tmp = arr[start] # loop over all values in block-stride for i in range(start + step, stop, step): tmp = reduce_op(tmp, arr[i]) cuda.syncthreads() # inner-warp reduction inner_warp_reduction(sm_partials, tmp) cuda.syncthreads() # at this point, only the first slot for each warp in tsm_partials # is valid. # finish up block reduction # warning: this is assuming 4 warps. # assert numwarps == 4 if tid < 2: sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0], sm_partials[tid + 2, 0]) if tid == 0: partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
def oracle(x, y):
    tid = cuda.threadIdx.x
    ntid = cuda.blockDim.x

    if tid > 12:
        for i in range(ntid):
            if y[i] != 0:
                y[i] += x[i] // y[i]

    cuda.syncthreads()

    if tid < 17:
        for i in range(ntid):
            if y[i] != 0:
                x[i] += x[i] // y[i]
def gpu_unique_k(arr, k, out, outsz_ptr): """ Note: run with small blocks. """ tid = cuda.threadIdx.x blksz = cuda.blockDim.x base = 0 # shared memory vset_size = 0 sm_mem_size = MAX_FAST_UNIQUE_K vset = cuda.shared.array(sm_mem_size, dtype=nbtype) share_vset_size = cuda.shared.array(1, dtype=int32) share_loaded = cuda.shared.array(sm_mem_size, dtype=nbtype) sm_mem_size = min(k, sm_mem_size) while vset_size < sm_mem_size and base < arr.size: pos = base + tid valid_load = min(blksz, arr.size - base) # load if tid < valid_load: share_loaded[tid] = arr[pos] # wait for load to complete cuda.syncthreads() # thread-0 inserts if tid == 0: for i in range(valid_load): val = share_loaded[i] new_size = gpu_unique_set_insert(vset, vset_size, val) if new_size >= 0: vset_size = new_size else: vset_size = sm_mem_size + 1 share_vset_size[0] = vset_size # wait until the insert is done cuda.syncthreads() vset_size = share_vset_size[0] # increment base += blksz # output if vset_size <= sm_mem_size: for i in range(tid, vset_size, blksz): out[i] = vset[i] if tid == 0: outsz_ptr[0] = vset_size else: outsz_ptr[0] = -1
def kernel(input, output):
    tile = cuda.shared.array(shape=tile_shape, dtype=dt)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x * cuda.blockDim.x
    by = cuda.blockIdx.y * cuda.blockDim.y
    x = by + tx
    y = bx + ty

    if by + ty < input.shape[0] and bx + tx < input.shape[1]:
        tile[ty, tx] = input[by + ty, bx + tx]
    cuda.syncthreads()
    if y < output.shape[0] and x < output.shape[1]:
        output[y, x] = tile[tx, ty]
def argmax_lvl0(ary, reduce_max, reduce_arg): """ This only works for positive values arrays. Shared memory must be initialized with double the size of the block size. """ sm_ary = cuda.shared.array(shape = 0, dtype = ary.dtype) # each thread will process two elements tgid = cuda.grid(1) thid = cuda.threadIdx.x # pointer to value and argument side of shared memory val_pointer = 0 arg_pointer = sm_ary.size / 2 # when global thread id is bigger or equal than the ary size # it means that the block is incomplete; in this case we just # fill the rest of the block with -1 so it is smaller than all # other elements; this only works for positive arrays if tgid < ary.size: sm_ary[val_pointer + thid] = ary[tgid] sm_ary[arg_pointer + thid] = tgid else: sm_ary[val_pointer + thid] = 0 sm_ary[arg_pointer + thid] = -1 cuda.syncthreads() s = cuda.blockDim.x / 2 while s >0: index = 2 * s * thid if thid < s: # only change if the left element is smaller than the right one if sm_ary[val_pointer + thid] < sm_ary[val_pointer + thid + s]: sm_ary[val_pointer + thid] = sm_ary[val_pointer + thid + s] sm_ary[arg_pointer + index] = sm_ary[arg_pointer + index + s] cuda.syncthreads() if thid == 0: reduce_ary[cuda.blockIdx.x] = sm_ary[val_pointer] reduce_arg[cuda.blockIdx.x] = sm_ary[arg_pointer]
def gpu_single_block_sum(arr, out):
    """
    A naive single-threadblock sum reduction
    """
    temp = cuda.shared.array(gpu_block_sum_max_blockdim, dtype=float32)
    tid = cuda.threadIdx.x
    blksz = cuda.blockDim.x
    temp[tid] = 0
    # block-stride loop to sum-reduce cooperatively
    for i in range(tid, arr.size, blksz):
        temp[tid] += arr[i]
    cuda.syncthreads()
    # naive intra-block sum that uses a single thread
    if tid == 0:
        for i in range(1, blksz):
            temp[0] += temp[i]
        # store result
        out[0] = temp[0]
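# A hedged usage sketch for the single-block reduction above. The concrete
# value of `gpu_block_sum_max_blockdim` and the @cuda.jit decoration of
# gpu_single_block_sum are assumptions; the kernel is launched with exactly
# one block, so its block-stride loop covers the whole input array.
import numpy as np
from numba import cuda

gpu_block_sum_max_blockdim = 512  # assumed block size (and shared array size)

def single_block_sum(arr):
    d_arr = cuda.to_device(np.asarray(arr, dtype=np.float32))
    d_out = cuda.device_array(1, dtype=np.float32)
    gpu_single_block_sum[1, gpu_block_sum_max_blockdim](d_arr, d_out)
    return d_out.copy_to_host()[0]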
def max_kernel(a, b):
    "Simple implementation of reduction kernel"
    # Allocate static shared memory of 256.
    # This limits the maximum block size to 256.
    sa = cuda.shared.array(shape=(256,), dtype=int32)
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    i = tx + bx * bw
    if i < a.shape[0]:
        sa[tx] = a[i]
        if tx == 0:
            # Uses the first thread of each block to perform the actual
            # reduction
            m = sa[tx]
            cuda.syncthreads()
            for j in range(1, bw):
                m = mymax(m, sa[j])
            b[bx] = m
def device_reduce_partial_block(arr, partials, sm_partials): """ This computes reduction on `arr`. This device function must be used by 1 threadblock only. The blocksize must match `arr.size` and must not be greater than 128. """ tid = cuda.threadIdx.x blkid = cuda.blockIdx.x blksz = cuda.blockDim.x warpid = tid // _WARPSIZE laneid = tid % _WARPSIZE size = arr.size # load first value tid = cuda.threadIdx.x value = arr[tid] sm_partials[warpid, laneid] = value cuda.syncthreads() if (warpid + 1) * _WARPSIZE < size: # fully populated warps inner_warp_reduction(sm_partials, value) else: # partially populated warps # NOTE: this uses a very inefficient sequential algorithm if laneid == 0: sm_this = sm_partials[warpid, :] base = warpid * _WARPSIZE for i in range(1, size - base): sm_this[0] = reduce_op(sm_this[0], sm_this[i]) cuda.syncthreads() # finish up if tid == 0: num_active_warps = (blksz + _WARPSIZE - 1) // _WARPSIZE result = sm_partials[0, 0] for i in range(1, num_active_warps): result = reduce_op(result, sm_partials[i, 0]) partials[blkid] = result
def experimental_sum_fq(g_odata, g_idata, n): _, qx = cuda.grid(2) sdata = cuda.shared.array(512, f4) tid = cuda.threadIdx.x bd = cuda.blockDim.x bid = cuda.blockIdx.x i = bid * bd * 2 + tid gridsize = bd * 2 * cuda.gridDim.x sdata[tid] = 0. while i < n: if i + bd >= len(g_idata): sdata[tid] += g_idata[i, qx] else: sdata[tid] += g_idata[i, qx] + g_idata[i + bd, qx] i += gridsize cuda.syncthreads() if bd >= 512: if tid < 256: sdata[tid] += sdata[tid + 256] cuda.syncthreads() if bd >= 256: if tid < 128: sdata[tid] += sdata[tid + 128] cuda.syncthreads() if bd >= 128: if tid < 64: sdata[tid] += sdata[tid + 64] cuda.syncthreads() if tid < 32: if bd >= 64: sdata[tid] += sdata[tid + 32] if bd >= 32: sdata[tid] += sdata[tid + 16] if bd >= 16: sdata[tid] += sdata[tid + 8] if bd >= 8: sdata[tid] += sdata[tid + 4] if bd >= 4: sdata[tid] += sdata[tid + 2] if bd >= 2: sdata[tid] += sdata[tid + 1] if tid == 0: g_odata[cuda.blockIdx.x, qx] = sdata[0]
def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1): threadsPerBlock = 8 * 8 row_start = cuda.blockIdx.x col_start = cuda.blockIdx.y tx = cuda.threadIdx.x row_size = min(N - row_start * threadsPerBlock, threadsPerBlock) col_size = min(K - col_start * threadsPerBlock, threadsPerBlock) block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) dev_query_box_idx = threadsPerBlock * col_start + tx dev_box_idx = threadsPerBlock * row_start + tx if tx < col_size: block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0] block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1] block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2] block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3] block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4] if tx < row_size: block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0] block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1] block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2] block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3] block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4] cuda.syncthreads() if tx < row_size: for i in range(col_size): offset = (row_start * threadsPerBlock * K + col_start * threadsPerBlock + tx * K + i) dev_iou[offset] = devRotateIoUEval( block_qboxes[i * 5:i * 5 + 5], block_boxes[tx * 5:tx * 5 + 5], criterion, )
def calc_albedo_gpu(SURFALBEDSW, SURFALBEDLW, OCEANMASK, SOILTEMP):
    i, j = cuda.grid(2)
    # ocean
    if OCEANMASK[i, j] == 1:
        SURFALBEDSW[i, j] = 0.05
        #SURFALBEDLW[i, j] = 0.05
        SURFALBEDLW[i, j] = 0.00
    # land
    else:
        SURFALBEDSW[i, j] = 0.2
        #SURFALBEDLW[i, j] = 0.2
        SURFALBEDLW[i, j] = 0.0
    # ice (land and sea)
    if SOILTEMP[i, j, 0] <= 273.15:
        SURFALBEDSW[i, j] = 0.5
        #SURFALBEDLW[i, j] = 0.3
        SURFALBEDLW[i, j] = 0.0
    cuda.syncthreads()
def fast_matmul(A, B, C): """ Perform matrix multiplication of C = A * B Each thread computes one element of the result matrix C """ # Define an array in the shared memory # The size and type of the arrays must be known at compile time sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32) sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32) x, y = cuda.grid(2) tx = cuda.threadIdx.x ty = cuda.threadIdx.y if x >= C.shape[0] and y >= C.shape[1]: # Quit if (x, y) is outside of valid C boundary return # Each thread computes one element in the result matrix. # The dot product is chunked into dot products of TPB-long vectors. tmp = 0. for i in range(int(A.shape[1] / TPB)): # Preload data into shared memory sA[tx, ty] = A[x, ty + i * TPB] sB[tx, ty] = B[tx + i * TPB, y] # Wait until all threads finish preloading cuda.syncthreads() # Computes partial product on the shared memory for j in range(TPB): tmp += sA[tx, j] * sB[j, ty] # Wait until all threads finish computing cuda.syncthreads() C[x, y] = tmp
def matmul_shared_memory(A, B, C):
    """
    Matrix multiplication C = A * B using shared memory.
    """
    # Define arrays in shared memory.
    # They are shared by all threads of the block.
    # Their size and dtype must be declared at compile time.
    sA = cuda.shared.array(shape=(BLOCK_SIZE, BLOCK_SIZE), dtype=float32)
    sB = cuda.shared.array(shape=(BLOCK_SIZE, BLOCK_SIZE), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    row = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
    col = cuda.threadIdx.y + cuda.blockDim.y * cuda.blockIdx.y

    if row >= C.shape[0] and col >= C.shape[1]:
        # Quit when (x, y) is out of bounds
        return

    tmp = 0.
    # Work in tiles of BLOCK_SIZE x BLOCK_SIZE
    for m in range(math.ceil(A.shape[1] / BLOCK_SIZE)):
        sA[tx, ty] = A[row, ty + m * BLOCK_SIZE]
        sB[tx, ty] = B[tx + m * BLOCK_SIZE, col]
        # Thread synchronization: wait until all threads in the block have
        # finished preloading before moving on
        cuda.syncthreads()
        # At this point the sub-matrices of A and B have been copied into sA and sB
        # Compute the dot product on shared memory;
        # reading directly from shared memory has very low latency
        for n in range(BLOCK_SIZE):
            tmp += sA[tx, n] * sB[n, ty]
        # Thread synchronization: wait until all threads in the block have
        # finished computing
        cuda.syncthreads()
    # After the loop, each thread holds the accumulated dot product for its element
    C[row, col] = tmp
def copy_strides(arr, n, stride, tpb):
    sm = cuda.shared.array(1, dtype=uint32)
    i = cuda.threadIdx.x
    base = 0
    if i == 0:
        sm[0] = 0
    val = arr[0]
    while base < n:
        idx = base + i
        if idx < n:
            val = arr[idx * stride]
        cuda.syncthreads()
        if base + i < n:
            arr[sm[0] + i] = val
        if i == 0:
            sm[0] += tpb
        base += tpb
def k__histogram_shmem(x, xmin, xmax, histogram):
    """
    Recall the two common usages of shared memory, i.e., caching and buffering;
    this example demonstrates usage #2. Here we benefit from putting the
    write-intensive operations into shared memory and then collecting the
    tally once a block has finished its calculation.
    """
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    # though unnecessary in the present case, below is always a good habit
    if start >= x.shape[0]:
        return

    # allocate space in the shared memory whose size must be a constant
    tid = cuda.threadIdx.x
    hist_buffer = cuda.shared.array(nbins, types.int32)
    for i in range(nbins):
        hist_buffer[i] = 0
    cuda.syncthreads()  # this is important

    # real calculations
    bin_width = (xmax - xmin) / nbins
    for idx in range(start, x.shape[0], stride):
        bin_number = np.int32((x[idx] - xmin) / bin_width)
        if 0 <= bin_number < nbins:
            # writing in the shared memory
            cuda.atomic.add(hist_buffer, bin_number, 1)
    cuda.syncthreads()  # this is important

    # move the tallied result back to the output array
    if tid < nbins:  # assuming blockdim >= nbins
        cuda.atomic.add(histogram, tid, hist_buffer[tid])
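# A hedged host-side sketch for the shared-memory histogram kernel above. The
# value of `nbins`, the launch configuration, and the helper name are
# assumptions made for illustration; the kernel is assumed to be compiled with
# @cuda.jit and to read `nbins` as a module-level constant, as the snippet
# implies, so the host-side value here must match it.
import numpy as np
from numba import cuda

nbins = 64  # assumed; must not exceed the block size used below

def run_histogram(x, xmin, xmax):
    d_x = cuda.to_device(np.asarray(x, dtype=np.float32))
    d_hist = cuda.to_device(np.zeros(nbins, dtype=np.int32))
    threads = 128                               # >= nbins so the final copy-out works
    blocks = (x.size + threads - 1) // threads  # one thread per input element
    k__histogram_shmem[blocks, threads](d_x, xmin, xmax, d_hist)
    return d_hist.copy_to_host()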
def fast_matmul(A, B, C): # Define an array in the shared memory # The size and type of the arrays must be known at compile time sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32) sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32) x, y = cuda.grid(2) tx = cuda.threadIdx.x ty = cuda.threadIdx.y bpg = cuda.gridDim.x # blocks per grid if x >= C.shape[0] and y >= C.shape[1]: # Quit if (x, y) is outside of valid C boundary return # Each thread computes one element in the result matrix. # The dot product is chunked into dot products of TPB-long vectors. tmp = 0. for i in range(bpg): print(x, ty, i, TPB, tx, i, TPB, y) # Preload data into shared memory sA[tx, ty] = A[x, ty + i * TPB] # row sB[tx, ty] = B[tx + i * TPB, y] # col # Wait until all threads finish preloading cuda.syncthreads() # Computes partial product on the shared memory for j in range(TPB): tmp += sA[tx, j] * sB[j, ty] # Wait until all threads finish computing cuda.syncthreads() # print(x,y, tmp) C[x,y] = tmp
def kernel_shared(dVARdt, VAR): sVAR = cuda.shared.array(shape=(shared_memory_size), dtype=float32) i, j, k = cuda.grid(3) si = cuda.threadIdx.x + 1 sj = cuda.threadIdx.y + 1 sk = cuda.threadIdx.z sVAR[si, sj, sk] = VAR[i, j, k] #cuda.syncthreads() if i_run_x: if si == 1: sVAR[si - 1, sj, sk] = VAR[i - 1, j, k] if si == cuda.blockDim.x: sVAR[si + 1, sj, sk] = VAR[i + 1, j, k] #cuda.syncthreads() if i_run_y: if sj == 1: sVAR[si, sj - 1, sk] = VAR[i, j - 1, k] if sj == cuda.blockDim.y: sVAR[si, sj + 1, sk] = VAR[i, j + 1, k] cuda.syncthreads() if i >= nb and i < nx + nb and j >= nb and j < ny + nb: tmp = wp(0.) if i_run_x: tmp += (sVAR[si + 1, sj, sk] - sVAR[si - 1, sj, sk] - sVAR[si, sj, sk]) if i_run_y: tmp += (sVAR[si, sj + 1, sk] - sVAR[si, sj - 1, sk] - sVAR[si, sj, sk]) if i_run_z: if k >= 1 and k < nz - 1: tmp += (sVAR[si, sj, sk + 1] - sVAR[si, sj, sk - 1] + sVAR[si, sj, sk]) dVARdt[i, j, k] = tmp
def cuda_sum(my_array, my_sums):
    # 1. Declare the shared memory
    shared_mem = cuda.shared.array(shape=128, dtype=numba.float32)

    # 2. Get the indices
    tidx = cuda.threadIdx.x
    idx = cuda.blockDim.x * cuda.blockIdx.x + tidx

    # 3. Initialize to zero
    shared_mem[tidx] = 0

    # 4. Each thread strides through the array by the full grid size
    while idx < my_array.size:
        shared_mem[tidx] += my_array[idx]
        idx += cuda.blockDim.x * cuda.gridDim.x
    cuda.syncthreads()

    # 5. Block-level unroll
    # We assume 128 threads per block.
    if tidx < 64:
        shared_mem[tidx] += shared_mem[tidx + 64]
    cuda.syncthreads()

    # 6. Unroll the last warp (saves the syncthreads calls)
    if tidx < 32:
        shared_mem[tidx] += shared_mem[tidx + 32]
        shared_mem[tidx] += shared_mem[tidx + 16]
        shared_mem[tidx] += shared_mem[tidx + 8]
        shared_mem[tidx] += shared_mem[tidx + 4]
        shared_mem[tidx] += shared_mem[tidx + 2]
        shared_mem[tidx] += shared_mem[tidx + 1]

    # The first thread of each block reports its block's sum.
    # If more than one block was used, the kernel must be applied again afterwards.
    if tidx == 0:
        my_sums[cuda.blockIdx.x] = shared_mem[tidx]
def convolutionColumnsGPU(d_Dst, d_Src, c_Kernel, imageW, imageH, pitch): COLUMNS_BLOCKDIM_X = 16 COLUMNS_BLOCKDIM_Y = 8 COLUMNS_RESULT_STEPS = 8 COLUMNS_HALO_STEPS = 1 KERNEL_RADIUS = 8 #cuda.const.array_like(c_Kernel) #s_Data = cuda.shared.array(shape=(COLUMNS_BLOCKDIM_X,(COLUMNS_RESULT_STEPS + 2 * COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + 1), dtype=float32) s_Data = cuda.shared.array(shape=(16,81), dtype=float32) #Offset to the upper halo edge baseX = cuda.blockIdx.x * COLUMNS_BLOCKDIM_X + cuda.threadIdx.x baseY = (cuda.blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + cuda.threadIdx.y #d_Src += baseY * pitch + baseX #d_Dst += baseY * pitch + baseX desvio = baseY * pitch + baseX #Main data for i in xrange(COLUMNS_HALO_STEPS,COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS): s_Data[cuda.threadIdx.x][cuda.threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = d_Src[desvio + (i * COLUMNS_BLOCKDIM_Y * pitch)] #Upper halo for i in xrange(COLUMNS_HALO_STEPS): s_Data[cuda.threadIdx.x][cuda.threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = d_Src[desvio + (i * COLUMNS_BLOCKDIM_Y * pitch)] if (baseY >= -i * COLUMNS_BLOCKDIM_Y) else 0 #Lower halo for i in xrange(COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS,COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS): s_Data[cuda.threadIdx.x][cuda.threadIdx.y + i * COLUMNS_BLOCKDIM_Y]= d_Src[desvio + (i * COLUMNS_BLOCKDIM_Y * pitch)] if (imageH - baseY > i * COLUMNS_BLOCKDIM_Y) else 0 #Compute and store results cuda.syncthreads() for i in xrange(COLUMNS_HALO_STEPS,COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS): sum = 0.0 for j in xrange(-KERNEL_RADIUS,KERNEL_RADIUS+1): sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[cuda.threadIdx.x][cuda.threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j] d_Dst[desvio+(i * COLUMNS_BLOCKDIM_Y * pitch)] = sum
def uLocal1D(uvalsMatrix, Plist, Qlist, Rlist, vjInvList, djlist, quadCoeffsMatrix, JLvalMatrix, JRvalMatrix, wMatrix, leftComm, rightComm, JSizeList): i = cuda.blockIdx.x j = cuda.threadIdx.x JSize = JSizeList[i] P = Plist[i] Q = Qlist[i] R = Rlist[i] vjsqrinv = vjInvList[i] dj = djlist[i] calcLocalJs(uvalsMatrix[i, :], P, Q, R, vjsqrinv, dj, quadCoeffsMatrix[i, :], JLvalMatrix[i, :], JRvalMatrix[i, :], JSize) cuda.syncthreads() #wait for all threads to finish #this seems like a bad way to do this, #if we are on the first thread if j == 1: index = 0 while index < JSize: if index != 0: JLvalMatrix[i][index] = dj * JLvalMatrix[i][ index - 1] + JLvalMatrix[i][index] if index != JSize - 1: reverseIndex = JSize - index - 1 JRvalMatrix[i][ reverseIndex - 1] = dj * JRvalMatrix[i][reverseIndex] + JRvalMatrix[i][ reverseIndex - 1] index += 1 cuda.syncthreads() wMatrix[i, j] = JLvalMatrix[i, j] + JRvalMatrix[i, j] cuda.syncthreads() if j == 1: leftComm[len(uvalsMatrix) - i - 1] = wMatrix[i][0] rightComm[i] = wMatrix[i][JSize - 1] cuda.syncthreads()
def kernel_similarity_shared(objs, sim_matrix):
    """
    Calculates the l2 distance between row and column in numba with shared memory.
    """
    shared_objs_row = cuda.shared.array((NTHREADS, NTHREADS), dtype=float32)
    shared_objs_col = cuda.shared.array((NTHREADS, NTHREADS), dtype=float32)

    row, col = cuda.grid(2)
    thread_row = cuda.threadIdx.x  # from 0 to NTHREADS - 1
    thread_col = cuda.threadIdx.y  # from 0 to NTHREADS - 1

    if row < sim_matrix.shape[0] and col < sim_matrix.shape[1]:
        current_sum = 0.0
        for i in range(BLOCKS_PER_GRID):
            shared_objs_row[thread_row, thread_col] = objs[row, thread_col + i * NTHREADS]
            shared_objs_col[thread_row, thread_col] = objs[col, thread_row + i * NTHREADS]
            cuda.syncthreads()
            for j in range(NTHREADS):
                current_sum += (shared_objs_row[thread_row, j] - shared_objs_col[thread_col, j]) ** 2
            cuda.syncthreads()
        sim_matrix[row, col] = current_sum
def __GPU_reduce_flex_C(x, out, sz): tid = cuda.threadIdx.x i = cuda.blockIdx.x * (2 * THREADS) + tid step = (THREADS * 2) * cuda.gridDim.x end = sz - THREADS buf = cuda.shared.array((THREADS, 2), dtype=numba.float32) buf[tid, 0] = 0 buf[tid, 1] = 0 while i < end: buf[tid, 0] += x[i].real + x[i + THREADS].real buf[tid, 1] += x[i].imag + x[i + THREADS].imag i += step if i < sz: buf[tid, 0] += x[i].real buf[tid, 1] += x[i].imag cuda.syncthreads() __GPU_reduce_2(buf) if tid == 0: out[0, cuda.blockIdx.x] = buf[0, 0] out[1, cuda.blockIdx.x] = buf[0, 1]
def scan_sum(g_data, aux):
    temp = cuda.shared.array(shape=1, dtype=numba.i4)

    thid = cuda.threadIdx.x  # thread id in block
    bid = cuda.blockIdx.x    # block id

    if thid == 0:
        temp[0] = aux[bid]

    tgid = cuda.grid(1)  # thread id in grid
    elid = tgid * 2      # each thread processes 2 elements

    n = g_data.size
    if elid >= n:
        return

    cuda.syncthreads()  # synchronize to make sure value to sum is loaded in memory

    g_data[elid] += aux[bid]  # do the sum
    if elid + 1 < n:
        g_data[elid + 1] += aux[bid]
def vec_sum_row(vecs, sums):
    sm = cuda.shared.array(threadsperblock, float64)
    bid = cuda.blockIdx.x
    tid = cuda.threadIdx.x
    bdim = cuda.blockDim.x

    # load shared memory with vector using block-stride loop
    lid = tid
    sm[lid] = 0
    while lid < nCols:
        sm[tid] += vecs[bid, lid]
        lid += bdim
    cuda.syncthreads()

    # perform shared memory sweep reduction
    sweep = bdim // 2
    while sweep > 0:
        if tid < sweep:
            sm[tid] += sm[tid + sweep]
        sweep = sweep // 2
        cuda.syncthreads()

    if tid == 0:
        sums[bid] = sm[0]
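# A hedged launch sketch for the per-row reduction above: one block per row,
# `threadsperblock` threads per block. Both `threadsperblock` and `nCols` are
# module-level constants in the original snippet; the values below are
# assumptions, and the block size should be a power of two for the sweep to
# reduce cleanly. The kernel is assumed to be decorated with @cuda.jit.
import numpy as np
from numba import cuda

threadsperblock = 128  # assumed, power of two
nCols = 1000           # assumed number of columns per row vector

def sum_rows(vecs):
    d_vecs = cuda.to_device(np.asarray(vecs, dtype=np.float64))
    d_sums = cuda.device_array(vecs.shape[0], dtype=np.float64)
    vec_sum_row[vecs.shape[0], threadsperblock](d_vecs, d_sums)
    return d_sums.copy_to_host()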
def fastsumall_impl(a, out): tx = int32(cuda.threadIdx.x) gtx = tx + cuda.blockIdx.x * 1024 gsize = 1024 * cuda.gridDim.x sz2 = a[0].size nc = a[0].shape[1] fshared = cuda.shared.array(shape=1024, dtype=float32) fidx = 0 for ai in range(a.shape[0]): sumv = float32(0) for i in range(gtx,sz2,gsize): sumv += a[ai,i//nc,i%nc] fshared[tx] = sumv cuda.syncthreads() sz = int32(512) while sz>0: if tx<sz: fshared[tx] += fshared[tx+sz] cuda.syncthreads() sz//=2 if tx==0: out[cuda.blockIdx.x + fidx] = fshared[0] fidx += cuda.gridDim.x
def cu_sums1(nme, member, vel, virial_potential, coll, nblocks):
    sm = cuda.shared.array(256, nb.float32)
    i = cuda.grid(1)
    tx = cuda.threadIdx.x

    temp = nb.float32(0.0)
    if i < nme:
        idx = member[i]
        vi = vel[idx]
        mi = vi[3]
        temp = mi * (vi[0] * vi[0] + vi[1] * vi[1] + vi[2] * vi[2])
    sm[tx] = temp
    cuda.syncthreads()

    offs = cuda.blockDim.x >> nb.int32(1)
    while offs > nb.int32(0):
        if tx < offs:
            sm[tx] += sm[tx + offs]
        offs >>= nb.int32(1)
        cuda.syncthreads()

    if tx == nb.int32(0):
        coll[cuda.blockIdx.x] = sm[0]
def calculate_forces(positions, weights, accelerations):
    """
    Calculate accelerations produced on all bodies by mutual gravitational forces.
    """
    sh_positions = cuda.shared.array((tile_size, 2), float32)
    sh_weights = cuda.shared.array(tile_size, float32)

    i = cuda.grid(1)
    axi = 0.0
    ayi = 0.0
    xi = positions[i, 0]
    yi = positions[i, 1]

    for j in range(0, len(weights), tile_size):
        index = (j // tile_size) * cuda.blockDim.x + cuda.threadIdx.x
        sh_index = cuda.threadIdx.x
        sh_positions[sh_index, 0] = positions[index, 0]
        sh_positions[sh_index, 1] = positions[index, 1]
        sh_weights[sh_index] = weights[index]
        cuda.syncthreads()
        axi, ayi = tile_calculation(xi, yi, axi, ayi, sh_positions, sh_weights)
        cuda.syncthreads()

    accelerations[i, 0] = axi
    accelerations[i, 1] = ayi
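# A hedged driver sketch for the tiled n-body kernel above. `tile_size`, the
# device function `tile_calculation`, and the @cuda.jit decoration are taken
# from context; the concrete value of tile_size and the requirement that the
# number of bodies be a multiple of it are assumptions made for illustration.
import numpy as np
from numba import cuda

tile_size = 64  # assumed; also used as the block size

def run_nbody_step(positions, weights):
    n = weights.shape[0]  # assumed to be a multiple of tile_size
    d_pos = cuda.to_device(np.asarray(positions, dtype=np.float32))
    d_w = cuda.to_device(np.asarray(weights, dtype=np.float32))
    d_acc = cuda.device_array((n, 2), dtype=np.float32)
    blocks = n // tile_size  # one thread per body
    calculate_forces[blocks, tile_size](d_pos, d_w, d_acc)
    return d_acc.copy_to_host()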
def soil_temperature_euler_forward_gpu(dSOILTEMPdt, SOILTEMP, LWFLXNET, SWFLXNET,
                                       SOILCP, SOILRHO, SOILDEPTH, dt):
    nx = SOILTEMP.shape[0]
    ny = SOILTEMP.shape[1]
    nzs = LWFLXNET.shape[2]
    i, j = cuda.grid(2)

    dSOILTEMPdt[i, j, 0] = 0.

    if i_radiation > 0:
        dSOILTEMPdt[i, j, 0] = (LWFLXNET[i, j, nzs - 1] + SWFLXNET[i, j, nzs - 1]) / \
                               (SOILCP[i, j] * SOILRHO[i, j] * SOILDEPTH[i, j])

    #if i_microphysics > 0:
    #    dSOILTEMPdt = dSOILTEMPdt - ( MIC.surf_evap_flx * MIC.lh_cond_water ) / \
    #                  (CF.SOILCP * CF.SOILRHO * CF.SOILDEPTH)

    SOILTEMP[i, j, 0] = SOILTEMP[i, j, 0] + dt * dSOILTEMPdt[i, j, 0]

    cuda.syncthreads()
def lbp_texture(arry, hist): # We have 32*32 threads per block A = cuda.shared.array(shape=(32, 32), dtype=int32) # H = cuda.shared.array(BIN_COUNT, dtype=int32) x, y = cuda.grid(2) ty = cuda.threadIdx.x tx = cuda.threadIdx.y A[ty, tx] = arry[x, y] cuda.syncthreads() threadCountX = A.shape[0] - 1 threadCountY = A.shape[1] - 1 # If within x range and y range then calculate the LBP discriptor along # with histogram value to specific bin # Other wise Ignore the Value if (ty > 0 and (threadCountX - ty) > 0) and (tx > 0 and (threadCountY - tx) > 0): # # You can do the Processing here. ^_^ code = 0 # We need to make sure that each value is accessable to each thread center = A[ty, tx] # Compiler optimization: By loop unrolling # turns out twice faster than rolled version for over # 16*16 window code |= (1 if A[ty - 1][tx - 1] > center else 0) << 7 code |= (1 if A[ty][tx - 1] > center else 0) << 6 code |= (1 if A[ty + 1][tx - 1] > center else 0) << 5 code |= (1 if A[ty + 1][tx] > center else 0) << 4 code |= (1 if A[ty + 1][tx + 1] > center else 0) << 3 code |= (1 if A[ty][tx + 1] > center else 0) << 2 code |= (1 if A[ty - 1][tx + 1] > center else 0) << 1 code |= (1 if A[ty - 1][tx - 1] > center else 0) << 0 # Since atomic add; adds value to the existing value # Need to figure out the fraction to be added in the previous value code = (code - center) A[ty, tx] = code cuda.syncthreads() # Fun It's Fun to have a visible LBP Texture # So, overriding that with the origional vale. val = A[ty, tx] cuda.atomic.add(arry, (x, y), val) cuda.syncthreads() # This Atomic Operation is equivalent to hist[code % 256] += 1 ind = code % BIN_COUNT cuda.atomic.add(hist, ind, 1)
def kernel_shared(dVARdt, VAR): sVAR = cuda.shared.array(shape=(shared_memory_size), dtype=float32) k, j, i = cuda.grid(3) si = cuda.threadIdx.z + 1 sj = cuda.threadIdx.y + 1 sk = cuda.threadIdx.x + 1 sVAR[sk, sj, si] = VAR[k, j, i] #cuda.syncthreads() if si == 1: sVAR[sk, sj, si - 1] = VAR[k, j, i - 1] if si == cuda.blockDim.z: sVAR[sk, sj, si + 1] = VAR[k, j, i + 1] #cuda.syncthreads() if sj == 1: sVAR[sk, sj - 1, si] = VAR[k, j - 1, i] if sj == cuda.blockDim.y: sVAR[sk, sj + 1, si] = VAR[k, j + 1, i] #cuda.syncthreads() if sk == 1: sVAR[sk - 1, sj, si] = VAR[k - 1, j, i] if sk == cuda.blockDim.x: sVAR[sk + 1, sj, si] = VAR[k + 1, j, i] cuda.syncthreads() if i >= nb and i < nx + nb and j >= nb and j < ny + nb: tmp = wp(0.) tmp += (sVAR[sk, sj + 1, si] - sVAR[sk, sj - 1, si]) tmp += (sVAR[sk, sj, si + 1] - sVAR[sk, sj, si - 1]) if k >= 1 and k < nz - 1: tmp += (sVAR[sk + 1, sj, si] - sVAR[sk - 1, sj, si]) dVARdt[k, j, i] = tmp
def get_pp_col_time_kernel(pp_dt, pos, vel, mask, N, radius):#, pp_dt_full, a_gpu, b_gpu, c_gpu): pp_dt_shr = cuda.shared.array(shape=(pp_brows, pp_bcols), dtype=nb_dtype) tx = cuda.threadIdx.x ty = cuda.threadIdx.y p = ty + cuda.blockIdx.y * cuda.blockDim.y q = tx + cuda.blockIdx.x * cuda.blockDim.x if ((p >= N) or (q >= N)): pp_dt_shr[ty,tx] = np.inf a = p b = q c = N else: a = 0.0 b = 0.0 c = 0.0 for d in range(dim): dx = pos[p,d] - pos[q,d] dv = vel[p,d] - vel[q,d] a += (dv * dv) b += (dx * dv * 2) c += (dx * dx) c -= (radius[p] + radius[q])**2 if ((mask[0]==p) & (mask[1]==q)): masked = True elif ((mask[0]==q) & (mask[1]==p)): masked = True else: masked = False pp_dt_shr[ty,tx] = solve_quadratic_gpu(a, b, c, masked) cuda.syncthreads() row_min_gpu(pp_dt_shr) pp_dt[p, cuda.blockIdx.x] = pp_dt_shr[ty, 0] cuda.syncthreads()
def convolutionRowsGPU(d_Dst, d_Src, c_Kernel, imageW, imageH, pitch): ROWS_BLOCKDIM_X = 16 ROWS_BLOCKDIM_Y = 4 ROWS_RESULT_STEPS = 8 ROWS_HALO_STEPS = 1 KERNEL_RADIUS = 8 #s_Data = cuda.shared.array(shape=(ROWS_BLOCKDIM_Y,(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X),dtype=float32) s_Data = cuda.shared.array(shape=(4,160),dtype=float32) #Offset to the left halo edge baseX = (cuda.blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + cuda.threadIdx.x baseY = cuda.blockIdx.y * ROWS_BLOCKDIM_Y + cuda.threadIdx.y #d_Src += baseY * pitch + baseX #d_Dst += baseY * pitch + baseX desvio = baseY * pitch + baseX #Load main data for i in xrange(ROWS_HALO_STEPS,ROWS_HALO_STEPS + ROWS_RESULT_STEPS): s_Data[cuda.threadIdx.y][cuda.threadIdx.x + i * ROWS_BLOCKDIM_X] = d_Src[desvio + (i * ROWS_BLOCKDIM_X)] #Load left halo for i in xrange(ROWS_HALO_STEPS): s_Data[cuda.threadIdx.y][cuda.threadIdx.x + i * ROWS_BLOCKDIM_X] = d_Src[desvio + (i * ROWS_BLOCKDIM_X)] if (baseX >= -i * ROWS_BLOCKDIM_X ) else 0 #Load right halo for i in xrange(ROWS_HALO_STEPS + ROWS_RESULT_STEPS,ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS): s_Data[cuda.threadIdx.y][cuda.threadIdx.x + i * ROWS_BLOCKDIM_X] = d_Src[desvio + (i * ROWS_BLOCKDIM_X)] if (imageW - baseX > i * ROWS_BLOCKDIM_X) else 0 #Compute and store results cuda.syncthreads() for i in xrange(ROWS_HALO_STEPS,ROWS_HALO_STEPS + ROWS_RESULT_STEPS): sum = 0.0 for j in xrange(-KERNEL_RADIUS,KERNEL_RADIUS+1): sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[cuda.threadIdx.y][cuda.threadIdx.x + i * ROWS_BLOCKDIM_X + j] d_Dst[desvio + (i * ROWS_BLOCKDIM_X)] = sum
def GridDecompPath(grid, start, goal, parents, h, block): x, y = cuda.grid(2) glb_x, glb_y = dim goal_x, goal_y = goal tx = cuda.threadIdx.x ty = cuda.threadIdx.y bpg = cuda.gridDim.x # blocks per grid if x < grid.shape[0] and y < grid.shape[1]: # do the search for as many times as number of tiles in the grid if passable(grid, (x, y)) and (x != goal_x or y != goal_y): # print(x, y) # initialize local arrays local_open = cuda.local.array(dim, cp.int32) local_closed = cuda.local.array(dim, cp.int32) local_cost = cuda.local.array(dim, cp.int32) local_g = cuda.local.array(dim, cp.int32) local_neighbors = cuda.local.array((8, 2), cp.int32) for i in range(glb_x): for j in range(glb_y): local_open[i, j] = UNEXPLORED local_closed[i, j] = UNEXPLORED local_cost[i, j] = 0 local_g[i, j] = 0 cuda.syncthreads() for i in range(8): local_neighbors[i, 0] = 0 local_neighbors[i, 1] = 0 cuda.syncthreads() # search(x, y, shared_planning_block, (block_x,block_y), goal, local_open, local_closed, parents, local_cost, local_g, shared_h, local_neighbors) search(x, y, grid, (x, y), goal, local_open, local_closed, parents[x, y], local_cost, local_g, h, local_neighbors, block)
def fast_matmul(a, b, c):
    sa = cuda.shared.array(shape=(TPB, TPB), dtype=float64)
    sb = cuda.shared.array(shape=(TPB, TPB), dtype=float64)

    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    # bpg = cuda.gridDim.x

    if x >= c.shape[0] and y >= c.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return

    tmp = 0
    for i in range(hid):
        sa[tx, ty] = a[x, ty + i * TPB]
        sb[tx, ty] = b[tx + i * TPB, y]
        cuda.syncthreads()
        for j in range(TPB):
            tmp += sa[tx, j] * sb[j, ty]
        cuda.syncthreads()

    c[x, y] = tmp
def get_geopotential(PHI, PHIVB, PVTF, PVTFVB, \ POTT, HSURF): nx = PHIVB.shape[0] - 2 ny = PHIVB.shape[1] - 2 nzs = PHIVB.shape[2] i, j, ks = cuda.grid(3) if i > 0 and i < nx + 1 and j > 0 and j < ny + 1: kiter = nzs - 1 if ks == kiter: PHIVB[i, j, ks] = HSURF[i, j] * con_g kiter = kiter - 1 cuda.syncthreads() while kiter >= 0: if ks == kiter: PHI [i,j,ks] = PHIVB[i,j,ks+1] - con_cp* \ ( POTT[i,j,ks] * ( PVTF [i,j,ks ] \ - PVTFVB[i,j,ks+1] ) ) PHIVB[i,j,ks] = PHI [i,j,ks ] - con_cp * \ ( POTT[i,j,ks] * ( PVTFVB[i,j,ks ] \ - PVTF [i,j,ks ] ) ) kiter = kiter - 1 cuda.syncthreads()
def kernel_calc_force_rec(q, out, m, S_m_cos_parts, S_m_sin_parts, S_m_cos_sum, S_m_sin_sum, dS_modul_sq, charges, f_rec_prefactor, coeff_S): i = cuda.grid(1) if i < n_particles: for j in range(m.shape[1]): for k in range(n_dim): S_m_cos_parts[i, j] += q[i, k] * m[k, j] S_m_sin_parts[i, j] += q[i, k] * m[k, j] S_m_cos_parts[i, j] = math.cos( 2 * math.pi * S_m_cos_parts[i, j]) * charges[i] S_m_sin_parts[i, j] = math.sin( 2 * math.pi * S_m_sin_parts[i, j]) * charges[i] S_m_cos_sum[j] += S_m_cos_parts[i, j] S_m_sin_sum[j] += S_m_sin_parts[i, j] cuda.syncthreads() for j in range(m.shape[1]): dS_modul_sq[i, j] = coeff_S[j] * ( 2. * S_m_cos_sum[j] * S_m_sin_parts[i, j] - 2. * S_m_sin_sum[j] * S_m_cos_parts[i, j]) for j in range(m.shape[1]): for k in range(n_dim): out[i, k] += f_rec_prefactor[i] * dS_modul_sq[i, j] * m[k, j]
def mm_shared(a, b, c):
    sum = 0

    # `a_cache` and `b_cache` are already correctly defined
    a_cache = cuda.shared.array(block_size, types.int32)
    b_cache = cuda.shared.array(block_size, types.int32)

    # TODO: use each thread to populate one element each in a_cache and b_cache
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x
    TPB = int(N)

    # integer division so the loop bound is a valid range argument
    for i in range(a.shape[1] // TPB):
        a_cache[tx, ty] = a[x, ty + i * TPB]
        b_cache[tx, ty] = b[tx + i * TPB, y]

        cuda.syncthreads()
        for j in range(TPB):  # a.shape[1]):
            # TODO: calculate the `sum` value correctly using values from the cache
            sum += a_cache[tx, j] * b_cache[j, ty]
        cuda.syncthreads()

    c[x, y] = sum
def rotate_nms_kernel(n_boxes, nms_overlap_thresh, dev_boxes, dev_mask): threadsPerBlock = 8 * 8 row_start = cuda.blockIdx.y col_start = cuda.blockIdx.x tx = cuda.threadIdx.x row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock) col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock) block_boxes = cuda.shared.array(shape=(64 * 6,), dtype=numba.float32) dev_box_idx = threadsPerBlock * col_start + tx if tx < col_size: block_boxes[tx * 6 + 0] = dev_boxes[dev_box_idx * 6 + 0] block_boxes[tx * 6 + 1] = dev_boxes[dev_box_idx * 6 + 1] block_boxes[tx * 6 + 2] = dev_boxes[dev_box_idx * 6 + 2] block_boxes[tx * 6 + 3] = dev_boxes[dev_box_idx * 6 + 3] block_boxes[tx * 6 + 4] = dev_boxes[dev_box_idx * 6 + 4] block_boxes[tx * 6 + 5] = dev_boxes[dev_box_idx * 6 + 5] cuda.syncthreads() if tx < row_size: cur_box_idx = threadsPerBlock * row_start + tx # cur_box = dev_boxes + cur_box_idx * 5; t = 0 start = 0 if row_start == col_start: start = tx + 1 for i in range(start, col_size): iou = devRotateIoU( dev_boxes[cur_box_idx * 6 : cur_box_idx * 6 + 5], block_boxes[i * 6 : i * 6 + 5], ) # print('iou', iou, cur_box_idx, i) if iou > nms_overlap_thresh: t |= 1 << i col_blocks = (n_boxes) // (threadsPerBlock) + ( (n_boxes) % (threadsPerBlock) > 0 ) dev_mask[cur_box_idx * col_blocks + col_start] = t
def calculate_forces(positions, weights, accelerations): """ Calculate accelerations produced on all bodies by mutual gravitational forces. """ sh_positions = cuda.shared.array((tile_size, 2), float32) sh_weights = cuda.shared.array(tile_size, float32) i = cuda.grid(1) axi = 0.0 ayi = 0.0 xi = positions[i,0] yi = positions[i,1] for j in range(0, len(weights), tile_size): index = (j // tile_size) * cuda.blockDim.x + cuda.threadIdx.x sh_index = cuda.threadIdx.x sh_positions[sh_index,0] = positions[index,0] sh_positions[sh_index,1] = positions[index,1] sh_weights[sh_index] = weights[index] cuda.syncthreads() axi, ayi = tile_calculation(xi, yi, axi, ayi, sh_positions, sh_weights) cuda.syncthreads() accelerations[i,0] = axi accelerations[i,1] = ayi
def BusquedaLocal_CUDA(nuevo_individuos, probabilidades, Aristmono, MatrizAdjacencia,
                       H, numNodos, tam_pobla, rng_states):
    bx = cuda.blockIdx.x
    thx = cuda.threadIdx.x
    id = bx * H + thx
    '''individuo = cuda.local.array(shape=(4, 52), dtype=int32)
    probabilidad = cuda.local.array(shape=4, dtype=float32)
    MatrizAdjCuda = cuda.local.array(shape=(52, 52), dtype=int32)
    if id < tam_pobla:
        for j in range(tam_pobla):
            individuo = nuevo_individuos[id]
            probabilidad = probabilidades[id]
            Aristas = Aristmono[id]
            MatrizAdjCuda = MatrizAdjacencia
            metropolis_gpu(MatrizAdjCuda, individuo, probabilidad, Aristas,
                           numColores, numNodos, rng_states, id)
            # the individual is returned after performing the local search
            cuda.syncthreads()
    '''
    metropolis_gpu(MatrizAdjacencia, nuevo_individuos[id], probabilidades[id],
                   Aristmono[id], numColores, numNodos, rng_states, id)
    # the individual is returned after performing the local search
    cuda.syncthreads()
def scan_sum(g_data, aux): temp = cuda.shared.array(shape=1, dtype=numba.i4) thid = cuda.threadIdx.x # thread id in block bid = cuda.blockIdx.x # block id if thid == 0: temp[0] = aux[bid] tgid = cuda.grid(1) # thread id in grid elid = tgid * 2 # each thread processes 2 elements n = g_data.size if elid >= n: return # synchronize to make sure value to sum is loaded in memory cuda.syncthreads() g_data[elid] += aux[bid] # do the sum if elid + 1 < n: g_data[elid + 1] += aux[bid]
def gaussian_lu_decomposition(A, L, size, i):
    """
    Performs Gaussian LU elimination.
    @param A Coefficient matrix A.
    @param L Matrix in which to store the multipliers.
    @param size Size of the coefficient matrix.
    @param i Integer representing the current column in which all threads are
             performing row operations.
    @return None
    """
    idx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    idy = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    index = idx * size + idy

    if idx < size and idy < size:
        if idx > i:
            mul = A[idx * size + i] / A[i * size + i]
            if idy >= i:
                A[index] -= A[i * size + idy] * mul
                if idy == i:
                    L[index] = mul
        elif idx == idy:
            L[index] = 1
    cuda.syncthreads()
def reducer(inp, out, nelem, ostride): tid = cuda.threadIdx.x i = cuda.blockIdx.x * (blocksize * 2) + tid gridSize = blocksize * 2 * cuda.gridDim.x sdata = cuda.shared.array(blocksize, dtype=typ) while i < nelem: sdata[tid] = binop(inp[i], inp[i + blocksize]) i += gridSize cuda.syncthreads() if blocksize >= 512: if tid < 256: sdata[tid] = binop(sdata[tid], sdata[tid + 256]) cuda.syncthreads() if blocksize >= 256: if tid < 128: sdata[tid] = binop(sdata[tid], sdata[tid + 128]) cuda.syncthreads() if blocksize >= 128: if tid < 64: sdata[tid] = binop(sdata[tid], sdata[tid + 64]) cuda.syncthreads() if tid < 32: if blocksize >= 64: sdata[tid] = binop(sdata[tid], sdata[tid + 32]) if blocksize >= 32: sdata[tid] = binop(sdata[tid], sdata[tid + 16]) if blocksize >= 16: sdata[tid] = binop(sdata[tid], sdata[tid + 8]) if blocksize >= 8: sdata[tid] = binop(sdata[tid], sdata[tid + 4]) if blocksize >= 4: sdata[tid] = binop(sdata[tid], sdata[tid + 2]) if blocksize >= 2: sdata[tid] = binop(sdata[tid], sdata[tid + 1]) if tid == 0: out[cuda.blockIdx.x * ostride] = sdata[0]
def jocabi_relax_core(A, Anew, error):
    err_sm = cuda.shared.array((tpb, tpb), dtype=f8)

    ty = cuda.threadIdx.x
    tx = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y

    n = A.shape[0]
    m = A.shape[1]

    i, j = cuda.grid(2)

    err_sm[ty, tx] = 0
    if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
        Anew[j, i] = 0.25 * (A[j, i + 1] + A[j, i - 1]
                             + A[j - 1, i] + A[j + 1, i])
        err_sm[ty, tx] = Anew[j, i] - A[j, i]

    cuda.syncthreads()

    # max-reduce err_sm vertically
    t = tpb // 2
    while t > 0:
        if ty < t:
            err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty + t, tx])
        t //= 2
        cuda.syncthreads()

    # max-reduce err_sm horizontally
    t = tpb // 2
    while t > 0:
        if tx < t and ty == 0:
            err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty, tx + t])
        t //= 2
        cuda.syncthreads()

    if tx == 0 and ty == 0:
        error[by, bx] = err_sm[0, 0]
def last_scan(g_data, aux, auxidx, elb, start_idx): """ Performs the Bleloch scan on last block, where size might be variable. g_data : array to perform scan on aux : where to store sum auxidx : where to store sum in aux array; if auxid == -1 it means that this is not part of a large array scan and sums should not be stored elb : number of elements of last block """ temp = cuda.shared.array(shape = 0, dtype = int32) thid = cuda.threadIdx.x # thread id in block tgid = cuda.grid(1) # thread id in grid bid = cuda.blockIdx.x # block id bsize = cuda.blockDim.x # load input into shared memory # if index is above number of elements in last block, # shared memory should be 0 idx1 = 2 * thid idx2 = 2 * thid +1 if idx1 < elb: temp[idx1] = g_data[start_idx + idx1] else: temp[idx1] = 0 if idx2 < elb: temp[idx2] = g_data[start_idx + idx2] else: temp[idx2] = 0 offset = 1 # build sum in place up the tree d = bsize # bsize is half the number of elements to process while d > 0: # if thid == 0: # from pdb import set_trace; set_trace() cuda.syncthreads() if thid < d: ai = offset * (2 * thid + 1) - 1 bi = offset * (2 * thid + 2) - 1 temp[bi] += temp[ai] offset <<= 1 # multipy by 2 d >>= 1 # divide by 2 # clear the last element if thid == 0: # the last element processed by this block is the size # of the block multiplied by 2 last_elem_id = bsize * 2 - 1 if auxidx != -1: #aux[auxidx] = temp[last_elem_id] aux[auxidx] = temp[last_elem_id] temp[last_elem_id] = 0 # traverse down tree and build scan d = 1 while d < bsize << 1: # same thing as before offset >>= 1 cuda.syncthreads() if thid < d: ai = offset * (2 * thid + 1) - 1 bi = offset * (2 * thid + 2) - 1 t = temp[ai] temp[ai] = temp[bi] temp[bi] += t d <<= 1 cuda.syncthreads() # write results to device memory, in global IDs if idx1 < elb: g_data[start_idx + idx1] = temp[idx1] if idx2 < elb: g_data[start_idx + idx2] = temp[idx2]
def advanced_scan(g_odata, g_idata, n, aux): """ Bleloch algorithm. receives auxiliary array to store the whole sum only works for array of max size 1024 adapted to Numba CUDA from [1] M. Harris, S. Sengupta, and J. D. Owens, \“Parallel Prefix Sum (Scan) with CUDA Mark,\” Gpu gems 3, no. April, pp. 1–24, 2007. """ temp = cuda.shared.array(shape = 0, dtype = numba.i4) thid = cuda.threadIdx.x # thread id in block tgid = cuda.grid(1) # thread id in grid bid = cuda.blockIdx.x # block id # load input into shared memory temp[2 * thid] = g_idata[2 * thid] temp[2 * thid + 1] = g_idata[2 * thid + 1] offset = 1 # build sum in place up the tree d = n / 2 while d > 0: cuda.syncthreads() if thid < d: ai = offset * (2 * thid + 1) - 1 bi = offset * (2 * thid + 2) - 1 temp[bi] += temp[ai] offset <<= 1 # multipy by 2 d >>= 1 # divide by 2 # clear the last element if thid == 0: temp[n - 1] = 0 # traverse down tree and build scan d = 1 while d < n: offset >>= 1 cuda.syncthreads() if thid < d: ai = offset * (2 * thid + 1) - 1 bi = offset * (2 * thid + 2) - 1 t = temp[ai] temp[ai] = temp[bi] temp[bi] += t d *= 2 cuda.syncthreads() # write results to device memory g_odata[2 * thid] = temp[2 * thid] g_odata[2 * thid + 1] = temp[2 * thid + 1]
def reducer(inp, out, nelem, ostride): tid = cuda.threadIdx.x i = cuda.blockIdx.x * (blocksize * 2) + tid gridSize = blocksize * 2 * cuda.gridDim.x # Blocks perform most of the reduction within shared memory, in the # sdata array sdata = cuda.shared.array(sdatasize, dtype=typ) # The first reduction operation is performed during the process of # loading the data from global memory, in order to reduce the number of # idle threads (See "Reduction #4: First Add During Load") while i < nelem: sdata[tid] = binop(inp[i], inp[i + blocksize]) i += gridSize # The following reduction steps rely on all values being loaded into # sdata; we need to synchronize in order to meet this condition cuda.syncthreads() # The following lines implement an unrolled loop that repeatedly reduces # the number of values by two (by performing the reduction operation) # until only a single value is left. This is done to reduce instruction # overhead (See the section "Instruction Bottleneck") if blocksize >= 512: if tid < 256: sdata[tid] = binop(sdata[tid], sdata[tid + 256]) cuda.syncthreads() if blocksize >= 256: if tid < 128: sdata[tid] = binop(sdata[tid], sdata[tid + 128]) cuda.syncthreads() if blocksize >= 128: if tid < 64: sdata[tid] = binop(sdata[tid], sdata[tid + 64]) cuda.syncthreads() # At this point only the first warp has any work to do - we perform a # check on the thread ID here so that we can avoid calling syncthreads # (operations are synchronous within a warp) and also to avoid checking # the thread ID at each iteration (See the section "Unrolling the Last # Warp) if tid < 32: if blocksize >= 64: sdata[tid] = binop(sdata[tid], sdata[tid + 32]) if blocksize >= 32: sdata[tid] = binop(sdata[tid], sdata[tid + 16]) if blocksize >= 16: sdata[tid] = binop(sdata[tid], sdata[tid + 8]) if blocksize >= 8: sdata[tid] = binop(sdata[tid], sdata[tid + 4]) if blocksize >= 4: sdata[tid] = binop(sdata[tid], sdata[tid + 2]) if blocksize >= 2: sdata[tid] = binop(sdata[tid], sdata[tid + 1]) # Write this block's partially reduced value into the vector of all # partially-reduced values. if tid == 0: out[cuda.blockIdx.x * ostride] = sdata[0]