def blocked_fw(A, n):
    """Run the three-phase blocked Floyd-Warshall kernels on an n x n matrix.

    The matrix is copied to the device one row at a time, the phase 1, 2 and
    3 kernels are launched once per stage, and the device matrix is copied
    back to the host. Progress messages and wall-clock timings are printed.

    Args:
        A: Host matrix, indexed as ``A[i]`` (a row) and ``A[0][0]``.
        n: Number of rows and columns of ``A``.

    Returns:
        The host copy of the device matrix after the last stage.
    """
    started = time.time()

    # One stage per tile of width TILE_WIDTH.
    # NOTE(review): the division truncates, so this presumably expects n to
    # be a multiple of TILE_WIDTH -- confirm against the kernels.
    stages = int(n / TILE_WIDTH)

    print('Copying memory to device...\n')
    # Allocate the device buffer first, then fill it row by row from the host.
    device_matrix = cuda.device_array((n, n), dtype=np.uint8)
    for row in range(n):
        device_matrix[row] = A[row]
        if (row + 1) % 1000 == 0:
            print(f'Copied {row+1}/{n} rows')

    uploaded = time.time()
    print(f'\nCopied memory to device. Took {uploaded - started}s')

    print('Starting GPU blocked Floyd-Warshall')
    print(f'Number of stages = {stages}')
    print(f'Tile width = {TILE_WIDTH}')
    print(f'Data type of A elements is {type(A[0][0])} \n')

    # Launch configuration: every phase uses TILE_WIDTH x TILE_WIDTH threads
    # per block; the grids differ per phase.
    threads = (TILE_WIDTH, TILE_WIDTH)
    grid_phase_1 = (1, 1)
    grid_phase_2 = (stages, 2)
    grid_phase_3 = (stages, stages)

    for stage in range(stages):
        offset = TILE_WIDTH * stage
        phase_1_kernel[grid_phase_1, threads](device_matrix, n, offset)
        phase_2_kernel[grid_phase_2, threads](device_matrix, n, stage, offset)
        phase_3_kernel[grid_phase_3, threads](device_matrix, n, stage, offset)
        # Wait for the stage to finish before reporting progress.
        cuda.synchronize()
        if (stage + 1) % UPDATE_INTERVAL == 0:
            print(f'Completed stage {stage+1}/{stages}')

    computed = time.time()
    print(f'\nFinished GPU blocked Floyd-Warshall. Took {computed - uploaded}s\n')

    print('Copying memory to host...')
    A = device_matrix.copy_to_host()
    downloaded = time.time()
    print(f'Copied memory to host. Took {downloaded - computed}s\n')
    return A
def main():
    """Time an element-wise vector addition on the GPU against NumPy.

    Builds two int32 vectors of 20,000,000 elements, launches ``gpu_add`` on
    device copies of them, times the equivalent ``np.add`` on the host, and
    prints "result correct!" when both results compare equal.
    """
    n = 20000000
    x = np.arange(n).astype(np.int32)
    y = 2 * x

    # Copy the inputs to the GPU.
    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)
    # Device buffer that receives the GPU result. No host-side placeholder
    # is allocated for the CPU result: np.add below creates its own array.
    gpu_result = cuda.device_array(n)

    threads_per_block = 1024
    # Ceiling division so that the grid covers all n elements.
    blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

    start = time()
    gpu_add[blocks_per_grid, threads_per_block](x_device, y_device, gpu_result, n)
    # Kernel launches are asynchronous; wait before reading the clock.
    cuda.synchronize()
    print("gpu vector add time " + str(time() - start))

    start = time()
    cpu_result = np.add(x, y)
    print("cpu vector add time " + str(time() - start))

    if np.array_equal(cpu_result, gpu_result.copy_to_host()):
        print("result correct!")
def test_issue_2393(self):
    """
    Test issue of warp misalign address due to nvvm not knowing the
    alignment(? but it should have taken the natural alignment of the type)

    The kernel declares two float64 shared arrays, reads from both and
    writes one value to global memory. The test makes no assertion; it only
    requires that the launch and the following synchronize complete.
    """
    num_weights = 2
    num_blocks = 48
    examples_per_block = 4
    threads_per_block = 1

    @cuda.jit
    def costs_func(d_block_costs):
        s_features = cuda.shared.array((examples_per_block, num_weights),
                                       float64)
        # Second shared array with an odd element count; the original
        # author marks this declaration as the trigger of the bug.
        s_initialcost = cuda.shared.array(7, float64)  # Bug

        threadIdx = cuda.threadIdx.x

        prediction = 0
        for j in range(num_weights):
            prediction += s_features[threadIdx, j]

        d_block_costs[0] = s_initialcost[0] + prediction

    block_costs = np.zeros(num_blocks, dtype=np.float64)
    d_block_costs = cuda.to_device(block_costs)

    costs_func[num_blocks, threads_per_block](d_block_costs)

    # Forces any asynchronous launch error to surface inside this test.
    cuda.synchronize()
def compute_inv_mass(objects, mask_events, mask_objects, use_cuda):
    """Fill per-event ``inv_mass`` and ``pt_total`` arrays via a worker kernel.

    Two float32 output arrays of length ``len(mask_events)`` are allocated
    with the worker's array library and handed, together with the object
    columns and masks, to either the CUDA kernel or the host kernel.

    Args:
        objects: Container exposing ``offsets``, ``pt``, ``eta``, ``phi``
            and ``mass``.
        mask_events: Event mask; its length sizes the outputs.
        mask_objects: Object mask passed through to the kernel.
        use_cuda: When true, launch the CUDA kernel and synchronize.

    Returns:
        Tuple ``(inv_mass, pt_total)``.
    """
    worker = get_worker_wrapper()
    array_lib, _backend = worker.NUMPY_LIB, worker.backend

    n_events = len(mask_events)
    inv_mass = array_lib.zeros(n_events, dtype=np.float32)
    pt_total = array_lib.zeros(n_events, dtype=np.float32)

    # Both kernels take exactly the same argument list.
    kernel_args = (
        objects.offsets,
        objects.pt,
        objects.eta,
        objects.phi,
        objects.mass,
        mask_events,
        mask_objects,
        inv_mass,
        pt_total,
    )

    if not use_cuda:
        worker.kernels.compute_inv_mass_kernel(*kernel_args)
        return inv_mass, pt_total

    worker.kernels.compute_inv_mass_cudakernel[32, 1024](*kernel_args)
    cuda.synchronize()
    return inv_mass, pt_total
def peaks(image, return_numpy=True):
    """
    Detect all peaks from a full SLI measurement.

    Peaks are not filtered in any way. To keep only significant peaks,
    filter the result afterwards using the prominence as a threshold.

    Args:

        image: Complete SLI measurement image stack as a 2D/3D Numpy array

        return_numpy: If true, the mask is copied to the host and returned
        as a Numpy array; otherwise a CuPy array is returned.

    Returns:

        2D/3D boolean image masking the peaks with `True`
    """
    device_image = cupy.array(image, dtype='float32')
    peak_flags = cupy.zeros(device_image.shape, dtype='int8')

    grid_dim, block_dim = prepare_kernel_execution(device_image)
    _peaks[grid_dim, block_dim](device_image, peak_flags)
    cuda.synchronize()

    if not return_numpy:
        return peak_flags.astype('bool')

    host_flags = cupy.asnumpy(peak_flags)
    # Drop the device buffer reference before building the boolean copy.
    del peak_flags
    return host_flags.astype('bool')
def _lombscargle(x, y, freqs, pgram, y_dot):
    """Launch the Lomb-Scargle CUDA kernel that matches ``pgram.dtype``.

    A kernel is compiled on first use for each supported dtype and stored
    in ``_kernel_cache`` under ``str(numba_type)``; later calls reuse it.
    The register count of a freshly compiled kernel is printed.

    Args:
        x, y, freqs, y_dot: Passed through to the kernel unchanged.
        pgram: Output array; its dtype ("float32" or "float64") selects
            the kernel variant.

    Raises:
        ValueError: If ``pgram.dtype`` is neither float32 nor float64.
    """
    if pgram.dtype == "float32":
        numba_type = float32
    elif pgram.dtype == "float64":
        numba_type = float64
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``numba_type``.
        raise ValueError(
            "pgram must have dtype float32 or float64, got {}".format(
                pgram.dtype
            )
        )

    cache_key = str(numba_type)
    if cache_key in _kernel_cache:
        kernel = _kernel_cache[cache_key]
    else:
        sig = _numba_lombscargle_signature(numba_type)
        if pgram.dtype == "float32":
            kernel = _kernel_cache[cache_key] = cuda.jit(
                sig, fastmath=True, max_registers=32
            )(_numba_lombscargle_32)
        else:
            kernel = _kernel_cache[cache_key] = cuda.jit(
                sig, fastmath=True, max_registers=64
            )(_numba_lombscargle_64)
        print("Registers", kernel._func.get().attrs.regs)

    # Size the grid from the number of multiprocessors on the current device.
    device_id = cp.cuda.Device()
    numSM = device_id.attributes["MultiProcessorCount"]
    threadsperblock = (128,)
    blockspergrid = (numSM * 20,)

    kernel[blockspergrid, threadsperblock](x, y, freqs, pgram, y_dot)
    cuda.synchronize()
def neighbour_list(self):
    """Build the neighbour list on ``self.gpu``, growing its width as needed.

    When the frame's particle count differs from ``self.n`` the launch grid
    and the device buffers are re-created. The build kernel is then re-run
    until the maximum count it reports in ``d_n_max`` fits in ``n_guess``
    columns. Finally ``self.n`` is set to the frame's particle count.
    """
    with cuda.gpus[self.gpu]:
        n_particles = self.frame.n
        if self.n != n_particles:
            self.bpg = int(n_particles // self.tpb + 1)
            self.d_nl = cuda.device_array((n_particles, self.n_guess),
                                          dtype=np.int32)
            self.d_nc = cuda.device_array((n_particles, ), dtype=np.int32)

        while True:
            # Reset the per-particle counters before every build attempt.
            cu_set_to_int[self.bpg, self.tpb](self.d_nc, 0)
            self.cu_nlist[self.bpg, self.tpb](
                self.frame.d_x,
                self.frame.d_box,
                self.frame.r_cut2,
                self.clist.d_cell_map,
                self.clist.d_cell_list,
                self.clist.d_cell_counts,
                self.clist.d_cells,
                self.d_nl,
                self.d_nc,
                self.d_n_max,
                self.contain_self)
            reported_max = self.d_n_max.copy_to_host()
            cuda.synchronize()

            if not (reported_max[0] > self.n_guess):
                break

            # Too narrow: widen to the next multiple of 8 strictly above the
            # reported maximum, then rebuild with a fresh buffer.
            needed = reported_max[0]
            self.n_guess = needed + 8 - (needed & 7)
            self.d_nl = cuda.device_array((self.frame.n, self.n_guess),
                                          dtype=np.int32)
        self.n = self.frame.n
def select_muons_opposite_sign(muons, in_mask):
    """Run the opposite-sign muon selection kernel and return its mask.

    The output mask starts as the inverse of ``muons.make_mask()`` and is
    handed to ``select_opposite_sign_muons_cudakernel`` together with the
    muon charges, offsets and ``in_mask``.

    Returns:
        The output mask after the kernel has finished.
    """
    selected = cupy.invert(muons.make_mask())
    launch = select_opposite_sign_muons_cudakernel[32, 1024]
    launch(muons.charge, muons.offsets, in_mask, selected)
    cuda.synchronize()
    return selected
def get_in_offsets(content, offsets, indices, mask_rows, mask_content):
    """Run ``get_in_offsets_cudakernel`` and return its per-row output.

    The output has one entry per offset interval (``len(offsets) - 1``) and
    the dtype of ``content``. It is zero-initialised, so rows the kernel
    does not write stay 0. A -999 sentinel was considered by the original
    author to keep histograms from being filled with 0 for missing objects,
    but it is not used.
    """
    n_rows = len(offsets) - 1
    result = cupy.zeros(n_rows, dtype=content.dtype)
    launch = get_in_offsets_cudakernel[32, 1024]
    launch(content, offsets, indices, mask_rows, mask_content, result)
    cuda.synchronize()
    return result
def benchmark_argmin():
    """Print wall-clock timings of ``cpu_argmin`` and the ``gpu_argmin`` kernel.

    Both implementations run once untimed as a warm-up, then once more with
    timing around each call.
    """
    SIZE_LG = 1000000
    SIZE_SM = 100
    DIM = 100

    large = np.random.rand(SIZE_LG, DIM).astype(np.float32)
    small = np.random.rand(SIZE_SM, DIM).astype(np.float32)
    indices = np.zeros(SIZE_LG, dtype=np.int64)

    large_gpu = cuda.to_device(large)
    small_gpu = cuda.to_device(small)
    indices_gpu = cuda.to_device(indices)

    thread_in_batch, batches = 128, 64

    # Warm-up pass; its duration is not reported.
    cpu_argmin(large, small)
    gpu_argmin[batches, thread_in_batch](large_gpu, small_gpu, indices_gpu)
    cuda.synchronize()

    began = time.time()
    cpu_argmin(large, small)
    print("CPU time", time.time() - began)

    began = time.time()
    gpu_argmin[batches, thread_in_batch](large_gpu, small_gpu, indices_gpu)
    cuda.synchronize()
    print("GPU time", time.time() - began)
def __setitem__(self, key, val):
    """Assign ``val`` to the variable named ``key``.

    On the CPU the matching row of ``_data`` is overwritten. When
    ``profile.run_on_gpu()`` is true, a scalar or vector setter kernel is
    launched on the CUDA copy of the data instead.

    Raises:
        KeyError: If ``key`` is one of the protected names ``_data``,
            ``_var2idx`` or ``_idx2var``, or is not a known variable.
        ValueError: On the GPU path, if a non-scalar ``val`` has a first
            dimension different from ``data.shape[1]``.
    """
    if key not in self._vars:
        if key in ['_data', '_var2idx', '_idx2var']:
            raise KeyError(f'"{key}" cannot be modified.')
        raise KeyError(f'"{key}" is not defined in {type(self).__name__}, '
                       f'only finds "{str(self._keys)}".')

    data = self.__getitem__('_data')
    _var2idx = self.__getitem__('_var2idx')
    idx = _var2idx[key]

    if not profile.run_on_gpu():
        # CPU path: plain row assignment.
        data[idx] = val
        return

    gpu_data = self.get_cuda_data()

    # One thread per column, split into blocks once the per-block thread
    # limit is exceeded.
    n_cols = data.shape[1]
    if n_cols <= profile._num_thread_gpu:
        num_thread, num_block = n_cols, 1
    else:
        num_thread = profile._num_thread_gpu
        num_block = math.ceil(n_cols / profile._num_thread_gpu)

    if np.isscalar(val):
        gpu_set_scalar_val[num_block, num_thread](gpu_data, val, idx)
    else:
        if val.shape[0] != data.shape[1]:
            raise ValueError(
                f'Wrong value dimension {val.shape[0]} != {data.shape[1]}'
            )
        gpu_set_vector_val[num_block, num_thread](gpu_data, val, idx)
    cuda.synchronize()
def main():
    """Time the ``matmul`` and ``matmul_shared_memory`` kernels.

    Multiplies a random [M x N] matrix by a random [N x P] matrix on the
    GPU with each kernel in turn, printing the elapsed wall-clock time of
    each launch, and finally copies the product back to the host.
    """
    # Matrix dimensions.
    M, N, P = 6000, 4800, 4000

    A = np.random.random((M, N))  # random [M x N] matrix
    B = np.random.random((N, P))  # random [N x P] matrix

    A_device = cuda.to_device(A)
    B_device = cuda.to_device(B)
    C_device = cuda.device_array((M, P))  # [M x P] result matrix

    # Launch configuration: square thread blocks, with enough blocks along
    # each axis to cover the rows of A and the columns of B.
    threads_per_block = (BLOCK_SIZE, BLOCK_SIZE)
    blocks_per_grid = (
        int(math.ceil(A.shape[0] / BLOCK_SIZE)),
        int(math.ceil(B.shape[1] / BLOCK_SIZE)),
    )

    runs = (
        ("matmul time :", matmul),
        ("matmul with shared memory time :", matmul_shared_memory),
    )
    for label, kernel in runs:
        start = time()
        kernel[blocks_per_grid, threads_per_block](A_device, B_device, C_device)
        cuda.synchronize()
        print(label + str(time() - start))

    C = C_device.copy_to_host()
def process(self, neutrons):
    """
    Propagate a buffer of particles through this guide.
    Adjusts the buffer to include only the particles that exit,
    at the moment of exit.

    The buffer is viewed as a 2-D array with ``ndblsperneutron`` columns,
    processed in place by the ``guide_process`` kernel, and then reduced to
    the rows whose column 9 (treated as the weight) is positive.

    Parameters:
        neutrons: a buffer containing the particles
    """
    (entrance_width, entrance_height, R0, Qc, alpha, m, W) = self.nature

    neutron_array = neutrons_as_npyarr(neutrons)
    neutron_array.shape = -1, ndblsperneutron

    # One thread per particle.
    threads_per_block = 256
    number_of_blocks = ceil(len(neutrons) / threads_per_block)
    guide_process[number_of_blocks, threads_per_block](
        entrance_width, entrance_height, R0, Qc, alpha, m, W,
        self.sides, neutron_array)
    cuda.synchronize()

    # Keep only particles with a positive weight. A single vectorised
    # comparison replaces the former per-element Python lambda.
    mask = neutron_array.T[9] > 0

    neutrons.resize(count_nonzero(mask), neutrons[0])
    neutrons.from_npyarr(neutron_array[mask])
def neighbour_list(self):
    """Build the neighbour list on ``self.gpu``, widening it until it fits.

    The build kernel is re-run until the maximum count it reports (copied
    into ``self.p_n_max``) no longer exceeds ``self.n_guess``. Every time
    it does, ``n_guess`` grows and ``d_nl`` is re-allocated.
    """
    with cuda.gpus[self.gpu]:
        while True:
            # Reset the per-particle counters before every build attempt.
            cu_set_to_int[self.bpg, self.tpb](self.d_nc, 0)
            self.cu_nlist[self.bpg, self.tpb](
                self.system.d_x,
                self.d_last_x,
                self.system.d_box,
                self.r_cut2,
                self.clist.d_cell_map,
                self.clist.d_cell_list,
                self.clist.d_cell_counts,
                self.clist.d_cells,
                self.d_nl,
                self.d_nc,
                self.d_n_max,
                self.d_situation)
            self.d_n_max.copy_to_host(self.p_n_max)
            cuda.synchronize()

            if not (self.p_n_max[0] > self.n_guess):
                break

            # Too narrow: widen to the next multiple of 8 strictly above the
            # reported maximum and retry with a fresh buffer.
            needed = self.p_n_max[0]
            self.n_guess = needed + 8 - (needed & 7)
            self.d_nl = cuda.device_array((self.system.N, self.n_guess),
                                          dtype=np.int32)
def float_2_rgb(data, cmap, limits):
    """
    Convert a single-channel image to an (H, W, 3) uint8 image on the GPU.

    data is a single channel image as a NumPy array. cmap and the two
    entries of limits are passed to the k_convert_float_2_rgb kernel.
    Returns the host copy of the output image.
    """
    assert (data.ndim == 2)

    height, width = data.shape

    # Upload the inputs and a zero-filled output image.
    dData = cuda.to_device(data)
    dCMap = cuda.to_device(cmap)
    dImg = cuda.to_device(np.zeros((height, width, 3), dtype=np.uint8))

    # Launch geometry: square thread blocks of side CUDA_THREADS.
    # NOTE(review): the grid is floor-divided, so when a dimension is not a
    # multiple of CUDA_THREADS the trailing pixels are presumably left at
    # zero -- confirm against the kernel before changing.
    CUDA_THREADS = 16
    grid = [int(np.floor(width / CUDA_THREADS)),
            int(np.floor(height / CUDA_THREADS)),
            1]
    block = [CUDA_THREADS, CUDA_THREADS, 1]

    cuda.synchronize()
    k_convert_float_2_rgb[grid, block](dData, dCMap, limits[0], limits[1], dImg)
    cuda.synchronize()

    return dImg.copy_to_host()
def calc_quality_metrics(self, archive_d_qual, observed_mean_arr_qual, rows_, rows_curr_gpu):
    """Summarise per-row archive sums and observed means.

    A kernel sums the first ``NEIGHBOURS_ARCHIVE_SIZE / 2`` columns of each
    archive row into an int32 vector of length ``rows_``. Rows at or beyond
    ``rows_curr_gpu[0]`` are skipped and keep their initial 0.

    Returns:
        Tuple ``(mean, std)`` of the row sums followed by ``(mean, std)``
        of the first ``rows_`` observed means.
    """
    n_blocks = min([rows_, 1000])
    n_threads = int(rows_ / n_blocks) + 1
    row_sums_device = cuda.to_device(np.zeros(shape=rows_, dtype=np.int32))

    @cuda.jit
    def sum_rows_kernel(archive_d, qual_sum, rows_this_curr):
        row = cuda.grid(1)
        # One thread per row; surplus threads do nothing.
        if row >= rows_this_curr[0]:
            return
        total = 0
        for col in range(int(NEIGHBOURS_ARCHIVE_SIZE / 2)):
            total += archive_d[row, col]
        qual_sum[row] = total

    sum_rows_kernel[n_blocks, n_threads](archive_d_qual, row_sums_device,
                                         rows_curr_gpu)
    cuda.synchronize()

    row_sums = row_sums_device.copy_to_host()
    observed = observed_mean_arr_qual.copy_to_host()[0:rows_]
    return (np.mean(row_sums), np.std(row_sums),
            np.mean(observed), np.std(observed))
def main():
    """Launch ``increment_by_one`` on a 10-element array and print it."""
    arr = np.arange(0, 10, 1)
    threadsperblock = 32
    # Ceiling division: the smallest number of blocks whose threads cover
    # every element. The divisor was missing before, which launched
    # arr.size + 31 blocks instead of 1.
    blockspergrid = (arr.size + (threadsperblock - 1)) // threadsperblock
    increment_by_one[blockspergrid, threadsperblock](arr)
    cuda.synchronize()
    print(arr)
def main():
    """Time an element-wise vector addition on the GPU against NumPy.

    Builds two int32 vectors of 20,000,000 elements, launches ``gpu_add`` on
    device copies of them, times the equivalent ``np.add`` on the host, and
    prints "result correct!" when both results compare equal.
    """
    n = 20000000
    x = np.arange(n).astype(np.int32)
    y = 2 * x

    # Copy the data to the device.
    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)
    # Allocate space on the GPU for the result of the GPU computation. No
    # host-side placeholder is allocated for the CPU result: np.add below
    # creates its own array.
    gpu_result = cuda.device_array(n)

    threads_per_block = 1024
    blocks_per_grid = math.ceil(n / threads_per_block)

    start = time()
    gpu_add[blocks_per_grid, threads_per_block](x_device, y_device, gpu_result, n)
    # Kernel launches are asynchronous; wait before reading the clock.
    cuda.synchronize()
    print("gpu vector add time " + str(time() - start))

    start = time()
    cpu_result = np.add(x, y)
    print("cpu vector add time " + str(time() - start))

    if np.array_equal(cpu_result, gpu_result.copy_to_host()):
        print("result correct!")
def cdist_dtw_cuda(x1, x2= None, global_constraint=0, sakoe_chiba_radius=None, itakura_max_slope=None):
    """Fill a distance matrix with the ``cdist_dtw`` kernel and symmetrise it.

    The constraint mask comes from ``compute_mask`` on the first series of
    each dataset. The kernel output is copied back to the host and
    ``matrix + matrix.T`` is returned.

    NOTE(review): when ``x2`` is given it is uploaded but never passed to
    the kernel, and the final ``+ .T`` needs a square matrix. The two-dataset
    path looks unfinished -- confirm before relying on it.
    """
    global SH

    x1 = to_time_series_dataset(x1)
    if x2 is None:
        columns_from = x1
    else:
        x2 = to_time_series_dataset(x2)
        columns_from = x2

    mask = compute_mask(x1[0], columns_from[0],
                        global_constraint=global_constraint,
                        sakoe_chiba_radius=sakoe_chiba_radius,
                        itakura_max_slope=itakura_max_slope)
    matrix = numpy.zeros((len(x1), len(columns_from)), dtype=numpy.float64)
    if x2 is not None:
        x2 = cuda.to_device(x2)

    x1 = cuda.to_device(x1)
    matrix = cuda.to_device(matrix)
    mask = cuda.to_device(mask)

    # Publish the series length + 1 as the module-level constant SH. The
    # kernel's cuda.local.array() call needs a global constant for its
    # size, so this is set before the launch.
    SH = int(x1.shape[1] + 1)

    threadsperblock = (16, 16)
    blockspergrid = (
        math.ceil(matrix.shape[0] / threadsperblock[0]),
        math.ceil(matrix.shape[1] / threadsperblock[1]),
    )

    cdist_dtw[blockspergrid, threadsperblock](x1, matrix, mask)
    cuda.synchronize()

    host_matrix = numpy.asarray(matrix)
    return host_matrix + host_matrix.T
def update(self):
    """Rebuild the cell list on ``self.gpu``, widening it until it fits.

    The build kernel is re-run until the maximum cell occupancy it reports
    no longer exceeds ``self.cell_guess``. Every time it does, the guess
    grows and ``d_cell_list`` is re-allocated.

    Raises:
        ValueError: If the kernel reports a particle outside the box; the
            message carries the particle index and the reported values.
    """
    with cuda.gpus[self.gpu]:
        while True:
            # -1 means "no particle out of box" on the host side.
            self.p_out_of_box[0] = -1
            cu_set_to_int[self.bpg_cell, self.tpb](self.d_cell_counts, 0)
            self.cu_cell_list[self.bpg, self.tpb](
                self.system.d_x,
                self.system.d_box,
                self.d_ibox,
                self.d_cell_list,
                self.d_cell_counts,
                self.d_cells,
                self.d_cell_max,
                self.d_out_of_box)
            self.d_cell_max.copy_to_host(self.p_cell_max)
            self.d_out_of_box.copy_to_host(self.p_out_of_box)
            cuda.synchronize()

            if self.p_out_of_box[0] != -1:
                err_coor = ''.join(
                    "{0: .4f} ".format(value)
                    for value in self.p_out_of_box[1:])
                raise ValueError("Error, particle %d %s is out of box!" %
                                 (int(self.p_out_of_box[0]), err_coor))

            if not (self.p_cell_max[0] > self.cell_guess):
                break

            # Too narrow: widen to the next multiple of 8 strictly above the
            # reported maximum and retry with a fresh buffer.
            needed = self.p_cell_max[0]
            self.cell_guess = needed + 8 - (needed & 7)
            self.d_cell_list = cuda.device_array(
                (self.n_cell, self.cell_guess), dtype=np.int32)
def main():
    """Time ``gpu_add`` with host arrays, with device arrays, and ``np.add``.

    The first launch passes host NumPy arrays straight to the kernel. The
    second uses arrays already copied to the device. Each timing is printed,
    and "result correct!" is printed when the device-array result equals the
    NumPy result.
    """
    n = 20000000
    x = np.arange(n).astype(np.int32)
    y = 2 * x

    # Copy the data to the device.
    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)
    # Allocate space on the GPU for the result of the GPU computation.
    # (This illustrates why buffer sizes matter when estimating GPU memory.)
    gpu_result = cuda.device_array(n)
    # Host output buffer for the launch that takes host arrays. No
    # placeholder is allocated for the CPU result: np.add below creates its
    # own array.
    gpu_result_cpu = np.empty(n)

    threads_per_block = 1024
    blocks_per_grid = math.ceil(n / threads_per_block)

    start = time()
    gpu_add[blocks_per_grid, threads_per_block](x, y, gpu_result_cpu, n)
    cuda.synchronize()
    print("gpu(non-optmi) vector add time " + str(time() - start))

    start = time()
    # The GPU has start-up cost; later launches run faster.
    gpu_add[blocks_per_grid, threads_per_block](x_device, y_device, gpu_result, n)
    cuda.synchronize()
    print("gpu vector add time " + str(time() - start))

    start = time()
    # The CPU has no comparable start-up cost.
    cpu_result = np.add(x, y)
    print("cpu vector add time " + str(time() - start))

    if np.array_equal(cpu_result, gpu_result.copy_to_host()):
        print("result correct!")
def histogram_from_vector(data, weights, bins, mask=None):
    """Histogram ``data`` with ``weights`` on the GPU.

    Each of the 64 thread blocks gets its own row in two
    ``(nblocks, len(bins) - 1)`` float32 accumulators; the rows are summed
    after the fill kernel has run. With empty ``data`` no kernel is launched
    and the histograms are all zero.

    Args:
        data, weights, bins: Arrays of dtype float32, int32 or int8;
            ``data`` and ``weights`` must have equal length.
        mask: Optional mask with the same length as ``data``. When given,
            the masked fill kernel is used.

    Returns:
        NumPy copies of ``(out_w, out_w2, bins)``. ``out_w2`` is presumably
        the sum of squared weights -- confirm against the kernels.
    """
    assert len(data) == len(weights)

    allowed_dtypes = [cupy.float32, cupy.int32, cupy.int8]
    for column in (data, weights, bins):
        assert column.dtype in allowed_dtypes

    # Per-block partial histograms.
    nblocks, nthreads = 64, 256
    partial_shape = (nblocks, len(bins) - 1)
    out_w = cupy.zeros(partial_shape, dtype=cupy.float32)
    out_w2 = cupy.zeros(partial_shape, dtype=cupy.float32)

    has_data = len(data) > 0
    if has_data and mask is None:
        fill_histogram[nblocks, nthreads](data, weights, bins, out_w, out_w2)
    elif has_data:
        assert len(data) == len(mask)
        fill_histogram_masked[nblocks, nthreads](data, weights, bins, mask,
                                                 out_w, out_w2)
    cuda.synchronize()

    # Collapse the per-block rows into the final histograms.
    total_w = out_w.sum(axis=0)
    total_w2 = out_w2.sum(axis=0)
    return cupy.asnumpy(total_w), cupy.asnumpy(total_w2), cupy.asnumpy(bins)
def guided_filter_3(mask, img):
    """
    Run the horizontal and then the vertical filter kernel on mask and img.

    mask: A 1-channel mask image, numpy array. dtype == np.uint8. Two-value mask.
    img: A 1-channel image, numpy array.

    Both arrays are uploaded, filtered on the device and copied back.
    Returns the pair (mask, img) as host arrays.
    """
    # Both inputs must have the same height and width.
    assert (mask.shape[0] == img.shape[0])
    assert (mask.shape[1] == img.shape[1])

    height, width = mask.shape[0], mask.shape[1]

    dMask = cuda.to_device(mask)
    dImg = cuda.to_device(img)

    # The horizontal pass spreads its threads along y, the vertical pass
    # along x; each uses blocks of 16 threads.
    row_grid, row_block = [1, int(height / 16) + 1, 1], [1, 16, 1]
    col_grid, col_block = [int(width / 16) + 1, 1, 1], [16, 1, 1]

    cuda.synchronize()
    k_filter_horizontal[row_grid, row_block](dMask, dImg)
    k_filter_vertical[col_grid, col_block](dMask, dImg)
    cuda.synchronize()

    return dMask.copy_to_host(), dImg.copy_to_host()
def main():
    """Time 5000 five-point-stencil update steps on an n x n grid on the GPU.

    The field is stored as a flat array of length ``n * n`` indexed as
    ``i + n * j``. Two device buffers are ping-ponged, so each loop
    iteration performs two steps. The elapsed time, including the transfers
    to and from the device, is printed.

    Based on https://habr.com/post/317328/
    """
    from numba import cuda
    import numpy as np
    from time import time

    n = 512
    blockdim = 16, 16
    griddim = int(n / blockdim[0]), int(n / blockdim[1])

    L = 1.
    h = L / n
    dt = 0.1 * h ** 2
    nstp = 5000

    @cuda.jit("void(float64[:], float64[:])")
    def nextstp_gpu(u0, u):
        i, j = cuda.grid(2)

        u00 = u0[i + n * j]
        # Neighbours outside the grid count as 0, except beyond the last
        # j row, where the value is 1.
        if i > 0:
            uim1 = u0[i - 1 + n * j]
        else:
            uim1 = 0.
        if i < n - 1:
            uip1 = u0[i + 1 + n * j]
        else:
            uip1 = 0.
        if j > 0:
            ujm1 = u0[i + n * (j - 1)]
        else:
            ujm1 = 0.
        if j < n - 1:
            ujp1 = u0[i + n * (j + 1)]
        else:
            ujp1 = 1.

        d2x = (uim1 - 2. * u00 + uip1)
        d2y = (ujm1 - 2. * u00 + ujp1)
        u[i + n * j] = u00 + (dt / h / h) * (d2x + d2y)

    u0 = np.full(n * n, 0., dtype=np.float64)
    u = np.full(n * n, 0., dtype=np.float64)

    st = time()

    d_u0 = cuda.to_device(u0)
    d_u = cuda.to_device(u)
    # Builtin range replaces an ``xrange`` that used to be imported from a
    # private NumPy test module.
    for _ in range(nstp // 2):
        nextstp_gpu[griddim, blockdim](d_u0, d_u)
        nextstp_gpu[griddim, blockdim](d_u, d_u0)

    cuda.synchronize()
    u0 = d_u0.copy_to_host()
    print('time on GPU = ', time() - st)
def show(self):
    """Copy the cell-list and neighbour-list device arrays to the host.

    Returns:
        Tuple ``(cell_list, cell_counts, cell_map, nl, nc)`` of host arrays.
    """
    clist = self.clist
    host_cell_list = clist.d_cell_list.copy_to_host()
    host_cell_map = clist.d_cell_map.copy_to_host()
    host_cell_counts = clist.d_cell_counts.copy_to_host()
    host_nl = self.d_nl.copy_to_host()
    host_nc = self.d_nc.copy_to_host()
    cuda.synchronize()
    return host_cell_list, host_cell_counts, host_cell_map, host_nl, host_nc
def test_print_array(self):
    """
    Eyeballing required

    Compiles ``cuprintary`` for a 1-D float32 array and launches it with
    2 blocks of 5 threads on a 10-element array. Nothing is asserted; the
    output has to be checked by eye.
    """
    kernel = cuda.jit('void(float32[:])')(cuprintary)
    values = numpy.arange(10, dtype=numpy.float32)
    kernel[2, 5](values)
    cuda.synchronize()
def sum_in_offsets(struct, content, mask_rows, mask_content, dtype=None):
    """Run ``sum_in_offsets_cudakernel`` and return its per-row output.

    Args:
        struct: Object exposing ``offsets``; the output has
            ``len(struct.offsets) - 1`` entries.
        content: Values handed to the kernel.
        mask_rows, mask_content: Masks passed through to the kernel.
        dtype: Output dtype; defaults to ``content.dtype`` when ``None``.

    Returns:
        A zero-initialised cupy array filled in by the kernel.
    """
    # Compare against None explicitly: a plain ``np.dtype`` instance has
    # length 0 and is therefore falsy, so a truthiness test would silently
    # discard a dtype the caller asked for.
    if dtype is None:
        dtype = content.dtype
    sum_offsets = cupy.zeros(len(struct.offsets) - 1, dtype=dtype)
    sum_in_offsets_cudakernel[32, 1024](
        content, struct.offsets, mask_rows, mask_content, sum_offsets)
    cuda.synchronize()
    return sum_offsets
def test_print_array(self):
    """
    Eyeballing required

    Compiles ``cuprintary`` for a 1-D float32 array and launches it with
    2 blocks of 5 threads on a 10-element array. Nothing is asserted; the
    output has to be checked by eye.
    """
    kernel = cuda.jit('void(float32[:])')(cuprintary)
    values = np.arange(10, dtype=np.float32)
    kernel[2, 5](values)
    cuda.synchronize()
def searchsorted(arr, vals, side="right"):
    """Run the left or right searchsorted kernel and return its int32 output.

    Args:
        arr: Array handed to the kernel as the search target.
        vals: Values to look up; the result has the same shape.
        side: ``"right"`` (default) or ``"left"``.

    Returns:
        A cupy int32 array shaped like ``vals``.

    Raises:
        ValueError: If ``side`` is neither ``"left"`` nor ``"right"``.
            Previously such a call silently returned all zeros.
    """
    # Validate first so that a bad argument costs no device allocation.
    if side not in ("left", "right"):
        raise ValueError(
            "side must be 'left' or 'right', got {!r}".format(side))

    ret = cupy.zeros_like(vals, dtype=cupy.int32)
    if side == "right":
        searchsorted_kernel_right[32, 1024](vals, arr, ret)
    else:
        # NOTE(review): the argument order differs from the right-hand
        # kernel (ret before arr) -- confirm against the kernel signature.
        searchsorted_kernel_left[32, 1024](vals, ret, arr)
    cuda.synchronize()
    return ret
def min_in_offsets(offsets, content, mask_rows=None, mask_content=None):
    """Run ``min_in_offsets_cudakernel`` and return its per-row output.

    The masks are first normalised with ``make_masks``. The output has
    ``len(offsets) - 1`` entries with the dtype of ``content`` and starts
    at zero.
    """
    n_rows = len(offsets) - 1
    min_offsets = cupy.zeros(n_rows, dtype=content.dtype)
    mask_rows, mask_content = make_masks(offsets, content, mask_rows,
                                         mask_content)
    launch = min_in_offsets_cudakernel[32, 1024]
    launch(offsets, content, mask_rows, mask_content, min_offsets)
    cuda.synchronize()
    return min_offsets
def testin():
    """Benchmark ``cu_u_driver_global`` and print timing statistics.

    Ten samples are taken; each times 1000 kernel launches followed by a
    device synchronize. The median, minimum, maximum and standard deviation
    of the samples are printed, followed by the raw samples. The CUDA
    context is closed afterwards, even if the benchmark fails.
    """
    N = 2000
    M = 2000

    # Only v and out_u are passed to the kernel, so no other host array is
    # generated. The launch grid is sized from the (N + 1, M) shape.
    u_shape = (N + 1, M)
    v = np.asarray(np.random.random((N, M + 1)), dtype=np.float32)
    out_u = np.asarray(np.random.random((M, N + 1)), dtype=np.float32)

    threadsperblock = (16, 32)  # (16,16)
    blockspergrid_x = (u_shape[0] + threadsperblock[0]) // threadsperblock[0]
    blockspergrid_y = (u_shape[1] + threadsperblock[1]) // threadsperblock[1]
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    print("here we go", u_shape)
    print("blocks per grid", blockspergrid)
    print("threads per block", threadsperblock)

    launches_per_sample = 1000

    try:
        for cu_u_driver in (cu_u_driver_global, ):
            print(cu_u_driver)
            v1 = cuda.to_device(v)
            out_u1 = cuda.to_device(out_u)

            ts = []
            for _ in range(10):
                t = mytime()
                for _ in range(launches_per_sample):
                    cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)
                cuda.synchronize()
                t2 = mytime()
                # End minus start; the operands used to be swapped, which
                # produced negative durations.
                ts.append(t2 - t)

            print("cuda")
            print(np.median(ts), np.min(ts), np.max(ts), np.std(ts))
            print(ts)
    finally:
        print("cuda closer")
        cuda.close()

    print("all done")
def time_this(kernel, gridsz, blocksz, args):
    """Return the mean wall-clock time of ten synchronised kernel launches.

    Args:
        kernel: Kernel object indexable as ``kernel[gridsz, blocksz]``.
        gridsz, blocksz: Launch configuration.
        args: Sequence of positional arguments for the kernel.

    Returns:
        The average, over ten runs, of the time for launch plus synchronize.
    """
    repeats = 10
    elapsed_total = 0
    # Drain pending device work so it is not charged to the first sample.
    cuda.synchronize()
    for _ in range(repeats):
        began = timer()
        kernel[gridsz, blocksz](*args)
        cuda.synchronize()
        elapsed_total += timer() - began
    return elapsed_total / repeats
def time_this(kernel, gridsz, blocksz, args):
    """Return the mean wall-clock time of ten synchronised kernel launches.

    Args:
        kernel: Kernel object indexable as ``kernel[gridsz, blocksz]``.
        gridsz, blocksz: Launch configuration.
        args: Sequence of positional arguments for the kernel.

    Returns:
        The average time of launch plus synchronize over ten runs, or
        ``-1`` if a CUDA driver error occurred. The error is printed and
        suppressed.
    """
    timings = []
    # Drain pending device work so it is not charged to the first sample.
    cuda.synchronize()
    try:
        for _ in range(10):
            ts = timer()
            kernel[gridsz, blocksz](*args)
            cuda.synchronize()
            te = timer()
            timings.append(te - ts)
    except cuda.errors.CudaDriverError as e:
        # Python 3 spelling of the former ``except X, e`` / print statement.
        print('exc suppressed', e)
        return -1
    # The collected timings used to be discarded on success.
    return sum(timings) / len(timings)
def captured_cuda_stdout():
    """
    Return a minimal stream-like object capturing the text output of
    either CUDA or the simulator.

    This is a generator that yields exactly once. Which capture object it
    yields depends on ``config.ENABLE_CUDASIM``.
    """
    if config.ENABLE_CUDASIM:
        # The simulator calls print() on Python stdout
        with captured_stdout() as stream:
            yield PythonTextCapture(stream)
    else:
        # The CUDA runtime writes onto the system stdout
        from numba import cuda
        fd = sys.__stdout__.fileno()
        with redirect_fd(fd) as stream:
            yield CUDATextCapture(stream)
            # Runs when the caller's block finishes, while the redirection
            # is still active, so pending device output is still captured.
            cuda.synchronize()
def captured_cuda_stdout(): """ Return a minimal stream-like object capturing the text output of either CUDA or the simulator. """ # Prevent accidentally capturing previously output text sys.stdout.flush() if config.ENABLE_CUDASIM: # The simulator calls print() on Python stdout with captured_stdout() as stream: yield PythonTextCapture(stream) else: # The CUDA runtime writes onto the system stdout from numba import cuda with redirect_c_stdout() as stream: yield CUDATextCapture(stream) cuda.synchronize()
res_cuda = numpy.zeros_like(res) d_a = cuda.to_device(a) d_b = cuda.to_device(b) d_orders = cuda.to_device(orders) d_C = cuda.to_device(C) d_Ad = cuda.to_device(Ad) d_dAd = cuda.to_device(dAd) d_X = cuda.to_device(X) d_res_cuda = cuda.to_device(res_cuda) #warmup jitted[int(N/512)+1, 512](d_a,d_b,d_orders,d_C,d_X,d_res_cuda,d_Ad,d_dAd) cuda.synchronize() t1 = time.time() for t in range(T): jitted[int(N/512)+1, 512](d_a,d_b,d_orders,d_C,d_X,d_res_cuda,d_Ad,d_dAd) cuda.synchronize() t2 = time.time() res_cuda = d_res_cuda.copy_to_host() print("CUDA: {}" .format((t2-t1)/T)) # Compare with CPU impplementations (requires interpolation.py) from interpolation.splines.eval_cubic_numba import vec_eval_cubic_spline_3 vec_eval_cubic_spline_3(a,b,orders,C,X,res)