def convolution_cuda(sourceImage, fil):
    # Perform 2-D convolution on sourceImage using CUDA.
    sourceImage = np.float32(sourceImage)
    destImage = sourceImage.copy()
    (imageHeight, imageWidth) = sourceImage.shape
    # print(imageWidth, imageHeight)
    fil = np.float32(fil)
    DATA_H = np.int32(imageHeight)
    DATA_W = np.int32(imageWidth)
    # Prepare device arrays
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    fil_gpu = cuda.mem_alloc_like(fil)
    destImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(fil_gpu, fil)
    convolutionGPU(destImage_gpu, sourceImage_gpu, fil_gpu, DATA_W, DATA_H,
                   block=(imageHeight, 1, 1), grid=(1, imageWidth))
    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    return destImage
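# `convolutionGPU` is not defined in this snippet; it is presumably obtained from a
# compiled SourceModule elsewhere in the file. Below is a minimal sketch of a
# matching naive kernel, assuming a 3x3 filter and the block=(imageHeight, 1, 1) /
# grid=(1, imageWidth) launch shape used above (an illustration, not the original
# kernel; the name _conv_mod is hypothetical):
from pycuda.compiler import SourceModule

_conv_mod = SourceModule("""
__global__ void convolutionGPU(float *dst, const float *src, const float *fil,
                               int w, int h)
{
    int row = threadIdx.x;   // block = (imageHeight, 1, 1)
    int col = blockIdx.y;    // grid  = (1, imageWidth)
    if (row >= h || col >= w)
        return;
    float sum = 0.0f;
    for (int i = -1; i <= 1; i++)        // 3x3 neighbourhood,
        for (int j = -1; j <= 1; j++) {  // clamped at the image borders
            int r = min(max(row + i, 0), h - 1);
            int c = min(max(col + j, 0), w - 1);
            sum += src[r * w + c] * fil[(i + 1) * 3 + (j + 1)];
        }
    dst[row * w + col] = sum;
}
""")
convolutionGPU = _conv_mod.get_function("convolutionGPU")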
def erode_cuda(sourceImage):
    fil = np.ones((7, 7))
    # Alternative binarisation, kept for reference:
    # binary = cv2.adaptiveThreshold(sourceImage, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
    #                                cv2.THRESH_BINARY, 3, 2)
    ret, binary = cv2.threshold(sourceImage, 0, 255,
                                cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    sourceImage = np.float32(binary)
    # Perform erosion on the binarised sourceImage using CUDA.
    destImage = sourceImage.copy()
    (imageHeight, imageWidth) = sourceImage.shape
    fil = np.float32(fil)
    DATA_H = np.int32(imageHeight)
    DATA_W = np.int32(imageWidth)
    # Prepare device arrays
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    fil_gpu = cuda.mem_alloc_like(fil)
    destImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(fil_gpu, fil)
    erodeGPU(destImage_gpu, sourceImage_gpu, fil_gpu, DATA_W, DATA_H,
             block=(imageHeight, 1, 1), grid=(1, imageWidth))
    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    destImage = np.uint8(destImage)
    return destImage
def convolution_cuda(sourceImage, filterx, filtery):
    # Perform separable convolution on sourceImage using CUDA.
    # Operates on floating point images with row-major storage.
    destImage = sourceImage.copy()
    assert sourceImage.dtype == 'float32', 'source image must be float32'
    (imageHeight, imageWidth) = sourceImage.shape
    assert filterx.shape == filtery.shape == (KERNEL_W,), \
        'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
    filterx = numpy.float32(filterx)
    filtery = numpy.float32(filtery)
    DATA_W = iAlignUp(imageWidth, 16)
    DATA_H = imageHeight
    BYTES_PER_WORD = 4  # 4 for float32
    DATA_SIZE = DATA_W * DATA_H * BYTES_PER_WORD
    KERNEL_SIZE = KERNEL_W * BYTES_PER_WORD
    # Prepare device arrays
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    # The kernel goes into constant memory via a symbol defined in the kernel
    cuda.memcpy_htod(d_Kernel_rows, filterx)
    cuda.memcpy_htod(d_Kernel_columns, filtery)
    # Call the kernels for convolution in each direction.
    blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H)
    blockGridColumns = (iDivUp(DATA_W, COLUMN_TILE_W),
                        iDivUp(DATA_H, COLUMN_TILE_H))
    threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1, 1)
    threadBlockColumns = (COLUMN_TILE_W, 8, 1)
    DATA_H = numpy.int32(DATA_H)
    DATA_W = numpy.int32(DATA_W)
    # PyCUDA expects grid/block as tuples of plain ints.
    grid_rows = tuple(int(e) for e in blockGridRows)
    block_rows = tuple(int(e) for e in threadBlockRows)
    grid_cols = tuple(int(e) for e in blockGridColumns)
    block_cols = tuple(int(e) for e in threadBlockColumns)
    # TESTING CODE
    # print("Block rows\n", block_rows)
    # print("Block columns\n", block_cols)
    convolutionRowGPU(intermediateImage_gpu, sourceImage_gpu, DATA_W, DATA_H,
                      grid=grid_rows, block=block_rows)
    convolutionColumnGPU(destImage_gpu, intermediateImage_gpu, DATA_W, DATA_H,
                         numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]),
                         numpy.int32(DATA_W * threadBlockColumns[1]),
                         grid=grid_cols, block=block_cols)
    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    return destImage
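# iDivUp and iAlignUp are used above but not defined anywhere in this listing.
# Minimal definitions consistent with their use here (an assumption, not the
# original code; they match the usual CUDA SDK helpers):
def iDivUp(a, b):
    # Number of blocks of size b needed to cover a elements, rounding up.
    return (a + b - 1) // b

def iAlignUp(a, b):
    # Round a up to the next multiple of b.
    return ((a + b - 1) // b) * b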
def __init__(self, pts, axis, split, sigma):
    if split[0] < 2 or split[1] < 2:
        raise ValueError("Split needs to be at least 2x2")
    if not pts.flags['C_CONTIGUOUS']:
        pts = np.require(pts, dtype=pts.dtype, requirements=['C'])
        if not pts.flags['C_CONTIGUOUS']:
            raise Exception("Points are not contiguous")
    self.axis = axis
    self.sigma = sigma
    self.pts = pts
    self.pts_gpu = None
    # Initiates all of the CUDA stuff
    self.grid = np.zeros(split).astype(pts.dtype)
    self.grid_gpu = cuda.mem_alloc_like(self.grid)
    cuda.memcpy_htod(self.grid_gpu, self.grid)
    kernel = SourceModule(self.__cuda_code)
    self.gpu_gaussian = kernel.get_function("gpu_gaussian")
    self.dx = 1 / float(split[0] - 1)
    self.dy = 1 / float(split[1] - 1)
    self.grid_size, self.block_size = self.__setup_cuda_sizes(split)
def __compute_guassian_on_pts(self):
    view = self.view_tile.get_View()
    for dset in self.data_sets:
        _data = np.array(dset.getDataSet(), copy=True)
        _data[:, 0] = (_data[:, 0] - view.left) / view.width()
        _data[:, 1] = (_data[:, 1] - view.bottom) / view.height()
        for row in range(self.grid_size[0]):
            for col in range(self.grid_size[1]):
                # Pad each cell by 3 * sigma on every side so that essentially
                # all of each point's Gaussian mass falls inside the block.
                left = 1 / float(self.grid_size[1]) * col - (3 * self.sigma)
                right = 1 / float(self.grid_size[1]) * (col + 1) + (3 * self.sigma)
                bottom = 1 / float(self.grid_size[0]) * row - (3 * self.sigma)
                top = 1 / float(self.grid_size[0]) * (row + 1) + (3 * self.sigma)
                pts = getFilteredDataSet(_data, (left, right, bottom, top))
                if len(pts) > 0:
                    self.pts_gpu = cuda.mem_alloc_like(pts)
                    cuda.memcpy_htod(self.pts_gpu, pts)
                    self.gpu_gaussian(self.grid_gpu,                # Grid
                                      self.pts_gpu,                 # Points
                                      np.int32(col),                # Block Index x
                                      np.int32(row),                # Block Index y
                                      np.int32(self.grid_size[1]),  # Grid Dimensions x
                                      np.int32(self.grid_size[0]),  # Grid Dimensions y
                                      np.int32(pts.shape[0]),       # Point Length
                                      np.float32(self.dx),          # dx
                                      np.float32(self.dy),          # dy
                                      np.float32(self.sigma),       # Sigma
                                      block=self.block_size)
                    self.pts_gpu.free()
def __init__(self, view_tile, size, sigma, debug=False):
    self.debug = debug
    if size[0] < 2 or size[1] < 2:
        raise ValueError("Split needs to be at least 2x2")
    self.data_sets = view_tile.get_Data()
    for dset in self.data_sets:
        data = dset.getDataSet()
        if not data.flags['C_CONTIGUOUS']:
            print("NOT CONTIGUOUS, trying to reformat the points")
            data = np.require(data, dtype=data.dtype, requirements=['C'])
            if not data.flags['C_CONTIGUOUS']:
                raise Exception("Points are not contiguous")
            dset.setDataSet(data)
    self.view_tile = view_tile
    self.sigma = sigma
    self.pts_gpu = None
    # Initiates all of the CUDA stuff
    self.grid = np.zeros(size).astype(np.float32)
    self.grid_gpu = cuda.mem_alloc_like(self.grid)
    cuda.memcpy_htod(self.grid_gpu, self.grid)
    kernel = SourceModule(self.__cuda_code)
    self.gpu_gaussian = kernel.get_function("gpu_gaussian")
    self.view = self.view_tile.get_View()
    self.grid_size, self.block_size = self.__setup_cuda_sizes(size)
    self.dx = 1 / float(size[1] - 1)
    self.dy = 1 / float(size[0] - 1)
def test_compare_order():
    '''compare_order between C (row-major) and F (column-major)'''
    compare_order = mod_cu.get_function('compare_order')
    nx, ny = 3, 4
    f_1d = np.arange(nx * ny, dtype='f8')
    f_2d_C = f_1d.reshape((nx, ny), order='C')
    f_2d_F = f_1d.reshape((nx, ny), order='F')
    print('')
    print('f_1d_C\n\n', f_1d)
    print('f_2d_C\n', f_2d_C)
    print('f_2d_F\n', f_2d_F)
    print('')
    print('after cuda')
    ret_f_1d = np.zeros_like(f_1d)
    f_1d_gpu = cuda.mem_alloc_like(f_1d)
    f_2d_C_gpu = cuda.to_device(f_2d_C)
    compare_order(f_2d_C_gpu, f_1d_gpu, block=(nx * ny, 1, 1), grid=(1, 1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print('f_1d from f_2d_C\n', ret_f_1d)
    f_2d_F_gpu = cuda.to_device(f_2d_F)
    compare_order(f_2d_F_gpu, f_1d_gpu, block=(nx * ny, 1, 1), grid=(1, 1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print('f_1d from f_2d_F\n', ret_f_1d)
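# Worth noting (an inference, not printed output from the test): f_2d_C and
# f_2d_F are both views over f_1d's buffer, so cuda.to_device transfers the same
# underlying bytes in both calls; only the logical (i, j) -> offset mapping
# differs, which is exactly what compare_order makes visible on the device side.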
def test_constant_memory(self):
    # contributed by Andrew Wagner
    module = SourceModule("""
    __constant__ float const_array[32];
    __global__ void copy_constant_into_global(float* global_result_array)
    {
        global_result_array[threadIdx.x] = const_array[threadIdx.x];
    }
    """)
    copy_constant_into_global = module.get_function("copy_constant_into_global")
    const_array, _ = module.get_global('const_array')
    host_array = np.random.randint(0, 255, (32,)).astype(np.float32)
    global_result_array = drv.mem_alloc_like(host_array)
    drv.memcpy_htod(const_array, host_array)
    copy_constant_into_global(global_result_array, grid=(1, 1), block=(32, 1, 1))
    host_result_array = np.zeros_like(host_array)
    drv.memcpy_dtoh(host_result_array, global_result_array)
    # .all() must be called; a bare `.all` is a bound method, which is always
    # truthy, so the assert could never fail.
    assert (host_result_array == host_array).all()
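# Usage note for the test above: module.get_global returns a (device_pointer,
# size_in_bytes) pair for a __constant__ (or __device__) symbol, so the constant
# array can be refreshed between launches with an ordinary memcpy_htod. A minimal
# hedged sketch:
#
#     const_ptr, const_nbytes = module.get_global('const_array')
#     assert const_nbytes == host_array.nbytes
#     drv.memcpy_htod(const_ptr, host_array)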
def __init__(self, n, L, CUDA=True):
    self.dtype = dtype  # `dtype` is assumed to be defined at module level
    self.r = np.zeros([n, 3], dtype=dtype)
    self.v = np.zeros([n, 3], dtype=dtype)
    self.m = np.ones([n, 1], dtype=dtype)
    self.a = np.zeros([n, 3], dtype=dtype)
    self.f = np.zeros([n, 3], dtype=dtype)
    self.n = n
    self.nc = L
    self.L = L
    self.Lh = L / 2.0
    self.max_nei = 10
    self.rc = 1.0
    self.CUDA = CUDA  # was hard-coded to True, silently ignoring the argument
    if self.CUDA:
        # flat host-side copies of the state, plus the cell/neighbour tables
        self.h_r = np.zeros([n * 3], dtype=dtype)
        self.h_v = np.zeros([n * 3], dtype=dtype)
        self.h_m = np.zeros([n], dtype=dtype)
        self.h_a = np.zeros([n * 3], dtype=dtype)
        self.h_f = np.zeros([n * 3], dtype=dtype)
        self.h_cells = np.zeros([self.nc * self.nc**3], dtype=np.int32)
        self.h_narray = np.zeros([self.nc**3], dtype=np.int32)
        self.h_nei_index = np.zeros([n], dtype=np.int32)
        self.h_nei_list = np.zeros([n * self.max_nei], dtype=np.int32)
        # matching device buffers
        self.d_r = cuda.mem_alloc_like(self.h_r)
        self.d_v = cuda.mem_alloc_like(self.h_v)
        self.d_m = cuda.mem_alloc_like(self.h_m)
        self.d_a = cuda.mem_alloc_like(self.h_a)
        self.d_f = cuda.mem_alloc_like(self.h_f)
        self.d_nei_index = cuda.mem_alloc_like(self.h_nei_index)
        self.d_nei_list = cuda.mem_alloc_like(self.h_nei_list)
        self.d_cells = cuda.mem_alloc_like(self.h_cells)
        self.d_narray = cuda.mem_alloc_like(self.h_narray)
        cuda.memcpy_htod(self.d_r, self.h_r)
        cuda.memcpy_htod(self.d_v, self.h_v)
        cuda.memcpy_htod(self.d_m, self.h_m)
        cuda.memcpy_htod(self.d_a, self.h_a)
        cuda.memcpy_htod(self.d_f, self.h_f)
        cuda.memcpy_htod(self.d_cells, self.h_cells)
        cuda.memcpy_htod(self.d_narray, self.h_narray)
        cuda.memcpy_htod(self.d_nei_list, self.h_nei_list)
        cuda.memcpy_htod(self.d_nei_index, self.h_nei_index)
def batch_memcpy_cmp(size: int, batch: int):
    event_start_1 = cuda.Event()
    event_stop_1 = cuda.Event()
    event_start_2 = cuda.Event()
    event_stop_2 = cuda.Event()
    array = np.random.rand(size, 9)
    array = array.astype(np.float32)  # astype returns a copy; the result must be kept
    mem = cuda.aligned_zeros_like(array)
    mem = cuda.register_host_memory(mem, cuda.mem_host_register_flags.DEVICEMAP)
    mem_d = cuda.mem_alloc_like(mem)

    # one large pinned transfer
    event_start_1.record()
    cuda.memcpy_htod(mem_d, mem)
    event_stop_1.record()
    event_stop_1.synchronize()

    # the same data as `batch` smaller pinned transfers
    mem2 = []
    this_mem = []
    size_per_batch = int(size / batch)
    for i in range(batch):
        chunk = array[i * size_per_batch:(i + 1) * size_per_batch]
        mem2.append(cuda.mem_alloc_like(chunk))
        this_mem.append(chunk)
        this_mem[i] = cuda.register_host_memory(
            this_mem[i], cuda.mem_host_register_flags.DEVICEMAP)
    event_start_2.record()
    for i in range(batch):
        cuda.memcpy_htod(mem2[i], this_mem[i])
    event_stop_2.record()
    event_stop_2.synchronize()

    t1 = event_stop_1.time_since(event_start_1)
    t2 = event_stop_2.time_since(event_start_2)
    print("batch_memcpy_cmp size", size, " batch ", batch)
    print(t1)
    print(t2)
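# Cleanup sketch for the benchmark above (hedged: relies on the documented
# behaviour that register_host_memory returns an array whose .base attribute is
# the pycuda RegisteredHostMemory object):
#
#     mem.base.unregister()
#     for m in this_mem:
#         m.base.unregister()
#
# Registration itself is expensive, so in the batched path it belongs outside the
# timed region, as it is here.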
def line_cuda(sourceImage):
    time_e = time.time()
    gray = cv2.cvtColor(sourceImage, cv2.COLOR_BGR2GRAY)
    # Alternative binarisation, kept for reference:
    # binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
    #                                cv2.THRESH_BINARY, 3, 2)
    ret, binary = cv2.threshold(gray, 0, 255,
                                cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    # gray = sourceImage
    binary = np.float32(gray)
    destImage = np.float32(gray * 5)
    (imageHeight, imageWidth) = gray.shape
    DATA_H = np.int32(imageHeight)
    DATA_W = np.int32(imageWidth)
    # Earlier per-angle variant, kept for reference:
    # for i in range(-90, 90):
    #     sourceImage_gpu = cuda.mem_alloc_like(binary)
    #     destImage_gpu = cuda.mem_alloc_like(binary)
    #     cuda.memcpy_htod(sourceImage_gpu, binary)
    #     theta = np.int32(i)
    #     HoffGPU(destImage_gpu, sourceImage_gpu, theta, DATA_W, DATA_H,
    #             block=(imageHeight, 1, 1), grid=(1, imageWidth))
    #     cuda.memcpy_dtoh(destImage, destImage_gpu)
    #     ans = np.sort(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(binary)
    destImage_gpu = cuda.mem_alloc_like(destImage)
    cuda.memcpy_htod(sourceImage_gpu, binary)
    HoffGPU(destImage_gpu, sourceImage_gpu, DATA_W, DATA_H,
            block=(1, 1, 1), grid=(imageWidth, imageHeight))
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    ans = np.sort(destImage)
    time_b = time.time()
    print("GPU mode time:", (time_b - time_e))
    return binary
def matmul(self, mat, return_time=False):
    '''Matrix multiplication between two matrices'''
    # # allocate memory on device for both matrices
    # self.allocate_memory()
    # mat.allocate_memory()
    # JIT-compile the CUDA kernel and source module with dimension parameters
    mod = SourceModule(self.kernel_matmul % {
        "a_nrows": self.nrows,
        "a_ncols": self.ncols,
        "b_ncols": mat.ncols
    })
    # check dimensions first:
    if self.ncols != mat.nrows:
        raise ValueError("Dimensions {0} and {1} do not match.".format(
            self.ncols, mat.nrows))
    # allocate gpu memory for the product
    prod_arr = np.zeros((self.nrows, mat.ncols)).astype(np.float32)
    prod_arr_gpu = cuda.mem_alloc_like(prod_arr)
    cuda.memcpy_htod(prod_arr_gpu, prod_arr)  # cuda.In(prod_arr)
    # get matrix multiplication function
    mmul = mod.get_function("kernel_matmul")
    # also record the time the evaluation takes
    # (note: kernel launches are asynchronous, so without a synchronize this
    # mainly measures launch overhead)
    t0 = time.perf_counter_ns()
    # evaluate the matrix multiplication
    mmul(prod_arr_gpu, self.arr_gpu, mat.arr_gpu,
         block=self.block_dim, grid=self.grid_dim)
    t1 = time.perf_counter_ns()
    eval_time = (t1 - t0) * 1e-9  # seconds per matmul evaluation
    # move the product from device to host
    # cuda.Out(prod_arr)
    prod_arr = np.zeros((self.nrows, mat.ncols)).astype(np.float32)
    cuda.memcpy_dtoh(prod_arr, prod_arr_gpu)
    return (cuMatrix(prod_arr), eval_time) if return_time else cuMatrix(prod_arr)
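# Hedged usage sketch for matmul (the constructor call below is an assumption
# about the surrounding cuMatrix class, not taken from this listing):
#
#     a = cuMatrix(np.random.rand(128, 64).astype(np.float32))
#     b = cuMatrix(np.random.rand(64, 32).astype(np.float32))
#     prod, seconds = a.matmul(b, return_time=True)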
def test_register_host_memory(self):
    if drv.get_version() < (4,):
        from py.test import skip
        skip("register_host_memory only exists on CUDA 4.0 and later")
    import sys
    if sys.platform == "darwin":
        from py.test import skip
        skip("register_host_memory is not supported on OS X")
    a = drv.aligned_empty((2**20,), np.float64)
    a_pin = drv.register_host_memory(a)
    gpu_ary = drv.mem_alloc_like(a)
    stream = drv.Stream()
    drv.memcpy_htod_async(gpu_ary, a_pin, stream)
    drv.Context.synchronize()
def LPF_cuda(fft, D_0=20):
    fft = np.fft.fftshift(fft)
    destImage = np.float32(fft)
    (imageHeight, imageWidth) = destImage.shape
    # print(imageWidth, imageHeight)
    D_0 = np.int32(D_0)
    DATA_H = np.int32(imageHeight)
    DATA_W = np.int32(imageWidth)
    # The kernel fills destImage_gpu with the low-pass mask; no upload is needed.
    destImage_gpu = cuda.mem_alloc_like(destImage)
    LPFGPU(destImage_gpu, D_0, DATA_W, DATA_H,
           block=(imageHeight, 1, 1), grid=(1, imageWidth))
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    # Apply the mask in the frequency domain, then invert the transform.
    ans = np.multiply(destImage, fft)
    ans = np.fft.ifftshift(ans)
    ans = np.fft.ifft2(ans)
    ans = np.uint8(ans)
    return ans
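# The LPFGPU kernel itself is not shown in this listing. A minimal sketch of an
# ideal low-pass mask kernel consistent with the call above (an assumption, not
# the original kernel; the name _lpf_mod is hypothetical): each thread writes 1
# inside radius D_0 of the centred spectrum and 0 outside.
from pycuda.compiler import SourceModule

_lpf_mod = SourceModule("""
__global__ void LPFGPU(float *mask, int d0, int w, int h)
{
    int row = threadIdx.x;   // block = (imageHeight, 1, 1)
    int col = blockIdx.y;    // grid  = (1, imageWidth)
    if (row >= h || col >= w)
        return;
    float dy = row - h / 2.0f;
    float dx = col - w / 2.0f;
    mask[row * w + col] = (sqrtf(dx * dx + dy * dy) <= (float)d0) ? 1.0f : 0.0f;
}
""")
LPFGPU = _lpf_mod.get_function("LPFGPU")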
def price_options(strike_info_array, driver_price, forward_anchor, vol_time,
                  bank_time, forward_time, driver_time, decay_percent,
                  today_vol_time, one_day_vol_time, zero_rate, carry_cost,
                  atm_vol, skew, put_curve, call_curve, put_linear_knot,
                  call_linear_knot, put_taper_knot, call_taper_knot, put_taper,
                  call_taper, vol_kernel, pricing_kernel, *args, **kwargs):
    # can't use record arrays with pycuda, so need to copy strikes to a scalar array
    strikes = strike_info_array['strikes'].copy()

    # initialize results_array to store calculation results
    results_array = get_results_array(strikes.shape)
    results_array['strikes'] = strikes
    results_array['call_ids'] = strike_info_array['call_ids']
    results_array['put_ids'] = strike_info_array['put_ids']

    # initialize temporary scalar arrays to copy results from gpu before moving to results_array
    call_prices = np.zeros_like(strikes)
    put_prices = np.zeros_like(strikes)
    vols = np.zeros_like(strikes)
    up_call_prices = np.zeros_like(strikes)
    up_put_prices = np.zeros_like(strikes)
    down_call_prices = np.zeros_like(strikes)
    down_put_prices = np.zeros_like(strikes)

    # initialize memory on gpu to be used by the kernels
    gpu_strikes = cuda.mem_alloc_like(strikes)
    gpu_vols = cuda.mem_alloc_like(strikes)
    gpu_call_prices = cuda.mem_alloc_like(strikes)
    gpu_put_prices = cuda.mem_alloc_like(strikes)

    # copy strikes to gpu
    cuda.memcpy_htod(gpu_strikes, strikes)

    ####################
    # price options vs current driver_price
    current_vol_time = vol_time - today_vol_time * decay_percent
    current_driver_time = driver_time - ONE_DAY_BANK_TIME * decay_percent
    current_forward_time = forward_time - ONE_DAY_BANK_TIME * decay_percent
    current_bank_time = bank_time - ONE_DAY_BANK_TIME * decay_percent

    ####################
    # calculate forward given driver_price
    forward = calculate_forward(driver_price, current_driver_time,
                                current_forward_time, zero_rate, carry_cost)
    forward = np.float32(forward)

    # calculate current atm vol given driver_price and forward_anchor
    atm = linear_two_sided_vol(forward, forward_anchor, current_vol_time,
                               atm_vol, skew, put_curve, call_curve,
                               put_linear_knot, call_linear_knot, put_taper,
                               call_taper, put_taper_knot, call_taper_knot)
    atm = np.float32(atm)

    vol_kernel.prepared_call(VOL_GRID, VOL_BLOCK, gpu_vols, gpu_strikes,
                             forward, current_vol_time, atm, skew, put_curve,
                             call_curve, put_linear_knot, call_linear_knot,
                             put_taper, call_taper, put_taper_knot,
                             call_taper_knot, np.int32(strikes.size))

    # copy vol results from gpu
    cuda.memcpy_dtoh(vols, gpu_vols)

    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices,
                                 gpu_put_prices, gpu_strikes, gpu_vols,
                                 forward, zero_rate, current_vol_time,
                                 current_bank_time, np.int32(strikes.size))

    # copy pricing results from gpu
    cuda.memcpy_dtoh(call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(put_prices, gpu_put_prices)

    # copy vol and price results to results_array
    results_array['vols'] = vols
    results_array['call_prices'] = call_prices
    results_array['put_prices'] = put_prices

    ####################
    # now estimate vegas by permuting the vols used in the options calculations
    # we don't need to recalculate the forward or ATM vol, as we can reuse the values calculated earlier
    # also, we're just going to directly permute strike vols by 50bp and use those new vols to calculate the vega
    vol_increment = .005

    # increment vols up
    vols += vol_increment
    # copy to gpu
    cuda.memcpy_htod(gpu_vols, vols)
    # run pricing kernel
    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices,
                                 gpu_put_prices, gpu_strikes, gpu_vols,
                                 forward, zero_rate, current_vol_time,
                                 current_bank_time, np.int32(strikes.size))
    # copy results from gpu
    cuda.memcpy_dtoh(up_call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(up_put_prices, gpu_put_prices)

    # shift vols downwards by double the increment: this undoes the increase and
    # leaves the vols permuted down by the same amount
    vols -= 2 * vol_increment
    # copy to gpu
    cuda.memcpy_htod(gpu_vols, vols)
    # run pricing kernel
    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices,
                                 gpu_put_prices, gpu_strikes, gpu_vols,
                                 forward, zero_rate, current_vol_time,
                                 current_bank_time, np.int32(strikes.size))
    # copy results from gpu
    cuda.memcpy_dtoh(down_call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(down_put_prices, gpu_put_prices)

    # estimate vegas then store results in results_array
    # (the difference is parenthesized; the original `a - b / c` applied the
    # divisor to the down prices only, which cannot have been the intent)
    results_array['call_vegas'] = (up_call_prices - down_call_prices) / (.01 / (2 * vol_increment))
    results_array['put_vegas'] = (up_put_prices - down_put_prices) / (.01 / (2 * vol_increment))

    ####################
    # use the day weights to estimate theta
    # first we need to figure out what the adjusted calc times will be at this time during the next trading day
    decay_vol_time = current_vol_time - one_day_vol_time
    decay_driver_time = current_bank_time - ONE_DAY_BANK_TIME
    decay_forward_time = current_forward_time - ONE_DAY_BANK_TIME
    decay_bank_time = current_bank_time - ONE_DAY_BANK_TIME

    # calculate adjusted forward given the new calculation time
    forward = calculate_forward(driver_price, decay_driver_time,
                                decay_forward_time, zero_rate, carry_cost)
    forward = np.float32(forward)

    # calculate atm vol using current_vol_time to account for the fact that as we adjust
    # our skews due to decay, we will be moving the atm_vol using the previous vol path
    atm = linear_two_sided_vol(forward, forward_anchor, current_vol_time,
                               atm_vol, skew, put_curve, call_curve,
                               put_linear_knot, call_linear_knot, put_taper,
                               call_taper, put_taper_knot, call_taper_knot)
    atm = np.float32(atm)

    # now run the vol kernel with the decayed vol time and the
    # previously calculated atm_vol
    vol_kernel.prepared_call(VOL_GRID, VOL_BLOCK, gpu_vols, gpu_strikes,
                             forward, decay_vol_time, atm, skew, put_curve,
                             call_curve, put_linear_knot, call_linear_knot,
                             put_taper, call_taper, put_taper_knot,
                             call_taper_knot, np.int32(strikes.size))
    # run pricing kernel with the new times
    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices,
                                 gpu_put_prices, gpu_strikes, gpu_vols,
                                 forward, zero_rate, decay_vol_time,
                                 decay_bank_time, np.int32(strikes.size))
    # copy pricing results from gpu
    cuda.memcpy_dtoh(down_call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(down_put_prices, gpu_put_prices)

    # estimate thetas and store in results_array
    results_array['call_thetas'] = down_call_prices - call_prices
    results_array['put_thetas'] = down_put_prices - put_prices

    ####################
    # now estimate deltas and gammas by permuting the driver price
    # choose driver_increment to give a relatively smooth gamma/delta profile
    driver_increment = np.float32(sqrt(3 * vol_time))

    # calculate new forward and vols after incrementing driver_price upwards
    forward = calculate_forward(driver_price + driver_increment,
                                current_driver_time, current_forward_time,
                                zero_rate, carry_cost)
    forward = np.float32(forward)
    atm = linear_two_sided_vol(forward, forward_anchor, current_vol_time,
                               atm_vol, skew, put_curve, call_curve,
                               put_linear_knot, call_linear_knot, put_taper,
                               call_taper, put_taper_knot, call_taper_knot)
    atm = np.float32(atm)
    vol_kernel.prepared_call(VOL_GRID, VOL_BLOCK, gpu_vols, gpu_strikes,
                             forward, current_vol_time, atm, skew, put_curve,
                             call_curve, put_linear_knot, call_linear_knot,
                             put_taper, call_taper, put_taper_knot,
                             call_taper_knot, np.int32(strikes.size))
    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices,
                                 gpu_put_prices, gpu_strikes, gpu_vols,
                                 forward, zero_rate, current_vol_time,
                                 current_bank_time, np.int32(strikes.size))
    # copy results from gpu
    cuda.memcpy_dtoh(up_call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(up_put_prices, gpu_put_prices)

    # calculate new forward and vols after incrementing driver_price downwards
    forward = calculate_forward(driver_price - driver_increment,
                                current_driver_time, current_forward_time,
                                zero_rate, carry_cost)
    forward = np.float32(forward)
    atm = linear_two_sided_vol(forward, forward_anchor, current_vol_time,
                               atm_vol, skew, put_curve, call_curve,
                               put_linear_knot, call_linear_knot, put_taper,
                               call_taper, put_taper_knot, call_taper_knot)
    atm = np.float32(atm)
    vol_kernel.prepared_call(VOL_GRID, VOL_BLOCK, gpu_vols, gpu_strikes,
                             forward, current_vol_time, atm, skew, put_curve,
                             call_curve, put_linear_knot, call_linear_knot,
                             put_taper, call_taper, put_taper_knot,
                             call_taper_knot, np.int32(strikes.size))
    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices,
                                 gpu_put_prices, gpu_strikes, gpu_vols,
                                 forward, zero_rate, current_vol_time,
                                 current_bank_time, np.int32(strikes.size))
    # copy results from gpu
    cuda.memcpy_dtoh(down_call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(down_put_prices, gpu_put_prices)

    # estimate deltas and gammas with central differences, then store in results_array
    results_array['call_deltas'] = (up_call_prices - down_call_prices) / (2 * driver_increment)
    results_array['put_deltas'] = (up_put_prices - down_put_prices) / (2 * driver_increment)
    results_array['call_gammas'] = (up_call_prices + down_call_prices - 2 * call_prices) / (2 * driver_increment) ** 2
    results_array['put_gammas'] = (up_put_prices + down_put_prices - 2 * put_prices) / (2 * driver_increment) ** 2

    return results_array
def convolve_gpu(sourceImage, convFilter, convType):
    """
    convType is the same as in:
    http://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.convolve.html#scipy.signal.convolve
    """
    # Cuda C code
    template = """
    #define FILTER_W $FILTER_W
    #define FILTER_H $FILTER_H
    #include <stdio.h>
    __device__ __constant__ float d_Kernel_filter[FILTER_H*FILTER_W];
    __global__ void ConvolutionKernel( float* img, int imgW, int imgH, float* out )
    {
        const int nThreads = blockDim.x * gridDim.x;
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        const int outW = imgW - FILTER_W + 1;
        const int outH = imgH - FILTER_H + 1;
        const int nPixels = outW * outH;
        for(int curPixel = idx; curPixel < nPixels; curPixel += nThreads)
        {
            int x = curPixel % outW;
            int y = curPixel / outW;
            float sum = 0;
            for (int filtY = 0; filtY < FILTER_H; filtY++)
                for (int filtX = 0; filtX < FILTER_W; filtX++)
                {
                    int sx = x + filtX;
                    int sy = y + filtY;
                    sum += img[sy*imgW + sx] * d_Kernel_filter[filtY*FILTER_W + filtX];
                }
            out[y * outW + x] = sum;
        }
    }
    """
    # convolution flips the filter relative to correlation
    convFilter = np.flipud(np.fliplr(convFilter))
    (DATA_H, DATA_W) = sourceImage.shape
    (outH, outW) = (0, 0)
    # -- Add zero paddings (integer division keeps the pad sizes usable as indices)
    (padWl, padWr, padHt, padHb) = (0, 0, 0, 0)
    (filtH, filtW) = (convFilter.shape[0], convFilter.shape[1])
    if convType == 'full':
        padWl = filtW - 1
        padWr = filtW - 1
        padHt = filtH - 1
        padHb = filtH - 1
        (outH, outW) = (DATA_H + filtH - 1, DATA_W + filtW - 1)
    elif convType == 'same':
        padWl = filtW // 2
        padWr = filtW // 2 - (1 - filtW % 2)
        padHt = filtH // 2
        padHb = filtH // 2 - (1 - filtH % 2)
        (outH, outW) = (DATA_H, DATA_W)
    elif convType == 'valid':
        (outH, outW) = (sourceImage.shape[0] - convFilter.shape[0] + 1,
                        sourceImage.shape[1] - convFilter.shape[1] + 1)
    # -- zero padding
    tmpImg = np.zeros((padHt + DATA_H + padHb, padWl + DATA_W + padWr))
    tmpImg[padHt:padHt + DATA_H, padWl:padWl + DATA_W] = sourceImage
    sourceImage = tmpImg
    (DATA_H, DATA_W) = sourceImage.shape
    destImage = np.float32(np.zeros((outH, outW)))
    # assert sourceImage.dtype == 'float32', 'source image must be float32'
    # assert convFilter.dtype == 'float32', 'convFilter must be float32'
    # -- interface stuff to Cuda C
    template = string.Template(template)
    code = template.substitute(FILTER_H=convFilter.shape[0],
                               FILTER_W=convFilter.shape[1])
    module = SourceModule(code)
    # -- change the numpy arrays to row vectors of float32
    sourceImage = np.float32(sourceImage.reshape(sourceImage.size))
    convFilter = np.float32(convFilter.reshape(convFilter.size))
    convolutionGPU = module.get_function('ConvolutionKernel')
    d_Kernel_filter = module.get_global('d_Kernel_filter')[0]
    # -- Prepare device arrays
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    # The filter goes into constant memory via a symbol defined in the kernel
    cuda.memcpy_htod(d_Kernel_filter, convFilter)
    convolutionGPU(sourceImage_gpu, np.int32(DATA_W), np.int32(DATA_H),
                   destImage_gpu, block=(400, 1, 1), grid=(1, 1))
    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    return destImage
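# Hedged usage sketch: convolve_gpu should agree with scipy.signal.convolve for
# each supported mode (the sizes and tolerance here are illustrative):
#
#     import numpy as np
#     from scipy import signal
#     img = np.random.rand(64, 64)
#     filt = np.random.rand(5, 5)
#     for mode in ('full', 'same', 'valid'):
#         ref = signal.convolve(img, filt, mode=mode)
#         out = convolve_gpu(img, filt, mode)
#         assert np.allclose(out, ref, atol=1e-4)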
def convolutional_degrid_GPU(kernel_list, vshape, uvgrid, vuvwmap,
                             vfrequencymap, vpolarisationmap=None):
    mod = SourceModule("""
    #include <stdio.h>
    #include <stdlib.h>
    __global__ void convol_degrid_kernels2(float *visReal, float *visImag,
            float *uvgridReal, float *uvgridImag,
            float *ckernel0Real, float *ckernel0Imag,
            int *vfrequencymap, int *x, int *y, int *xf, int *yf,
            int gh, int gw, int nx, int vnpol, int length)
    {
        for(int pol = 0; pol < vnpol; pol++)
        {
            int row = threadIdx.x + blockIdx.x * blockDim.x;
            int col = threadIdx.y + blockIdx.y * blockDim.y;
            int slice = threadIdx.z + blockIdx.z * blockDim.z;
            int i = row + col * blockDim.x * gridDim.x
                  + slice * blockDim.x * gridDim.x * blockDim.y * gridDim.y;
            if(i < length)
            {
                int chan = vfrequencymap[i];
                int xx = x[i];
                int yy = y[i];
                int xxf = xf[i];
                int yyf = yf[i];
                float sumReal = 0.0;
                float sumImag = 0.0;
                int t1 = chan*vnpol*nx*nx + pol*nx*nx;
                int t2 = yyf*gh*gh*gh + xxf*gw*gh;
                for(int j = yy; j < yy + gh; j++)
                {
                    for(int k = xx; k < xx + gw; k++)
                    {
                        int t3 = t1 + j*nx + k;
                        int t4 = t2 + (j - yy)*gh + k - xx;
                        sumReal += (uvgridReal[t3]*ckernel0Real[t4] - uvgridImag[t3]*ckernel0Imag[t4]);
                        sumImag += (uvgridReal[t3]*ckernel0Imag[t4] + uvgridImag[t3]*ckernel0Real[t4]);
                    }
                }
                visReal[i*vnpol + pol] = sumReal;
                visImag[i*vnpol + pol] = sumImag;
            }
        }
    }
    """)
    kernel_indices, kernels = kernel_list
    kernel_oversampling, _, gh, gw = kernels[0].shape
    assert gh % 2 == 0, "Convolution kernel must have even number of pixels"
    assert gw % 2 == 0, "Convolution kernel must have even number of pixels"
    inchan, inpol, ny, nx = uvgrid.shape
    vnpol = vshape[1]
    nvis = vshape[0]
    vis = numpy.zeros(vshape, dtype='complex')
    wt = numpy.zeros(vshape)
    # uvw -> fraction of grid mapping
    y, yf = frac_coord(ny, kernel_oversampling, vuvwmap[:, 1])
    y -= gh // 2
    x, xf = frac_coord(nx, kernel_oversampling, vuvwmap[:, 0])
    x -= gw // 2
    uvgridReal = uvgrid.real
    uvgridImag = uvgrid.imag
    if len(kernels) > 1:
        # fall back to the CPU path when more than one kernel is in use
        ckernels = numpy.conjugate(kernels)
        length = min(len(kernel_indices), len(vfrequencymap),
                     len(x), len(y), len(xf), len(yf))
        for pol in range(vnpol):
            for i in range(length):
                kind = kernel_indices[i]
                chan = vfrequencymap[i]
                xx = x[i]
                yy = y[i]
                xxf = xf[i]
                yyf = yf[i]
                vis[i, pol] = numpy.sum(
                    uvgrid[chan, pol, yy:yy + gh, xx:xx + gw] *
                    ckernels[kind][yyf, xxf, :, :])
    else:
        ckernel0 = numpy.conjugate(kernels[0])
        ckernel0Real = ckernel0.real
        ckernel0Imag = ckernel0.imag
        length = min(len(vfrequencymap), len(x), len(y), len(xf), len(yf))
        visReal = np.zeros_like(wt, dtype=np.float32)
        visImag = np.zeros_like(wt, dtype=np.float32)
        vis_real = visReal.reshape(-1)
        vis_imag = visImag.reshape(-1)
        uvgrid_real = np.array(uvgridReal).reshape(-1)
        uvgrid_imag = np.array(uvgridImag).reshape(-1)
        ckernel0_real = np.array(ckernel0Real).reshape(-1)
        ckernel0_imag = np.array(ckernel0Imag).reshape(-1)
        # vis_real_gpu = drv.mem_alloc_like(vis_real)
        # vis_imag_gpu = drv.mem_alloc_like(vis_imag)
        uvgrid_real_gpu = drv.mem_alloc_like(uvgrid_real)
        uvgrid_imag_gpu = drv.mem_alloc_like(uvgrid_imag)
        ckernel0_real_gpu = drv.mem_alloc_like(ckernel0_real)
        ckernel0_imag_gpu = drv.mem_alloc_like(ckernel0_imag)
        vfrequencymap_gpu = drv.mem_alloc_like(vfrequencymap)
        x_gpu = drv.mem_alloc_like(x)
        y_gpu = drv.mem_alloc_like(y)
        xf_gpu = drv.mem_alloc_like(xf)
        yf_gpu = drv.mem_alloc_like(yf)
        strm = drv.Stream()
        drv.memcpy_htod_async(uvgrid_real_gpu, np.array(uvgrid_real), strm)
        drv.memcpy_htod_async(uvgrid_imag_gpu, np.array(uvgrid_imag), strm)
        drv.memcpy_htod_async(ckernel0_real_gpu, np.array(ckernel0_real), strm)
        drv.memcpy_htod_async(ckernel0_imag_gpu, np.array(ckernel0_imag), strm)
        drv.memcpy_htod_async(vfrequencymap_gpu, np.array(vfrequencymap), strm)
        drv.memcpy_htod_async(x_gpu, np.array(x), strm)
        drv.memcpy_htod_async(y_gpu, np.array(y), strm)
        drv.memcpy_htod_async(xf_gpu, np.array(xf), strm)
        drv.memcpy_htod_async(yf_gpu, np.array(yf), strm)
        strm.synchronize()
        convol_degrid_kernels2 = mod.get_function("convol_degrid_kernels2")
        convol_degrid_kernels2(drv.Out(vis_real), drv.Out(vis_imag),
                               uvgrid_real_gpu, uvgrid_imag_gpu,
                               ckernel0_real_gpu, ckernel0_imag_gpu,
                               vfrequencymap_gpu, x_gpu, y_gpu, xf_gpu, yf_gpu,
                               np.int32(gh), np.int32(gw), np.int32(nx),
                               np.int32(vnpol), np.int32(length),
                               block=(32, 32, 1), grid=(1024, 96, 1))
        vis = numpy.ones((length, vnpol), dtype='complex')
        vis.real = vis_real.reshape(-1, vnpol)
        vis.imag = vis_imag.reshape(-1, vnpol)
    return numpy.array(vis)
def convolutional_grid_GPU(kernel_list, uvgrid, vis, visweights, vuvwmap,
                           vfrequencymap, vpolarisationmap=None):
    mod = SourceModule("""
    #include <stdio.h>
    #include <stdlib.h>
    __global__ void convol_grid_kernel1(float *uvgrid_real, float *uvgrid_imag,
            float *sumwt, float *kernels_real, float *kernels_imag,
            float *viswt_real, float *viswt_imag, float *wts,
            int *kernel_indices, int *vfrequencymap,
            int *x, int *y, int *xf, int *yf,
            int nx, int gh, int gw, int npol, int length)
    {
        for(int pol = 0; pol < npol; pol++)
        {
            int row = threadIdx.x + blockIdx.x * blockDim.x;
            int col = threadIdx.y + blockIdx.y * blockDim.y;
            int slice = threadIdx.z + blockIdx.z * blockDim.z;
            int i = row + col * blockDim.x * gridDim.x
                  + slice * blockDim.x * gridDim.x * blockDim.y * gridDim.y;
            if(i < length)
            {
                float v_real = viswt_real[i*npol + pol];
                float v_imag = viswt_imag[i*npol + pol];
                float vwt = wts[i*npol + pol];
                int kind = kernel_indices[i];
                int chan = vfrequencymap[i];
                int xx = x[i];
                int yy = y[i];
                int xxf = xf[i];
                int yyf = yf[i];
                for(int j = yy; j < yy + gh; j++)
                    for(int k = xx; k < xx + gw; k++)
                    {
                        int w = chan*npol*nx*nx + pol*nx*nx + j*nx + k;
                        int q = kind*gh*gh*gh*gh + yyf*gh*gh*gh + xxf*gh*gh + j*gh + k;
                        uvgrid_real[w] += (kernels_real[q]*v_real - kernels_imag[q]*v_imag);
                        uvgrid_imag[w] += (kernels_real[q]*v_imag + kernels_imag[q]*v_real);
                    }
                sumwt[chan*npol + pol] += vwt;
            }
        }
    }
    __global__ void convol_grid_kernel2(float *uvgrid_real, float *uvgrid_imag,
            float *sumwt, float *kernel0_real, float *kernel0_imag,
            float *viswt_real, float *viswt_imag, float *wts,
            int *vfrequencymap, int *x, int *y, int *xf, int *yf,
            int nx, int gh, int gw, int npol, int length)
    {
        for(int pol = 0; pol < npol; pol++)
        {
            int row = threadIdx.x + blockIdx.x * blockDim.x;
            int col = threadIdx.y + blockIdx.y * blockDim.y;
            int slice = threadIdx.z + blockIdx.z * blockDim.z;
            int i = row + col * blockDim.x * gridDim.x
                  + slice * blockDim.x * gridDim.x * blockDim.y * gridDim.y;
            if(i < length)
            {
                float v_real = viswt_real[i*npol + pol];
                float v_imag = viswt_imag[i*npol + pol];
                float vwt = wts[i*npol + pol];
                int chan = vfrequencymap[i];
                int xx = x[i];
                int yy = y[i];
                int xxf = xf[i];
                int yyf = yf[i];
                for(int j = yy; j < yy + gh; j++)
                    for(int k = xx; k < xx + gw; k++)
                    {
                        /* The original used chan*pol*nx*nx and raw j, k here,
                           which disagrees with kernel1 and with the CPU path;
                           corrected to chan*npol and kernel-relative offsets. */
                        int w = chan*npol*nx*nx + pol*nx*nx + j*nx + k;
                        int q = yyf*gh*gh*gh + xxf*gh*gh + (j - yy)*gh + (k - xx);
                        uvgrid_real[w] += (kernel0_real[q]*v_real - kernel0_imag[q]*v_imag);
                        uvgrid_imag[w] += (kernel0_real[q]*v_imag + kernel0_imag[q]*v_real);
                    }
                sumwt[chan*npol + pol] += vwt;
            }
        }
    }
    """)
    kernel_indices, kernels = kernel_list
    kernel_oversampling, _, gh, gw = kernels[0].shape
    assert gh % 2 == 0, "Convolution kernel must have even number of pixels"
    assert gw % 2 == 0, "Convolution kernel must have even number of pixels"
    inchan, inpol, ny, nx = uvgrid.shape
    # Construct output grids (in uv space)
    sumwt = numpy.zeros([inchan, inpol])
    # uvw -> fraction of grid mapping
    y, yf = frac_coord(ny, kernel_oversampling, vuvwmap[:, 1])
    y -= gh // 2
    x, xf = frac_coord(nx, kernel_oversampling, vuvwmap[:, 0])
    x -= gw // 2
    # About 228k samples per second for the standard kernel, so about 10 million CMACs per second
    # Now we can loop over all rows
    wts = visweights[...]
    viswt = vis[...] * visweights[...]
    npol = vis.shape[-1]
    uvgrid_array = np.array(uvgrid)
    uvgrid_real = uvgrid_array.real.reshape(-1)
    uvgrid_imag = uvgrid_array.imag.reshape(-1)
    viswt_array = np.array(viswt)
    viswt_real = viswt_array.real.reshape(-1)
    viswt_imag = viswt_array.imag.reshape(-1)
    wts1 = np.array(wts).reshape(-1)
    if len(kernels) > 1:
        # CPU fallback when several kernels are in use
        for pol in range(npol):
            minlen = min(len(viswt[..., pol]), len(wts[..., pol]),
                         len(kernel_indices), len(list(vfrequencymap)),
                         len(x), len(y), len(xf), len(yf))
            for i in range(minlen):
                v = viswt[i, :, pol]
                vwt = wts[i, :, pol]
                kind = kernel_indices[i]
                chan = vfrequencymap[i]
                xx = x[i]
                yy = y[i]
                xxf = xf[i]
                yyf = yf[i]
                uvgrid[chan, pol, yy:yy + gh, xx:xx + gw] += \
                    kernels[kind][yyf, xxf, :, :] * v
                sumwt[chan, pol] += vwt
    else:
        kernel0 = kernels[0]
        kernel0_array = np.array(kernel0)
        kernel0_real = kernel0_array.real.reshape(-1)
        kernel0_imag = kernel0_array.imag.reshape(-1)
        convol_grid_kernel2 = mod.get_function("convol_grid_kernel2")
        length = len(x)
        # NOTE: cublock/cugrid are computed but the launch below uses fixed
        # dimensions, as in the original.
        cublock = (32, 32, 1)
        if length // (32 * 32) > 32:
            cugrid = (32,
                      length // (32 * 32 * 32) + 1 * (length % (32 * 32 * 32) != 0),
                      1)
        else:
            cugrid = (length // (32 * 32) + 1 * (length % (32 * 32) != 0), 1, 1)
        kernel0_real_gpu = drv.mem_alloc_like(kernel0_real)
        kernel0_imag_gpu = drv.mem_alloc_like(kernel0_imag)
        viswt_real_gpu = drv.mem_alloc_like(viswt_real)
        viswt_imag_gpu = drv.mem_alloc_like(viswt_imag)
        wts1_gpu = drv.mem_alloc_like(wts1)
        vfrequencymap_gpu = drv.mem_alloc_like(vfrequencymap)
        x_gpu = drv.mem_alloc_like(x)
        y_gpu = drv.mem_alloc_like(y)
        xf_gpu = drv.mem_alloc_like(xf)
        yf_gpu = drv.mem_alloc_like(yf)
        strm = drv.Stream()
        drv.memcpy_htod_async(kernel0_real_gpu, np.array(kernel0_real), strm)
        drv.memcpy_htod_async(kernel0_imag_gpu, np.array(kernel0_imag), strm)
        drv.memcpy_htod_async(viswt_real_gpu, np.array(viswt_real), strm)
        drv.memcpy_htod_async(viswt_imag_gpu, np.array(viswt_imag), strm)
        drv.memcpy_htod_async(wts1_gpu, np.array(wts1), strm)
        drv.memcpy_htod_async(vfrequencymap_gpu, np.array(vfrequencymap), strm)
        drv.memcpy_htod_async(x_gpu, np.array(x), strm)
        drv.memcpy_htod_async(y_gpu, np.array(y), strm)
        drv.memcpy_htod_async(xf_gpu, np.array(xf), strm)
        drv.memcpy_htod_async(yf_gpu, np.array(yf), strm)
        strm.synchronize()
        convol_grid_kernel2(drv.Out(uvgrid_real), drv.Out(uvgrid_imag),
                            drv.Out(sumwt), kernel0_real_gpu, kernel0_imag_gpu,
                            viswt_real_gpu, viswt_imag_gpu, wts1_gpu,
                            vfrequencymap_gpu, x_gpu, y_gpu, xf_gpu, yf_gpu,
                            np.int32(nx), np.int32(gh), np.int32(gw),
                            np.int32(npol), np.int32(length),
                            block=(32, 32, 1), grid=(100, 100, 1))
        uvgrid.real = uvgrid_real.reshape(inchan, inpol, nx, ny)
        uvgrid.imag = uvgrid_imag.reshape(inchan, inpol, nx, ny)
    return uvgrid, sumwt
                bin[(i+1)*3 + (j+1)] = 0;
            else
                bin[(i+1)*3 + (j+1)] = 1;
        }
        lbpvals[row*numCols + col] = (bin[0]*128) + (bin[1]*64) + (bin[2]*32)
                                   + (bin[3]) + (bin[5]*16) + (bin[6]*2)
                                   + (bin[7]*4) + (bin[8]*8);
    }
""")

kernel = mod.get_function("lbpval_data")
img = cv2.imread("image.jpg")
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
rows, cols = img.shape
d_img = cuda.mem_alloc_like(img)
d_res = cuda.mem_alloc_like(img)
bins = numpy.zeros((3, 3), numpy.uint8)
d_bins = cuda.mem_alloc_like(bins)  # NOTE: allocated but never passed to the kernel
numRows = numpy.int32(rows)
numCols = numpy.int32(cols)
cuda.memcpy_htod(d_img, img)
kernel(d_img, d_res, numRows, numCols,
       grid=(int(numpy.ceil(cols / 32.0)), int(numpy.ceil(rows / 32.0))),
       block=(32, 32, 1))
h_res = numpy.zeros((rows, cols), numpy.uint8)
cuda.memcpy_dtoh(h_res, d_res)
cv2.imshow('Final', h_res)
cv2.imwrite('LBP_RES.png', h_res)
            d_u[2*NN+iind] = ZER;
        } else {
            d_ddu[2*NN+iind] = ddwn;
            d_du[2*NN+iind] = dw + dt/TWO*(ddw + ddwn);
            d_u[2*NN+iind] = wi + dt*dw + dt*dt/TWO*ddw;
        }
    }
}
""", options=["--use_fast_math"])

d_dil = cuda.mem_alloc(NN * L.dtype.itemsize)
d_u = cuda.mem_alloc(3 * NN * L.dtype.itemsize)
d_du = cuda.mem_alloc(3 * NN * L.dtype.itemsize)
d_ddu = cuda.mem_alloc(3 * NN * L.dtype.itemsize)
d_Sf = cuda.mem_alloc_like(Sf)
# presumably one flag bit per bond, rounded up to whole bytes
d_dmg = cuda.mem_alloc(((NB) * NN + 7) // 8)
cuda.memcpy_htod(d_Sf, Sf)
d_calcForceState = mod.get_function("calcForceState")
d_calcDilation = mod.get_function("calcDilation")
d_calcForceState.set_cache_config(cuda.func_cache.PREFER_L1)
d_calcDilation.set_cache_config(cuda.func_cache.PREFER_L1)
dil = np.empty((NN))
print("Beginning simulation: ", NN)
t0 = time.time()
for tt in range(100):
    d_calcDilation(d_Sf,
def single_advance_gpu(state, num_points, grid_space):
    rhs = cuda.aligned_zeros((num_moments, num_points), dtype=np.float32)
    time_before = cuda.Event()
    time_1 = cuda.Event()
    time_after = cuda.Event()

    ## allocate GPU memory
    indices_device = cuda.mem_alloc_like(indices)
    cuda.memcpy_htod(indices_device, indices)
    f_min = cuda.mem_alloc(int(sizeof_float * num_moments * num_nodes * num_points))
    f_max = cuda.mem_alloc(int(sizeof_float * num_moments * num_nodes * num_points))
    flux_1 = cuda.mem_alloc_like(state)
    flux_2 = cuda.mem_alloc_like(state)

    ## fetch the GPU kernels
    BlockSize = (256, 1, 1)
    GridSize = ((num_points + BlockSize[0] - 1) // BlockSize[0], 1, 1)
    domain_get_flux = QUAD.get_function('domain_get_flux_3d')
    fsum = QUAD.get_function('fsum_3d')
    flux_out = QUAD.get_function('flux_3d')

    ## compute_rhs
    time_before.record()
    # grid_inversion(state)
    # outputs are pointer objects to GPU memory
    _, w, x, y, z = chyqmom27(state, num_points)
    time_1.record()
    # domain_get_fluxes(weights, abscissas, qbmm_mgr.indices,
    #                   num_points, qbmm_mgr.num_moments,
    #                   qbmm_mgr.num_nodes, flux)
    domain_get_flux(w, x, y, z, indices_device, f_min, f_max,
                    np.int32(num_moments), np.int32(num_nodes),
                    np.int32(num_points),
                    block=BlockSize, grid=GridSize)
    fsum(flux_1, f_min, f_max,
         np.int32(num_moments), np.int32(num_nodes), np.int32(num_points),
         block=BlockSize, grid=GridSize)
    flux_out(flux_1, flux_2, np.float32(grid_space),
             np.int32(num_moments), np.int32(num_points),
             block=BlockSize, grid=GridSize)
    time_after.record()
    time_1.synchronize()
    time_after.synchronize()

    total_time = time_after.time_since(time_before)
    quad_time = time_after.time_since(time_1)
    cuda.memcpy_dtoh(rhs, flux_2)
    w.free()
    x.free()
    y.free()
    z.free()
    return rhs, total_time, quad_time
def _thread(pid, tid, cuda_context, cuda_kernel, dispatcher, temp_storage,
            total_edge_count, log_lock, merge_lock, exit_signal, exit_state):
    try:
        with log_lock:
            logging.debug('Clustering subprocess {} thread {} started.'.format(pid, tid))
        cuda_context.push()
        ref_block_height, ref_block_width = block_dimensions
        edg_path = Path(temp_storage, 'edg')
        dps_path = Path(temp_storage, 'dps')
        ranked_spectra = session.ranked_spectra
        cuda_stream = drv.Stream()
        allocation_size_divisor = allocation_size_initial_divisor
        allocation_size = int(ref_block_height * ref_block_width / allocation_size_divisor)
        reallocated = False

        with log_lock:
            logging.debug('Clustering subprocess {} thread {}: Allocating host and device memory.'
                          .format(pid, tid))

        # allocate host pagelocked memory
        # input
        plm_precursor_mass = drv.pagelocked_empty(
            ref_block_height + ref_block_width, dtype=CG_PRECURSOR_MASS_DATA_TYPE)
        plm_mz = drv.pagelocked_empty(
            (ref_block_height + ref_block_width, num_of_peaks), dtype=CG_MZ_DATA_TYPE)
        plm_intensity = drv.pagelocked_empty(
            (ref_block_height + ref_block_width, num_of_peaks), dtype=CG_INTENSITY_DATA_TYPE)
        plm_block_dimensions = drv.pagelocked_empty(2, dtype=CG_BLOCK_DIMENSIONS_DATA_TYPE)
        plm_offset = drv.pagelocked_empty(2, dtype=CG_OFFSET_DATA_TYPE)
        plm_allocation_size = drv.pagelocked_empty(1, dtype=CG_ALLOCATION_SIZE_DATA_TYPE)
        # output
        plm_counter = drv.pagelocked_empty(1, dtype=CG_COUNTER_DATA_TYPE)
        plm_edge = drv.pagelocked_empty((allocation_size, 2), dtype=CG_EDGE_DATA_TYPE)
        plm_dot_product = drv.pagelocked_empty(allocation_size, dtype=CG_DOT_PRODUCT_DATA_TYPE)
        plm_overflowed = drv.pagelocked_empty(1, dtype=CG_OVERFLOWED_DATA_TYPE)

        # allocate device memory
        # input
        dvp_precursor_mass = drv.mem_alloc_like(plm_precursor_mass)
        dvp_mz = drv.mem_alloc_like(plm_mz)
        dvp_intensity = drv.mem_alloc_like(plm_intensity)
        dvp_block_dimensions = drv.mem_alloc_like(plm_block_dimensions)
        dvp_offset = drv.mem_alloc_like(plm_offset)
        dvp_allocation_size = drv.mem_alloc_like(plm_allocation_size)
        # output
        dvp_counter = drv.mem_alloc_like(plm_counter)
        dvp_edge = drv.mem_alloc_like(plm_edge)
        dvp_dot_product = drv.mem_alloc_like(plm_dot_product)
        dvp_overflowed = drv.mem_alloc_like(plm_overflowed)

        with log_lock:
            logging.debug('Clustering subprocess {} thread {}: Start iterating dispatcher.'
                          .format(pid, tid))

        previous_row_id = -1
        dispatcher.connect(pid, tid)

        # iterate dispatcher to get blocks
        for row_id, column_id, block in dispatcher.iterate(pid, tid):
            if exit_signal.value:
                with log_lock:
                    logging.debug('Subprocess {} thread {}: Received exit signal, exits now.'
                                  .format(pid, tid))
                break
            try:
                y_range, x_range = block
                block_height = y_range[1] - y_range[0]
                block_width = x_range[1] - x_range[0]
                if row_id != previous_row_id:
                    with log_lock:
                        logging.debug('\033[92mSubprocess {} thread {}: Processing row {} (y:{}->{}).\033[0m'
                                      .format(pid, tid, row_id, *y_range))
                    previous_row_id = row_id

                # get necessary data
                plm_precursor_mass[:block_height] = \
                    ranked_spectra.precursor_mass[y_range[0]:y_range[1]]
                plm_precursor_mass[block_height:block_height + block_width] = \
                    ranked_spectra.precursor_mass[x_range[0]:x_range[1]]
                plm_mz[:block_height] = ranked_spectra.mz[y_range[0]:y_range[1]]
                plm_mz[block_height:block_height + block_width] = \
                    ranked_spectra.mz[x_range[0]:x_range[1]]
                plm_intensity[:block_height] = \
                    ranked_spectra.intensity[y_range[0]:y_range[1]]
                plm_intensity[block_height:block_height + block_width] = \
                    ranked_spectra.intensity[x_range[0]:x_range[1]]
                plm_block_dimensions[:] = (block_height, block_width)
                plm_offset[:] = (y_range[0], x_range[0])

                # upload data
                drv.memcpy_htod_async(dvp_precursor_mass, plm_precursor_mass, cuda_stream)
                drv.memcpy_htod_async(dvp_mz, plm_mz, cuda_stream)
                drv.memcpy_htod_async(dvp_intensity, plm_intensity, cuda_stream)
                drv.memcpy_htod_async(dvp_block_dimensions, plm_block_dimensions, cuda_stream)
                drv.memcpy_htod_async(dvp_offset, plm_offset, cuda_stream)

                if reallocated:
                    allocation_size_divisor = allocation_size_initial_divisor
                    allocation_size = int(ref_block_height * ref_block_width
                                          / allocation_size_divisor)
                    # reallocate host pagelocked memory
                    del plm_edge
                    del plm_dot_product
                    plm_edge = drv.pagelocked_empty((allocation_size, 2),
                                                    dtype=CG_EDGE_DATA_TYPE)
                    plm_dot_product = drv.pagelocked_empty(allocation_size,
                                                           dtype=CG_DOT_PRODUCT_DATA_TYPE)
                    # reallocate device memory
                    del dvp_edge
                    del dvp_dot_product
                    dvp_edge = drv.mem_alloc_like(plm_edge)
                    dvp_dot_product = drv.mem_alloc_like(plm_dot_product)
                    with log_lock:
                        logging.debug('\033[92mSubprocess {} thread {}: Reset memory allocation size divisor to {}.\033[0m'
                                      .format(pid, tid, allocation_size_divisor))
                    reallocated = False

                cublockdim = (cuda_block_dimensions[1], cuda_block_dimensions[0], 1)
                cugriddim = (math.ceil(block_width / cuda_block_dimensions[1]),
                             math.ceil(block_height / cuda_block_dimensions[0]))

                while True:
                    plm_allocation_size[0] = allocation_size
                    plm_counter[0] = 0
                    plm_overflowed[0] = False
                    drv.memcpy_htod_async(dvp_allocation_size, plm_allocation_size, cuda_stream)
                    drv.memcpy_htod_async(dvp_counter, plm_counter, cuda_stream)
                    drv.memcpy_htod_async(dvp_overflowed, plm_overflowed, cuda_stream)
                    cuda_kernel.prepared_async_call(
                        cugriddim, cublockdim, cuda_stream, dvp_precursor_mass,
                        dvp_mz, dvp_intensity, dvp_block_dimensions, dvp_offset,
                        dvp_allocation_size, dvp_counter, dvp_edge,
                        dvp_dot_product, dvp_overflowed)
                    # transfer computation result from device to host
                    drv.memcpy_dtoh_async(plm_edge, dvp_edge, cuda_stream)
                    drv.memcpy_dtoh_async(plm_counter, dvp_counter, cuda_stream)
                    drv.memcpy_dtoh_async(plm_overflowed, dvp_overflowed, cuda_stream)
                    drv.memcpy_dtoh_async(plm_dot_product, dvp_dot_product, cuda_stream)
                    cuda_stream.synchronize()

                    if plm_overflowed[0]:
                        allocation_size_divisor = int(allocation_size_divisor / 2)
                        if allocation_size_divisor < 1:
                            err_msg = ('\nSubprocess {} thread {}: Allocation size divisor reached the impossible value of {}.'
                                       .format(pid, tid, allocation_size_divisor))
                            with log_lock:
                                logging.error(err_msg)
                            raise Exception(err_msg)
                        with log_lock:
                            logging.debug('\033[92mSubprocess {} thread {}: Edge list overflowed, '
                                          'decreases allocation size divisor to {}.\033[0m'
                                          .format(pid, tid, allocation_size_divisor))
                        allocation_size = int(block_width * block_height / allocation_size_divisor)
                        # reallocate host pagelocked memory
                        del plm_edge
                        del plm_dot_product
                        plm_edge = drv.pagelocked_empty((allocation_size, 2),
                                                        dtype=CG_EDGE_DATA_TYPE)
                        plm_dot_product = drv.pagelocked_empty(allocation_size,
                                                               dtype=CG_DOT_PRODUCT_DATA_TYPE)
                        # reallocate device memory
                        del dvp_edge
                        del dvp_dot_product
                        dvp_edge = drv.mem_alloc_like(plm_edge)
                        dvp_dot_product = drv.mem_alloc_like(plm_dot_product)
                        reallocated = True
                        continue
                    else:
                        break

                if abs(plm_precursor_mass[block_height - 1]
                       - plm_precursor_mass[block_height + block_width - 1]) > precursor_tolerance:
                    dispatcher.next_row(pid, tid)

                with merge_lock:
                    edge_list_size = int(plm_counter[0])
                    if edge_list_size != 0:
                        total_edge_count.value += edge_list_size
                        edg = np.memmap(str(edg_path), dtype=CG_EDGE_DATA_TYPE,
                                        mode='r+', shape=(total_edge_count.value, 2))
                        dps = np.memmap(str(dps_path), dtype=CG_DOT_PRODUCT_DATA_TYPE,
                                        mode='r+', shape=total_edge_count.value)
                        edg[-edge_list_size:] = plm_edge[:edge_list_size]
                        dps[-edge_list_size:] = plm_dot_product[:edge_list_size]
            except Exception:
                err_msg = '\nSubprocess {} thread {}: Failed to cluster block (y:{}->{}, x:{}->{}).' \
                    .format(pid, tid, y_range[0], y_range[1], x_range[0], x_range[1])
                with log_lock:
                    logging.error(err_msg)
                raise

        with log_lock:
            if not exit_signal.value:
                logging.debug('Subprocess {} thread {}: Reached the end of iteration, work done.'
                              .format(pid, tid))
        cuda_context.pop()

    except (Exception, KeyboardInterrupt) as e:
        if type(e) is KeyboardInterrupt:
            with log_lock:
                logging.debug('Subprocess {} thread {}: Received KeyboardInterrupt, exits now.'
                              .format(pid, tid))
        else:
            with log_lock:
                logging.exception('\nSubprocess {} thread {}: Ended unexpectedly. Logging traceback:\n'
                                  '==========TRACEBACK==========\n'.format(pid, tid))
        exit_signal.value = True
        exit_state.value = 1
        cuda_context.pop()
        return
""") # Kernel Function Declaration createKernel = mod.get_function("createKernel") gaussianBlur = mod.get_function("gaussianBlur") sobelFilter = mod.get_function("sobelFilter") nonMaxSuppress = mod.get_function("nonMaxSuppress") threshold = mod.get_function("threshold") # Gaussian Filter Create size = numpy.uint8(5) sig = numpy.int32(2) s = numpy.int32(0) k = numpy.zeros(size * size, dtype=numpy.float32) kernel = cuda.mem_alloc_like(k) createKernel(kernel, size, sig, grid=(1, 1), block=(int(size), 1, 1), shared=int(size * size)) # Gaussian Blur colorImg = cv2.imread('Original.jpg') img = cv2.cvtColor(colorImg, cv2.COLOR_BGR2GRAY) blur = numpy.zeros(img.shape, dtype=numpy.uint8) width, height = img.shape i = copy.deepcopy(img) d_i = cuda.mem_alloc_like(i) d_res = cuda.mem_alloc_like(i)