def interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors):
    """Run the mask kernel on a device copy of ``source_im`` and read it back.

    The kernel source is rendered from the ``mask_source`` Cheetah template,
    compiled with ``cuda_compile`` and launched once with the device copy of
    ``source_im`` as its only argument.

    Args:
        source_im: Host array uploaded to the GPU; its ``nbytes`` sizes the
            device allocation.
        dest_im: Host array whose ``shape[1]`` / ``shape[0]`` become the
            template's WIDTH / HEIGHT and whose layout is used for the
            returned buffer.
        b_size: CUDA block dimensions; the first two entries are also
            substituted into the template as BLOCK_DIM_X / BLOCK_DIM_Y.
        g_size: CUDA grid dimensions for the launch.
        RGB: Value substituted for the template's RGB placeholder.
        neighbors: Value substituted for the template's NEIGHBORS placeholder.

    Returns:
        A ``np.uint8`` array laid out like ``dest_im`` and overwritten with
        the bytes read back from the device buffer (the "interior buffer",
        per the original author's naming).
    """
    # Render the mask kernel source by filling in the template placeholders.
    kernel_template = Template(mask_source)
    placeholders = (
        ("BLOCK_DIM_X", b_size[0]),
        ("BLOCK_DIM_Y", b_size[1]),
        ("WIDTH", dest_im.shape[1]),
        ("HEIGHT", dest_im.shape[0]),
        ("RGB", RGB),
        ("NEIGHBORS", neighbors),
    )
    for placeholder, value in placeholders:
        setattr(kernel_template, placeholder, value)

    # Compile the rendered source and grab the kernel entry point.
    launch_mask = cuda_compile(kernel_template, "mask_kernel")

    # Upload the source image; the kernel operates on this device copy.
    device_image = cu.mem_alloc(source_im.nbytes)
    cu.memcpy_htod(device_image, source_im)

    launch_mask(device_image, block=b_size, grid=g_size)

    # Host-side landing buffer, initialised as a uint8 copy of dest_im.
    # NOTE(review): assumes this buffer and source_im occupy the same number
    # of bytes, since the device allocation was sized from source_im -- confirm.
    host_result = np.array(dest_im, dtype=np.uint8)
    cu.memcpy_dtoh(host_result, device_image)

    return host_result
def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors,
                     interior_buffer, n):
    """Launch the Poisson blending kernel ``n`` times and return the result.

    The kernel source is rendered from the ``poisson_blending_source``
    Cheetah template and compiled with ``cuda_compile``. The three host
    arrays are uploaded once, the kernel is launched ``n`` times against
    those same device buffers, and the destination buffer is read back.

    Args:
        source_im: Host array uploaded as the kernel's first argument.
        dest_im: Host array uploaded as the kernel's second argument; also
            supplies the template's WIDTH (``shape[1]``) and HEIGHT
            (``shape[0]``) and the layout of the returned image.
        b_size: CUDA block dimensions; the first two entries are also
            substituted into the template as BLOCK_DIM_X / BLOCK_DIM_Y.
        g_size: CUDA grid dimensions for every launch.
        RGB: Value substituted for the template's RGB placeholder.
        neighbors: Value substituted for the template's NEIGHBORS placeholder.
        interior_buffer: Host array uploaded as the kernel's third argument.
            (Inside this function the name shadows the module-level
            ``interior_buffer`` function.)
        n: Number of kernel launches.

    Returns:
        A ``np.uint8`` array laid out like ``dest_im`` holding the bytes read
        back from the destination device buffer after the last launch.
    """
    # Render the Poisson kernel source by filling in the template placeholders.
    kernel_template = Template(poisson_blending_source)
    placeholders = (
        ("BLOCK_DIM_X", b_size[0]),
        ("BLOCK_DIM_Y", b_size[1]),
        ("WIDTH", dest_im.shape[1]),
        ("HEIGHT", dest_im.shape[0]),
        ("RGB", RGB),
        ("NEIGHBORS", neighbors),
    )
    for placeholder, value in placeholders:
        setattr(kernel_template, placeholder, value)

    # Compile the rendered source and grab the kernel entry point.
    blend_step = cuda_compile(kernel_template, "poisson_blending_kernel")

    # Host-side landing buffer for the blended image.
    blended = np.array(dest_im, dtype=np.uint8)

    # Allocate all three device buffers first, then upload each host array.
    host_arrays = (source_im, dest_im, interior_buffer)
    device_buffers = [cu.mem_alloc(arr.nbytes) for arr in host_arrays]
    for device_ptr, arr in zip(device_buffers, host_arrays):
        cu.memcpy_htod(device_ptr, arr)
    dev_source, dev_dest, dev_interior = device_buffers

    # Every launch reuses the same device buffers.
    for _ in range(n):
        blend_step(dev_source, dev_dest, dev_interior,
                   block=b_size, grid=g_size)

    # Fetch the destination buffer back into the host image.
    cu.memcpy_dtoh(blended, dev_dest)
    return blended
def make_frame_processor(frame_shape, frame_filter_data):
    """Return a function that takes a frame and returns the filtered frame.

    The filter weights and their pixel offsets are baked into the CUDA source
    rendered from the ``filter_source`` Cheetah template, so the kernel is
    compiled once here and reused for every frame.

    Args:
        frame_shape: Three-element shape of the frames to process, unpacked
            as ``(HEIGHT, WIDTH, _)``.
        frame_filter_data: Four-element sequence ``(size, scale, offset, F)``.
            ``F`` is reshaped to ``size`` and divided by ``scale`` to obtain
            the filter weights. ``offset`` is unpacked but not used here.
            Only ``size[0]`` is used to derive the neighbour offsets, so the
            filter is treated as square.

    Returns:
        ``processor(frame)``: uploads ``frame`` to the GPU, runs the filter
        kernel, writes the result back into ``frame`` in place and returns
        that same array.
    """
    size, scale, _offset, F = frame_filter_data
    frame_filter = np.asarray(F, dtype=np.float32).reshape(size) / scale

    # Offsets run from -half to +half around the centre pixel. Floor
    # division keeps the bounds integral under Python 3 or
    # ``from __future__ import division`` as well; range() needs ints.
    half_width = (size[0] - 1) // 2
    min_offset = -half_width
    max_offset = half_width + 1

    # Pair each (row offset, column offset) with the filter weight at the
    # same row-major position, giving [offset a, offset b, filter value].
    span = range(min_offset, max_offset)
    offsets = ((a, b) for a in span for b in span)
    weights = (weight for row in frame_filter for weight in row)
    final_list = [[a, b, weight] for (a, b), weight in zip(offsets, weights)]

    # Block size (threads per block)
    b_size = (32, 32, 1)
    # Single-argument call form prints identically on Python 2 and 3.
    print('frame shape: ' + str(frame_shape))
    # Grid size (blocks per grid): enough blocks to cover the whole frame.
    g_size = (int(np.ceil(float(frame_shape[1]) / b_size[0])),
              int(np.ceil(float(frame_shape[0]) / b_size[1])))

    # Initialize the template and hard-code the filter-specific values.
    template = Template(filter_source)
    template.LIST = final_list
    template.HEIGHT, template.WIDTH, _ = frame_shape
    template.MAX_OFF = max_offset - 1
    template.MIN_OFF = min_offset

    # Compile the CUDA kernel and keep a handle to its entry point.
    module = nvcc.SourceModule(template)
    filter_kernel = module.get_function("filter_kernel")

    def processor(frame):
        """Apply the compiled filter kernel to ``frame`` and return it.

        The result is copied back into ``frame`` itself, so the caller's
        array is overwritten.
        """
        # Allocate device memory and transfer from host to device. The
        # output buffer is also seeded with the input frame before launch.
        d_frame_in = cu.mem_alloc(frame.nbytes)
        d_frame_out = cu.mem_alloc(frame.nbytes)
        cu.memcpy_htod(d_frame_in, frame)
        cu.memcpy_htod(d_frame_out, frame)

        filter_kernel(d_frame_in, d_frame_out, block=b_size, grid=g_size)

        # Transfer the filtered frame from device back to host.
        cu.memcpy_dtoh(frame, d_frame_out)
        return frame

    # Return the function
    return processor