def convert_image_yuv(self, image):
    """Convert a 3-plane image to a packed image using the OpenCL kernel.

    Each input plane is uploaded as a single-channel 8-bit OpenCL image,
    ``self.kernel_function`` is run over work dimensions derived from the
    destination size and the subsampling of ``self.src_format``, and the
    result is read back into host memory.

    :param image: source ``ImageWrapper``; it must hold 3 planes, use
        ``self.src_format`` and be at least ``src_width`` x ``src_height``.
    :return: a new packed ``ImageWrapper`` of ``dst_width`` x ``dst_height``
        pixels in ``self.dst_format`` with a rowstride of ``dst_width*4``.
    :raises AssertionError: if the input image does not meet the
        requirements listed above.
    """
    start = time.time()
    iplanes = image.get_planes()
    width = image.get_width()
    height = image.get_height()
    strides = image.get_rowstride()
    pixels = image.get_pixels()
    assert iplanes==ImageWrapper._3_PLANES, "we only handle planar data as input!"
    assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
    assert len(strides)==len(pixels)==3, "invalid number of planes or strides (should be 3)"
    assert width>=self.src_width and height>=self.src_height, "expected source image with dimensions of at least %sx%s but got %sx%s" % (self.src_width, self.src_height, width, height)

    #adjust work dimensions for subsampling:
    #(we process N pixels at a time in each dimension)
    divs = get_subsampling_divs(self.src_format)
    wwidth = dimdiv(self.dst_width, max(x_div for x_div, _ in divs))
    wheight = dimdiv(self.dst_height, max(y_div for _, y_div in divs))
    globalWorkSize, localWorkSize = self.get_work_sizes(wwidth, wheight)

    kernelargs = [self.queue, globalWorkSize, localWorkSize]

    iformat = pyopencl.ImageFormat(pyopencl.channel_order.R, pyopencl.channel_type.UNSIGNED_INT8)
    input_images = []
    for i in range(3):
        _, y_div = divs[i]
        plane = pixels[i]
        if isinstance(plane, str):
            #strings get copied to the device:
            flags = mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR
        else:
            #other buffer types are used in place:
            flags = mem_flags.READ_ONLY | mem_flags.USE_HOST_PTR
        #floor division: the image shape must be made of integers,
        #a plain "/" yields a float when true division is in effect
        shape = strides[i], self.src_height//y_div
        iimage = pyopencl.Image(self.context, flags, iformat, shape=shape, hostbuf=plane)
        input_images.append(iimage)

    #output image:
    oformat = pyopencl.ImageFormat(self.channel_order, pyopencl.channel_type.UNORM_INT8)
    oimage = pyopencl.Image(self.context, mem_flags.WRITE_ONLY, oformat, shape=(self.dst_width, self.dst_height))

    kernelargs += input_images + [numpy.int32(self.src_width), numpy.int32(self.src_height),
                                  numpy.int32(self.dst_width), numpy.int32(self.dst_height),
                                  self.sampler, oimage]

    kstart = time.time()
    log("convert_image(%s) calling %s%s after upload took %.1fms",
        image, self.kernel_function_name, tuple(kernelargs), 1000.0*(kstart-start))
    self.kernel_function(*kernelargs)
    self.queue.finish()
    #free input images:
    for iimage in input_images:
        iimage.release()
    kend = time.time()
    log("%s took %.1fms", self.kernel_function, 1000.0*(kend-kstart))

    #4 bytes per output pixel:
    out_array = numpy.empty(self.dst_width*self.dst_height*4, dtype=numpy.byte)
    pyopencl.enqueue_read_image(self.queue, oimage, (0, 0), (self.dst_width, self.dst_height), out_array)
    self.queue.finish()
    #the pixels now live in out_array, so free the output image
    #just like the input images above:
    oimage.release()
    log("readback using %s took %.1fms", CHANNEL_ORDER_TO_STR.get(self.channel_order), 1000.0*(time.time()-kend))
    self.time += time.time()-start
    self.frames += 1
    return ImageWrapper(0, 0, self.dst_width, self.dst_height, out_array.data, self.dst_format, 24, self.dst_width*4, planes=ImageWrapper.PACKED)
def gpu_gradient():
    """Run the ``gradient`` OpenCL kernel on the image named on the command line.

    The input image (``sys.argv[1]``) is converted to RGBA if needed and
    uploaded as an OpenCL image. The ``gradient`` kernel from ``gradient.cl``
    is run with one work-item per pixel. The result is saved as
    ``"GPU_" + sys.argv[2]`` and the elapsed time (kernel launch through
    saving the file) is printed.

    Returns:
        1 when the wrong number of command-line arguments was given,
        otherwise None.
    """
    if len(sys.argv) != 3:
        print("USAGE: " + sys.argv[0] + " <inputImageFile> <outputImageFile>")
        return 1

    # create context and command queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # load image
    im = Image.open(sys.argv[1])
    if im.mode != "RGBA":
        im = im.convert("RGBA")
    imgSize = im.size
    # NOTE(review): tostring()/fromstring() are the legacy PIL spellings of
    # tobytes()/frombytes() - confirm which PIL/Pillow version is targeted.
    pixels = im.tostring()  # len(pixels) = imgSize[0] * imgSize[1] * 4

    # Create input and output image objects
    clImageFormat = cl.ImageFormat(cl.channel_order.RGBA,
                                   cl.channel_type.UNSIGNED_INT8)
    input_image = cl.Image(ctx,
                           cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                           clImageFormat,
                           imgSize,
                           None,
                           pixels)
    output_image = cl.Image(ctx,
                            cl.mem_flags.WRITE_ONLY,
                            clImageFormat,
                            imgSize)

    # load the kernel source code; the context manager closes the file
    # handle, which used to be left open
    with open("gradient.cl", "r") as kernelFile:
        kernelSrc = kernelFile.read()

    # Create OpenCL program
    program = cl.Program(ctx, kernelSrc).build()

    # Call the kernel directly
    globalWorkSize = (imgSize[0], imgSize[1])
    gpu_start_time = time()
    program.gradient(queue,
                     globalWorkSize,
                     None,
                     input_image,
                     output_image)

    # Read the output buffer back to the Host
    result = numpy.zeros(imgSize[0] * imgSize[1] * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (imgSize[0], imgSize[1], 1)
    cl.enqueue_read_image(queue, output_image,
                          origin, region, result).wait()

    # Save the image to disk
    gsim = Image.fromstring("RGBA", imgSize, result.tostring())
    gsim.save("GPU_" + sys.argv[2])
    gpu_end_time = time()
    print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))
def main():
    """Apply the ``gaussian_filter`` kernel from ImageFilter2D.cl to an image.

    Usage: ``<script> <inputImageFile> <outputImageFile>``.

    Returns:
        1 on a usage error, when no OpenCL context could be created, or when
        the device reports no image support; None after a successful run.
    """
    if len(sys.argv) != 3:
        print("USAGE: " + sys.argv[0] + " <inputImageFile> <outputImageFile>")
        return 1

    # Create an OpenCL context on first available platform
    context, device = CreateContext()
    # identity test: "== None" would go through the object's __eq__
    if context is None:
        print("Failed to create OpenCL context.")
        return 1

    # Create a command-queue on the first device available
    commandQueue = cl.CommandQueue(context, device)

    # Make sure the device supports images, otherwise exit
    if not device.get_info(cl.device_info.IMAGE_SUPPORT):
        print("OpenCL device does not support images.")
        return 1

    # Load input image from file and load it into
    # an OpenCL image object
    inputImage, imgSize = LoadImage(context, sys.argv[1])

    # Create output image object
    clImageFormat = cl.ImageFormat(cl.channel_order.RGBA,
                                   cl.channel_type.UNORM_INT8)
    outputImage = cl.Image(context,
                           cl.mem_flags.WRITE_ONLY,
                           clImageFormat,
                           imgSize)

    # Create sampler for sampling image object
    sampler = cl.Sampler(context,
                         False,  # Non-normalized coordinates
                         cl.addressing_mode.CLAMP_TO_EDGE,
                         cl.filter_mode.NEAREST)

    # Create OpenCL program
    program = CreateProgram(context, device, "ImageFilter2D.cl")

    # Call the kernel directly
    # (RoundUp presumably pads each image extent to a multiple of the
    # work-group size - verify against its definition)
    localWorkSize = (16, 16)
    globalWorkSize = (RoundUp(localWorkSize[0], imgSize[0]),
                      RoundUp(localWorkSize[1], imgSize[1]))

    program.gaussian_filter(commandQueue,
                            globalWorkSize,
                            localWorkSize,
                            inputImage,
                            outputImage,
                            sampler,
                            numpy.int32(imgSize[0]),
                            numpy.int32(imgSize[1]))

    # Read the output buffer back to the Host (4 bytes per pixel)
    result = numpy.zeros(imgSize[0] * imgSize[1] * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (imgSize[0], imgSize[1], 1)

    cl.enqueue_read_image(commandQueue, outputImage,
                          origin, region, result).wait()

    print("Executed program successfully.")

    # Save the image to disk
    SaveImage(sys.argv[2], result, imgSize)
def get(self, **kwargs):
    """Read this image back from the device into a new numpy array.

    The host array uses the reversed image shape, with the channel count
    appended as a trailing axis when the image format has more than one
    channel.

    :param kwargs: accepted for call compatibility; not used.
    :return: a newly allocated array of dtype ``self.dtype`` holding the
        image content.
    """
    queue = get_device().queue
    # objects without a ``shape`` attribute are treated as 1D images
    if hasattr(self, "shape"):
        imshape = self.shape
    else:
        imshape = (self.width,)
    ndim = len(imshape)
    dshape = imshape[::-1]
    if self.format.channel_count > 1:
        dshape += (self.format.channel_count,)
    out = np.empty(dshape, dtype=self.dtype)
    # the origin needs one entry per *image* dimension; sizing it from
    # dshape added a spurious component for multi-channel images
    pyopencl.enqueue_read_image(queue, self, [0] * ndim, imshape, out)
    return out
def get(self, **kwargs):
    """Copy the content of this device image into a fresh numpy array.

    The returned array has the image shape in reversed order; if the image
    format holds more than one channel, the channel count becomes an extra
    trailing axis.

    :param kwargs: accepted for call compatibility; not used.
    :return: a newly allocated array of dtype ``self.dtype``.
    """
    queue = get_device().queue
    # fall back to a 1D shape when no ``shape`` attribute is present
    if hasattr(self, "shape"):
        imshape = self.shape
    else:
        imshape = (self.width, )
    dshape = imshape[::-1]
    if self.format.channel_count > 1:
        dshape += (self.format.channel_count, )
    out = np.empty(dshape, dtype=self.dtype)
    # origin is expressed in image coordinates, so its length follows the
    # image rank and not the host array rank (which may include channels)
    origin = [0] * len(imshape)
    pyopencl.enqueue_read_image(queue, self, origin, imshape, out)
    return out
def get(self, **kwargs):
    """Read this image back from the device into a new numpy array.

    The host array has the reversed image shape, plus a trailing channel
    axis when ``self.num_channels`` is greater than one.

    :param kwargs: accepted but not used.
    :return: the newly allocated array of dtype ``self.dtype``.
    """
    queue = get_device().queue
    # objects without a ``shape`` attribute are treated as 1D images
    image_shape = self.shape if hasattr(self, "shape") else (self.width, )
    host_shape = image_shape[::-1]
    rank = len(image_shape)
    if self.num_channels > 1:
        host_shape += (self.num_channels, )
    host_array = np.empty(host_shape, dtype=self.dtype)
    cl.enqueue_read_image(queue, self, [0] * rank, image_shape, host_array)
    return host_array
def get(self, **kwargs):
    """Download the image from the device and return it as a numpy array.

    The result is shaped like the reversed image shape; images with more
    than one channel get the channel count as an extra last axis.

    :param kwargs: accepted but not used.
    :return: a new array of dtype ``self.dtype``.
    """
    queue = get_device().queue
    if not hasattr(self, "shape"):
        # no shape attribute: treat as a 1D image of length ``width``
        region = (self.width,)
    else:
        region = self.shape
    result_shape = region[::-1]
    origin = [0] * len(region)
    if self.num_channels > 1:
        result_shape += (self.num_channels,)
    result = np.empty(result_shape, dtype=self.dtype)
    cl.enqueue_read_image(queue, self, origin, region, result)
    return result
def parallel_prediction_errors(self, image):
    """
    Compute the MILC prediction errors of a 3D image with the OpenCL program

    Keyword arguments:
    image -- a 3D numpy array (bitmap image)

    Return: a 3D numpy array with the same shape as "image", holding the
    prediction errors
    """
    flags = cl.mem_flags

    # Single-channel format used to store the prediction errors
    error_format = cl.ImageFormat(channel_order=cl.channel_order.R,
                                  channel_type=DataType.CL_ERR.value)

    # Upload the numpy 3D array as the input image
    device_input = cl.image_from_array(self.ctx, image)

    host_shape = numpy.shape(image)
    # the device-side shape is the numpy shape reversed
    # (the original author suspected a pyOpenCL quirk here)
    device_shape = list(reversed(host_shape))

    # Image the kernel writes its result into
    device_output = cl.Image(self.ctx, flags.WRITE_ONLY, error_format,
                             shape=device_shape)

    # sampler. pixels out of range have a value of '0'
    pixel_sampler = cl.Sampler(self.ctx, False,
                               cl.addressing_mode.CLAMP,
                               cl.filter_mode.NEAREST)

    # launch the kernel over the numpy shape of the input
    self.program.image_test(self.queue, host_shape, None,
                            device_input, device_output, pixel_sampler)

    # fetch the result and hand it back in the caller's shape
    errors = numpy.empty(shape=device_shape, dtype=DataType.ERR.value)
    cl.enqueue_read_image(self.queue, device_output, (0, 0, 0),
                          device_shape, errors)
    return errors.reshape(host_shape)
def main():
    """Compare a numpy pan-sharpening computation with its OpenCL version.

    Loads ``test_mul.jpg`` and ``test_pan.jpg``, computes the sharpened
    R/G/B bands with numpy, then runs the ``calculate`` kernel from
    ``kernel.cl`` on the same data (the pan image is passed in the alpha
    channel). The OpenCL result is saved to ``cl_output.jpg`` and compared
    element-wise with the numpy result; timings and the comparison outcome
    are printed.
    """
    k = 0.5
    mul = Image.open("test_mul.jpg")
    pan = Image.open("test_pan.jpg")
    mul_data = np.array(mul)
    pan_data = np.array(pan)

    r = mul_data[:, :, 0]
    g = mul_data[:, :, 1]
    b = mul_data[:, :, 2]

    # work in float32
    pan_data = pan_data.astype(np.float32)
    r = r.astype(np.float32)
    g = g.astype(np.float32)
    b = b.astype(np.float32)

    if __debug__:
        print("pan type: " + str(type(pan_data[0,0])))
        print("mul type: " + str(type(mul_data[:,:,0][0,0])))
        print("r: " + str(type(r[0,0])))
        print("g: " + str(type(g[0,0])))
        print("b: " + str(type(b[0,0])))

    time_start = time.time()

    i = (r * 0.171 + g * 0.2 + b * 0.171) / 0.632
    if __debug__:
        print("i: " + str(type(i[0,0])))

    kx__pan_minus_iii = k * (pan_data - i)
    if __debug__:
        print("kx__pan_minus_iii: " + str(type(kx__pan_minus_iii[0,0])))

    # a zero denominator yields a coefficient of 0 instead of inf/nan
    with np.errstate(divide='ignore', invalid='ignore'):
        denominator = (i + kx__pan_minus_iii)
        coe = pan_data / denominator
        coe[denominator == 0] = 0
    if __debug__:
        print("coe: " + str(type(coe[0,0])))

    nr = coe * (r + kx__pan_minus_iii)
    ng = coe * (g + kx__pan_minus_iii)
    nb = coe * (b + kx__pan_minus_iii)

    output_img = np.empty_like(mul_data)
    if __debug__:
        print("nr: " + str(type(nr[0,0])))

    finish_time = time.time() - time_start

    # remove nan
    nr = np.nan_to_num(nr)
    ng = np.nan_to_num(ng)
    nb = np.nan_to_num(nb)

    # overflow: clamp values above 255
    nr[nr > 255] = 255
    ng[ng > 255] = 255
    nb[nb > 255] = 255

    # values below 0 do not become 0 by default either
    # (this is really an overflow problem too)
    nr[nr < 0] = 0
    ng[ng < 0] = 0
    nb[nb < 0] = 0

    # the default conversion truncates instead of rounding to nearest,
    # so round explicitly
    nr = np.round(nr)
    ng = np.round(ng)
    nb = np.round(nb)

    output_img[:, :, 0] = nr
    output_img[:, :, 1] = ng
    output_img[:, :, 2] = nb

    print("finish time:" + str(finish_time) + " s")

    ## ===========================================================

    # Prefer the second device of the first platform, as before, but fall
    # back to the first one when the platform exposes a single device
    # (indexing [1] unconditionally raised IndexError there).
    devices = cl.get_platforms()[0].get_devices()
    device = devices[1] if len(devices) > 1 else devices[0]
    ctx = cl.Context([device])
    queue = cl.CommandQueue(ctx,
                            properties=cl.command_queue_properties.PROFILING_ENABLE)

    # convert image
    if mul.mode != "RGBA":
        mul = mul.convert("RGBA")
    mulSize = mul.size

    # set mul alpha to pan
    mul.putalpha(pan)
    mul_str = mul.tostring()

    # image format
    mulImageFormat = cl.ImageFormat(cl.channel_order.RGBA,
                                    cl.channel_type.UNSIGNED_INT8)

    time_start = time.time()
    # input buf (mul+pan)
    mul_buf = cl.Image(ctx,
                       cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                       mulImageFormat,
                       mulSize,
                       None,
                       mul_str)

    # create output image object
    result_buf = cl.Image(ctx,
                          cl.mem_flags.WRITE_ONLY,
                          mulImageFormat,
                          mulSize)
    finish_time = time.time() - time_start
    print("host to device finish time:" + str(finish_time) + " s")

    # load kernel.cl; the context manager closes the file handle, which
    # used to be left open
    with open("kernel.cl", "r") as kernelFile:
        kernelSrc = kernelFile.read()

    # create OpenCL program
    program = cl.Program(ctx, kernelSrc).build()

    # program parameter
    localWorkSize = (16, 16)
    globalWorkSize = (RoundUp(localWorkSize[0], mulSize[0]),
                      RoundUp(localWorkSize[1], mulSize[1]))

    # warm up
    program.calculate(queue,
                      globalWorkSize,
                      localWorkSize,
                      mul_buf,
                      result_buf)

    # execute
    exec_evt = program.calculate(queue,
                                 globalWorkSize,
                                 localWorkSize,
                                 mul_buf,
                                 result_buf)
    exec_evt.wait()
    # profiling counters are in nanoseconds
    elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)

    print("OpenCL execute success!")
    print("OpenCL finish time: %g s" % elapsed)

    # read output buffer to the Host
    mul_str = np.zeros(mulSize[0] * mulSize[1] * 4, np.uint8)
    origin = (0, 0, 0)
    region = (mulSize[0], mulSize[1], 1)

    exec_evt = cl.enqueue_read_image(queue, result_buf,
                                     origin, region, mul_str)
    exec_evt.wait()
    elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)

    print("OpenCL get result success!")
    print("OpenCL get result finish time: %g s" % elapsed)

    # save image
    cl_output_img = Image.fromstring("RGBA", mulSize, mul_str.tostring())
    cl_output_img.save("cl_output.jpg")

    # check result: compare the RGB channels only
    result_data = np.array(cl_output_img)[:,:,[0,1,2]].ravel()
    output_img = output_img.ravel()

    compare = (output_img == result_data)
    equal = np.all(compare)

    if not equal:
        print("Results doesn't match!!")
        print(output_img)
        print("================================================================")
        print(result_data)
        print("================================================================")
        com_index = np.where(compare == False)[0]
        print("Different index length: " + str(len(com_index)))
    else:
        print("Results OK")
def main():
    """Run the ``quad_swap`` kernel on a source image and a palette image.

    Usage: ``<script> <source> <palette> <output>``. Both images must
    contain the same number of pixels. The kernel output image is read back
    and stored with ``SaveImage`` under the output name.

    Returns:
        1 on a usage error, when no OpenCL context could be created, when
        the device reports no image support, or when the pixel counts
        differ; None after a successful run.
    """
    # ask pyopencl to show compiler output
    os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'

    if len(sys.argv) != 4:
        print("USAGE: " + sys.argv[0] + " <source> <palette> <output>")
        return 1

    # Create an OpenCL context on first available platform
    context, device = CreateContext()
    # identity test: "== None" would go through the object's __eq__
    if context is None:
        print("Failed to create OpenCL context.")
        return 1

    # Create a command-queue on the first device available
    # on the created context
    commandQueue = cl.CommandQueue(context, device)

    # Make sure the device supports images, otherwise exit
    if not device.get_info(cl.device_info.IMAGE_SUPPORT):
        print("OpenCL device does not support images.")
        return 1

    # Load input images from file and load them into
    # OpenCL image objects
    sourceImage, srcSize = LoadImage(context, sys.argv[1])
    paletteImage, palSize = LoadImage(context, sys.argv[2])
    pixels = srcSize[0] * srcSize[1]
    if palSize[0] * palSize[1] != pixels:
        print("Images do not contain the same number of pixels.")
        return 1

    # TODO: Rearrange the palette to the correct dimensions here

    # Create output image object
    clImageFormat = cl.ImageFormat(cl.channel_order.RGBA,
                                   cl.channel_type.UNORM_INT8)
    # scratch buffer of 16 bytes per pixel handed to the kernel
    bufferObject = cl.Buffer(context, cl.mem_flags.READ_WRITE, pixels * 4 * 4)
    outputImage = cl.Image(context,
                           cl.mem_flags.WRITE_ONLY,
                           clImageFormat,
                           srcSize)

    # Create sampler for sampling image object
    sampler = cl.Sampler(context,
                         False,  # Non-normalized coordinates
                         cl.addressing_mode.CLAMP_TO_EDGE,
                         cl.filter_mode.NEAREST)

    # Create OpenCL program
    program = CreateProgram(context, device, "quad_swap.cl")

    # Call the kernel directly, sized for half the pixel count.
    # Floor division keeps the work size an integer; a plain "/" produces
    # a float when true division is in effect.
    localWorkSize = (16,)
    globalWorkSize = (RoundUp(localWorkSize[0], srcSize[0] * srcSize[1] // 2),)

    program.quad_swap(commandQueue,
                      globalWorkSize,
                      localWorkSize,
                      sourceImage,
                      paletteImage,
                      bufferObject,
                      outputImage,
                      sampler,
                      numpy.int32(srcSize[0]),
                      numpy.int32(srcSize[1]))

    # Read the output buffer back to the Host (4 bytes per pixel)
    result = numpy.zeros(srcSize[0] * srcSize[1] * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (srcSize[0], srcSize[1], 1)

    cl.enqueue_read_image(commandQueue, outputImage,
                          origin, region, result).wait()

    # Save the image to disk
    SaveImage(sys.argv[3], result, srcSize)
def main():
    """Threshold the R, G and B channels of an image with an inline kernel.

    Usage: ``<script> <inputImageFile> <outputImageFile>``. The input is
    converted to RGBA if needed, processed by the ``rgbaToGrayscale``
    kernel defined below (each of the first three channels becomes 255
    when it is >= 100, else 0) and saved under the output name. The time
    from kernel launch to the end of the readback is printed.

    Returns 1 on a usage error, otherwise None.
    """
    if len(sys.argv) != 3:
        print("USAGE: " + sys.argv[0] + " <inputImageFile> <outputImageFile>")
        return 1

    # OpenCL context and queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # source picture, forced to RGBA
    picture = Image.open(sys.argv[1])
    if picture.mode != "RGBA":
        picture = picture.convert("RGBA")
    imgSize = picture.size
    raw_pixels = picture.tostring()

    # device-side input and output images share one format
    rgba_u8 = cl.ImageFormat(cl.channel_order.RGBA,
                             cl.channel_type.UNSIGNED_INT8)
    source_image = cl.Image(ctx,
                            cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                            rgba_u8,
                            imgSize,
                            None,
                            raw_pixels)
    target_image = cl.Image(ctx,
                            cl.mem_flags.WRITE_ONLY,
                            rgba_u8,
                            imgSize)

    # kernel source code
    kernelSrc = """
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

__kernel void rgbaToGrayscale(__read_only image2d_t srcImg, __write_only image2d_t dstImg)
{
    // Converts RGBA image to gray scale intensity using the following formula:
    // I = 0.2126 * R + 0.7152 * G + 0.0722 * B
    int2 coord = (int2) (get_global_id(0), get_global_id(1));
    int width = get_image_width(srcImg);
    int height = get_image_height(srcImg);
    int threshold = 100;
    if (coord.x < width && coord.y < height)
    {
        uint4 color = read_imageui(srcImg, sampler, coord);
        if(color.x>=threshold){
            color.x = 255;
        }
        else{
            color.x = 0;
        }
        if(color.y>=threshold){
            color.y = 255;
        }
        else{
            color.y = 0;
        }
        if(color.z>=threshold){
            color.z = 255;
        }
        else{
            color.z = 0;
        }
        // Write the output value to image
        write_imageui(dstImg, coord, color);
    }
}"""

    # build the program
    program = cl.Program(ctx, kernelSrc).build()

    # launch geometry
    localWorkSize = (16, 16)
    globalWorkSize = tuple(RoundUp(localWorkSize[axis], imgSize[axis])
                           for axis in (0, 1))

    gr = time.time()
    program.rgbaToGrayscale(queue,
                            globalWorkSize,
                            localWorkSize,
                            source_image,
                            target_image)

    # fetch the result (4 bytes per pixel)
    host_pixels = numpy.zeros(imgSize[0] * imgSize[1] * 4, numpy.uint8)
    read_done = cl.enqueue_read_image(queue, target_image,
                                      (0, 0, 0),
                                      (imgSize[0], imgSize[1], 1),
                                      host_pixels)
    read_done.wait()
    print(time.time() - gr)
    print("Executed program succesfully.")

    # write the picture to disk
    gsim = Image.fromstring("RGBA", imgSize, host_pixels.tostring())
    gsim.save(sys.argv[2])
cl.channel_type.UNSIGNED_INT8), shape=OutImg.shape) prg = cl.Program(ctx, """ const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_LINEAR | CLK_ADDRESS_CLAMP_TO_EDGE; __kernel void ImageDS(__read_only image2d_t sourceImage, __write_only image2d_t targetImage) { int w = get_image_width(targetImage); int h = get_image_height(targetImage); int outX = get_global_id(0); int outY = get_global_id(1); int2 posOut = {outX, outY}; float inX = outX / (float) w; float inY = outY / (float) h; float2 posIn = (float2) (inX, inY); float4 pixel = read_imagef(sourceImage, sampler, posIn); write_imagef(targetImage, posOut, pixel); } """).build() prg.ImageDS(queue, OutImg.shape, None, dev_Img, dev_OutImg) cl.enqueue_read_image(queue, dev_OutImg, (0, 0), OutImg.shape, OutImg).wait() cv2.imwrite("out.jpg", OutImg)
def main():
    """Filter an image with the ``gaussian_filter`` kernel of ImageFilter2D.cl.

    Usage: ``<script> <inputImageFile> <outputImageFile>``.

    Returns:
        1 on a usage error, when no OpenCL context could be created, or when
        the device reports no image support; None after a successful run.
    """
    if len(sys.argv) != 3:
        print("USAGE: " + sys.argv[0] + " <inputImageFile> <outputImageFile>")
        return 1

    # Create an OpenCL context on first available platform
    context, device = CreateContext()
    # compare by identity; "== None" would go through the object's __eq__
    if context is None:
        print("Failed to create OpenCL context.")
        return 1

    # Create a command-queue on the first device available
    # on the created context
    commandQueue = cl.CommandQueue(context, device)

    # Make sure the device supports images, otherwise exit
    if not device.get_info(cl.device_info.IMAGE_SUPPORT):
        print("OpenCL device does not support images.")
        return 1

    # Load input image from file and load it into
    # an OpenCL image object
    inputImage, imgSize = LoadImage(context, sys.argv[1])

    # Create output image object
    clImageFormat = cl.ImageFormat(cl.channel_order.RGBA,
                                   cl.channel_type.UNORM_INT8)
    outputImage = cl.Image(context,
                           cl.mem_flags.WRITE_ONLY,
                           clImageFormat,
                           imgSize)

    # Create sampler for sampling image object
    sampler = cl.Sampler(context,
                         False,  # Non-normalized coordinates
                         cl.addressing_mode.CLAMP_TO_EDGE,
                         cl.filter_mode.NEAREST)

    # Create OpenCL program
    program = CreateProgram(context, device, "ImageFilter2D.cl")

    # Call the kernel directly
    # (RoundUp presumably pads each image extent to a multiple of the
    # work-group size - verify against its definition)
    localWorkSize = (16, 16)
    globalWorkSize = (RoundUp(localWorkSize[0], imgSize[0]),
                      RoundUp(localWorkSize[1], imgSize[1]))

    program.gaussian_filter(commandQueue,
                            globalWorkSize,
                            localWorkSize,
                            inputImage,
                            outputImage,
                            sampler,
                            numpy.int32(imgSize[0]),
                            numpy.int32(imgSize[1]))

    # Read the output buffer back to the Host (4 bytes per pixel)
    result = numpy.zeros(imgSize[0] * imgSize[1] * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (imgSize[0], imgSize[1], 1)

    cl.enqueue_read_image(commandQueue, outputImage,
                          origin, region, result).wait()

    print("Executed program succesfully.")

    # Save the image to disk
    SaveImage(sys.argv[2], result, imgSize)
def gpu_filter(in_put='in.jpg'):
    """Run the ``gaussian_filter`` kernel of ImageFilter2D.cl on an image file.

    :param in_put: path of the image to load (passed to
        ``GpuFilter.load_image``).
    :return: the value of ``GpuFilter.image_convert`` applied to the pixels
        read back from the device and the image size.
    :raises SystemExit: with status 1 when no OpenCL context could be
        created or the device reports no image support. The original bare
        ``exit()`` terminated with status 0 after printing a failure
        message and relied on the ``site``-provided ``exit`` helper.
    """
    gpu_filter_ = GpuFilter()

    # Create an OpenCL context on first available platform
    context, device = gpu_filter_.create_context()
    if context is None:
        print("Failed to create OpenCL context.")
        raise SystemExit(1)

    # Create a command-queue on the first device available on the context
    # that has been created
    command_queue = cl.CommandQueue(context, device)

    # Make sure the device supports images, otherwise exit
    if not device.get_info(cl.device_info.IMAGE_SUPPORT):
        print("OpenCL device does not support images.")
        raise SystemExit(1)

    # Load input image from file and load it into an OpenCL image object
    input_image, img_size = gpu_filter_.load_image(context, in_put)

    # Create output image object
    cl_image_format = cl.ImageFormat(cl.channel_order.RGBA,
                                     cl.channel_type.UNORM_INT8)
    output_image = cl.Image(context,
                            cl.mem_flags.WRITE_ONLY,
                            cl_image_format,
                            img_size)

    # Create sampler for sampling image object
    sampler = cl.Sampler(context,
                         False,  # Non-normalized coordinates
                         cl.addressing_mode.CLAMP,
                         cl.filter_mode.NEAREST)

    # Create OpenCL program
    program = gpu_filter_.create_program(context, device, "ImageFilter2D.cl")

    # Call the kernel directly
    local_work_size = (16, 16)
    global_work_size = (gpu_filter_.round_up(local_work_size[0], img_size[0]),
                        gpu_filter_.round_up(local_work_size[1], img_size[1]))

    program.gaussian_filter(command_queue,
                            global_work_size,
                            local_work_size,
                            input_image,
                            output_image,
                            sampler,
                            numpy.int32(img_size[0]),
                            numpy.int32(img_size[1]))

    # Read the output buffer back to the Host (4 bytes per pixel)
    cl_buffer = numpy.zeros(img_size[0] * img_size[1] * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (img_size[0], img_size[1], 1)

    cl.enqueue_read_image(command_queue, output_image,
                          origin, region, cl_buffer).wait()

    print("Executed program successfully.")

    # return the image matrix
    return gpu_filter_.image_convert(cl_buffer, img_size)
def copy_image_to_host(queue, output, size, dtype):
    """Read a 2D OpenCL image back into a flat host array.

    :param queue: command queue used for the transfer.
    :param output: the OpenCL image to read.
    :param size: sequence whose first two items are the image width and
        height.
    :param dtype: numpy dtype of the host array.
    :return: a 1D array of ``width * height * 4`` elements filled by the
        read, which is waited on before returning.
    """
    width, height = size[0], size[1]
    # four elements per pixel
    host_pixels = np.zeros(width * height * 4, dtype=dtype)
    read_event = cl.enqueue_read_image(queue, output, (0, 0, 0),
                                       (width, height, 1), host_pixels)
    read_event.wait()
    return host_pixels
def parallelSumRed(imgRGBA, width, height):
    """Run the ``getLaserCoord`` kernel on an RGBA frame and locate the laser.

    The frame is uploaded as an RGBA image and processed by the kernel. The
    output image and three integer accumulators (sum of x, sum of y, count
    N) are read back.

    :param imgRGBA: pixel data exposing ``tostring()``; assumed to hold
        ``width`` x ``height`` RGBA pixels - TODO confirm against callers.
    :param width: frame width in pixels.
    :param height: frame height in pixels.
    :return: ``(buff, offsetX, offsetY)`` where ``buff`` is the flat uint8
        output image (4 bytes per pixel) and the offsets are the integer
        distances of ``sumX/N``, ``sumY/N`` from the frame centre. Both
        offsets are 0 unless N > 5.

    The image shape, the grid size and the centre used to be hard-coded for
    640x480 frames while the readback used ``width``/``height``. They now
    all follow the parameters, which is identical for 640x480 input.
    """
    C = 0.
    F = 259.*(C + 255.)/(255.*(259. - C))

    #Create buffers
    #host -> device
    width_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=numpy.int32(width))
    height_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=numpy.int32(height))
    dest_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=numpy.int32(0))
    dest_sumY_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=numpy.int32(0))
    dest_N_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=numpy.int32(0))
    # NOTE(review): F_buf is created but never passed to the kernel below -
    # confirm whether the kernel is supposed to receive it.
    F_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=numpy.float32(F))

    frameShape = (width, height)
    rgbaFormat = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNORM_INT8)
    clImage = cl.Image(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, rgbaFormat,
                       frameShape, None, imgRGBA.tostring())
    clOutImage = cl.Image(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, rgbaFormat,
                          frameShape, None, imgRGBA.tostring())

    sampler = cl.Sampler(ctx,
                         False, # Non-normalized coordinates
                         cl.addressing_mode.CLAMP_TO_EDGE,
                         cl.filter_mode.NEAREST)

    #compile openCL code
    prg = cl.Program(ctx, kernel).build()

    #define grid size: one work-item per pixel
    globalWorkSize = frameShape

    #run kernel
    # NOTE(review): no local work size is passed here; current pyopencl
    # expects (queue, global_size, local_size, *args) - confirm this call
    # form against the pyopencl version in use.
    prg.getLaserCoord(queue, globalWorkSize, clImage, clOutImage, sampler, width_buf, height_buf, dest_sum_buf, dest_N_buf, dest_sumY_buf)

    #set up output buffers
    # explicit int32 to match the 4-byte device buffers created above;
    # numpy.empty_like(0) produced a platform-dependent integer size
    sumX = numpy.zeros((), dtype=numpy.int32)
    sumY = numpy.zeros((), dtype=numpy.int32)
    N = numpy.zeros((), dtype=numpy.int32)
    buff = numpy.zeros(width * height * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (width, height, 1)

    #device -> host
    cl.enqueue_copy(queue, sumX, dest_sum_buf) #from 3rd arg on device to 2nd arg on host
    cl.enqueue_copy(queue, N, dest_N_buf)
    cl.enqueue_copy(queue, sumY, dest_sumY_buf)
    cl.enqueue_read_image(queue, clOutImage, origin, region, buff).wait()

    offsetX = 0
    offsetY = 0
    if N != 0:
        print("LASER (x,y) = (" + str(sumX/N) + ", " + str(sumY/N) + ")")
    if N > 5:
        # offset of the mean position from the frame centre
        offsetX = sumX/N - width/2.
        offsetY = sumY/N - height/2.

    return (buff, int(offsetX), int(offsetY))
# Open the two input image files
src1 = Image.open('temp/images/f1.png')
src2 = Image.open('temp/images/f2.png')

# The OpenCL images below are RGBA / uint8 (4 bytes per pixel), so the raw
# bytes uploaded must be in that layout as well. Without the conversion a
# PNG stored as e.g. RGB or palette data gives a host buffer that does not
# match the image format.
if src1.mode != 'RGBA':
    src1 = src1.convert('RGBA')
if src2.mode != 'RGBA':
    src2 = src2.convert('RGBA')

# Image format handled by OpenCL: RGBA, uint8
imageFormat = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNSIGNED_INT8)

# Copy the images from the host to the device
img1 = cl.Image(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, imageFormat, src1.size, None, src1.tobytes())
img2 = cl.Image(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, imageFormat, src2.size, None, src2.tobytes())
output = cl.Image(context=ctx, flags=mf.WRITE_ONLY, format=imageFormat, shape=src1.size)

# Derive the work sizes from the image size
localWorkSize = (8, 8)
globalWorkSize = (RoundUp(localWorkSize[0], src1.size[0]),
                  RoundUp(localWorkSize[1], src1.size[1]))

# Run the kernel
prg.image_add(queue, globalWorkSize, localWorkSize, img1, img2, output)

buffer = np.zeros(src1.size[0] * src1.size[1] * 4, np.uint8)
origin = (0, 0, 0)
region = (src1.size[0], src1.size[1], 1)

# Copy the processed image from the device back to the host
cl.enqueue_read_image(queue, output, origin, region, buffer).wait()

# Save the image
dist = Image.frombytes("RGBA", src1.size, buffer.tobytes())
dist.save('temp/images/cl-output.png')
dist.show()
# Intermediate single-channel images of the pipeline. Each one is written
# by one kernel and read by a later one (or read back to the host), so they
# are created READ_WRITE. They used to be READ_ONLY, which does not allow
# kernels to write to them.
gray_image = cl.Image(gpu_context, cl.mem_flags.READ_WRITE, cl_single_chanel_image_format, frame_size)
gaussian_image = cl.Image(gpu_context, cl.mem_flags.READ_WRITE, cl_single_chanel_image_format, frame_size)
sobel_image = cl.Image(gpu_context, cl.mem_flags.READ_WRITE, cl_single_chanel_image_format, frame_size)
angles_image = cl.Image(gpu_context, cl.mem_flags.READ_WRITE, cl_single_chanel_image_format, frame_size)
edges_image = cl.Image(gpu_context, cl.mem_flags.READ_WRITE, cl_single_chanel_image_format, frame_size)
thin_edges_image = cl.Image(gpu_context, cl.mem_flags.READ_WRITE, cl_single_chanel_image_format, frame_size)

# 3x3 Sobel masks and a 5x5 Gaussian mask, flattened row by row
sobel_mask_x = numpy.array([-1,0,1,-2,0,2,-1,0,1], dtype=numpy.int32)
sobel_mask_y = numpy.array([-1,-2,-1,0,0,0,1,2,1], dtype=numpy.int32)
gaussian_mask = numpy.array([2,4,5,4,2,4,9,12,9,4,5,12,15,12,5,4,9,12,9,4,2,4,5,4,2], dtype=numpy.int32)

sobel_x_buffer = cl.Buffer(gpu_context, memory_flags.READ_ONLY | memory_flags.COPY_HOST_PTR, hostbuf=sobel_mask_x)
sobel_y_buffer = cl.Buffer(gpu_context, memory_flags.READ_ONLY | memory_flags.COPY_HOST_PTR, hostbuf=sobel_mask_y)
gaussian_buffer = cl.Buffer(gpu_context, memory_flags.READ_ONLY | memory_flags.COPY_HOST_PTR, hostbuf=gaussian_mask)

while True:
    frame = cv.QueryFrame(stream)
    if frame is None:
        # no frame could be grabbed: stop instead of failing in ShowImage
        break
    cv.ShowImage("camera_window1", frame)
    cv.CvtColor(frame, img, cv.CV_RGB2RGBA)
    frame_string = cv.GetMat(img).tostring()
    clImage = cl.Image(gpu_context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, rgba_image_format, frame_size, None, frame_string)

    # each stage is waited on before the next one is enqueued
    gpu_program.convert_to_gray(command_queue, globalWorkSize, localWorkSize,
                                clImage, gray_image, sampler,
                                numpy.int32(frame_x), numpy.int32(frame_y)).wait()
    # the per-frame input image is no longer needed; release it so one
    # device image is not left behind for every frame
    clImage.release()
    gpu_program.apply_gaussian_mask(command_queue, globalWorkSize, localWorkSize,
                                    gray_image, gaussian_image, gaussian_buffer, sampler,
                                    numpy.int32(frame_x), numpy.int32(frame_y)).wait()
    gpu_program.apply_sobel_mask(command_queue, globalWorkSize, localWorkSize,
                                 gaussian_image, sobel_image, angles_image,
                                 sobel_x_buffer, sobel_y_buffer, sampler,
                                 numpy.int32(frame_x), numpy.int32(frame_y)).wait()
    gpu_program.find_edges(command_queue, globalWorkSize, localWorkSize,
                           sobel_image, edges_image, angles_image, sampler,
                           numpy.int32(frame_x), numpy.int32(frame_y)).wait()
    gpu_program.suppress_edges(command_queue, globalWorkSize, localWorkSize,
                               edges_image, sobel_image, angles_image,
                               thin_edges_image, sampler,
                               numpy.int32(frame_x), numpy.int32(frame_y)).wait()

    cl.enqueue_read_image(command_queue, thin_edges_image, origin, region, thin_edges_array).wait()
    cv.ShowImage("camera_window3", cv.fromarray(thin_edges_array.reshape(frame_y, frame_x)))
    # leave the loop when Escape (key code 27) is pressed
    if cv.WaitKey(10) == 27:
        break

# NOTE(review): the windows shown above are "camera_window1" and
# "camera_window3"; confirm that "camera_window" is the intended name.
cv.DestroyWindow("camera_window")
np.int32(imgSize[0]), np.int32(imgSize[1])).wait() for i in xrange(ntimes): if i%2 == 0: m, n = 1, 2 else: m, n = 2, 1 program.blur_filter(commandQueue, globalWorkSize, None, imageObjects[m], imageObjects[n], sampler, np.int32(imgSize[0]), np.int32(imgSize[1])).wait() t13=time.clock() print t13 - t12, " Run Kernel..." t14 = time.clock() buf2 = np.zeros(imgSize[0] * imgSize[1] * 4, np.uint8) cl.enqueue_read_image(commandQueue, imageObjects[2], origin, region, buf2, is_blocking=True) IMG_3 = buf2.reshape(imgSize[1], imgSize[0], 4) t15 = time.clock() print t15 - t14, " Read Image from GPU..." print "Executed program succesfully." t16 = time.clock() print t16 - t4, " Total GPU..." else: ## device == CPU t4 = time.clock() im = Image.open(inFile) img = np.array(im) IMG_1 = scale_img(img, 8) img1 = Image.fromarray(IMG_1) # Make sure the image is RGBA formatted if img1.mode != "RGBA":
import glob
import os

# Downscale every "thumbnail_*" file found next to ``filename``.
directory = os.path.dirname(filename)
# os.path.join keeps the pattern relative when ``directory`` is empty;
# directory+'/thumbnail_*' searched the filesystem root in that case
files = glob.glob(os.path.join(directory, 'thumbnail_*'))
for filename in files:
    # load a 512x512 image
    Img = cv2.imread(filename, cv2.CV_LOAD_IMAGE_GRAYSCALE)
    # create Output-Image; floor division keeps the shape integral, a plain
    # "/" yields floats when true division is in effect
    OutImg = np.empty(shape=(width // factor, height // factor), dtype=np.uint8)

    mf = cl.mem_flags
    dev_Img = cl.Image(ctx, mf.READ_ONLY | mf.USE_HOST_PTR,
                       cl.ImageFormat(cl.channel_order.R, cl.channel_type.UNSIGNED_INT8),
                       hostbuf=Img)
    dev_OutImg = cl.Image(ctx, mf.WRITE_ONLY,
                          cl.ImageFormat(cl.channel_order.R, cl.channel_type.UNSIGNED_INT8),
                          shape=OutImg.shape)

    # one work-item per output pixel
    prg.ImageDS(queue, OutImg.shape, None, dev_Img, dev_OutImg)
    cl.enqueue_read_image(queue, dev_OutImg, (0, 0), OutImg.shape, OutImg).wait()
def main():
    """Filter every frame of a video with the ganglion-cell kernels via OpenCL.

    For each entry of the module-level ``ganglion_cells`` table a
    difference-of-Gaussians kernel is built and uploaded once.  Each video
    frame is then convolved with every kernel, the results are tiled into a
    (3*width) x (2*height) composite at the position given by the entry's
    ``out_x``/``out_y`` and the composite is appended to ``output.avi``.

    Relies on helpers defined elsewhere in the file (CreateContext,
    CreateProgram, makeGaussianKernel, RoundUp, vid2CL, buff2CV).

    Returns:
        1 when the command line is malformed, no OpenCL context could be
        created, the device lacks image support or the video yields no first
        frame; None after the whole video has been processed.
    """
    imageObjects = [0, 0, 0, 0]

    # Single-argument print(...) behaves identically as a Python 2 statement.
    if len(sys.argv) != 3:
        print("USAGE: " + sys.argv[0] + " <inputImageFile> <outputImageFile>")
        return 1

    # Create an OpenCL context on first available platform
    context, device = CreateContext()
    if context is None:
        print("Failed to create OpenCL context.")
        return 1

    # Create a command-queue on the first device available
    # on the created context
    commandQueue = cl.CommandQueue(context, device)

    # Make sure the device supports images, otherwise exit
    if not device.get_info(cl.device_info.IMAGE_SUPPORT):
        print("OpenCL device does not support images.")
        return 1

    print("Device Global Memory Size => %s MB" % (device.global_mem_size // (1024 * 1024)))
    print("Device Max Memory Allocation Size => %s MB" % (device.max_mem_alloc_size // (1024 * 1024)))

    # Build one difference-of-Gaussians kernel per (cell type, centre type)
    # and keep both the host array and its read-only device copy.
    cl_kernels = {}
    host_kernels = {}
    for cell_type in ganglion_cells:
        for centre_type in ganglion_cells[cell_type]:
            params = ganglion_cells[cell_type][centre_type]
            key = "%s-%s" % (cell_type, centre_type)
            pos_sigma = params['sigma']
            neg_sigma = 3. * pos_sigma
            pos_gaussian = makeGaussianKernel(params['width'], pos_sigma).flatten().astype(numpy.float32)
            neg_gaussian = makeGaussianKernel(params['width'], neg_sigma).flatten().astype(numpy.float32)
            host_kernels[key] = pos_gaussian - neg_gaussian
            cl_kernels[key] = cl.Buffer(context,
                                        cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                                        hostbuf=host_kernels[key])
    print(host_kernels)

    cap = cv2.VideoCapture('sts-11-landing.webm')
    ret, frame = cap.read()
    if not ret:
        # read() reports failure with ret == False and no frame; without
        # this check frame.shape below would fail on None.
        cap.release()
        print("Failed to read a frame from the input video.")
        return 1

    height, width, channels = frame.shape
    print("VIDEO width = %s, height = %s, channels = %s" % (width, height, channels))

    numBits = 8
    imgSize = (width, height)
    fourcc = cv.CV_FOURCC(*'XVID')
    fps = 20.0
    out = cv2.VideoWriter('output.avi', fourcc, fps, (3 * width, 2 * height))

    try:
        # Create ouput image object
        # NOTE(review): the OpenCL specification restricts CL_RGB to packed
        # channel types; confirm RGB/UNORM_INT8 is accepted by the target
        # device.
        clImageFormat = cl.ImageFormat(cl.channel_order.RGB,
                                       cl.channel_type.UNORM_INT8)

        # The input frame lives in a plain buffer, not an image object.
        frame_buffer = numpy.array(frame[:, :]).flatten().astype(numpy.uint8)
        print(frame_buffer)
        imageObjects[0] = cl.Buffer(context,
                                    cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                                    hostbuf=frame_buffer)
        # A Buffer exposes no width/height/element_size (those are image
        # queries), so report its byte size instead.
        print("CL buf 0 -> %s bytes" % (imageObjects[0].size,))

        imageObjects[1] = cl.Image(context,
                                   cl.mem_flags.WRITE_ONLY,
                                   clImageFormat,
                                   imgSize)
        print("CL img 1 -> (%s, %s, %s)" % (imageObjects[1].width,
                                            imageObjects[1].height,
                                            imageObjects[1].element_size))

        # Create sampler for sampling image object
        sampler = cl.Sampler(context,
                             False,  # Non-normalized coordinates
                             cl.addressing_mode.CLAMP,
                             cl.filter_mode.NEAREST)

        # Create OpenCL program
        program = CreateProgram(context, device, "ImageFilter2D.cl")
        commandQueue.finish()

        # Launch geometry and read-back window do not change between frames.
        local = 16
        localWorkSize = (local, local)
        globalWorkSize = (RoundUp(localWorkSize[0], imgSize[0]),
                          RoundUp(localWorkSize[1], imgSize[1]))
        origin = (0, 0, 0)
        region = (imgSize[0], imgSize[1], 1)

        vid2CL(commandQueue, width, height, frame, imageObjects[0])

        while cap.isOpened():
            composite = cv.CreateImage((width * 3, height * 2), numBits, channels)

            for cell_type in ganglion_cells:
                for centre_type in ganglion_cells[cell_type]:
                    params = ganglion_cells[cell_type][centre_type]
                    key = "%s-%s" % (cell_type, centre_type)
                    print("%s -> %s" % (cell_type, centre_type))
                    k_width = params['width']

                    print("starting convolution")
                    start_time = time.time()
                    program.convolution(commandQueue, globalWorkSize, localWorkSize,
                                        imageObjects[0], imageObjects[1],
                                        cl_kernels[key], numpy.int32(k_width),
                                        sampler,
                                        numpy.int32(imgSize[0]), numpy.int32(imgSize[1]))
                    print("end of %s-%s convolution %s (sec)" % (cell_type, centre_type,
                                                                 time.time() - start_time))

                    # Read the output buffer back to the Host
                    buff = numpy.zeros(width * height * channels, numpy.uint8)
                    cl.enqueue_read_image(commandQueue, imageObjects[1],
                                          origin, region, buff).wait()

                    # Paste this result into its tile of the composite.
                    out_origin = (params['out_x'] * width, params['out_y'] * height)
                    buff2CV(imgSize, channels, out_origin, buff, composite)

            out.write(composite)

            ret, frame = cap.read()
            if not ret:
                # End of stream (or read error): isOpened() stays true until
                # release(), so leave the loop explicitly instead of passing
                # a missing frame to vid2CL.
                break
            print("later frame copy")
            vid2CL(commandQueue, width, height, frame, imageObjects[0])
    finally:
        # Close the input and flush/close the output file on every exit path.
        cap.release()
        out.release()

    print("Executed program succesfully.")
def parallelSumRed(imgRGBA, width, height):
    """Run the ``getLaserCoord`` kernel on an RGBA frame and locate the laser.

    Uses the module-level ``ctx``, ``queue``, ``mf`` and ``kernel`` (the
    OpenCL source string).  The kernel receives the input image, an output
    image, a sampler, the frame dimensions and three int32 accumulators; the
    accumulators are read back as sumX, N and sumY and the centroid
    (sumX / N, sumY / N) is reported relative to the frame centre.

    Args:
        imgRGBA: numpy array holding width * height RGBA pixels, one byte
            per channel (it is uploaded via ``tostring()``).
        width: frame width in pixels.
        height: frame height in pixels.

    Returns:
        Tuple ``(buff, offsetX, offsetY)``: ``buff`` is the flat uint8 array
        (width * height * 4 values, every fourth one alpha) read back from
        the kernel's output image; the offsets are the integer distance of
        the centroid from the frame centre, or 0 unless N > 5.
    """
    # Scalars travel as 1-element int32 arrays so that host and device sides
    # agree on 4 bytes regardless of the platform's default integer width.
    width_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=numpy.array([width], dtype=numpy.int32))
    height_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=numpy.array([height], dtype=numpy.int32))
    zero = numpy.zeros(1, dtype=numpy.int32)
    dest_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=zero)
    dest_sumY_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=zero)
    dest_N_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=zero)

    # Input and output images follow the requested frame size (previously
    # fixed at 640x480, which only matched the read-back for that size).
    rgba_format = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNORM_INT8)
    pixels = imgRGBA.tostring()
    clImage = cl.Image(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, rgba_format,
                       shape=(width, height), hostbuf=pixels)
    clOutImage = cl.Image(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, rgba_format,
                          shape=(width, height), hostbuf=pixels)

    sampler = cl.Sampler(ctx,
                         False,  # Non-normalized coordinates
                         cl.addressing_mode.CLAMP_TO_EDGE,
                         cl.filter_mode.NEAREST)

    # compile openCL code (rebuilt on every call)
    prg = cl.Program(ctx, kernel).build()

    # One work-item per pixel.  The third positional argument of a kernel
    # call is the local work size; None lets the implementation choose it.
    globalWorkSize = (width, height)
    prg.getLaserCoord(queue, globalWorkSize, None,
                      clImage, clOutImage, sampler,
                      width_buf, height_buf,
                      dest_sum_buf, dest_N_buf, dest_sumY_buf)

    # set up output buffers
    sumX = numpy.zeros(1, dtype=numpy.int32)
    sumY = numpy.zeros(1, dtype=numpy.int32)
    N = numpy.zeros(1, dtype=numpy.int32)
    buff = numpy.zeros(width * height * 4, numpy.uint8)

    # device -> host (destination first, source second)
    cl.enqueue_copy(queue, sumX, dest_sum_buf)
    cl.enqueue_copy(queue, N, dest_N_buf)
    cl.enqueue_copy(queue, sumY, dest_sumY_buf)
    cl.enqueue_copy(queue, buff, clOutImage,
                    origin=(0, 0, 0), region=(width, height, 1))

    # Work with scalars from here on; remember that every fourth value of
    # buff is alpha = 255.
    total_x, total_y, count = sumX[0], sumY[0], N[0]

    offsetX = 0
    offsetY = 0
    if count != 0:
        print("LASER (x,y) = (" + str(total_x / count) + ", " + str(total_y / count) + ")")
    if count > 5:
        # Ignore detections backed by five pixels or fewer.
        offsetX = total_x / count - width / 2.
        offsetY = total_y / count - height / 2.
    return (buff, int(offsetX), int(offsetY))
# img = cv2.imread(sys.argv[1], cv2.CV_LOAD_IMAGE_GRAYSCALE) img_width, img_height = img.shape mf = cl.mem_flags in_image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.UNSIGNED_INT8) in_image = cl.Image(p.context, mf.READ_ONLY | mf.USE_HOST_PTR, in_image_format, hostbuf=img) # # create output buffer # out_buffer = np.zeros(shape=(img_width/2, img_height/2), dtype=np.uint8) # # create ouput image object # # out_image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.UNSIGNED_INT8) out_image = cl.Image(p.context, mf.WRITE_ONLY, in_image_format, out_buffer.shape) # # call kernel # p.program.downsample(p.queue, out_buffer.shape, None, in_image, out_image) # # read output # cl.enqueue_read_image(p.queue, out_image, (0,0), out_buffer.shape, out_buffer).wait() # cv2.imwrite('/tmp/pycl_tex_z1.jpg', out_buffer)