def vglClNdCopy(self, img_input, img_output):
    """Enqueue the ``vglClNdCopy`` OpenCL kernel from img_input to img_output.

    Both images must report ``clForceAsBuf == vl.IMAGE_ND_ARRAY()``; if one
    does not, an error is printed and ``exit()`` is called.

    Args:
        img_input: Source image; its ``get_oclPtr()`` is kernel argument 0.
        img_output: Destination image; its ``get_oclPtr()`` is kernel
            argument 1 and ``get_ipl().shape`` is used as the global size.
    """
    print("# Running vglClNdCopy")

    # Pair every image with the complaint printed when it is not a buffer.
    buffer_checks = (
        (img_input,
         "vglClNdCopy: Error: this function supports only OpenCL data as buffer and img_input isn't."),
        (img_output,
         "vglClNdCopy: Error: this function supports only OpenCL data as buffer and img_output isn't."),
    )
    for image, complaint in buffer_checks:
        if not image.clForceAsBuf == vl.IMAGE_ND_ARRAY():
            print(complaint)
            exit()

    for image in (img_input, img_output):
        vl.vglCheckContext(image, vl.VGL_CL_CONTEXT())

    compiled = self.cl_ctx.get_compiled_kernel("../CL_ND/vglClNdCopy.cl",
                                               "vglClNdCopy")
    copy_kernel = compiled.vglClNdCopy
    for position, image in enumerate((img_input, img_output)):
        copy_kernel.set_arg(position, image.get_oclPtr())

    # One work-item per element of the output array; local size left to the
    # OpenCL runtime (None).
    cl.enqueue_nd_range_kernel(self.ocl.commandQueue, copy_kernel,
                               img_output.get_ipl().shape, None)

    vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
def max_length_real4(ipt):
    """Run the ``_lengthkern_real4`` kernel over *ipt* and max-reduce the result.

    A ``CLReal`` of ``len(ipt)`` elements receives the kernel output (one
    work-item per element of *ipt*); the value returned is
    ``max_reduce`` applied to that output.
    """
    count = len(ipt)
    lengths = CLReal(count)

    length_kernel = _lengthkern_real4.kern
    for slot, operand in enumerate((ipt, lengths)):
        length_kernel.set_arg(slot, operand._buffer)

    cl.enqueue_nd_range_kernel(ipt._ctrl.clqueue, length_kernel, (count,),
                               None)
    return max_reduce(lengths)
def search(self, midstate):
    """Run the fill and search kernels for *midstate* and return the output.

    The endian-flipped midstate is fed to ``sha512_fill`` as eight 4-item
    slices (arguments 0-7), followed by the hashes and key-hash buffers.
    The output buffer is then overwritten with ``OUTPUT_SIZE`` zero bytes,
    ``ksearch`` is run, and the buffer is read back.

    Returns:
        ``str()`` of the bytearray read back from ``self.output_buf``.
    """
    words = flipendian32(midstate)

    fill_kernel = self.sha512_fill
    for index in range(8):
        begin = index * 4
        fill_kernel.set_arg(index, words[begin:begin + 4])
    fill_kernel.set_arg(8, self.hashes_buf)
    fill_kernel.set_arg(9, self.keyhash_buf)
    cl.enqueue_nd_range_kernel(self.queue, fill_kernel, (HASHES_NUM,),
                               (self.sha512_fill_ws,))
    self.queue.finish()

    # A fresh bytearray is all zeros, so this write clears the device buffer.
    result = bytearray(OUTPUT_SIZE)
    cl.enqueue_write_buffer(self.queue, self.output_buf, result)
    self.queue.finish()

    search_kernel = self.ksearch
    search_buffers = (self.hashes_buf, self.keyhash_buf, self.output_buf)
    for slot, device_buffer in enumerate(search_buffers):
        search_kernel.set_arg(slot, device_buffer)
    cl.enqueue_nd_range_kernel(self.queue, search_kernel, (KEYS_NUM,),
                               (self.ksearch_ws,))
    self.queue.finish()

    cl.enqueue_read_buffer(self.queue, self.output_buf, result)
    self.queue.finish()
    return str(result)
def vglClBinConway(self, img_input, img_output):
    """Enqueue the ``vglClBinConway`` OpenCL kernel on img_input -> img_output.

    Kernel arguments are the input pointer, the output pointer and the
    input image's shape buffer. The global size is
    ``(width, img_input.getHeigthIn(), img_input.getNFrames())`` where
    ``width`` is ``img_input.getWidthIn()`` unless one of the images has
    depth ``vl.IPL_DEPTH_1U()``, in which case that image's
    ``getWidthStep()`` is used (the output image wins if both match).
    """
    for image in (img_input, img_output):
        vl.vglCheckContext(image, vl.VGL_CL_CONTEXT())

    compiled = self.cl_ctx.get_compiled_kernel(
        "../CL_BIN/vglClBinConway.cl", "vglClBinConway")
    conway_kernel = compiled.vglClBinConway

    shape_buffer = img_input.getVglShape().get_asVglClShape_buffer()
    conway_kernel.set_arg(0, img_input.get_oclPtr())
    conway_kernel.set_arg(1, img_output.get_oclPtr())
    conway_kernel.set_arg(2, shape_buffer)

    # Later matches override earlier ones, so img_output takes precedence.
    width_items = img_input.getWidthIn()
    for image in (img_input, img_output):
        if image.depth == vl.IPL_DEPTH_1U():
            width_items = image.getWidthStep()

    global_size = (int(width_items), img_input.getHeigthIn(),
                   img_input.getNFrames())
    cl.enqueue_nd_range_kernel(self.ocl.commandQueue, conway_kernel,
                               global_size, None)

    vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
def do_opencl_pow(hash, target):
    """Search for a proof-of-work value with the ``kernel_sha512`` kernel.

    Args:
        hash: Hex string; sixteen ``0`` hex digits are prepended and the
            result is hex-decoded into the ``v`` field of the kernel input.
        target: Value stored in the ``target`` field of the kernel input.

    Returns:
        The first element of the one-slot result array: ``0`` when the
        module-level ``ctx`` equals ``False`` (no OpenCL context), otherwise
        the first non-zero value the kernel writes back.
    """
    output = numpy.zeros(1, dtype=[("v", numpy.uint64, 1)])
    if ctx == False:  # noqa: E712 - sentinel comparison kept as written
        return output[0][0]

    data = numpy.zeros(1, dtype=hash_dt, order="C")
    # NOTE(review): str.decode("hex") exists only on Python 2.
    data[0]["v"] = ("0000000000000000" + hash).decode("hex")
    data[0]["target"] = target

    hash_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=data)
    dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, gpus[0])
    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    progress = 0
    globamt = worksize * 2000  # work-items launched per iteration
    while output[0][0] == 0:
        # Argument 2 is the running offset, packed as little-endian uint64.
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        try:
            cl.enqueue_read_buffer(queue, dest_buf, output)
        except AttributeError:
            # PyOpenCL builds without enqueue_read_buffer: use enqueue_copy
            # (note the destination-first argument order).
            cl.enqueue_copy(queue, output, dest_buf)
        queue.finish()
        progress += globamt
    return output[0][0]
def do_opencl_pow(hash, target):
    """Search for a proof-of-work value with the ``kernel_sha512`` kernel.

    Args:
        hash: Hex string; sixteen ``0`` hex digits are prepended and the
            result is hex-decoded into the ``v`` field of the kernel input.
        target: Value stored in the ``target`` field of the kernel input.

    Returns:
        The first element of the one-slot result array: ``0`` when
        ``enabledGpus`` is empty, otherwise the first non-zero value the
        kernel writes back.

    Raises:
        Exception: ``"Interrupted"`` when the module-level ``shutdown`` flag
            is non-zero once the loop ends.
    """
    output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
    if not enabledGpus:
        return output[0][0]

    data = numpy.zeros(1, dtype=hash_dt, order='C')
    # NOTE(review): str.decode("hex") exists only on Python 2.
    data[0]['v'] = ("0000000000000000" + hash).decode("hex")
    data[0]['target'] = target

    hash_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=data)
    dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, enabledGpus[0])
    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    progress = 0
    globamt = worksize * 2000  # work-items launched per iteration
    while output[0][0] == 0 and shutdown == 0:
        # Argument 2 is the running offset, packed as little-endian uint64.
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        try:
            cl.enqueue_read_buffer(queue, dest_buf, output)
        except AttributeError:
            # PyOpenCL builds without enqueue_read_buffer: use enqueue_copy
            # (note the destination-first argument order).
            cl.enqueue_copy(queue, output, dest_buf)
        queue.finish()
        progress += globamt
    if shutdown != 0:
        raise Exception("Interrupted")
    return output[0][0]
def vglClNdCopy(img_input, img_output):
    """Enqueue the ``vglClNdCopy`` OpenCL kernel from img_input to img_output.

    Both images must report ``clForceAsBuf == vl.IMAGE_ND_ARRAY()``; if one
    does not, an error is printed and ``exit(1)`` is called.

    Args:
        img_input: Source image; its ``get_oclPtr()`` is kernel argument 0
            and ``get_ipl().shape`` is used as the global size.
        img_output: Destination image; its ``get_oclPtr()`` is kernel
            argument 1.
    """
    # Pair every image with the complaint printed when it is not a buffer.
    buffer_checks = (
        (img_input,
         "vglClNdCopy: Error: this function supports only OpenCL data as buffer and img_input isn't."),
        (img_output,
         "vglClNdCopy: Error: this function supports only OpenCL data as buffer and img_output isn't."),
    )
    for image, complaint in buffer_checks:
        if not image.clForceAsBuf == vl.IMAGE_ND_ARRAY():
            print(complaint)
            exit(1)

    for image in (img_input, img_output):
        vl.vglCheckContext(image, vl.VGL_CL_CONTEXT())

    compiled = vl.get_ocl_context().get_compiled_kernel(
        "CL_ND/vglClNdCopy.cl", "vglClNdCopy")
    copy_kernel = compiled.vglClNdCopy
    for position, image in enumerate((img_input, img_output)):
        copy_kernel.set_arg(position, image.get_oclPtr())

    # Launch the kernel; the local size is left to the OpenCL runtime (None).
    cl.enqueue_nd_range_kernel(vl.get_ocl().commandQueue, copy_kernel,
                               img_input.get_ipl().shape, None)

    # Both images are marked as holding their data in the OpenCL context.
    for image in (img_input, img_output):
        vl.vglSetContext(image, vl.VGL_CL_CONTEXT())
def do_opencl_pow(hash_, target):
    """Perform PoW using OpenCL.

    Returns the first element of the one-slot result array: ``0`` when
    ``enabledGpus`` is empty, otherwise the first non-zero value written by
    ``kernel_sha512``. Raises ``Exception("Interrupted")`` when the
    module-level ``shutdown`` flag is non-zero once the loop ends.
    """
    result = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
    if not enabledGpus:
        return result[0][0]

    payload = numpy.zeros(1, dtype=hash_dt, order='C')
    payload[0]['v'] = ("0000000000000000" + hash_).decode("hex")
    payload[0]['target'] = target

    flags = cl.mem_flags
    hash_buf = cl.Buffer(ctx, flags.READ_ONLY | flags.COPY_HOST_PTR,
                         hostbuf=payload)
    dest_buf = cl.Buffer(ctx, flags.WRITE_ONLY, result.nbytes)

    pow_kernel = program.kernel_sha512
    group_size = pow_kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, enabledGpus[0])
    pow_kernel.set_arg(0, hash_buf)
    pow_kernel.set_arg(1, dest_buf)

    batch = group_size * 2000  # work-items launched per iteration
    tried = 0
    while result[0][0] == 0 and shutdown == 0:
        # Argument 2 is the running offset, packed as little-endian uint64.
        pow_kernel.set_arg(2, pack("<Q", tried))
        cl.enqueue_nd_range_kernel(queue, pow_kernel, (batch,),
                                   (group_size,))
        try:
            cl.enqueue_read_buffer(queue, dest_buf, result)
        except AttributeError:
            # Fall back to enqueue_copy when enqueue_read_buffer is missing.
            cl.enqueue_copy(queue, result, dest_buf)
        queue.finish()
        tried += batch

    if shutdown != 0:
        raise Exception("Interrupted")
    return result[0][0]
def runFilter(self):
    """Run this filter's kernel and copy the output buffer over the input.

    The last kernel argument (``mid``) is 1 when both ``height`` and
    ``slices`` are 1, 4 when only ``slices`` is 1, and 13 otherwise.

    Returns:
        True once the queue has finished. Any exception raised while
        setting arguments or launching the kernel propagates unchanged.
    """
    if self.atts.height == 1 and self.atts.slices == 1:
        mid = 1
    elif self.atts.slices == 1:
        mid = 4
    else:
        mid = 13

    # computeWorkingGroupSize receives both lists; they are then used as the
    # launch sizes, so it is presumably filling them in place.
    globalSize = [0, 0]
    localSize = [0, 0]
    self.clattr.computeWorkingGroupSize(
        localSize, globalSize, [self.atts.width, self.atts.height, 1])

    # No try/except here: the former ``except Exception as e: raise e`` did
    # nothing except shorten the traceback on Python 2.
    self.kernel.set_args(self.clattr.inputBuffer,
                         self.clattr.outputBuffer,
                         np.int32(self.atts.width),
                         np.int32(self.atts.height),
                         np.int32(self.clattr.maxSliceCount),
                         np.int32(mid))
    cl.enqueue_nd_range_kernel(self.clattr.queue, self.kernel, globalSize,
                               localSize)

    # Write results: enqueue_copy takes (queue, dest, src).
    cl.enqueue_copy(self.clattr.queue, self.clattr.inputBuffer,
                    self.clattr.outputBuffer)
    self.clattr.queue.finish()
    return True
def runFilter(self):
    """Run this filter's kernel and copy the output buffer over the input.

    Kernel arguments, in order: input buffer, output buffer, width, height,
    ``maxSliceCount + getInfo().overlapZ``, the spatial kernel with its size
    ``(spatialRadius + 1) * 2 - 1``, and the range kernel with its size
    ``(rangeRadius + 1) * 2 - 1``.

    Returns:
        True once the queue has finished. Any exception raised while
        setting arguments or launching the kernel propagates unchanged.
    """
    # computeWorkingGroupSize receives both lists; they are then used as the
    # launch sizes, so it is presumably filling them in place.
    globalSize = [0, 0]
    localSize = [0, 0]
    self.clattr.computeWorkingGroupSize(
        localSize, globalSize, [self.atts.width, self.atts.height, 1])

    # No try/except here: the former ``except Exception as e: raise e`` did
    # nothing except shorten the traceback on Python 2.
    self.kernel.set_args(
        self.clattr.inputBuffer,
        self.clattr.outputBuffer,
        np.int32(self.atts.width),
        np.int32(self.atts.height),
        np.int32(self.clattr.maxSliceCount + self.getInfo().overlapZ),
        self.spatialKernel,
        np.int32((self.spatialRadius + 1) * 2 - 1),
        self.rangeKernel,
        np.int32((self.rangeRadius + 1) * 2 - 1))
    cl.enqueue_nd_range_kernel(self.clattr.queue, self.kernel, globalSize,
                               localSize)

    # Write results: enqueue_copy takes (queue, dest, src).
    cl.enqueue_copy(self.clattr.queue, self.clattr.inputBuffer,
                    self.clattr.outputBuffer)
    self.clattr.queue.finish()
    return True
def prefixSumUp(self, e, data, ndata, data2, ndata2, events):
    """Enqueue the ``prefixSumUp`` kernel, optionally after earlier events.

    Args:
        e: ``None``, or the event(s) this launch must wait for (passed to
            ``wait_for``). Presumably the tuple returned by a previous
            stage - confirm against callers.
        data: A ``cl.Buffer``, or a host array copied into a new
            READ_WRITE buffer.
        ndata: Element count passed to the kernel as ``uint64``; also used
            to derive the global size.
        data2: Second buffer / host array, handled like *data*.
        ndata2: Element count for *data2*, passed as ``uint64``.
        events: Collection extended in place (``+=``) with the new event.

    Returns:
        ``(e, data_buf, data2_buf)`` where ``e`` is a 1-tuple holding the
        event of this launch.
    """
    import numpy as np
    import pyopencl as cl

    mf = cl.mem_flags
    if not isinstance(data, cl.Buffer):
        data_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR,
                             hostbuf=data)
    else:
        data_buf = data
    if not isinstance(data2, cl.Buffer):
        data2_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR,
                              hostbuf=data2)
    else:
        data2_buf = data2

    kernel = self.prg.prefixSumUp
    kernel.set_args(data_buf, np.uint64(ndata), data2_buf,
                    np.uint64(ndata2))
    global_dims = self.get_global(self.get_grid_dims(ndata))
    print("prefixSumUp")

    # The original passed wait_for only when e was None, so a supplied
    # dependency was silently ignored. wait_for=None means "no dependency",
    # so one call covers both cases.
    e = (
        cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                   self.localDims, wait_for=e),
    )
    events += e
    return (e, data_buf, data2_buf)
def compute(self, image, num_bins):
    """Run the ``iif_binid`` kernel over a 2-D image.

    The image is flattened to float32 and uploaded; the kernel receives the
    two image dimensions, ``num_bins`` and the input/output buffers, and is
    launched with one work-item per pixel.

    Args:
        image: 2-D array-like (``np.shape(image)`` must unpack to two values).
        num_bins: Bin count, passed to the kernel as an unsigned byte.

    Returns:
        float32 array of shape ``(width, height, num_bins)`` where
        ``width, height = np.shape(image)``.
    """
    width, height = np.shape(image)
    pixel_count = width * height

    flat_image = np.reshape(image, (pixel_count,)).astype(np.float32)
    result = np.zeros((pixel_count * num_bins,), dtype=np.float32)

    flags = cl.mem_flags
    self.buf_image = cl.Buffer(self.context,
                               flags.READ_ONLY | flags.COPY_HOST_PTR,
                               hostbuf=flat_image)
    self.output_buf = cl.Buffer(self.context, flags.READ_WRITE,
                                result.nbytes)

    kernel = self.program.iif_binid
    kernel.set_scalar_arg_dtypes([np.uintc, np.uintc, np.ubyte, None, None])
    arguments = (
        np.uintc(width),
        np.uintc(height),
        np.ubyte(num_bins),
        self.buf_image,
        self.output_buf,
    )
    for slot, value in enumerate(arguments):
        kernel.set_arg(slot, value)

    launch = cl.enqueue_nd_range_kernel(self.queue, kernel,
                                        flat_image.shape, None)
    launch.wait()
    readback = cl.enqueue_read_buffer(self.queue, self.output_buf, result)
    readback.wait()

    cube = np.reshape(result, (width, height, num_bins))
    return cube.astype(np.float32)
def compute(self, floatimage, histogram, k):
    """Run the ``IIF`` kernel over an image and its per-pixel histogram.

    Args:
        floatimage: Image reshaped to ``width * height`` float32 values.
        histogram: 3-D array; its shape unpacks to ``(width, height, nbins)``.
        k: Scalar passed to the kernel as float32.

    Returns:
        float64 array of shape ``(width, height)`` holding the kernel output.
    """
    width, height, nbins = np.shape(histogram)
    numpixels = width * height

    image_linear = np.reshape(floatimage, (numpixels,)).astype(np.float32)
    histogram_linear = np.reshape(
        histogram, (np.size(histogram),)).astype(np.float32)
    transform = np.zeros_like(image_linear).astype(np.float32)

    mf = cl.mem_flags
    self.buf_image = cl.Buffer(self.context,
                               mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=image_linear)
    self.buf_histogram = cl.Buffer(self.context,
                                   mf.READ_ONLY | mf.COPY_HOST_PTR,
                                   hostbuf=histogram_linear)
    self.output_buf = cl.Buffer(self.context, mf.READ_WRITE,
                                transform.nbytes)

    kernel = self.program.IIF
    kernel.set_scalar_arg_dtypes([np.uintc, np.uintc, np.float32]
                                 + [None] * 3)
    kernel.set_arg(0, np.uintc(width))
    kernel.set_arg(1, np.uintc(height))
    kernel.set_arg(2, np.float32(k))
    kernel.set_arg(3, self.buf_image)
    kernel.set_arg(4, self.buf_histogram)
    kernel.set_arg(5, self.output_buf)

    # One work-item per pixel; local size left to the runtime.
    cl.enqueue_nd_range_kernel(self.queue, kernel, image_linear.shape,
                               None).wait()
    cl.enqueue_read_buffer(self.queue, self.output_buf, transform).wait()

    # np.float was only an alias of the builtin float (float64) and was
    # removed in NumPy 1.24; np.float64 yields the same dtype everywhere.
    return np.reshape(transform, (width, height)).astype(np.float64)
def __call__(self, thread_count, work_group_size, *args):
    """Compile the kernel, bind *args* positionally, launch it and wait.

    Args:
        thread_count: Global size of the one-dimensional launch.
        work_group_size: Local size of the one-dimensional launch.
        *args: Kernel arguments, bound to positions 0..n-1 in order.
    """
    kernel = self.compile()
    for position, value in enumerate(args):
        kernel.set_arg(position, value)

    global_size = (thread_count,)
    local_size = (work_group_size,)
    with timed_region("ParLoop kernel"):
        launch = cl.enqueue_nd_range_kernel(_queue, kernel, global_size,
                                            local_size, g_times_l=False)
        launch.wait()
def applyMorphOp(imgIn, op):
    """Apply a morphological operation to an image using the GPU.

    Args:
        imgIn: Input image array; uploaded as a LUMINANCE / UNORM_INT8
            OpenCL image of shape ``imgIn.T.shape``.
        op: Operation type, passed to the kernel as ``uint32``
            (dilate=0, erode=1 according to the original comments).

    Returns:
        A new array shaped like *imgIn* holding the kernel result.
    """
    # (1) set up OpenCL: first platform, first GPU device of that platform
    platform = cl.get_platforms()[0]
    device = platform.get_devices(cl.device_type.GPU)[0]
    context = cl.Context([device])
    queue = cl.CommandQueue(context, device)

    # (2) image shape (transposed) and host memory for the result
    shape = imgIn.T.shape
    imgOut = np.empty_like(imgIn)

    # (2) device images for input and output, both gray-valued
    gray_format = cl.ImageFormat(cl.channel_order.LUMINANCE,
                                 cl.channel_type.UNORM_INT8)
    imgInBuf = cl.Image(context, cl.mem_flags.READ_ONLY, gray_format,
                        shape=shape)
    imgOutBuf = cl.Image(context, cl.mem_flags.WRITE_ONLY, gray_format,
                         shape=shape)

    # (3) load and compile the OpenCL program; the context manager closes
    # the source file, which the bare open().read() never did
    with open('Erosion_Dilation.cl') as source_file:
        program = cl.Program(context, source_file.read()).build()

    # (3) kernel object and its arguments (input, operation, output)
    kernel = cl.Kernel(program, 'morphOpKernel')
    kernel.set_arg(0, imgInBuf)
    kernel.set_arg(1, np.uint32(op))
    kernel.set_arg(2, imgOutBuf)

    # (4) upload, run with one work-item per pixel, download; only the final
    # copy blocks, which waits for the result to reach the host
    cl.enqueue_copy(queue, imgInBuf, imgIn, origin=(0, 0), region=shape,
                    is_blocking=False)
    cl.enqueue_nd_range_kernel(queue, kernel, shape, None)
    cl.enqueue_copy(queue, imgOut, imgOutBuf, origin=(0, 0), region=shape,
                    is_blocking=True)
    return imgOut
def filterPrepare(self, e, data, keys, ndata, events):
    """Enqueue the ``filterPrepare`` kernel, optionally after earlier events.

    Args:
        e: ``None``, or the event(s) this launch must wait for (passed to
            ``wait_for``).
        data: A ``cl.Buffer``, or a host array copied into a new READ_ONLY
            buffer.
        keys: Keys buffer / host array, handled like *data*; its ``size``
            must equal ``data.size``.
        ndata: Ignored - immediately replaced by ``data.size``. Kept for
            interface compatibility.
        events: Collection extended in place (``+=``) with the new event.

    Returns:
        ``(e, data_buf, keys_buf, filt_buf)`` where ``e`` is a 1-element
        list holding the event of this launch and ``filt_buf`` is a new
        READ_WRITE buffer of one byte per element.

    Raises:
        Exception: If ``keys.size`` differs from ``data.size``.
    """
    import numpy as np
    import pyopencl as cl

    mf = cl.mem_flags
    ndata = data.size
    if keys.size != ndata:
        raise Exception(
            "filterPrepare: keys.size (%r) does not match data.size (%r)"
            % (keys.size, ndata))

    # np.bool8 was an alias of np.bool_ and was removed in NumPy 2.0.
    filtbytes = np.bool_(False).nbytes * ndata

    if not isinstance(data, cl.Buffer):
        data_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=data)
    else:
        data_buf = data
    if not isinstance(keys, cl.Buffer):
        keys_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=keys)
    else:
        keys_buf = keys
    filt_buf = cl.Buffer(self.ctx, mf.READ_WRITE, filtbytes)

    kernel = self.prg.filterPrepare
    kernel.set_args(data_buf, keys_buf, np.uint64(ndata), np.uint8(33),
                    np.uint8(66), filt_buf)
    global_dims = self.get_global(self.get_grid_dims(ndata))
    print("filterPrepare")

    # wait_for=None means "no dependency", so a single call covers both the
    # first-stage (e is None) and the chained case.
    e = [
        cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                   self.localDims, wait_for=e),
    ]
    events += e
    return (e, data_buf, keys_buf, filt_buf)
def run_kernel(self, kernel, grid_size, stream=None):
    """Launch *kernel* on the default queue over ``grid_size`` blocks.

    The global size along each axis is the grid dimension multiplied by the
    matching entry of ``kernel.block``; the local size is the leading
    ``len(grid_size)`` entries of ``kernel.block``.

    Args:
        kernel: Kernel object exposing a ``block`` sequence.
        grid_size: Number of blocks per axis.
        stream: Accepted but not used by this implementation.
    """
    global_size = [
        dim * kernel.block[axis] for axis, dim in enumerate(grid_size)
    ]
    local_size = kernel.block[0:len(global_size)]
    cl.enqueue_nd_range_kernel(self.default_queue, kernel, global_size,
                               local_size)
def run(self):
    """Launch ``self.kernel`` with the stored sizes and wait for it to finish."""
    launch = cl.enqueue_nd_range_kernel(self.queue, self.kernel,
                                        self.global_size, self.local_size)
    launch.wait()
def __call__(self, thread_count, work_group_size, *args):
    """Compile the kernel, bind *args* positionally, launch it and wait.

    Args:
        thread_count: Global size of the one-dimensional launch.
        work_group_size: Local size of the one-dimensional launch.
        *args: Kernel arguments, bound to positions 0..n-1 in order.
    """
    kernel = self.compile()
    for position, value in enumerate(args):
        kernel.set_arg(position, value)

    global_size = (thread_count,)
    local_size = (work_group_size,)
    with timed_region("ParLoopCKernel"):
        launch = cl.enqueue_nd_range_kernel(_queue, kernel, global_size,
                                            local_size, g_times_l=False)
        launch.wait()
def do_opencl_pow(hash, target): output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)]) if (ctx == False): return output[0][0] data = numpy.zeros(1, dtype=hash_dt, order='C') data[0]['v'] = ("0000000000000000" + hash).decode("hex") data[0]['target'] = target hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data) dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes) kernel = program.kernel_sha512 worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, cl.get_platforms()[0].get_devices()[1]) kernel.set_arg(0, hash_buf) kernel.set_arg(1, dest_buf) start = time.time() progress = 0 globamt = worksize*2000 while output[0][0] == 0: kernel.set_arg(2, pack("<Q", progress)) cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,)) cl.enqueue_read_buffer(queue, dest_buf, output) queue.finish() progress += globamt sofar = time.time() - start print sofar, progress / sofar, "hashes/sec" taken = time.time() - start print progress, taken return output[0][0]
def do_opencl_pow(hash, target): output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)]) if (ctx == False): return output[0][0] data = numpy.zeros(1, dtype=hash_dt, order='C') data[0]['v'] = ("0000000000000000" + hash).decode("hex") data[0]['target'] = target hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data) dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes) kernel = program.kernel_sha512 worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, gpus[0]) kernel.set_arg(0, hash_buf) kernel.set_arg(1, dest_buf) start = time.time() progress = 0 globamt = worksize*2000 while output[0][0] == 0: kernel.set_arg(2, pack("<Q", progress)) cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,)) cl.enqueue_read_buffer(queue, dest_buf, output) queue.finish() progress += globamt sofar = time.time() - start print sofar, progress / sofar, "hashes/sec" taken = time.time() - start print progress, taken return output[0][0]
def runFilter(self):
    """Run the mask-based kernel and copy the output buffer over the input.

    The first mask image returned by ``atts.getMaskImages`` must have as many
    elements as ``width * height * slices``; otherwise a message is printed
    and the filter is not run.

    Returns:
        False when the mask size does not match, True once the queue has
        finished. Any exception raised while setting arguments or launching
        the kernel propagates unchanged.
    """
    mask = self.atts.getMaskImages(self.mask, self.L)[0]
    expected = self.atts.width * self.atts.height * self.atts.slices
    # np.product was a deprecated alias of np.prod, removed in NumPy 2.0.
    if expected != np.prod(mask.shape):
        print("Mask dimensions not equal to original image's")
        return False

    depth = self.clattr.maxSliceCount + self.atts.overlap[self.index]

    # computeWorkingGroupSize receives both lists; they are then used as the
    # launch sizes, so it is presumably filling them in place.
    globalSize = [0]
    localSize = [0]
    self.clattr.computeWorkingGroupSize(
        localSize, globalSize, [self.atts.width, self.atts.height, depth])

    self.maskBuffer = self.atts.getStructElement(
        self.clattr.context, self.clattr.queue, mask, globalSize[0])

    # No try/except here: the former ``except Exception as e: raise e`` did
    # nothing except shorten the traceback on Python 2.
    self.kernel.set_args(self.clattr.inputBuffer,
                         self.maskBuffer,
                         self.clattr.outputBuffer,
                         np.int32(self.atts.width),
                         np.int32(self.atts.height),
                         np.int32(depth))
    cl.enqueue_nd_range_kernel(self.clattr.queue, self.kernel, globalSize,
                               localSize)

    # Write results: enqueue_copy takes (queue, dest, src).
    cl.enqueue_copy(self.clattr.queue, self.clattr.inputBuffer,
                    self.clattr.outputBuffer)
    self.clattr.queue.finish()
    return True
def vglCl3dThreshold(img_input, img_output, thresh, top=1.0):
    """Enqueue the ``vglCl3dThreshold`` OpenCL kernel on img_input -> img_output.

    ``thresh`` and ``top`` are converted to ``np.float32`` when they are not
    already of that type (a warning is printed first); if a conversion
    fails, the error is printed and ``exit()`` is called.

    Args:
        img_input: Source image; ``get_oclPtr()`` is kernel argument 0.
        img_output: Destination image; ``get_oclPtr()`` is kernel argument 1
            and its ``shape`` is used as the global size.
        thresh: Threshold value, kernel argument 2.
        top: Kernel argument 3.
    """

    def _as_float32(value, warning, failure):
        # Return value as np.float32, warning first; exit when impossible.
        if isinstance(value, np.float32):
            return value
        print(warning)
        try:
            return np.float32(value)
        except Exception as err:
            print(failure)
            print(str(err))
            exit()

    print("# Running vglCl3dThreshold")

    for image in (img_input, img_output):
        vl.vglCheckContext(image, vl.VGL_CL_CONTEXT())

    thresh = _as_float32(
        thresh,
        "vglCl3dThreshold: Warning: thresh not np.float32! Trying to convert...",
        "vglCl3dThreshold: Error!! Impossible to convert thresh as a np.float32 object.")
    top = _as_float32(
        top,
        "vglCl3dThreshold: Warning: top not np.float32! Trying to convert...",
        "vglCl3dThreshold: Error!! Impossible to convert top as a np.float32 object.")

    compiled = vl.get_ocl_context().get_compiled_kernel(
        "../CL/vglCl3dThreshold.cl", "vglCl3dThreshold")
    threshold_kernel = compiled.vglCl3dThreshold
    threshold_kernel.set_arg(0, img_input.get_oclPtr())
    threshold_kernel.set_arg(1, img_output.get_oclPtr())
    threshold_kernel.set_arg(2, thresh)
    threshold_kernel.set_arg(3, top)

    cl.enqueue_nd_range_kernel(vl.get_ocl().commandQueue, threshold_kernel,
                               img_output.get_oclPtr().shape, None)

    vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
def vglClNdThreshold(img_input, img_output, thresh, top=255):
    """Enqueue the ``vglClNdThreshold`` OpenCL kernel on img_input -> img_output.

    Both images must report ``clForceAsBuf == vl.IMAGE_ND_ARRAY()``.
    ``thresh`` and ``top`` are converted to ``np.uint8`` when they are not
    already of that type (a warning is printed first). Every failure prints
    a message naming this function and calls ``exit(1)``.

    Args:
        img_input: Source image; ``get_oclPtr()`` is kernel argument 0 and
            ``get_ipl().shape`` is used as the global size.
        img_output: Destination image; ``get_oclPtr()`` is kernel argument 1.
        thresh: Threshold value, kernel argument 2.
        top: Kernel argument 3.
    """
    # The messages used to name vglClNdCopy / vglClConvolution (copy-paste
    # leftovers), pointing at the wrong function.
    if (not img_input.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
        print(
            "vglClNdThreshold: Error: this function supports only OpenCL data as buffer and img_input isn't."
        )
        exit(1)
    if (not img_output.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
        print(
            "vglClNdThreshold: Error: this function supports only OpenCL data as buffer and img_output isn't."
        )
        exit(1)

    vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
    vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())

    # Make sure thresh has the type the kernel argument is given as.
    if (not isinstance(thresh, np.uint8)):
        print(
            "vglClNdThreshold: Warning: thresh not np.uint8! Trying to convert..."
        )
        try:
            thresh = np.uint8(thresh)
        except Exception as e:
            print(
                "vglClNdThreshold: Error!! Impossible to convert thresh as a np.uint8 object."
            )
            print(str(e))
            # Non-zero status, matching the buffer checks above.
            exit(1)

    # Same for top.
    if (not isinstance(top, np.uint8)):
        print(
            "vglClNdThreshold: Warning: top not np.uint8! Trying to convert..."
        )
        try:
            top = np.uint8(top)
        except Exception as e:
            print(
                "vglClNdThreshold: Error!! Impossible to convert top as a np.uint8 object."
            )
            print(str(e))
            exit(1)

    _program = vl.get_ocl_context().get_compiled_kernel(
        "CL_ND/vglClNdThreshold.cl", "vglClNdThreshold")
    _kernel = _program.vglClNdThreshold

    _kernel.set_arg(0, img_input.get_oclPtr())
    _kernel.set_arg(1, img_output.get_oclPtr())
    _kernel.set_arg(2, thresh)
    _kernel.set_arg(3, top)

    # Launch the kernel; the local size is left to the OpenCL runtime (None).
    cl.enqueue_nd_range_kernel(vl.get_ocl().commandQueue, _kernel,
                               img_input.get_ipl().shape, None)

    vl.vglSetContext(img_input, vl.VGL_CL_CONTEXT())
    vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
def vglClNdBinThreshold(self, img_input, img_output, thresh):
    """Enqueue the ``vglClNdBinThreshold`` OpenCL kernel on img_input -> img_output.

    Both images must report ``clForceAsBuf == vl.IMAGE_ND_ARRAY()`` and
    ``thresh`` is converted to ``np.uint8`` when needed; any failure prints
    a message and calls ``exit()``.

    Args:
        img_input: Source image; pointer is kernel argument 0 and its shape
            buffer is argument 3.
        img_output: Destination image; pointer is kernel argument 1, its
            shape buffer is argument 4 and ``ipl.shape`` is the global size.
        thresh: Threshold value, kernel argument 2.
    """
    buffer_checks = (
        (img_input,
         "vglClNdBinThreshold: Error: this function supports only OpenCL data as buffer and img_input isn't."),
        (img_output,
         "vglClNdBinThreshold: Error: this function supports only OpenCL data as buffer and img_output isn't."),
    )
    for image, complaint in buffer_checks:
        if not image.clForceAsBuf == vl.IMAGE_ND_ARRAY():
            print(complaint)
            exit()

    if not isinstance(thresh, np.uint8):
        print(
            "vglClNdBinThreshold: Warning: thresh not np.uint8! Trying to convert..."
        )
        try:
            thresh = np.uint8(thresh)
        except Exception as err:
            print(
                "vglClNdBinThreshold: Error!! Impossible to convert thresh as a np.uint8 object."
            )
            print(str(err))
            exit()

    for image in (img_input, img_output):
        vl.vglCheckContext(image, vl.VGL_CL_CONTEXT())

    compiled = self.cl_ctx.get_compiled_kernel(
        "../CL_BIN/vglClNdBinThreshold.cl", "vglClNdBinThreshold")
    threshold_kernel = compiled.vglClNdBinThreshold

    input_shape_buffer = img_input.getVglShape().get_asVglClShape_buffer()
    output_shape_buffer = img_output.getVglShape().get_asVglClShape_buffer()

    threshold_kernel.set_arg(0, img_input.get_oclPtr())
    threshold_kernel.set_arg(1, img_output.get_oclPtr())
    threshold_kernel.set_arg(2, thresh)
    threshold_kernel.set_arg(3, input_shape_buffer)
    threshold_kernel.set_arg(4, output_shape_buffer)

    # This work size is computed but not used by the launch below, which
    # takes img_output.ipl.shape instead.
    width_items = img_input.getWidthIn()
    for image in (img_input, img_output):
        if image.depth == vl.IPL_DEPTH_1U():
            width_items = image.getWidthStep()
    worksize = (int(width_items), img_input.getHeigthIn(),
                img_input.getNFrames())

    cl.enqueue_nd_range_kernel(self.ocl.commandQueue, threshold_kernel,
                               img_output.ipl.shape, None)

    vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
def exec_lsz_safe(self, localsize):
    """Execute the kernel with a specific localsize, then restore the old one.

    Safe also for kernels with local variables. ``self.localsize`` is set to
    *localsize* for the launch, the queue is drained, and the previous value
    (``int(self._localsize)``) is restored - also when the launch raises.

    Args:
        localsize: Local work size to use for this one execution.
    """
    oldloc = int(self._localsize)
    self.localsize = localsize
    try:
        cl.enqueue_nd_range_kernel(self._solverobj.clqueue, self._clkernel,
                                   (self.globalsize,), (self.localsize,))
        self._solverobj.clqueue.finish()
    finally:
        # Without this, a failed launch left the temporary size in place.
        self.localsize = oldloc
def Difference(img1, img2, threshold):
    """Run the ``Difference`` OpenCL kernel on two images using the GPU.

    Args:
        img1: First image; converted to a ``uint8`` array.
        img2: Second image; converted to a ``uint8`` array.
        threshold: Passed to the kernel as ``float32`` (argument 3).

    Returns:
        A new array shaped like the converted *img1* holding the kernel
        output.
    """
    img1 = np.array(img1).astype('uint8')
    img2 = np.array(img2).astype('uint8')

    # First platform, first GPU device of that platform.
    platform = cl.get_platforms()[0]
    device = platform.get_devices(cl.device_type.GPU)[0]
    context = cl.Context([device])
    queue = cl.CommandQueue(context, device)

    shape = img1.T.shape
    result = np.empty_like(img1)

    gray_format = cl.ImageFormat(cl.channel_order.LUMINANCE,
                                 cl.channel_type.UNORM_INT8)
    imgInBuf1 = cl.Image(context, cl.mem_flags.READ_ONLY, gray_format,
                         shape=shape)
    imgInBuf2 = cl.Image(context, cl.mem_flags.READ_ONLY, gray_format,
                         shape=shape)
    imgOutBuf = cl.Image(context, cl.mem_flags.WRITE_ONLY, gray_format,
                         shape=shape)

    # The context manager closes the source file, which the bare
    # open().read() never did.
    with open('Difference.cl') as source_file:
        program = cl.Program(context, source_file.read()).build()

    kernel = cl.Kernel(program, 'Difference')
    kernel.set_arg(0, imgInBuf1)
    kernel.set_arg(1, imgInBuf2)
    kernel.set_arg(2, imgOutBuf)
    kernel.set_arg(3, np.float32(threshold))

    # Upload both inputs, run one work-item per pixel, then download; only
    # the final copy blocks.
    cl.enqueue_copy(queue, imgInBuf1, img1, origin=(0, 0), region=shape,
                    is_blocking=False)
    cl.enqueue_copy(queue, imgInBuf2, img2, origin=(0, 0), region=shape,
                    is_blocking=False)
    cl.enqueue_nd_range_kernel(queue, kernel, shape, None)
    cl.enqueue_copy(queue, result, imgOutBuf, origin=(0, 0), region=shape,
                    is_blocking=True)
    return result
def futhark_main(self, screenX_700, screenY_701, depth_702, xmin_703, ymin_704, xmax_705, ymax_706):
    # Entry point that launches two OpenCL kernels (map_kernel_894_var, then
    # map_kernel_846_var) and returns (size in bytes, device buffer) of the
    # second kernel's output buffer.
    # NOTE(review): the numbered names suggest machine-generated code; the
    # helpers sitofp_i32_f32 / slt32 / squot32 and the `synchronous` flag are
    # defined outside this block - semantics below are inferred from names.
    res_707 = (xmax_705 - xmin_703)
    res_708 = (ymax_706 - ymin_704)
    y_711 = sitofp_i32_f32(screenX_700)
    y_712 = sitofp_i32_f32(screenY_701)
    x_713 = slt32(np.int32(0), depth_702)
    # Two buffers of 4 * screenY bytes each; a 1-byte buffer is allocated
    # instead when that size is not positive.
    bytes_902 = (np.int32(4) * screenY_701)
    mem_903 = cl.Buffer(
        self.ctx, cl.mem_flags.READ_WRITE,
        long(
            long(bytes_902) if (bytes_902 > np.int32(0)) else np.int32(1)))
    mem_905 = cl.Buffer(
        self.ctx, cl.mem_flags.READ_WRITE,
        long(
            long(bytes_902) if (bytes_902 > np.int32(0)) else np.int32(1)))
    # First launch: groups of 512 work-items; the group count is computed as
    # squot32(screenY + 511, 512) - presumably a rounded-up division.
    group_size_911 = np.int32(512)
    num_groups_912 = squot32(
        ((screenY_701 + group_size_911) - np.int32(1)), group_size_911)
    # Skip the launch entirely when the global size would be zero.
    if ((np.int32(1) * (num_groups_912 * group_size_911)) != np.int32(0)):
        self.map_kernel_894_var.set_args(np.float32(ymin_704),
                                         np.float32(y_712),
                                         np.float32(res_708),
                                         np.int32(screenY_701), mem_903,
                                         mem_905)
        cl.enqueue_nd_range_kernel(
            self.queue, self.map_kernel_894_var, (long(
                (num_groups_912 * group_size_911)), ),
            (long(group_size_911), ))
        if synchronous:
            self.queue.finish()
    # nesting_size_844 is computed but not used later in this function.
    nesting_size_844 = (screenX_700 * screenY_701)
    # Output buffer: bytes_902 * screenX bytes (again at least 1 byte).
    bytes_906 = (bytes_902 * screenX_700)
    mem_908 = cl.Buffer(
        self.ctx, cl.mem_flags.READ_WRITE,
        long(
            long(bytes_906) if (bytes_906 > np.int32(0)) else np.int32(1)))
    # Second launch: same scheme over screenY * screenX elements; it is given
    # both buffers from the first stage plus the output buffer.
    group_size_917 = np.int32(512)
    num_groups_918 = squot32(
        (((screenY_701 * screenX_700) + group_size_917) - np.int32(1)),
        group_size_917)
    if ((np.int32(1) * (num_groups_918 * group_size_917)) != np.int32(0)):
        self.map_kernel_846_var.set_args(np.int32(screenX_700),
                                         np.int32(screenY_701), mem_905,
                                         np.byte(x_713),
                                         np.int32(depth_702),
                                         np.float32(xmin_703), mem_903,
                                         np.float32(y_711),
                                         np.float32(res_707), mem_908)
        cl.enqueue_nd_range_kernel(
            self.queue, self.map_kernel_846_var, (long(
                (num_groups_918 * group_size_917)), ),
            (long(group_size_917), ))
        if synchronous:
            self.queue.finish()
    out_mem_909 = mem_908
    out_memsize_910 = bytes_906
    return (out_memsize_910, out_mem_909)
def dfunKernel(self, state_variables, coupling, local_coupling=0.0):
    """Evaluate the derivative kernel for the given state and coupling.

    Args:
        state_variables: 3-D ``numpy.ndarray`` or ``pyopencl.array.Array``
            indexed as (n_states, n_nodes, n_mode).
        coupling: Coupling values of the same kind as *state_variables*;
            host arrays are reshaped to ``(1, n_nodes)``.
        local_coupling: Accepted but not used by this implementation.

    Returns:
        For host input, a float64 ``numpy.ndarray`` of shape
        ``(n_states, n_nodes, n_mode)``; for device input,
        ``self._arrays['deriv']`` as is.

    Raises:
        TypeError: If *state_variables* is neither a numpy array nor a
            pyopencl array.
    """
    n_states = state_variables.shape[0]
    n_nodes = state_variables.shape[1]
    n_mode = state_variables.shape[2]

    # allocate data if not yet done so
    if not hasattr(self, '_arrays'):
        self._alloc_opencl(n_nodes, n_states=n_states, n_mode=n_mode)

    # copy if passed host arrays
    if isinstance(state_variables, numpy.ndarray):
        if (DEBUG):
            print("state_variables are ndarray", "states:",
                  state_variables.shape, "coupling:", coupling.shape)
        if (DEBUG):
            print(
                "state_variable shape:",
                state_variables.reshape(
                    (n_states, n_nodes * n_mode, 1)).astype('f').shape)
            print("array state shape", self._arrays['state'][:].shape)
        self._arrays['state'][:] = state_variables.reshape(
            (n_states, n_nodes, n_mode)).astype('f')
        self._arrays['coupling'][:] = coupling.reshape(
            (1, n_nodes)).astype('f')
    # set kernel arg if passed device arrays
    elif isinstance(state_variables, pyopencl.array.Array):
        self._kernel.set_args(state_variables.data, coupling.data,
                              self._arrays['param'].data,
                              self._arrays['deriv'].data)
    # otherwise, complain
    else:
        # The type used to be passed as a second exception argument, so the
        # %r placeholder was never filled in.
        raise TypeError(
            'unsupported data type %r' % (type(state_variables),))

    # run the kernel (one work-item per node) and wait
    print("Run kernel...")
    pyopencl.enqueue_nd_range_kernel(self._queue, self._kernel, (n_nodes,),
                                     None).wait()

    # return derivatives following input type
    deriv = self._arrays['deriv']
    if (DEBUG):
        print("derive shape:", deriv.shape)
    if isinstance(state_variables, numpy.ndarray):
        deriv = deriv.get().reshape(
            (n_states, n_nodes, n_mode)).astype('d')
    return deriv
def runKernel(self, maskImages, overlapAmount):
    """Run the kernel once per mask image, alternating the output buffers.

    The first mask is processed by ``self.kernel`` (input -> outputTmpBuffer);
    every later mask by ``self.kernel2``, which reads one of
    outputTmpBuffer / outputBuffer and writes the other, alternating with the
    parity of the mask index. If the number of masks is odd, the two buffers
    are swapped at the end.

    Args:
        maskImages: Sequence of 3-D mask arrays.
        overlapAmount: Accepted but not used by this implementation.

    Returns:
        False as soon as one kernel launch raises, True otherwise.
    """
    # computeWorkingGroupSize receives both lists; they are then used as the
    # launch sizes, so it is presumably filling them in place.
    globalSize = [0, 0]
    localSize = [0, 0]
    self.clattr.computeWorkingGroupSize(
        localSize, globalSize, [self.atts.width, self.atts.height, 1])

    for i, mask in enumerate(maskImages):
        # Kernel receives the mask extents in reversed axis order.
        size = [mask.shape[2], mask.shape[1], mask.shape[0]]

        structElem = self.atts.getStructElement(
            self.clattr.context, self.clattr.queue, mask)

        # Offsets are half the overlap, reset to zero depending on the
        # slice range.
        startOffset = 0
        endOffset = 0
        if self.atts.overlap[self.index] > 0:
            startOffset = int(self.atts.overlap[self.index] / 2)
            endOffset = int(self.atts.overlap[self.index] / 2)
            if self.atts.sliceStart <= 0:
                startOffset = 0
            if self.atts.sliceEnd >= 0:
                endOffset = 0

        depth = np.int32(self.clattr.maxSliceCount
                         + self.atts.overlap[self.index])
        if i == 0:
            kernel = self.kernel
            kernel.set_args(self.clattr.inputBuffer,
                            self.clattr.outputTmpBuffer,
                            np.int32(self.atts.width),
                            np.int32(self.atts.height),
                            depth,
                            structElem,
                            np.int32(size[0]), np.int32(size[1]),
                            np.int32(size[2]),
                            np.int32(startOffset), np.int32(endOffset))
        else:
            kernel = self.kernel2
            # Odd passes go tmp -> output, even passes go output -> tmp.
            if i % 2 != 0:
                tmpBuffer1 = self.clattr.outputTmpBuffer
                tmpBuffer2 = self.clattr.outputBuffer
            else:
                tmpBuffer1 = self.clattr.outputBuffer
                tmpBuffer2 = self.clattr.outputTmpBuffer
            kernel.set_args(self.clattr.inputBuffer, tmpBuffer1, tmpBuffer2,
                            np.int32(self.atts.width),
                            np.int32(self.atts.height),
                            depth,
                            structElem,
                            np.int32(size[0]), np.int32(size[1]),
                            np.int32(size[2]),
                            np.int32(startOffset), np.int32(endOffset))

        try:
            cl.enqueue_nd_range_kernel(self.clattr.queue, kernel,
                                       globalSize, localSize)
        except Exception:
            return False
        finally:
            # Release the structuring element on success and on failure
            # alike; the failure path used to skip the release.
            structElem.release()

    if len(maskImages) % 2 != 0:
        # Swap the two buffers after an odd number of passes.
        self.clattr.outputTmpBuffer, self.clattr.outputBuffer = (
            self.clattr.outputBuffer, self.clattr.outputTmpBuffer)
    return True
def test_algorithm(self):
    """Run the pbrs kernel(s) against the CSV test bench (Python 2 code).

    For every kernel name in ``self.kernelname`` six test vectors are read.
    Each vector is made of eight consecutive CSV lines from
    ``test_bench_pbrs_input.csv``; the matching eight lines from
    ``test_bench_pbrs_output.csv`` are the reference. The input is copied
    to the device, the kernel is run with global and local size 8, and the
    output bytes are compared with the reference bytes.

    Returns True when the last kernel tested passed all six vectors,
    otherwise False.

    NOTE(review): the source was flattened onto one line; the nesting of the
    summary print and the ``close()`` calls inside the per-kernel loop is
    reconstructed - confirm against the original file.
    """
    print "\n**************************"
    print "test_pbrs:"
    passed = 0
    buffersize_in = 188*8
    buffersize_out = 188*8
    # opencl buffer uint
    self.inputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=buffersize_in*4)
    # opencl buffer uint
    self.outputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=buffersize_out*4)
    for k in self.kernelname:
        kernel = self.load_kernel(self.filename, k)
        # the pass counter restarts for every kernel
        passed = 0
        self.fd_input = open('test_bench_pbrs_input.csv', 'r')
        self.fd_output = open('test_bench_pbrs_output.csv', 'r')
        for j in range(0,6):
            # host-side destination for the kernel output (uint32 words)
            encoded_data = numpy.array(numpy.zeros(buffersize_out/4), dtype=numpy.uint32)
            # first of the eight CSV lines of this vector, newline stripped
            data_to_encode = string.replace(self.fd_input.readline(),'\n','')
            reference_data = string.replace(self.fd_output.readline(),'\n','')
            # append the remaining seven lines, comma separated
            for i in range(0,7):
                data_to_encode = "%s,%s" % (data_to_encode, string.replace(self.fd_input.readline(),'\n',''))
                reference_data = "%s,%s" % (reference_data, string.replace(self.fd_output.readline(),'\n',''))
            # parse the CSV text as bytes, then reinterpret the bytes as uint32 words
            data_to_encode = numpy.fromstring(numpy.fromstring(data_to_encode, dtype=numpy.uint8, sep=",").tostring(), dtype=numpy.uint32)
            reference_data = numpy.fromstring(reference_data, dtype=numpy.uint8, sep=",")
            cl.enqueue_copy(self.queue, self.inputbuffer, data_to_encode).wait()
            kernel.set_args(self.inputbuffer, self.outputbuffer)
            cl.enqueue_nd_range_kernel(self.queue,kernel,(8,),(8,),None ).wait()
            cl.enqueue_copy(self.queue, encoded_data, self.outputbuffer).wait()
            # view the result as bytes so it can be compared with the reference
            encoded_data = (numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))
            if encoded_data.tostring() == reference_data.tostring():
                passed += 1
                print "Test %d PASSED" % (j+1)
            else:
                print "Test %d FAILED" % (j+1)
                print "input data:"
                print numpy.fromstring(data_to_encode.tostring(), dtype=numpy.uint8)
                print "encoded data:"
                print numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8)
                print "reference data:"
                print reference_data
                print "error data:"
                print (reference_data - numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))
        print "%d pass out of 6" % passed
        self.fd_input.close()
        self.fd_output.close()
    if passed == 6:
        print "All pbrs tests PASS\n"
        return True
    else:
        print "at least one pbrs test FAILED\n"
        return False
def prefixSumDownInplace(self, e, data, ndata, events):
    """Enqueue the in-place down-sweep of a prefix sum over ``data``.

    Args:
        e: ``None`` or a sequence of events the kernel has to wait for.
        data: a ``cl.Buffer`` or a host array (copied into a new buffer).
        ndata: number of elements in ``data``.
        events: collection that every enqueued event is added to via ``+=``.

    The kernel writes per-block sums to a new buffer and the number of
    block sums to a one-element buffer. If more than one block sum was
    produced this method recurses on the block sums; otherwise the single
    block sum is read back as the total. ``prefixSumUp`` is then called
    with the block sums.

    Returns:
        ``(e, data_buf, psum_buf, npsum_buf, ndata2)`` where ``e`` is the
        tuple of events last waited on and ``ndata2`` the total read back
        at the innermost level.

    Fix: the original only passed ``wait_for=e`` when ``e`` was ``None``
    and dropped the dependency when ``e`` held events. ``wait_for=e`` is
    now always passed (``None`` means no dependency).
    """
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    if isinstance(data, cl.Buffer):
        data_buf = data
    else:
        data_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR,
                             hostbuf=data)

    grid_dims = self.get_grid_dims(ndata)
    # One uint64 per work-group for the block sums, one uint64 for the count.
    psumbytes = int(np.prod(grid_dims) * np.uint64(0).nbytes)
    npsumbytes = np.uint64(0).nbytes
    psum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, psumbytes)
    npsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, npsumbytes)

    kernel = self.prg.prefixSumDownInplace
    kernel.set_args(data_buf, np.uint64(ndata), psum_buf, npsum_buf)
    global_dims = self.get_global(grid_dims)
    print("prefixSumDownInplace %s %s %d %d" % (
        str(global_dims), str(self.localDims), ndata, psumbytes))

    e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                    self.localDims, wait_for=e), )
    events += e

    npsum = np.zeros(1, dtype=np.uint64)
    events += (cl.enqueue_copy(self.queue, npsum, npsum_buf, wait_for=e), )

    if npsum > 1:
        # Several blocks: scan the block sums themselves first.
        (e, psum_buf, psum1_buf, npsum1_buf,
         ndata2) = self.prefixSumDownInplace(e, psum_buf, npsum.item(),
                                             events)
    else:
        # A single block: its block sum is the overall total.
        ndata2 = np.zeros(1, dtype=np.uint64)
        events += (cl.enqueue_copy(self.queue, ndata2, psum_buf,
                                   wait_for=e), )
        ndata2 = ndata2.item()
        print(ndata2)

    self.prefixSumUp(e, data_buf, ndata, psum_buf, npsum, events)
    return (e, data_buf, psum_buf, npsum_buf, ndata2)
def vglCl3dDilate(img_input, img_output, convolution_window, window_size_x, window_size_y, window_size_z):
    """Run the vglCl3dDilate OpenCL kernel from img_input into img_output.

    convolution_window is uploaded into a read-only cl.Buffer, and the three
    window sizes are converted to np.uint32 when they are not already.
    Any failed conversion prints an error and calls exit().
    """

    def _as_uint32(value, warning, failure):
        # Return value unchanged if already np.uint32, else warn and convert;
        # on conversion failure print the messages and terminate.
        if isinstance(value, np.uint32):
            return value
        print(warning)
        try:
            return np.uint32(value)
        except Exception as e:
            print(failure)
            print(str(e))
            exit()

    print("# Running vglCl3dDilate")

    cl_context_id = vl.VGL_CL_CONTEXT
    vl.vglCheckContext(img_input, cl_context_id())
    vl.vglCheckContext(img_output, cl_context_id())

    # Convert the window into a device buffer.
    try:
        window_buffer = cl.Buffer(vl.get_ocl().context, cl.mem_flags.READ_ONLY, convolution_window.nbytes)
        cl.enqueue_copy(vl.get_ocl().commandQueue, window_buffer, convolution_window.tobytes(), is_blocking=True)
        convolution_window = window_buffer
    except Exception as e:
        print("vglCl3dDilate: Error!! Impossible to convert convolution_window to cl.Buffer object.")
        print(str(e))
        exit()

    window_size_x = _as_uint32(
        window_size_x,
        "vglCl3dDilate: Warning: window_size_x not np.uint32! Trying to convert...",
        "vglCl3dDilate: Error!! Impossible to convert window_size_x as a np.uint32 object.")
    window_size_y = _as_uint32(
        window_size_y,
        "vglCl3dDilate: Warning: window_size_y not np.uint32! Trying to convert...",
        "vglCl3dDilate: Error!! Impossible to convert window_size_y as a np.uint32 object.")
    window_size_z = _as_uint32(
        window_size_z,
        "vglCl3dDilate: Warning: window_size_z not np.uint32! Trying to convert...",
        "vglCl3dDilate: Error!! Impossible to convert window_size_z as a np.uint32 object.")

    _program = vl.get_ocl_context().get_compiled_kernel("../CL/vglCl3dDilate.cl", "vglCl3dDilate")
    kernel_run = _program.vglCl3dDilate

    kernel_run.set_arg(0, img_input.get_oclPtr())
    kernel_run.set_arg(1, img_output.get_oclPtr())
    kernel_run.set_arg(2, convolution_window)
    kernel_run.set_arg(3, window_size_x)
    kernel_run.set_arg(4, window_size_y)
    kernel_run.set_arg(5, window_size_z)

    # The global work size is the shape of the output's OpenCL object.
    cl.enqueue_nd_range_kernel(vl.get_ocl().commandQueue, kernel_run, img_output.get_oclPtr().shape, None)

    vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
def calc_weights_gradient(self):
    """
    Calculate gradient of weights.

    Must only be called for processed layers, because it relies on the
    inputs array, which is valid only at processing time. Unprocessed next
    layers are handled first; then this layer's gradient kernel is
    enqueued, followed by one error-propagation kernel per previous layer.
    """
    for link in self._next_layers:
        successor = link[0]
        if not successor.processed:
            successor.calc_weights_gradient()

    ocl = self.opencl
    queue = ocl.queue

    gradient_kernel = ocl.kernel_calc_layer_gradient
    gradient_kernel.set_arg(2, self._inputs_offset)
    gradient_kernel.set_arg(3, self._neurons_offset)
    gradient_kernel.set_arg(4, self._inputs_per_neuron)
    gradient_kernel.set_arg(5, self._weights_offset)
    gradient_kernel.set_arg(7, self._weights_count)
    # Size of the local scratch area in bytes (4 bytes per entry).
    scratch_bytes = int(
        4 * (self._inputs_per_neuron + 1 +
             ocl.max_local_size[0] // self._inputs_per_neuron))
    gradient_kernel.set_arg(8, pyopencl.LocalMemory(scratch_bytes))

    self._calc_gradient_event = pyopencl.enqueue_nd_range_kernel(
        queue,
        gradient_kernel,
        (int(self._weights_buf_size), ),
        (ocl.max_local_size[0], ),
        wait_for=self._calc_gradient_wait_for)

    # The dependencies have been handed to the enqueue call; empty the list.
    del self._calc_gradient_wait_for[:]

    propagate_kernel = ocl.kernel_propagate_errors
    propagate_kernel.set_arg(2, self._neurons_offset)
    propagate_kernel.set_arg(5, self._neuron_count)
    propagate_kernel.set_arg(7, self._inputs_per_neuron)

    # Running offset into this layer's weights, starting at 1.
    weight_shift = numpy.int32(1)
    for link in self._prev_layers:
        propagate_kernel.set_arg(3, link[0]._neurons_offset + link[1])
        propagate_kernel.set_arg(4, link[2])
        propagate_kernel.set_arg(6, self._weights_offset + weight_shift)
        propagated = pyopencl.enqueue_nd_range_kernel(
            queue,
            propagate_kernel,
            (int(link[2] * 64), ),
            (64, ),
            wait_for=(self._calc_gradient_event, ))
        # The previous layer's own gradient pass must wait for this event.
        link[0]._calc_gradient_wait_for.append(propagated)
        weight_shift += link[2]

    self._processed = True
def execute(self):
    """Run the ``mul`` kernel on a (2, 5) range and print a, b and the result."""
    mul = self.program.mul
    dims = (numpy.int32(2), numpy.int32(5), numpy.int32(10))
    mul.set_args(self.a_buf, self.b_buf, self.c_buf, *dims)
    cl.enqueue_nd_range_kernel(self.queue, mul, (2, 5), None)

    # Host array shaped like the numpy product, filled from the device buffer.
    c = numpy.empty_like(self.a.dot(self.b))
    copy_done = cl.enqueue_copy(self.queue, c, self.c_buf)
    copy_done.wait()

    print("a", self.a)
    print("b", self.b)
    print("c", c)
def vglClNdThreshold(self, img_input, img_output, thresh, top=255):
    """Run the vglClNdThreshold OpenCL kernel from img_input into img_output.

    Both images must hold their OpenCL data as an ND-array buffer, otherwise
    an error is printed and exit() is called. thresh and top are converted
    to np.uint8 when they are not already; a failed conversion also prints
    an error and calls exit().
    """

    def _as_uint8(value, warning, failure):
        # Return value unchanged if already np.uint8, else warn and convert;
        # on conversion failure print the messages and terminate.
        if isinstance(value, np.uint8):
            return value
        print(warning)
        try:
            return np.uint8(value)
        except Exception as e:
            print(failure)
            print(str(e))
            exit()

    print("# Running vglClNdThreshold")

    buffer_checks = (
        (img_input,
         "vglClNdThreshold: Error: this function supports only OpenCL data as buffer and img_input isn't."),
        (img_output,
         "vglClNdThreshold: Error: this function supports only OpenCL data as buffer and img_output isn't."),
    )
    for image, message in buffer_checks:
        if (not image.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
            print(message)
            exit()

    thresh = _as_uint8(
        thresh,
        "vglClNdThreshold: Warning: thresh not np.uint8! Trying to convert...",
        "vglClNdThreshold: Error!! Impossible to convert thresh as a np.uint8 object.")
    top = _as_uint8(
        top,
        "vglClNdThreshold: Warning: top not np.uint8! Trying to convert...",
        "vglClNdThreshold: Error!! Impossible to convert top as a np.uint8 object.")

    vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
    vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())

    _program = self.cl_ctx.get_compiled_kernel(
        "../CL_ND/vglClNdThreshold.cl", "vglClNdThreshold")
    kernel_run = _program.vglClNdThreshold

    kernel_run.set_arg(0, img_input.get_oclPtr())
    kernel_run.set_arg(1, img_output.get_oclPtr())
    kernel_run.set_arg(2, thresh)
    kernel_run.set_arg(3, top)

    # The global work size is the shape of the output image's ipl array.
    cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run,
                               img_output.get_ipl().shape, None)

    vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
def max_reduce_real4(ipt):
    """Split ``ipt`` into three CLReal arrays and return the max of each.

    The split kernel gets ``ipt`` as argument 0 and three freshly created
    CLReal arrays of the same length as arguments 1-3. The result is a
    3-tuple of ``max_reduce`` applied to each of those arrays.
    """
    count = len(ipt)
    parts = [CLReal(count) for _ in range(3)]

    kern = _splitkern_real4.kern
    kern.set_arg(0, ipt._buffer)
    for slot, part in enumerate(parts, 1):
        kern.set_arg(slot, part._buffer)

    cl.enqueue_nd_range_kernel(ipt._ctrl.clqueue, kern, (count,), None)
    return tuple(max_reduce(part) for part in parts)
def send(self):
    """Bind the input buffer, migrate it to the device and launch the input stage.

    The kernel receives the input buffer and ``data_size / 4`` as an int32.
    It is launched with global size [1] and local size [1].
    """
    input_stage = self.ocl_krnl_input_stage
    queue = self.ocl_q

    # Kernel arguments: the buffer and its length expressed in 4-byte units.
    element_count = np.int32(self.data_size / 4)
    input_stage.set_args(self.buffer_input, element_count)

    # Move the input data to device global memory.
    cl.enqueue_migrate_mem_objects(queue, [self.buffer_input], flags=0)

    # Launch the kernel.
    cl.enqueue_nd_range_kernel(queue, input_stage, [1], [1])
def prefixSum(self, e, data, keys, ndata, low, hi, events):
    """Enqueue the down-sweep of a keyed prefix sum followed by the up-sweep.

    Args:
        e: ``None`` or a sequence of events the kernel has to wait for.
        data, keys: ``cl.Buffer`` objects or host arrays (copied into new
            read-only buffers).
        ndata: number of elements.
        low, hi: key bounds, converted with ``PrefixSum.HOST_TYPE_KEYS``.
        events: collection that every enqueued event is added to via ``+=``.

    The kernel writes a per-element prefix-sum buffer, per-block sums, and
    the number of block sums. If more than one block sum was produced they
    are scanned with ``prefixSumDownInplace``; otherwise the single block
    sum is read back as the total. ``prefixSumUp`` is then called with the
    block sums.

    Returns:
        ``(e, data_buf, keys_buf, psum_buf, bsum_buf, nbsum_buf, ndata2)``.

    Fix: the original only passed ``wait_for=e`` when ``e`` was ``None``
    and dropped the dependency when ``e`` held events. ``wait_for=e`` is
    now always passed (``None`` means no dependency).
    """
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    if isinstance(data, cl.Buffer):
        data_buf = data
    else:
        data_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=data)
    if isinstance(keys, cl.Buffer):
        keys_buf = keys
    else:
        keys_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=keys)

    grid_dims = self.get_grid_dims(ndata)
    # uint64 storage: one per element, one per work-group, one for the count.
    psumbytes = ndata * np.uint64(0).nbytes
    bsumbytes = int(np.prod(grid_dims) * np.uint64(0).nbytes)
    nbsumbytes = np.uint64(0).nbytes
    psum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, psumbytes)
    bsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, bsumbytes)
    nbsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, nbsumbytes)

    low = PrefixSum.HOST_TYPE_KEYS(low)
    hi = PrefixSum.HOST_TYPE_KEYS(hi)

    kernel = self.prg.prefixSumDown
    kernel.set_args(data_buf, keys_buf, np.uint64(ndata), low, hi,
                    psum_buf, bsum_buf, nbsum_buf)
    global_dims = self.get_global(grid_dims)
    print("prefixSumDown %s %s" % (str(global_dims), str(self.localDims)))

    e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                    self.localDims, wait_for=e), )
    events += e

    nbsum = np.zeros(1, dtype=np.uint64)
    events += (cl.enqueue_copy(self.queue, nbsum, nbsum_buf, wait_for=e), )

    if nbsum > 1:
        # Several blocks: scan the block sums first.
        (e, bsum_buf, bsum1_buf, nbsum1_buf,
         ndata2) = self.prefixSumDownInplace(e, bsum_buf, nbsum.item(),
                                             events)
    else:
        # A single block: its block sum is the overall total.
        ndata2 = np.zeros(1, dtype=np.uint64)
        events += (cl.enqueue_copy(self.queue, ndata2, bsum_buf,
                                   wait_for=e), )
        ndata2 = ndata2.item()
        print(ndata2)

    self.prefixSumUp(e, psum_buf, ndata, bsum_buf, nbsum, events)
    return (e, data_buf, keys_buf, psum_buf, bsum_buf, nbsum_buf, ndata2)
def vglClNdBinMin(self, img_input, img_input2, img_output):
    """Run the vglClNdBinMin OpenCL kernel on two inputs into img_output.

    All three images must hold their OpenCL data as an ND-array buffer,
    otherwise an error naming the offending argument is printed and exit()
    is called.

    Fixes relative to the original:
      * the check ``isinstance(window, vl.VglStrEl)`` referenced a name that
        is neither a parameter nor a local, so every call raised NameError;
        the kernel takes no window, so the check is removed;
      * the error message for ``img_input2`` named ``img_input``;
      * the unused ``worksize`` computation is dropped - the kernel is
        launched over ``img_output.get_ipl().shape`` as before.
    """
    operands = (
        (img_input,
         "vglClNdBinMin: Error: this function supports only OpenCL data as buffer and img_input isn't."),
        (img_input2,
         "vglClNdBinMin: Error: this function supports only OpenCL data as buffer and img_input2 isn't."),
        (img_output,
         "vglClNdBinMin: Error: this function supports only OpenCL data as buffer and img_output isn't."),
    )
    for image, message in operands:
        if (not image.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
            print(message)
            exit()

    for image, _ in operands:
        vl.vglCheckContext(image, vl.VGL_CL_CONTEXT())

    _program = self.cl_ctx.get_compiled_kernel(
        "../CL_BIN/vglClNdBinMin.cl", "vglClNdBinMin")
    kernel_run = _program.vglClNdBinMin

    kernel_run.set_arg(0, img_input.get_oclPtr())
    kernel_run.set_arg(1, img_input2.get_oclPtr())
    kernel_run.set_arg(2, img_output.get_oclPtr())

    # Enqueue over the shape of the output image's ipl array.
    cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run,
                               img_output.get_ipl().shape, None)

    vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
def filter(self, data, keys, low, hi, events):
    """Filter ``data``/``keys`` on the device using a prefix sum over the keys.

    ``prefixSum`` is run first; it yields the index buffer and the number of
    selected elements (``ndata2``). The ``filter`` kernel then writes the
    selected data and keys into buffers of that size, and the results are
    copied back to host arrays.

    Args:
        data, keys: host arrays or ``cl.Buffer`` objects; ``data.size`` is
            used as the element count.
        low, hi: key bounds, converted with ``PrefixSum.HOST_TYPE_KEYS``.
        events: collection that every enqueued event is added to via ``+=``.

    Returns:
        ``(filt, indices, data2, keys2)``. ``filt`` is only filled from the
        device when ``PrefixSum.RETURN_FILTER == 1``; otherwise it stays
        all zeros.

    Fix: the original enqueued the kernel without ``wait_for`` whenever
    ``e`` was not ``None``, which is always the case after ``prefixSum``.
    The kernel now waits for the prefix-sum events.
    """
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    ndata = data.size
    (e, data_buf, keys_buf, indices_buf, bsum_buf, nbsum_buf,
     ndata2) = self.prefixSum(None, data, keys, ndata, low, hi, events)

    with_filter = PrefixSum.RETURN_FILTER == 1

    # Host-side destinations for the results.
    filt = np.zeros(ndata, dtype=np.bool8)
    indices = np.zeros(ndata, dtype=np.uint64)
    data2 = np.zeros(ndata2, dtype=PrefixSum.HOST_TYPE_DATA)
    keys2 = np.zeros(ndata2, dtype=PrefixSum.HOST_TYPE_KEYS)
    ndata2bytes = np.uint64(0).nbytes

    if with_filter:
        filt_buf = cl.Buffer(self.ctx, mf.READ_WRITE, filt.nbytes)
    print(data2.nbytes)
    data2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, data2.nbytes)
    keys2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, keys2.nbytes)
    ndata2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, ndata2bytes)

    low = PrefixSum.HOST_TYPE_KEYS(low)
    hi = PrefixSum.HOST_TYPE_KEYS(hi)

    kernel = self.prg.filter
    if with_filter:
        kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata),
                        low, hi, filt_buf, data2_buf, keys2_buf, ndata2_buf)
    else:
        kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata),
                        low, hi, data2_buf, keys2_buf, ndata2_buf)

    global_dims = self.get_global(self.get_grid_dims(ndata))
    print("filter")

    e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                    self.localDims, wait_for=e), )
    events += e

    if with_filter:
        events += (
            cl.enqueue_copy(self.queue, filt, filt_buf, wait_for=e),
            cl.enqueue_copy(self.queue, indices, indices_buf, wait_for=e),
            cl.enqueue_copy(self.queue, data2, data2_buf, wait_for=e),
            cl.enqueue_copy(self.queue, keys2, keys2_buf, wait_for=e),
        )
    else:
        events += (
            cl.enqueue_copy(self.queue, indices, indices_buf, wait_for=e),
            cl.enqueue_copy(self.queue, data2, data2_buf, wait_for=e),
            cl.enqueue_copy(self.queue, keys2, keys2_buf, wait_for=e),
        )
    return (filt, indices, data2, keys2)
def futhark_render(self, width_743, height_744, time_745, degree_746):
    """Allocate two device buffers and enqueue the two map kernels.

    The numbered names suggest machine-generated code - presumably Futhark
    compiler output; confirm before editing by hand.

    A first buffer of ``4 * width`` bytes is filled by ``map_kernel_898_var``
    and a second buffer of ``4 * width * height`` bytes is filled by
    ``map_kernel_839_var``, which also receives the first buffer.

    Returns ``(out_memsize, out_mem)``: the byte size and the cl.Buffer of
    the second buffer.
    """
    # scalar pre-computation on the host
    y_749 = sitofp_i32_f32(height_744)
    y_750 = sitofp_i32_f32(width_743)
    x_751 = fpow32(time_745, np.float32((1.5)))
    y_752 = (x_751 * np.float32((5.0e-3)))
    res_753 = (np.float32((1.0)) + y_752)
    res_754 = (np.float32((3.1415927)) / res_753)
    # number of work-groups needed to cover `width` items (rounded up)
    group_size_893 = self.group_size
    y_894 = (group_size_893 - np.int32(1))
    x_895 = (width_743 + y_894)
    num_groups_896 = squot32(x_895, group_size_893)
    # not used again in this function
    num_threads_897 = (num_groups_896 * group_size_893)
    bytes_911 = (np.int32(4) * width_743)
    # allocate at least 1 byte so a non-positive size never reaches cl.Buffer
    mem_912 = cl.Buffer(
        self.ctx, cl.mem_flags.READ_WRITE,
        np.long(
            np.long(bytes_911) if (
                bytes_911 > np.int32(0)) else np.int32(1)))
    # skip the launch entirely when the global size would be zero
    if ((np.int32(1) * (num_groups_896 * group_size_893)) != np.int32(0)):
        self.map_kernel_898_var.set_args(mem_912, np.float32(y_749),
                                         np.int32(width_743))
        cl.enqueue_nd_range_kernel(
            self.queue, self.map_kernel_898_var, (np.long(
                (num_groups_896 * group_size_893)), ),
            (np.long(group_size_893), ))
        # `synchronous` is a name defined outside this function
        if synchronous:
            self.queue.finish()
    # number of work-groups needed to cover `height * width` items
    nesting_size_833 = (height_744 * width_743)
    x_836 = (nesting_size_833 + y_894)
    num_groups_837 = squot32(x_836, group_size_893)
    # not used again in this function
    num_threads_838 = (num_groups_837 * group_size_893)
    bytes_913 = (bytes_911 * height_744)
    mem_915 = cl.Buffer(
        self.ctx, cl.mem_flags.READ_WRITE,
        np.long(
            np.long(bytes_913) if (
                bytes_913 > np.int32(0)) else np.int32(1)))
    if ((np.int32(1) * (num_groups_837 * group_size_893)) != np.int32(0)):
        self.map_kernel_839_var.set_args(mem_912, np.int32(height_744),
                                         np.float32(res_754),
                                         np.float32(y_750),
                                         np.int32(degree_746), mem_915,
                                         np.int32(width_743))
        cl.enqueue_nd_range_kernel(
            self.queue, self.map_kernel_839_var, (np.long(
                (num_groups_837 * group_size_893)), ),
            (np.long(group_size_893), ))
        if synchronous:
            self.queue.finish()
    out_mem_917 = mem_915
    out_memsize_918 = bytes_913
    return (out_memsize_918, out_mem_917)
def solve(self, puzzle, simulations = 16384, iterations = 35, workGroupSize = 128):
    """Run the ``montecarlo`` kernel and return the best solution found.

    ``puzzle`` is a mapping with at least ``'width'`` and ``'height'``.
    The run parameters are stored on ``self``, the kernel is launched with
    one work-item per simulation, and the solution with the smallest group
    length is converted into a list of
    ``{'X': col, 'Y': row, 'Size': s}`` dictionaries.
    """
    self.simulations = simulations
    self.iterations = iterations
    self.workGroupSize = workGroupSize
    self.workGroups = int(self.simulations / self.workGroupSize)
    self.width = np.int8(puzzle['width'])
    self.height = np.int8(puzzle['height'])

    # Initialise the device buffers for this puzzle.
    self.initBuffers(puzzle)

    # Create the kernel and bind its arguments.
    self.kernel = cl.Kernel(self.program, "montecarlo")
    self.kernel.set_args(
        self.lengthsBuffer,
        self.groupLengthsBuffer,
        self.puzzlesBuffer,
        self.solutionsBuffer,
        self.height,
        self.width,
        np.int32(self.iterations),
    )

    # Execute the program: one work-item per simulation.
    cl.enqueue_nd_range_kernel(
        self.queue, self.kernel, (self.simulations,), (self.workGroupSize,))

    # Map the group-lengths buffer and fetch it as a host array.
    cl.enqueue_map_buffer(
        self.queue, self.groupLengthsBuffer, cl.map_flags.WRITE, 0,
        self.groupLengths.shape, self.groupLengths.dtype)
    self.groupLengths = self.groupLengthsBuffer.get_host_array(
        self.groupLengths.shape, dtype=self.groupLengths.dtype)

    # Map the solutions buffer and fetch it as a host array.
    cl.enqueue_map_buffer(
        self.queue, self.solutionsBuffer, cl.map_flags.WRITE, 0,
        self.solutionsFlattened.shape, self.solutions.dtype)
    self.solutions = self.solutionsBuffer.get_host_array(
        self.solutions.shape, dtype=self.solutions.dtype)

    # Release the device buffers.
    self.lengthsBuffer.release()
    self.groupLengthsBuffer.release()
    self.puzzlesBuffer.release()
    self.solutionsBuffer.release()

    # Pick the simulation with the smallest group length; work on a copy.
    best_index = self.groupLengths.argmin()
    bestSolution = np.array(self.solutions[best_index])

    # Convert the grid into the list format used by the challenge.
    solution = []
    for row in range(0, puzzle['height']):
        for col in range(0, puzzle['width']):
            if bestSolution[row][col] == -1:
                continue
            s = bestSolution[row][col]
            solution.append({'X': int(col), 'Y': int(row), 'Size': int(s)})
            # Clear the s-by-s cells covered by this square.
            for d_row in range(0, s):
                for d_col in range(0, s):
                    bestSolution[row + d_row][col + d_col] = -1
    return solution
def change_display(image):
    """Run the ``add`` kernel from ``image`` into a GL-shared buffer.

    Uses the module-level ``ctx``, ``queue``, ``prog``, ``mf`` and ``buf``.
    The GL buffer is acquired before the kernel and released after it; the
    queue is finished and ``glFlush`` called before returning.
    """
    source = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=image)
    target = cl.GLBuffer(ctx, mf.WRITE_ONLY, numpy.float32(buf))
    shared = [target]

    cl.enqueue_acquire_gl_objects(queue, shared)
    kernel = prog.add
    kernel.set_args(source, target)
    # One work-item per element of the image.
    cl.enqueue_nd_range_kernel(queue, kernel, image.shape, None)
    cl.enqueue_release_gl_objects(queue, [target])

    queue.finish()
    glFlush()
def _exec_chunked_unsafe(self, chunksize=0):
    """Run the kernel chunk by chunk. Unsafe for kernels with local variables.

    When ``chunksize`` is positive the chunked execution is (re)prepared
    first. The leading variable's length is then covered in chunks of
    ``self._cnksz`` work-items; the last chunk is shortened to the remainder
    when the length is not a multiple of the chunk size. Before each launch
    the chunk index is stored on the solver object under ``self._cnk_name``.
    """
    if chunksize > 0:
        self._prep_chunked_exec(chunksize)

    total = self.leadingvar.length
    full_chunk = self._cnksz
    chunk_count = int(ceil(float(total) / float(full_chunk)))
    remainder = total % full_chunk
    solver = self._solverobj

    for index in range(chunk_count):
        is_last = index == (chunk_count - 1)
        if is_last and remainder != 0:
            work_items = remainder
        else:
            work_items = full_chunk
        solver.__setattr__(self._cnk_name, index)
        cl.enqueue_nd_range_kernel(solver.clqueue, self._clkernel,
                                   (work_items,), None)
        # Wait for this chunk before the chunk index is changed again.
        solver.clqueue.finish()
def updateEt_vanilla(self, algo="SHG"):
    """Update ``self.Et_cla`` from the signal field with the vanilla method.

    With ``self.useCL`` set, the inverse FFT plan is run and the sum kernel
    for the chosen algorithm (``"SD"``, anything else is treated as SHG)
    writes Et. For SD, Et is additionally replaced by ``-conj(Et)`` on the
    host. A normalisation kernel then runs.

    Without OpenCL kernels, the inverse FFT is taken along axis 1 with
    numpy, Et is the sum over axis 0 (its square root for SD), and it is
    divided by its largest magnitude.

    The elapsed time is logged at debug level.

    Fix: ``time.clock()`` was removed in Python 3.8; ``time.perf_counter``
    is used when available, falling back to ``time.clock`` on interpreters
    that lack it.
    """
    root.debug("Updating Et using vanilla algorithm")
    timer = getattr(time, "perf_counter", None) or time.clock
    t0 = timer()

    if self.useCL == True:
        # Inverse transform on the device; wait for all its events.
        events = self.Esig_t_tau_p_fft.enqueue(forward=False)
        for e in events:
            e.wait()

        if algo == "SD":
            krn = self.progs.progs["updateEtVanillaSumSD"].updateEtVanillaSumSD
            krn.set_scalar_arg_dtypes((None, None, np.int32))
            krn.set_args(self.Esig_t_tau_p_cla.data, self.Et_cla.data, self.N)
            ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
            ev.wait()

            # Negate and conjugate on the host, then upload again.
            Et = self.Et_cla.get()
            self.Et_cla.set(-np.conj(Et).astype(self.dtype_c).copy())
        else:
            krn = self.progs.progs["updateEtVanillaSumSHG"].updateEtVanillaSumSHG
            krn.set_scalar_arg_dtypes((None, None, np.int32))
            krn.set_args(self.Esig_t_tau_p_cla.data, self.Et_cla.data, self.N)
            ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
            ev.wait()

        # Normalisation runs as a single work-item.
        krn = self.progs.progs["updateEtVanillaNorm"].updateEtVanillaNorm
        krn.set_scalar_arg_dtypes((None, np.int32))
        krn.set_args(self.Et_cla.data, self.N)
        ev = cl.enqueue_nd_range_kernel(self.q, krn, [1], None)
        ev.wait()
    else:
        self.Esig_t_tau_p_cla.set(
            np.fft.ifft(self.Esig_w_tau_cla.get(), axis=1).astype(self.dtype_c).copy())
        Esig_t_tau_p = self.Esig_t_tau_p_cla.get()
        if algo == "SD":
            Et = np.sqrt(Esig_t_tau_p.sum(axis=0))
        else:
            Et = Esig_t_tau_p.sum(axis=0)
        # Scale so that the largest magnitude is 1.
        Et = Et / np.abs(Et).max()
        self.Et_cla.set(Et)

    root.debug("".join(("Time spent: ", str(timer() - t0))))
def test_algorithm(self):
    """Run the reedsolomon kernel(s) against the CSV test bench (Python 2 code).

    For every kernel name in ``self.kernelname``, each line of
    ``test_bench_rs_input.csv`` is one test vector and the matching line of
    ``test_bench_rs_output.csv`` is the reference. The input is copied to
    the device, the kernel is run as a single work-item, and the output
    bytes are compared with the reference bytes.

    Returns True when every test counted so far passed, otherwise False.
    ``passed`` and ``linecnt`` are not reset between kernels.

    NOTE(review): the source was flattened onto one line; the nesting of the
    summary print and the ``close()`` calls inside the per-kernel loop is
    reconstructed - confirm against the original file.
    """
    print "\n**************************"
    print "test_reedsolomon:"
    passed = 0
    linecnt = 1
    # opencl buffer uint
    self.inputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=48*4)
    # opencl buffer uint
    self.outputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=51*4)
    for k in self.kernelname:
        kernel = self.load_kernel(self.filename, k)
        self.fd_input = open('test_bench_rs_input.csv', 'r')
        self.fd_output = open('test_bench_rs_output.csv', 'r')
        for line in self.fd_input:
            # parse the CSV line as bytes, then reinterpret the bytes as uint32 words
            data_to_encode = numpy.fromstring(line, dtype=numpy.uint8, sep=",").tostring()
            data_to_encode = numpy.fromstring(data_to_encode, dtype=numpy.uint32)
            # host-side destination for the kernel output
            encoded_data = numpy.array(numpy.zeros(51), dtype=numpy.uint32)
            reference_data = numpy.fromstring(self.fd_output.readline(), dtype=numpy.uint8, sep=",")
            cl.enqueue_copy(self.queue, self.inputbuffer, data_to_encode).wait()
            kernel.set_args(self.inputbuffer, self.outputbuffer)
            cl.enqueue_nd_range_kernel(self.queue,kernel,(1,),None ).wait()
            cl.enqueue_copy(self.queue, encoded_data, self.outputbuffer).wait()
            if encoded_data.tostring() == reference_data.tostring():
                passed += 1
                print "Test %d PASSED" % linecnt
            else:
                print "Test %d FAILED" % linecnt
                print "input data:"
                print numpy.fromstring(data_to_encode.tostring(), dtype=numpy.uint8)
                print "encoded data:"
                print numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8)
                print "reference data:"
                print reference_data
                print "error data:"
                print (reference_data - numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))
            linecnt += 1
        # linecnt starts at 1, so linecnt-1 is the number of tests run
        print "%d pass out of %d" % (passed,(linecnt-1))
        self.fd_input.close()
        self.fd_output.close()
    if passed == (linecnt-1):
        print "All reedsolomon tests PASS\n"
        return True
    else:
        print "at least one reedsolomon test FAILED\n"
        return False
def calc_weights_gradient( self ):
    """
    Calculate gradient of weights.

    Call this only for processed layers: it uses the inputs array, which is
    valid only at processing time. Next layers that are not yet processed
    are handled recursively before this layer's kernels are enqueued.
    """
    for entry in self._next_layers:
        if entry[0].processed:
            continue
        entry[0].calc_weights_gradient()

    queue = self.opencl.queue
    local_size = self.opencl.max_local_size[0]

    # --- gradient of this layer's weights ---
    kernel = self.opencl.kernel_calc_layer_gradient
    for position, value in (
            (2, self._inputs_offset),
            (3, self._neurons_offset),
            (4, self._inputs_per_neuron),
            (5, self._weights_offset),
            (7, self._weights_count)):
        kernel.set_arg(position, value)
    # Local memory size in bytes (4 bytes per entry).
    kernel.set_arg(8, pyopencl.LocalMemory(
        int(4 * (self._inputs_per_neuron + 1
                 + local_size // self._inputs_per_neuron))))

    self._calc_gradient_event = pyopencl.enqueue_nd_range_kernel(
        queue, kernel,
        (int(self._weights_buf_size), ),
        (local_size, ),
        wait_for=self._calc_gradient_wait_for)

    # The dependencies were passed to the enqueue call; clear them in place.
    del self._calc_gradient_wait_for[:]

    # --- propagate errors to each previous layer ---
    kernel = self.opencl.kernel_propagate_errors
    kernel.set_arg(2, self._neurons_offset)
    kernel.set_arg(5, self._neuron_count)
    kernel.set_arg(7, self._inputs_per_neuron)

    # Running offset into this layer's weights, starting at 1.
    offset = numpy.int32(1)
    for entry in self._prev_layers:
        kernel.set_arg(3, entry[0]._neurons_offset + entry[1])
        kernel.set_arg(4, entry[2])
        kernel.set_arg(6, self._weights_offset + offset)
        event = pyopencl.enqueue_nd_range_kernel(
            queue, kernel,
            (int(entry[2] * 64), ),
            (64, ),
            wait_for=(self._calc_gradient_event, ))
        # The previous layer's gradient pass has to wait for this event.
        entry[0]._calc_gradient_wait_for.append(event)
        offset += entry[2]

    self._processed = True
def gpu_amend_values(queue, kernels, gpu_params, buffers, amendments):
    """
    Transfers requested amendments (after collision detection check) to the
    GPU, where a kernel applies them to the data.

    Returns a list with the kernel event, or an empty list when the packet
    holds no amendments.
    """
    intermediary_events = []
    packet = amendments.get_packet()

    if not (packet[amendments.amount_i] > 0):
        return intermediary_events

    # Upload the amendment count, indices and values.
    uploads = [
        cl.enqueue_copy(queue, buffers["global_amendments_n"],
                        packet[amendments.amount_i]),
        cl.enqueue_copy(queue, buffers["global_amendment_indices"],
                        packet[amendments.indices_i]),
        cl.enqueue_copy(queue, buffers["global_amendment_values"],
                        packet[amendments.values_i]),
    ]

    # Round the work-item count up to a whole number of groups.
    group_size = gpu_params["preferred_multiple"]
    group_count = np.ceil(amendments.amount / group_size)
    kernel_event = cl.enqueue_nd_range_kernel(
        queue,
        kernels["k_update_values"],
        (int(group_count * group_size),),
        (group_size,),
        global_work_offset=None,
        wait_for=uploads)
    intermediary_events.append(kernel_event)
    return intermediary_events
def enqueue(self, wait_for=None, profiling=False):
    """Enqueue the kernel with the stored sizes and return its event.

    When ``profiling`` is true the event is also appended to
    ``self._events_to_profile``.
    """
    event = cl.enqueue_nd_range_kernel(
        self.queue,
        self.kern,
        self.gsize,
        self.lsize,
        wait_for=wait_for,
    )
    if not profiling:
        return event
    self._events_to_profile.append(event)
    return event
def update_map(queue, kernels, intermediary_events):
    """
    Updates map. Updating includes:
    - "Dissoluting" pheromones: pheromone level is reduced regularly to
      simulate ageing.

    The kernel runs as a single work-item; its event is appended to
    intermediary_events.
    """
    map_kernel = kernels["k_update_map"]
    event = cl.enqueue_nd_range_kernel(queue, map_kernel, [1], [1])
    intermediary_events.append(event)
def applyIntensityData(self, I_w_tau=None):
    """Run the ``applyIntensityData`` kernel on the signal field.

    The kernel receives ``self.Esig_w_tau_cla``, ``self.I_w_tau_cla`` and
    ``self.N`` and is launched over ``self.Esig_w_tau.shape``. The call
    blocks until the kernel has finished and logs the elapsed time at debug
    level. ``I_w_tau`` is accepted but not used; the intensity data is taken
    from ``self.I_w_tau_cla``.

    Fix: ``time.clock()`` was removed in Python 3.8; ``time.perf_counter``
    is used when available, falling back to ``time.clock`` on interpreters
    that lack it.
    """
    root.debug("Applying intensity data from experiment")
    timer = getattr(time, "perf_counter", None) or time.clock
    t0 = timer()

    krn = self.progs.progs["applyIntensityData"].applyIntensityData
    # Two buffer arguments followed by one int32 scalar.
    krn.set_scalar_arg_dtypes((None, None, np.int32))
    krn.set_args(self.Esig_w_tau_cla.data, self.I_w_tau_cla.data, self.N)
    ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Esig_w_tau.shape, None)
    ev.wait()

    root.debug("".join(("Time spent: ", str(timer() - t0))))
def calc_chi2(self, queue, interspace, q, Iq, rind, rxyz, lind, lxyz, origin, voxelspacing, fifj, targetIq, sq, chi2):
    """Bind the arguments of the ``calc_chi2`` kernel and enqueue it.

    The launch uses work-groups of 16 items and a global size of
    ``max_compute_units * 16 * 512``. Returns the event of the enqueued
    kernel.
    """

    def _shape_and_size(array):
        # int32 vector: the array's shape followed by its element count.
        packed = np.zeros(4, dtype=np.int32)
        packed[:-1] = array.shape
        packed[-1] = array.size
        return packed

    kernel = self.kernels.calc_chi2

    workgroupsize = 16
    gws = (queue.device.max_compute_units * workgroupsize * 512,)
    lws = (workgroupsize,)

    # Local scratch: one 4-byte float per q value per work-item in a group.
    floatsize = 4
    tmpIq = cl.LocalMemory(floatsize * q.shape[0] * workgroupsize)

    shape = _shape_and_size(interspace)
    nq = np.int32(q.shape[0])
    nind1 = np.int32(rind.shape[0])
    nind2 = np.int32(lind.shape[0])
    fifj_shape = _shape_and_size(fifj)

    kernel.set_args(
        interspace.data, q.data, Iq.data, tmpIq,
        rind.data, rxyz.data, lind.data, lxyz.data,
        origin, voxelspacing,
        fifj.data, targetIq.data, sq.data, chi2.data,
        shape, nq, nind1, nind2, fifj_shape,
    )
    return cl.enqueue_nd_range_kernel(queue, kernel, gws, lws)
def gradZSD_gpu(self):
    """Run the ``gradZSD`` kernel over ``self.Et.shape`` and wait for it.

    The kernel receives the signal field, Et and dZ device buffers plus
    ``self.N`` as an int32 scalar.
    """
    root.debug("Calculating dZ for SD using gpu")

    kernel = self.progs.progs["gradZSD"].gradZSD
    # Three buffer arguments followed by one int32 scalar.
    kernel.set_scalar_arg_dtypes((None, None, None, np.int32))
    kernel.set_args(
        self.Esig_t_tau_p_cla.data,
        self.Et_cla.data,
        self.dZ_cla.data,
        self.N,
    )
    cl.enqueue_nd_range_kernel(self.q, kernel, self.Et.shape, None).wait()
def __call__(self, *args, **kwargs):
    """Bind the arguments and enqueue the elementwise kernel.

    Positional arguments are matched against ``self.arguments``; for vector
    arguments the array's ``data`` is passed to the kernel. The first vector
    argument supplies the default queue, the element count appended as the
    last kernel argument, and the launch sizes.

    Keyword arguments: ``queue`` and ``wait_for``.

    Raises:
        RuntimeError: if a vector argument is not contiguous.
        TypeError: if any other keyword argument is given.
    """
    vectors = []
    invocation_args = []
    for value, descriptor in zip(args, self.arguments):
        if not isinstance(descriptor, VectorArg):
            invocation_args.append(value)
            continue
        if not value.flags.forc:
            raise RuntimeError("ElementwiseKernel cannot "
                               "deal with non-contiguous arrays")
        vectors.append(value)
        invocation_args.append(value.data)

    queue = kwargs.pop("queue", None)
    wait_for = kwargs.pop("wait_for", None)
    if kwargs:
        raise TypeError("too many/unknown keyword arguments")

    # The first vector argument is the representative for sizes and queue.
    repr_vec = vectors[0]
    if queue is None:
        queue = repr_vec.queue
    invocation_args.append(repr_vec.mem_size)

    max_group_size = self.kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)
    gs, ls = repr_vec.get_sizes(queue, max_group_size)

    self.kernel.set_args(*invocation_args)
    return cl.enqueue_nd_range_kernel(queue, self.kernel, gs, ls,
                                      wait_for=wait_for)
def futhark_main(self, screenX_700, screenY_701, depth_702, xmin_703, ymin_704, xmax_705, ymax_706):
    """Allocate three device buffers and enqueue the two map kernels.

    The numbered names suggest machine-generated code - presumably Futhark
    compiler output; confirm before editing by hand. Uses the Python 2
    ``long`` builtin.

    Two buffers of ``4 * screenY`` bytes are filled by
    ``map_kernel_894_var``; a third buffer of ``4 * screenY * screenX``
    bytes is filled by ``map_kernel_846_var``, which also receives the
    first two.

    Returns ``(out_memsize, out_mem)``: the byte size and the cl.Buffer of
    the third buffer.
    """
    # extents of the requested window
    res_707 = (xmax_705 - xmin_703)
    res_708 = (ymax_706 - ymin_704)
    y_711 = sitofp_i32_f32(screenX_700)
    y_712 = sitofp_i32_f32(screenY_701)
    # true when depth is positive (slt32 is presumably signed less-than)
    x_713 = slt32(np.int32(0), depth_702)
    bytes_902 = (np.int32(4) * screenY_701)
    # allocate at least 1 byte so a non-positive size never reaches cl.Buffer
    mem_903 = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE,
                        long(long(bytes_902) if (bytes_902 > np.int32(0)) else np.int32(1)))
    mem_905 = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE,
                        long(long(bytes_902) if (bytes_902 > np.int32(0)) else np.int32(1)))
    # number of 512-item work-groups needed to cover screenY items (rounded up)
    group_size_911 = np.int32(512)
    num_groups_912 = squot32(((screenY_701 + group_size_911) - np.int32(1)),
                             group_size_911)
    # skip the launch entirely when the global size would be zero
    if ((np.int32(1) * (num_groups_912 * group_size_911)) != np.int32(0)):
        self.map_kernel_894_var.set_args(np.float32(ymin_704),
                                         np.float32(y_712),
                                         np.float32(res_708),
                                         np.int32(screenY_701), mem_903,
                                         mem_905)
        cl.enqueue_nd_range_kernel(self.queue, self.map_kernel_894_var,
                                   (long((num_groups_912 * group_size_911)),),
                                   (long(group_size_911),))
        # `synchronous` is a name defined outside this function
        if synchronous:
            self.queue.finish()
    # not used again in this function
    nesting_size_844 = (screenX_700 * screenY_701)
    bytes_906 = (bytes_902 * screenX_700)
    mem_908 = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE,
                        long(long(bytes_906) if (bytes_906 > np.int32(0)) else np.int32(1)))
    # number of 512-item work-groups needed to cover screenY * screenX items
    group_size_917 = np.int32(512)
    num_groups_918 = squot32((((screenY_701 * screenX_700) + group_size_917) - np.int32(1)),
                             group_size_917)
    if ((np.int32(1) * (num_groups_918 * group_size_917)) != np.int32(0)):
        self.map_kernel_846_var.set_args(np.int32(screenX_700),
                                         np.int32(screenY_701), mem_905,
                                         np.byte(x_713), np.int32(depth_702),
                                         np.float32(xmin_703), mem_903,
                                         np.float32(y_711),
                                         np.float32(res_707), mem_908)
        cl.enqueue_nd_range_kernel(self.queue, self.map_kernel_846_var,
                                   (long((num_groups_918 * group_size_917)),),
                                   (long(group_size_917),))
        if synchronous:
            self.queue.finish()
    out_mem_909 = mem_908
    out_memsize_910 = bytes_906
    return (out_memsize_910, out_mem_909)
def execute(self): global_work_size = [self.outputSignalWidth*self.outputSignalHeight] local_work_size = [1] if (debug==1): print global_work_size print local_work_size kernel = self.program.convolve if (debug==1): print kernel.context print kernel.function_name print kernel.num_args print kernel.program print kernel.reference_count # Vecchia modalita' di creare un evento kernel.set_arg(0,self.inputSignalBuffer) kernel.set_arg(1,self.maskBuffer) kernel.set_arg(2,self.outputSignalBuffer) kernel.set_arg(3,self.inputSignalWidth) kernel.set_arg(4,self.maskWidth) self.event =cl.enqueue_nd_range_kernel(self.queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=True) if (debug==1): wgi = cl.kernel_work_group_info for dev in self.ctx.devices: print "-------",dev,"-------" print kernel.get_work_group_info(wgi.WORK_GROUP_SIZE,dev ) print kernel.get_work_group_info(wgi.COMPILE_WORK_GROUP_SIZE,dev ) print kernel.get_work_group_info(wgi.LOCAL_MEM_SIZE,dev ) print kernel.get_work_group_info(wgi.PRIVATE_MEM_SIZE,dev ) print kernel.get_work_group_info(wgi.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,dev ) # Nuova modalita' di creare un evento self.event = kernel(self.queue,global_work_size,None, self.inputSignalBuffer,self.maskBuffer,self.outputSignalBuffer ,self.inputSignalWidth ,self.maskWidth) if (debug==1): print "context",self.event.context print "command_execution_status",self.event.command_execution_status print "command_queue",self.event.command_queue print "command_type",self.event.command_type print "reference_count",self.event.reference_count #print self.event.profile.end #print self.event.profile.queued #print self.event.profile.start #print self.event.profile.submit cl.enqueue_copy(self.queue, self.outputSignal, self.outputSignalBuffer) print self.inputSignal print self.mask print self.outputSignal
def minZerrKernSHG_gpu(self):
    """Compute the five coefficients of the polynomial in dZ on the device.

    Runs the ``minZerrSHG`` kernel over ``self.Et.shape`` and the ``normEsig``
    kernel over ``self.Esig_t_tau_p.shape``, waiting for each to finish.  The
    arrays ``X0_cla`` .. ``X4_cla`` are then summed on the device and each
    sum is divided by ``max(Esig_t_tau_norm_cla) * N * N``.

    Returns:
        A ``numpy`` array ``[X0, X1, X2, X3, X4]`` converted to ``np.double``.
    """
    queue = self.q
    n = self.N

    # Eight buffer arguments followed by one int32 scalar.
    err_kernel = self.progs.progs["minZerrSHG"].minZerrSHG
    err_kernel.set_scalar_arg_dtypes((None,) * 8 + (np.int32,))
    err_kernel.set_args(
        self.Esig_t_tau_p_cla.data,
        self.Et_cla.data,
        self.dZ_cla.data,
        self.X0_cla.data,
        self.X1_cla.data,
        self.X2_cla.data,
        self.X3_cla.data,
        self.X4_cla.data,
        n,
    )
    cl.enqueue_nd_range_kernel(queue, err_kernel, self.Et.shape, None).wait()

    norm_kernel = self.progs.progs["normEsig"].normEsig
    norm_kernel.set_scalar_arg_dtypes((None, None, np.int32))
    norm_kernel.set_args(self.Esig_t_tau_p_cla.data, self.Esig_t_tau_norm_cla.data, n)
    cl.enqueue_nd_range_kernel(queue, norm_kernel, self.Esig_t_tau_p.shape, None).wait()

    # Common divisor for every coefficient.
    mx = cla.max(self.Esig_t_tau_norm_cla).get() * n * n

    partial_arrays = (self.X0_cla, self.X1_cla, self.X2_cla, self.X3_cla, self.X4_cla)
    X0, X1, X2, X3, X4 = [
        cla.sum(partial, queue=queue).get() / mx for partial in partial_arrays
    ]

    root.debug("X0=" + str(X0) + ", type " + str(type(X0)))
    root.debug(
        "Poly: " + str(X4) + " x^4 + " + str(X3) + " x^3 + " + str(X2)
        + " x^2 + " + str(X1) + " x + " + str(X0)
    )

    # Polynomial in dZ (expansion of differential), lowest order first.
    coefficients = np.array([X0, X1, X2, X3, X4]).astype(np.double)

    root.debug("Esig_t_tau_p norm max: " + str(mx / (n * n)))
    return coefficients
def prepare_device_memory(queue, kernels, buffers, flock):
    """Enqueue the memory-initialisation kernel and the first flock upload.

    The ``k_init_memory`` kernel is enqueued with a global and local size of
    one work-item, followed by a copy of ``flock.np_arrays`` into the
    ``global_generated_flocks`` buffer.

    Returns:
        A two-element list: the object returned by the kernel enqueue,
        then the object returned by the copy enqueue.
    """
    print("Initializing the memory and transferring the first flock.")

    single_item = [1]
    init_event = cl.enqueue_nd_range_kernel(
        queue, kernels["k_init_memory"], single_item, [1]
    )
    upload_event = cl.enqueue_copy(
        queue, buffers["global_generated_flocks"], flock.np_arrays
    )
    return [init_event, upload_event]
def prefixSumDownInplace(self, e, data, ndata, events):
    """Run the ``prefixSumDownInplace`` kernel on ``data``, recursing on sums.

    The kernel receives ``data``, ``ndata`` and two freshly allocated
    buffers: ``psum_buf`` (one uint64 per grid cell) and ``npsum_buf`` (a
    single uint64).  The value read back from ``npsum_buf`` decides what
    happens next: when it exceeds 1 this method recurses on ``psum_buf``,
    otherwise the first element of ``psum_buf`` is read back.  Finally
    ``self.prefixSumUp`` is called for this level.

    Args:
        e: Event(s) the kernel launch must wait for, or ``None``.
        data: Either a ``cl.Buffer`` (used as is) or a host array that is
            copied into a new read/write buffer.
        ndata: Number of elements, passed to the kernel as ``uint64``.
        events: Sequence extended with ``+=`` by every event enqueued here.

    Returns:
        ``(e, data_buf, psum_buf, npsum_buf, ndata2)``, where ``e`` is the
        event tuple of the kernel launch (or the one returned by the
        recursive call) and ``ndata2`` is the value read back from
        ``psum_buf`` at the innermost level.
    """
    import numpy as np
    import pyopencl as cl

    mf = cl.mem_flags
    if isinstance(data, cl.Buffer):
        data_buf = data
    else:
        data_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=data)

    grid_dims = self.get_grid_dims(ndata)
    psumbytes = int(np.prod(grid_dims) * np.uint64(0).nbytes)
    npsumbytes = np.uint64(0).nbytes
    psum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, psumbytes)
    npsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, npsumbytes)

    kernel = self.prg.prefixSumDownInplace
    kernel.set_args(data_buf, np.uint64(ndata), psum_buf, npsum_buf)
    global_dims = self.get_global(grid_dims)
    print("prefixSumDownInplace %s %s %d %d" % (str(global_dims), str(self.localDims), ndata, psumbytes))

    # Always forward the caller's dependency.  The previous code only passed
    # ``wait_for`` when ``e`` was None, so a real upstream event was never
    # waited on.  ``wait_for=None`` is pyopencl's default.
    e = (
        cl.enqueue_nd_range_kernel(
            self.queue, kernel, global_dims, self.localDims, wait_for=e
        ),
    )
    events += e

    # Device-to-host enqueue_copy is blocking by default in pyopencl, so
    # ``npsum`` holds the device value as soon as the call returns.
    npsum = np.zeros(1, dtype=np.uint64)
    events += (cl.enqueue_copy(self.queue, npsum, npsum_buf, wait_for=e),)

    if npsum[0] > 1:
        (e, psum_buf, psum1_buf, npsum1_buf, ndata2) = self.prefixSumDownInplace(
            e, psum_buf, npsum.item(), events
        )
    else:
        ndata2 = np.zeros(1, dtype=np.uint64)
        events += (cl.enqueue_copy(self.queue, ndata2, psum_buf, wait_for=e),)
        ndata2 = ndata2.item()
    # Debug trace of the value read back here or returned by the recursion.
    print(ndata2)

    self.prefixSumUp(e, data_buf, ndata, psum_buf, npsum, events)
    return (e, data_buf, psum_buf, npsum_buf, ndata2)