def filter_primes(bit_array, offset): if (not len(self.primes)): return empty_bitarray() b = numpy.array(self.primes, dtype=numpy.uint32) a = empty_bitarray() c = numpy.array(offset, dtype=numpy.uint32) b_buf = cl.Buffer(self.ctx, self.mf.READ_ONLY | self.mf.COPY_HOST_PTR, hostbuf=b) a_buf = cl.Buffer(self.ctx, self.mf.READ_WRITE | self.mf.COPY_HOST_PTR, hostbuf=a) c_buf = cl.Buffer(self.ctx, self.mf.READ_ONLY | self.mf.COPY_HOST_PTR, hostbuf=c) # send integers and new bit mask to pfilter event2 = self.program.pfilter(self.queue, (self.block_size, ), None, b_buf, a_buf, c_buf) cl.enqueue_read_buffer(self.queue, a_buf, a) print 'Filter Duration:', 1e-9 * (event2.profile.end - event2.profile.start) return a
def test_that_python_args_fail(ctx_factory):
    """Check that bare Python scalars are rejected as kernel arguments.

    Two launches passing plain ``int``/``float`` values must raise
    ``cl.LogicError``; a third launch using ``np.float32``/``np.int32``
    is expected to go through, after which the buffer is read back.
    """
    context = ctx_factory()
    kernel_source = """
        __kernel void mult(__global float *a, float b, int c)
        { a[get_global_id(0)] *= (b+c); }
        """
    prg = cl.Program(context, kernel_source).build()

    a = np.random.rand(50000)
    queue = cl.CommandQueue(context)
    mf = cl.mem_flags
    a_buf = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

    knl = cl.Kernel(prg, "mult")

    # Plain ints via an explicit Kernel object.
    try:
        knl(queue, a.shape, None, a_buf, 2, 3)
    except cl.LogicError:
        pass
    else:
        assert False, "PyOpenCL should not accept bare Python types as arguments"

    # Plain float/int via attribute access on the program.
    try:
        prg.mult(queue, a.shape, None, a_buf, float(2), 3)
    except cl.LogicError:
        pass
    else:
        assert False, "PyOpenCL should not accept bare Python types as arguments"

    # Properly typed numpy scalars are the accepted form.
    prg.mult(queue, a.shape, None, a_buf, np.float32(2), np.int32(3))

    a_result = np.empty_like(a)
    cl.enqueue_read_buffer(queue, a_buf, a_result).wait()
def test_cl(): ctx = cl.create_some_context() # (interactive=False) # print 'ctx', ctx queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) f = open("part1.cl", "r") fstr = "".join(f.readlines()) program = cl.Program(ctx, fstr).build() mf = cl.mem_flags cameraPos = np.array([0, 6, -1, 0]) invView = la.inv(look_at((0, 6, -1), (0, 1, 1), (0, 1, 0))) invProj = la.inv(perspective(60, 1, 1, 1000)) print "view", invView print "proj", invProj viewParamsData = ( cameraPos.flatten().tolist() + np.transpose(invView).flatten().tolist() + np.transpose(invProj).flatten().tolist() ) # print 'vpd', viewParamsData viewParams = struct.pack("4f16f16f", *viewParamsData) viewParams_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=viewParams) num_pixels = 1000 * 1000 # setup opencl dest = np.ndarray((1000, 1000, 4), dtype=np.float32) dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, dest.nbytes) local_shape = (8, 8) # run kernel evt = program.part1(queue, (dest.shape[0], dest.shape[1]), None, viewParams_buf, dest_buf) # evt = program.part1(queue, dest.shape, None, dest_buf) cl.enqueue_read_buffer(queue, dest_buf, dest).wait() print "time", (evt.profile.end - evt.profile.start) * 0.000001, "ms" return dest
def search(self, midstate):
    """Run the fill and search kernels for ``midstate``; return the output.

    ``midstate`` is passed through ``flipendian32`` and handed to the
    ``sha512_fill`` kernel as eight 4-element slices.  After the fill kernel
    finishes, a zeroed ``OUTPUT_SIZE`` byte array is written to
    ``self.output_buf``, the ``ksearch`` kernel is run, and the buffer is
    read back.  The result is returned as ``str(bytearray)``.
    """
    words = flipendian32(midstate)

    # Arguments 0..7 are consecutive 4-element slices of the flipped state.
    for slot in xrange(8):
        first = slot * 4
        self.sha512_fill.set_arg(slot, words[first:first + 4])
    self.sha512_fill.set_arg(8, self.hashes_buf)
    self.sha512_fill.set_arg(9, self.keyhash_buf)

    cl.enqueue_nd_range_kernel(self.queue, self.sha512_fill,
                               (HASHES_NUM,), (self.sha512_fill_ws,))
    self.queue.finish()

    # Clear the device-side output before searching.
    result = bytearray(OUTPUT_SIZE)
    cl.enqueue_write_buffer(self.queue, self.output_buf, result)
    self.queue.finish()

    search_args = (self.hashes_buf, self.keyhash_buf, self.output_buf)
    for position, argument in enumerate(search_args):
        self.ksearch.set_arg(position, argument)
    cl.enqueue_nd_range_kernel(self.queue, self.ksearch,
                               (KEYS_NUM,), (self.ksearch_ws,))
    self.queue.finish()

    cl.enqueue_read_buffer(self.queue, self.output_buf, result)
    self.queue.finish()
    return str(result)
def test_gpu_aes(): import pyopencl as cl import numpy # Prepare context and command queue ctx = cl.create_some_context(interactive=False) queue = cl.CommandQueue(ctx) print "Compiling kernel ..." with open_cl("ralink.cl", "r") as fp: code = fp.read() % { 'STARTTIME': 0, 'MACADDR1': 0, 'MACADDR2': 0, 'NONCE1': 0, 'NONCE2': 0, 'NONCE3': 0, 'NONCE4': 0, 'KEYSTREAM1': 0, 'KEYSTREAM2': 0,} program = cl.Program(ctx, code).build(options="-I %s" % get_opencl_path()) # Prepare memory result = numpy.zeros(shape=(8), dtype=numpy.uint32) result[0] = 0xffffffff; result[1] = 0xffffffff; dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY| cl.mem_flags.COPY_HOST_PTR, hostbuf=result) # Run the program print "Running kernel ..." program.test_aes(queue, (1,), None, dest_buf) # Read the result cl.enqueue_read_buffer(queue, dest_buf, result).wait() print list2hex(result) assert result[0] == 0xD4E415CB assert result[1] == 0xD038A82B assert result[2] == 0x10A673DE assert result[3] == 0xEA25B206
def get_iterations(context, complex_values, iterations):
    """Run the ``mandelbrot`` kernel from ``kernel.cl`` over ``complex_values``.

    Args:
        context: OpenCL context used for the queue, buffers and program.
        complex_values: host array uploaded read-only; its shape is also
            used as the global work size and as the shape of the output.
        iterations: currently unused -- the corresponding kernel argument
            is commented out in the launch below.

    Returns:
        An array of ``clar.vec.ushort4`` with the shape of
        ``complex_values``, read back from the device.

    The module-level ``gradient`` array is uploaded as a third buffer.
    """
    command_queue = cl.CommandQueue(context)
    output_array = np.zeros(complex_values.shape, dtype=clar.vec.ushort4)

    flags = cl.mem_flags
    complex_values_buffer = cl.Buffer(
        context, flags.READ_ONLY | flags.COPY_HOST_PTR,
        hostbuf=complex_values)
    gradient_array_buffer = cl.Buffer(
        context, flags.READ_ONLY | flags.COPY_HOST_PTR,
        hostbuf=gradient)
    output_array_buffer = cl.Buffer(
        context, flags.WRITE_ONLY, output_array.nbytes)

    # Read the source inside a context manager: previously a failing build()
    # skipped the explicit close() and leaked the file handle.
    with open('kernel.cl', 'r') as kernel_file:
        kernel_source = kernel_file.read()
    program = cl.Program(context, kernel_source).build()

    program.mandelbrot(
        command_queue,
        complex_values.shape,
        None,  # Local work size not specified
        complex_values_buffer,
        output_array_buffer,
        gradient_array_buffer,
        # np.uint(iterations)
    )

    cl.enqueue_read_buffer(
        command_queue, output_array_buffer, output_array).wait()
    return output_array
def getData(self):
    """Run ``GetWorld`` on the active buffer and read the result into ``self.a``.

    ``self.a_buf`` is used as the kernel input while ``self.tickState`` is
    ``False``; otherwise ``self.b_buf`` is used.  In both cases the output
    buffer is read back into ``self.a``.
    """
    # Only the input buffer differs between the two tick states.
    source_buf = self.a_buf if self.tickState == False else self.b_buf

    self.kUtil.GetWorld(self.queue, self.a.shape, None,
                        self.ar_ySize, source_buf, self.dest_buf)
    cl.enqueue_read_buffer(self.queue, self.dest_buf, self.a).wait()
def getData(self, n, axis, data_D, data_H, name):
    """Read ``data_D`` from the device and plot one line of it.

    Args:
        n: index of the column (``axis == 'x'``) or row (``axis == 'y'``)
            of ``data_H`` to plot.
        axis: ``'x'`` to use ``lbm.X`` and ``data_H[:, n]``, ``'y'`` to use
            ``lbm.Y`` and ``data_H[n, :]``.
        data_D: device buffer to read from.
        data_H: host array the buffer is read into.
        name: stored on the instance as ``self.name``.

    Raises:
        ValueError: if ``axis`` is neither ``'x'`` nor ``'y'``.  Previously
            this case fell through and crashed with ``UnboundLocalError``.

    The selected data and the midpoints between consecutive node
    coordinates are stored on the instance before ``self.plotLine()`` is
    called.
    """
    cl.enqueue_read_buffer(lbm.queue, data_D, data_H).wait()

    # Pick the node coordinates and the matching slice of the host data.
    if axis == 'x':
        X = lbm.X
        y = data_H[:, n]
    elif axis == 'y':
        X = lbm.Y
        y = data_H[n, :]
    else:
        raise ValueError("axis must be 'x' or 'y', got %r" % (axis,))

    # Midpoint of every pair of neighbouring node coordinates.
    N = X.size - 1
    x = np.zeros((N))
    for i in range(1, X.size):
        x[i - 1] = (X[i] - X[i - 1]) / 2.0 + X[i - 1]

    self.x = x
    self.y = y
    self.n = n
    self.axis = axis
    self.data_D = data_D
    self.data_H = data_H
    self.name = name
    self.plotLine()
    return
def getData(self, data_D, data_H, name):
    """Read ``data_D`` into ``data_H`` and draw it as a surface.

    The surface is plotted over the midpoints between consecutive entries
    of ``lbm.X`` and ``lbm.Y``.  Axes and a scalar bar titled ``name`` are
    added.  The surface object and both data handles are stored on the
    instance.
    """
    # plotting
    fig = mlab.figure(size=(512, 512))
    cl.enqueue_read_buffer(lbm.queue, data_D, data_H).wait()

    # Midpoints between neighbouring node coordinates along each axis.
    Nx = lbm.X.size - 1
    Ny = lbm.Y.size - 1
    x = np.zeros((Nx))
    y = np.zeros((Ny))
    for left in range(Nx):
        x[left] = (lbm.X[left + 1] - lbm.X[left]) / 2.0 + lbm.X[left]
    for left in range(Ny):
        y[left] = (lbm.Y[left + 1] - lbm.Y[left]) / 2.0 + lbm.Y[left]

    surface = mlab.surf(x, y, data_H, warp_scale='auto', colormap="jet")
    mlab.axes(surface)
    mlab.scalarbar(surface, title=name)

    self.s = surface
    self.data_D = data_D
    self.data_H = data_H
def __call__(self, ctx, x1, y1, x2, y2, rx, ry, sw, sh, ez, ex, ey):
    """Run the ``subtended_angle2_naive`` kernel over ``x1``/``y1``.

    ``x1`` and ``y1`` are converted to ``float32`` arrays and uploaded; the
    remaining arguments are converted to ``float32`` scalars.  One work item
    is launched per element of ``x1``.

    Returns:
        A ``float32`` array shaped like ``x1`` holding the kernel output.
    """
    self.build(ctx)

    # Array inputs.
    x1 = np.array(x1, dtype=np.float32, copy=False)
    y1 = np.array(y1, dtype=np.float32, copy=False)

    # Scalar inputs, converted in a fixed order.
    x2 = np.float32(x2)
    y2 = np.float32(y2)
    ez = np.float32(ez)
    ex = np.float32(ex)
    ey = np.float32(ey)
    rx = np.float32(rx)
    ry = np.float32(ry)
    sw = np.float32(sw)
    sh = np.float32(sh)

    flags = cl.mem_flags
    upload = flags.READ_ONLY | flags.COPY_HOST_PTR
    x1_buf = cl.Buffer(self.ctx, upload, hostbuf=x1)
    y1_buf = cl.Buffer(self.ctx, upload, hostbuf=y1)
    out_buf = cl.Buffer(self.ctx, flags.WRITE_ONLY, x1.nbytes)

    queue = cl.CommandQueue(self.ctx)
    self.prg.subtended_angle2_naive(
        queue, x1.shape, None,
        x1_buf, y1_buf,
        x2, y2, rx, ry, sw, sh, ez, ex, ey,
        out_buf)

    out = np.empty_like(x1)
    cl.enqueue_read_buffer(queue, out_buf, out).wait()

    # Device buffers are released explicitly once the result is on the host.
    for device_buffer in (x1_buf, y1_buf, out_buf):
        device_buffer.release()
    return out
def clFindRoute(self, key, candidateTableList):
    """Try each candidate prefix length until the ``match`` kernel hits.

    ``self.pos`` is reset to -1 on host and device.  For every entry of
    ``candidateTableList`` the entry is assigned to ``key.prefixlen``, the
    ``match`` kernel is run against table ``entry - 1`` with
    ``int(key.network)``, and ``self.pos`` is read back.  The loop stops at
    the first entry for which ``self.pos`` is no longer -1.

    Returns:
        ``[last_entry_tried, self.pos]``.

    NOTE(review): with an empty ``candidateTableList`` the loop variable is
    never bound and the return statement fails -- confirm callers never
    pass an empty list.
    """
    timeKernelHash = 0
    self.pos = np.array(-1, dtype=np.int32)
    cl.enqueue_write_buffer(self.queue, self.pos_buf, self.pos)

    for prefix_len in candidateTableList:
        key.prefixlen = prefix_len
        ip = int(key.network)

        event = self.program.match(
            self.queue,
            self.tableShape[prefix_len - 1],
            None,
            self.table_buf[prefix_len - 1],
            self.pos_buf,
            np.int32(ip))
        event.wait()
        cl.enqueue_read_buffer(self.queue, self.pos_buf, self.pos)

        # Accumulated kernel time from the profiling counters.
        timeKernelHash += event.profile.end - event.profile.start

        if self.pos != -1:
            break

    # print("Measured Time kernel Hash (eventProfiler OpenCL function): {:5.8f}"
    #       .format(1e-9*timeKernelHash))
    return [prefix_len, self.pos]
def compute(self, floatimage, histogram, k):
    """Run the ``IIF`` kernel on an image and its histogram.

    Args:
        floatimage: image data; flattened to ``width * height`` ``float32``
            values before upload.
        histogram: array whose shape is unpacked as
            ``(width, height, nbins)``; flattened to ``float32`` before
            upload.
        k: scalar passed to the kernel as ``float32``.

    Returns:
        The kernel output reshaped to ``(width, height)`` as ``float64``.

    The three device buffers are kept on the instance as ``buf_image``,
    ``buf_histogram`` and ``output_buf``.
    """
    width, height, nbins = np.shape(histogram)
    numpixels = width * height

    image_linear = np.reshape(floatimage, (numpixels,)).astype(np.float32)
    histogram_linear = np.reshape(
        histogram, (np.size(histogram),)).astype(np.float32)
    transform = np.zeros_like(image_linear).astype(np.float32)

    mf = cl.mem_flags
    self.buf_image = cl.Buffer(
        self.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=image_linear)
    self.buf_histogram = cl.Buffer(
        self.context, mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=histogram_linear)
    self.output_buf = cl.Buffer(self.context, mf.READ_WRITE, transform.nbytes)

    kernel = self.program.IIF
    # Three leading scalars, followed by three buffer arguments.
    kernel.set_scalar_arg_dtypes([np.uintc, np.uintc, np.float32] + [None] * 3)
    kernel.set_arg(0, np.uintc(width))
    kernel.set_arg(1, np.uintc(height))
    kernel.set_arg(2, np.float32(k))
    kernel.set_arg(3, self.buf_image)
    kernel.set_arg(4, self.buf_histogram)
    kernel.set_arg(5, self.output_buf)

    cl.enqueue_nd_range_kernel(
        self.queue, kernel, image_linear.shape, None).wait()
    cl.enqueue_read_buffer(self.queue, self.output_buf, transform).wait()

    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; ``np.float64`` is the same dtype.
    return np.reshape(transform, (width, height)).astype(np.float64)
def compute(self, image, num_bins):
    """Run the ``iif_binid`` kernel on a 2-D image.

    The image is flattened to ``float32`` and uploaded; the kernel is
    launched with one work item per pixel and writes into a buffer of
    ``width * height * num_bins`` ``float32`` values.

    Returns:
        A ``float32`` array of shape ``(width, height, num_bins)``.

    The device buffers are kept on the instance as ``buf_image`` and
    ``output_buf``.
    """
    width, height = np.shape(image)
    pixel_count = width * height

    flat_image = np.reshape(image, (pixel_count,)).astype(np.float32)
    bins_host = np.zeros((pixel_count * num_bins,), dtype=np.float32)

    flags = cl.mem_flags
    self.buf_image = cl.Buffer(
        self.context, flags.READ_ONLY | flags.COPY_HOST_PTR,
        hostbuf=flat_image)
    self.output_buf = cl.Buffer(
        self.context, flags.READ_WRITE, bins_host.nbytes)

    kernel = self.program.iif_binid
    # Three leading scalars, followed by two buffer arguments.
    kernel.set_scalar_arg_dtypes([np.uintc, np.uintc, np.ubyte] + [None] * 2)
    arguments = (
        np.uintc(width),
        np.uintc(height),
        np.ubyte(num_bins),
        self.buf_image,
        self.output_buf,
    )
    for position, argument in enumerate(arguments):
        kernel.set_arg(position, argument)

    cl.enqueue_nd_range_kernel(
        self.queue, kernel, flat_image.shape, None).wait()
    cl.enqueue_read_buffer(self.queue, self.output_buf, bins_host).wait()

    shaped = np.reshape(bins_host, (width, height, num_bins))
    return shaped.astype(np.float32)
def FuseRGBD_GPU(self, Image, boneDQ, jointDQ):
    """
    Fuse one RGBD image into the TSDF volume on the GPU.

    :param Image: RGBD image; its ``depth_image`` is uploaded and its
        ``Size[0]``/``Size[1]`` are passed to the kernel as ``int32``
    :param boneDQ: written to ``self.boneDQGPU`` before the launch
        (described upstream as the bone's dual quaternion in the new frame)
    :param jointDQ: written to ``self.jointDQGPU`` before the launch
        (described upstream as the joint's dual quaternion in the new frame)
    :return: none; ``self.TSDF`` and ``self.Weight`` are refreshed from the
        device
    """
    gpu = self.GPUManager
    queue = gpu.queue

    # Upload the per-frame inputs.
    # cl.enqueue_write_buffer(queue, self.Pose_GPU, Tg)
    uploads = (
        (self.DepthGPU, Image.depth_image),
        (self.boneDQGPU, boneDQ),
        (self.jointDQGPU, jointDQ),
    )
    for device_buffer, host_data in uploads:
        cl.enqueue_write_buffer(queue, device_buffer, host_data)

    # Fuse the image into the TSDF volume; the launch grid covers the first
    # two entries of self.Size.
    fuse_program = gpu.programs['FuseTSDF']
    fuse_program.FuseTSDF(
        queue, (self.Size[0], self.Size[1]), None,
        self.TSDFGPU, self.DepthGPU, self.Param, self.Size_Volume,
        self.Pose_GPU, self.boneDQGPU, self.jointDQGPU, self.planeF,
        self.Calib_GPU,
        np.int32(Image.Size[0]), np.int32(Image.Size[1]),
        self.WeightGPU)

    # Bring the updated volume back into the CPU array.
    cl.enqueue_read_buffer(queue, self.TSDFGPU, self.TSDF).wait()

    # TEST if TSDF contains NaN
    # TSDFNaN = np.count_nonzero(np.isnan(self.TSDF))
    # print "TSDFNaN : %d" %(TSDFNaN)

    cl.enqueue_read_buffer(queue, self.WeightGPU, self.Weight).wait()
def do_opencl_pow(hash, target):
    """Search for a non-zero ``kernel_sha512`` result on the first enabled GPU.

    Returns the zero-initialised output value immediately when
    ``enabledGpus`` is empty.  Otherwise the kernel is launched in batches
    of ``worksize * 2000`` work items, with the running offset passed as a
    little-endian 64-bit argument, until the output becomes non-zero or
    ``shutdown`` is set.

    Raises:
        Exception: ``"Interrupted"`` when ``shutdown`` is non-zero after
            the loop.
    """
    output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
    if not len(enabledGpus):
        return output[0][0]

    # Kernel input: the hex-decoded hash behind 16 zero hex digits, plus
    # the target.
    data = numpy.zeros(1, dtype=hash_dt, order='C')
    data[0]['v'] = ("0000000000000000" + hash).decode("hex")
    data[0]['target'] = target

    flags = cl.mem_flags
    hash_buf = cl.Buffer(ctx, flags.READ_ONLY | flags.COPY_HOST_PTR,
                         hostbuf=data)
    dest_buf = cl.Buffer(ctx, flags.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, enabledGpus[0])

    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    # Timing values only feed the commented-out debug logging below.
    start = time.time()
    progress = 0
    globamt = worksize * 2000

    while True:
        if output[0][0] != 0 or shutdown != 0:
            break
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        cl.enqueue_read_buffer(queue, dest_buf, output)
        queue.finish()
        progress += globamt
        sofar = time.time() - start
        # logger.debug("Working for %.3fs, %.2f Mh/s", sofar, (progress / sofar) / 1000000)

    if shutdown != 0:
        raise Exception ("Interrupted")
    taken = time.time() - start
    # logger.debug("Took %d tries.", progress)
    return output[0][0]
def execute(self, params): ''' This handles the actual execution for the processing, which would get executed on each request - this is where we care about the performance ''' timing.timings.start('load') self.load_program(params) timing.timings.stop('load') finish = timing.timings.timings['load']['timings'][-1] print '<<< Loaded program in %s ms' % (finish) timing.timings.start('execute') # Start the program self.program.worker(self.queue, self.data['income'].shape, None, self.income_buf, self.capGains_buf, self.dividendsInterest_buf, self.children_buf, self.dest_buf, ) # Get an empty numpy array in the shape of the original data result = numpy.empty_like(self.data['income']) #Wait for result cl.enqueue_read_buffer(self.queue, self.dest_buf, result).wait() #show timing info timing.timings.stop('execute') finish = timing.timings.timings['execute']['timings'][-1] print '<<< Executed in %s ms' % (finish) return result
def randomfill(self):
    """Run the ``randomfill`` kernel and replace ``self.nff`` with its output.

    Both images in ``self.img`` are uploaded read-only, ``self.nff`` is
    uploaded as the output buffer, and the kernel is launched over
    ``self.effectiveSize``.  The elapsed time is added to
    ``self.times["randomfill"]``.
    """
    started = getTime()
    flags = cl.mem_flags

    self.inputBuf = [
        cl.Buffer(self.ctx, flags.READ_ONLY | flags.COPY_HOST_PTR,
                  hostbuf=self.img[index])
        for index in (0, 1)
    ]
    self.outputBuf = cl.Buffer(
        self.ctx, flags.WRITE_ONLY | flags.COPY_HOST_PTR, hostbuf=self.nff)

    patch_height = numpy.int32(self.patchSize[0])
    patch_width = numpy.int32(self.patchSize[1])
    height = numpy.int32(self.size[0])
    width = numpy.int32(self.size[1])

    self.program.randomfill(
        self.queue, self.effectiveSize, None,
        patch_height, patch_width, height, width,
        self.inputBuf[0], self.inputBuf[1], self.outputBuf)

    fetched = numpy.empty_like(self.nff)
    cl.enqueue_read_buffer(self.queue, self.outputBuf, fetched).wait()
    self.nff = numpy.copy(fetched)

    self.times["randomfill"] += getTime() - started
def do_opencl_pow(hash, target): output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)]) if (ctx == False): return output[0][0] data = numpy.zeros(1, dtype=hash_dt, order='C') data[0]['v'] = ("0000000000000000" + hash).decode("hex") data[0]['target'] = target hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data) dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes) kernel = program.kernel_sha512 worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, gpus[0]) kernel.set_arg(0, hash_buf) kernel.set_arg(1, dest_buf) start = time.time() progress = 0 globamt = worksize*2000 while output[0][0] == 0: kernel.set_arg(2, pack("<Q", progress)) cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,)) cl.enqueue_read_buffer(queue, dest_buf, output) queue.finish() progress += globamt sofar = time.time() - start print sofar, progress / sofar, "hashes/sec" taken = time.time() - start print progress, taken return output[0][0]
def execute(self, *args, **kwargs):
    """Load the inputs, run the ``program__`` kernel and return its output.

    All arguments are forwarded to ``self.load_data``.  The returned array
    has the shape and dtype of ``self.a``.
    """
    self.load_data(*args, **kwargs)

    kernel = self.program.program__
    kernel(self.queue, self.a.shape, None,
           self.a_buf, self.b_buf, self.dest_buf)

    output = np.empty_like(self.a)
    cl.enqueue_read_buffer(self.queue, self.dest_buf, output).wait()
    return output
def map_function(data):
    """Add ``data[0]`` and ``data[1]`` element-wise with an OpenCL kernel.

    The platform and device indices are taken from the two
    whitespace-separated tokens printed by
    ``../bin/get-host-platform-device.sh``.  Both inputs are converted to
    ``float32``; the kernel is launched over 5 work items.

    Returns:
        A ``float32`` array of length 5.
    """
    proc = subprocess.Popen(["../bin/get-host-platform-device.sh"],
                            stdout=subprocess.PIPE, shell=True)
    proc_out, err = proc.communicate()
    platform_index, device_index = proc_out.split()

    KERNEL_CODE = """
    __kernel void ArraySum(__global float *A,__global float *B,__global float *C){
        int i = get_global_id(0);
        C[i] = A[i]+B[i];
    }
    """

    platform = cl.get_platforms()[int(platform_index)]
    cl_device = platform.get_devices()[int(device_index)]
    ctx = cl.Context([cl_device])
    queue = cl.CommandQueue(ctx)
    kernel = cl.Program(ctx, KERNEL_CODE).build().ArraySum

    mf = cl.mem_flags
    upload = mf.READ_ONLY | mf.COPY_HOST_PTR

    # Host copies first, then one read-only device buffer per input.
    np_data = [np.array(data[index]).astype(np.float32) for index in (0, 1)]
    data_buf = [cl.Buffer(ctx, upload, hostbuf=host) for host in np_data]

    result = np.zeros((5,)).astype(np.float32)
    result_buf = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes)

    kernel(queue, (5,), None, data_buf[0], data_buf[1], result_buf)
    cl.enqueue_read_buffer(queue, result_buf, result).wait()
    return result
def transform_uint32(self, data_np, flip_x=False, flip_y=False,
                     swap_xy=False, out=None):
    """Run the ``image_transform_uint32`` kernel on ``data_np``.

    Args:
        data_np: input array; its first two dimensions are taken as
            ``(height, width)``.
        flip_x, flip_y, swap_xy: passed to the kernel as ``int32`` flags.
        out: optional host array to read the result into.  When ``None``,
            a new array is allocated whose first two dimensions are
            exchanged if ``swap_xy`` is set.

    Returns:
        The array the result was read into.
    """
    height, width = data_np.shape[:2]

    # Output geometry: the leading two axes trade places under swap_xy.
    if swap_xy:
        new_ht, new_wd = width, height
    else:
        new_ht, new_wd = height, width
    new_size = [new_ht, new_wd] + list(data_np.shape[2:])

    flags = cl.mem_flags

    # create OpenCL buffers on devices
    data_np = np.ascontiguousarray(data_np)
    src_buf = cl.Buffer(self.ctx, flags.READ_ONLY | flags.COPY_HOST_PTR,
                        hostbuf=data_np)
    dst_buf = cl.Buffer(self.ctx, flags.WRITE_ONLY, data_np.nbytes)

    self.program.image_transform_uint32(
        self.queue, [height, width], None,
        src_buf, dst_buf,
        np.int32(width), np.int32(height),
        np.int32(flip_x), np.int32(flip_y), np.int32(swap_xy))

    if out is None:
        out = np.empty_like(data_np).reshape(new_size)

    cl.enqueue_read_buffer(self.queue, dst_buf, out).wait()
    return out
def do_opencl_pow(hash, target): output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)]) if (ctx == False): return output[0][0] data = numpy.zeros(1, dtype=hash_dt, order='C') data[0]['v'] = ("0000000000000000" + hash).decode("hex") data[0]['target'] = target hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data) dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes) kernel = program.kernel_sha512 worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, cl.get_platforms()[0].get_devices()[1]) kernel.set_arg(0, hash_buf) kernel.set_arg(1, dest_buf) start = time.time() progress = 0 globamt = worksize*2000 while output[0][0] == 0: kernel.set_arg(2, pack("<Q", progress)) cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,)) cl.enqueue_read_buffer(queue, dest_buf, output) queue.finish() progress += globamt sofar = time.time() - start print sofar, progress / sofar, "hashes/sec" taken = time.time() - start print progress, taken return output[0][0]
def exchange_boundary_h(s):
    """Pass field components 4 and 5 from each GPU to the next one.

    For every GPU except the last, components 4 and 5 are read into that
    GPU's pair of temporary host arrays at the GPU's offset.  Each of those
    pairs is then written to the same components of the following GPU.
    """
    # (field component index, temporary array slot)
    transfers = ((4, 0),   # hy_gpu
                 (5, 1))   # hz_gpu

    senders = list(zip(s.queues, s.eh_fields_gpus, s.tmpfs, s.offsets))[:-1]
    for queue, eh_fields, tmpf, offset in senders:
        for component, slot in transfers:
            cl.enqueue_read_buffer(queue, eh_fields[component], tmpf[slot],
                                   offset)

    receivers = zip(s.queues[1:], s.eh_fields_gpus[1:], s.tmpfs[:-1])
    for queue, eh_fields, tmpf in receivers:
        for component, slot in transfers:
            cl.enqueue_write_buffer(queue, eh_fields[component], tmpf[slot])
def execute(self): """ This handles the actual execution for the processing, which would get executed on each request - this is where we care about the performance """ timing.timings.start("execute") # Start the program self.program.worker(self.queue, self.data1.shape, None, self.data1_buf, self.data2_buf, self.dest_buf) # Get an empty numpy array in the shape of the original data result = numpy.empty_like(self.data1) # Wait for result cl.enqueue_read_buffer(self.queue, self.dest_buf, result).wait() # show timing info timing.timings.stop("execute") finish = timing.timings.timings["execute"]["timings"][-1] print "<<< DONE in %s" % (finish) # Open data file to append to data_file = open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../data.csv"), "a") data_file.write("PyOpenCl %s,%s,%s,%s\n" % (process_type, finish, num_records, num_calculations)) data_file.close()
def update(self, sub_pos, angle, min_dist, max_dist, width, in_weight, out_weight):
    '''
    Apply one piece of evidence to the probability grid and renormalise.

    sub_pos is unpacked as (north, east).  The ``evidence`` kernel receives
    those two values, ``angle`` converted to radians, the squares of
    ``min_dist`` and ``max_dist``, ``width``, ``in_weight`` and
    ``out_weight``, all as float32.

    As described by the original author: the sub is at sub_pos, the target
    is seen at absolute heading ``angle`` and is most likely between
    min_dist and max_dist away.  in_weight is
    P(this measurement | buoy at point p) for p inside the region, and
    out_weight is the same for points outside it.

    Afterwards the probabilities are read back, divided in place by their
    sum and written to the device again.
    '''
    north, east = sub_pos

    scalar_args = [
        float32(north),
        float32(east),
        float32(radians(angle)),
        float32(min_dist**2),
        float32(max_dist**2),
        float32(width),
        float32(in_weight),
        float32(out_weight),
    ]
    cl_program.evidence(cl_queue, self.norths.shape, None,
                        self.norths_buf, self.easts_buf, self.prob_buf,
                        *scalar_args)

    # TODO ?
    cl.enqueue_read_buffer(cl_queue, self.prob_buf, self.probabilities).wait()

    # Normalize (in place, so the array object on self stays the same)
    self.probabilities /= numpy.sum(self.probabilities)

    cl.enqueue_write_buffer(cl_queue, self.prob_buf, self.probabilities)
def getMultipleRows(self, rowbase, rowlimit):  #{{{
    """Compute Tanimoto rows ``rowbase:rowlimit`` and return them.

    Per the original description, each row compares every SMILES string in
    the query set against the reference SMILES string with that row index.
    The block is stored in ``self.resultmatrix`` with rows padded to a
    multiple of 16 floats; only the first ``self.nquery`` columns are
    returned.

    This method is synchronous: it waits for the device read to finish
    before returning.

    Raises:
        ValueError: if ``rowbase`` is negative or ``rowlimit`` exceeds
            ``self.nref``.  Previously this was a bare ``raise`` with no
            active exception, which failed with an unrelated error.
    """
    if rowbase < 0 or rowlimit > self.nref:
        raise ValueError(
            "row range %r:%r is outside the reference set (nref=%r)"
            % (rowbase, rowlimit, self.nref))

    # Pad rows out to 64 byte pitch (16 float32 values); floor division is
    # spelled explicitly so the pitch stays an integer.
    rowpitchInFloat = 16 * ((self.nquery + 15) // 16)

    # Using pagelocked memory and async copy seems to actually slow us down
    # on large tiled calculations
    self.resultmatrix = numpy.empty((rowlimit - rowbase, rowpitchInFloat),
                                    dtype=numpy.float32)
    self.gpu.gpumatrix = cl.Buffer(self.gpu.context,
                                   cl.mem_flags.WRITE_ONLY,
                                   size=self.resultmatrix.nbytes)

    # With precalculated magnitudes: two local scratch areas, each sized
    # from the largest entry of rlengths in this block.
    lmem_bytes = int(2 * 4 * max(self.rlengths[rowbase:rowlimit]))
    threads_per_block = 192

    # One work group of threads_per_block items per requested row.
    self.gpu.multiRowKernel(
        self.gpu.queue, (threads_per_block * (rowlimit - rowbase),),
        self.gpu.rsmiles, self.gpu.rcounts, self.gpu.rl_gpu,
        self.gpu.rmag_gpu, self.refPitchInInt,
        self.gpu.qsmiles, self.gpu.qcounts, self.gpu.ql_gpu,
        self.gpu.qmag_gpu, self.qPitchTInInt,
        self.gpu.gpumatrix, numpy.int32(rowpitchInFloat),
        numpy.int32(self.qshape[0]), numpy.int32(self.qshape[1]),
        numpy.int32(rowbase),
        cl.LocalMemory(lmem_bytes), cl.LocalMemory(lmem_bytes),
        local_size=(threads_per_block,))

    cl.enqueue_read_buffer(self.gpu.queue, self.gpu.gpumatrix,
                           self.resultmatrix).wait()

    # Drop the padding columns.
    return self.resultmatrix[:, 0:self.nquery]
def FuseRGBD_GPU(self, Image, Pose):
    """
    Fuse one RGBD image into the TSDF volume on the GPU.

    :param Image: RGBD image; its ``depth_image`` is uploaded and its
        ``Size[0]``/``Size[1]`` are passed to the kernel as ``int32``
    :param Pose: written to ``self.Pose_GPU`` before the launch (described
        upstream as the transform from the first to the last camera pose)
    :return: none; ``self.TSDF`` and ``self.Weight`` are refreshed from the
        device
    """
    manager = self.GPUManager

    # Upload the per-frame inputs.
    cl.enqueue_write_buffer(manager.queue, self.Pose_GPU, Pose)
    cl.enqueue_write_buffer(manager.queue, self.DepthGPU, Image.depth_image)

    # Fuse the image into the TSDF volume; the launch grid covers the first
    # two entries of self.Size.
    launch_grid = (self.Size[0], self.Size[1])
    kernel_args = (
        self.TSDFGPU, self.DepthGPU, self.Param, self.Size_Volume,
        self.Pose_GPU, self.Calib_GPU,
        np.int32(Image.Size[0]), np.int32(Image.Size[1]),
        self.WeightGPU,
    )
    manager.programs['FuseTSDF'].FuseTSDF(
        manager.queue, launch_grid, None, *kernel_args)

    # Bring the updated volume back into the CPU array.
    cl.enqueue_read_buffer(manager.queue, self.TSDFGPU, self.TSDF).wait()

    # TEST if TSDF contains NaN
    # TSDFNaN = np.count_nonzero(np.isnan(self.TSDF))
    # print "TSDFNaN : %d" %(TSDFNaN)

    cl.enqueue_read_buffer(manager.queue, self.WeightGPU, self.Weight).wait()
def test_that_python_args_fail(ctx_factory):
    """Check that bare Python scalars are rejected as kernel arguments.

    Each launch in ``rejected_launches`` passes plain Python numbers and
    must raise ``cl.LogicError``.  A final launch with numpy-typed scalars
    is expected to be accepted, after which the buffer is read back.
    """
    context = ctx_factory()

    prg = cl.Program(
        context,
        """
        __kernel void mult(__global float *a, float b, int c)
        { a[get_global_id(0)] *= (b+c); }
        """).build()

    a = np.random.rand(50000)
    queue = cl.CommandQueue(context)
    mf = cl.mem_flags
    a_buf = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

    knl = cl.Kernel(prg, "mult")

    # Launches that must be refused, in the order they are attempted.
    rejected_launches = (
        lambda: knl(queue, a.shape, None, a_buf, 2, 3),
        lambda: prg.mult(queue, a.shape, None, a_buf, float(2), 3),
    )
    for launch in rejected_launches:
        try:
            launch()
            assert False, "PyOpenCL should not accept bare Python types as arguments"
        except cl.LogicError:
            pass

    prg.mult(queue, a.shape, None, a_buf, np.float32(2), np.int32(3))

    a_result = np.empty_like(a)
    cl.enqueue_read_buffer(queue, a_buf, a_result).wait()
def transform(self): """Realizes the calculus""" # Prepare the input and output memory mf = cl.mem_flags msg = np.char.array(self.Word_buffer) len_array = np.array(self.len).astype(np.int32) Hexdigest_array = np.char.array(['']*41*(len(len_array))) print msg print msg.nbytes print len_array print len_array.nbytes print Hexdigest_array print Hexdigest_array.nbytes # Allocate device memory msg_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, msg.nbytes, msg) len_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, len_array.nbytes, len_array) Hexdigest_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, Hexdigest_array.nbytes) # Start OpenCL operation and wait for it to finish time1 = datetime.datetime.now() self.prg.sha1(self.queue, (len(len_array),), msg_buf, len_buf, Hexdigest_buf) cl.enqueue_read_buffer(self.queue, Hexdigest_buf, Hexdigest_array).wait() time2 = datetime.datetime.now() print "Execution time OpenCL sha1: " + repr((time2 - time1).microseconds/1000) + "ms" # Convert the result into strings for j in range(0,len(Hexdigest_array)/41): self.result.append(''.join(Hexdigest_array[j*41 + 0:j*41 + 41]))
def dump_batch(self):
    """Push the pending batch through the ``increment`` kernel into ``self.M``.

    The keys (as ``'S32'``) and counts (as ``int32``) of ``self.batch`` are
    uploaded together with ``self.rand``.  The kernel is launched over a
    ``(len(keys), self.d)`` grid and its ``(self.d, self.w)`` ``int32``
    output is added to ``self.M``.  The batch is then cleared.
    """
    keys = np.array(self.batch.keys(), dtype='S32')
    counts = np.array(self.batch.values(), dtype=np.int32)
    out = np.zeros([self.d, self.w], dtype=np.int32)

    flags = cl.mem_flags
    upload = flags.READ_ONLY | flags.COPY_HOST_PTR

    # Input buffers
    rand_buf = cl.Buffer(self.ctx, upload, hostbuf=self.rand)
    keys_buf = cl.Buffer(self.ctx, upload, hostbuf=keys)
    counts_buf = cl.Buffer(self.ctx, upload, hostbuf=counts)

    # Output buffer
    out_buf = cl.Buffer(self.ctx, flags.WRITE_ONLY, out.nbytes)

    # Launch the kernel and block until it has completed.
    grid = (len(keys), self.d)
    launch = self.bld.increment(self.queue, grid, None,
                                rand_buf, keys_buf, counts_buf, out_buf)
    launch.wait()

    cl.enqueue_read_buffer(self.queue, out_buf, out).wait()

    self.M += out
    self.batch.clear()
def execute(self): self.program.part1(self.queue, self.a.shape, None, self.a_buf, self.b_buf, self.dest_buf) c = numpy.array(range(10), dtype=numpy.uint32) cl.enqueue_read_buffer(self.queue, self.dest_buf, c).wait() print "a", self.a print "b", self.b print "c", c
def execute(self):
    '''
    Run one ``propagate`` pass of patchMatch and refresh ``self.nff``.

    Both images in ``self.img`` are uploaded read-only and ``self.nff`` is
    uploaded as a read-write buffer.  The kernel also receives the patch
    size, the image size and ``self.iteration`` as int32 values.  The
    elapsed time is added to ``self.times["execute"]``.
    '''
    started = getTime()
    flags = cl.mem_flags

    self.inputBuf = [
        cl.Buffer(self.ctx, flags.READ_ONLY | flags.COPY_HOST_PTR,
                  hostbuf=self.img[index])
        for index in (0, 1)
    ]
    self.outputBuf = cl.Buffer(
        self.ctx, flags.READ_WRITE | flags.COPY_HOST_PTR, hostbuf=self.nff)

    scalars = (
        numpy.int32(self.patchSize[0]),  # patchHeight
        numpy.int32(self.patchSize[1]),  # patchWidth
        numpy.int32(self.size[0]),       # height
        numpy.int32(self.size[1]),       # width
        numpy.int32(self.iteration),
    )
    self.program.propagate(
        self.queue, self.effectiveSize, None,
        scalars[0], scalars[1], scalars[2], scalars[3], scalars[4],
        self.inputBuf[0], self.inputBuf[1], self.outputBuf)

    fetched = numpy.empty_like(self.nff)
    cl.enqueue_read_buffer(self.queue, self.outputBuf, fetched).wait()
    self.nff = numpy.copy(fetched)

    self.times["execute"] += getTime() - started
def gpu_array_sum(a, b):
    """Add ``a`` and ``b`` element-wise on an OpenCL device.

    The kernel repeats the assignment ``c[i] = a[i] + b[i]`` 1000 times per
    work item.  The profiled kernel time and the wall time from launch to
    the end of the read-back are printed.

    Returns:
        An array shaped like ``a`` holding the data read from the device.
    """
    context = cl.create_some_context()
    # Profiling has to be enabled on the queue for event timings to exist.
    queue = cl.CommandQueue(
        context, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Two uploaded inputs and one output buffer sized from b.
    flags = cl.mem_flags
    upload = flags.READ_ONLY | flags.COPY_HOST_PTR
    a_buffer = cl.Buffer(context, upload, hostbuf=a)
    b_buffer = cl.Buffer(context, upload, hostbuf=b)
    c_buffer = cl.Buffer(context, flags.WRITE_ONLY, b.nbytes)

    kernel_source = """
    __kernel void sum(__global const float *a, __global const float *b, __global float *c)
    {
        int i = get_global_id(0);
        int j;
        for(j = 0; j < 1000; j++)
        {
            c[i] = a[i] + b[i];
        }
    }"""
    program = cl.Program(context, kernel_source).build()

    gpu_start_time = time()

    # Launch the kernel and wait for it before reading the profile.
    event = program.sum(queue, a.shape, None, a_buffer, b_buffer, c_buffer)
    event.wait()
    elapsed = 1e-9 * (event.profile.end - event.profile.start)
    print("GPU Kernel Time: {0} s".format(elapsed))

    # Copy the result back into a host array shaped like a.
    c_gpu = np.empty_like(a)
    cl.enqueue_read_buffer(queue, c_buffer, c_gpu).wait()

    gpu_end_time = time()
    print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))
    return c_gpu
def gpu_array_sum(a, b):
    """Add ``a`` and ``b`` element-wise on the first OpenCL device found.

    The first device of the first platform is used.  The kernel repeats the
    assignment ``c[i] = a[i] + b[i]`` 10000 times per work item.  The
    profiled kernel time and the wall time from launch to the end of the
    read-back are printed.

    Returns:
        An array shaped like ``a`` holding the data read from the device.
    """
    platform = cl.get_platforms()[0]
    device = platform.get_devices()[0]
    context = cl.Context([device])

    # Profiling has to be enabled on the queue for event timings to exist.
    profiling = cl.command_queue_properties.PROFILING_ENABLE
    queue = cl.CommandQueue(context, properties=profiling)

    # Two uploaded inputs and one output buffer sized from b.
    a_buffer = cl.Buffer(
        context,
        cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=a)
    b_buffer = cl.Buffer(
        context,
        cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=b)
    c_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, b.nbytes)

    program = cl.Program(context, """
    __kernel void sum(__global const float *a, __global const float *b, __global float *c)
    {
        int i = get_global_id(0);
        int j;
        for(j = 0; j < 10000; j++)
        {
            c[i] = a[i] + b[i];
        }
    }""").build()

    gpu_start_time = time()

    # Launch the kernel and wait for it before reading the profile.
    event = program.sum(queue, a.shape, None, a_buffer, b_buffer, c_buffer)
    event.wait()
    kernel_ns = event.profile.end - event.profile.start
    elapsed = 1e-9 * kernel_ns
    print("GPU Kernel Time: {0} s".format(elapsed))

    # Copy the result back into a host array shaped like a.
    c_gpu = np.empty_like(a)
    cl.enqueue_read_buffer(queue, c_buffer, c_gpu).wait()

    gpu_end_time = time()
    print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))
    return c_gpu
def update(self, sub_pos, angle, min_dist, max_dist, width, in_weight, out_weight):
    '''
    Fold one measurement into the probabilities, then renormalise them.

    The ``evidence`` kernel is run over ``self.norths.shape`` with the
    north/east/probability buffers and eight float32 scalars: the two
    components of sub_pos, ``angle`` in radians, ``min_dist`` and
    ``max_dist`` squared, ``width``, ``in_weight`` and ``out_weight``.

    Original author's description of the inputs: the sub is at sub_pos,
    the target is seen at an absolute heading of ``angle`` and is most
    likely between min_dist and max_dist away; in_weight is
    P(this measurement | buoy at point p) for p in the region, out_weight
    is the same for points outside it.

    The probabilities are then read back, scaled in place so they sum to
    one, and written to the device again.
    '''
    n, e = sub_pos
    heading = float32(radians(angle))
    near_sq = float32(min_dist**2)
    far_sq = float32(max_dist**2)

    cl_program.evidence(
        cl_queue, self.norths.shape, None,
        self.norths_buf, self.easts_buf, self.prob_buf,
        float32(n), float32(e),
        heading, near_sq, far_sq,
        float32(width),
        float32(in_weight), float32(out_weight))

    # TODO ?
    cl.enqueue_read_buffer(cl_queue, self.prob_buf, self.probabilities).wait()

    # Normalize
    total_prob = numpy.sum(self.probabilities)
    self.probabilities /= total_prob

    cl.enqueue_write_buffer(cl_queue, self.prob_buf, self.probabilities)
def do_opencl_pow(hash_, target):
    """Perform PoW using OpenCL.

    Returns the single uint64 value the ``kernel_sha512`` kernel writes to
    its output buffer.  When no GPU is enabled the zero-initialised value is
    returned immediately.  Raises ``Exception("Interrupted")`` when the
    module-level ``shutdown`` flag is non-zero once the search loop ends.
    """
    found = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
    if not enabledGpus:
        return found[0][0]

    # Single input record: zero-prefixed hash bytes plus the target.
    record = numpy.zeros(1, dtype=hash_dt, order='C')
    record[0]['v'] = ("0000000000000000" + hash_).decode("hex")
    record[0]['target'] = target

    mf = cl.mem_flags
    hash_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=record)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, found.nbytes)

    kernel = program.kernel_sha512
    group_size = kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, enabledGpus[0])
    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    tried = 0
    batch = group_size * 2000
    while found[0][0] == 0 and shutdown == 0:
        # Argument 2 carries the running count as a little-endian uint64.
        kernel.set_arg(2, pack("<Q", tried))
        cl.enqueue_nd_range_kernel(queue, kernel, (batch,), (group_size,))
        try:
            cl.enqueue_read_buffer(queue, dest_buf, found)
        except AttributeError:
            # Fallback when this PyOpenCL has no enqueue_read_buffer.
            cl.enqueue_copy(queue, found, dest_buf)
        queue.finish()
        tried += batch

    if shutdown != 0:
        raise Exception("Interrupted")
    # logger.debug("Took %d tries.", tried)
    return found[0][0]
def execute(self, settings):
    """Run the ``mandel`` kernel and return its output as a square array.

    One work item is launched per element of ``self.c_real``.  The device
    output is read into an int32 array of ``settings.dim ** 2`` entries and
    returned reshaped to ``(settings.dim, settings.dim)``.
    """
    side = settings.dim
    global_size = (self.c_real.shape[0], )
    self.program.mandel(self.queue, global_size, None,
                        self.real_buf, self.imag_buf,
                        self.depth_buf, self.dest_buf)

    flat_counts = np.zeros(side**2, dtype=np.int32)
    read_event = cl.enqueue_read_buffer(self.queue, self.dest_buf, flat_counts)
    read_event.wait()
    return flat_counts.reshape([side, side])
def reduce_flatrot():
    """Run the ``float4_sum`` kernel and return the column sums of its output.

    The kernel is launched with 8 work groups of 64 work items.  Its
    (8, 4) float32 result is read back and summed over the first axis.
    """
    group_count = 8
    group_size = 64
    partial_sums = np.empty((group_count, 4), 'f')
    program.float4_sum(queue, (group_size * group_count,), (group_size,),
                       reduce_buf, reduce_scratch, qxdyqz_buf,
                       np.int32(length))
    read_event = cl.enqueue_read_buffer(queue, reduce_buf, partial_sums)
    read_event.wait()
    return partial_sums.sum(0)
def map1(data):
    """Add ``data[0]`` and ``data[1]`` element-wise on an OpenCL device.

    The platform and device indices are read from the ``CL_PLATFORM`` and
    ``CL_DEVICE`` environment variables.  Exactly five work items are
    launched and five float32 results are read back, regardless of the
    input length.

    Returns a one-element list holding the float32 result array.
    Raises KeyError when either environment variable is missing.
    """
    SPARKCL_PLATFORM = os.environ['CL_PLATFORM']
    SPARKCL_DEVICE = os.environ['CL_DEVICE']
    # Single-argument print calls emit the same text on Python 2 and 3.
    print(str(SPARKCL_PLATFORM) + ":" + str(SPARKCL_DEVICE))

    KERNEL_CODE = """
    __kernel void ArraySum(__global float *A,__global float *B,__global float *C){
        int i = get_global_id(0);
        C[i] = A[i]+B[i];
    }
    """
    cl_device = cl.get_platforms()[int(SPARKCL_PLATFORM)].get_devices()[int(SPARKCL_DEVICE)]
    ctx = cl.Context([cl_device])
    queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, KERNEL_CODE).build()
    kernel = prg.ArraySum
    mf = cl.mem_flags

    print("map" + str(data))

    # Convert both operands to float32 and copy them to the device.
    np_data = []
    data_buf = []
    for operand in (data[0], data[1]):
        host_array = np.array(operand).astype(np.float32)
        np_data.append(host_array)
        data_buf.append(
            cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_array))

    result = np.zeros((5,)).astype(np.float32)
    result_buf = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes)
    kernel(queue, (5,), None, data_buf[0], data_buf[1], result_buf)
    cl.enqueue_read_buffer(queue, result_buf, result).wait()
    return [result.astype(np.float32)]
def mineThread(self):
    """Run the search kernel over every work item taken from ``self.qr``.

    For each work item the kernel is launched ``data.iterations`` times,
    once per entry of ``data.base``.  After each launch the output buffer
    is read back; a flagged result is handed to ``self.postprocess`` on
    the reactor thread and the buffer is cleared.
    """
    state_slots = range(8)
    state2_slots = (1, 2, 3, 5, 6, 7)
    f_slots = range(5)
    for data in self.qr:
        for i in range(data.iterations):
            kernel_args = [data.state[k] for k in state_slots]
            kernel_args += [data.state2[k] for k in state2_slots]
            kernel_args.append(data.base[i])
            kernel_args += [data.f[k] for k in f_slots]
            kernel_args.append(self.output_buf)

            self.kernel.search(self.commandQueue, (data.size, ),
                               (self.WORKSIZE, ), *kernel_args)
            cl.enqueue_read_buffer(
                self.commandQueue, self.output_buf, self.output)
            self.commandQueue.finish()

            # The OpenCL code will flag the last item in the output buffer
            # when it finds a valid nonce.  Nothing to do when it is unset.
            if not self.output[self.OUTPUT_SIZE]:
                continue

            # Send a copy to the main thread for postprocessing, then clean
            # both the host and device buffers for the next pass.
            reactor.callFromThread(self.postprocess, self.output.copy(),
                                   data.nr)
            self.output.fill(0)
            cl.enqueue_write_buffer(
                self.commandQueue, self.output_buf, self.output)
def plotCurrentMembraneCoordinates(self):
    """Read the membrane X/Y coordinates from the device and plot Y over X."""
    transfers = (
        (self.dev_membraneCoordinatesX, self.host_membraneCoordinatesX),
        (self.dev_membraneCoordinatesY, self.host_membraneCoordinatesY),
    )
    # Each read is waited on before the next one starts (X first, then Y).
    for device_array, host_array in transfers:
        read_event = cl.enqueue_read_buffer(self.queue, device_array.data,
                                            host_array)
        read_event.wait()
    plt.plot(self.host_membraneCoordinatesX, self.host_membraneCoordinatesY)
def resize_uint32(self, data_np, scale_x, scale_y, out=None):
    """Resize an image on the device with the ``image_resize_uint32`` kernel.

    The output has ``int(height * scale_y)`` rows and
    ``int(width * scale_x)`` columns; trailing dimensions of ``data_np``
    are carried into the shape of a freshly allocated result.

    If ``out`` is given the result is read into it, otherwise a new array
    with the dtype of ``data_np`` is allocated.  Returns the filled array.
    """
    src_rows, src_cols = data_np.shape[:2]
    dst_rows = int(src_rows * scale_y)
    dst_cols = int(src_cols * scale_x)
    extra_dims = list(data_np.shape[2:])

    mf = cl.mem_flags
    # The host buffer handed to OpenCL is made C-contiguous first.
    data_np = np.ascontiguousarray(data_np)
    src_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                        hostbuf=data_np)
    # The destination is sized as one uint32 per output pixel.
    dst_nbytes = dst_rows * dst_cols * np.uint32(0).nbytes
    dst_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, dst_nbytes)

    self.program.image_resize_uint32(
        self.queue, [dst_rows, dst_cols], None, src_buf, dst_buf,
        np.int32(src_cols), np.int32(dst_cols),
        np.float64(scale_x), np.float64(scale_y))

    if out is None:
        out = np.empty([dst_rows, dst_cols] + extra_dims,
                       dtype=data_np.dtype)
    read_event = cl.enqueue_read_buffer(self.queue, dst_buf, out)
    read_event.wait()
    return out
def lombscarge_opencl(x, y, f):
    """Run the ``lombscargle`` OpenCL kernel in double precision.

    ``x``, ``y`` and ``f`` are converted to float64 and copied to the
    device; one work item is launched per element of ``f``.

    Returns an array shaped like ``f`` holding the kernel output.
    If the program fails to build, the build log is printed and the
    exception is re-raised.
    """
    x = np.float64(x)
    y = np.float64(y)
    f = np.float64(f)

    # start up gpu
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # The kernel receives the number of samples as an int32.
    Nx = np.int32(x.shape[0])

    # send data to card
    x_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
    y_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
    f_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=f)

    # make output
    pgram = np.empty_like(f)
    pgram_g = cl.Buffer(ctx, mf.WRITE_ONLY, pgram.nbytes)

    prg = cl.Program(ctx, lomb_txt)
    try:
        prg.build()
    except Exception:
        # Show the compiler output before propagating the failure.
        print("Error:")
        print(prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG))
        raise

    prg.lombscargle(queue, pgram.shape, None, x_g, y_g, f_g, pgram_g, Nx)
    # Wait for the transfer so the returned array is complete even when
    # the read is enqueued without blocking.
    cl.enqueue_read_buffer(queue, pgram_g, pgram).wait()
    return pgram
def test_opencl_0(zz, a, b, c_result):
    """Time a small arithmetic kernel on each platform and check its output.

    For every OpenCL platform the device at index 1 is used.  Platform and
    device details are printed, the kernel (whose inner loop runs ``zz``
    times) is executed with profiling, and the first ``zz`` results are
    compared against ``c_result``.  Mismatches are printed individually,
    followed by an overall verdict.
    """
    kernel_template = """
        __kernel void sum(__global const double *a,
                        __global const double *b, __global double *c)
        {
                    int loop;
                    int gid = get_global_id(0);
                    for(loop=0; loop<%s;loop++)
                    {
                            c[gid] = a[gid] + b[gid];
                            c[gid] = c[gid] * (a[gid] + b[gid]);
                            c[gid] = c[gid] * (a[gid] / 2);
                            c[gid] = log(exp(c[gid]));
                    }
        }
        """
    for platform in cl.get_platforms():
        device = platform.get_devices()[1]

        print("===============================================================")
        print("Platform name:", platform.name)
        print("Platform profile:", platform.profile)
        print("Platform vendor:", platform.vendor)
        print("Platform version:", platform.version)
        print("---------------------------------------------------------------")
        print("Device name:", device.name)
        print("Device type:", cl.device_type.to_string(device.type))
        print("Device memory: ", device.global_mem_size//1024//1024, 'MB')
        print("Device max clock speed:", device.max_clock_frequency, 'MHz')
        print("Device compute units:", device.max_compute_units)

        # Simple speed test on a profiling-enabled queue.
        ctx = cl.Context([device])
        queue = cl.CommandQueue(
            ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

        mf = cl.mem_flags
        input_flags = mf.READ_ONLY | mf.COPY_HOST_PTR
        a_buf = cl.Buffer(ctx, input_flags, hostbuf=a)
        b_buf = cl.Buffer(ctx, input_flags, hostbuf=b)
        dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

        prg = cl.Program(ctx, kernel_template % (zz)).build()

        exec_evt = prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
        exec_evt.wait()
        # Profiling counters are in nanoseconds.
        elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)
        print("Execution time of test: %g s" % elapsed)

        c = numpy.empty_like(a)
        cl.enqueue_read_buffer(queue, dest_buf, c).wait()

        mismatch = False
        for i in range(zz):
            if c[i] == c_result[i]:
                continue
            print("c_i: ", c[i], " c_results_i: ", c_result[i])
            print("diff: ", numpy.abs(c[i] - c_result[i]))
            mismatch = True

        if mismatch:
            print("Results doesn't match!!")
        else:
            print("Results OK")
def lomb_scargle32(x, y, f):
    '''Single-precision version of the ``lombscargle`` OpenCL kernel run.

    ``x``, ``y`` and ``f`` are converted to float32 and copied to the
    device; one work item is launched per element of ``f``.

    Returns an array shaped like ``f`` holding the kernel output.
    If the program fails to build, the build log is printed and the
    exception is re-raised.
    '''
    x = np.float32(x)
    y = np.float32(y)
    f = np.float32(f)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # The kernel receives the number of samples as an int32.
    Nx = np.int32(x.shape[0])

    # send data to card
    x_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
    y_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
    f_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=f)

    # make output
    pgram = np.empty_like(f)
    pgram_g = cl.Buffer(ctx, mf.WRITE_ONLY, pgram.nbytes)

    prg = cl.Program(ctx, lomb_txt32)
    try:
        prg.build()
    except Exception:
        # Show the compiler output before propagating the failure.
        print(prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG))
        raise

    prg.lombscargle(queue, pgram.shape, None, x_g, y_g, f_g, pgram_g, Nx)
    # Wait for the transfer so the returned array is complete even when
    # the read is enqueued without blocking.
    cl.enqueue_read_buffer(queue, pgram_g, pgram).wait()
    return pgram
def exchange_boundary_e(s):
    """Pass field components 1 and 2 from each device to the previous one.

    First, every device except the first has ``eh_fields[1]`` and
    ``eh_fields[2]`` read into its own host staging arrays ``tmpf[0]`` and
    ``tmpf[1]``.  Then every device except the last receives the staging
    arrays of the *next* device, written at that device's entry in
    ``s.offsets``.

    The inputs are sliced before zipping (instead of slicing the result of
    ``zip``) so the function also works where ``zip`` returns an iterator.
    """
    # Device -> host, skipping the first device.
    for queue, eh_fields, tmpf in zip(s.queues[1:], s.eh_fields_gpus[1:],
                                      s.tmpfs[1:]):
        cl.enqueue_read_buffer(queue, eh_fields[1], tmpf[0])  # ey_gpu
        cl.enqueue_read_buffer(queue, eh_fields[2], tmpf[1])  # ez_gpu

    # Host -> device: device i gets the staging arrays filled from i + 1.
    for queue, eh_fields, tmpf, offset in zip(s.queues[:-1],
                                              s.eh_fields_gpus[:-1],
                                              s.tmpfs[1:],
                                              s.offsets[:-1]):
        cl.enqueue_write_buffer(queue, eh_fields[1], tmpf[0], offset)
        cl.enqueue_write_buffer(queue, eh_fields[2], tmpf[1], offset)
def execute(self):
    """
    Runs test openCL kernel and returns elapsed time.

    The kernel source is loaded from ``self.src``, built, and launched as
    ``matrix_mul`` over a ``(self.m, self.p)`` global range with a
    ``(self.block, self.block)`` local size.  The result is read into
    ``self.C``.

    Returns the profiled kernel execution time in seconds.
    """
    kernel = self.LoadKernelSrc(self.src)

    # build opencl kernel
    prg = cl.Program(self.ctx, kernel).build()

    # Keep the event itself: the profiling counters are read from it below,
    # so it must not be replaced by the return value of wait().
    exec_evt = prg.matrix_mul(
        self.queue,
        (self.m, self.p,),
        self.A_buf,
        self.B_buf,
        self.C_buf,
        np.uint32(self.m),
        np.uint32(self.n),
        np.uint32(self.p),
        local_size=(self.block, self.block,),
    )
    exec_evt.wait()

    # read result from opencl buffer
    cl.enqueue_read_buffer(self.queue, self.C_buf, self.C).wait()

    # return elapsed time in seconds (profiling counters are nanoseconds)
    return 1e-9 * (exec_evt.profile.end - exec_evt.profile.start)
def do_opencl_pow(hash, target):
    """Run the ``kernel_sha512`` kernel until it writes a non-zero result.

    Returns the single uint64 value from the kernel's output buffer.  When
    the module-level ``ctx`` equals ``False`` the zero-initialised value is
    returned immediately.

    The module-level ``ctx``, ``queue``, ``program``, ``gpus`` and
    ``hash_dt`` are only read here, so no ``global`` declaration is needed.
    """
    output = numpy.zeros(1, dtype=[("v", numpy.uint64, 1)])
    # NOTE(review): presumably ctx is set to False when OpenCL is
    # unavailable; the equality test is kept exactly as written.
    if ctx == False:
        return output[0][0]

    # Single input record: zero-prefixed hash bytes plus the target.
    data = numpy.zeros(1, dtype=hash_dt, order="C")
    data[0]["v"] = ("0000000000000000" + hash).decode("hex")
    data[0]["target"] = target

    hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, gpus[0])

    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    progress = 0
    globamt = worksize * 2000

    while output[0][0] == 0:
        # Argument 2 carries the running count as a little-endian uint64.
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        cl.enqueue_read_buffer(queue, dest_buf, output)
        queue.finish()
        progress += globamt
    # logger.debug("Took %d tries.", progress)
    return output[0][0]
def __call__(self, ctx, x, y, rx, ry, sw, sh, ez, ex, ey):
    """Run the ``distance_2_point`` kernel and return its output.

    ``x``, ``y``, ``ez``, ``ex`` and ``ey`` are converted to float32 arrays
    and copied to the device; ``rx``, ``ry``, ``sw`` and ``sh`` are passed
    as float32 scalars.  One work item is launched per element of ``x``.

    Returns a float32 array shaped like ``x``.  Every device buffer
    created here is released before returning, including when the kernel
    launch or the read-back raises.
    """
    # NOTE(review): presumably build() prepares self.ctx and self.prg for
    # the given context - confirm against its definition.
    self.build(ctx)

    # np.asarray converts only when needed and, unlike
    # np.array(..., copy=False), does not raise on NumPy 2 when a copy is
    # unavoidable (e.g. list input or a dtype change).
    x = np.asarray(x, dtype=np.float32)
    y = np.asarray(y, dtype=np.float32)
    ez = np.asarray(ez, dtype=np.float32)
    ex = np.asarray(ex, dtype=np.float32)
    ey = np.asarray(ey, dtype=np.float32)
    rx = np.float32(rx)
    ry = np.float32(ry)
    sw = np.float32(sw)
    sh = np.float32(sh)

    mf = cl.mem_flags
    input_flags = mf.READ_ONLY | mf.COPY_HOST_PTR
    buffers = []
    try:
        for host_array in (x, y, ez, ex, ey):
            buffers.append(cl.Buffer(self.ctx, input_flags, hostbuf=host_array))
        x_buf, y_buf, ez_buf, ex_buf, ey_buf = buffers
        out_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, x.nbytes)
        buffers.append(out_buf)

        queue = cl.CommandQueue(self.ctx)
        self.prg.distance_2_point(queue, x.shape, None, x_buf, y_buf,
                                  rx, ry, sw, sh,
                                  ez_buf, ex_buf, ey_buf, out_buf)
        out = np.empty_like(x)
        cl.enqueue_read_buffer(queue, out_buf, out).wait()
        return out
    finally:
        # Release in creation order: inputs first, output last.
        for buf in buffers:
            buf.release()
def copy_array(self, arr_like, arr_device):
    """
    Copy a device buffer to the host and return the host array.

    The host array is allocated with the shape and dtype of ``arr_like``;
    ``arr_device`` is the device buffer that is read.
    """
    host_copy = np.empty_like(arr_like)
    read_event = cl.enqueue_read_buffer(self.queue, arr_device, host_copy)
    read_event.wait()
    return host_copy
def __call__(self, ctx, x1, y1, x2, y2, rx, ry, sw, sh, ez, ex, ey):
    """Run the ``subtended_angle_naive`` kernel and return its output.

    ``x1``, ``y1``, ``x2``, ``y2``, ``ez``, ``ex`` and ``ey`` are converted
    to float32 arrays and copied to the device; ``rx``, ``ry``, ``sw`` and
    ``sh`` are passed as float32 scalars.  One work item is launched per
    element of ``x1``.

    Returns a float32 array shaped like ``x1``.  Every device buffer
    created here is released before returning, including when the kernel
    launch or the read-back raises.
    """
    # NOTE(review): presumably build() prepares self.ctx and self.prg for
    # the given context - confirm against its definition.
    self.build(ctx)

    # np.asarray converts only when needed and, unlike
    # np.array(..., copy=False), does not raise on NumPy 2 when a copy is
    # unavoidable (e.g. list input or a dtype change).
    x1 = np.asarray(x1, dtype=np.float32)
    y1 = np.asarray(y1, dtype=np.float32)
    x2 = np.asarray(x2, dtype=np.float32)
    y2 = np.asarray(y2, dtype=np.float32)
    ez = np.asarray(ez, dtype=np.float32)
    ex = np.asarray(ex, dtype=np.float32)
    ey = np.asarray(ey, dtype=np.float32)
    rx = np.float32(rx)
    ry = np.float32(ry)
    sw = np.float32(sw)
    sh = np.float32(sh)

    mf = cl.mem_flags
    input_flags = mf.READ_ONLY | mf.COPY_HOST_PTR
    buffers = []
    try:
        for host_array in (x1, y1, x2, y2, ez, ex, ey):
            buffers.append(cl.Buffer(self.ctx, input_flags, hostbuf=host_array))
        x1_buf, y1_buf, x2_buf, y2_buf, ez_buf, ex_buf, ey_buf = buffers
        out_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, x1.nbytes)
        buffers.append(out_buf)

        queue = cl.CommandQueue(self.ctx)
        self.prg.subtended_angle_naive(queue, x1.shape, None,
                                       x1_buf, y1_buf, x2_buf, y2_buf,
                                       rx, ry, sw, sh,
                                       ez_buf, ex_buf, ey_buf, out_buf)
        out = np.empty_like(x1)
        cl.enqueue_read_buffer(queue, out_buf, out).wait()
        return out
    finally:
        # Release in creation order: inputs first, output last.
        for buf in buffers:
            buf.release()
def mineThread(self):
    """Run the search kernel over every work item taken from ``self.qr``.

    For each work item the kernel is launched ``data.iterations`` times,
    once per entry of ``data.base``.  After each launch the output buffer
    is read back; a flagged result is handed to ``self.postprocess`` on
    the reactor thread and the buffer is cleared.
    """
    queue = self.commandQueue
    for data in self.qr:
        # Arguments that stay the same for every iteration of this item.
        leading_args = [data.state[k] for k in range(8)]
        leading_args.extend(data.state2[k] for k in (1, 2, 3, 5, 6, 7))
        trailing_args = [data.f[k] for k in range(1, 9)]
        trailing_args.append(self.output_buf)

        for i in range(data.iterations):
            launch_args = leading_args + [data.base[i]] + trailing_args
            self.kernel.search(queue, (data.size, ), (self.WORKSIZE, ),
                               *launch_args)
            cl.enqueue_read_buffer(queue, self.output_buf, self.output)
            queue.finish()

            # The OpenCL code will flag the last item in the output buffer
            # when it finds a valid nonce.  Nothing to do when it is unset.
            if not self.output[self.OUTPUT_SIZE]:
                continue

            # Send a copy to the main thread for postprocessing, then clean
            # both the host and device buffers for the next pass.
            reactor.callFromThread(self.postprocess, self.output.copy(),
                                   data.nr)
            self.output.fill(0)
            cl.enqueue_write_buffer(queue, self.output_buf, self.output)
def fromDevice(self, buf, shape=None):
    """Read ``buf`` from the device into a new host array and return it.

    The host array uses ``buf.dtype`` and, unless ``shape`` is given,
    ``buf.shape``.
    """
    target_shape = buf.shape if shape is None else shape
    host_array = numpy.empty(target_shape, dtype=buf.dtype)
    read_event = cl.enqueue_read_buffer(self.queue, buf, host_array)
    read_event.wait()
    return host_array
def mineThread(self):
    """Run the search kernel over every work item taken from ``self.qr``.

    For each work item the kernel is launched ``data.iterations`` times,
    once per entry of ``data.base``.  When ``self.GOFFSET`` is set, the
    unpacked base value is also passed as the launch's global offset.
    After each launch the output buffer is read back; a flagged result is
    handed to ``self.postprocess`` on the reactor thread and the buffer is
    cleared.
    """
    queue = self.commandQueue
    for data in self.qr:
        for i in range(data.iterations):
            base = data.base[i]
            if self.GOFFSET:
                offset = (unpack('I', base)[0],)
            else:
                offset = None

            kernel_args = [data.state[k] for k in range(8)]
            kernel_args += [data.state2[k] for k in (1, 2, 3, 5, 6, 7)]
            kernel_args.append(base)
            kernel_args += [data.f[k] for k in range(8)]
            kernel_args.append(self.output_buf)

            self.kernel.search(queue, (data.size, ), (self.WORKSIZE, ),
                               *kernel_args, global_offset=offset)
            # Non-blocking read; finish() below waits for it to complete.
            cl.enqueue_read_buffer(queue, self.output_buf, self.output,
                                   is_blocking=False)
            queue.finish()

            # The OpenCL code will flag the last item in the output buffer
            # when it finds a valid nonce.  Nothing to do when it is unset.
            if not self.output[self.WORKSIZE]:
                continue

            # Send a copy to the main thread for postprocessing, then clean
            # both the host and device buffers for the next pass.
            reactor.callFromThread(self.postprocess, self.output.copy(),
                                   data.nr)
            self.output.fill(0)
            cl.enqueue_write_buffer(queue, self.output_buf, self.output,
                                    is_blocking=False)
def transform_uint32(self, data_np, flip_x=False, flip_y=False, swap_xy=False, out=None):
    """Run the ``image_transform_uint32`` kernel over an image.

    The kernel is launched over the source ``[height, width]`` range and
    receives ``flip_x``, ``flip_y`` and ``swap_xy`` as int32 flags.  When
    ``swap_xy`` is true, the first two dimensions of a freshly allocated
    result are exchanged.

    If ``out`` is given the result is read into it, otherwise a new array
    is allocated.  Returns the filled array.
    """
    src_rows, src_cols = data_np.shape[:2]
    if swap_xy:
        dst_rows, dst_cols = src_cols, src_rows
    else:
        dst_rows, dst_cols = src_rows, src_cols
    dst_shape = [dst_rows, dst_cols] + list(data_np.shape[2:])

    mf = cl.mem_flags
    # The host buffer handed to OpenCL is made C-contiguous first.
    data_np = np.ascontiguousarray(data_np)
    src_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                        hostbuf=data_np)
    # The transform keeps the element count, so the output is the same size.
    dst_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, data_np.nbytes)

    flags = [np.int32(flip_x), np.int32(flip_y), np.int32(swap_xy)]
    self.program.image_transform_uint32(
        self.queue, [src_rows, src_cols], None, src_buf, dst_buf,
        np.int32(src_cols), np.int32(src_rows), *flags)

    if out is None:
        out = np.empty_like(data_np).reshape(dst_shape)
    read_event = cl.enqueue_read_buffer(self.queue, dst_buf, out)
    read_event.wait()
    return out
def execute(self):
    """Run the ``part1`` kernel and print both inputs and the result.

    One work item is launched per element of ``self.a``.  The device
    output is read into a new array shaped like ``self.a``.  Nothing is
    returned.
    """
    self.program.part1(self.queue, self.a.shape, None,
                       self.a_buf, self.b_buf, self.dest_buf)
    c = numpy.empty_like(self.a)
    cl.enqueue_read_buffer(self.queue, self.dest_buf, c).wait()
    # Single-argument print calls produce "label value" on Python 2 and 3
    # alike; the original print statements were Python 2 only.
    print("a %s" % (self.a,))
    print("b %s" % (self.b,))
    print("c %s" % (c,))
def resize_uint32(self, data_np, scale_x, scale_y, out=None):
    """Resize an image on the device with the ``image_resize_uint32`` kernel.

    Output size is ``int(height * scale_y)`` by ``int(width * scale_x)``.
    Trailing dimensions of ``data_np`` are appended to the shape of a
    freshly allocated result.

    The result is read into ``out`` when supplied, else into a new array
    with the dtype of ``data_np``.  Returns that array.
    """
    height, width = data_np.shape[:2]
    target = [int(height * scale_y), int(width * scale_x)]
    full_shape = target + list(data_np.shape[2:])

    flags = cl.mem_flags
    # create OpenCL buffers on devices; the source must be C-contiguous
    data_np = np.ascontiguousarray(data_np)
    source = cl.Buffer(self.ctx, flags.READ_ONLY | flags.COPY_HOST_PTR,
                       hostbuf=data_np)
    # One uint32 per output pixel.
    bytes_needed = target[0] * target[1] * np.uint32(0).nbytes
    destination = cl.Buffer(self.ctx, flags.WRITE_ONLY, bytes_needed)

    self.program.image_resize_uint32(self.queue, [target[0], target[1]],
                                     None, source, destination,
                                     np.int32(width), np.int32(target[1]),
                                     np.float64(scale_x),
                                     np.float64(scale_y))

    if out is None:
        out = np.empty(full_shape, dtype=data_np.dtype)
    cl.enqueue_read_buffer(self.queue, destination, out).wait()
    return out
def prepare_environment(self, filename, camera_index):
    """Build the kernel in ``filename`` and run the camera display loop.

    Frames are captured from camera ``camera_index``.  For every frame,
    each row except the first and last is sent to the
    ``calculate_differences`` kernel together with the rows above and
    below it, and the processed frame is shown in "camera_window".

    The loop ends when ``cv.WaitKey`` reports key code 27 (ASCII Esc),
    after which the window is destroyed.
    """
    # build kernel for videocard; the file is closed once it has been read
    with open(filename, 'r') as kernel_file:
        kernel_string = kernel_file.read()
    self.program = cl.Program(self.gpu_context, kernel_string).build()

    # get frames from the webcam
    self.stream = cv.CaptureFromCAM(camera_index)
    self.line_cols = cv.GetMat(cv.QueryFrame(self.stream)).cols
    input_flags = self.memory_flags.READ_ONLY | self.memory_flags.COPY_HOST_PTR

    while True:
        self.frame = cv.QueryFrame(self.stream)
        self.frame = cv.GetMat(self.frame)
        self.image_data = numpy.asarray(self.frame)
        self.image_data = numpy.array(self.image_data, dtype=numpy.int32)
        final = numpy.zeros(shape=(self.image_data.shape))
        last_row = self.image_data.shape[0] - 1

        for position, line in enumerate(self.image_data):
            # Border rows have no neighbour on one side and stay zero.
            if position == 0 or position == last_row:
                continue
            line = line.ravel()
            self.line_buffer = cl.Buffer(
                self.gpu_context, input_flags, hostbuf=line)
            self.top_line_buffer = cl.Buffer(
                self.gpu_context, input_flags,
                hostbuf=self.image_data[position-1])
            self.bottom_line_buffer = cl.Buffer(
                self.gpu_context, input_flags,
                hostbuf=self.image_data[position+1])
            self.contour_buffer = cl.Buffer(
                self.gpu_context, self.memory_flags.WRITE_ONLY, line.nbytes)

            self.program.calculate_differences(
                self.command_queue, line.shape, None,
                self.top_line_buffer, self.line_buffer,
                self.bottom_line_buffer, self.contour_buffer)
            contour = numpy.empty_like(line)
            cl.enqueue_read_buffer(
                self.command_queue, self.contour_buffer, contour).wait()
            final[position] = contour.reshape(self.line_cols, 3)

        img = numpy.uint8(final)
        img = cv.fromarray(img)
        cv.ShowImage("camera_window", img)
        if cv.WaitKey(10) == 27:
            break

    cv.DestroyWindow("camera_window")
def subtended_angle(self, x1, y1, x2, y2, rx, ry, sw, sh, ez, ex, ey):
    """Run ``self.cl_subtended_angle`` on the device and return its output.

    ``x1``, ``y1``, ``x2``, ``y2``, ``ez``, ``ex`` and ``ey`` are converted
    to float32 arrays and copied to the device; ``rx``, ``ry``, ``sw`` and
    ``sh`` are passed as float32 scalars.  One work item is launched per
    element of ``x1``.

    Returns a float32 array shaped like ``x1``.
    """
    # np.asarray converts only when needed and, unlike
    # np.array(..., copy=False), does not raise on NumPy 2 when a copy is
    # unavoidable (e.g. list input or a dtype change).
    x1 = np.asarray(x1, dtype=np.float32)
    y1 = np.asarray(y1, dtype=np.float32)
    x2 = np.asarray(x2, dtype=np.float32)
    y2 = np.asarray(y2, dtype=np.float32)
    ez = np.asarray(ez, dtype=np.float32)
    ex = np.asarray(ex, dtype=np.float32)
    ey = np.asarray(ey, dtype=np.float32)
    rx = np.float32(rx)
    ry = np.float32(ry)
    sw = np.float32(sw)
    sh = np.float32(sh)

    mf = cl.mem_flags
    input_flags = mf.READ_ONLY | mf.COPY_HOST_PTR
    x1_buf = cl.Buffer(self.ctx, input_flags, hostbuf=x1)
    y1_buf = cl.Buffer(self.ctx, input_flags, hostbuf=y1)
    x2_buf = cl.Buffer(self.ctx, input_flags, hostbuf=x2)
    y2_buf = cl.Buffer(self.ctx, input_flags, hostbuf=y2)
    ez_buf = cl.Buffer(self.ctx, input_flags, hostbuf=ez)
    ex_buf = cl.Buffer(self.ctx, input_flags, hostbuf=ex)
    ey_buf = cl.Buffer(self.ctx, input_flags, hostbuf=ey)
    out_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, x1.nbytes)

    self.cl_subtended_angle(self.queue, x1.shape, None,
                            x1_buf, y1_buf, x2_buf, y2_buf,
                            rx, ry, sw, sh,
                            ez_buf, ex_buf, ey_buf, out_buf)
    self.queue.finish()

    out = np.empty_like(x1)
    # Wait for the transfer so the returned array is complete even when
    # the read is enqueued without blocking.
    cl.enqueue_read_buffer(self.queue, out_buf, out).wait()
    return out