def compute(self):
    glMatrixMode(GL_MODELVIEW)
    glPushMatrix()
    glLoadIdentity()
    self.applySceneTransforms()
    mat = np.array(glGetFloat(GL_MODELVIEW_MATRIX).transpose(), order='C')
    glPopMatrix()
    inv = np.array(np.linalg.inv(mat), order='C')

    e1 = cl.enqueue_write_buffer(queue, self.matrix, mat)
    e2 = cl.enqueue_write_buffer(queue, self.inv_matrix, inv)
    e3 = self.program.pdbTracer(queue, self.dst.shape[:2], self.dst_buf,
                                self.matrix, self.inv_matrix,
                                np.array(len(self.mol.spheres)),
                                self.spheredata, self.envmap, self.phimap,
                                self.sampler)
    e4 = cl.enqueue_read_buffer(queue, self.dst_buf, self.dst)
    queue.finish()
    e4.wait()

    for e in [e3]:
        print (e.profile.END - e.profile.START)*1e-9

    glBindTexture(GL_TEXTURE_2D, self.dstTex)
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, N, N, GL_RGBA, GL_UNSIGNED_BYTE, self.dst)
def update(self, sub_pos, angle, min_dist, max_dist, width, in_weight, out_weight):
    '''
    Perform one update on the probabilities using the evidence that the sub is
    at position sub_pos and the target is seen at an absolute heading of `angle`,
    most likely between min_dist and max_dist away.

    in_weight gives, for every point p in the region, the chance that we would
    get this measurement if the buoy were there, i.e.
        in_weight = P(this measurement | buoy at point p) for p in our region
    out_weight is the same but for points outside the region.
    '''
    n, e = sub_pos
    cl_program.evidence(cl_queue, self.norths.shape, None,
                        self.norths_buf, self.easts_buf, self.prob_buf,
                        float32(n), float32(e), float32(radians(angle)),
                        float32(min_dist**2), float32(max_dist**2),
                        float32(width), float32(in_weight), float32(out_weight))
    #TODO ?
    cl.enqueue_read_buffer(cl_queue, self.prob_buf, self.probabilities).wait()

    # Normalize
    total_prob = numpy.sum(self.probabilities)
    self.probabilities /= total_prob
    cl.enqueue_write_buffer(cl_queue, self.prob_buf, self.probabilities)
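# For reference: a minimal, host-only NumPy sketch of the same Bayesian update the
# `evidence` kernel performs on the device. This is an illustration, not the kernel's
# actual code -- the bearing/width test is omitted and only the distance band is kept,
# and the grid arrays (norths, easts) are assumed to already exist on the host.
import numpy as np

def update_host(norths, easts, probabilities, sub_pos, min_dist, max_dist,
                in_weight, out_weight):
    """Weight each grid point by P(measurement | buoy at p), then renormalize."""
    n, e = sub_pos
    d2 = (norths - n) ** 2 + (easts - e) ** 2            # squared distance to the sub
    in_region = (d2 >= min_dist ** 2) & (d2 <= max_dist ** 2)
    likelihood = np.where(in_region, in_weight, out_weight)
    probabilities *= likelihood                          # Bayes: prior * likelihood
    probabilities /= probabilities.sum()                 # normalize to a distribution
    return probabilities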
def mineThread(self):
    for data in self.qr:
        for i in range(data.iterations):
            self.kernel.search(
                self.commandQueue, (data.size, ), (self.WORKSIZE, ),
                data.state[0], data.state[1], data.state[2], data.state[3],
                data.state[4], data.state[5], data.state[6], data.state[7],
                data.state2[1], data.state2[2], data.state2[3],
                data.state2[5], data.state2[6], data.state2[7],
                data.base[i],
                data.f[0], data.f[1], data.f[2], data.f[3], data.f[4],
                self.output_buf)
            cl.enqueue_read_buffer(
                self.commandQueue, self.output_buf, self.output)
            self.commandQueue.finish()

            # The OpenCL code will flag the last item in the output buffer when
            # it finds a valid nonce. If that's the case, send it to the main
            # thread for postprocessing and clean the buffer for the next pass.
            if self.output[self.OUTPUT_SIZE]:
                reactor.callFromThread(self.postprocess, self.output.copy(), data.nr)
                self.output.fill(0)
                cl.enqueue_write_buffer(
                    self.commandQueue, self.output_buf, self.output)
def allocations(s):
    s.eh_fieldss = []
    s.ce_fieldss = []
    mf = cl.mem_flags

    for i, nx in enumerate(s.nxs):
        f = np.zeros((nx, s.ny, s.nz), 'f')
        cf = np.ones_like(f) * 0.5

        if i < s.ngpu:
            s.eh_fieldss.append(
                [cl.Buffer(s.context, mf.READ_WRITE, f.nbytes) for m in range(6)])
            s.ce_fieldss.append(
                [cl.Buffer(s.context, mf.READ_ONLY, cf.nbytes) for m in range(3)])
            for eh_field in s.eh_fieldss[-1]:
                cl.enqueue_write_buffer(s.queues[i], eh_field, f)
            for ce_field in s.ce_fieldss[-1]:
                cl.enqueue_write_buffer(s.queues[i], ce_field, cf)
        else:
            s.eh_fieldss.append([f.copy() for i in xrange(6)])
            s.ce_fieldss.append([cf.copy() for i in xrange(3)])
        del f, cf

    s.offsets = []
    s.tmpfs = []
    for nx in s.nxs:
        s.offsets.append((nx - 1) * s.ny * s.nz * np.nbytes['float32'])
        s.tmpfs.append([np.zeros((s.ny, s.nz), dtype=np.float32) for m in range(2)])
def __init__(s, fdtd, nx, ny, nz):
    super(TestSetFields, s).__init__(nx, ny, nz)
    s.fdtd = fdtd

    for strf in s.strf_list:
        randarr = np.random.rand(nx, ny, nz).astype(s.fdtd.dtype)
        cl.enqueue_write_buffer(s.fdtd.queue, s.fdtd.get_buffer(strf), randarr)
def exchange_boundary_h(s):
    for queue, eh_fields, tmpf, offset in zip(s.queues, s.eh_fields_gpus, s.tmpfs, s.offsets)[:-1]:
        cl.enqueue_read_buffer(queue, eh_fields[4], tmpf[0], offset)  # hy_gpu
        cl.enqueue_read_buffer(queue, eh_fields[5], tmpf[1], offset)  # hz_gpu

    for queue, eh_fields, tmpf in zip(s.queues[1:], s.eh_fields_gpus[1:], s.tmpfs[:-1]):
        cl.enqueue_write_buffer(queue, eh_fields[4], tmpf[0])
        cl.enqueue_write_buffer(queue, eh_fields[5], tmpf[1])
def exchange_boundary_e(s):
    for queue, eh_fields, tmpf in zip(s.queues, s.eh_fields_gpus, s.tmpfs)[1:]:
        cl.enqueue_read_buffer(queue, eh_fields[1], tmpf[0])  # ey_gpu
        cl.enqueue_read_buffer(queue, eh_fields[2], tmpf[1])  # ez_gpu

    for queue, eh_fields, tmpf, offset in zip(s.queues[:-1], s.eh_fields_gpus[:-1], s.tmpfs[1:], s.offsets[:-1]):
        cl.enqueue_write_buffer(queue, eh_fields[1], tmpf[0], offset)
        cl.enqueue_write_buffer(queue, eh_fields[2], tmpf[1], offset)
def mineThread(self):
    for data in self.qr:
        for i in range(data.iterations):
            offset = (unpack('I', data.base[i])[0],) if self.GOFFSET else None
            self.kernel.search(
                self.commandQueue, (data.size, ), (self.WORKSIZE, ),
                data.state[0], data.state[1], data.state[2], data.state[3],
                data.state[4], data.state[5], data.state[6], data.state[7],
                data.state2[1], data.state2[2], data.state2[3],
                data.state2[5], data.state2[6], data.state2[7],
                data.base[i],
                data.f[0], data.f[1], data.f[2], data.f[3],
                data.f[4], data.f[5], data.f[6], data.f[7],
                self.output_buf, global_offset=offset)
            cl.enqueue_read_buffer(self.commandQueue, self.output_buf,
                                   self.output, is_blocking=False)
            self.commandQueue.finish()

            # The OpenCL code will flag the last item in the output buffer
            # when it finds a valid nonce. If that's the case, send it to
            # the main thread for postprocessing and clean the buffer
            # for the next pass.
            if self.output[self.WORKSIZE]:
                reactor.callFromThread(self.postprocess, self.output.copy(), data.nr)
                self.output.fill(0)
                cl.enqueue_write_buffer(self.commandQueue, self.output_buf,
                                        self.output, is_blocking=False)
def __init__(s, fdtd, nx, ny, nz):
    super(TestGetFields, s).__init__(nx, ny, nz)
    s.fdtd = fdtd
    s.fhosts = {}

    for strf in s.strf_list:
        s.fhosts[strf] = np.random.rand(nx, ny, nz).astype(s.fdtd.dtype)
        cl.enqueue_write_buffer(s.fdtd.queue, s.fdtd.get_buffer(strf), s.fhosts[strf])
def set_target(self, target):
    flags = mf.READ_ONLY | mf.COPY_HOST_PTR
    self.target = np.array(target, np.float32)
    if self.target_buffer is None:
        self.target_buffer = self.buffer(self.target)
    else:
        cl.enqueue_write_buffer(self.queue, self.target_buffer, self.target)
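# The snippet above follows a common create-once, overwrite-later pattern: allocate the
# device buffer the first time a value arrives, then reuse the same allocation for later
# updates. A minimal self-contained sketch of that pattern, using the current
# `cl.enqueue_copy` call; the buffer and array names here are illustrative only.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

target_buf = None

def set_target(target):
    """Allocate the device buffer on first use, then just overwrite it."""
    global target_buf
    host = np.asarray(target, dtype=np.float32)
    if target_buf is None:
        # first call: create the buffer and copy the host data in one step
        target_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host)
    else:
        # later calls: reuse the existing allocation
        cl.enqueue_copy(queue, target_buf, host)

set_target([1.0, 2.0, 3.0])
set_target([4.0, 5.0, 6.0])
queue.finish()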
def randomize_weights(self, context):
    """
    Initialize weights of layer by random values
    """
    weights = numpy.random.rand(context._weights_buf_size).astype(numpy.float32)
    weights -= 0.5
    weights *= 4.0 / numpy.sqrt(numpy.float32(context._weights_buf_size / context._neurons_buf_size))

    pyopencl.enqueue_write_buffer(context.opencl.queue, context._weights_buf,
                                  weights, is_blocking=True)
def set_weights(self, weights):
    """
    Set weights for entire layer.

    @param weights NumPy.NDArray of float32 values, size equals to
                   inputs_per_neuron * neuron_count
    """
    pyopencl.enqueue_write_buffer(self.opencl.queue, self.context._weights_buf,
                                  weights,
                                  device_offset=int(self._weights_offset * 4),
                                  is_blocking=True)
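# set_weights() writes one layer's slice into a larger shared weights buffer by passing a
# byte device_offset (each float32 is 4 bytes). A minimal sketch of the same sub-region
# write with nothing assumed beyond a context and queue; it uses the current
# `cl.enqueue_copy` call, and treats the `device_offset` keyword as an assumption that
# your PyOpenCL version accepts it for buffer targets (very old releases may not).
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

n_total = 16             # total floats in the shared buffer
offset_floats = 4        # this "layer" starts at element 4
layer = np.arange(6, dtype=np.float32)

weights_buf = cl.Buffer(ctx, mf.READ_WRITE, n_total * 4)

# write the layer's slice at a byte offset inside the big buffer
cl.enqueue_copy(queue, weights_buf, layer,
                device_offset=offset_floats * layer.dtype.itemsize,
                is_blocking=True)

# read the whole buffer back and look at where the slice landed
check = np.empty(n_total, dtype=np.float32)
cl.enqueue_copy(queue, check, weights_buf, is_blocking=True)
print(check[offset_floats:offset_floats + layer.size])   # -> [0. 1. 2. 3. 4. 5.]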
def __init__(self, baseTab):
    self.baseTab = baseTab
    self.ctx = cl.create_some_context()
    self.queue = cl.CommandQueue(self.ctx)

    f = open("gutarp.cl", 'r')
    fstr = "".join(f.readlines())
    self.guTarpCL = cl.Program(self.ctx, fstr).build()

    self.baseTabBuffer = cl.Buffer(self.ctx,
                                   cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                                   hostbuf=self.baseTab)
    cl.enqueue_write_buffer(self.queue, self.baseTabBuffer, self.baseTab)
def push_particles(self, pos, vel, color):
    nn = pos.shape[0]
    if self.num + nn > self.sph.max_num:
        return
    self.acquire_gl()
    cl.enqueue_write_buffer(self.queue, self.position_u, pos, self.num)
    self.release_gl()
    self.num += nn
    self.update_sphp()
    self.queue.finish()
def test_process(self):
    weights = numpy.random.rand(self.nnc._weights_buf_size).astype(numpy.float32)
    weights -= 0.5
    weights *= 4.0 / numpy.sqrt(numpy.float32(self.nnc._weights_buf_size / self.nnc._neurons_buf_size))
    pyopencl.enqueue_write_buffer(self.ocl.queue, self.nnc._weights_buf, weights, is_blocking=True)

    self.nnc.input_layer.set_inputs(numpy.array([x * x for x in range(0, 10)], numpy.float32), is_blocking=True)
    self.nnc.input_layer.process()

    self.assertArrayEqual(self.i.get_outputs()[:3], self.h1.get_inputs())
    self.assertArrayEqual(self.i.get_outputs()[:5], self.h2.get_inputs())
    self.assertArrayEqual(self.i.get_outputs()[4:10], self.h3.get_inputs()[:6])
def exchange_boundary(snx, ny, queues, f_gpus, tmp_hs, tmp_ts):
    ngpu = len(queues)
    for i, queue in enumerate(queues):
        if i > 0:
            cl.enqueue_read_buffer(queue, f_gpus[i], tmp_hs[i], device_offset=ny*4)
        if i < ngpu-1:
            cl.enqueue_read_buffer(queue, f_gpus[i], tmp_ts[i], device_offset=(snx-2)*ny*4)

    for i, queue in enumerate(queues):
        if i > 0:
            cl.enqueue_write_buffer(queue, f_gpus[i], tmp_ts[i-1])
        if i < ngpu-1:
            cl.enqueue_write_buffer(queue, f_gpus[i], tmp_hs[i+1], device_offset=(snx-1)*ny*4)
def sobel(im, cl=None):
    if cl is None:
        cl = setup(im)
    im = im.astype(numpy.float32)
    pyopencl.enqueue_write_buffer(cl['queue'], cl['im_dev'], im)
    cl['prg'].sobel(cl['queue'], im.shape, (3,),
                    cl['im_dev'], cl['m_dev'], cl['x_dev'], cl['y_dev'])
    m = numpy.empty_like(im)
    x = numpy.empty_like(im)
    y = numpy.empty_like(im)
    pyopencl.enqueue_read_buffer(cl['queue'], cl['m_dev'], m).wait()
    pyopencl.enqueue_read_buffer(cl['queue'], cl['x_dev'], x).wait()
    pyopencl.enqueue_read_buffer(cl['queue'], cl['y_dev'], y).wait()
    return m, x, y
def compute(self, idTab):
    idTabBuffer = cl.Buffer(self.ctx,
                            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                            hostbuf=idTab)
    cl.enqueue_write_buffer(self.queue, idTabBuffer, idTab)

    result = numpy.empty([idTab.shape[0], 1], dtype=numpy.int32)
    resultBuffer = cl.Buffer(self.ctx,
                             cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                             hostbuf=result)
    cl.enqueue_write_buffer(self.queue, resultBuffer, result)

    self.guTarpCL.gutarp(self.queue, (idTab.shape[0],), None,
                         self.baseTabBuffer, idTabBuffer,
                         numpy.int32(self.baseTab.shape[0]),
                         numpy.int32(self.baseTab.shape[1]),
                         resultBuffer)
    cl.enqueue_read_buffer(self.queue, resultBuffer, result).wait()
    return result
def compute_raytrace(mat):
    assert mat.dtype == np.float32
    assert mat.shape == (4, 4)
    mat = np.ascontiguousarray(mat)
    cl.enqueue_write_buffer(queue, mat_buf, mat)
    cl.enqueue_write_buffer(queue, inv_mat_buf, np.ascontiguousarray(np.linalg.inv(mat))).wait()
    evt = program.raytrace(queue, (H, W), None,
                           img_buf, mat_buf, inv_mat_buf,
                           np.int32(num_faces), vert_buf, face_buf)
    evt.wait()
    return evt
def calc_radii_gpu(self):
    if self.openCLInitialized == 0:
        self.initialize_opencl()
    cl.enqueue_write_buffer(self.queue, self.ptcllist_d, self.ptcllist).wait()

    blockSize = 1024
    numBlocks = int(math.ceil(numpy.float64(self.nptcls) / numpy.float64(blockSize)))
    print(blockSize)
    print(numBlocks)

    self.prg.calc_radii_kernel(self.queue, (numBlocks * blockSize,), (blockSize,),
                               self.ptcllist_d, numpy.int32(self.nptcls),
                               numpy.float64(self.k))
    cl.enqueue_read_buffer(self.queue, self.ptcllist_d, self.ptcllist).wait()
def load_mesh(vertices, faces):
    assert vertices.dtype == np.float32
    assert vertices.shape[1] == 4
    assert vertices.flags['C_CONTIGUOUS']
    assert faces.dtype == np.int32
    assert faces.shape[1] == 4
    assert faces.flags['C_CONTIGUOUS']

    global vert_buf, face_buf, num_faces
    num_faces = faces.shape[0]
    vert_buf = cl.Buffer(context, mf.READ_WRITE, vertices.shape[0]*4*4)
    face_buf = cl.Buffer(context, mf.READ_WRITE, faces.shape[0]*4*4)
    evt = cl.enqueue_write_buffer(queue, vert_buf, vertices, is_blocking=True)
    evt = cl.enqueue_write_buffer(queue, face_buf, faces, is_blocking=True)
    return evt
def __init__(s, context, queue, nx, ny, nz, dtype=np.float32):
    s.context = context
    s.queue = queue
    s.nx = nx
    s.ny = ny
    s.nz = nz
    s.dtype = dtype

    mf = cl.mem_flags
    f = np.zeros((s.nx, s.ny, s.nz), dtype=s.dtype)
    cf = np.ones_like(f) * 0.5

    s.ehs = s.ex, s.ey, s.ez, s.hx, s.hy, s.hz = \
        [cl.Buffer(s.context, mf.READ_WRITE, f.nbytes) for i in range(6)]
    s.ces = [cl.Buffer(s.context, mf.READ_ONLY, cf.nbytes) for i in range(3)]

    for eh in s.ehs:
        cl.enqueue_write_buffer(queue, eh, f)
    for ce in s.ces:
        cl.enqueue_write_buffer(queue, ce, cf)
    del f, cf
def add(a, b):
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

    test1 = struct.pack('ffffi', .5, 10., .1, .2, 3)
    print test1, len(test1), struct.calcsize('ffffi')
    test_buf = cl.Buffer(ctx, mf.READ_ONLY, len(test1))
    cl.enqueue_write_buffer(queue, test_buf, test1).wait()

    global_size = a.shape
    local_size = None
    prg.sum(queue, global_size, local_size, a_buf, b_buf, dest_buf, test_buf)
    queue.finish()

    c = np.empty_like(a)
    cl.enqueue_read_buffer(queue, dest_buf, c).wait()
    return c
def get_weights_direction_buf(self, context):
    """
    Returns the direction by which to adjust the weights.
    """
    context.opencl.kernel_calc_conjugate_gradient_beta(
        context.opencl.queue, (64, ),
        context._gradient_buf, self.prev_gradient_buf,
        numpy.int32(context._weights_buf_size),
        pyopencl.LocalMemory(256), pyopencl.LocalMemory(256),
        self.beta_buf,
        local_size=(64, ))

    # test1 = numpy.ndarray([context.weights_buf_size], numpy.float32)
    # pyopencl.enqueue_read_buffer(context.opencl.queue, context.gradient_buf, test1, is_blocking=True)
    # test2 = numpy.ndarray([context.weights_buf_size], numpy.float32)
    # pyopencl.enqueue_read_buffer(context.opencl.queue, self.prev_gradient_buf, test2, is_blocking=True)
    #
    # beta = numpy.float32((test1 * (test1 - test2)).sum() / (test2 * test2).sum())
    # pyopencl.enqueue_write_buffer(context.opencl.queue, self.beta_buf, numpy.array([beta], numpy.float32), is_blocking=True)
    #
    # test = numpy.ndarray([context.weights_buf_size], numpy.float32)
    # pyopencl.enqueue_read_buffer(context.opencl.queue, self.beta_buf, test, is_blocking=True)

    self.iteration_count += 1
    if self.iteration_count > context.total_neurons:
        pyopencl.enqueue_write_buffer(context.opencl.queue, self.beta_buf,
                                      numpy.zeros([1], numpy.float32),
                                      is_blocking=True)
        self.iteration_count = 0

    context.opencl.kernel_calc_conjugate_gradient_direction(
        context.opencl.queue, (int(context._weights_buf_size), ),
        context._gradient_buf, self.beta_buf,
        self.direction_buf, self.prev_gradient_buf)

    # pyopencl.enqueue_read_buffer(context.opencl.queue, self.direction_buf, test, is_blocking=True)

    return self.direction_buf
def add(a, b):
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

    params = struct.pack('ffffi', .5, 10., 0., 0., 3)
    print len(params), struct.calcsize('ffffi')
    params_buf = cl.Buffer(ctx, mf.READ_ONLY, len(params))
    cl.enqueue_write_buffer(queue, params_buf, params).wait()

    global_size = a.shape
    local_size = None
    prg.part3(queue, global_size, local_size, a_buf, b_buf, dest_buf, params_buf)
    queue.finish()

    c = np.empty_like(a)
    cl.enqueue_read_buffer(queue, dest_buf, c).wait()
    return c
def start(self):
    mat_shape = self.a_dev.shape

    rand_start_t = time()
    self.x_host = np.random.uniform(size=mat_shape[1]).astype(np.float32)
    self.rand_t = time() - rand_start_t

    self.x_wr_evt = cl.enqueue_write_buffer(
        self.queue, self.x_dev.data, self.x_host, is_blocking=False)
    self.mv_evt = self.mat_vec_knl(
        self.queue, (mat_shape[0],), (128,),
        self.a_dev.data, self.x_dev.data, self.b_dev.data,
        self.y_host_buf, mat_shape[1])
def set_inputs(self, inputs, is_blocking=True, wait_for=None):
    """
    Setup inputs to input layer.

    @param inputs NumPy.NDArray of float32 values, size equals to neuron count
    """
    return pyopencl.enqueue_write_buffer(self.opencl.queue,
                                         self.context._inputs_buf, inputs,
                                         device_offset=int(self._inputs_offset * 4),
                                         is_blocking=is_blocking,
                                         wait_for=wait_for)
def compute(self, recordNum, pattern, bucketIdx, actValue, learn, infer):
    """
    Computes 1 step
    :param recordNum:
    :param pattern: indices of active columns in the TM layer
    :param classification: dict of bucketIdx and actualValue
    :param learn:
    :param infer:
    :return:
    """
    pattern = np.array(pattern, dtype=cltypes.uint)
    if not self._init_buffers:
        self._setup_buffers(pattern)

    ev_copy_pattern = cl.enqueue_write_buffer(self._queue, self.cl_activeBitIdx, pattern)
    # update bit activations on device side
    ev_update_bit = self._prg.update_bit_activations(self._queue, (pattern.size,), None,
                                                     self.cl_bit_activations, self.cl_activeBitIdx,
                                                     wait_for=[ev_copy_pattern])

    multiStepPredictions = {}
    ev_learn = None
    if learn:
        ev_learn = [self._prg.learn(self._queue, (self.step_count * pattern.size,), None,
                                    self.cl_activeBitIdx, self.cl_table_average,
                                    self.cl_table_counts, self.alpha, self.actValueAlpha,
                                    cltypes.uint(bucketIdx), self._numBuckets,
                                    wait_for=[ev_update_bit])]

    if infer:
        """
        const __global float* averages,
        const __global uint* counts,
        const __global uint* activeBitIdx,
        __global float2* predictions,        // the array of predictions
        __global const uint* bitActivations, // the number of times each bit has been active
        uint const activeBits
        """
        # kernel for every active bit in each step
        ev_infer = self._prg.infer(self._queue, (self._numBuckets,), None,
                                   self.cl_table_average, self.cl_table_counts,
                                   self.cl_activeBitIdx, self.cl_predictions,
                                   self.cl_bit_activations, cltypes.uint(pattern.size),
                                   wait_for=ev_learn)
        cl.enqueue_copy(self._queue, self._predictions, self.cl_predictions,
                        wait_for=[ev_infer]).wait()

    # print("Activations", self.bucket_activations)
    # multiStepPredictions['actualValues'] = predictions['x'] / len(pattern)
    # multiStepPredictions[step] = predictions['y'] / len(pattern)  # the probability for each bucket
    # print("Actual Values", multiStepPredictions['actualValues'])
    multiStepPredictions[1] = self._predictions.copy()
    # print("Probability", multiStepPredictions[1])
    self.bucket_activations[bucketIdx] += 1
    return multiStepPredictions
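# compute() above chains its transfer and kernel launches through wait_for event lists,
# so the device work forms an explicit dependency graph. A stripped-down, self-contained
# sketch of that chaining; the kernel and data here are illustrative only, not from the
# snippet's source.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

host = np.arange(32, dtype=np.float32)
buf = cl.Buffer(ctx, mf.READ_WRITE, host.nbytes)

prg = cl.Program(ctx, """
__kernel void scale(__global float *x) { x[get_global_id(0)] *= 3.0f; }
""").build()

# non-blocking write, then a kernel that waits for it, then a read that waits for the kernel
ev_write = cl.enqueue_copy(queue, buf, host, is_blocking=False)
ev_kernel = prg.scale(queue, host.shape, None, buf, wait_for=[ev_write])
out = np.empty_like(host)
cl.enqueue_copy(queue, out, buf, wait_for=[ev_kernel], is_blocking=True)
print(out[:4])   # -> [0. 3. 6. 9.]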
def FuseRGBD_GPU(self, Image, boneDQ, jointDQ):
    """
    Update the TSDF volume with Image
    :param Image: RGBD image to update to its surfaces
    :param boneDQ: the dual quaternion of bone in new frame
    :param jointDQ: the dual quaternion of joint in new frame
    :param bp: the index of body part
    :return: none
    """
    # initialize buffers
    #cl.enqueue_write_buffer(self.GPUManager.queue, self.Pose_GPU, Tg)
    cl.enqueue_write_buffer(self.GPUManager.queue, self.DepthGPU, Image.depth_image)
    cl.enqueue_write_buffer(self.GPUManager.queue, self.boneDQGPU, boneDQ)
    cl.enqueue_write_buffer(self.GPUManager.queue, self.jointDQGPU, jointDQ)

    # fuse data of the RGBD image with the TSDF volume 3D model
    self.GPUManager.programs['FuseTSDF'].FuseTSDF(
        self.GPUManager.queue, (self.Size[0], self.Size[1]), None,
        self.TSDFGPU, self.DepthGPU, self.Param, self.Size_Volume, self.Pose_GPU,
        self.boneDQGPU, self.jointDQGPU, self.planeF,
        self.Calib_GPU, np.int32(Image.Size[0]), np.int32(Image.Size[1]), self.WeightGPU)

    # update the CPU arrays: read the buffers back into host memory
    cl.enqueue_read_buffer(self.GPUManager.queue, self.TSDFGPU, self.TSDF).wait()
    '''
    # TEST if TSDF contains NaN
    TSDFNaN = np.count_nonzero(np.isnan(self.TSDF))
    print "TSDFNaN : %d" %(TSDFNaN)
    '''
    cl.enqueue_read_buffer(self.GPUManager.queue, self.WeightGPU, self.Weight).wait()
def setBuffers(self, A, B, x0):
    """ Create/set OpenCL required buffers.
    @param A Linear system matrix.
    @param B Independent linear term.
    @param x0 Initial solution estimator.
    """
    # Get dimensions
    shape = np.shape(A)
    if len(shape) != 2:
        raise ValueError('Matrix A must be a 2 dimensional array')
    if shape[0] != shape[1]:
        raise ValueError('Square linear system matrix expected')
    if len(B) != shape[0]:
        raise ValueError('Matrix and independent term dimensions do not match')
    n = len(B)
    # Set x0 if not provided
    if x0 is not None:
        if len(x0) != n:
            raise ValueError('Initial solution estimator length does not match with linear system dimensions')
    if x0 is None:
        x0 = B
    # Create OpenCL objects if not already generated
    if not self.A:
        mf = cl.mem_flags
        self.A = cl.Buffer(self.context, mf.READ_ONLY, size=n*n * np.dtype('float32').itemsize)
        self.B = cl_array.zeros(self.context, self.queue, (n), np.float32)
        self.X0 = cl_array.zeros(self.context, self.queue, (n), np.float32)
        self.X = cl_array.zeros(self.context, self.queue, (n), np.float32)
        self.R = cl_array.zeros(self.context, self.queue, (n), np.float32)
        self.x = np.zeros((n), dtype=np.float32)
        self.n = n
    # Transfer data to buffers
    events = []
    events.append(cl.enqueue_write_buffer(self.queue, self.A, A.reshape((n*n))))
    events.append(cl.enqueue_write_buffer(self.queue, self.B.data, B))
    events.append(cl.enqueue_write_buffer(self.queue, self.X0.data, x0))
    for e in events:
        e.wait()
def compute(self, batch):
    "compute best path for each batch element. Returns blank-terminated label strings for batch elements."

    # measure time in GPU debug mode
    if self.enableGPUDebug:
        t0 = time.time()

    # copy batch to device
    cl.enqueue_write_buffer(self.queue, self.batchBuf, batch.astype(np.float32), is_blocking=False)

    # one pass
    if self.kernelVariant == 1:
        cl.enqueue_nd_range_kernel(self.queue, self.kernel1, (self.batchSize, self.maxT), (1, self.maxT))
    # two passes
    else:
        cl.enqueue_nd_range_kernel(self.queue, self.kernel1, (self.batchSize, self.maxT, self.maxC), (1, 1, self.maxC))
        cl.enqueue_nd_range_kernel(self.queue, self.kernel2, (self.batchSize, ), None)

    # copy result back from GPU and return it
    cl.enqueue_read_buffer(self.queue, self.resBuf, self.res, is_blocking=True)

    # measure time in GPU debug mode
    if self.enableGPUDebug:
        t1 = time.time()
        print('BestPathCL.compute(...) time: ', t1 - t0)

    return self.res
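# The compute() above launches kernels whose arguments were bound beforehand, via
# cl.enqueue_nd_range_kernel rather than the prg.kernel(queue, ...) shortcut. A minimal,
# self-contained sketch of that pattern; the kernel name and data are illustrative and
# not taken from the snippet's source.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

a = np.arange(16, dtype=np.float32)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
__kernel void double_it(__global float *a) {
    int i = get_global_id(0);
    a[i] *= 2.0f;
}
""").build()

knl = prg.double_it          # a cl.Kernel object
knl.set_args(a_buf)          # bind the arguments once ...

# ... then enqueue the kernel explicitly, as compute() does
cl.enqueue_nd_range_kernel(queue, knl, a.shape, None)

out = np.empty_like(a)
cl.enqueue_copy(queue, out, a_buf)
queue.finish()
print(out)                   # -> [0. 2. 4. ...]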
def work():
    while flag[0]:
        work = work_getter()
        if work is None:
            print "Worker starved!"
            yield sleep(1)
            continue
        data, i = work
        self.kernel.search(
            self.commandQueue, (data.size, ), (self.WORKSIZE, ),
            data.state[0], data.state[1], data.state[2], data.state[3],
            data.state[4], data.state[5], data.state[6], data.state[7],
            data.state2[1], data.state2[2], data.state2[3],
            data.state2[5], data.state2[6], data.state2[7],
            data.base[i],
            data.f[0], data.f[1], data.f[2], data.f[3],
            data.f[4], data.f[5], data.f[6], data.f[7],
            self.output_buf)
        cl.enqueue_read_buffer(
            self.commandQueue, self.output_buf, self.output)
        #self.commandQueue.finish()
        yield threads.deferToThread(self.commandQueue.finish)

        # The OpenCL code will flag the last item in the output buffer
        # when it finds a valid nonce. If that's the case, send it to
        # the main thread for postprocessing and clean the buffer
        # for the next pass.
        if self.output[self.OUTPUT_SIZE]:
            self.postprocess(self.output.copy(), data.nr, solution_putter)
            self.output.fill(0)
            cl.enqueue_write_buffer(
                self.commandQueue, self.output_buf, self.output)
def search(self, midstate):
    msg = flipendian32(midstate)
    for i in xrange(8):
        self.sha512_fill.set_arg(i, msg[i * 4:i * 4 + 4])
    self.sha512_fill.set_arg(8, self.hashes_buf)
    self.sha512_fill.set_arg(9, self.keyhash_buf)

    # t1 = time.time()
    cl.enqueue_nd_range_kernel(self.queue, self.sha512_fill, (HASHES_NUM,), (self.sha512_fill_ws,))
    self.queue.finish()
    # print "fill %f" % (time.time() - t1)

    output = bytearray(OUTPUT_SIZE)
    cl.enqueue_write_buffer(self.queue, self.output_buf, output)
    self.queue.finish()

    self.ksearch.set_arg(0, self.hashes_buf)
    self.ksearch.set_arg(1, self.keyhash_buf)
    self.ksearch.set_arg(2, self.output_buf)
    cl.enqueue_nd_range_kernel(self.queue, self.ksearch, (KEYS_NUM,), (self.ksearch_ws,))
    self.queue.finish()

    cl.enqueue_read_buffer(self.queue, self.output_buf, output)
    self.queue.finish()
    return str(output)
def to_buf(self, cl_buf, source=None):
    if source is None:
        if cl_buf in self.buffers:
            cl.enqueue_write_buffer(self.default_queue, cl_buf,
                                    self.buffers[cl_buf]).wait()
        else:
            raise ValueError("Unknown compute buffer and source not specified.")
    else:
        if source.base is not None:
            cl.enqueue_write_buffer(self.default_queue, cl_buf, source.base).wait()
        else:
            cl.enqueue_write_buffer(self.default_queue, cl_buf, source).wait()
def setBuffers(self, fs, waves, sea, bem, body):
    """ Create/set OpenCL required buffers.
    @param fs Free surface instance.
    @param waves Waves instance.
    @param sea Sea boundary instance.
    @param bem Boundary Element Method instance.
    @param body Body instance.
    """
    # Get dimensions
    nFS = fs['N']
    nB = body['N']
    n = nFS + nB
    nW = waves['N']
    # Generate arrays for positions, areas and normals
    pos = np.zeros((n, 4), dtype=np.float32)
    area = np.zeros((n), dtype=np.float32)
    normal = np.zeros((n, 4), dtype=np.float32)
    p = np.zeros((n), dtype=np.float32)
    dp = np.zeros((n), dtype=np.float32)
    w = np.zeros((nW, 4), dtype=np.float32)
    pos[0:nFS] = fs['pos'].reshape((nFS, 4))
    area[0:nFS] = fs['area'].reshape((nFS))
    normal[0:nFS] = fs['normal'].reshape((nFS, 4))
    nx = fs['Nx']
    ny = fs['Ny']
    p[0:n] = bem['p']
    dp[0:n] = bem['gradp']
    w[0:nW] = waves['data']
    # Create OpenCL objects if not already generated
    if not self.A:
        mf = cl.mem_flags
        self.A = cl.Buffer(self.context, mf.WRITE_ONLY, size=n*n * np.dtype('float32').itemsize)
        self.B = cl.Buffer(self.context, mf.WRITE_ONLY, size=n * np.dtype('float32').itemsize)
        self.dB = cl.Buffer(self.context, mf.WRITE_ONLY, size=n * np.dtype('float32').itemsize)
        self.pos = cl.Buffer(self.context, mf.READ_ONLY, size=n*4 * np.dtype('float32').itemsize)
        self.area = cl.Buffer(self.context, mf.READ_ONLY, size=n * np.dtype('float32').itemsize)
        self.normal = cl.Buffer(self.context, mf.READ_ONLY, size=n*4 * np.dtype('float32').itemsize)
        self.bem_p = cl.Buffer(self.context, mf.READ_ONLY, size=n * np.dtype('float32').itemsize)
        self.bem_dp = cl.Buffer(self.context, mf.READ_ONLY, size=n * np.dtype('float32').itemsize)
        self.waves = cl.Buffer(self.context, mf.READ_ONLY, size=nW*4 * np.dtype('float32').itemsize)
    # Transfer data to buffers
    events = []
    events.append(cl.enqueue_write_buffer(self.queue, self.pos, pos))
    events.append(cl.enqueue_write_buffer(self.queue, self.area, area))
    events.append(cl.enqueue_write_buffer(self.queue, self.normal, normal))
    events.append(cl.enqueue_write_buffer(self.queue, self.bem_p, p))
    events.append(cl.enqueue_write_buffer(self.queue, self.bem_dp, dp))
    events.append(cl.enqueue_write_buffer(self.queue, self.waves, w))
    for e in events:
        e.wait()
def FuseRGBD_GPU(self, depth_image, depth_intrinsic, cam_pose, nu):
    """
    Update the TSDF volume with Image
    :param Image: RGBD image to update to its surfaces
    """
    # initialize buffers
    #oneweights = np.ones((self.Size[2], self.Size[1]), dtype=np.int16)
    self.nu = nu
    #self.TSDF = TSDF
    #self.Weight = Weight
    cl.enqueue_write_buffer(self.GPUManager.queue, self.DepthGPU, depth_image)
    # for now oneweights is filled with ones; later try this with the skinning weights
    #cl.enqueue_write_buffer(self.GPUManager.queue, self.WeightGPU, oneweights).wait()
    #cl.enqueue_write_buffer(self.GPUManager.queue, self.VertexGPU, self.VertexArray).wait()
    #cl.enqueue_write_buffer(self.GPUManager.queue, self.boneDQGPU, boneDQ)
    #cl.enqueue_write_buffer(self.GPUManager.queue, self.jointDQGPU, jointDQ)
    #print(np.int32(depth_image.shape[1]))  # the depth image is sent ravelled now, hence this

    # fuse data of the RGBD image with the TSDF volume 3D model
    print("==========DEPTH=fuseRGBD========")
    print(depth_image.shape[0])
    print(depth_image.shape[1])
    print(self.Size[1])
    print(self.Size[2])
    print("=======================")

    #self.GPUManager.programs['FuseTSDF'].FuseTSDF(self.GPUManager.queue, (self.Size[1], self.Size[2]), None, \
    #    self.TSDFGPU, self.WeightGPU, self.DistGPU, self.DepthGPU, self.Pose_GPU, \
    #    self.VoxelGPU, self.VertexGPU, \
    #    self.Calib_GPU, np.int32(depth_shape[0]), np.int32(depth_shape[1]))
    '''
    self.GPUManager.programs['FuseSortedTSDF'].FuseSortedTSDF(self.GPUManager.queue, (self.Size[2], self.Size[1]), None, \
        self.TSDFGPU, self.WeightGPU, self.DistGPU, self.DepthGPU, \
        self.voxelvertexArrayGPU, np.int32(self.nu), \
        self.Calib_GPU, np.int32(depth_shape[0]), np.int32(depth_shape[1]))

    # GPU for Sphere Equation
    #self.GPUManager.programs['FuseEqTSDF'].FuseEqTSDF(self.GPUManager.queue, (self.Size[1], self.Size[2]), None, \
    #    self.TSDFGPU, self.TSDFtableGPU, self.Size_Volume, self.Pose_GPU, \
    #    self.VoxelGPU, self.VertexGPU, \
    #    np.int32(depth_shape[0]), np.int32(depth_shape[1]))

    # update CPU array. Read the buffer to write in the CPU array.
    cl.enqueue_read_buffer(self.GPUManager.queue, self.TSDFGPU, self.TSDF).wait()
    cl.enqueue_read_buffer(self.GPUManager.queue, self.DistGPU, self.Dist).wait()
    cl.enqueue_read_buffer(self.GPUManager.queue, self.WeightGPU, self.Weight).wait()
    #cl.enqueue_read_buffer(self.GPUManager.queue, self.DepthGPU, self.depth).wait()
    #cl.enqueue_read_buffer(self.GPUManager.queue, self.VoxelGPU, self.voxeldogrumu).wait()
    '''

    # FOR CPU calculation
    # self.calculateTSDF(depth_image, depth_intrinsic, cam_pose)
    if (self.maximum_mode):
        self.calculateTSDF_absminmode(depth_image, depth_intrinsic, cam_pose)
    else:
        self.calculateTSDF_ver2(depth_image, depth_intrinsic, cam_pose)
    # np.savetxt(savepath + '/FuseRGBD_CGPU.txt', self.TSDF_vertices, delimiter=',')
    # np.savetxt(savepath + '/FuseRGBD_CDist.txt', self.Dist, delimiter=',')

    # from Sphere Equation
    #self.TSDFeq()
    # np.savetxt(savepath + '/FuseRGBD_CGPU.txt', self.TSDF, delimiter=',')
    # np.savetxt(savepath + '/FuseRGBD_CDist.txt', self.Dist, delimiter=',')
    #np.savetxt('FuseRGBD_WeightGG.txt', self.Weight, delimiter=',')
    #np.savetxt('FuseRGBD_cokyanlis.txt', self.depth, delimiter=',')

    # TEST if TSDF contains NaN
    TSDFNaN = np.count_nonzero(np.isnan(self.TSDF_vertices))
    print("TSDFNaN : {0:d}".format(TSDFNaN))

    return self.TSDF_vertices, self.Weights_vertices
def gpu_bench(self, N):
    ctx = cl.Context(devices=[self.dev])
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    U = numpy.zeros((N, N), numpy.float32)
    V = numpy.zeros((N, N), numpy.float32)
    U[0, ::2] = 1.0
    V[0, 1::2] = 1.0

    U_buf = cl.Buffer(ctx, mf.READ_WRITE, U.nbytes)
    V_buf = cl.Buffer(ctx, mf.READ_WRITE, V.nbytes)
    cl.enqueue_write_buffer(queue, U_buf, U).wait()
    cl.enqueue_write_buffer(queue, V_buf, V).wait()

    knl1 = """
    __kernel void update(__global float *u, __global float *v)
    {
        int i = get_global_id(0) + 1;
        int j = get_global_id(1) + 1;
        int ny = %(NY)d;

        v[ny*j + i] = ((u[ny*(i-1) + j] + u[ny*(i+1) + j])
                     + (u[ny*i + j-1] + u[ny*i + j+1]))*0.25;
    }
    """ % {'NY': N, 'NX': N}

    knl2 = """
    __kernel void update(__global float *u, __global float *v)
    {
        int i = get_global_id(0);
        int j = get_global_id(1);
        int x = get_local_id(0)+1;
        int y = get_local_id(1)+1;
        int lsize = %(lsize)d;
        int ny = %(NY)d;
        __local float tile[324];
        float sum = 0.0f;

        tile[lsize*y + x] = u[ny*j + i];
        barrier(CLK_LOCAL_MEM_FENCE);

        sum += tile[lsize*(y-1) + x];
        sum += tile[lsize*(y+1) + x];
        sum += tile[lsize*y + x + 1];
        sum += tile[lsize*y + x - 1];
        sum *= 0.25f;
        v[ny*j + i] = sum;
    }
    """ % {'NY': N, 'NX': N, 'lsize': 18}

    prg = cl.Program(ctx, knl2)
    prg.build()

    yield 0

    reps = 100 if N <= 1000 else 10
    ct = 0
    while True:
        for i in xrange(reps):
            evt = prg.update(queue, ((N), (N)), (16, 16),
                             U_buf, V_buf, g_times_l=False)
            evt = prg.update(queue, ((N), (N)), (16, 16),
                             V_buf, U_buf, wait_for=[evt], g_times_l=False)
            ct += 1
        queue.finish()
        yield ct
def mining_thread(self):
    say_line('started OpenCL miner on platform %d, device %d (%s)',
             (self.options.platform, self.device_index, self.device_name))

    (self.defines, rate_divisor, hashspace) = if_else(self.vectors, ('-DVECTORS', 500, 0x7FFFFFFF), ('', 1000, 0xFFFFFFFF))
    self.defines += (' -DOUTPUT_SIZE=' + str(self.output_size))
    self.defines += (' -DOUTPUT_MASK=' + str(self.output_size - 1))

    self.load_kernel()
    frame = 1.0 / max(self.frames, 3)
    unit = self.worksize * 256
    global_threads = unit * 10

    queue = cl.CommandQueue(self.context)

    last_rated_pace = last_rated = last_n_time = last_temperature = time()
    base = last_hash_rate = threads_run_pace = threads_run = 0
    output = np.zeros(self.output_size + 1, np.uint32)
    output_buffer = cl.Buffer(self.context, cl.mem_flags.WRITE_ONLY | cl.mem_flags.USE_HOST_PTR, hostbuf=output)
    self.kernel.set_arg(20, output_buffer)

    work = None
    temperature = 0
    while True:
        if self.should_stop:
            return

        sleep(self.frameSleep)

        if (not work) or (not self.work_queue.empty()):
            try:
                work = self.work_queue.get(True, 1)
            except Empty:
                continue
            else:
                if not work:
                    continue
                nonces_left = hashspace
                state = work.state
                state2 = work.state2
                f = work.f

                self.kernel.set_arg(0, state[0])
                self.kernel.set_arg(1, state[1])
                self.kernel.set_arg(2, state[2])
                self.kernel.set_arg(3, state[3])
                self.kernel.set_arg(4, state[4])
                self.kernel.set_arg(5, state[5])
                self.kernel.set_arg(6, state[6])
                self.kernel.set_arg(7, state[7])

                self.kernel.set_arg(8, state2[1])
                self.kernel.set_arg(9, state2[2])
                self.kernel.set_arg(10, state2[3])
                self.kernel.set_arg(11, state2[5])
                self.kernel.set_arg(12, state2[6])
                self.kernel.set_arg(13, state2[7])

                self.kernel.set_arg(15, f[0])
                self.kernel.set_arg(16, f[1])
                self.kernel.set_arg(17, f[2])
                self.kernel.set_arg(18, f[3])
                self.kernel.set_arg(19, f[4])

        if temperature < self.cutoff_temp:
            self.kernel.set_arg(14, pack('I', base))
            cl.enqueue_nd_range_kernel(queue, self.kernel, (global_threads,), (self.worksize,))

            nonces_left -= global_threads
            threads_run_pace += global_threads
            threads_run += global_threads
            base = uint32(base + global_threads)
        else:
            threads_run_pace = 0
            last_rated_pace = time()
            sleep(self.cutoff_interval)

        now = time()
        if self.adapterIndex != None:
            t = now - last_temperature
            if temperature >= self.cutoff_temp or t > 1:
                last_temperature = now
                with adl_lock:
                    temperature = self.get_temperature()

        t = now - last_rated_pace
        if t > 1:
            rate = (threads_run_pace / t) / rate_divisor
            last_rated_pace = now; threads_run_pace = 0
            r = last_hash_rate / rate
            if r < 0.9 or r > 1.1:
                global_threads = max(unit * int((rate * frame * rate_divisor) / unit), unit)
                last_hash_rate = rate

        t = now - last_rated
        if t > self.options.rate:
            self.update_rate(now, threads_run, t, work.targetQ, rate_divisor)
            last_rated = now; threads_run = 0

        queue.finish()
        cl.enqueue_read_buffer(queue, output_buffer, output)
        queue.finish()

        if output[self.output_size]:
            result = Object()
            result.header = work.header
            result.merkle_end = work.merkle_end
            result.time = work.time
            result.difficulty = work.difficulty
            result.target = work.target
            result.state = np.array(state)
            result.nonces = np.array(output)
            result.job_id = work.job_id
            result.extranonce2 = work.extranonce2
            result.server = work.server
            result.miner = self
            self.switch.put(result)
            output.fill(0)
            cl.enqueue_write_buffer(queue, output_buffer, output)

        if not self.switch.update_time:
            if nonces_left < 3 * global_threads * self.frames:
                self.update = True
                nonces_left += 0xFFFFFFFFFFFF
            elif 0xFFFFFFFFFFF < nonces_left < 0xFFFFFFFFFFFF:
                say_line('warning: job finished, %s is idle', self.id())
                work = None
        elif now - last_n_time > 1:
            work.time = bytereverse(bytereverse(work.time) + 1)
            state2 = partial(state, work.merkle_end, work.time, work.difficulty, f)
            calculateF(state, work.merkle_end, work.time, work.difficulty, f, state2)
            self.kernel.set_arg(8, state2[1])
            self.kernel.set_arg(9, state2[2])
            self.kernel.set_arg(10, state2[3])
            self.kernel.set_arg(11, state2[5])
            self.kernel.set_arg(12, state2[6])
            self.kernel.set_arg(13, state2[7])
            self.kernel.set_arg(15, f[0])
            self.kernel.set_arg(16, f[1])
            self.kernel.set_arg(17, f[2])
            self.kernel.set_arg(18, f[3])
            self.kernel.set_arg(19, f[4])
            last_n_time = now
            self.update_time_counter += 1
            if self.update_time_counter >= self.switch.max_update_time:
                self.update = True
                self.update_time_counter = 1
def load_images(self, rgb, depth):
    cl.enqueue_write_buffer(self.queue, self.rgb_cl, rgb)
    cl.enqueue_write_buffer(self.queue, self.depth_cl, depth)
    self.queue.finish()
class Array(object):
    """A :mod:`pyopencl` Array is used to do array-based calculation on
    a compute device.

    This is mostly supposed to be a :mod:`numpy`-workalike. Operators
    work on an element-by-element basis, just like :class:`numpy.ndarray`.
    """

    def __init__(self, context, shape, dtype, order="C", allocator=None,
                 base=None, data=None, queue=None):
        if allocator is None:
            allocator = DefaultAllocator(context)

        try:
            s = 1
            for dim in shape:
                s *= dim
        except TypeError:
            if not isinstance(shape, int):
                raise TypeError("shape must either be iterable or "
                                "castable to an integer")
            s = shape
            shape = (shape,)

        self.context = context
        self.queue = queue

        self.shape = shape
        self.dtype = numpy.dtype(dtype)
        if order not in ["C", "F"]:
            raise ValueError("order must be either 'C' or 'F'")
        self.order = order

        self.mem_size = self.size = s
        self.nbytes = self.dtype.itemsize * self.size

        self.allocator = allocator
        if data is None:
            if self.size:
                self.data = self.allocator(self.size * self.dtype.itemsize)
            else:
                self.data = None

            if base is not None:
                raise ValueError("If data is specified, base must be None.")
        else:
            self.data = data

        self.base = base

    #@memoize_method FIXME: reenable
    def get_sizes(self, queue):
        return splay(queue, self.mem_size)

    def set(self, ary, queue=None, async=False):
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        if self.size:
            evt = cl.enqueue_write_buffer(queue or self.queue, self.data, ary)

            if not async:
                evt.wait()
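# The Array.set() method above is the ancestor of today's pyopencl.array.Array API.
# A short sketch of the equivalent host/device round trip with the current module;
# the data here is purely illustrative.
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

host = np.arange(8, dtype=np.float32)
dev = cl_array.to_device(queue, host)   # allocate + copy host -> device

dev.set(host * 2)                       # overwrite the device contents, like Array.set()
print(dev.get())                        # copy device -> host: [0. 2. 4. ...]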
def load_raw(depth):
    (L, T), (R, B) = rect
    assert depth.dtype == np.float32
    assert depth.shape[0] == B - T
    assert depth.shape[1] == R - L
    return cl.enqueue_write_buffer(queue, raw_buf, depth, is_blocking=False)
def load_mask(mask):
    (L, T), (R, B) = rect
    assert mask.dtype == np.uint8
    assert mask.shape[0] == B - T
    assert mask.shape[1] == R - L
    return cl.enqueue_write_buffer(queue, mask_buf, mask, is_blocking=False)
#print f2
#print ftime[f1]
#print ftime[f2]

# read new frames and push to kernel
# note since the data in f2 becomes f1 we do not need to push two frames
# every time we go to a new interval. By using an additional flag we can
# set the linear interpolation in the kernel to be correct whether or not
# f1 is behind or in front of f2
if (f1 != flast):
    flast = f1
    uf1 = fin.var('u')[f1, :]
    vf1 = fin.var('v')[f1, :]
    uf2 = fin.var('u')[f2, :]
    vf2 = fin.var('v')[f2, :]
    cl.enqueue_write_buffer(a_queue, uf1_buf, uf1).wait()
    cl.enqueue_write_buffer(a_queue, vf1_buf, vf1).wait()
    cl.enqueue_write_buffer(a_queue, uf2_buf, uf2).wait()
    cl.enqueue_write_buffer(a_queue, vf2_buf, vf2).wait()

#---------------------------------------------------------------------------
# find cell containing particle and update state
#---------------------------------------------------------------------------
event = findcell_knl(a_queue, x.shape, None,
                     cell_buf, neney_buf, eney_buf,
                     x_buf, y_buf, xt_buf, yt_buf)
#cl.enqueue_read_buffer(a_queue, cell_buf, cell).wait()
event = findrobust_knl(a_queue, x.shape, None,
                       cell_buf, x_buf, y_buf, xt_buf, yt_buf, num_elems)
def run_with_ocl(np_in_list, np_out_list, shape, oclfun,
                 preset_outbuf=False, rw_outbuf=False):
    assert (DEFAULT_ENABLE_OPENCL == True and OCL.ENABLE_OPENCL == True)
    t_start = time.time()

    buf_in_list = [cl.Buffer(OCL.prog['ctx'],
                             OCL.prog['mf'].READ_ONLY | OCL.prog['mf'].COPY_HOST_PTR,
                             size=np_in.nbytes, hostbuf=np_in)
                   for np_in in np_in_list]

    if preset_outbuf:
        buf_out_list = [cl.Buffer(OCL.prog['ctx'],
                                  (OCL.prog['mf'].READ_WRITE if rw_outbuf else OCL.prog['mf'].WRITE_ONLY) | OCL.prog['mf'].USE_HOST_PTR,
                                  size=np_out.nbytes, hostbuf=np_out)
                        for np_out in np_out_list]
        for i, outbuf in enumerate(buf_out_list):
            cl.enqueue_write_buffer(OCL.prog['queue'], outbuf, np_out_list[i])
    else:
        buf_out_list = [cl.Buffer(OCL.prog['ctx'],
                                  (OCL.prog['mf'].READ_WRITE if rw_outbuf else OCL.prog['mf'].WRITE_ONLY) | OCL.prog['mf'].ALLOC_HOST_PTR,
                                  size=np_out.nbytes)
                        for np_out in np_out_list]

    # queue, globalSize, localSize, *buffers
    runevent = oclfun(OCL.prog['queue'], shape, None, *(buf_in_list + buf_out_list))
    runevent.wait()

    for buf_from, np_to in zip(buf_out_list, np_out_list):
        # cl.enqueue_copy(OCL.prog['queue'], np_to, buf_from)
        cl.enqueue_read_buffer(OCL.prog['queue'], buf_from, np_to)
        # np_to1, e = cl.enqueue_map_buffer(
        #     OCL.prog['queue'], buf_from, cl.map_flags.READ,
        #     0, np_to.shape, np_to.dtype, "C")
        # np_to += np_to1

    OCL.prog['queue'].finish()
    t_end = time.time()
    dt_real = t_end - t_start
def miningThread(self):
    self.loadKernel()
    frame = 1.0 / self.frames
    unit = self.worksize * 256
    globalThreads = unit * 10

    queue = cl.CommandQueue(self.context)

    lastRatedPace = lastRated = lastNTime = time()
    base = lastHashRate = threadsRunPace = threadsRun = 0
    f = np.zeros(8, np.uint32)
    output = np.zeros(OUTPUT_SIZE + 1, np.uint32)
    output_buf = cl.Buffer(self.context,
                           cl.mem_flags.WRITE_ONLY | cl.mem_flags.USE_HOST_PTR,
                           hostbuf=output)

    work = None
    while True:
        sleep(self.frameSleep)
        if self.stop:
            return
        if (not work) or (not self.workQueue.empty()):
            try:
                work = self.workQueue.get(True, 1)
            except Empty:
                continue
            else:
                if not work:
                    continue
                noncesLeft = self.hashspace
                data = np.array(unpack('IIIIIIIIIIIIIIII', work['data'][128:].decode('hex')), dtype=np.uint32)
                state = np.array(unpack('IIIIIIII', work['midstate'].decode('hex')), dtype=np.uint32)
                target = np.array(unpack('IIIIIIII', work['target'].decode('hex')), dtype=np.uint32)
                state2 = partial(state, data, f)

        self.miner.search(queue, (globalThreads, ), (self.worksize, ),
                          state[0], state[1], state[2], state[3],
                          state[4], state[5], state[6], state[7],
                          state2[1], state2[2], state2[3],
                          state2[5], state2[6], state2[7],
                          pack('I', base),
                          f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7],
                          output_buf)
        cl.enqueue_read_buffer(queue, output_buf, output)

        noncesLeft -= globalThreads
        threadsRunPace += globalThreads
        threadsRun += globalThreads
        base = uint32(base + globalThreads)

        now = time()
        t = now - lastRatedPace
        if (t > 1):
            rate = (threadsRunPace / t) / self.rateDivisor
            lastRatedPace = now
            threadsRunPace = 0
            r = lastHashRate / rate
            if r < 0.9 or r > 1.1:
                globalThreads = max(unit * int((rate * frame * self.rateDivisor) / unit), unit)
                lastHashRate = rate
        t = now - lastRated
        if (t > self.rate):
            self.hashrate(int((threadsRun / t) / self.rateDivisor))
            lastRated = now
            threadsRun = 0

        queue.finish()

        if output[OUTPUT_SIZE]:
            result = {}
            result['work'] = work
            result['data'] = np.array(data)
            result['state'] = np.array(state)
            result['target'] = target
            result['output'] = np.array(output)
            self.resultQueue.put(result)
            output.fill(0)
            cl.enqueue_write_buffer(queue, output_buf, output)

        if self.updateTime == '':
            if noncesLeft < (TIMEOUT + 1) * globalThreads * self.frames:
                self.update = True
                noncesLeft += 0xFFFFFFFFFFFF
            elif 0xFFFFFFFFFFF < noncesLeft < 0xFFFFFFFFFFFF:
                self.sayLine('warning: job finished, miner is idle')
                work = None
        elif now - lastNTime > 1:
            data[1] = bytereverse(bytereverse(data[1]) + 1)
            state2 = partial(state, data, f)
            lastNTime = now
def run(self):
    frame = float(1) / float(self.frames)
    window = frame / 30
    upper = frame + window
    lower = frame - window

    unit = self.worksize * 256
    globalThreads = unit

    queue = cl.CommandQueue(self.context)

    base = lastRate = threadsRun = lastNTime = 0
    output = np.zeros(2, np.uint32)
    output_buf = cl.Buffer(self.context,
                           cl.mem_flags.WRITE_ONLY | cl.mem_flags.USE_HOST_PTR,
                           hostbuf=output)

    work = None
    while True:
        if (not work) or (not self.workQueue.empty()):
            try:
                work = self.workQueue.get(True, 1)
            except Empty:
                continue
            else:
                if not work:
                    continue
                elif work == 'stop':
                    return
                try:
                    data = np.array(unpack('IIIIIIIIIIIIIIII', work['data'][128:].decode('hex')), dtype=np.uint32)
                    state = np.array(unpack('IIIIIIII', work['midstate'].decode('hex')), dtype=np.uint32)
                    target = np.array(unpack('IIIIIIII', work['target'].decode('hex')), dtype=np.uint32)
                    state2 = partial(state, data)
                except Exception as e:
                    self.sayLine('Wrong data format from RPC!')
                    sys.exit()

        kernelStart = time()
        self.miner.search(queue, (globalThreads, ), (self.worksize, ),
                          data[0], data[1], data[2],
                          state[0], state[1], state[2], state[3],
                          state[4], state[5], state[6], state[7],
                          state2[1], state2[2], state2[3],
                          state2[5], state2[6], state2[7],
                          target[6], target[7],
                          pack('I', base), output_buf)
        cl.enqueue_read_buffer(queue, output_buf, output)

        if (time() - lastRate > self.rate):
            self.say('%s khash/s', int((threadsRun / (time() - lastRate)) / self.rateDivisor))
            threadsRun = 0
            lastRate = time()

        queue.finish()
        kernelTime = time() - kernelStart

        threadsRun += globalThreads
        base = uint32(base + globalThreads)

        if (kernelTime < lower):
            globalThreads += unit
        elif (kernelTime > upper and globalThreads > unit):
            globalThreads -= unit

        if output[0]:
            result = {}
            d = work['data']
            d = d[:136] + pack('I', long(data[1])).encode('hex') + d[144:152] + \
                pack('I', long(output[1])).encode('hex') + d[160:]
            result['data'] = d
            result['hash'] = output[0]
            self.resultQueue.put(result)
            output[0] = 0
            cl.enqueue_write_buffer(queue, output_buf, output)
            work = None
            continue

        if (time() - lastNTime > 1):
            data[1] = bytereverse(bytereverse(data[1]) + 1)
            state2 = partial(state, data)
            lastNTime = time()
import pyopencl as cl, numpy
import numpy.linalg as la

a = numpy.arange(16).astype(numpy.float32)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

a_dev = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=a.nbytes)
cl.enqueue_write_buffer(queue, a_dev, a)

prg = cl.Program(ctx, """
    #pragma OPENCL EXTENSION cl_intel_printf : enable
    #pragma OPENCL EXTENSION cl_amd_printf : enable

    __kernel void twice(__global float *a)
    {
        int i = get_local_id(0);
        int local_size = get_local_size(0);
        int group_id = get_group_id(0);

        a[i + local_size * group_id] *= 2;
        //a[i] = float(local_size);
        printf(" -> \n");
    }
    """).build()

prg.twice(queue, a.shape, (4, ), a_dev)

result = numpy.empty_like(a)
cl.enqueue_read_buffer(queue, a_dev, result).wait()
def to_buf_async(self, cl_buf, stream=None):
    queue = stream.queue if stream is not None else self.default_queue
    cl.enqueue_write_buffer(queue, cl_buf, self.buffers[cl_buf], is_blocking=False)
def process_sub_matrix(self, *args, **kwargs):
    device = kwargs['device']
    sub_matrix_queue = kwargs['sub_matrix_queue']

    context = self.opencl.contexts[device]
    command_queue = self.opencl.command_queues[device]
    program = self.opencl.programs[device]

    vertical_kernel = cl.Kernel(program, 'vertical')
    diagonal_kernel = cl.Kernel(program, self.settings.diagonal_kernel_name)

    while True:
        try:
            sub_matrix = sub_matrix_queue.get(False)

            transfer_from_device_events = []
            transfer_to_device_events = []
            create_matrix_events = []
            vertical_events = []
            diagonal_events = []

            # Vectors X
            vectors_x = self.get_vectors_x(sub_matrix)
            vectors_x_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY,
                                         vectors_x.size * vectors_x.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue, vectors_x_buffer, vectors_x,
                                        device_offset=0, wait_for=None, is_blocking=False))

            # Vectors Y
            vectors_y = self.get_vectors_y(sub_matrix)
            vectors_y_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY,
                                         vectors_y.size * vectors_y.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue, vectors_y_buffer, vectors_y,
                                        device_offset=0, wait_for=None, is_blocking=False))

            # Recurrence points
            recurrence_points, \
                recurrence_points_start, \
                recurrence_points_end = self.get_recurrence_points(sub_matrix)
            recurrence_points_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                                                 recurrence_points.size * recurrence_points.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue, recurrence_points_buffer, recurrence_points,
                                        device_offset=0, wait_for=None, is_blocking=False))

            # Vertical frequency distribution
            vertical_frequency_distribution = self.get_empty_local_frequency_distribution()
            vertical_frequency_distribution_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                                                               vertical_frequency_distribution.size * vertical_frequency_distribution.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue, vertical_frequency_distribution_buffer,
                                        vertical_frequency_distribution,
                                        device_offset=0, wait_for=None, is_blocking=False))

            # White vertical frequency distribution
            white_vertical_frequency_distribution = self.get_empty_local_frequency_distribution()
            white_vertical_frequency_distribution_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                                                                     white_vertical_frequency_distribution.size * white_vertical_frequency_distribution.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue, white_vertical_frequency_distribution_buffer,
                                        white_vertical_frequency_distribution,
                                        device_offset=0, wait_for=None, is_blocking=False))

            # Diagonal frequency distribution
            diagonal_frequency_distribution = self.get_empty_local_frequency_distribution()
            diagonal_frequency_distribution_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                                                               diagonal_frequency_distribution.size * diagonal_frequency_distribution.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue, diagonal_frequency_distribution_buffer,
                                        diagonal_frequency_distribution,
                                        device_offset=0, wait_for=None, is_blocking=False))

            # Vertical carryover
            vertical_carryover, \
                vertical_carryover_start, \
                vertical_carryover_end = self.get_vertical_length_carryover(sub_matrix)
            vertical_carryover_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                                                  vertical_carryover.size * vertical_carryover.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue, vertical_carryover_buffer, vertical_carryover,
                                        device_offset=0, wait_for=None, is_blocking=False))

            # White vertical carryover
            white_vertical_carryover, \
                white_vertical_carryover_start, \
                white_vertical_carryover_end = self.get_white_vertical_length_carryover(sub_matrix)
            white_vertical_carryover_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                                                        white_vertical_carryover.size * white_vertical_carryover.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue, white_vertical_carryover_buffer, white_vertical_carryover,
                                        device_offset=0, wait_for=None, is_blocking=False))

            # Diagonal carryover
            diagonal_carryover, \
                diagonal_carryover_start, \
                diagonal_carryover_end = self.get_diagonal_length_carryover(sub_matrix)
            diagonal_carryover_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                                                  diagonal_carryover.size * diagonal_carryover.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue, diagonal_carryover_buffer, diagonal_carryover,
                                        device_offset=0, wait_for=None, is_blocking=False))

            command_queue.finish()

            # Vertical kernel
            vertical_args = [vectors_x_buffer,
                             vectors_y_buffer,
                             np.uint32(sub_matrix.dim_x),
                             np.uint32(sub_matrix.dim_y),
                             np.uint32(self.settings.embedding_dimension),
                             np.float32(self.settings.neighbourhood.radius),
                             recurrence_points_buffer,
                             vertical_frequency_distribution_buffer,
                             vertical_carryover_buffer,
                             white_vertical_frequency_distribution_buffer,
                             white_vertical_carryover_buffer]

            OpenCL.set_kernel_args(vertical_kernel, vertical_args)

            global_work_size = [int(sub_matrix.dim_x + (device.max_work_group_size - (sub_matrix.dim_x % device.max_work_group_size)))]
            local_work_size = None

            vertical_events.append(
                cl.enqueue_nd_range_kernel(command_queue, vertical_kernel, global_work_size, local_work_size))

            command_queue.finish()

            # Diagonal kernel
            if self.settings.is_matrix_symmetric:
                diagonal_args = [vectors_x_buffer,
                                 vectors_y_buffer,
                                 np.uint32(sub_matrix.dim_x),
                                 np.uint32(sub_matrix.dim_y),
                                 np.uint32(sub_matrix.start_x),
                                 np.uint32(sub_matrix.start_y),
                                 np.uint32(self.settings.embedding_dimension),
                                 np.float32(self.settings.neighbourhood.radius),
                                 np.uint32(self.settings.theiler_corrector),
                                 np.uint32(self.get_diagonal_offset(sub_matrix)),
                                 diagonal_frequency_distribution_buffer,
                                 diagonal_carryover_buffer]

                global_work_size = [int(sub_matrix.dim_x + (device.max_work_group_size - (sub_matrix.dim_x % device.max_work_group_size)))]
            else:
                diagonal_args = [vectors_x_buffer,
                                 vectors_y_buffer,
                                 np.uint32(sub_matrix.dim_x + sub_matrix.dim_y - 1),
                                 np.uint32(sub_matrix.dim_y),
                                 np.uint32(sub_matrix.start_x),
                                 np.uint32(sub_matrix.start_y),
                                 np.uint32(self.settings.embedding_dimension),
                                 np.float32(self.settings.neighbourhood.radius),
                                 np.uint32(self.settings.theiler_corrector),
                                 diagonal_frequency_distribution_buffer,
                                 diagonal_carryover_buffer]

                global_work_size_x = sub_matrix.dim_x + sub_matrix.dim_y - 1
                global_work_size = [int(global_work_size_x + (device.max_work_group_size - (global_work_size_x % device.max_work_group_size)))]

            OpenCL.set_kernel_args(diagonal_kernel, diagonal_args)

            local_work_size = None
            diagonal_events.append(
                cl.enqueue_nd_range_kernel(command_queue, diagonal_kernel, global_work_size, local_work_size))

            command_queue.finish()

            # Read buffers
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(command_queue, recurrence_points_buffer,
                                       self.recurrence_points[recurrence_points_start:recurrence_points_end],
                                       device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(command_queue, vertical_frequency_distribution_buffer,
                                       vertical_frequency_distribution,
                                       device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(command_queue, vertical_carryover_buffer,
                                       self.vertical_length_carryover[vertical_carryover_start:vertical_carryover_end],
                                       device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(command_queue, white_vertical_frequency_distribution_buffer,
                                       white_vertical_frequency_distribution,
                                       device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(command_queue, white_vertical_carryover_buffer,
                                       self.white_vertical_length_carryover[white_vertical_carryover_start:white_vertical_carryover_end],
                                       device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(command_queue, diagonal_frequency_distribution_buffer,
                                       diagonal_frequency_distribution,
                                       device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(command_queue, diagonal_carryover_buffer,
                                       self.diagonal_length_carryover[diagonal_carryover_start:diagonal_carryover_end],
                                       device_offset=0, wait_for=None, is_blocking=False))

            command_queue.finish()

            # Update frequency distributions
            self.threads_vertical_frequency_distribution[device] += vertical_frequency_distribution
            self.threads_white_vertical_frequency_distribution[device] += white_vertical_frequency_distribution
            self.threads_diagonal_frequency_distribution[device] += diagonal_frequency_distribution

            # Get events runtimes
            runtimes = Runtimes()
            runtimes.transfer_to_device = self.opencl.convert_events_runtime(transfer_to_device_events)
            runtimes.transfer_from_device = self.opencl.convert_events_runtime(transfer_from_device_events)
            runtimes.create_matrix = self.opencl.convert_events_runtime(create_matrix_events)
            runtimes.detect_vertical_lines = self.opencl.convert_events_runtime(vertical_events)
            runtimes.detect_diagonal_lines = self.opencl.convert_events_runtime(diagonal_events)

            self.threads_runtimes[device] += runtimes

        except Queue.Empty:
            break
'''
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

m = 2**13
n = 2**13

#workgroup_size = None
# Slow
# workgroup_size = (1,1)
workgroup_size = (2**4, 2**4)

a = np.arange(m * n, dtype=np.float32).reshape((m, n))

a_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=a.nbytes)
cl.enqueue_write_buffer(queue, a_buf, a)

prg = cl.Program(ctx, """
    __kernel void transpose(
        __global float *a,
        __global float *a_t
    ){
        int m = get_global_size(0);
        int n = get_global_size(1);

        int i = get_global_id (0);
        int j = get_global_id (1);

        int read_idx = j + i * n;
def write_array(self, data, **kwargs):
    queue = get_device().queue
    return cl.enqueue_write_buffer(queue, self.data, data, **kwargs)
# note since the data in f2 becomes f1 we do not need to push two frames
# every time we go to a new interval. By using an additional flag we can
# set the linear interpolation in the kernel to be correct whether or not
# f1 is behind or in front of f2
if (f1 != flast):
    flast = f1
    #uf1_buf = uf2_buf
    #vf1_buf = vf2_buf
    uf2 = fin.var('ua')[f2, :]
    vf2 = fin.var('va')[f2, :]

    if (behind[0] == 1):
        event = cl.enqueue_write_buffer(a_queue, uf2_buf, uf2, is_blocking=False)
        #event.wait()
        #elap_wf2 = elap_wf2 + 1e-9*(event.profile.end - event.profile.start)
        event = cl.enqueue_write_buffer(b_queue, vf2_buf, vf2, is_blocking=False)
        #event.wait()
        #elap_wf2 = elap_wf2 + 1e-9*(event.profile.end - event.profile.start)
        behind = numpy.zeros(1, dtype=dtype_int)
    else:
        event = cl.enqueue_write_buffer(a_queue, uf1_buf,
def load_filt(filt):
    (L, T), (R, B) = rect
    assert filt.dtype == np.float32
    assert filt.shape[0] == B - T
    assert filt.shape[1] == R - L
    return cl.enqueue_write_buffer(queue, filt_buf, filt, is_blocking=False)
    sys.exit()

# Allocation
f = np.zeros((nx, ny, nz), 'f')
cf = np.ones_like(f) * 0.5
mf = cl.mem_flags

eh_gpus = []
ce_gpus = []
for i, queue in enumerate(queues):
    eh_gpus.append([cl.Buffer(context, mf.READ_WRITE, f.nbytes) for m in range(6)])
    ce_gpus.append([cl.Buffer(context, mf.READ_ONLY, cf.nbytes) for m in range(3)])

    for j in xrange(6):
        cl.enqueue_write_buffer(queue, eh_gpus[i][j], f)
    for j in xrange(3):
        cl.enqueue_write_buffer(queue, ce_gpus[i][j], cf)

b_offset = (nx - 1) * ny * nz * np.nbytes['float32']
tmpfs = []
for i, queue in enumerate(queues):
    #tmpfs.append([cl.Buffer(context, mf.READ_WRITE | mf.ALLOC_HOST_PTR, f.nbytes/nx) for m in range(2)])
    tmpfs.append([np.zeros((ny, nz), dtype=np.float32) for m in range(2)])

# Program and Kernel
Ls = 256
Gs = get_optimal_global_work_size(device)
print('Ls = %d, Gs = %d' % (Ls, Gs))
kern = open('./fdtd3d.cl').read()