def cart2d_to_pol2d(self, M):
    """Project a 2D Cartesian image M onto the 2D polar basis (Q1, Q2) on the GPU.

    Uploads M, zero-fills the accumulator buffers, runs the projection
    kernel over the radial grid, and reads the results back.  Q1 is
    normalized so that the radial weights sum to 1; Q2 is returned as
    accumulated by the kernel (unnormalized).
    """
    Q1 = np.zeros((len(self.R), ), dtype=np.float64)
    Q2 = np.zeros((len(self.R), len(self.A)), dtype=np.float64)
    norm = np.zeros((len(self.R), ), dtype=np.float64)
    # Upload the input image to the device.
    cl.enqueue_copy(self.queue, self.buf_M, M)
    # Zero the accumulators before the kernel adds into them.
    cl.enqueue_fill_buffer(self.queue, self.buf_Q1e, np.float64(0.0), 0, Q1.nbytes)
    cl.enqueue_fill_buffer(self.queue, self.buf_Q2e, np.float64(0.0), 0, Q2.nbytes)
    cl.enqueue_fill_buffer(self.queue, self.buf_norm, np.float64(0.0), 0, norm.nbytes)
    # One work-item per radius.
    self.prg.cart2d_to_pol2d_project(self.queue, (self.r_len, 1), None,
                                     self.buf_fmt, self.buf_M,
                                     self.buf_Q1e, self.buf_Q2e,
                                     self.buf_norm)
    cl.enqueue_copy(self.queue, Q1, self.buf_Q1e)
    cl.enqueue_copy(self.queue, Q2, self.buf_Q2e)
    cl.enqueue_copy(self.queue, norm, self.buf_norm)
    # Barrier + wait: make sure all the copies above have completed
    # before touching the host arrays.
    cl.enqueue_barrier(self.queue).wait()
    Q1 /= norm.sum()
    return Q1, Q2
def postproc_depose_scalar(self, fld):
    """Post-process a freshly deposited scalar field.

    First corrects the deposition on the axis for every azimuthal mode
    (mode 0 uses the real kernel, higher modes the complex one), then
    applies the 1/dV radial weighting to the whole grid.
    """
    mode_bufs = [self.DataDev[fld + '_m' + str(m)].data
                 for m in range(self.Args['M'] + 1)]

    # --- Correct near-axis deposition ---
    WGS, WGS_tot = self.get_wgs(self.Args['Nx'])
    # NOTE(review): `enqueue_barrier` is used unqualified here, unlike
    # `cl.enqueue_barrier` elsewhere -- presumably imported directly at
    # module level; confirm at file top.
    enqueue_barrier(self.queue)
    self._treat_axis_d_knl(self.queue, (WGS_tot, ), (WGS, ),
                           mode_bufs[0], np.uint32(self.Args['Nx'])).wait()
    for m in range(1, self.Args['M'] + 1):
        self._treat_axis_c_knl(self.queue, (WGS_tot, ), (WGS, ),
                               mode_bufs[m], np.uint32(self.Args['Nx'])).wait()

    # --- Divide by the cell volume (radius weighting) ---
    WGS, WGS_tot = self.get_wgs(self.Args['NxNr'])
    cell_args = [self.DataDev[key].data for key in ('NxNr', 'Nx', 'dV_inv')]
    enqueue_barrier(self.queue)
    self._divide_by_dv_d_knl(self.queue, (WGS_tot, ), (WGS, ),
                             mode_bufs[0], *cell_args).wait()
    for m in range(1, self.Args['M'] + 1):
        self._divide_by_dv_c_knl(self.queue, (WGS_tot, ), (WGS, ),
                                 mode_bufs[m], *cell_args).wait()
def debayer_frame(ctx, debayer_prg, data, rgb=False):
    """Debayer one raw camera frame on the GPU.

    Runs the debayer10 kernel over the chroma grid and returns either the
    (y, u, v) planes or, when rgb=True, the converted RGB image.
    """
    queue = cl.CommandQueue(ctx)

    y_size = FRAME_WIDTH * FRAME_HEIGHT
    total_size = y_size + UV_SIZE * 2
    yuv_buff = np.empty(total_size, dtype=np.uint8)

    cam_g = cl.Buffer(ctx,
                      cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                      hostbuf=data)
    yuv_g = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, total_size)

    # TICI hardware supports the larger work-group size.
    local_worksize = (20, 20) if TICI else (4, 4)
    ev1 = debayer_prg.debayer10(queue, (UV_WIDTH, UV_HEIGHT),
                                local_worksize, cam_g, yuv_g)
    cl.enqueue_copy(queue, yuv_buff, yuv_g, wait_for=[ev1]).wait()
    cl.enqueue_barrier(queue)

    # Slice the packed buffer into the three planes.
    y = yuv_buff[:y_size].reshape((FRAME_HEIGHT, FRAME_WIDTH))
    u = yuv_buff[y_size:y_size + UV_SIZE].reshape((UV_HEIGHT, UV_WIDTH))
    v = yuv_buff[y_size + UV_SIZE:].reshape((UV_HEIGHT, UV_WIDTH))

    if rgb:
        return yuv_to_rgb(y, u, v)
    return y, u, v
def final(config, ctx, queue, program, buffers, debug=False):
    """Reconstruct the full solution of the partitioned banded system.

    Launches the 'reconstruct' kernel (one work-item per partition) to
    compute the interior unknowns X', then interleaves them with the
    top/bottom coupling unknowns X(t,b) produced by the earlier solve
    stage, returning the stacked solution as a sparse matrix.
    (Python 2 code: uses print statements.)
    """
    matrixSize = config['matrixSize']
    bandwidth = config['bandwidth']
    partitionNumber = config['partitionNumber']
    partitionSize = config['partitionSize']
    offdiagonalSize = config['offdiagonalSize']
    rhsSize = config['rhsSize']
    # Interior unknowns: (partitionSize - 2*offdiagonalSize) rows per partition.
    xo = np.ones((partitionNumber * (partitionSize - 2 * offdiagonalSize), rhsSize), dtype=np.float32)
    tmp = np.ones((partitionNumber * (partitionSize - 2 * offdiagonalSize), rhsSize), dtype=np.float32)
    mf = cl.mem_flags
    xo_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=xo)
    tmp_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=tmp)
    kernel = program.reconstruct
    kernel.set_scalar_arg_dtypes([None, None, None, None, np.int32, np.int32, np.int32])
    # Barrier: make sure the previous (factor/solve) stages on this queue finished.
    cl.enqueue_barrier(queue)
    kernel(
        queue,
        (partitionNumber,),
        None,
        buffers[1],  # Avwg buffer from factor, see if it is also readable and still valide
        buffers[3],  # x buffer from solve, see if it is still valide
        xo_buf,
        tmp_buf,
        np.int32(partitionSize),
        np.int32(offdiagonalSize),
        np.int32(rhsSize)
    )
    # X(t,b): 2*offdiagonalSize coupling rows per partition, read back from
    # the solve-stage buffer.
    xtb = np.ones((partitionNumber * 2 * offdiagonalSize, rhsSize), dtype=np.float32)
    cl.enqueue_copy(queue, xtb, buffers[3])
    if (debug) :
        print "X(t,b):"
        print xtb
    cl.enqueue_copy(queue, xo, xo_buf)
    if (debug) :
        print "X':"
        print xo
    xtb = sparse.csr_matrix(xtb)
    xo = sparse.csr_matrix(xo)
    x = []
    # Per partition, stack: top coupling rows, interior rows, bottom coupling rows.
    for i in range(0, partitionNumber) :
        t = i * (2 * offdiagonalSize)
        b = (i + 1) * (2 * offdiagonalSize)
        u = i * (partitionSize - 2 * offdiagonalSize)
        v = (i + 1) * (partitionSize - 2 * offdiagonalSize)
        x.append(xtb[t : t + offdiagonalSize, 0 : rhsSize])
        x.append(xo[u : v, 0 : rhsSize])
        x.append(xtb[b - offdiagonalSize : b, 0 : rhsSize])
    return sp.sparse.vstack(x)
def begin_acquire(self, nthreads_per_block=64, cl_context=None):
    """Reset per-event DAQ state on the device before an acquisition.

    Sets every entry of the earliest-time buffer to 1e9 (sentinel for
    "no hit") and zeroes the per-channel charge and history buffers.
    Dispatches to the CUDA or OpenCL path depending on the active GPU API.
    """
    if api.is_gpu_api_cuda():
        self.gpu_funcs.reset_earliest_time_int(
            np.float32(1e9),
            np.int32(len(self.earliest_time_int_gpu)),
            self.earliest_time_int_gpu,
            block=(nthreads_per_block, 1, 1),
            # +1 block so a partial tail is still covered.
            grid=(len(self.earliest_time_int_gpu) // nthreads_per_block + 1, 1))
        self.channel_q_int_gpu.fill(0)
        self.channel_q_gpu.fill(0)
        self.channel_history_gpu.fill(0)
    elif api.is_gpu_api_opencl():
        comqueue = cl.CommandQueue(cl_context)
        # g_times_l=True: the second size tuple is in work-groups and is
        # multiplied by the local size by pyopencl.
        self.gpu_funcs.reset_earliest_time_int(
            comqueue,
            (nthreads_per_block, 1, 1),
            (len(self.earliest_time_int_gpu) // nthreads_per_block + 1, 1),
            np.float32(1e9),
            np.int32(len(self.earliest_time_int_gpu)),
            self.earliest_time_int_gpu.data,
            g_times_l=True).wait()
        self.channel_q_int_gpu.fill(0, queue=comqueue)
        self.channel_q_gpu.fill(0, queue=comqueue)
        self.channel_history_gpu.fill(0, queue=comqueue)
        # Barrier so the fills complete before the queue is abandoned.
        cl.enqueue_barrier(comqueue)
def run(self):
    """Integrate the system on the device with adaptive step-size control.

    Computes the Jacobian once, then repeatedly launches the Integrate
    kernel.  After each launch the per-element error estimate is read
    back; on a finite error the step is rescaled and the state advanced
    when error < 1, otherwise (NaN) the step is halved.  Stops via
    self.stop() after self.maxiter iterations when maxiter is set.
    """
    self.program.Jacobian(self.queue, self.shape, block_shape,
                          self.B, self.DB)
    self.run_key = True
    # Idiom fix: compare to None with `is not None` (was `!= None`).
    if self.maxiter is not None:
        niter = 0
    while self.run_key:
        self.program.Integrate(self.queue, self.shape, block_shape,
                               self.X, self.X1, self.B, self.DB,
                               np.float32(self.step),
                               self.Error, self.Current)
        cl.enqueue_barrier(self.queue)
        # NOTE: enqueue_read_buffer is a legacy pyopencl API kept here
        # for compatibility with the installed pyopencl version;
        # enqueue_copy is the modern equivalent.
        cl.enqueue_read_buffer(self.queue, self.Error, self._Error)
        error = np.max(self._Error)
        if not np.isnan(error):
            cl.enqueue_read_buffer(self.queue, self.Current, self._Current)
            # Adaptive step control: shrink/grow with the error estimate.
            self.step /= error ** 0.1 + 0.5
            if error < 1:
                # Accept the step: X <- X1, and mirror X1 to the host.
                cl.enqueue_copy(self.queue, self.X, self.X1)
                cl.enqueue_read_buffer(self.queue, self.X1, self._X)
        else:
            # NaN error: reject and halve the step.
            self.step /= 2.
        self.queue.finish()
        if self.maxiter is not None:
            niter += 1
            if niter == self.maxiter:
                self.stop()
def kmeans(it_n, class_n, data_n, centroids_x, centroids_y, data_x, data_y, partitioned):
    """Lloyd's k-means over 2D points: GPU assignment step, host update step.

    it_n: number of iterations; class_n: number of clusters; data_n: number
    of points.  centroids_* / data_* are parallel coordinate arrays;
    `partitioned` receives each point's cluster index and is returned.
    (Python 2 code: uses print statements.  `mf` is presumably a
    module-level alias for cl.mem_flags -- confirm at file top.)
    """
    context = cl.create_some_context()
    queue = cl.CommandQueue(context);
    for i in range(0, class_n):
        print centroids_x[i], centroids_y[i]
    with open("kernel.cl", 'r') as fin:
        program = cl.Program(context, fin.read()).build()
    assign = program.assign
    buf_centroids_x = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=centroids_x)
    buf_centroids_y = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=centroids_y)
    buf_data_x = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data_x)
    buf_data_y = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data_y)
    buf_parts = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=partitioned)
    # "Infinite" starting distance for the assignment kernel.
    dbl_max = 100000.0
    for i in range(0, it_n):
        # Assignment step on the device.
        assign(queue, (data_n,), None, buf_centroids_x, buf_centroids_y,
               buf_data_x, buf_data_y, buf_parts, np.int32(class_n),
               np.int32(data_n), np.float32(dbl_max))
        cl.enqueue_barrier(queue)
        e = cl.enqueue_copy(queue, partitioned, buf_parts)
        e.wait()
        # Update step on the host: recompute centroids as cluster means.
        count = np.zeros(class_n).astype(np.int32)
        for i in range(0, class_n):
            centroids_x[i] = 0.0
            centroids_y[i] = 0.0
        for i in range(0, data_n):
            centroids_x[partitioned[i]] += data_x[i]
            centroids_y[partitioned[i]] += data_y[i]
            count[partitioned[i]] += 1
        for i in range(0, class_n):
            if count[i] != 0:
                centroids_x[i] /= count[i]
                centroids_y[i] /= count[i]
        # Re-upload the new centroids for the next iteration.
        buf_centroids_x = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=centroids_x)
        buf_centroids_y = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=centroids_y)
    print partitioned
    return partitioned
def sync(self, queue):
    '''
    Inject a pyopencl barrier into the specified queue that waits
    for the attached event; does nothing when no event is attached.
    '''
    event = self._event
    if event is not None:
        cl.enqueue_barrier(queue, wait_for=[event])
def postproc_depose_vector(self, vec_fld):
    """Post-process a deposited vector field.

    For each component of the field: correct the on-axis deposition for
    every azimuthal mode (real kernel for mode 0, complex for higher
    modes), then apply the 1/dV radial weighting over the full grid.
    """
    raddiv_args = [self.DataDev[key].data for key in ('NxNr', 'Nx', 'dV_inv')]

    for fld in [vec_fld + comp for comp in self.Args['vec_comps']]:
        mode_bufs = [self.DataDev[fld + '_m' + str(m)].data
                     for m in range(self.Args['M'] + 1)]

        # --- Correct near-axis deposition ---
        WGS, WGS_tot = self.get_wgs(self.Args['Nx'])
        self._treat_axis_d_knl(self.queue, (WGS_tot, ), (WGS, ),
                               mode_bufs[0],
                               np.uint32(self.Args['Nx'])).wait()
        for m in range(1, self.Args['M'] + 1):
            self._treat_axis_c_knl(self.queue, (WGS_tot, ), (WGS, ),
                                   mode_bufs[m],
                                   np.uint32(self.Args['Nx'])).wait()

        # --- Divide by the cell volume ---
        WGS, WGS_tot = self.get_wgs(self.Args['NxNr'])
        # NOTE(review): `enqueue_barrier` is unqualified here, unlike
        # `cl.enqueue_barrier` elsewhere -- presumably imported directly
        # at module level; confirm.
        enqueue_barrier(self.queue)
        self._divide_by_dv_d_knl(self.queue, (WGS_tot, ), (WGS, ),
                                 mode_bufs[0], *raddiv_args).wait()
        for m in range(1, self.Args['M'] + 1):
            self._divide_by_dv_c_knl(self.queue, (WGS_tot, ), (WGS, ),
                                     mode_bufs[m], *raddiv_args).wait()
def test_enqueue_barrier_marker(ctx_factory):
    """Smoke-test barriers and markers, including wait lists."""
    context = ctx_factory()
    _skip_if_pocl(context.devices[0].platform,
                  'pocl crashes on enqueue_barrier')
    queue = cl.CommandQueue(context)

    cl.enqueue_barrier(queue)
    first = cl.enqueue_marker(queue)
    second = cl.enqueue_marker(queue, wait_for=[first])
    cl.enqueue_barrier(queue, wait_for=[first, second])
def test_enqueue_barrier_marker(ctx_factory):
    """Exercise enqueue_barrier / enqueue_marker, with and without wait lists."""
    ctx = ctx_factory()
    _skip_if_pocl(ctx.devices[0].platform, 'pocl crashes on enqueue_barrier')

    queue = cl.CommandQueue(ctx)
    cl.enqueue_barrier(queue)

    marker_a = cl.enqueue_marker(queue)
    marker_b = cl.enqueue_marker(queue, wait_for=[marker_a])

    cl.enqueue_barrier(queue, wait_for=[marker_a, marker_b])
def step(self, c1, c2):
    """Run one GPU iteration of the reconstruction update.

    c1, c2 are the mixing coefficients handed to the update kernel.
    Falls back to the parent-class (CPU) implementation when the OpenCL
    fast path is disabled.  On the first call (no Q1_cal yet) the
    device P1/P2 are initialized directly from the experimental data;
    on later calls the previous iterates are uploaded and the step
    kernel is run.
    """
    if not self.opencl_fast_step:
        return super().step(c1, c2)
    # Upload the experimental polar projections.
    cl.enqueue_copy(self.queue, self.buf_Q1e, self.Q1_exp)
    cl.enqueue_copy(self.queue, self.buf_Q2e, self.Q2_exp)
    try:
        self.Q1_cal
    except AttributeError:
        # i = 1: first iteration -- initialize P1/P2 from the data and
        # allocate the host-side arrays that later iterations reuse.
        self.prg.pol2d_to_pol3d_init(self.queue, (self.r_len, 1), None,
                                     self.buf_fmt, self.buf_Q1e,
                                     self.buf_Q2e, self.buf_P1,
                                     self.buf_P2)
        self.Q1_cal = np.empty_like(self.Q1_exp)
        self.Q2_cal = np.empty_like(self.Q2_exp)
        self.P1 = np.empty_like(self.Q1_exp)
        self.P2 = np.empty_like(self.Q2_exp)
    else:
        # i > 1: upload the previous iterates, then run the update step.
        cl.enqueue_copy(self.queue, self.buf_Q1c, self.Q1_cal)
        cl.enqueue_copy(self.queue, self.buf_Q2c, self.Q2_cal)
        cl.enqueue_copy(self.queue, self.buf_P1, self.P1)
        cl.enqueue_copy(self.queue, self.buf_P2, self.P2)
        self.prg.pol2d_to_pol3d_step(self.queue, (self.r_len, 1), None,
                                     self.buf_fmt, self.buf_Q1e,
                                     self.buf_Q2e, self.buf_Q1c,
                                     self.buf_Q2c, self.buf_P1,
                                     self.buf_P2, np.float64(c1),
                                     np.float64(c2))
    # Angular normalization of the 3D polar distribution.
    self.prg.norm_pol3d_angular(self.queue, (self.r_len, 1), None,
                                self.buf_fmt, self.buf_P1, self.buf_P2,
                                self.buf_norm)
    radial_norm = np.zeros((len(self.R), ), dtype=np.float64)
    # Read everything back; the barrier+wait orders the copies before
    # the host arrays are used below.
    cl.enqueue_copy(self.queue, self.Q1_cal, self.buf_Q1c)
    cl.enqueue_copy(self.queue, self.Q2_cal, self.buf_Q2c)
    cl.enqueue_copy(self.queue, self.P1, self.buf_P1)
    cl.enqueue_copy(self.queue, self.P2, self.buf_P2)
    cl.enqueue_copy(self.queue, radial_norm, self.buf_norm)
    cl.enqueue_barrier(self.queue).wait()
    self.P1 /= radial_norm.sum()
    # Forward-project the new estimate and re-derive the calculated
    # polar projections for the next iteration.
    self.M_cal = self.pol3d_to_cart2d(self.P1, self.P2)
    self.norm_cart2d(self.M_cal)
    assert np.isfinite(self.M_cal).all(), 'M_cal is not finite'
    self.Q1_cal, self.Q2_cal = self.cart2d_to_pol2d(self.M_cal)
def run(self, X):
    """Upload X, zero the output buffer, launch the Solve kernel, and
    return self for chaining."""
    cl.enqueue_copy(self.queue, self.X, X)

    # Fresh zeroed host array mirrored into the device output buffer.
    self.out = np.zeros((self.nx,), dtype=np.float32)
    cl.enqueue_copy(self.queue, self.I, self.out)

    self.program.Solve(self.queue, (self.nx, self.na), None,
                       self.A, self.r, self.X, self.I)
    cl.enqueue_barrier(self.queue)
    return self
def run(self, input_data):
    """Run the loaded model on input_data via the OpenCL op queue.

    Requires a model to have been loaded and input_data to match
    self.input_shape.  Returns (prediction_boxes, prediction_classes),
    read from the tensors named "Squeeze" and "convert_scores"; other
    tensor buffers are left unread (see note below) but the name->buffer
    snapshot is dumped to tflite_opencl.npy.
    """
    print("Running model")
    assert self.loaded, "Must have a model loaded first"
    assert all(input_data.shape == self.input_shape), \
        "Input data must be of shape "+str(self.input_shape)+\
        " but is of shape "+str(input_data.shape)

    # Set input data
    mf = cl.mem_flags
    self.bufs[self.input_buffer] = np.ascontiguousarray(input_data)
    self.opencl_bufs[self.input_buffer] = cl.Buffer(
        self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=self.bufs[self.input_buffer])

    with cl.CommandQueue(self.ctx) as queue:
        # Enqueue operations
        for i, op in enumerate(self.operations):
            print("Enqueing op", i)
            t = time.time()
            self.enqueue_op(queue, op)
            t = time.time() - t
            print("Took", t, "s")

        # TODO do I need a cl.enqueue_barrier(queue)?
        # or maybe cl.wait_for_events(event) and handle which outputs
        # are used for certain inputs?
        cl.enqueue_barrier(queue)

        # Get different output not requiring the custom op
        #
        # Note: only when we request the result does it actually run the
        # network, so this takes a long time
        prediction_boxes = None
        prediction_classes = None
        print("Fetching results")
        t = time.time()
        for tensor in self.tensors:
            buf = self.replace_buffers(tensor["buffer"])
            if tensor["name"] == "Squeeze":
                prediction_boxes = self.load_buf(queue, buf)
            elif tensor["name"] == "convert_scores":
                prediction_classes = self.load_buf(queue, buf)
        t = time.time() - t
        print("Took", t, "s")

        # Note: only the prediction boxes/classes buffers will have valid data
        # in them though unless we load *all* the buffers in the above for loop
        np.save("tflite_opencl.npy", {
            t["name"]: self.bufs[t["buffer"]] for t in self.tensors
        })

    print("Total number of tensors:", len(self.tensors))
    return prediction_boxes, prediction_classes
def kmeans_chunk_center(self, data, centers):
    """One k-means assignment + accumulation pass over a data chunk on the GPU.

    Assigns each point in `data` to its nearest center, appending the
    assignments to self._data_assigns, and returns the first k rows of
    the per-work-group partial center accumulation as float32.
    (Python 2 code: uses xrange.)
    """
    data = data.astype(np.float32)
    centers = centers.astype(np.float32)
    k = len(centers)
    dim = len(centers[0])
    # Lazily build the kernel program for this (dim, k) configuration.
    if not self.prg:
        self.__initialize_program(dim, k)
    out = np.zeros((10, 1), dtype=np.float32)
    # One block of k partial center sums per work group.
    new_centers = np.asarray(self.n_work_groups *
                             [np.zeros(self._dimension, dtype=np.float32)
                              for _ in xrange(k)], dtype=np.float32)
    data_assigns = np.empty((len(data), 1), dtype=np.int32)
    # create buffers
    data_buf = cl.Buffer(self.ctx,
                         cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                         hostbuf=data)
    centers_buf = cl.Buffer(self.ctx,
                            cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                            hostbuf=centers)
    assigns_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE,
                            data_assigns.nbytes)
    new_centers_buf = cl.Buffer(
        self.ctx, cl.mem_flags.WRITE_ONLY,
        self.n_work_groups * dim * k * np.dtype('float32').itemsize
    )
    centers_counter_buf = cl.Buffer(
        self.ctx, cl.mem_flags.WRITE_ONLY,
        k * self.n_work_groups * np.dtype('int32').itemsize
    )
    out_buf = cl.Buffer(self.ctx,
                        cl.mem_flags.WRITE_ONLY | cl.mem_flags.COPY_HOST_PTR,
                        hostbuf=out)
    # Barrier: make sure prior work on the queue is done before launching.
    e = cl.enqueue_barrier(self.queue)
    e.wait()
    # run opencl extension
    self.prg.kmeans_chunk_center_cl(
        self.queue, (len(data),), None,
        assigns_buf,
        data_buf,
        centers_buf,
        centers_counter_buf,
        new_centers_buf,
        out_buf
    )
    # barrier
    e = cl.enqueue_barrier(self.queue)
    e.wait()
    # wait for it to finish and read out buffers
    cl.enqueue_copy(self.queue, data_assigns, assigns_buf)
    cl.enqueue_copy(self.queue, new_centers, new_centers_buf)
    cl.enqueue_copy(self.queue, out, out_buf)
    # Keep only the first work-group's k rows of the accumulation.
    new_centers = new_centers[:k]
    self._data_assigns.extend(data_assigns.flatten().tolist())
    return new_centers.astype(dtype=np.float32)
def runDijkstra(graph, source, costArray):
    """GPU single-source shortest paths (parallel Dijkstra-style relaxation).

    graph supplies CSR-like vertexArray/edgeArray/weightArray plus
    vertexcount/edgecount.  costArray is filled in place with the
    shortest-path weight from `source` to every vertex and returned.
    Iterates the two relaxation kernels until the frontier mask empties.
    """
    context = cl.create_some_context()
    with open("Kernel.cl", 'r') as fin:
        program = cl.Program(context, fin.read()).build()

    # Frontier mask: only the source starts active.
    maskArray = np.zeros(graph.vertexcount).astype(np.int32)
    maskArray[source] = 1

    # Tentative costs start at infinity, except the source.
    updateCostArray = np.zeros(graph.vertexcount).astype(np.float32)
    for i in range(0, graph.vertexcount):
        updateCostArray[i] = np.Inf
    updateCostArray[source] = 0
    costArray[source] = 0

    queue = cl.CommandQueue(context)
    dijkstra_first = program.Dijkstra_first
    dijkstra_second = program.Dijkstra_second

    vertex = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=graph.vertexArray)
    edge = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=graph.edgeArray)
    weight = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=graph.weightArray)
    mask = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=maskArray)
    cost = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=costArray)
    # BUG FIX: this buffer was previously initialized from costArray,
    # leaving the Inf-initialized updateCostArray prepared above unused.
    updateCost = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=updateCostArray)

    while not maskArrayEmpty(maskArray, graph.vertexcount):
        dijkstra_first(queue, (graph.vertexcount, ), (1, ),
                       vertex, edge, weight, mask, cost, updateCost,
                       np.int32(graph.vertexcount), np.int32(graph.edgecount))
        dijkstra_second(queue, (graph.vertexcount, ), (1, ),
                        vertex, edge, weight, mask, cost, updateCost,
                        np.int32(graph.vertexcount))
        cl.enqueue_barrier(queue)
        e = cl.enqueue_copy(queue, maskArray, mask)
        e.wait()

    cl.enqueue_barrier(queue)
    e = cl.enqueue_copy(queue, costArray, cost)
    e.wait()
    return costArray  # the shortest path weights from the source to each vertex
def test_enqueue_barrier_marker(ctx_factory):
    """Barrier/marker smoke test, guarding against known platform quirks."""
    ctx = ctx_factory()

    # Still relevant on pocl 1.0RC1.
    _xfail_if_pocl(
        ctx.devices[0].platform, (1, 0), "pocl crashes on enqueue_barrier")

    queue = cl.CommandQueue(ctx)

    impl_new_enough = queue._get_cl_version() >= (1, 2)
    header_too_old = cl.get_cl_header_version() <= (1, 1)
    if impl_new_enough and header_too_old:
        pytest.skip("CL impl version >= 1.2, header version <= 1.1--cannot be sure "
                    "that clEnqueueWaitForEvents is implemented")

    cl.enqueue_barrier(queue)
    evt1 = cl.enqueue_marker(queue)
    evt2 = cl.enqueue_marker(queue, wait_for=[evt1])
    cl.enqueue_barrier(queue, wait_for=[evt1, evt2])
def test_setIterationFinished_000(self):
    """Regression test: setIterationFinished kernel output vs stored reference data."""
    basePath = 'C:/Private/PhD_Publications/Publication_of_Algorithm/Code/TrackingAlgorithm/TrackingAlgorithm/TestData/ReferenceDataForTests/UnitTests/OpenClKernels/setIterationFinished_000'
    inputPath = basePath + '/input'
    referencePath = basePath + '/output'
    referenceVariableName1 = 'dev_iterationFinished'
    self.linFitSearchRangeXvalues = np.float64(
        np.transpose(np.linspace(1, 200, 200)))
    self.setupTest()
    # Fixture configuration shared by the kernel unit tests.
    self.nrOfLocalAngleSteps = 64
    self.detectionKernelStrideSize = 2048
    self.nrOfStrides = 1
    self.nrOfDetectionAngleSteps = np.float64(
        self.nrOfStrides * self.detectionKernelStrideSize)
    self.loadDeviceVariable('dev_iterationFinished', inputPath)
    self.setWorkGroupSizes()
    # Single work-item launch: the kernel only sets the finished flag.
    self.prg.setIterationFinished(self.queue, (1, 1), None,
                                  self.dev_iterationFinished.data)
    # Barrier so the kernel has completed before comparing results.
    barrierEvent = cl.enqueue_barrier(self.queue)
    self.assertVectorEqualsExpectedResult(
        self.dev_iterationFinished,
        referencePath + '/' + referenceVariableName1 + '.npy')
    pass
def test_calculateMembraneNormalVectors_000(self):
    """Regression test: calculateMembraneNormalVectors kernel vs stored reference data."""
    basePath = 'C:/Private/PhD_Publications/Publication_of_Algorithm/Code/TrackingAlgorithm/TrackingAlgorithm/TestData/ReferenceDataForTests/UnitTests/OpenClKernels/calculateMembraneNormalVectors_000'
    inputPath = basePath + '/input'
    referencePath = basePath + '/output'
    referenceVariableName1 = 'dev_membraneNormalVectors'
    self.linFitSearchRangeXvalues = np.float64(
        np.transpose(np.linspace(1, 200, 200)))
    self.setupTest()
    # Fixture configuration shared by the kernel unit tests.
    self.nrOfLocalAngleSteps = 64
    self.detectionKernelStrideSize = 2048
    self.nrOfStrides = 1
    self.nrOfDetectionAngleSteps = np.float64(
        self.nrOfStrides * self.detectionKernelStrideSize)
    self.loadDeviceVariable('dev_membraneCoordinates', inputPath)
    self.loadDeviceVariable('dev_membraneNormalVectors', inputPath)
    self.loadDeviceVariable('gradientGlobalSize', inputPath)
    self.setWorkGroupSizes()
    self.prg.calculateMembraneNormalVectors(self.queue, self.gradientGlobalSize, None, \
                                            self.dev_membraneCoordinates.data, \
                                            self.dev_membraneNormalVectors.data \
                                            )
    # Barrier so the kernel has completed before comparing results.
    barrierEvent = cl.enqueue_barrier(self.queue)
    self.assertVector2EqualsExpectedResult(
        self.dev_membraneNormalVectors,
        referencePath + '/' + referenceVariableName1 + '.npy')
    pass
def pol3d_to_cart2d(self, P1, P2):
    """Project the 3D polar pair (P1, P2) onto a 2D Cartesian image.

    Uploads P1/P2, zero-fills the output buffer, runs the projection
    kernel over the full image grid, and returns the image as float64.
    """
    image = np.zeros((self.row_len, self.col_len), dtype=np.float64)

    cl.enqueue_copy(self.queue, self.buf_P1, P1)
    cl.enqueue_copy(self.queue, self.buf_P2, P2)
    cl.enqueue_fill_buffer(self.queue, self.buf_M, np.float64(0.0), 0,
                           image.nbytes)

    self.prg.pol3d_to_cart2d(self.queue, (self.row_len, self.col_len),
                             None, self.buf_fmt, self.buf_P1,
                             self.buf_P2, self.buf_M)

    cl.enqueue_copy(self.queue, image, self.buf_M)
    # Ensure the read-back has finished before handing the array out.
    cl.enqueue_barrier(self.queue).wait()
    return image
def pol3d_to_slice2d(self, P1, P2):
    """Extract a 2D slice from the 3D polar pair (P1, P2).

    Same plumbing as pol3d_to_cart2d, but running the slice kernel and
    using the dedicated slice output buffer.
    """
    slc = np.zeros((self.row_len, self.col_len), dtype=np.float64)

    cl.enqueue_copy(self.queue, self.buf_P1, P1)
    cl.enqueue_copy(self.queue, self.buf_P2, P2)
    cl.enqueue_fill_buffer(self.queue, self.buf_S, np.float64(0.0), 0,
                           slc.nbytes)

    self.prg.pol3d_to_slice2d(self.queue, (self.row_len, self.col_len),
                              None, self.buf_fmt, self.buf_P1,
                              self.buf_P2, self.buf_S)

    cl.enqueue_copy(self.queue, slc, self.buf_S)
    # Ensure the read-back has finished before handing the array out.
    cl.enqueue_barrier(self.queue).wait()
    return slc
def _enqueue_barrier(queue, wait_for):
    """Enqueue a barrier with a wait list, working around a pocl bug.

    pocl 0.13 and below crash on clEnqueueBarrierWithWaitList, so on
    that platform a marker + queue.finish() is used instead.
    """
    if queue.device.platform.name != "Portable Computing Language":
        return cl.enqueue_barrier(queue, wait_for=wait_for)

    # pocl 0.13 and below crash on clEnqueueBarrierWithWaitList
    marker = cl.enqueue_marker(queue, wait_for=wait_for)
    queue.finish()
    return marker
def _enqueue_barrier(queue, wait_for):
    """Barrier with wait list; falls back to marker+finish on pocl.

    Old pocl releases (<= 0.13) crash inside
    clEnqueueBarrierWithWaitList, hence the platform check.
    """
    on_pocl = (queue.device.platform.name == "Portable Computing Language")
    if not on_pocl:
        return cl.enqueue_barrier(queue, wait_for=wait_for)

    # pocl 0.13 and below crash on clEnqueueBarrierWithWaitList
    evt = cl.enqueue_marker(queue, wait_for=wait_for)
    queue.finish()
    return evt
def getUZCM_Ring(n, beta, hJ):
    '''
    explicitly calculates the inner energy U, the partititon function Z,
    the heat capacity C and the magnetisation M for a given beta and hJ
    on a Ising Ring with n spins.
    '''
    beta = np.array(beta, dtype=np.double)

    # Device buffers: beta in, Hamiltonian table scratch, four outputs.
    beta_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=beta)
    h_g = cl.Buffer(ctx, mf.READ_WRITE, 8 * 2 ** (n))  # 2^n doubles
    u_g, z_g, c_g, m_g = (cl.Buffer(ctx, mf.WRITE_ONLY, beta.nbytes)
                          for _ in range(4))
    u_h, z_h, c_h, m_h = (np.zeros_like(beta) for _ in range(4))

    # Tabulate the Hamiltonian over all 2^n configurations, then evaluate
    # U, Z, C, M per beta value.
    cprg.tabulateH_Ring(cqu, (1,), None,
                        struct.pack('i', n), struct.pack('d', hJ), h_g)
    cl.enqueue_barrier(cqu)
    cprg.getUZCM(cqu, beta.shape, None, struct.pack('i', n),
                 beta_g, h_g, u_g, z_g, c_g, m_g)

    for host, dev in ((u_h, u_g), (z_h, z_g), (c_h, c_g), (m_h, m_g)):
        cl.enqueue_copy(cqu, host, dev)

    # U, C, M are returned per spin.
    return u_h / n, z_h, c_h / n, m_h / n
def calculateContourCenter(self):
    """Compute the ds-weighted center of the membrane contour on the device.

    Pipeline of three kernels: (1) per-point segment lengths ds,
    (2) their total, (3) the ds-weighted average of the coordinates,
    written to dev_contourCenter.  Barriers between the launches enforce
    the data dependencies on the queue.
    """
    self.prg.calculateDs(self.queue, self.gradientGlobalSize, None,
                         self.dev_membraneCoordinates.data,
                         self.dev_ds.data
                         )
    barrierEvent = cl.enqueue_barrier(self.queue)
    self.prg.calculateSumDs(self.queue, self.gradientGlobalSize, None,
                            self.dev_ds.data, self.dev_sumds.data
                            )
    barrierEvent = cl.enqueue_barrier(self.queue)
    # Single work-item: the reduction result is scalar.
    self.prg.calculateContourCenter(self.queue, (1, 1), None,
                                    self.dev_membraneCoordinates.data,
                                    self.dev_ds.data, self.dev_sumds.data,
                                    self.dev_contourCenter.data,
                                    np.int32(self.nrOfDetectionAngleSteps)
                                    )
    barrierEvent = cl.enqueue_barrier(self.queue)
def execute(self):
    """Advance the Conway grid executionsPerReadback generations, then read back.

    The kernel is launched in a/dest ping-pong pairs; an odd count gets
    one extra a->dest pass so the final state always lands where the
    read-back below expects it.  Afterwards the host array and device
    buffers are refreshed from the result.
    """
    if self.executionsPerReadback == 1:
        self.program.Conway(self.queue, self.a.shape, None,
                            self.ar_ySize, self.a_buf, self.dest_buf)
    else:
        # Floor division (`//`): `/` is true division on Python 3 and
        # would hand range() a float; on Python 2 ints `//` is identical.
        for i in range(0, self.executionsPerReadback // 2):
            self.program.Conway(self.queue, self.a.shape, None,
                                self.ar_ySize, self.a_buf, self.dest_buf)
            self.program.Conway(self.queue, self.a.shape, None,
                                self.ar_ySize, self.dest_buf, self.a_buf)
            cl.enqueue_barrier(self.queue)
        if self.executionsPerReadback % 2 == 1:
            # Odd count: one final unpaired pass into dest_buf.
            self.program.Conway(self.queue, self.a.shape, None,
                                self.ar_ySize, self.a_buf, self.dest_buf)
    # The final generation sits in dest_buf for odd counts, a_buf for even.
    # (enqueue_read_buffer is legacy pyopencl API, kept for compatibility.)
    if self.executionsPerReadback % 2 == 1:
        cl.enqueue_read_buffer(self.queue, self.dest_buf, self.c).wait()
    else:
        cl.enqueue_read_buffer(self.queue, self.a_buf, self.c).wait()
    self.a = self.c
    # Refresh buffers
    mf = cl.mem_flags
    self.a_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=self.a)
    self.dest_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, self.a.nbytes)
def setStartingCoordinates(self, dev_initialMembraneCoordinatesX, dev_initialMembraneCoordinatesY, \
                           dev_initialMembranNormalVectorsX, dev_initialMembranNormalVectorsY):
    """Copy the initial contour coordinates and normal vectors into the
    working device buffers, blocking until every copy has completed."""
    copies = (
        (dev_initialMembraneCoordinatesX, self.dev_membraneCoordinatesX),
        (dev_initialMembraneCoordinatesY, self.dev_membraneCoordinatesY),
        (dev_initialMembranNormalVectorsX, self.dev_membraneNormalVectorsX),
        (dev_initialMembranNormalVectorsY, self.dev_membraneNormalVectorsY),
    )
    for src, dst in copies:
        cl.enqueue_copy_buffer(self.queue, src.data, dst.data).wait()
    cl.enqueue_barrier(self.queue)
    self.queue.finish()
def test_filterJumpedCoordinates_000(self):
    """Regression test: filterJumpedCoordinates kernel vs stored reference data.

    Loads the recorded device inputs, runs the kernel with a max shift of
    10.0, and compares the filtered coordinates and normals against the
    saved reference arrays.
    """
    basePath = 'C:/Private/PhD_Publications/Publication_of_Algorithm/Code/TrackingAlgorithm/TrackingAlgorithm/TestData/ReferenceDataForTests/UnitTests/OpenClKernels/filterJumpedCoordinates_000'
    inputPath = basePath + '/input'
    referencePath = basePath + '/output'
    referenceVariableName1 = 'dev_membraneCoordinates'
    referenceVariableName2 = 'dev_membraneNormalVectors'
    self.linFitSearchRangeXvalues = np.float64(
        np.transpose(np.linspace(1, 200, 200)))
    self.setupTest()
    # Fixture configuration shared by the kernel unit tests.
    self.nrOfLocalAngleSteps = 64
    self.detectionKernelStrideSize = 2048
    self.nrOfStrides = 1
    self.nrOfDetectionAngleSteps = np.float64(
        self.nrOfStrides * self.detectionKernelStrideSize)
    self.loadDeviceVariable('dev_previousContourCenter', inputPath)
    self.loadDeviceVariable('dev_membraneCoordinates', inputPath)
    self.loadDeviceVariable('dev_membraneNormalVectors', inputPath)
    self.loadDeviceVariable('dev_previousInterpolatedMembraneCoordinates', inputPath)
    self.loadDeviceVariable('dev_closestLowerNoneNanIndex', inputPath)
    self.loadDeviceVariable('dev_closestUpperNoneNanIndex', inputPath)
    self.maxCoordinateShift = np.float64(10.0)
    self.listOfGoodCoordinates_memSize = np.int(8192)
    self.setWorkGroupSizes()
    self.prg.filterJumpedCoordinates(self.queue, self.gradientGlobalSize, None, \
                                     self.dev_previousContourCenter.data, \
                                     self.dev_membraneCoordinates.data, \
                                     self.dev_membraneNormalVectors.data, \
                                     self.dev_previousInterpolatedMembraneCoordinates.data, \
                                     cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes), \
                                     cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes), \
                                     cl.LocalMemory(self.listOfGoodCoordinates_memSize), \
                                     self.maxCoordinateShift \
                                     )
    # Barrier so the kernel has completed before comparing results.
    barrierEvent = cl.enqueue_barrier(self.queue)
    self.assertVector2EqualsExpectedResult(
        self.dev_membraneCoordinates,
        referencePath + '/' + referenceVariableName1 + '.npy')
    self.assertVector2EqualsExpectedResult(
        self.dev_membraneNormalVectors,
        referencePath + '/' + referenceVariableName2 + '.npy')
    pass
def setStartingMembraneNormals(self, dev_initialMembranNormalVectorsX,
                               dev_initialMembranNormalVectorsY):
    """Seed the working normal-vector buffers for the next image.

    From the second image on (when resetNormalsAfterEachImage is set)
    the normals are reset to the radial vectors; doing this for image 0
    would destroy the correspondence between contour-coordinate indexes
    and their normals.  Otherwise the normals from the previous image
    are carried over as the starting guess.
    """
    if self.resetNormalsAfterEachImage and self.getContourId() != 0:
        sources = (self.dev_radialVectorsX, self.dev_radialVectorsY)
    else:
        sources = (dev_initialMembranNormalVectorsX,
                   dev_initialMembranNormalVectorsY)
    targets = (self.dev_membraneNormalVectorsX,
               self.dev_membraneNormalVectorsY)
    for src, dst in zip(sources, targets):
        cl.enqueue_copy_buffer(self.queue, src.data, dst.data).wait()
    cl.enqueue_barrier(self.queue)
def setStartingCoordinatesNew(self, dev_initialMembraneCoordinatesX,
                              dev_initialMembraneCoordinatesY):
    """Seed both the working coordinate buffers and the
    previous-interpolated coordinate buffers from the initial contour."""
    copies = (
        (dev_initialMembraneCoordinatesX, self.dev_membraneCoordinatesX),
        (dev_initialMembraneCoordinatesY, self.dev_membraneCoordinatesY),
        (dev_initialMembraneCoordinatesX,
         self.dev_previousInterpolatedMembraneCoordinatesX),
        (dev_initialMembraneCoordinatesY,
         self.dev_previousInterpolatedMembraneCoordinatesY),
    )
    for src, dst in copies:
        cl.enqueue_copy_buffer(self.queue, src.data, dst.data).wait()
    cl.enqueue_barrier(self.queue)
def gpu_ifft(vec): """ Uses the pyopencl and pyfft libraries to perform an fft on the GPU """ from pyfft.cl import Plan as cl_plan import pyopencl as cl import pyopencl.array as cl_array from numpy import complex64, shape, complex128, float32, real array_size = vec.shape #Find the GPU's available platform = cl.get_platforms() my_gpu_devices = platform[0].get_devices(device_type=cl.device_type.GPU) #Create a context using the GPU's found in the above step ctx = cl.Context(devices=my_gpu_devices) #Create queue using that context queue = cl.CommandQueue(ctx) # plan = cl_plan(array_size,queue=queue) #Make a temporary copy of vec so that things don't get all messed up ##temp = vec.copy().astype(complex64) plan = cl_plan(array_size, dtype=complex64, queue=queue) # plan = cl_plan(array_size,queue=queue) alloc = cl.tools.ImmediateAllocator(queue) cl.tools.MemoryPool(alloc).stop_holding() ##gpu_data = cl_array.to_device(queue, temp) vec = vec.astype(complex64) cl.enqueue_barrier(queue) gpu_data = cl_array.to_device(queue, vec, allocator = alloc, async = True) gpu_data.queue.finish() cl.enqueue_barrier(queue) plan.execute(gpu_data.data, inverse=True) cl.enqueue_barrier(queue) ans = gpu_data.get() gpu_data.data.release() gpu_data.queue.finish() queue.flush() for i in range(20): pass return ans
def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
              max_blocks=1024, max_steps=10, use_weights=False,
              scatter_first=0, cl_context=None):
    """Propagate photons on GPU to termination or max_steps, whichever
    comes first.

    May be called repeatedly without reloading photon information if
    single-stepping through photon history.

    ..warning::
        `rng_states` must have at least `nthreads_per_block`*`max_blocks`
        number of curandStates.
    """
    # NOTE: Python 2 code (print statements, xrange); kept as-is.
    nphotons = self.pos.size
    # bind node texture reference (CUDA path only, done once)
    if api.is_gpu_api_cuda() and not self.node_texture_ref_bound:
        # we have to unroll, as pycuda doesn't seem to support vector
        # types right now for binding
        self.unrolled_nodes = ga.to_gpu(
            gpu_geometry.nodes.get().ravel().view(np.uint32))
        self.unrolled_extra_nodes = ga.to_gpu(
            gpu_geometry.extra_nodes.ravel().view(np.uint32))
        self.unrolled_triangles = ga.to_gpu(
            gpu_geometry.triangles.get().ravel().view(np.uint32))
        self.unrolled_triangles4 = ga.to_gpu(
            gpu_geometry.triangles4.ravel().view(np.uint32))
        self.unrolled_vertices = ga.to_gpu(
            gpu_geometry.vertices.get().ravel().view(np.float32))
        self.unrolled_vertices4 = ga.to_gpu(
            gpu_geometry.vertices4.ravel().view(np.float32))
        self.node_texture_ref.set_address(self.unrolled_nodes.gpudata,
                                          self.unrolled_nodes.nbytes)
        self.extra_node_texture_ref.set_address(
            self.unrolled_extra_nodes.gpudata,
            self.unrolled_extra_nodes.nbytes)
        #self.unrolled_nodes.bind_to_texref_ext( self.node_texture_ref )
        #self.unrolled_extra_nodes.bind_to_texref_ext( self.extra_node_texture_ref )
        #self.unrolled_triangles.bind_to_texref_ext( self.triangles_texture_ref )
        self.triangles_texture_ref.set_address(
            self.unrolled_triangles4.gpudata,
            self.unrolled_triangles4.nbytes)
        #self.unrolled_vertices.bind_to_texref_ext( self.vertices_texture_ref )
        self.vertices_texture_ref.set_address(
            self.unrolled_vertices4.gpudata,
            self.unrolled_vertices4.nbytes)
        print "[BOUND TO TEXTURE MEMORY]"
        print "Nodes: ", self.unrolled_nodes.nbytes / 1.0e3, " kbytes"
        print "Extra nodes: ", \
            self.unrolled_extra_nodes.nbytes / 1.0e3, " kbytes"
        print "Triangles: ", self.unrolled_triangles4.nbytes / 1.0e3, " kbytes"
        print "Vertices: ", self.unrolled_vertices4.nbytes / 1.0e3, " kbytes"
        print "Total: ", (self.unrolled_nodes.nbytes +
                          self.unrolled_extra_nodes.nbytes +
                          self.unrolled_triangles4.nbytes +
                          self.unrolled_vertices4.nbytes) / 1.0e3, "kbytes"
        self.node_texture_ref_bound = True
    # setup queue: slot 0 holds a counter, slots 1..maxqueue hold photon
    # indices still to be propagated
    maxqueue = nphotons
    step = 0
    input_queue = np.empty(shape=maxqueue + 1, dtype=np.uint32)
    input_queue[0] = 0
    # Order photons initially in the queue to put the clones next to each other
    for copy in xrange(self.ncopies):
        input_queue[1 + copy::self.ncopies] = np.arange(
            self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
    if api.is_gpu_api_cuda():
        input_queue_gpu = ga.to_gpu(input_queue)
    elif api.is_gpu_api_opencl():
        comqueue = cl.CommandQueue(cl_context)
        input_queue_gpu = ga.to_device(comqueue,
                                       input_queue[1:])  # why the offset?
    output_queue = np.zeros(shape=maxqueue + 1, dtype=np.uint32)
    output_queue[0] = 1
    if api.is_gpu_api_cuda():
        output_queue_gpu = ga.to_gpu(output_queue)
    elif api.is_gpu_api_opencl():
        output_queue_gpu = ga.to_device(comqueue, output_queue)
    if use_weights:
        iuse_weights = 1
    else:
        iuse_weights = 0
    adapt_factor = 1.0
    start_prop = time.time()
    while step < max_steps:
        # Just finish the rest of the steps if the # of photons is low
        #if nphotons < nthreads_per_block * 16 * 8 or use_weights:
        #    nsteps = max_steps - step
        #else:
        #    nsteps = 1
        nsteps = 1
        start_step = time.time()
        # Launch the propagate kernel over the surviving photons in chunks
        # small enough for the device.
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block,
                               max(int(adapt_factor * max_blocks), 1)):
            #print nphotons, nthreads_per_block, max_blocks," : ",first_photon, photons_this_round, blocks, adapt_factor
            start_chunk = time.time()
            if api.is_gpu_api_cuda():
                self.gpu_funcs.propagate(np.int32(first_photon),
                                         np.int32(photons_this_round),
                                         input_queue_gpu[1:],
                                         output_queue_gpu, rng_states,
                                         self.pos, self.dir,
                                         self.wavelengths, self.pol,
                                         self.t, self.flags,
                                         self.last_hit_triangles,
                                         self.weights, np.int32(nsteps),
                                         np.int32(iuse_weights),
                                         np.int32(scatter_first),
                                         gpu_geometry.gpudata,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))
                #cuda.Context.get_current().synchronize()
            elif api.is_gpu_api_opencl():
                # The OpenCL kernel takes the geometry unpacked as a long
                # flat argument list instead of a single struct pointer.
                self.gpu_funcs.propagate(
                    comqueue, (photons_this_round, 1, 1), None,
                    np.int32(first_photon), np.int32(photons_this_round),
                    input_queue_gpu.data, output_queue_gpu.data,
                    rng_states.data, self.pos.data, self.dir.data,
                    self.wavelengths.data, self.pol.data, self.t.data,
                    self.flags.data, self.last_hit_triangles.data,
                    self.weights.data, np.int32(nsteps),
                    np.int32(iuse_weights), np.int32(scatter_first),
                    gpu_geometry.world_scale,
                    gpu_geometry.world_origin.data,
                    np.int32(len(gpu_geometry.nodes)),
                    gpu_geometry.material_data['n'],
                    gpu_geometry.material_data['step'],
                    gpu_geometry.material_data["wavelength0"],
                    gpu_geometry.vertices.data, gpu_geometry.triangles.data,
                    gpu_geometry.material_codes.data,
                    gpu_geometry.colors.data, gpu_geometry.nodes.data,
                    gpu_geometry.extra_nodes.data,
                    gpu_geometry.material_data["nmaterials"],
                    gpu_geometry.material_data['refractive_index'].data,
                    gpu_geometry.material_data['absorption_length'].data,
                    gpu_geometry.material_data['scattering_length'].data,
                    gpu_geometry.material_data['reemission_prob'].data,
                    gpu_geometry.material_data['reemission_cdf'].data,
                    gpu_geometry.surface_data['nsurfaces'],
                    gpu_geometry.surface_data['detect'].data,
                    gpu_geometry.surface_data['absorb'].data,
                    gpu_geometry.surface_data['reemit'].data,
                    gpu_geometry.surface_data['reflect_diffuse'].data,
                    gpu_geometry.surface_data['reflect_specular'].data,
                    gpu_geometry.surface_data['eta'].data,
                    gpu_geometry.surface_data['k'].data,
                    gpu_geometry.surface_data['reemission_cdf'].data,
                    gpu_geometry.surface_data['model'].data,
                    gpu_geometry.surface_data['transmissive'].data,
                    gpu_geometry.surface_data['thickness'].data,
                    gpu_geometry.surface_data['nplanes'].data,
                    gpu_geometry.surface_data['wire_diameter'].data,
                    gpu_geometry.surface_data['wire_pitch'].data,
                    g_times_l=True).wait()
            end_chunk = time.time()
            chunk_time = end_chunk - start_chunk
            #print "chunk time: ",chunk_time
            #if chunk_time>2.5:
            #    adapt_factor *= 0.5
        step += nsteps
        scatter_first = 0  # Only allow non-zero in first pass
        end_step = time.time()
        #print "step time: ",end_step-start_step
        if step < max_steps:
            # Swap the queues: photons still alive in the output queue
            # become next step's input queue.
            start_requeue = time.time()
            #print "reset photon queues"
            if api.is_gpu_api_cuda():
                cuda.Context.get_current().synchronize(
                )  # ensure all threads done
                #temp = input_queue_gpu
                #input_queue_gpu = output_queue_gpu
                #output_queue_gpu = temp
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different strides/storage orders.
                #output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                #nphotons = input_queue_gpu[:1].get()[0] - 1
                # new style
                output_queue_gpu.get(output_queue)
                nphotons = output_queue[0] - 1
                input_queue_gpu.set(output_queue)
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
            elif api.is_gpu_api_opencl():
                temp_out = output_queue_gpu.get()
                nphotons = temp_out[0]
                input_queue_gpu.set(
                    temp_out[1:], queue=comqueue
                )  # set the input queue to have index of photons still need to be run
                output_queue_gpu[:1].set(
                    np.ones(shape=1, dtype=np.uint32),
                    queue=comqueue)  # reset first instance to be one
            end_requeue = time.time()
            #print "re-queue time (nphotons=",nphotons,"): ",end_requeue-start_requeue
            if nphotons == 0:
                break
    end_prop = time.time()
    print "propagation time: ", end_prop - start_prop, " secs"
    end_flags = self.flags.get()
    end_flag = np.max(end_flags)
    # Bit 31 of the photon flag marks an aborted photon (ran out of steps).
    if end_flag & (1 << 31):
        print >> sys.stderr, "WARNING: ABORTED PHOTONS"
    if api.is_gpu_api_cuda():
        cuda.Context.get_current().synchronize()
    elif api.is_gpu_api_opencl():
        cl.enqueue_barrier(comqueue)
int ai = k*n+gid; for (i = k+1; i < n; i++) { a[i*n+gid] = a[i*n+gid] - (a[i*n+k] / a[ak]) * a[ai]; //a[i*n+gid] = gid; } } """).build() for k in range(n-1): kernel = prg.eliminate kernel.set_scalar_arg_dtypes([None, numpy.int32, numpy.int32]) # I hope it also takes the last column # how can I put only k jobs into the queue event = kernel(queue, (n-k,), None, a_buf, numpy.int32(k), numpy.int32(n)) #for (i = k+1; i < n; i++) { # a[i*n+k] = 0 #} # We need to wait for all jobs on each loop t1 = time() cl.enqueue_barrier(queue) t2 = time() cl.enqueue_copy(queue, a, a_buf) t3 = time() #print("t1: ", t1-t2, " t2: ", t2-t3) print("k: ", k) print(a) cl.enqueue_copy(queue, a, a_buf) #print a
def enqueue_wait_for_events(self, events: Sequence[Event]) -> None:
    """Enqueue a barrier that waits for *events* before later commands run.

    A no-op when *events* is empty, because OpenCL has some odd semantics
    for an empty wait list.
    """
    if not events:
        return
    wait_list = [event._pyopencl_event for event in events]
    pyopencl.enqueue_barrier(self._pyopencl_command_queue, wait_list)
def __forward_ocl_vec4_interleaved(self, x, h0, c0, sm=False):
    """Run the LSTM forward pass with the vec4-interleaved OpenCL kernels.

    Weights and bias are interleaved so that the i/f/c/o gate columns sit
    next to each other in groups of 4 (SIMD-friendly layout for the
    lstm_vec4_interleaved.cl kernels).

    Parameters: x (seq_len, batch, input), h0/c0 initial hidden/cell
    state; `sm` is unused here. Returns ((hy[1:], hy[-1:], cy[-1:]),
    kernel execution time in milliseconds).
    # assumes x, h0, c0 are float32-compatible numpy arrays — TODO confirm
    """
    def interleave(matrix):
        # 1-D input (the bias): interleave via a temporary 1-row matrix.
        if len(matrix.shape) == 1:
            return np.squeeze(interleave(np.expand_dims(matrix, axis=0)),
                              axis=0).copy()
        new = np.zeros_like(matrix)
        # Split the 4 gate blocks and re-lay them out column-interleaved.
        a, b, c, d = np.hsplit(matrix, 4)
        simd = 4
        rng = np.arange(0, matrix.shape[1], simd)
        new[:, rng] = a
        new[:, rng + 1] = b
        new[:, rng + 2] = c
        new[:, rng + 3] = d
        return new.copy()

    seq_len = x.shape[0]
    batch_size = x.shape[1]
    # Stack input and recurrent weights into one (in+hidden, 4*hidden) GEMM
    # operand, interleaved per the kernel's layout.
    weights = interleave(
        np.concatenate((np.transpose(self.Wi), np.transpose(self.Wh)),
                       0)).T.astype(np.float32)
    ifcos = np.zeros((batch_size, 4 * self.hidden_size)).astype(np.float32)
    # hy/cy carry one extra leading slot for the initial state.
    hy = np.zeros(
        (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
    cy = np.zeros(
        (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
    hy[0] = h0
    cy[0] = c0
    platform = cl.get_platforms()[0]  # Select the first platform [0]
    device = platform.get_devices()[
        0]  # Select the first device on this platform [0]
    context = cl.Context([device])  # Create a context with your device
    queue = cl.CommandQueue(
        context)  # Create a command queue with your context
    # Allocate on device
    x_gpu = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR,
                      hostbuf=x.copy('C'))  # cuda_alloc(x)
    weights_gpu = cl.Buffer(
        context, cl.mem_flags.COPY_HOST_PTR,
        hostbuf=weights.copy('C'))  # cuda_alloc(weights)
    bias_gpu = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR,
                         hostbuf=interleave(
                             self.B).copy('C'))  # cuda_alloc(self.B)
    ifcos_gpu = cl.Buffer(context,
                          cl.mem_flags.READ_WRITE |
                          cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=ifcos.copy('C'))  # cuda_alloc(ifcos)
    hy_gpu = cl.Buffer(context,
                       cl.mem_flags.READ_WRITE |
                       cl.mem_flags.COPY_HOST_PTR,
                       hostbuf=hy.copy('C'))  # cuda_alloc(hy)
    cy_gpu = cl.Buffer(context,
                       cl.mem_flags.READ_WRITE |
                       cl.mem_flags.COPY_HOST_PTR,
                       hostbuf=cy.copy('C'))  # cuda_alloc(cy)
    kernelSource = ''
    kernelFilename = 'lstm_vec4_interleaved.cl'
    with open(kernelFilename, 'r') as file:
        kernelSource = file.read()
    program = cl.Program(context, kernelSource).build()
    # GEMM problem size: (batch) x (in+hidden) x (4*hidden).
    M = np.int32(batch_size)
    K = np.int32(self.input_size + self.hidden_size)
    N = np.int32(4 * self.hidden_size)
    gemm_lws = (8, 8, 1)
    gemm_gws = int(M), int(N), gemm_lws[2]
    eltwise_lws = (8, 8, 1)
    eltwise_gws = int(M), int(self.hidden_size), eltwise_lws[2]
    events = []
    # One GEMM + one element-wise gate kernel per timestep, with barriers
    # enforcing ordering between them.
    for i in range(0, seq_len):
        gemm_kernel = program.lstm_gemm
        gemm_kernel.set_args(x_gpu, hy_gpu, weights_gpu, bias_gpu,
                             ifcos_gpu, M, K, N,
                             np.int32(self.input_size),
                             np.int32(self.hidden_size), np.int32(i))
        ev1 = cl.enqueue_nd_range_kernel(queue, gemm_kernel, gemm_gws,
                                         gemm_lws)
        events.append(ev1)
        cl.enqueue_barrier(queue)
        eltwise_kernel = program.lstm_eltwise
        eltwise_kernel.set_args(cy_gpu, ifcos_gpu, hy_gpu,
                                np.int32(self.hidden_size),
                                np.int32(batch_size), np.int32(i))
        ev2 = cl.enqueue_nd_range_kernel(queue, eltwise_kernel,
                                         eltwise_gws, eltwise_lws)
        events.append(ev2)
        cl.enqueue_barrier(queue)
    # Timing measures only the wait on already-enqueued kernels.
    timer_start = datetime.datetime.now()
    cl.wait_for_events(events)
    execution_time = (datetime.datetime.now() -
                      timer_start).total_seconds() * 1000
    cl.enqueue_copy(queue, ifcos, ifcos_gpu)
    cl.enqueue_copy(queue, hy, hy_gpu)
    cl.enqueue_copy(queue, cy, cy_gpu)
    queue.finish()
    # Copy the data for array c back to the host
    results = hy[1:], hy[-1:], cy[-1:]
    return results, execution_time
def __forward_ocl_naive(self, x, h0, c0, acc):
    """Run the LSTM forward pass with the naive (one work-item per output
    element) OpenCL kernels.

    Parameters: x (seq_len, batch, input), h0/c0 initial hidden/cell
    state; `acc` selects the accumulator kernel variant
    (lstm_naive_acc.cl) over the plain one (lstm_naive.cl). Returns
    ((hy[1:], hy[-1:], cy[-1:]), kernel execution time in milliseconds).
    # assumes x, h0, c0 are float32-compatible numpy arrays — TODO confirm
    """
    seq_len = x.shape[0]
    batch_size = x.shape[1]
    # Stack input and recurrent weights into one (in+hidden, 4*hidden)
    # GEMM operand; no interleaving in the naive layout.
    weights = np.concatenate(
        (np.transpose(self.Wi), np.transpose(self.Wh)),
        0).astype(np.float32)
    ifcos = np.zeros((batch_size, 4 * self.hidden_size)).astype(np.float32)
    # hy/cy carry one extra leading slot for the initial state.
    hy = np.zeros(
        (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
    cy = np.zeros(
        (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
    hy[0] = h0
    cy[0] = c0
    platform = cl.get_platforms()[0]  # Select the first platform [0]
    device = platform.get_devices()[
        0]  # Select the first device on this platform [0]
    context = cl.Context([device])  # Create a context with your device
    queue = cl.CommandQueue(
        context)  # Create a command queue with your context
    # Allocate on device
    x_gpu = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR,
                      hostbuf=x.copy('C'))  # cuda_alloc(x)
    weights_gpu = cl.Buffer(
        context, cl.mem_flags.COPY_HOST_PTR,
        hostbuf=weights.copy('C'))  # cuda_alloc(weights)
    bias_gpu = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR,
                         hostbuf=self.B.copy('C'))  # cuda_alloc(self.B)
    ifcos_gpu = cl.Buffer(context,
                          cl.mem_flags.READ_WRITE |
                          cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=ifcos.copy('C'))  # cuda_alloc(ifcos)
    hy_gpu = cl.Buffer(context,
                       cl.mem_flags.READ_WRITE |
                       cl.mem_flags.COPY_HOST_PTR,
                       hostbuf=hy.copy('C'))  # cuda_alloc(hy)
    cy_gpu = cl.Buffer(context,
                       cl.mem_flags.READ_WRITE |
                       cl.mem_flags.COPY_HOST_PTR,
                       hostbuf=cy.copy('C'))  # cuda_alloc(cy)
    kernelSource = ''
    kernelFilename = 'lstm_naive.cl' if acc is False else 'lstm_naive_acc.cl'
    with open(kernelFilename, 'r') as file:
        kernelSource = file.read()
    program = cl.Program(context, kernelSource).build()
    # GEMM problem size: (batch) x (in+hidden) x (4*hidden).
    M = np.int32(batch_size)
    K = np.int32(self.input_size + self.hidden_size)
    N = np.int32(4 * self.hidden_size)
    gemm_lws = (1, 1, 1)
    gemm_gws = int(M), int(N), gemm_lws[2]
    eltwise_lws = (1, 1, 1)
    eltwise_gws = int(M), int(self.hidden_size), eltwise_lws[2]
    events = []
    # One GEMM + one element-wise gate kernel per timestep, with barriers
    # enforcing ordering between them.
    for i in range(0, seq_len):
        gemm_kernel = program.lstm_gemm
        gemm_kernel.set_args(x_gpu, hy_gpu, weights_gpu, bias_gpu,
                             ifcos_gpu, M, K, N,
                             np.int32(self.input_size),
                             np.int32(self.hidden_size), np.int32(i))
        ev1 = cl.enqueue_nd_range_kernel(queue, gemm_kernel, gemm_gws,
                                         gemm_lws)
        events.append(ev1)
        cl.enqueue_barrier(queue)
        eltwise_kernel = program.lstm_eltwise
        eltwise_kernel.set_args(cy_gpu, ifcos_gpu, hy_gpu,
                                np.int32(self.hidden_size),
                                np.int32(batch_size), np.int32(i))
        ev2 = cl.enqueue_nd_range_kernel(queue, eltwise_kernel,
                                         eltwise_gws, eltwise_lws)
        events.append(ev2)
        cl.enqueue_barrier(queue)
    # Timing measures only the wait on already-enqueued kernels.
    timer_start = datetime.datetime.now()
    cl.wait_for_events(events)
    execution_time = (datetime.datetime.now() -
                      timer_start).total_seconds() * 1000
    cl.enqueue_copy(queue, ifcos, ifcos_gpu)
    cl.enqueue_copy(queue, hy, hy_gpu)
    cl.enqueue_copy(queue, cy, cy_gpu)
    queue.finish()
    results = hy[1:], hy[-1:], cy[-1:]
    return results, execution_time
def run(self):
    """Enqueue one Solve kernel launch over the grid, barrier the queue,
    and return self for chaining."""
    solver_args = (self.A, self.b, self.x, self.threshold)
    self.program.Solve(self.queue, self.shape[2:5], None, *solver_args)
    cl.enqueue_barrier(self.queue)
    return self
def advance(self, obj, shift=False, corr=None, shcorr=None):
    '''
    Propagate a field through the current slab and transmit it through
    an interface with the next slab characterized by object contrast obj.
    The transmission overwrites the refractive index of the current slab
    with the interface reflection coefficients.

    If shift is True, the forward field is shifted by half a slab to
    agree with full-wave solutions and includes a backward-traveling
    contribution caused by reflection from the interface with the next
    slab.

    The relevant result (either the forward field or the half-shifted
    combined field) is copied into a device-side buffer for later
    retrieval and handling.

    If corr is not None, it should be a tuple as specified in the reset()
    docstring to override the default use of corrective terms in the
    spectral propagator. The argument shcorr is interpreted exactly as
    corr, but is used instead of corr for the propagation used to shift
    the field to the center of the slab.
    '''
    prog, grid = self.prog, self.grid
    fwdque, recvque, sendque = self.fwdque, self.recvque, self.sendque
    # Point to the field components
    fwd, bck, buf = [f for f in self.fld]
    if shift:
        # Ensure that a prior copy isn't using the buffer
        buf.sync(fwdque)
        # Copy the forward field for shifting if necessary
        cl.enqueue_copy(fwdque, buf, fwd)
    # Copy the sound speed extrema for the current slab
    speedlim = list(self.speedlim)
    # Push the next slab to its buffer (overwrites speed extrema)
    ocur, onxt, obevt = self.objupdate(obj)
    if self.phasetol is not None:
        # Figure maximum propagation distance to not
        # exceed maximum permissible phase deviation
        dzl = []
        for spd in speedlim:
            # Sign governs the sign of the phase deviation,
            # which is irrelevant, so ignore it here
            spdiff = max(abs(spd - 1.), 1e-8)
            # Preventing spdiff from reaching zero limits
            # maximum permissible propagation distance
            dzl.append(abs(0.5 * self.phasetol * spd / spdiff))
        # Subdivide the slab into maximum propagation distance
        nsteps = max(1, int(np.round(self.dz / min(dzl))))
    else:
        nsteps = 1
    dz = self.dz / nsteps
    # Ensure that no prior copy is using the field buffer
    fwd.sync(fwdque)
    # Propagate the forward field through the slab on the fwdque
    for i in range(nsteps):
        self.propagate(fwd, dz, corr=corr)
    # Ensure next slab has been received before handling interface
    cl.enqueue_barrier(fwdque, wait_for=[obevt])
    # Compute transmission through the interface
    # The reflected field is only of interest if a shift is desired
    transevt = prog.txreflect(fwdque, grid, None, fwd,
                              bck if shift else None, ocur, onxt)
    # Hold the current contrast slab until the transmission is done
    ocur.attachevent(transevt)
    if shift:
        # Add the forward and backward fields
        prog.caxpy(fwdque, grid, None, buf, np.float32(1.), buf, bck)
        # Propagate the combined field a half step
        # Save the propagation event for delaying result copies
        pevt = self.propagate(buf, 0.5 * self.dz, corr=shcorr)
        # Handle Goertzel iterations to compute the Fourier
        # transform of the contrast source on the unit sphere
        if self._goertzel:
            # Compute the FFT of the source in the XY plane
            crt = self.scratch[0]
            prog.ctmul(fwdque, grid, None, crt, ocur, buf)
            self.fftplan.execute(crt)
            # Compute the next Goertzel iteration
            pn1, pn2 = self.goertzbuf
            dz = np.float32(self.dz)
            # The final argument (slab count) is not yet used
            nz = np.int32(0)
            prog.goertzelfft(fwdque, grid, None, pn1, pn2, crt, dz, nz)
            # Cycle the Goertzel buffers
            self.goertzbuf = [pn2, pn1]
        else:
            # Copy the shifted field into the result buffer
            # No result sync necessary, all mods occur on sendque
            evt = cl.enqueue_copy(sendque, self.result, buf,
                                  wait_for=[pevt])
            # Attach the copy event to the source buffer
            buf.attachevent(evt)
    else:
        # Copy the forward field into the result buffer
        # Wait for transmissions to finish for consistency
        evt = cl.enqueue_copy(sendque, self.result, fwd,
                              wait_for=[transevt])
        # Attach the copy event to the field buffer
        fwd.attachevent(evt)
def baum_welch(sequence, transition_probs, symbol_probs, initial_dist,
               accuracy=1e-3, maxit=1):
    """Run Baum-Welch HMM re-estimation on the GPU.

    Parameters: observation `sequence`, transition matrix A
    (`transition_probs`, N x N), emission matrix B (`symbol_probs`,
    N x M), initial distribution pi (`initial_dist`), convergence
    `accuracy` and iteration cap `maxit`.

    Returns (transition_probs, symbol_probs, initial_dist, new_prob, it).

    NOTE(review): relies on module-level globals `context`, `queue`,
    `kernel`, `blocks`, and the kernel-wrapper functions (forward,
    backward_naive, ...) — verify they are initialized before calling.
    """
    # Model parameters live in read-write device buffers; updated in place.
    A = pyopencl.Buffer(
        context,
        pyopencl.mem_flags.READ_WRITE | pyopencl.mem_flags.COPY_HOST_PTR,
        hostbuf=transition_probs)
    B = pyopencl.Buffer(
        context,
        pyopencl.mem_flags.READ_WRITE | pyopencl.mem_flags.COPY_HOST_PTR,
        hostbuf=symbol_probs)
    pi = pyopencl.Buffer(
        context,
        pyopencl.mem_flags.READ_WRITE | pyopencl.mem_flags.COPY_HOST_PTR,
        hostbuf=initial_dist)
    ob = pyopencl.Buffer(
        context,
        pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.COPY_HOST_PTR,
        hostbuf=sequence)
    T = len(sequence)
    N = len(transition_probs)
    M = len(symbol_probs[0])
    # Scratch buffers for the forward/backward variables and reductions.
    alpha = pyopencl.Buffer(
        context, pyopencl.mem_flags.READ_WRITE,
        T * N * numpy.dtype('float32').itemsize)
    beta = pyopencl.Buffer(
        context, pyopencl.mem_flags.READ_WRITE,
        T * N * numpy.dtype('float32').itemsize)
    matrix_buffer = pyopencl.Buffer(
        context, pyopencl.mem_flags.READ_WRITE,
        T * N * N * numpy.dtype('float32').itemsize)
    scratch = pyopencl.LocalMemory(
        2 * kernel.WORK_GROUP_SIZE * N * N *
        numpy.dtype('float32').itemsize)
    reduced = pyopencl.Buffer(
        context, pyopencl.mem_flags.READ_WRITE,
        blocks * N * N * numpy.dtype('float32').itemsize)
    probability = pyopencl.Buffer(
        context, pyopencl.mem_flags.WRITE_ONLY,
        numpy.dtype('float32').itemsize)
    old_prob = 0.0
    new_prob = old_prob + accuracy + 1
    it = 0
    # NOTE(review): the convergence test is commented out; the loop
    # currently runs exactly `maxit` iterations.
    while it < maxit:  # abs(new_prob - old_prob) > accuracy and it < maxit:
        forward(ob, A, B, pi, T, N, alpha, matrix_buffer, scratch, reduced)
        # forward_naive(ob, A, B, pi, T, N, alpha, matrix_buffer, scratch)
        # Host-side barrier waits: forward must finish before backward.
        e = pyopencl.enqueue_barrier(queue)
        e.wait()
        backward_naive(ob, A, B, T, N, beta, matrix_buffer, scratch)
        e = pyopencl.enqueue_barrier(queue)
        e.wait()
        # E-step: per-timestep transition and state probabilities.
        transition_probabilities(alpha, beta, A, B, ob, T, matrix_buffer)
        state_probabilities(alpha, beta, T)
        # Expected counts, then the M-step update of A, B, pi.
        transitions = transition_counts(matrix_buffer, T - 1, N, scratch)
        states = state_counts(alpha, T - 1, N, scratch)
        symbols = symbol_counts(alpha, ob, T, N, M, scratch)
        update(A, B, pi, alpha, transitions, states, symbols, probability,
               N, T)
        if it > 0:
            old_prob = new_prob
        new_prob = numpy.array((1), numpy.float32)
        pyopencl.enqueue_copy(queue, new_prob, probability)
        it = it + 1
    # Read the re-estimated model back to the host.
    transition_probs = numpy.zeros_like(transition_probs)
    symbol_probs = numpy.zeros_like(symbol_probs)
    initial_dist = numpy.zeros_like(initial_dist)
    pyopencl.enqueue_copy(queue, transition_probs, A)
    pyopencl.enqueue_copy(queue, symbol_probs, B)
    pyopencl.enqueue_copy(queue, initial_dist, pi)
    return transition_probs, symbol_probs, initial_dist, new_prob, it
def trackContour(self):
    """Run one tracking iteration: detect the membrane along each
    detection ray, filter bad detections, interpolate the contour on a
    polar grid, and test for convergence against the previous iteration.
    """
    if self.resetNormalsAfterEachImage and not self.getContourId(
    ) == 0 and self.nrOfTrackingIterations == 0:
        # reset contour normal vector to radial vectors; we do this only
        # starting for the second, since doing this for image 0, would
        # destroy the correspondence of the indexes of the contour
        # coordinates to their corresponding contour normals
        cl.enqueue_copy_buffer(
            self.queue, self.dev_radialVectorsX.data,
            self.dev_membraneNormalVectorsX.data).wait()
        cl.enqueue_copy_buffer(
            self.queue, self.dev_radialVectorsY.data,
            self.dev_membraneNormalVectorsY.data).wait()
    # tracking status variables
    self.nrOfTrackingIterations = self.nrOfTrackingIterations + 1
    stopInd = 1
    self.trackingFinished = np.array(1, dtype=np.int32)  # True
    self.dev_trackingFinished = cl_array.to_device(self.queue,
                                                   self.trackingFinished)
    self.iterationFinished = np.array(0, dtype=np.int32)  # True
    self.dev_iterationFinished = cl_array.to_device(
        self.queue, self.iterationFinished)
    # Pack the per-component (X/Y) arrays into double2 vectors for the
    # kernels below.
    self.dev_membraneCoordinates = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_membraneCoordinatesX,
        self.dev_membraneCoordinatesY)
    self.dev_membraneNormalVectors = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_membraneNormalVectorsX,
        self.dev_membraneNormalVectorsY)
    self.dev_previousInterpolatedMembraneCoordinates = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_previousInterpolatedMembraneCoordinatesX,
        self.dev_previousInterpolatedMembraneCoordinatesY)
    self.dev_membranePolarCoordinates = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_membranePolarTheta,
        self.dev_membranePolarRadius)
    self.dev_interpolatedMembraneCoordinates = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_interpolatedMembraneCoordinatesX,
        self.dev_interpolatedMembraneCoordinatesY)
    # NOTE(review): the stride loop is assumed to cover only the
    # detection kernel (each stride handles one chunk of the coordinate
    # array) — confirm against the original indentation.
    for strideNr in range(self.nrOfStrides):
        # set the starting index of the coordinate array for each kernel
        # instance
        kernelCoordinateStartingIndex = np.int32(
            strideNr * self.detectionKernelStrideSize)
        self.prg.findMembranePosition(self.queue, self.trackingGlobalSize, self.trackingWorkGroupSize, self.sampler, \
                                      self.dev_Img, self.imgSizeX, self.imgSizeY, \
                                      self.buf_localRotationMatrices, \
                                      self.buf_linFitSearchRangeXvalues, \
                                      self.linFitParameter, \
                                      cl.LocalMemory(self.fitIntercept_memSize), cl.LocalMemory(self.fitIncline_memSize), \
                                      cl.LocalMemory(self.rotatedUnitVector_memSize), \
                                      self.meanParameter, \
                                      self.buf_meanRangeXvalues, self.meanRangePositionOffset, \
                                      cl.LocalMemory(self.localMembranePositions_memSize), \
                                      self.dev_membraneCoordinates.data, \
                                      self.dev_membraneNormalVectors.data, \
                                      self.dev_fitInclines.data, \
                                      kernelCoordinateStartingIndex, \
                                      self.inclineTolerance, \
                                      self.inclineRefinementRange)
        barrierEvent = cl.enqueue_barrier(self.queue)
    # Replace NaN detections by neighbor-based values.
    self.prg.filterNanValues(self.queue, self.gradientGlobalSize, None, \
                             self.dev_membraneCoordinates.data, \
                             self.dev_membraneNormalVectors.data, \
                             cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes), cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes) \
                             )
    barrierEvent = cl.enqueue_barrier(self.queue)
    # Reject coordinates that jumped too far from the previous contour.
    self.prg.filterJumpedCoordinates(self.queue, self.gradientGlobalSize, None, \
                                     self.dev_previousContourCenter.data, \
                                     self.dev_membraneCoordinates.data, \
                                     self.dev_membraneNormalVectors.data, \
                                     self.dev_previousInterpolatedMembraneCoordinates.data, \
                                     cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes), \
                                     cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes), \
                                     cl.LocalMemory(self.listOfGoodCoordinates_memSize), \
                                     self.maxCoordinateShift \
                                     )
    barrierEvent = cl.enqueue_barrier(self.queue)
    self.prg.calculateInterCoordinateAngles(self.queue, self.gradientGlobalSize, None, \
                                            self.dev_interCoordinateAngles.data, \
                                            self.dev_membraneCoordinates.data \
                                            )
    barrierEvent = cl.enqueue_barrier(self.queue)
    # Reject coordinates whose neighbor angles exceed the tolerance.
    self.prg.filterIncorrectCoordinates(self.queue, self.gradientGlobalSize, None, \
                                        self.dev_previousContourCenter.data, \
                                        self.dev_interCoordinateAngles.data, \
                                        self.dev_membraneCoordinates.data, \
                                        self.dev_membraneNormalVectors.data, \
                                        cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes), cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes), \
                                        self.maxInterCoordinateAngle \
                                        )
    barrierEvent = cl.enqueue_barrier(self.queue)
    # information regarding barriers:
    # http://stackoverflow.com/questions/13200276/what-is-the-difference-between-clenqueuebarrier-and-clfinish
    ########################################################################
    ### Calculate contour center
    ########################################################################
    self.calculateContourCenter()
    ########################################################################
    ### Convert cartesian coordinates to polar coordinates
    ########################################################################
    self.prg.cart2pol(self.queue, self.gradientGlobalSize, None, \
                      self.dev_membraneCoordinates.data, \
                      self.dev_membranePolarCoordinates.data, \
                      self.dev_contourCenter.data)
    barrierEvent = cl.enqueue_barrier(self.queue)
    ########################################################################
    ### Interpolate polar coordinates
    ########################################################################
    self.prg.sortCoordinates(self.queue, (1, 1), None, \
                             self.dev_membranePolarCoordinates.data, \
                             self.dev_membraneCoordinates.data, \
                             self.dev_membraneNormalVectors.data, \
                             np.int32(self.nrOfDetectionAngleSteps) \
                             )
    barrierEvent = cl.enqueue_barrier(self.queue)
    self.prg.interpolatePolarCoordinatesLinear(self.queue, self.gradientGlobalSize, None, \
                                               self.dev_membranePolarCoordinates.data, \
                                               self.dev_radialVectors.data, \
                                               self.dev_contourCenter.data, \
                                               self.dev_membraneCoordinates.data, \
                                               self.dev_interpolatedMembraneCoordinates.data, \
                                               self.dev_interpolationAngles.data, \
                                               self.nrOfAnglesToCompare \
                                               )
    barrierEvent = cl.enqueue_barrier(self.queue)
    ########################################################################
    ### Convert polar coordinates to cartesian coordinates
    ########################################################################
    # Convergence tests: flip dev_trackingFinished to 0 if any coordinate
    # or the contour center moved more than the tolerance.
    self.prg.checkIfTrackingFinished(self.queue, self.gradientGlobalSize, None, \
                                     self.dev_interpolatedMembraneCoordinates.data, \
                                     self.dev_previousInterpolatedMembraneCoordinates.data, \
                                     self.dev_trackingFinished.data, \
                                     self.coordinateTolerance)
    barrierEvent = cl.enqueue_barrier(self.queue)
    self.prg.checkIfCenterConverged(self.queue, (1, 1), None, \
                                    self.dev_contourCenter.data, \
                                    self.dev_previousContourCenter.data, \
                                    self.dev_trackingFinished.data, \
                                    self.centerTolerance)
    barrierEvent = cl.enqueue_barrier(self.queue)
    # Unpack the double2 vectors back into per-component arrays.
    self.dev_membraneNormalVectorsX, self.dev_membraneNormalVectorsY = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_membraneNormalVectors)
    self.dev_previousInterpolatedMembraneCoordinatesX, self.dev_previousInterpolatedMembraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_previousInterpolatedMembraneCoordinates)
    self.dev_membraneCoordinatesX, self.dev_membraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_membraneCoordinates)
    self.dev_membranePolarTheta, self.dev_membranePolarRadius = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_membranePolarCoordinates)
    self.dev_interpolatedMembraneCoordinatesX, self.dev_interpolatedMembraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_interpolatedMembraneCoordinates)
    # NOTE(review): cl.enqueue_read_buffer / cl.enqueue_copy_buffer are
    # deprecated PyOpenCL APIs (replaced by cl.enqueue_copy).
    cl.enqueue_read_buffer(self.queue, self.dev_trackingFinished.data,
                           self.trackingFinished).wait()
    barrierEvent = cl.enqueue_barrier(self.queue)
    # Save this iteration's interpolated contour and center as the
    # reference for the next iteration.
    cl.enqueue_copy_buffer(
        self.queue, self.dev_interpolatedMembraneCoordinatesX.data,
        self.dev_previousInterpolatedMembraneCoordinatesX.data).wait()
    cl.enqueue_copy_buffer(
        self.queue, self.dev_interpolatedMembraneCoordinatesY.data,
        self.dev_previousInterpolatedMembraneCoordinatesY.data).wait()
    cl.enqueue_copy_buffer(self.queue, self.dev_contourCenter.data,
                           self.dev_previousContourCenter.data).wait()
    self.prg.setIterationFinished(self.queue, (1, 1), None,
                                  self.dev_iterationFinished.data)
    barrierEvent = cl.enqueue_barrier(self.queue)
    cl.enqueue_read_buffer(self.queue, self.dev_iterationFinished.data,
                           self.iterationFinished).wait()
    self.setStartingCoordinatesNew(self.dev_interpolatedMembraneCoordinatesX, \
                                   self.dev_interpolatedMembraneCoordinatesY)
    pass
def run(self):
    # Launch the Solve kernel once over the 3-D grid given by
    # self.shape[2:5]; no explicit work-group size (None).
    self.program.Solve(self.queue, self.shape[2:5], None, self.A, self.b,
                       self.x, self.threshold)
    # Barrier orders later queue commands after the solve; it does not
    # block the host (no .wait()).
    cl.enqueue_barrier(self.queue)
    # Return self to allow fluent chaining.
    return self