def toGpu(self, data):
    import pyopencl as cl
    gpu_buf = self.allocate(data.shape, data.dtype)
    queue = self._createQueue()
    cl.enqueue_copy(queue, gpu_buf, data, is_blocking=True)
    return gpu_buf
def __call__(self, forward, backward, costs, features, origin, h):
    mf = cl.mem_flags
    w_features = features.astype(np.float64)
    cummulated = costs * 1  # make a copy of costs to receive the result

    forward_buffer = cl.Buffer(self.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=forward)
    backward_buffer = cl.Buffer(self.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=backward)
    cost_buffer = cl.Buffer(self.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=costs)
    cumm_buffer = cl.Buffer(self.context, mf.READ_WRITE, costs.nbytes)
    feature_buffer = cl.Buffer(self.context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=w_features)
    origin_buffer = cl.Buffer(self.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=origin)

    self.henryCummulated.cummulated1(self.queue, costs.shape, None,
                                     np.int32(w_features.shape[2]), np.int32(w_features.shape[1]),
                                     origin_buffer, np.int32(h), self.idirection_buffer,
                                     forward_buffer, backward_buffer, cost_buffer, cumm_buffer)
    self.henryCummulated.cummulated2(self.queue, costs.shape, None,
                                     np.int32(w_features.shape[2]), np.int32(w_features.shape[1]),
                                     np.int32(features.shape[3]), cumm_buffer, feature_buffer)

    cl.enqueue_copy(self.queue, cummulated, cumm_buffer)
    cl.enqueue_copy(self.queue, w_features, feature_buffer)
    return cummulated, w_features
def test_enqueue_task(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    prg = cl.Program(ctx, """
    __kernel void reverse(__global const float *in, __global float *out, int n)
    {
        for (int i = 0; i < n; i++) {
            out[i] = in[n - 1 - i];
        }
    }
    """).build()
    knl = prg.reverse

    n = 100
    a = np.random.rand(n).astype(np.float32)
    b = np.empty_like(a)

    buf1 = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    buf2 = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

    knl.set_args(buf1, buf2, np.int32(n))
    cl.enqueue_task(queue, knl)
    cl.enqueue_copy(queue, b, buf2).wait()

    assert la.norm(a[::-1] - b) == 0
def fromGpu(self, gpu_buf, target_shape, target_dtype):
    import pyopencl as cl
    data = numpy.empty(target_shape, target_dtype)
    queue = self._createQueue()
    cl.enqueue_copy(queue, data, gpu_buf, is_blocking=True)
    return data
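# A minimal standalone round-trip sketch of the toGpu/fromGpu pattern above,
# assuming any available OpenCL device. The context/queue setup here is an
# illustration, not part of the original backend class.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

host = np.arange(16, dtype=np.float32)
dev = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, host.nbytes)

cl.enqueue_copy(queue, dev, host, is_blocking=True)   # host -> device
back = np.empty_like(host)
cl.enqueue_copy(queue, back, dev, is_blocking=True)   # device -> host
assert np.array_equal(host, back)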
def run(self, kernel, shape, *args):
    kargs = []
    for arg in args:
        if isinstance(arg, np.ndarray):
            if id(arg) in self.buffers:
                buf = self.buffers[id(arg)]
                cl.enqueue_copy(self.runtime.queues[0], buf, arg)
            else:
                flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
                buf = cl.Buffer(self.runtime.context, flags, arg.nbytes, hostbuf=arg)
                self.buffers[id(arg)] = buf
            kargs.append(buf)
        else:
            kargs.append(np.float32(arg))

    # TODO: use user-supplied information if necessary
    first_np_array = [a for a in args if isinstance(a, np.ndarray)][0]
    workspace = shape if shape else first_np_array.shape

    if self.output is None:
        self.output = np.empty(workspace).astype(np.float32)
        out_buffer = cl.Buffer(self.runtime.context, cl.mem_flags.WRITE_ONLY, self.output.nbytes)
        self.buffers[id(self.output)] = out_buffer
    else:
        out_buffer = self.buffers[id(self.output)]
    kargs.append(out_buffer)

    start = time.time()
    kernel(self.runtime.queues[0], workspace, None, *kargs)
    cl.enqueue_copy(self.runtime.queues[0], self.output, out_buffer)
    self.time = time.time() - start

    return self.output
def runTest(self):
    nx, ny, nz, str_f, pt0, pt1 = self.args
    slidx = common.slice_index_two_points(pt0, pt1)
    str_fs = common.convert_to_tuple(str_f)

    # instance
    gpu_devices = common_gpu.gpu_device_list(print_info=False)
    context = cl.Context(gpu_devices)
    device = gpu_devices[0]
    fields = Fields(context, device, nx, ny, nz, '')
    getf = GetFields(fields, str_f, pt0, pt1)

    # host allocations
    eh_dict = {}
    for sf in str_fs:
        eh_dict[sf] = np.random.rand(*fields.ns).astype(fields.dtype)
        cl.enqueue_copy(fields.queue, fields.get_buf(sf), eh_dict[sf])

    # verify
    getf.get_event().wait()
    for str_f in str_fs:
        original = eh_dict[str_f][slidx]
        copy = getf.get_fields(str_f)
        self.assertEqual(np.abs(original - copy).max(), 0, self.args)
def _set(self, ary):
    # Allocate a new buffer with suitable padding and assign
    buf = np.zeros(self.datashape, dtype=self.dtype)
    buf[..., :self.ioshape[-1]] = ary

    # Copy
    cl.enqueue_copy(self.backend.qdflt, self.data, buf)
def _set(self, ary):
    # Allocate a new buffer with suitable padding and pack it
    buf = np.zeros((self.nrow, self.leaddim), dtype=self.dtype)
    buf[:, :self.ncol] = self._pack(ary)

    # Copy
    cl.enqueue_copy(self.backend.qdflt, self.data, buf)
def generate(self, chunk_array, ctx, queue, heightmap_kernel):
    assert isinstance(chunk_array, ChunkArray)
    hmap = self._generate_hmap()

    x_bounds = (0, 8)
    y_bounds = (0, 8)

    for z in range(1):
        chunk_array.allocate_layer(z, x_bounds, y_bounds)
        for x in range(8):
            for y in range(8):
                chunk_array.allocate_chunk(x, y, z, level=0)
    print("allocated!")

    ihmap = numpy.empty((256, 256), dtype=numpy.int32)
    for x in range(256):
        for y in range(256):
            height = hmap[x, y]
            ihmap[x, y] = int(max(min(height * 7.4 + 8, 32), 0))

    """for x in range(256):
        print(x)
        for y in range(256):
            height = hmap[x, y]
            z_max = int(max(min(height + 8, 32), 0))
            for z in range(32):
                voxel = chunk_array.get_voxel(x, y, z)
                voxel['flags'] = 0 if z_max < z else 1"""

    chunk_array.upload_buffers()
    buffer = pyopencl.Buffer(ctx, pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.COPY_HOST_PTR,
                             hostbuf=ihmap)
    #pyopencl.enqueue_copy(queue, buffer, hmap)
    heightmap_kernel(queue, (255, 255, 32), None, chunk_array.array_buffer._d_buffer, buffer)
    pyopencl.enqueue_copy(queue, chunk_array.voxel_data.level_buffers[0]._h_buffer,
                          chunk_array.voxel_data.level_buffers[0]._d_buffer)
    chunk_array.upload_buffers()
def to_host(queue, data, dtype, start, shape, elemstrides, is_blocking=True):
    """Copy memory off the device, into a Numpy array.

    If the requested array is discontiguous, the whole block is copied off
    the device, and a view is created to show the appropriate part.
    """
    if min(elemstrides) < 0:
        raise NotImplementedError()

    m, n = shape
    sm, sn = elemstrides
    if m * n == 0:
        return np.zeros(shape, dtype=dtype)

    itemsize = dtype.itemsize
    bytestart = itemsize * start
    bytelen = itemsize * ((m - 1) * sm + (n - 1) * sn + 1)

    temp_buf = np.zeros(bytelen, dtype=np.int8)
    cl.enqueue_copy(queue, temp_buf, data,
                    device_offset=bytestart, is_blocking=is_blocking)

    bytestrides = (itemsize * sm, itemsize * sn)
    return np.ndarray(shape=(m, n), dtype=dtype, buffer=temp_buf.data,
                      offset=0, strides=bytestrides)
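# A usage sketch for to_host above: pull a 2x2 sub-block out of a row-major
# 4x4 device array as a strided view instead of element-by-element copies.
# The array contents, start index, and strides here are illustrative.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

a = np.arange(16, dtype=np.float32).reshape(4, 4)
a_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=a)

# a[1:3, 1:3] starts at flat element 5; the parent's element strides are (4, 1)
sub = to_host(queue, a_buf, np.dtype(np.float32), start=5,
              shape=(2, 2), elemstrides=(4, 1))
assert np.array_equal(sub, a[1:3, 1:3])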
def calc_range(start, num, perexec):
    """Calculate the otp-md5, with the configured otp sequence of rounds,
    of the `num` 64-bit numbers starting at `start`."""
    assert num % perexec == 0

    # Boilerplate OpenCL stuff
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Read the program source and compile
    sourcecode = open("otpmd5.cl").read()
    prg = cl.Program(ctx, sourcecode).build()

    for i in range(num // perexec):
        offset = start + (perexec * i)
        host_input = numpy.arange(offset, offset + perexec, dtype=numpy.uint64)
        result = numpy.empty_like(host_input)
        dev_input = cl.Buffer(ctx, mf.READ_ONLY | mf.USE_HOST_PTR, hostbuf=host_input)
        dev_output = cl.Buffer(ctx, mf.READ_WRITE, size=result.size * result.itemsize)
        prg.get_otpmd5_64k_rounds(queue, host_input.shape, None, dev_input, dev_output).wait()
        cl.enqueue_copy(queue, result, dev_output).wait()
        send_output(host_input, result)
def __call__(self, call_details, values, cutoff):
    # type: (CallDetails, np.ndarray, float) -> np.ndarray
    context = self.queue.context

    # Arrange data transfer to card
    details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=call_details.buffer)
    values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                         hostbuf=values)

    # Call kernel and retrieve results
    step = 100
    #print("calling OpenCL")
    for start in range(0, call_details.pd_prod, step):
        stop = min(start + step, call_details.pd_prod)
        args = [
            np.uint32(self.q_input.nq), np.int32(start), np.int32(stop),
            details_b, values_b, self.q_input.q_b, self.result_b,
            self.real(cutoff),
        ]
        self.kernel(self.queue, self.q_input.global_size, None, *args)
    cl.enqueue_copy(self.queue, self.result, self.result_b)

    # Free buffers
    for v in (details_b, values_b):
        if v is not None:
            v.release()

    return self.result[:self.q_input.nq]
def __setitem__(self, item, new_value):
    if isinstance(item, slice) or is_iterable(item):
        raise NotImplementedError('TODO')
    else:
        m, n = self.shape0s[item], self.shape1s[item]
        sm, sn = self.stride0s[item], self.stride1s[item]
        if (sm, sn) in [(1, m), (n, 1)]:
            # contiguous
            clarray = self.getitem_device(item)
            if isinstance(new_value, np.ndarray):
                array = np.asarray(new_value, order='C', dtype=self.dtype)
            else:
                array = np.zeros(clarray.shape, dtype=clarray.dtype)
                array[...] = new_value

            array.shape = clarray.shape  # reshape to avoid warning
            assert equal_strides(array.strides, clarray.strides, clarray.shape)
            clarray.set(array)
        else:
            # discontiguous
            # Copy a contiguous region off the device that surrounds the
            # discontiguous area, set the appropriate values, and copy back
            s = self.starts[item]
            array = to_host(self.queue, self.cl_buf.data, self.dtype,
                            s, (m, n), (sm, sn), is_blocking=True)
            array[...] = new_value

            buf = array.base if array.base is not None else array
            bytestart = self.dtype.itemsize * s
            cl.enqueue_copy(self.queue, self.cl_buf.data, buf,
                            device_offset=bytestart, is_blocking=True)
def test_sub_buffers(ctx_factory):
    ctx = ctx_factory()
    if (ctx._get_cl_version() < (1, 1)
            or cl.get_cl_header_version() < (1, 1)):
        from pytest import skip
        skip("sub-buffers are only available in OpenCL 1.1")

    alignment = ctx.devices[0].mem_base_addr_align

    queue = cl.CommandQueue(ctx)

    n = 30000
    a = (np.random.rand(n) * 100).astype(np.uint8)

    mf = cl.mem_flags
    a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

    start = (5000 // alignment) * alignment
    stop = start + 20 * alignment

    a_sub_ref = a[start:stop]

    a_sub = np.empty_like(a_sub_ref)
    cl.enqueue_copy(queue, a_sub, a_buf[start:stop])

    assert np.array_equal(a_sub, a_sub_ref)
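# The Buffer slice in the test above creates an OpenCL 1.1 sub-buffer. Sub-buffer
# origins must respect CL_DEVICE_MEM_BASE_ADDR_ALIGN, which the spec reports in
# bits. A sketch of carving an aligned window out of a larger buffer; the sizes
# and window position here are illustrative assumptions.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

align_bytes = ctx.devices[0].mem_base_addr_align // 8  # bits -> bytes
data = np.arange(16 * align_bytes, dtype=np.uint8)
buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)

window = buf[4 * align_bytes:8 * align_bytes]  # aligned origin, so this is legal
out = np.empty(4 * align_bytes, dtype=np.uint8)
cl.enqueue_copy(queue, out, window)
assert np.array_equal(out, data[4 * align_bytes:8 * align_bytes])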
def final(config, ctx, queue, program, buffers, debug=False):
    matrixSize = config['matrixSize']
    bandwidth = config['bandwidth']
    partitionNumber = config['partitionNumber']
    partitionSize = config['partitionSize']
    offdiagonalSize = config['offdiagonalSize']
    rhsSize = config['rhsSize']

    xo = np.ones((partitionNumber * (partitionSize - 2 * offdiagonalSize), rhsSize), dtype=np.float32)
    tmp = np.ones((partitionNumber * (partitionSize - 2 * offdiagonalSize), rhsSize), dtype=np.float32)

    mf = cl.mem_flags
    xo_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=xo)
    tmp_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=tmp)

    kernel = program.reconstruct
    kernel.set_scalar_arg_dtypes([None, None, None, None, np.int32, np.int32, np.int32])

    cl.enqueue_barrier(queue)

    kernel(
        queue,
        (partitionNumber,),
        None,
        buffers[1],  # Avwg buffer from factor; check that it is also readable and still valid
        buffers[3],  # x buffer from solve; check that it is still valid
        xo_buf,
        tmp_buf,
        np.int32(partitionSize),
        np.int32(offdiagonalSize),
        np.int32(rhsSize)
    )

    xtb = np.ones((partitionNumber * 2 * offdiagonalSize, rhsSize), dtype=np.float32)
    cl.enqueue_copy(queue, xtb, buffers[3])
    if debug:
        print("X(t,b):")
        print(xtb)

    cl.enqueue_copy(queue, xo, xo_buf)
    if debug:
        print("X':")
        print(xo)

    xtb = sparse.csr_matrix(xtb)
    xo = sparse.csr_matrix(xo)

    x = []
    for i in range(0, partitionNumber):
        t = i * (2 * offdiagonalSize)
        b = (i + 1) * (2 * offdiagonalSize)
        u = i * (partitionSize - 2 * offdiagonalSize)
        v = (i + 1) * (partitionSize - 2 * offdiagonalSize)
        x.append(xtb[t : t + offdiagonalSize, 0 : rhsSize])
        x.append(xo[u : v, 0 : rhsSize])
        x.append(xtb[b - offdiagonalSize : b, 0 : rhsSize])

    return sp.sparse.vstack(x)
def compute(self, simulations, window):
    simulationsOpenCL = simulations.reshape(-1, order='C').astype(np.float32)
    globalSize = (int(self.sizes['reshaped0']), int(self.sizes['reshaped1']))
    localSize = (int(self.openCL.workGroup[0]), int(self.openCL.workGroup[1]))

    mf = cl.mem_flags
    base = np.float32(2)
    chromosomeLength = np.int32(self.sizes['simulationsLength'])
    lim0 = np.int32(self.sizes['originalSimulations'])
    lim1 = np.int32(self.sizes['originalSimulations'])
    window = np.int32(window)
    simulationsBuffer = cl.Buffer(self.openCL.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                  hostbuf=simulationsOpenCL)
    outBuffer = cl.Buffer(self.openCL.ctx, mf.WRITE_ONLY,
                          int((self.sizes['originalSimulations'] ** 2) * np.int32(1).nbytes))

    kernel = self.openCL.prg.phenCrowding
    kernel(self.openCL.queue, globalSize, localSize,
           base, chromosomeLength, window, lim0, lim1,
           simulationsBuffer, outBuffer)

    crowding = np.zeros((self.sizes['originalSimulations'] ** 2,)).astype(np.float32)
    cl.enqueue_copy(self.openCL.queue, crowding, outBuffer)
    crowding = np.reshape(crowding, (self.sizes['originalSimulations'], -1), order='F')
    return crowding
def likelihood(self, outcomes, modelparams, expparams):
    # By calling the superclass implementation, we can consolidate
    # call counting there.
    super(AcceleratedPrecessionModel, self).likelihood(outcomes, modelparams, expparams)

    # Possibly add a second axis to modelparams.
    if len(modelparams.shape) == 1:
        modelparams = modelparams[..., np.newaxis]

    # Convert to float32 if needed.
    mps = modelparams.astype(np.float32)
    eps = expparams.astype(np.float32)

    # Allocating a buffer for the pr0 returns.
    pr0 = np.empty((mps.shape[0], eps.shape[0]), dtype=mps.dtype)

    # Move buffers to the GPU.
    mf = cl.mem_flags
    mps_buf = cl.Buffer(self._ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mps)
    eps_buf = cl.Buffer(self._ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=eps)
    dest_buf = cl.Buffer(self._ctx, mf.WRITE_ONLY, pr0.nbytes)

    # Run the kernel with global worksize (n_models, n_experiments).
    self._prg.cos_model(self._queue, pr0.shape, None, np.int32(eps.shape[0]), mps_buf, eps_buf, dest_buf)

    # Copy the buffer back from the GPU and free memory there.
    cl.enqueue_copy(self._queue, pr0, dest_buf)
    mps_buf.release()
    eps_buf.release()
    dest_buf.release()

    # Now we concatenate over outcomes.
    return FiniteOutcomeModel.pr0_to_likelihood_array(outcomes, pr0)
def S_reduction100(self):
    self.program.CalcSREDUCTLEN(self.queue, (self.ThreadsNumb,), (self.Slocalsize,),
                                self.Const_int_buf, self.Const_float_buf, self.Link_buf,
                                self.S_out_of_groups_buf,
                                cl.LocalMemory(self.Slocalsize * self.Type.itemsize),
                                self.Seed_buf)
    cl.enqueue_copy(self.queue, self.S_out_of_groups_cpu, self.S_out_of_groups_buf).wait()

    r = numpy.zeros(7)
    for i in range(len(self.S_out_of_groups_cpu)):
        r[i % 7] += self.S_out_of_groups_cpu[i]
    return r / (self.prop['Ns'] ** 3 * self.prop['Nt'] * 6)
def process_data(self, pos, newbuf):
    assert newbuf.shape[0] == self.chunksize

    if not newbuf.flags['C_CONTIGUOUS']:
        newbuf = newbuf.copy()

    pyopencl.enqueue_copy(self.queue, self.sigs_cl, newbuf)
    event = self.kern_detect_peaks(self.queue, (self.chunksize,), (self.max_wg_size,),
                                   self.sigs_cl, self.ring_sum_cl, self.peaks_cl)
    event.wait()

    if pos - (newbuf.shape[0] + 2 * self.n_span) < 0:
        # the very first buffer is sacrificed because of the peak span
        return None, None

    pyopencl.enqueue_copy(self.queue, self.peaks, self.peaks_cl)
    ind_peaks, = np.nonzero(self.peaks)

    if ind_peaks.size > 0:
        ind_peaks += pos - newbuf.shape[0] - self.n_span
        self.n_peak += ind_peaks.size
        #~ peaks = np.zeros(ind_peaks.size, dtype=[('index', 'int64'), ('code', 'int64')])
        #~ peaks['index'] = ind_peaks
        #~ return self.n_peak, peaks
        return self.n_peak, ind_peaks

    return None, None
def get_fields(self):
    """
    :return: Returns a dictionary of all fields. Transfers data from the GPU to the CPU.
    """
    f = np.zeros((self.nx, self.ny, NUM_JUMPERS), dtype=np.float32, order='F')
    cl.enqueue_copy(self.queue, f, self.f, is_blocking=True)

    feq = np.zeros((self.nx, self.ny, NUM_JUMPERS), dtype=np.float32, order='F')
    cl.enqueue_copy(self.queue, feq, self.feq, is_blocking=True)

    u = np.zeros((self.nx, self.ny), dtype=np.float32, order='F')
    cl.enqueue_copy(self.queue, u, self.u, is_blocking=True)

    v = np.zeros((self.nx, self.ny), dtype=np.float32, order='F')
    cl.enqueue_copy(self.queue, v, self.v, is_blocking=True)

    rho = np.zeros((self.nx, self.ny), dtype=np.float32, order='F')
    cl.enqueue_copy(self.queue, rho, self.rho, is_blocking=True)

    results = {}
    results['f'] = f
    results['u'] = u
    results['v'] = v
    results['rho'] = rho
    results['feq'] = feq

    return results
def get_fields(self):
    """
    :return: Returns a dictionary of all fields. Transfers data from the GPU to the CPU.
    """
    f = np.zeros((self.nx, self.ny, self.num_populations + 1, NUM_JUMPERS), dtype=np.float32, order="F")
    cl.enqueue_copy(self.queue, f, self.f, is_blocking=True)

    feq = np.zeros((self.nx, self.ny, self.num_populations + 1, NUM_JUMPERS), dtype=np.float32, order="F")
    cl.enqueue_copy(self.queue, feq, self.feq, is_blocking=True)

    u = np.zeros((self.nx, self.ny), dtype=np.float32, order="F")
    cl.enqueue_copy(self.queue, u, self.u, is_blocking=True)

    v = np.zeros((self.nx, self.ny), dtype=np.float32, order="F")
    cl.enqueue_copy(self.queue, v, self.v, is_blocking=True)

    rho = np.zeros((self.nx, self.ny, self.num_populations + 1), dtype=np.float32, order="F")
    cl.enqueue_copy(self.queue, rho, self.rho, is_blocking=True)

    results = {}
    results["f"] = f
    results["u"] = u
    results["v"] = v
    results["rho"] = rho
    results["feq"] = feq

    return results
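# The two get_fields variants above repeat the same allocate-then-copy step per
# field. A possible consolidation, assuming the device buffers live in attributes
# named after the fields (an assumption about the surrounding class); the shapes
# here mirror the first variant:
def get_fields_generic(self):
    shapes = {
        'f':   (self.nx, self.ny, NUM_JUMPERS),
        'feq': (self.nx, self.ny, NUM_JUMPERS),
        'u':   (self.nx, self.ny),
        'v':   (self.nx, self.ny),
        'rho': (self.nx, self.ny),
    }
    results = {}
    for name, shape in shapes.items():
        host = np.zeros(shape, dtype=np.float32, order='F')
        # blocking copy so the host array is complete before it is stored
        cl.enqueue_copy(self.queue, host, getattr(self, name), is_blocking=True)
        results[name] = host
    return results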
def eval(self, pars):
    _ctx, queue = card()
    radius, length = \
        [GaussianDispersion(int(pars[base + '_pd_n']), pars[base + '_pd'], pars[base + '_pd_nsigma'])
         for base in OneDGpuCylinder.PD_PARS]

    # Get the weights for each
    radius.value, radius.weight = radius.get_weights(pars['radius'], 0, 10000, True)
    length.value, length.weight = length.get_weights(pars['length'], 0, 10000, True)

    # Perform the computation, with all weight points
    sum, norm, vol = 0.0, 0.0, 0.0
    sub = pars['sldCyl'] - pars['sldSolv']

    real = np.float32 if self.q.dtype == np.dtype('float32') else np.float64

    # Loop over radius, length, theta, phi weight points
    for r in range(len(radius.weight)):
        for l in range(len(length.weight)):
            self.prg.OneDCylKernel(queue, self.q.shape, None, self.q_b, self.res_b, real(sub),
                                   real(length.value[l]), real(radius.value[r]), real(pars['scale']),
                                   np.uint32(self.q.size), real(pars['uplim']), real(pars['bolim']))
            cl.enqueue_copy(queue, self.res, self.res_b)
            sum += radius.weight[r] * length.weight[l] * self.res * pow(radius.value[r], 2) * length.value[l]
            vol += radius.weight[r] * length.weight[l] * pow(radius.value[r], 2) * length.value[l]
            norm += radius.weight[r] * length.weight[l]

    if vol != 0.0 and norm != 0.0:
        sum *= norm / vol

    return sum / norm + pars['background']
def lnlikelihood_ocl(self, pv):
    self._lnl2d(pv)
    self.prg_lnl.lnl1d_chunked(self.cl_queue, [self.lnl2d.shape[0], self.cl_lnl_chunks], None,
                               uint32(self.lnl2d.shape[1]), self._b_lnl2d, self._b_lnl1d)
    cl.enqueue_copy(self.cl_queue, self.lnl1d, self._b_lnl1d)
    lnl = self.lnl1d.astype('d').sum(1)
    return lnl
def execute(self):
    kernel = self.program.fact
    self.event = kernel(self.queue, [self.a_dim], None, self.d_a_buf, self.d_c_buf)
    self.event.wait()
    cl.enqueue_copy(self.queue, self.h_c, self.d_c_buf)
    print("a", self.h_a)
    print("ris", self.h_c)
def get_edges(clctx, features, reductions, blurs, buf_in, summarise=True):
    """
    Using the *features* and *reductions* programs, and the *blurs* program
    with sigma=2.0, find all edge pixels in *buf_in* and return the count.
    """
    gs, wgs = clctx.gs, clctx.wgs
    bufa = cl.Image(clctx.ctx, cl.mem_flags.READ_WRITE, clctx.ifmt, (gs, gs))
    bufb = cl.Image(clctx.ctx, cl.mem_flags.READ_WRITE, clctx.ifmt, (gs, gs))
    bufc = cl.Image(clctx.ctx, cl.mem_flags.READ_WRITE, clctx.ifmt, (gs, gs))

    blurs.convolve_x(clctx.queue, (gs, gs), (wgs, wgs), buf_in, bufb)
    blurs.convolve_y(clctx.queue, (gs, gs), (wgs, wgs), bufb, bufa)
    blurs.convolve_x(clctx.queue, (gs, gs), (wgs, wgs), bufa, bufc)
    blurs.convolve_y(clctx.queue, (gs, gs), (wgs, wgs), bufc, bufb)
    features.subtract(clctx.queue, (gs, gs), (wgs, wgs), bufb, bufa, bufc)
    features.edges(clctx.queue, (gs, gs), (wgs, wgs), bufc, bufa)
    counts = reduction.run_reduction(clctx, reductions.reduction_sum, bufa)

    if not summarise:
        edges = np.empty((gs, gs, 4), np.float32)
        cl.enqueue_copy(clctx.queue, edges, bufa, origin=(0, 0), region=(gs, gs))

    bufa.release()
    bufb.release()
    bufc.release()

    if summarise:
        return counts
    else:
        return edges
def to_host(queue, data, dtype, start, shape, elemstrides):
    """Copy memory off the device, into a Numpy array"""
    m, n = shape
    Sm, Sn = elemstrides
    if m * n == 0:
        return np.zeros(shape, dtype=dtype)

    if min(elemstrides) < 0:
        raise NotImplementedError()

    itemsize = dtype.itemsize
    bytestart = itemsize * start
    # No extra element is transferred: (m-1)*Sm + (n-1)*Sn is the element offset
    # of the last touched element, and the final +1 accounts for that element
    # itself, so the transfer ends exactly at its last byte.
    byteend = bytestart + itemsize * ((m - 1) * Sm + (n - 1) * Sn + 1)

    temp_buf = np.zeros((byteend - bytestart), dtype=np.int8)
    cl.enqueue_copy(queue, temp_buf, data,
                    device_offset=bytestart, is_blocking=True)

    bytestrides = (itemsize * Sm, itemsize * Sn)
    view = np.ndarray(shape=(m, n), dtype=dtype, buffer=temp_buf.data,
                      offset=0, strides=bytestrides)
    return view
def __init__(self):
    t_np = np.arange(0, 100000000, dtype=np.float32)

    self.ctx = cl.create_some_context()
    self.queue = cl.CommandQueue(self.ctx)
    self.mf = cl.mem_flags
    self.t_g = cl.Buffer(self.ctx, self.mf.READ_ONLY | self.mf.COPY_HOST_PTR, hostbuf=t_np)

    f = open("ex.cl", "r")
    fstr = "".join(f.readlines())
    f.close()

    self.prg = cl.Program(self.ctx, fstr).build()
    self.res_g = cl.Buffer(self.ctx, self.mf.WRITE_ONLY, t_np.nbytes)
    self.prg.proc(self.queue, t_np.shape, None, self.t_g, self.res_g)

    res_np = np.empty_like(t_np)
    cl.enqueue_copy(self.queue, res_np, self.res_g)

    # Check on CPU with Numpy:
    print(res_np)
    print(np.amax(res_np))
def run(self):
    for ii in range(0, 10):
        for jj in range(0, 10):
            r = np.random.random([self.nsample, 3])
            r[:, 0] = (r[:, 0] + ii) * 0.1
            r[:, 1] = (r[:, 1] + jj) * 0.1

            self.X = np.zeros((self.nsample, 4), dtype=np.float32)
            self.X[:, 0:3] = r
            self.X[:, 3] = 1.

            self.I = np.zeros((self.nsample, 4), dtype=np.float32)
            self.I[:, 0:3] = 1.
            #self.I[:,3] = 0.

            cl.enqueue_acquire_gl_objects(self.queue, [self.X_cl, self.I_cl])
            cl.enqueue_copy(self.queue, self.X_cl, self.X)
            cl.enqueue_copy(self.queue, self.I_cl, self.I)
            self.program.Solve(self.queue, (self.nsample, self.na), None,
                               self.A_cl, self.X_cl, self.I_cl, self.alpha)
            cl.enqueue_release_gl_objects(self.queue, [self.X_cl, self.I_cl])
            self.queue.finish()
            self.draw()

    self.scrnData = np.zeros((self.width, self.height), dtype=np.float32)
    glReadPixels(0, 0, self.width, self.height, GL_ALPHA, GL_FLOAT, self.scrnData)
    print(np.max(self.scrnData))
    scipy.misc.imsave('render.png', np.flipud(self.scrnData))
def get_color(self, img):
    # OpenCL only supports RGBA images, not RGB, so add an alpha channel
    src = np.array(img.convert('RGBA'))
    src.shape = w, h, _ = img.width, img.height, 4

    w = int(w * self.SCALE_FACTOR)
    h = int(h * self.SCALE_FACTOR)

    local_size = self.max_work_item_sizes
    global_size = (math.ceil(h / local_size[0]), math.ceil(w / local_size[1]))
    total_work_groups = global_size[0] * global_size[1]

    mf = cl.mem_flags
    src_buf = cl.image_from_array(self.ctx, src, 4, norm_int=True)
    out = np.zeros(4 * total_work_groups, dtype=np.int32)
    out_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=out.itemsize * 4 * total_work_groups)

    kernel = self.prg.get_color
    kernel.set_scalar_arg_dtypes([None, None, np.uint32, np.uint32])
    kernel(self.queue, global_size, local_size, src_buf, out_buf, w, h, g_times_l=True)
    cl.enqueue_copy(self.queue, dest=out, src=out_buf, is_blocking=True)

    # this sum takes .1 ms at 3440x1440, don't even bother OpenCL-ifying it
    resized_out = np.reshape(out, (out.shape[0] // 4, 4))
    summed_out = np.sum(resized_out, axis=0)
    avg_out = (summed_out / summed_out[3])[:3].astype(int)
    return avg_out
def update_state(self, cl_context, cl_queue, time):
    del self.next_state
    self.next_state = np.empty(self.num_unit_states, dtype="float32")

    mf = cl.mem_flags
    cl.enqueue_copy(cl_queue, self.next_state, self.next_state_buf)
    # print 'self.next_state:'
    # print list(self.next_state)

    self.state_buf.release()
    del self.state
    self.state = np.zeros([self.total_state_size], dtype="float32")
    self.state[:self.num_unit_states] = self.next_state

    for s in self.streams:
        sval = s.pull(time)
        for sindex in range(s.ndim):
            skey = (s.id, sindex)
            suid = self.stream_uids[skey]
            gpu_index = self.unit2gpu[suid]
            state_index = self.unit_state_index[gpu_index]
            self.state[state_index] = sval[sindex]

    # print 'self.state:'
    # print list(self.state)
    self.state_buf = cl.Buffer(cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.state)
gpu_queue = cl.CommandQueue(gpu_context)

size = 1000
a = np.ones(size, dtype=np.float32)
b = np.ones(size, dtype=np.float32) * 2
c = np.zeros_like(a)

# let's first run this on the CPU, so create memory objects for the cpu_context
if has_cpu:
    a_dev = cl.Buffer(cpu_context, cl.mem_flags.READ_WRITE, a.nbytes)
    b_dev = cl.Buffer(cpu_context, cl.mem_flags.READ_WRITE, b.nbytes)
    c_dev = cl.Buffer(cpu_context, cl.mem_flags.READ_WRITE, c.nbytes)

    cpu_program = cl.Program(cpu_context, kernel_source).build()

    # copy memory objects to the device
    cl.enqueue_copy(cpu_queue, a_dev, a, is_blocking=True)
    cl.enqueue_copy(cpu_queue, b_dev, b, is_blocking=True)

    # __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
    cpu_program.sum(cpu_queue, a.shape, None, a_dev, b_dev, c_dev)
    cl.enqueue_copy(cpu_queue, c, c_dev)

    # check results
    python_kernel(a, b, c)

# now run on the GPU
if has_gpu:
    a_dev = cl.Buffer(gpu_context, cl.mem_flags.READ_WRITE, a.nbytes)
    b_dev = cl.Buffer(gpu_context, cl.mem_flags.READ_WRITE, b.nbytes)
    c_dev = cl.Buffer(gpu_context, cl.mem_flags.READ_WRITE, c.nbytes)

    gpu_program = cl.Program(gpu_context, kernel_source).build()
def process(self, pipe, image, data):
    import pyopencl as cl

    if not hasattr(self, "is_init"):
        # loop unrolling on mask
        mask = np.array(self.get("kernel", [[0, 0, 0],
                                            [0, 1, 0],
                                            [0, 0, 0]])) * self.get("factor", 1.0)

        if mask.shape[0] != mask.shape[1]:
            print("WARNING: mask should be square for convolution!")
        if mask.shape[0] % 2 != 1:
            print("WARNING: mask should have an odd side length")

        single_val_codeblock = """
            tx = x + {i};
            ty = y + {j};
            if (tx >= 0 && tx < w && ty >= 0 && ty < h) {{
                tidx = tx + ty * w;
                c = (float)({MASK_VAL});
                nR += c * (float)img_in[3*tidx+0];
                nG += c * (float)img_in[3*tidx+1];
                nB += c * (float)img_in[3*tidx+2];
            }}
        """

        kernel_unfolded = ""
        for i in range(0, mask.shape[0]):
            for j in range(0, mask.shape[1]):
                if mask[j, i] != 0:
                    cur_cb = single_val_codeblock.format(
                        i=i - mask.shape[0] // 2,
                        j=mask.shape[0] // 2 - j,
                        MASK_VAL=mask[j, i])
                    kernel_unfolded += cur_cb
        #print(kernel_unfolded)

        OPENCL_SRC = """
        __kernel void convert(__global __read_only uchar *img_in,
                              __global __write_only uchar *img_out,
                              int w, int h) {{
            int x = get_global_id(0), y = get_global_id(1);
            if (x >= w || y >= h) return;
            int idx = x + y * w;
            // img_in[3*idx+0] = R component at pixel x, y
            // img_in[3*idx+1] = G component at pixel x, y
            // img_in[3*idx+2] = B component at pixel x, y

            // new components
            float nR = 0, nG = 0, nB = 0;
            int tx, ty, tidx;
            float c;

            // AUTOGEN START
            {gen_code}
            // AUTOGEN END

            img_out[3 * idx + 0] = (uchar)clamp(nR, 0.0f, 255.0f);
            img_out[3 * idx + 1] = (uchar)clamp(nG, 0.0f, 255.0f);
            img_out[3 * idx + 2] = (uchar)clamp(nB, 0.0f, 255.0f);
        }}
        """.format(gen_code=kernel_unfolded)
        #print(OPENCL_SRC)

        mf = cl.mem_flags
        platform = cl.get_platforms()[self.get("opencl_platform", 0)]
        devs = platform.get_devices()[self.get("opencl_device", -1)]
        self.ctx = cl.Context(devs if isinstance(devs, list) else [devs])
        self.queue = cl.CommandQueue(self.ctx)

        # now build the program
        self.prg = cl.Program(self.ctx, OPENCL_SRC).build()

        self.src_buf = cl.Buffer(self.ctx, mf.READ_ONLY, image.nbytes)
        self.dest_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, image.nbytes)
        self.dest = np.empty_like(image)

        # we have initialized
        self.is_init = True

    h, w, _ = image.shape

    # write current image
    cl.enqueue_copy(self.queue, self.src_buf, image)
    self.prg.convert(self.queue, (w, h), None,
                     self.src_buf, self.dest_buf, np.int32(w), np.int32(h))
    # read back image
    cl.enqueue_copy(self.queue, self.dest, self.dest_buf)

    return self.dest, data
    } else {
        for (int i = thread_id*k; i < N; i++)
            y[thread_id] = y[thread_id] + x[i];
    }
}
"""

program = cl.Program(context, src).build(options='')

# Transfer the data
#hx = np.random.uniform(0, 1, N).astype(np.float32)
hx = np.ones(N).astype(np.float32)
print(hx.shape)
print(W * Ng)
hy = np.zeros(N).astype(np.float32)
cl.enqueue_copy(queue, x, hx, is_blocking=False)

event_execute = program.add_vectors(queue, (Ng * W,), (W,), x, y)
# print("RESULT B4", hy[W*Ng])
event_copy = cl.enqueue_copy(queue, hy, y, is_blocking=True)
print("RESULT = ", hy[:W * Ng], hx[9999999])
print('here')
print("SUM RESULT = ", sum(hy[:W * Ng]))
print("NUMPY RESULT = ", sum(hx))

# At this point, the queue is not flushed. Nothing has been sent for execution.
queue.flush()
# At this point, the queue is not finished. The completion of the operations is not guaranteed!
queue.finish()
def main():
    if len(sys.argv) != 2:
        _help()
        sys.exit(1)

    device = int(sys.argv[1])
    devices = get_devices()
    context = cl.Context(devices=[devices[device]])
    queue = cl.CommandQueue(context)

    # TODO use these to decide what "shape" to make the work groups
    # and add something which allows those shapes to be replaced in the
    # openCL source code
    max_group = devices[device].max_work_group_size
    max_item = devices[device].max_work_item_sizes

    cl_text = open("memcpy_3d.cl", "r").read().replace("LOCAL_SIZE", "256")
    program = cl.Program(context, cl_text).build()
    memcpy_3d = program.memcpy_3d
    memcpy_3d.set_scalar_arg_dtypes([None, np.int32, np.int32, np.int32, None])

    shape = (32, 512, 1028)
    size = shape[0] * shape[1] * shape[2]

    mem_in = np.random.randint(0, 256, size=size, dtype=np.uint16).reshape(shape)
    _mem_in = cl.Buffer(context, cl.mem_flags.READ_ONLY,
                        mem_in.size * np.dtype(mem_in.dtype).itemsize)
    _mem_out = cl.Buffer(context, cl.mem_flags.WRITE_ONLY,
                         mem_in.size * np.dtype(mem_in.dtype).itemsize)
    mem_out = np.zeros(shape=mem_in.shape, dtype=mem_in.dtype)

    cl.enqueue_copy(queue, _mem_in, mem_in)

    # work must be a multiple of group size
    group = (1, 12, 16)
    work = tuple(int(group[d] * np.ceil(shape[d] / group[d])) for d in (0, 1, 2))
    print(f"{shape} -> {work}")

    evt = memcpy_3d(queue, work, group, _mem_in, shape[0], shape[1], shape[2], _mem_out)
    evt.wait()

    cl.enqueue_copy(queue, mem_out, _mem_out)
    assert np.array_equal(mem_in, mem_out)
        //sort the columns
        sort( &(pixel00), &(pixel10), &(pixel20) );
        sort( &(pixel01), &(pixel11), &(pixel21) );
        sort( &(pixel02), &(pixel12), &(pixel22) );

        //sort the diagonal
        sort( &(pixel00), &(pixel11), &(pixel22) );

        // median is the middle value of the diagonal
        result[i] = pixel11;
    }
}
'''

# Kernel function instantiation
prg = cl.Program(ctx, src).build()

# Allocate memory for variables on the device
img_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=img)
result_g = cl.Buffer(ctx, mf.WRITE_ONLY, img.nbytes)
width_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(img.shape[1]))
height_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(img.shape[0]))

# Call Kernel. Automatically takes care of block/grid distribution
prg.medianFilter(queue, img.shape, None, img_g, result_g, width_g, height_g)

result = np.empty_like(img)
cl.enqueue_copy(queue, result, result_g)

# Save the median-filtered image
imsave('medianFilter-OpenCL.jpg', result)
def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True):
    """Test 2D sub-array (slice) copy."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (honor_skip
            and ctx.devices[0].platform.name == "Portable Computing Language"
            and get_pocl_version(ctx.devices[0].platform) <= (0, 13)):
        # https://github.com/pocl/pocl/issues/353
        pytest.skip("POCL's rectangular copies crash")

    ary_in_shp = 256, 128  # Entire array shape from which sub-array is copied to device
    sub_ary_shp = 128, 96  # Sub-array shape to be copied to device
    ary_in_origin = 20, 13  # Sub-array origin
    ary_in_slice = generate_slice(ary_in_origin, sub_ary_shp)

    ary_out_origin = 11, 19  # Origin of sub-array copy from device to host-array
    ary_out_shp = 512, 256  # Entire host-array shape to receive sub-array device->host
    ary_out_slice = generate_slice(ary_out_origin, sub_ary_shp)

    buf_in_origin = 7, 3  # Origin of sub-array in device buffer
    buf_in_shp = 300, 200  # Shape of device buffer

    buf_out_origin = 31, 17  # Origin of 2nd device buffer
    buf_out_shp = 300, 400  # Shape of 2nd device buffer

    # Create host array of random values.
    h_ary_in = np.array(
        np.random.randint(0, 256, np.product(ary_in_shp)),
        dtype=np.uint8).reshape(ary_in_shp)

    # Create device buffers.
    d_in_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=np.product(buf_in_shp))
    d_out_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=np.product(buf_out_shp))

    # Copy sub-array (rectangular buffer) from host to device.
    cl.enqueue_copy(queue, d_in_buf, h_ary_in,
                    buffer_origin=buf_in_origin[::-1],
                    host_origin=ary_in_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_in_shp[-1],),
                    host_pitches=(ary_in_shp[-1],))

    # Copy sub-array (rectangular buffer) from device-buffer to device-buffer.
    cl.enqueue_copy(queue, d_out_buf, d_in_buf,
                    src_origin=buf_in_origin[::-1],
                    dst_origin=buf_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    src_pitches=(buf_in_shp[-1],),
                    dst_pitches=(buf_out_shp[-1],))

    # Create zero-initialised array to receive sub-array from device.
    h_ary_out = np.zeros(ary_out_shp, dtype=h_ary_in.dtype)

    # Copy sub-array (rectangular buffer) from device to host-array.
    cl.enqueue_copy(queue, h_ary_out, d_out_buf,
                    buffer_origin=buf_out_origin[::-1],
                    host_origin=ary_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_out_shp[-1],),
                    host_pitches=(ary_out_shp[-1],))
    queue.finish()

    # Check that the sub-array copied to device is
    # the same as the sub-array received from device.
    assert np.all(h_ary_in[ary_in_slice] == h_ary_out[ary_out_slice])
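# A minimal sketch of the rectangular-copy parameters exercised above: copy a
# 2x3 window from a host array into the top-left corner of a small device
# buffer. Origins, regions, and pitches are given in bytes of the innermost
# dimension and listed fastest-varying axis first (hence the [::-1] flips in
# the test). All shapes and offsets here are illustrative.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

host = np.arange(8 * 8, dtype=np.uint8).reshape(8, 8)
dev = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=4 * 4)

cl.enqueue_copy(queue, dev, host,
                buffer_origin=(0, 0), host_origin=(2, 1),  # (x, y), i.e. host[1, 2]
                region=(3, 2),                             # 3 bytes wide, 2 rows
                buffer_pitches=(4,), host_pitches=(8,))

check = np.zeros((4, 4), dtype=np.uint8)
cl.enqueue_copy(queue, check, dev)
assert np.array_equal(check[:2, :3], host[1:3, 2:5])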
def copyResult(queue, result, outResult):
    cl.enqueue_copy(queue, result, outResult)
print("Executing computation") #prg.sum(queue, a_np.shape, None, a_g, b_g, res_g) knl = prg.sum knl.set_args(a_g, b_g, res_g) local_work_size = None #local_work_size = (10,) t0 = time.perf_counter_ns() ev = cl.enqueue_nd_range_kernel(queue=queue, kernel=knl, global_work_size=(vector_size//float_vector_size,), local_work_size=local_work_size) t0_enqueue = time.perf_counter_ns() ev.wait() t1 = time.perf_counter_ns() print("Transferring result to host") t2 = time.perf_counter_ns() cl.enqueue_copy(queue, res_np, res_g) t3 = time.perf_counter_ns() # Check on CPU with Numpy: print("Computing on the host using numpy") t4 = time.perf_counter_ns() res_local = a_np + b_np t5 = time.perf_counter_ns() print("Local type:", res_local.dtype) print("---------------------------------------------------------------------------") print("Comparing results") print("Difference : {}".format(res_np - res_local)) print("A : {}".format(a_np)) print("B : {}".format(b_np)) print("Result OpenCL: {}".format(res_np))
def evaluate(self, k: Union[float, ndarray], ldc: ndarray, t0: Union[float, ndarray],
             p: Union[float, ndarray], a: Union[float, ndarray], i: Union[float, ndarray],
             e: Union[float, ndarray] = None, w: Union[float, ndarray] = None,
             copy: bool = True) -> ndarray:
    """Evaluate the transit model for a set of scalar or vector parameters.

    Parameters
    ----------
    k
        Radius ratio(s) either as a single float, 1D vector, or 2D array.
    ldc
        Limb darkening coefficients as a 1D or 2D array.
    t0
        Transit center(s) as a float or a 1D vector.
    p
        Orbital period(s) as a float or a 1D vector.
    a
        Orbital semi-major axis (axes) divided by the stellar radius as a float or a 1D vector.
    i
        Orbital inclination(s) as a float or a 1D vector.
    e : optional
        Orbital eccentricity as a float or a 1D vector.
    w : optional
        Argument of periastron as a float or a 1D vector.

    Notes
    -----
    The model can be evaluated either for one set of parameters or for many sets of
    parameters simultaneously. In the first case, the orbital parameters should all
    be given as floats. In the second case, the orbital parameters should be given
    as a 1D array-like.

    Returns
    -------
    ndarray
        Modelled flux either as a 1D or 2D ndarray.
    """
    npv = 1 if isinstance(t0, float) else len(t0)
    k = asarray(k)
    if k.size == 1:
        nk = 1
    elif npv == 1:
        nk = k.size
    else:
        nk = k.shape[1]

    if e is None:
        e, w = 0.0, 0.0

    pvp = empty((npv, nk + 6), dtype=float32)
    pvp[:, :nk] = k
    pvp[:, nk] = t0
    pvp[:, nk + 1] = p
    pvp[:, nk + 2] = a
    pvp[:, nk + 3] = i
    pvp[:, nk + 4] = e
    pvp[:, nk + 5] = w

    ldc = atleast_2d(ldc).astype(float32)

    self.npv = uint32(pvp.shape[0])
    self.spv = uint32(pvp.shape[1])

    # Release and reinitialise the GPU buffers if the sizes of the time or
    # limb darkening coefficient arrays change.
    if (ldc.size != self.u.size) or (pvp.size != self.pv.size):
        assert self.npb == ldc.shape[1] // 2

        if self._b_f is not None:
            self._b_f.release()
            self._b_u.release()
            self._b_p.release()

        self.pv = zeros(pvp.shape, float32)
        self.u = zeros((self.npv, 2 * self.npb), float32)
        self.f = zeros((self.npv, self.nptb), float32)

        mf = cl.mem_flags
        self._b_f = cl.Buffer(self.ctx, mf.WRITE_ONLY, self.time.nbytes * self.npv)
        self._b_u = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.u)
        self._b_p = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.pv)

    # Copy the limb darkening coefficient array to the GPU
    cl.enqueue_copy(self.queue, self._b_u, ldc)

    # Copy the parameter vector to the GPU
    self.pv[:] = pvp
    cl.enqueue_copy(self.queue, self._b_p, self.pv)

    self.prg.ma_eccentric_pop(self.queue, (self.npv, self.nptb), None, self._b_time, self._b_lcids,
                              self._b_pbids, self._b_p, self._b_u, self._b_ed, self._b_le, self._b_ld,
                              self._b_nsamples, self._b_etimes, self.k0, self.k1, self.nk, self.nz,
                              self.dk, self.dz, self.spv, self.nlc, self.npb, self._b_f)

    if copy:
        cl.enqueue_copy(self.queue, self.f, self._b_f)
        return squeeze(self.f)
    else:
        return None
FILE_NAME="krak.cl" f=open(FILE_NAME,"r") SRC = ''.join(f.readlines()) f.close() prg = cl.Program(ctx, SRC).build() print("\n\nCracking the following data for a challenge:") print(a) # launch the kernel event = prg.krak(queue, a.shape, None, a_dev, s) event.wait() # copy the output from the context to the Python process cl.enqueue_copy(queue, a, a_dev) # if everything went fine, b should contain squares of integers print("CL returned:") print(a) if(a[0] == 0xdfd05a8b899b6000): print("[OK] Distinguished point search over colors 2-7 passed, result=0xdfd05a8b899b6000") if(a[12] == 0x3e248e031efda051): print("[OK] Key in table in color 2 found, result=0x3e248e031efda051") print("Dame brmlab? Dame deku?") #prg = cl.Program(ctx, SRC).build() #krak = prg.krak
# actual benchmark ------------------------------------------------------------
t1 = time()
count = 20

for i in range(count):
    event = kernel(queue, h_c.shape[::-1], (block_size, block_size),
                   d_c_buf, d_a_buf, d_b_buf)

event.wait()
gpu_time = (time() - t1) / count

# transfer device -> host -----------------------------------------------------
t1 = time()
cl.enqueue_copy(queue, h_c, d_c_buf)
pull_time = time() - t1

# timing output ---------------------------------------------------------------
gpu_total_time = gpu_time + push_time + pull_time

print("GPU push+compute+pull total [s]:", gpu_total_time)
print("GPU push [s]:", push_time)
print("GPU pull [s]:", pull_time)
print("GPU compute (host-timed) [s]:", gpu_time)
print("GPU compute (event-timed) [s]:", (event.profile.end - event.profile.start) * 1e-9)

gflop = h_c.size * (a_width * 2.) / (1000 ** 3.)
gflops = gflop / gpu_time
def evaluate_pv(self, pvp: ndarray, ldc: ndarray, copy: bool = True):
    """Evaluate the transit model for a 2D parameter array.

    Parameters
    ----------
    pvp
        Parameter array with a shape `(npv, npar)` where `npv` is the number of parameter
        vectors, and each row contains a set of parameters `[k, t0, p, a, i, e, w]`. The
        radius ratios can also be given per passband, in which case the row should be
        structured as `[k_0, k_1, k_2, ..., k_npb, t0, p, a, i, e, w]`.
    ldc
        Limb darkening coefficient array with shape `(npv, 2*npb)`, where `npv` is the
        number of parameter vectors and `npb` is the number of passbands.

    Notes
    -----
    This version of the `evaluate` method is optimized for calculating several models in
    parallel, such as when using *emcee* for MCMC sampling.

    Returns
    -------
    ndarray
        Modelled flux either as a 1D or 2D ndarray.
    """
    pvp = atleast_2d(pvp)
    ldc = atleast_2d(ldc).astype(float32)

    self.npv = uint32(pvp.shape[0])
    self.spv = uint32(pvp.shape[1])

    if pvp.shape[0] != ldc.shape[0]:
        raise ValueError("The parameter array and the ldc array have incompatible dimensions.")

    # Release and reinitialise the GPU buffers if the sizes of the time or
    # limb darkening coefficient arrays change.
    if (ldc.size != self.u.size) or (pvp.size != self.pv.size):
        assert self.npb == ldc.shape[1] // 2

        if self._b_f is not None:
            self._b_f.release()
            self._b_u.release()
            self._b_p.release()

        self.pv = zeros(pvp.shape, float32)
        self.u = zeros((self.npv, 2 * self.npb), float32)
        self.f = zeros((self.npv, self.nptb), float32)

        mf = cl.mem_flags
        self._b_f = cl.Buffer(self.ctx, mf.WRITE_ONLY, self.time.nbytes * self.npv)
        self._b_u = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.u)
        self._b_p = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.pv)

    # Copy the limb darkening coefficient array to the GPU
    cl.enqueue_copy(self.queue, self._b_u, ldc)

    # Copy the parameter vector to the GPU
    self.pv[:] = pvp
    cl.enqueue_copy(self.queue, self._b_p, self.pv)

    self.prg.ma_eccentric_pop(self.queue, (self.npv, self.nptb), None, self._b_time, self._b_lcids,
                              self._b_pbids, self._b_p, self._b_u, self._b_ed, self._b_le, self._b_ld,
                              self._b_nsamples, self._b_etimes, self.k0, self.k1, self.nk, self.nz,
                              self.dk, self.dz, self.spv, self.nlc, self.npb, self._b_f)

    if copy:
        cl.enqueue_copy(self.queue, self.f, self._b_f)
        return squeeze(self.f)
    else:
        return None
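# Both evaluate methods above release and reallocate their device buffers only
# when the host-side array sizes change, and otherwise just overwrite them. A
# minimal standalone sketch of that reallocation pattern; the names `upload`,
# `buf`, and `cached_size` are illustrative, not part of the original class.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

buf = None
cached_size = 0

def upload(pv):
    """Re-create the device buffer only when the parameter array changes size."""
    global buf, cached_size
    if pv.size != cached_size:
        if buf is not None:
            buf.release()
        buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=pv)
        cached_size = pv.size
    else:
        cl.enqueue_copy(queue, buf, pv)

upload(np.zeros((10, 7), np.float32))  # allocates
upload(np.ones((10, 7), np.float32))   # same size: reuses the buffer, just copies
upload(np.ones((20, 7), np.float32))   # new size: releases and reallocates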
def update_opencl():
    global u
    u0 = u[0, :]
    u1 = u[1, :]
    e0 = np.zeros(imax - 1).astype(np.float32)
    e1 = np.zeros(imax - 1).astype(np.float32)
    res0 = np.zeros(imax).astype(np.float32)
    res1 = np.zeros(imax).astype(np.float32)

    a = .5 * (cp + cm)
    b = .5 * (cp - cm)

    u0_g = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=u0)
    u1_g = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=u1)
    e0_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=e0)
    e1_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=e1)
    res0_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=res0)
    res1_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=res1)

    prg = cl.Program(ctx, """
    #define A %(a)f
    #define B %(b)f
    #define DX %(dx)f
    #define DT %(dt)f
    #define SIZE %(size)d

    __kernel void update(
        __global float *u0_g, __global float *u1_g,
        __global float *e0_g, __global float *e1_g,
        __global float *res0_g, __global float *res1_g )
    {
        int i = get_global_id(0);

        // compute flux vector
        float U0 = .5 * (u0_g[i] + u0_g[i+1]);
        float U1 = .5 * (u1_g[i] + u1_g[i+1]);

        barrier(CLK_GLOBAL_MEM_FENCE);

        if (i != SIZE-1) {
            e0_g[i] = A * U0 + B * U1 - .5*DX/DT * (u0_g[i+1] - u0_g[i]);
            e1_g[i] = B * U0 + A * U1 - .5*DX/DT * (u1_g[i+1] - u1_g[i]);
        }

        barrier(CLK_GLOBAL_MEM_FENCE);

        // compute residual vector
        if (i != 0 && i != SIZE-1) {
            res0_g[i] = -(e0_g[i] - e0_g[i-1]) / DX;
            res1_g[i] = -(e1_g[i] - e1_g[i-1]) / DX;
        }

        // update
        u0_g[i] = u0_g[i] + DT * res0_g[i];
        u1_g[i] = u1_g[i] + DT * res1_g[i];
    }
    """ % {"a": a, "b": b, "dx": dx, "dt": dt, "size": imax}).build()

    prg.update(queue, u0.shape, None, u0_g, u1_g, e0_g, e1_g, res0_g, res1_g)

    cl.enqueue_copy(queue, u0, u0_g)
    cl.enqueue_copy(queue, u1, u1_g)

    u = np.vstack((u0, u1))
    return
morph_window_3d[1, 1, 1] = 255
morph_window_3d[1, 1, 2] = 255
morph_window_3d[1, 2, 1] = 255
morph_window_3d[2, 1, 1] = 255

vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())
vl.vglCheckContext(img_input, vl.VGL_RAM_CONTEXT())
vl.vglCheckContext(img_output, vl.VGL_RAM_CONTEXT())

wrp.vglClBinThreshold(img_input, img_output, np.float32(.5))
salvando2d(img_output, "bin-vglClBinThreshold.pgm")
cl.enqueue_copy(wrp.cl_ctx.queue, img_input.get_oclPtr(), img_output.get_oclPtr(),
                dest_origin=(0, 0, 0), src_origin=(0, 0, 0),
                region=img_input.get_oclPtr().shape)

wrp.vglClBinConway(img_input, img_output)
salvando2d(img_output, "bin-vglClBinConway.pgm")

wrp = None
img_input = None
img_input_3d = None
img_input2 = None
img_input2_3d = None
img_output = None
def prefixSum(self, e, data, keys, ndata, low, hi, events):
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    if not isinstance(data, cl.Buffer):
        data_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
    else:
        data_buf = data
    if not isinstance(keys, cl.Buffer):
        keys_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=keys)
    else:
        keys_buf = keys

    grid_dims = self.get_grid_dims(ndata)
    psumbytes = ndata * np.uint64(0).nbytes
    bsumbytes = int(np.prod(grid_dims) * np.uint64(0).nbytes)
    nbsumbytes = np.uint64(0).nbytes

    psum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, psumbytes)
    bsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, bsumbytes)
    nbsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, nbsumbytes)

    low = PrefixSum.HOST_TYPE_KEYS(low)
    hi = PrefixSum.HOST_TYPE_KEYS(hi)

    kernel = self.prg.prefixSumDown
    kernel.set_args(data_buf, keys_buf, np.uint64(ndata), low, hi,
                    psum_buf, bsum_buf, nbsum_buf)

    global_dims = self.get_global(grid_dims)
    print("prefixSumDown %s %s" % (str(global_dims), str(self.localDims)))
    # wait_for=None is accepted, so the kernel can depend on `e` in both cases
    e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims, wait_for=e),)
    events += e

    nbsum = np.zeros(1, dtype=np.uint64)
    events += (cl.enqueue_copy(self.queue, nbsum, nbsum_buf, wait_for=e),)

    if nbsum > 1:
        (e, bsum_buf, bsum1_buf, nbsum1_buf, ndata2) = \
            self.prefixSumDownInplace(e, bsum_buf, nbsum.item(), events)
    else:
        ndata2 = np.zeros(1, dtype=np.uint64)
        events += (cl.enqueue_copy(self.queue, ndata2, bsum_buf, wait_for=e),)
        ndata2 = ndata2.item()

    print(ndata2)
    self.prefixSumUp(e, psum_buf, ndata, bsum_buf, nbsum, events)

    return (e, data_buf, keys_buf, psum_buf, bsum_buf, nbsum_buf, ndata2)
def main2():
    cosm = cosmology.Cosmology(0.7, 0.3, 0, 0.7)
    zd1, zd2, zd3 = 0.3, 0.7, 1.1

    plane1 = {
        "scaledlenses": [(lenses.PlummerLens(cosm.getAngularDiameterDistance(zd1), {
            "mass": 1e14 * MASS_SUN,
            "width": 2.0 * ANGLE_ARCSEC
        }), V(1, 0) * ANGLE_ARCSEC)],
        "unscaledlens": (lenses.MassSheetLens(cosm.getAngularDiameterDistance(zd1), {"density": 1.5}), V(0, 0)),
    }
    plane2 = {
        "scaledlenses": [(lenses.PlummerLens(cosm.getAngularDiameterDistance(zd2), {
            "mass": 2e14 * MASS_SUN,
            "width": 1.5 * ANGLE_ARCSEC
        }), V(0, 1) * ANGLE_ARCSEC)],
        "unscaledlens": (lenses.MassSheetLens(cosm.getAngularDiameterDistance(zd2), {"density": 1.5}), V(0, 0)),
    }
    plane3 = {
        "scaledlenses": [
            (lenses.CompositeLens(cosm.getAngularDiameterDistance(zd3), [
                {
                    "lens": lenses.PlummerLens(cosm.getAngularDiameterDistance(zd3), {
                        "mass": 1.5e14 * MASS_SUN,
                        "width": 3.5 * ANGLE_ARCSEC
                    }),
                    "factor": 0.5, "angle": 0, "x": ANGLE_ARCSEC, "y": 0
                },
            ]), V(-1, -1) * ANGLE_ARCSEC),
            (lenses.CompositeLens(cosm.getAngularDiameterDistance(zd3), [
                {
                    "lens": lenses.PlummerLens(cosm.getAngularDiameterDistance(zd3), {
                        "mass": 1.5e14 * MASS_SUN,
                        "width": 3.5 * ANGLE_ARCSEC
                    }),
                    "factor": 0.5, "angle": 0, "x": -ANGLE_ARCSEC, "y": 0
                },
            ]), V(-1, -1) * ANGLE_ARCSEC),
        ],
        "unscaledlens": (lenses.MassSheetLens(cosm.getAngularDiameterDistance(zd3), {"density": 1.5}), V(0, 0)),
    }

    zds, lensplanes = [zd1, zd2, zd3], [plane1, plane2, plane3]
    code = getMultiPlaneOCLProgram(lensplanes)

    angularScale = ANGLE_ARCSEC
    thetas = (np.array([
        V(5, 5) * ANGLE_ARCSEC,
        V(-4, 4) * ANGLE_ARCSEC,
        V(10, 0) * ANGLE_ARCSEC
    ]) / angularScale).astype(np.float32)
    zss = [0.5, 0.95, 2.0]
    #thetas = (np.array([ V(-4,4)*ANGLE_ARCSEC ])/angularScale).astype(np.float32)
    #zss = [ 0.95 ]

    betas = np.zeros(thetas.shape, dtype=np.float32)
    assert len(thetas) == len(zss)

    allNumPlanes, DsrcAll, Dmatrix, intParams, floatParams, weights, centers, \
        intParamOffsets, floatParamOffsets, weightOffsets, planeWeights = getOpenCLData(
            cosm, zss, zds, lensplanes, angularScale)

    pprint.pprint(thetas)
    pprint.pprint(betas)
    print("allNumPlanes")
    pprint.pprint(allNumPlanes)
    pprint.pprint(DsrcAll)
    pprint.pprint(Dmatrix)
    print("Int params:")
    pprint.pprint(intParams)
    print("Float params:")
    pprint.pprint(floatParams)
    print("Weights:")
    weights = np.random.random(weights.shape).astype(np.float32)
    pprint.pprint(weights)
    pprint.pprint(centers)
    pprint.pprint(intParamOffsets)
    pprint.pprint(floatParamOffsets)
    pprint.pprint(weightOffsets)
    planeWeights = np.random.random(planeWeights.shape).astype(np.float32)
    pprint.pprint(planeWeights)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    d_thetas = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=thetas)
    d_betas = cl.Buffer(ctx, mf.WRITE_ONLY, betas.nbytes)
    d_numPlanes = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=allNumPlanes)
    d_DsrcAll = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=DsrcAll)
    d_Dmatrix = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=Dmatrix)
    d_intParams = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=intParams)
    d_floatParams = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=floatParams)
    d_weights = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=weights)
    d_centers = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=centers)
    d_intParamOffsets = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=intParamOffsets)
    d_floatParamOffsets = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=floatParamOffsets)
    d_weightOffsets = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=weightOffsets)
    d_planeWeights = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=planeWeights)

    #print(code)
    #print("Using code from tst.cl")
    #code = open("tst.cl", "rt").read()
    prg = cl.Program(ctx, code).build()
    calcBetas = prg.calculateBetas

    calcBetas(queue, (len(zss),), None, np.int32(len(zss)), d_thetas, d_betas, d_numPlanes,
              d_DsrcAll, d_Dmatrix, d_intParams, d_floatParams, d_weights, d_centers,
              d_intParamOffsets, d_floatParamOffsets, d_weightOffsets, d_planeWeights)

    cl.enqueue_copy(queue, betas, d_betas)
    print("GPU betas:")
    pprint.pprint(betas)

    tracer = CPUMultiPlaneTracer(zss, zds, lensplanes, cosm)
    tracer.setAllWeights(weights.tolist())
    tracer.setPlaneWeights(planeWeights.tolist())
    betas = tracer.trace(thetas)
    print("CPU:")
    print(betas)
# Shared memory
partialDotProduct = cl.LocalMemory(np.dtype(np.float64).itemsize * localWorkSize * 3)

start = timer()
for i in range(LOOP_COUNT):
    for iSmp in range(nSmp):
        # Fill the source with values.
        srcM[:, :, :] = M[:, iSmp, :, :]
        srcV[:] = v[:, iSmp]

        eventV = cl.enqueue_copy(queues[0], vector_buf, srcV)
        eventM0 = cl.enqueue_copy(queues[0], matrix_buf, srcM[:indptr[halfSize]],
                                  is_blocking=False)

        # spMVOverlapping
        matrix_dot_vector_kernel_event0 = \
            program.matrix_dot_vector(queues[0], (globalWorkSize,), (localWorkSize,),
                                      np.int64(halfSize), np.int64(0),
                                      indptr_buf, indices_buf, matrix_buf, vector_buf,
                                      destination_buf, partialDotProduct,
                                      wait_for=[eventM0])

        eventM1 = cl.enqueue_copy(queues[1], matrix_buf, srcM[indptr[halfSize]:],
                                  is_blocking=False,
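# The fragment above overlaps transfer and compute by splitting a copy across
# two queues and chaining downstream work on the copy events. A stripped-down,
# self-contained sketch of that dependency pattern; the buffer names and sizes
# here are illustrative, not taken from the original:
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
q0 = cl.CommandQueue(ctx)
q1 = cl.CommandQueue(ctx)

host = np.random.rand(1 << 20).astype(np.float32)
half = host.size // 2
dev = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, host.nbytes)

# start both halves without blocking; each enqueue returns an event
ev0 = cl.enqueue_copy(q0, dev, host[:half], is_blocking=False)
ev1 = cl.enqueue_copy(q1, dev, host[half:], is_blocking=False,
                      device_offset=half * host.itemsize)

# downstream work on each queue waits only on the half it needs
cl.enqueue_marker(q0, wait_for=[ev0])
cl.enqueue_marker(q1, wait_for=[ev1])
q0.finish()
q1.finish()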
def filter(self, data, keys, low, hi, events):
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    ndata = data.size
    (e, data_buf, keys_buf, indices_buf, bsum_buf, nbsum_buf, ndata2) = \
        self.prefixSum(None, data, keys, ndata, low, hi, events)

    filt = np.zeros(ndata, dtype=np.bool8)
    indices = np.zeros(ndata, dtype=np.uint64)
    data2 = np.zeros(ndata2, dtype=PrefixSum.HOST_TYPE_DATA)
    keys2 = np.zeros(ndata2, dtype=PrefixSum.HOST_TYPE_KEYS)
    ndata2bytes = np.uint64(0).nbytes

    if PrefixSum.RETURN_FILTER == 1:
        filt_buf = cl.Buffer(self.ctx, mf.READ_WRITE, filt.nbytes)

    print(data2.nbytes)
    data2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, data2.nbytes)
    keys2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, keys2.nbytes)
    ndata2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, ndata2bytes)

    low = PrefixSum.HOST_TYPE_KEYS(low)
    hi = PrefixSum.HOST_TYPE_KEYS(hi)

    kernel = self.prg.filter
    if PrefixSum.RETURN_FILTER == 1:
        kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata), low, hi,
                        filt_buf, data2_buf, keys2_buf, ndata2_buf)
    else:
        kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata), low, hi,
                        data2_buf, keys2_buf, ndata2_buf)

    global_dims = self.get_global(self.get_grid_dims(ndata))
    print("filter")
    # wait_for=None is accepted, so the kernel can depend on `e` in both cases
    e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims, wait_for=e),)
    events += e

    if PrefixSum.RETURN_FILTER == 1:
        events += (cl.enqueue_copy(self.queue, filt, filt_buf, wait_for=e),
                   cl.enqueue_copy(self.queue, indices, indices_buf, wait_for=e),
                   cl.enqueue_copy(self.queue, data2, data2_buf, wait_for=e),
                   cl.enqueue_copy(self.queue, keys2, keys2_buf, wait_for=e))
    else:
        events += (cl.enqueue_copy(self.queue, indices, indices_buf, wait_for=e),
                   cl.enqueue_copy(self.queue, data2, data2_buf, wait_for=e),
                   cl.enqueue_copy(self.queue, keys2, keys2_buf, wait_for=e))

    return (filt, indices, data2, keys2)
kernel_src = kernelFile.read()

compile_options = "-D BINS=%i -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%f" % \
    (bins, size, workgroup_size, numpy.finfo(numpy.float32).eps)

program = cl.Program(ctx, kernel_src).build(options=compile_options)

program.reduce1(queue, (workgroup_size * workgroup_size,), (workgroup_size,),
                d_pos.data, numpy.uint32(pos_size), d_preresult)
program.reduce2(queue, (workgroup_size,), (workgroup_size,), d_preresult, d_minmax)

result = numpy.ndarray(4, dtype=numpy.float32)
cl.enqueue_copy(queue, result, d_minmax)

min0 = pos[:, :, 0].min()
max0 = pos[:, :, 0].max()
min1 = pos[:, :, 1].min()
max1 = pos[:, :, 1].max()
minmax = (min0, max0, min1, max1)

print(minmax)
print(result)

d_outData = cl.Buffer(ctx, mf.READ_WRITE, 4 * bins)
d_outCount = cl.Buffer(ctx, mf.READ_WRITE, 4 * bins)
d_outMerge = cl.Buffer(ctx, mf.READ_WRITE, 4 * bins)

program.memset_out(queue, (1024,), (workgroup_size,), d_outData, d_outCount,
prog = cl.Program(context, kernel_src)
try:
    prog.build(options=['-Werror'], devices=[dev])
except Exception:
    print('Build log:')
    print(prog.get_build_info(dev, cl.program_build_info.LOG))
    raise

# Input
mutex = np.zeros(shape=(1,), dtype=np.int32)
mutex_buff = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=mutex)
sum_out = np.zeros(shape=(1,), dtype=np.int32)
sum_buff = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=sum_out)

# Enqueue kernel
global_size = (4,)
local_size = None

# __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
prog.mutex(queue, global_size, local_size, mutex_buff, sum_buff)

# Print the summed result
cl.enqueue_copy(queue, dest=sum_out, src=sum_buff, is_blocking=True)
print('Sum: ' + str(sum_out))
dados2_h = np.array(dados2_h).astype(np.complex64)
RES_h = np.empty_like(dados_h)

print("dados1")
print(dados_h)
print("\n Expected")
print(dados_h * 2)

dados_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=dados_h)
dados2_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=dados2_h)
RES_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=RES_h)

Source = """
__kernel void soma(__global float2 *dados, __global float2 *dados2, __global float2 *res, int W){
    const int gid_x = get_global_id(0);

    for(int i = 0; i < W; i++) {
        res[gid_x*W+i] = dados[gid_x*W+i] * 2;
    }
}
"""

prg = cl.Program(ctx, Source).build()
completeEvent = prg.soma(queue, (M,), None, dados_d, dados2_d, RES_d, np.int32(2))
completeEvent.wait()

cl.enqueue_copy(queue, RES_h, RES_d)
print("\n RES")
print(RES_h - (dados_h + dados2_h))
def test_image_3d(ctx_factory):
    # test for image_from_array for a 3d image of float2
    context = ctx_factory()

    device, = context.devices

    if not device.image_support:
        from pytest import skip
        skip("images not supported on %s" % device)

    if device.platform.vendor == "Intel(R) Corporation":
        from pytest import skip
        skip("images crashy on %s" % device)
    _skip_if_pocl(device.platform, 'pocl does not support CL_ADDRESS_CLAMP')

    prg = cl.Program(context, """
        __kernel void copy_image_plane(
          __global float2 *dest,
          __read_only image3d_t src,
          sampler_t samp,
          int stride0,
          int stride1)
        {
          int d0 = get_global_id(0);
          int d1 = get_global_id(1);
          int d2 = get_global_id(2);
          /*
          const sampler_t samp =
            CLK_NORMALIZED_COORDS_FALSE
            | CLK_ADDRESS_CLAMP
            | CLK_FILTER_NEAREST;
          */
          dest[d0*stride0 + d1*stride1 + d2] = read_imagef(
                src, samp, (float4)(d2, d1, d0, 0)).xy;
        }
        """).build()

    num_channels = 2
    shape = (3, 4, 2)
    a = np.random.random(shape + (num_channels,)).astype(np.float32)

    queue = cl.CommandQueue(context)
    try:
        a_img = cl.image_from_array(context, a, num_channels)
    except cl.RuntimeError:
        import sys
        exc = sys.exc_info()[1]
        if exc.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
            from pytest import skip
            skip("required image format not supported on %s" % device.name)
        else:
            raise

    a_dest = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes)

    samp = cl.Sampler(context, False,
                      cl.addressing_mode.CLAMP,
                      cl.filter_mode.NEAREST)
    prg.copy_image_plane(queue, shape, None, a_dest, a_img, samp,
                         np.int32(a.strides[0] / a.itemsize / num_channels),
                         np.int32(a.strides[1] / a.itemsize / num_channels),
                         )

    a_result = np.empty_like(a)
    cl.enqueue_copy(queue, a_result, a_dest)

    good = la.norm(a_result - a) == 0
    if not good:
        if queue.device.type & cl.device_type.CPU:
            assert good, ("The image implementation on your CPU CL platform '%s' "
                          "returned bad values. This is bad, but common."
                          % queue.device.platform)
        else:
            assert good
    if ary is None:
        ary = np.empty(self.shape, self.dtype)
        ary = _as_strided(ary, strides=self.strides)
    else:
        if ary.size != self.size:
            raise TypeError("'ary' has non-matching size")
        if ary.dtype != self.dtype:
            raise TypeError("'ary' has non-matching type")

    assert self.flags.forc, "Array in get() must be contiguous"

    if self.size:
        cl.enqueue_copy(queue or self.queue, ary, self.data,
                is_blocking=not async)

    return ary

def get_item(self, index, queue=None, wait_for=None):
    if not isinstance(index, tuple):
        index = (index,)
    if len(index) != len(self.shape):
        raise ValueError("incorrect number of indices")

    tgt = np.empty((), self.dtype)
    cl.enqueue_copy(queue or self.queue, tgt, self.data,
            is_blocking=True, wait_for=wait_for,
            # byte offset of the indexed element within the buffer
            device_offset=int(np.dot(index, self.strides)))
    return tgt
def Metropolis(sigma, J, B, T, iterations, Device, Divider):
    kernel_params = {'block_size': sigma.shape[0] // Divider}

    # Look for the requested GPU device in the platform/device list
    Id = 1
    HasXPU = False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id == Device:
                XPU = device
                print("CPU/GPU selected: ", device.name.lstrip())
                HasXPU = True
            Id += 1
    if not HasXPU:
        print("No XPU #%i found in all of %i devices, sorry..."
              % (Device, Id - 1))
        sys.exit()

    ctx = cl.Context([XPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Buffer flags: the kernel both reads and writes the lattice
    mf = cl.mem_flags
    sigmaCL = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=sigma)

    # Program based on Kernel2
    MetropolisCL = cl.Program(ctx,
                              KERNEL_CODE.substitute(kernel_params)).build()

    divide = Divider * Divider
    step = STEP // divide
    i = 0
    duration = 0.
    while step * i < iterations / divide:
        # Call the OpenCL kernel:
        #   sigmaCL is the lattice translated into CL format
        #   step is the number of iterations
        start_time = time.time()
        CLLaunch = MetropolisCL.MainLoop(
            queue, (numpy.int32(sigma.shape[0] * sigma.shape[1] / 2), 1), None,
            sigmaCL, numpy.float32(J), numpy.float32(B), numpy.float32(T),
            numpy.uint32(sigma.shape[0]), numpy.uint32(step),
            numpy.uint32(2008), numpy.uint32(1010))
        CLLaunch.wait()
        # elapsed = 1e-9 * (CLLaunch.profile.end - CLLaunch.profile.start)
        elapsed = time.time() - start_time
        print("Iteration %i with T=%f and %i iterations in %f: "
              % (i, T, step, elapsed))
        if LAPIMAGE:
            cl.enqueue_copy(queue, sigma, sigmaCL).wait()
            CheckLattice(sigma)
            ImageOutput(sigma,
                        "Ising2D_GPU_OddEven_%i_%1.1f_%.3i_Lap" % (SIZE, T, i))
        i = i + 1
        duration = duration + elapsed

    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    CheckLattice(sigma)
    sigmaCL.release()
    return duration
print('The queue is using the device:', queue.device.name)

program = cl.Program(context, open('label_regions.cl').read()).build(options='')

host_image = np.load('maze1.npy')
host_labels = np.empty_like(host_image)
host_done_flag = np.zeros(1).astype(np.int32)

gpu_image = cl.Buffer(context, cl.mem_flags.READ_ONLY, host_image.size * 4)
gpu_labels = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4)
gpu_done_flag = cl.Buffer(context, cl.mem_flags.READ_WRITE, 4)

# Send to the device, non-blocking
cl.enqueue_copy(queue, gpu_image, host_image, is_blocking=False)

local_size = (8, 8)  # 64 pixels per work group
global_size = tuple(
    round_up(g, l) for g, l in zip(host_image.shape[::-1], local_size))
print(global_size)

width = np.int32(host_image.shape[1])
height = np.int32(host_image.shape[0])
halo = np.int32(1)

# Create one local-memory buffer per work group, sized as an int (4 bytes)
# times (N + 2*halo) per side, where N is the local size
buf_size = (np.int32(local_size[0] + 2 * halo),
            np.int32(local_size[1] + 2 * halo))
gpu_local_memory = cl.LocalMemory(4 * buf_size[0] * buf_size[1])
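# The snippet above stops after allocating the local-memory halo buffer. A
# minimal sketch of how such a setup is typically driven (hypothetical; the
# kernel name 'label_regions' and its argument order are assumptions, not
# taken from label_regions.cl): relaunch until an iteration makes no changes.
host_done_flag[0] = 0
while host_done_flag[0] == 0:
    # Assume the done flag starts at 1 and the kernel clears it to 0
    # whenever it changes any label
    host_done_flag[0] = 1
    cl.enqueue_copy(queue, gpu_done_flag, host_done_flag, is_blocking=False)
    program.label_regions(queue, global_size, local_size,
                          gpu_labels, gpu_image, gpu_done_flag,
                          gpu_local_memory, width, height, halo)
    cl.enqueue_copy(queue, host_done_flag, gpu_done_flag, is_blocking=True)
cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)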
class Array(object):
    """A :class:`numpy.ndarray` work-alike that stores its data and performs
    its computations on the compute device. *shape* and *dtype* work exactly
    as in :mod:`numpy`. Arithmetic methods in :class:`Array` support the
    broadcasting of scalars. (e.g. `array+5`)

    *cqa* must be a :class:`pyopencl.CommandQueue`. *cqa* specifies the queue
    in which the array carries out its computations by default. *cqa* will at
    some point be renamed *queue*, so it should be considered
    'positional-only'.

    *allocator* may be `None` or a callable that, upon being called with an
    argument of the number of bytes to be allocated, returns an
    :class:`pyopencl.Buffer` object. (A :class:`pyopencl.tools.MemoryPool`
    instance is one useful example of an object to pass here.)

    .. versionchanged:: 2011.1
        Renamed *context* to *cqa*, made it general-purpose.

        All arguments beyond *order* should be considered keyword-only.

    .. attribute :: data

        The :class:`pyopencl.MemoryObject` instance created for the memory
        that backs this :class:`Array`.

    .. attribute :: shape

        The tuple of lengths of each dimension in the array.

    .. attribute :: dtype

        The :class:`numpy.dtype` of the items in the GPU array.

    .. attribute :: size

        The number of meaningful entries in the array. Can also be computed
        by multiplying up the numbers in :attr:`shape`.

    .. attribute :: nbytes

        The size of the entire array in bytes. Computed as :attr:`size` times
        ``dtype.itemsize``.

    .. attribute :: strides

        Tuple of bytes to step in each dimension when traversing an array.

    .. attribute :: flags

        Return an object with attributes `c_contiguous`, `f_contiguous` and
        `forc`, which may be used to query contiguity properties in analogy
        to :attr:`numpy.ndarray.flags`.
    """

    def __init__(self, cqa, shape, dtype, order="C", allocator=None,
            data=None, queue=None, strides=None):
        # {{{ backward compatibility

        from warnings import warn
        if queue is not None:
            warn("Passing the queue to the array through anything but the "
                    "first argument of the Array constructor is deprecated. "
                    "This will continue to be accepted throughout the "
                    "2013.[0-6] versions of PyOpenCL.",
                    DeprecationWarning, 2)

        if isinstance(cqa, cl.CommandQueue):
            if queue is not None:
                raise TypeError("can't specify queue in 'cqa' and "
                        "'queue' arguments")
            queue = cqa

        elif isinstance(cqa, cl.Context):
            warn("Passing a context for the 'cqa' parameter is deprecated. "
                    "This usage will continue to be accepted throughout the "
                    "2013.[0-6] versions of PyOpenCL.",
                    DeprecationWarning, 2)

            if queue is not None:
                raise TypeError("may not pass a context and a queue "
                        "(just pass the queue)")
            if allocator is not None:
                raise TypeError("may not pass a context and an allocator "
                        "(just pass the queue)")

        else:
            # cqa is assumed to be an allocator
            warn("Passing an allocator for the 'cqa' parameter is "
                    "deprecated. This usage will continue to be accepted "
                    "throughout the 2013.[0-6] versions of PyOpenCL.",
                    DeprecationWarning, 2)
            if allocator is not None:
                raise TypeError("can't specify allocator in 'cqa' and "
                        "'allocator' arguments")

            allocator = cqa

        if queue is None:
            warn("Queue-less arrays are deprecated. "
" "They will continue to work throughout the 2013.[0-6] " "versions of PyOpenCL.", DeprecationWarning, 2) # }}} # invariant here: allocator, queue set # {{{ determine shape and strides dtype = np.dtype(dtype) try: s = 1 for dim in shape: s *= dim except TypeError: import sys if sys.version_info >= (3, ): admissible_types = (int, np.integer) else: admissible_types = (int, long, np.integer) if not isinstance(shape, admissible_types): raise TypeError("shape must either be iterable or " "castable to an integer") s = shape shape = (shape, ) if isinstance(s, np.integer): # bombs if s is a Python integer s = np.asscalar(s) if strides is None: strides = _make_strides(dtype.itemsize, shape, order) else: # FIXME: We should possibly perform some plausibility # checking on 'strides' here. strides = tuple(strides) # }}} self.queue = queue self.shape = shape self.dtype = dtype self.strides = strides self.events = [] self.size = s alloc_nbytes = self.nbytes = self.dtype.itemsize * self.size self.allocator = allocator if data is None: if not alloc_nbytes: # Work around CL not allowing zero-sized buffers. alloc_nbytes = 1 if allocator is None: # FIXME remove me when queues become required if queue is not None: context = queue.context self.data = cl.Buffer(context, cl.mem_flags.READ_WRITE, alloc_nbytes) else: self.data = self.allocator(alloc_nbytes) else: self.data = data @property def context(self): return self.data.context @property @memoize_method def flags(self): return _ArrayFlags(self) @property def mem_size(self): from warnings import warn warn("Array.mem_size is deprecated. Use Array.size", DeprecationWarning, stacklevel=2) def _new_with_changes(self, data, shape=None, dtype=None, strides=None, queue=None): if shape is None: shape = self.shape if dtype is None: dtype = self.dtype if strides is None: strides = self.strides if queue is None: queue = self.queue if queue is not None: return Array(queue, shape, dtype, allocator=self.allocator, strides=strides, data=data) elif self.allocator is not None: return Array(self.allocator, shape, dtype, queue=queue, strides=strides, data=data) else: return Array(self.context, shape, dtype, strides=strides, data=data) #@memoize_method FIXME: reenable def get_sizes(self, queue, kernel_specific_max_wg_size=None): if not self.flags.forc: raise NotImplementedError("cannot operate on non-contiguous array") return splay(queue, self.size, kernel_specific_max_wg_size=kernel_specific_max_wg_size) def set(self, ary, queue=None, async=False): """Transfer the contents the :class:`numpy.ndarray` object *ary* onto the device. *ary* must have the same dtype and size (not necessarily shape) as *self*. """ assert ary.size == self.size assert ary.dtype == self.dtype if not ary.flags.forc: raise RuntimeError("cannot set from non-contiguous array") ary = ary.copy() if ary.strides != self.strides: from warnings import warn warn( "Setting array from one with different strides/storage order. " "This will cease to work in 2013.x.", stacklevel=2) if self.size: cl.enqueue_copy(queue or self.queue, self.data, ary, is_blocking=not async)
def run_parallel(self, kernelName='', scalarArgs=None, slicedROArgs=None,
                 nonSlicedROArgs=None, slicedRWArgs=None, nonSlicedRWArgs=None,
                 dimension=0):
    # t0 = time.time()
    ka_offset = len(scalarArgs) if scalarArgs is not None else 0
    ro_offset = len(slicedROArgs) if slicedROArgs is not None else 0
    ns_offset = len(nonSlicedROArgs) if nonSlicedROArgs is not None else 0
    rw_offset = len(slicedRWArgs) if slicedRWArgs is not None else 0

    rw_pos = ka_offset + ro_offset + ns_offset
    nsrw_pos = ka_offset + ro_offset + ns_offset + rw_offset

    kernel_bufs = []
    global_size = []
    ev_h2d = []
    ev_run = []
    nCU = []
    ndstart = []
    ndslice = []
    ndsize = []

    minWGS = 1e20
    for ictx, ctx in enumerate(self.cl_ctx):
        nCUw = 1
        nCU.extend([nCUw])
        tmpWGS = ctx.devices[0].max_work_group_size
        if tmpWGS < minWGS:
            minWGS = tmpWGS
        # nCU.extend([ctx.devices[0].max_compute_units*nCUw])
    totalCUs = np.sum(nCU)
    minWGS = 256
    divider = minWGS * totalCUs
    n2f = np.remainder(dimension, divider)
    needResize = False

    # odd dimension performance fix
    if n2f != 0 and dimension > divider:
        oldSize = dimension
        dimension = (np.trunc(dimension / divider) + 1) * divider
        nDiff = int(dimension - oldSize)
        needResize = True

    work_cl_ctx = self.cl_ctx if dimension > totalCUs else [self.cl_ctx[0]]
    nctx = len(work_cl_ctx)

    for ictx, ctx in enumerate(work_cl_ctx):
        ev_h2d.extend([[]])
        kernel_bufs.extend([[]])
        if scalarArgs is not None:
            kernel_bufs[ictx].extend(scalarArgs)

        ndstart.extend([sum(ndsize)])
        if dimension > 1:
            if ictx < nctx - 1:
                ndsize.extend([np.floor(dimension * nCU[ictx] / totalCUs)])
            else:
                ndsize.extend([dimension - ndstart[ictx]])
            ndslice.extend(
                [slice(ndstart[ictx], ndstart[ictx] + ndsize[ictx])])
        else:
            ndslice.extend([0])

        # In case each photon has an array of input/output data we define
        # a second dimension
        if slicedROArgs is not None and dimension > 1:
            for iarg, arg in enumerate(slicedROArgs):
                newArg = np.concatenate([arg, arg[:nDiff]]) if needResize \
                    else arg
                secondDim = int(len(newArg) / dimension)
                iSlice = slice(
                    int(ndstart[ictx] * secondDim),
                    int((ndstart[ictx] + ndsize[ictx]) * secondDim))
                kernel_bufs[ictx].extend([cl.Buffer(
                    self.cl_ctx[ictx],
                    self.cl_mf.READ_ONLY | self.cl_mf.COPY_HOST_PTR,
                    hostbuf=newArg[iSlice])])

        if nonSlicedROArgs is not None:
            for iarg, arg in enumerate(nonSlicedROArgs):
                kernel_bufs[ictx].extend([cl.Buffer(
                    self.cl_ctx[ictx],
                    self.cl_mf.READ_ONLY | self.cl_mf.COPY_HOST_PTR,
                    hostbuf=arg)])

        if slicedRWArgs is not None:
            for iarg, arg in enumerate(slicedRWArgs):
                newArg = np.concatenate([arg, arg[:nDiff]]) if needResize \
                    else arg
                secondDim = int(len(newArg) / dimension)
                iSlice = slice(
                    int(ndstart[ictx] * secondDim),
                    int((ndstart[ictx] + ndsize[ictx]) * secondDim))
                kernel_bufs[ictx].extend([cl.Buffer(
                    self.cl_ctx[ictx],
                    self.cl_mf.READ_WRITE | self.cl_mf.COPY_HOST_PTR,
                    hostbuf=newArg[iSlice])])
            global_size.extend([(int(ndsize[ictx]),)])

        if nonSlicedRWArgs is not None:
            for iarg, arg in enumerate(nonSlicedRWArgs):
                kernel_bufs[ictx].extend([cl.Buffer(
                    self.cl_ctx[ictx],
                    self.cl_mf.READ_WRITE | self.cl_mf.COPY_HOST_PTR,
                    hostbuf=arg)])
            global_size.extend([np.array([1]).shape])

    local_size = None
    for ictx, ctx in enumerate(work_cl_ctx):
        kernel = getattr(self.cl_program[ictx], kernelName)
        ev_run.extend([kernel(self.cl_queue[ictx], global_size[ictx],
                              local_size, *kernel_bufs[ictx])])

    for iev, ev in enumerate(ev_run):
        status = cl.command_execution_status.to_string(
            ev.command_execution_status)
        if _DEBUG > 20:
            print("ctx status {0} {1}".format(iev, status))

    ret = ()
    if slicedRWArgs is not None:
        for ictx, ctx in enumerate(work_cl_ctx):
            for iarg, arg in enumerate(slicedRWArgs):
                newArg = np.concatenate([arg, arg[:nDiff]]) if needResize \
                    else arg
                secondDim = int(len(newArg) / dimension)
                iSlice = slice(
                    int(ndstart[ictx] * secondDim),
                    int((ndstart[ictx] + ndsize[ictx]) * secondDim))
                cl.enqueue_copy(self.cl_queue[ictx],
                                slicedRWArgs[iarg][iSlice],
                                kernel_bufs[ictx][iarg + rw_pos],
                                is_blocking=self.cl_is_blocking)
        if needResize:
            # Truncate the padded arrays back to their original length
            slicedRWArgs = [arg[:oldSize] for arg in slicedRWArgs]
        ret += tuple(slicedRWArgs)
    if nonSlicedRWArgs is not None:
        for ictx, ctx in enumerate(work_cl_ctx):
            for iarg, arg in enumerate(nonSlicedRWArgs):
                cl.enqueue_copy(self.cl_queue[ictx],
                                nonSlicedRWArgs[iarg],
                                kernel_bufs[ictx][iarg + nsrw_pos],
                                is_blocking=self.cl_is_blocking)
        if needResize:
            nonSlicedRWArgs = [arg[:oldSize] for arg in nonSlicedRWArgs]
        ret += tuple(nonSlicedRWArgs)
    # print("Total CL execution time:", time.time() - t0, "s")
    return ret
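# A hypothetical call sketch for run_parallel above, assuming `runner` is an
# instance of the enclosing class with cl_ctx/cl_queue/cl_program already
# initialized, and that a kernel named 'my_kernel' (an assumption) takes one
# scalar, one sliced read-only input and one sliced read-write output:
n = 100000
x = np.random.rand(n)        # sliced read-only input, one value per photon
out = np.zeros(n)            # sliced read-write output
out, = runner.run_parallel(
    kernelName='my_kernel',
    scalarArgs=[np.int32(n)],
    slicedROArgs=[x],
    slicedRWArgs=[out],
    dimension=n)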
def buildGraph(ip, dev=0):
    """Builds the knn graph with the initial params.

    params:
    ------
    ip: initial params

    return:
    ------
    graph: graph object of Graph
    """
    start = time()
    nbrs = NearestNeighbors(n_neighbors=ip.k + 1, algorithm="buffer_kd_tree",
                            tree_depth=9, plat_dev_ids={0: [0]})
    nbrs.fit(ip.position)
    dists, inds = nbrs.kneighbors(ip.position)
    if bool_1:
        print("success")

    # now build the graph from those nearest neighbours on the GPU
    platform = cl.get_platforms()[0]
    print(platform)
    device = platform.get_devices()[dev]
    print(device)
    context = cl.Context([device])
    print(context)
    program = cl.Program(context, open(mywf).read()).build()
    print(program)
    queue = cl.CommandQueue(context)
    print(queue)

    # first input: the neighbour indices, dropping each point itself
    ngbrs_gpu = inds
    ngbrs_gpu = ngbrs_gpu[0:, 1:]
    ngbrs_gpu = unroll(ngbrs_gpu)
    ngbrs_gpu = ngbrs_gpu.astype('int32')

    # second input: the signal levels, flattened column-major
    signal = ip.signal
    n, chnl = signal.shape
    signal = np.reshape(signal, (n * chnl,), order='F')
    signal = signal.astype('float32')
    if bool_1:
        print("signal", signal.shape)

    k = ip.k
    if bool_1:
        print("n is :", n)
    scale = ip.sigma

    # create the buffers on the device: intensity, neighbours, weights
    mem_flags = cl.mem_flags
    ngbrs_buf = cl.Buffer(context,
                          mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                          hostbuf=ngbrs_gpu)
    signal_buf = cl.Buffer(context,
                           mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                           hostbuf=signal)
    weight_vec = np.ndarray(shape=(n * k,), dtype=np.float32)
    weight_buf = cl.Buffer(context, mem_flags.WRITE_ONLY, weight_vec.nbytes)

    # run the kernel to compute the weights
    program.compute_weights(queue, (n,), None, signal_buf, ngbrs_buf,
                            weight_buf, np.int32(k), np.float32(scale),
                            np.int32(chnl))
    queue.finish()

    # copy the weights back to host memory
    cl.enqueue_copy(queue, weight_vec, weight_buf)
    end = time() - start
    if bool_1:
        print('total time taken by the gpu python:', end)

    # save the graph
    graph = Graph(weight_vec, ngbrs_gpu, k)
    return graph
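# The kernel file referenced by `mywf` above is not shown. A minimal sketch of
# a compute_weights kernel consistent with the launch arguments (signal,
# neighbours, weights, k, scale, chnl) -- the Gaussian weight formula and the
# row-major neighbour layout are assumptions, not the original kernel:
compute_weights_src = """
__kernel void compute_weights(__global const float *signal,
                              __global const int *ngbrs,
                              __global float *weights,
                              const int k, const float scale, const int chnl)
{
    int i = get_global_id(0);
    int n = get_global_size(0);
    for (int j = 0; j < k; j++) {
        int nb = ngbrs[i*k + j];
        float d2 = 0.0f;
        for (int c = 0; c < chnl; c++) {
            // signal was flattened column-major: element (i, c) sits at i + c*n
            float diff = signal[i + c*n] - signal[nb + c*n];
            d2 += diff * diff;
        }
        weights[i*k + j] = exp(-d2 / (scale * scale));
    }
}
"""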
def vglCl3dErode(self, img_input, img_output, convolution_window,
                 window_size_x, window_size_y, window_size_z):
    print("# Running vglCl3dErode")
    vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
    vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())

    # Convert the convolution window into a cl.Buffer
    try:
        cl_convolution_window = cl.Buffer(self.ocl.context,
                                          cl.mem_flags.READ_ONLY,
                                          convolution_window.nbytes)
        cl.enqueue_copy(self.ocl.commandQueue, cl_convolution_window,
                        convolution_window.tobytes(), is_blocking=True)
        convolution_window = cl_convolution_window
    except Exception as e:
        print("vglCl3dErode: Error!! Impossible to convert "
              "convolution_window to a cl.Buffer object.")
        print(str(e))
        exit()

    # Coerce the window sizes to np.uint32
    sizes = []
    for name, value in (("window_size_x", window_size_x),
                        ("window_size_y", window_size_y),
                        ("window_size_z", window_size_z)):
        if not isinstance(value, np.uint32):
            print("vglCl3dErode: Warning: %s not np.uint32! "
                  "Trying to convert..." % name)
            try:
                value = np.uint32(value)
            except Exception as e:
                print("vglCl3dErode: Error!! Impossible to convert %s "
                      "to a np.uint32 object." % name)
                print(str(e))
                exit()
        sizes.append(value)
    window_size_x, window_size_y, window_size_z = sizes

    _program = self.cl_ctx.get_compiled_kernel("../CL/vglCl3dErode.cl",
                                               "vglCl3dErode")
    kernel_run = _program.vglCl3dErode
    kernel_run.set_arg(0, img_input.get_oclPtr())
    kernel_run.set_arg(1, img_output.get_oclPtr())
    kernel_run.set_arg(2, convolution_window)
    kernel_run.set_arg(3, window_size_x)
    kernel_run.set_arg(4, window_size_y)
    kernel_run.set_arg(5, window_size_z)

    cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run,
                               img_output.get_oclPtr().shape, None)
    vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
# INPUT IMAGES
img_input = vl.VglImage("bin.pgm", vl.VGL_IMAGE_2D_IMAGE(), None,
                        vl.IMAGE_ND_ARRAY())
vl.vglLoadImage(img_input)
vl.vglClUpload(img_input)

img_input2 = vl.VglImage("bin2.pgm", vl.VGL_IMAGE_2D_IMAGE(), None,
                         vl.IMAGE_ND_ARRAY())
vl.vglLoadImage(img_input2)
vl.vglClUpload(img_input2)

# OUTPUT IMAGE
img_output = vl.create_blank_image_as(img_input)
img_output.set_oclPtr(vl.get_similar_oclPtr_object(img_input))
vl.vglAddContext(img_output, vl.VGL_CL_CONTEXT())

img_out_aux = vl.get_similar_oclPtr_object(img_input)
cl.enqueue_copy(wrp.cl_ctx.queue, img_out_aux, img_output.get_oclPtr())

# CONVERT THE IMAGES TO BINARY IMAGES
wrp.vglClNdBinThreshold(img_input, img_output, 100)
cl.enqueue_copy(wrp.cl_ctx.queue, img_input.get_oclPtr(),
                img_output.get_oclPtr())

# STRUCTURING ELEMENT
window = vl.VglStrEl()
window.constructorFromTypeNdim(vl.VGL_STREL_CROSS(), 2)