def start_websocket(websocket, path): try: # initialize OpenCL ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) # build kernal from file prg = cl.Program(ctx, open('kernal.cl').read()).build() # load data into numpy arrays on the host pos_host, vel_host = load_data() # create and initialize arrays on the device # copy initial postiton to device pos_dev = cl_array.to_device(queue, pos_host) # copy initial velocity to device vel_dev = cl_array.to_device(queue, vel_host) # allocate memory for new data on the devie pos_new_dev = cl_array.empty_like(pos_dev) vel_new_dev = cl_array.empty_like(vel_dev) # main loop while True: # run kernal with the work goup size of a number of bodies prg.nbody_simple(queue, (pos_host.shape[0],), None, pos_dev.data, vel_dev.data, pos_new_dev.data, vel_new_dev.data) # copy new position into host np.array (device -> host) pos_host = pos_new_dev.get() # update position and velocity on the device (device -> device) cl.enqueue_copy(queue, pos_dev.data, pos_new_dev.data) cl.enqueue_copy(queue, vel_dev.data, vel_new_dev.data) #send data to client yield from websocket.send(','.join(map(str,pos_host[:, :3].flatten()))) finally: yield from websocket.close()
def gs_mod_gpu(idata, itera=10, osize=256): cut = osize // 2 pl = cl.get_platforms()[0] devices = pl.get_devices(device_type=cl.device_type.GPU) ctx = cl.Context(devices=[devices[0]]) queue = cl.CommandQueue(ctx) plan = Plan(idata.shape, queue=queue, dtype=complex128) #no funciona con "complex128" src = str( Template(KERNEL).render( double_support=all(has_double_support(dev) for dev in devices), amd_double_support=all( has_amd_double_support(dev) for dev in devices))) prg = cl.Program(ctx, src).build() idata_gpu = cl_array.to_device(queue, ifftshift(idata).astype("complex128")) fdata_gpu = cl_array.empty_like(idata_gpu) rdata_gpu = cl_array.empty_like(idata_gpu) plan.execute(idata_gpu.data, fdata_gpu.data) mask = exp(2.j * pi * random(idata.shape)) mask[512 - cut:512 + cut, 512 - cut:512 + cut] = 0 idata_gpu = cl_array.to_device( queue, ifftshift(idata + mask).astype("complex128")) fdata_gpu = cl_array.empty_like(idata_gpu) rdata_gpu = cl_array.empty_like(idata_gpu) error_gpu = cl_array.to_device(ctx, queue, zeros(idata_gpu.shape).astype("double")) plan.execute(idata_gpu.data, fdata_gpu.data) e = 1000 ea = 1000 for i in range(itera): prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data) plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True) #~ prg.norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut)) norm1 = prg.norm1 norm1.set_scalar_arg_dtypes([None, None, None, int32]) norm1(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data, error_gpu.data, int32(cut)) e = sqrt(cl_array.sum(error_gpu).get()) / (2 * cut) #~ if e>ea: #~ #~ break #~ ea=e plan.execute(rdata_gpu.data, fdata_gpu.data) fdata = fdata_gpu.get() fdata = ifftshift(fdata) fdata = exp(1.j * angle(fdata)) return fdata
def gs_mod_gpu(idata,itera=10,osize=256): cut=osize//2 pl=cl.get_platforms()[0] devices=pl.get_devices(device_type=cl.device_type.GPU) ctx = cl.Context(devices=[devices[0]]) queue = cl.CommandQueue(ctx) plan = Plan(idata.shape, queue=queue,dtype=complex128) #no funciona con "complex128" src = str(Template(KERNEL).render( double_support=all( has_double_support(dev) for dev in devices), amd_double_support=all( has_amd_double_support(dev) for dev in devices) )) prg = cl.Program(ctx,src).build() idata_gpu=cl_array.to_device(queue, ifftshift(idata).astype("complex128")) fdata_gpu=cl_array.empty_like(idata_gpu) rdata_gpu=cl_array.empty_like(idata_gpu) plan.execute(idata_gpu.data,fdata_gpu.data) mask=exp(2.j*pi*random(idata.shape)) mask[512-cut:512+cut,512-cut:512+cut]=0 idata_gpu=cl_array.to_device(queue, ifftshift(idata+mask).astype("complex128")) fdata_gpu=cl_array.empty_like(idata_gpu) rdata_gpu=cl_array.empty_like(idata_gpu) error_gpu=cl_array.to_device(ctx, queue, zeros(idata_gpu.shape).astype("double")) plan.execute(idata_gpu.data,fdata_gpu.data) e=1000 ea=1000 for i in range (itera): prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data) plan.execute(fdata_gpu.data,rdata_gpu.data,inverse=True) #~ prg.norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut)) norm1=prg.norm1 norm1.set_scalar_arg_dtypes([None, None, None, int32]) norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut)) e= sqrt(cl_array.sum(error_gpu).get())/(2*cut) #~ if e>ea: #~ #~ break #~ ea=e plan.execute(rdata_gpu.data,fdata_gpu.data) fdata=fdata_gpu.get() fdata=ifftshift(fdata) fdata=exp(1.j*angle(fdata)) return fdata
def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None, ctx=None, devicetype="all", platformid=None, deviceid=None, profile=False): OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, platformid=platformid, deviceid=deviceid, profile=profile) # Create a backprojector self.backprojector = Backprojection(sino_shape, slice_shape=slice_shape, axis_position=axis_position, angles=angles, ctx=self.ctx, profile=profile) # Create a projector self.projector = Projection(self.backprojector.slice_shape, self.backprojector.angles, axis_position=axis_position, detector_width=self.backprojector.num_bins, normalize=False, ctx=self.ctx, profile=profile) self.sino_shape = sino_shape self.is_cpu = self.backprojector.is_cpu # Arrays self.d_data = parray.empty(self.queue, sino_shape, dtype=np.float32) self.d_data.fill(0.0) self.d_sino = parray.empty_like(self.d_data) self.d_sino.fill(0.0) self.d_x = parray.empty(self.queue, self.backprojector.slice_shape, dtype=np.float32) self.d_x.fill(0.0) self.d_x_old = parray.empty_like(self.d_x) self.d_x_old.fill(0.0) self.add_to_cl_mem({ "d_data": self.d_data, "d_sino": self.d_sino, "d_x": self.d_x, "d_x_old": self.d_x_old, })
def cl_test_sobel(im): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) sobel = Sobel(ctx, queue) im_buf = cl_array.to_device(queue, im) mag_buf = cl_array.empty_like(im_buf) imgx_buf = cl_array.empty_like(im_buf) imgy_buf = cl_array.empty_like(im_buf) sobel(im_buf, imgx_buf, imgy_buf, mag_buf) return (mag_buf.get(), imgx_buf.get(), imgy_buf.get())
def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None, ctx=None, devicetype="all", platformid=None, deviceid=None, profile=False): ReconstructionAlgorithm.__init__(self, sino_shape, slice_shape=slice_shape, axis_position=axis_position, angles=angles, ctx=ctx, devicetype=devicetype, platformid=platformid, deviceid=deviceid, profile=profile) self.compute_preconditioners() # Create a LinAlg instance self.linalg = LinAlg(self.backprojector.slice_shape, ctx=self.ctx) # Positivity constraint self.elwise_clamp = ElementwiseKernel(self.ctx, "float *a", "a[i] = max(a[i], 0.0f);") # Projection onto the L-infinity ball of radius Lambda self.elwise_proj_linf = ElementwiseKernel( self.ctx, "float2* a, float Lambda", "a[i].x = copysign(min(fabs(a[i].x), Lambda), a[i].x); a[i].y = copysign(min(fabs(a[i].y), Lambda), a[i].y);", "elwise_proj_linf") # Additional arrays self.linalg.gradient(self.d_x) self.d_p = parray.empty_like(self.linalg.cl_mem["d_gradient"]) self.d_q = parray.empty_like(self.d_data) self.d_g = self.linalg.d_image self.d_tmp = parray.empty_like(self.d_x) self.d_p.fill(0) self.d_q.fill(0) self.d_tmp.fill(0) self.add_to_cl_mem({ "d_p": self.d_p, "d_q": self.d_q, "d_tmp": self.d_tmp, }) self.theta = 1.0
def test_elwise_kernel_with_options(ctx_factory): from pyopencl.clrandom import rand as clrand from pyopencl.elementwise import ElementwiseKernel context = ctx_factory() queue = cl.CommandQueue(context) in_gpu = clrand(queue, (50,), np.float32) options = ['-D', 'ADD_ONE'] add_one = ElementwiseKernel( context, "float* out, const float *in", """ out[i] = in[i] #ifdef ADD_ONE +1 #endif ; """, options=options, ) out_gpu = cl_array.empty_like(in_gpu) add_one(out_gpu, in_gpu) gt = in_gpu.get() + 1 gv = out_gpu.get() assert la.norm(gv - gt) < 1e-5
def get_fluid_source(params, G, P, D, out=None): """Calculate a small fluid source term, added to conserved variables for stability""" s = G.slices sh = G.shapes # T the old fashioned way: TODO Tmhd_full... T = cl_array.empty(params['queue'], sh.grid_tensor, dtype=np.float64) for mu in range(4): Tmhd_vec(params, G, P, D, mu, out=T[mu]) if out is None: out = cl_array.empty_like(P) global gcon1_d, gcon2_d, gcon3_d if gcon1_d is None: gcon1_d = cl_array.to_device(params['queue'], (G.conn[:, :, 1, :, :] * G.gdet[Loci.CENT.value]).copy()) gcon2_d = cl_array.to_device(params['queue'], (G.conn[:, :, 2, :, :] * G.gdet[Loci.CENT.value]).copy()) gcon3_d = cl_array.to_device(params['queue'], (G.conn[:, :, 3, :, :] * G.gdet[Loci.CENT.value]).copy()) # Contract mhd stress tensor with connection evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon1_d, out=out[s.U1]) evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon2_d, out=out[s.U2]) evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon3_d, out=out[s.U2]) if 'profile' in params and params['profile']: evt.wait() return out
def test_spirv(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) if (ctx._get_cl_version() < (2, 1) or cl.get_cl_header_version() < (2, 1)): pytest.skip("SPIR-V program creation only available " "in OpenCL 2.1 and higher") n = 50000 a_dev = cl.clrandom.rand(queue, n, np.float32) b_dev = cl.clrandom.rand(queue, n, np.float32) dest_dev = cl_array.empty_like(a_dev) with open("add-vectors-%d.spv" % queue.device.address_bits, "rb") as spv_file: spv = spv_file.read() prg = cl.Program(ctx, spv).build() if (not prg.all_kernels() and queue.device.platform.name.startswith("AMD Accelerated")): pytest.skip( "SPIR-V program creation on AMD did not result in any kernels") prg.sum(queue, a_dev.shape, None, a_dev.data, b_dev.data, dest_dev.data) assert la.norm((dest_dev - (a_dev + b_dev)).get()) < 1e-7
def _test_desparsification(self, input_on_device, output_on_device, dtype): current_config = "input on device: %s, output on device: %s, dtype: %s" % ( str(input_on_device), str(output_on_device), str(dtype)) logger.debug("CSR: %s" % current_config) # Generate data and reference CSR array = generate_sparse_random_data(shape=(512, 511), dtype=dtype) ref_sparse = self.compute_ref_sparsification(array) # De-sparsify on device csr = CSR(array.shape, dtype=dtype, max_nnz=ref_sparse.nnz) if input_on_device: data = parray.to_device(csr.queue, ref_sparse.data) indices = parray.to_device(csr.queue, ref_sparse.indices) indptr = parray.to_device(csr.queue, ref_sparse.indptr) else: data = ref_sparse.data indices = ref_sparse.indices indptr = ref_sparse.indptr if output_on_device: d_arr = parray.empty_like(csr.array) d_arr.fill(0) output = d_arr else: output = None arr = csr.densify(data, indices, indptr, output=output) if output_on_device: arr = arr.get() # Compare self.assertTrue( np.allclose(arr.reshape(array.shape), array), "something wrong with densified data (%s)" % current_config)
def _cam_callback(self, image, frame_id, pub_type, rgb_type, yuv_type): img = np.frombuffer(image.raw_data, dtype=np.dtype("uint8")) img = np.reshape(img, (H, W, 4)) img = img[:, :, [0, 1, 2]].copy() # convert RGB frame to YUV rgb = np.reshape(img, (H, W * 3)) rgb_cl = cl_array.to_device(self.queue, rgb) yuv_cl = cl_array.empty_like(rgb_cl) self.krnl(self.queue, (np.int32(self.Wdiv4), np.int32(self.Hdiv4)), None, rgb_cl.data, yuv_cl.data).wait() yuv = np.resize(yuv_cl.get(), np.int32(rgb.size / 2)) eof = int(frame_id * 0.05 * 1e9) # TODO: remove RGB send once the last RGB vipc subscriber is removed self.vipc_server.send(rgb_type, img.tobytes(), frame_id, eof, eof) self.vipc_server.send(yuv_type, yuv.data.tobytes(), frame_id, eof, eof) dat = messaging.new_message(pub_type) msg = { "frameId": image.frame, "transform": [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0] } setattr(dat, pub_type, msg) pm.send(pub_type, dat)
def setup_arrays(self, nrays, nsamples, cutoff): prog_params = (nrays, nsamples, cutoff) if prog_params in self.array_cache: return self.array_cache[prog_params] else: arrays = ArraySet() arrays.scratch = cla.empty(self.queue, (nsamples, nrays), dtype=np.float32, allocator=self.memory_pool) arrays.result = cla.empty(self.queue, (nrays,), dtype=np.int32, allocator=self.memory_pool) arrays.pre_cutoff = cla.empty(self.queue, (nrays, cutoff), dtype=np.float32, allocator=self.memory_pool) arrays.pre_cutoff_squared = cla.empty_like(arrays.pre_cutoff) arrays.idx = cla.arange(self.queue, 0, cutoff * nrays, 1, dtype=np.int32, allocator=self.memory_pool) self.array_cache[prog_params] = arrays return arrays
def mul(a_, b_): a = np.array(a_).astype(np.int32) b = np.array(b_).astype(np.int32) platform = cl.get_platforms()[0] device = platform.get_devices()[0] ctx = cl.Context([device]) queue = cl.CommandQueue(ctx) a_dev = cl_array.to_device(queue, a) b_dev = cl_array.to_device(queue, b) sizea_dev = cl_array.to_device(queue, np.array([a.size], dtype=np.int32)) sizeb_dev = cl_array.to_device(queue, np.array([b.size], dtype=np.int32)) dest_dev = cl_array.empty_like(cl_array.to_device(queue, np.zeros(a.size + b.size - 1, dtype=np.int32))) prg = cl.Program(ctx, """ __kernel void mul(__global const int *a, __global const int *b, __global const int *sizea, __global const int *sizeb, __global int *c) { int size_a = sizea[0]; int size_b = sizeb[0]; for(int i=0; i<size_a; i++) { for(int j=0; j<size_b; j++) { c[i+j] += a[i] * b[j]; } } } """).build() prg.mul(queue, a.shape, None, a_dev.data, b_dev.data, sizea_dev.data, sizeb_dev.data, dest_dev.data) #print(dest_dev, sub) return np.trim_zeros(dest_dev.get(), 'b').tolist()
def get_deviders(num): a = np.array([num]).astype(np.int32) platform = cl.get_platforms()[0] device = platform.get_devices()[0] ctx = cl.Context([device]) queue = cl.CommandQueue(ctx) a_dev = cl_array.to_device(queue, a) dest_dev = cl_array.empty_like(cl_array.to_device(queue, np.zeros((2 * a[0]), dtype=np.int32))) prg = cl.Program(ctx, """ __kernel void sum(__global const int *a, __global int *c) { int i = 1; int n = a[0]; int j = 0; while(i <= sqrt((float) n)) { if(n%i==0) { c[j] = i; j++; c[j] = -i; j++; if (i != (n / i)) { c[j] = n/i; j++; c[j] = -n/i; j++; } } i++; } } """).build() prg.sum(queue, a.shape, None, a_dev.data, dest_dev.data) return np.trim_zeros(dest_dev.get(), 'b').tolist()
def test_elwise_kernel_with_options(ctx_factory): from pyopencl.clrandom import rand as clrand from pyopencl.elementwise import ElementwiseKernel context = ctx_factory() queue = cl.CommandQueue(context) in_gpu = clrand(queue, (50,), np.float32) options = ["-D", "ADD_ONE"] add_one = ElementwiseKernel( context, "float* out, const float *in", """ out[i] = in[i] #ifdef ADD_ONE +1 #endif ; """, options=options, ) out_gpu = cl_array.empty_like(in_gpu) add_one(out_gpu, in_gpu) gt = in_gpu.get() + 1 gv = out_gpu.get() assert la.norm(gv - gt) < 1e-5
def __call__(self, input_buf, row_buf, col_buf, output_buf, intermed_buf=None): (h, w) = input_buf.shape r = row_buf.shape[0] c = col_buf.shape[0] if intermed_buf is None: intermed_buf = cl_array.empty_like(input_buf) self.program.separable_correlation_row(self.queue, (h, w), None, intermed_buf.data, input_buf.data, np.int32(w), np.int32(h), row_buf.data, np.int32(r)) self.program.separable_correlation_col(self.queue, (h, w), None, output_buf.data, intermed_buf.data, np.int32(w), np.int32(h), col_buf.data, np.int32(c))
def rev_grad(self, valuation): cache = {} res = self._evaluate(valuation, cache) adjoint = clarray.empty_like(res).fill(1.0) grad = {key: 0 for key in valuation} self._rev_grad(valuation, adjoint, grad, cache) return grad
def test_RungeKutta(initials, t0, t1, derived_function, expected, delta_absolute_error, absolute_error, relative_error, expected_error_runge_kutta): sut_derivedFn = f'''double4 derivedFn(double4* Y, double t) {{ return (double4)({derived_function}); }}''' sut = cl.elementwise.ElementwiseKernel( context, 'double4 *y, double4 *y0, double t, double dt, double* error_runge_kutta', 'double4 temp_y = y[0]; double4 temp_y0 = y0[0]; *error_runge_kutta = RungeKutta(&temp_y, &temp_y0, t, dt); y[0] = temp_y', name='sut', preamble=f'{sut_derivedFn}{rk_pd_4d.rungeKutta}') y0 = cl_array.to_device(queue, initials) y = cl_array.empty_like(y0) error_runge_kutta = cl_array.to_device(queue, numpy.array([numpy.double(0.0)])) sut(y, y0, t0, t1, error_runge_kutta) assert error_runge_kutta.get() == pytest.approx(expected_error_runge_kutta, abs=delta_absolute_error) numpy.testing.assert_allclose(numpy.array(y.get()[0].tolist()), numpy.array(expected[0].tolist()), rtol=relative_error, atol=absolute_error)
def fixup_ceiling(params, fflag, G, P): s = G.slices # First apply ceilings: # 1. Limit gamma with respect to normal observer # TODO is there a softer touch here? gamma = mhd_gamma_calc(params['queue'], G, P, Loci.CENT) f = cl_array.if_positive(gamma - params['gamma_max'], ((params['gamma_max']**2 - 1.) / (gamma**2 - 1.))**(1/2), cl_array.empty_like(gamma).fill(1)) P[s.U1] *= f P[s.U2] *= f P[s.U3] *= f # 2. Limit KTOT if params['electrons']: # Keep to KTOTMAX by controlling u, to avoid anomalous cooling from funnel wall # TODO This operates on last iteration's KTOT, meaning the effective value can escape the ceiling. Rethink u_max_ent = params['entropy_max'] * (P[s.RHO] ** params['gam'])/(params['gam']-1.) P[s.UU] = cl_array.if_positive(P[s.UU] - u_max_ent, u_max_ent, P[s.UU]) P[s.KTOT] = cl_array.if_positive(P[s.KTOT] - params['entropy_max'], params['entropy_max'], P[s.KTOT]) pass # TODO keep track of hits #fflag |= cl_array.if_positive(gamma - params['gamma_max'], temp.fill(HIT_FLOOR_GAMMA), zero) return P, fflag
def run(x, approx): if with_pyopencl: ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) queue.finish() tree = approx.tree_1d # initialize variables x_dev = cl_array.to_device(queue, x) tree_dev = cl_array.to_device(queue, tree) y_dev = cl_array.empty_like(x_dev) # build the code to run from given string dt = approx.dtype declaration = "__kernel void sum(__global " + dt + " *tree, " declaration += "__global " + dt + " *x, __global " + dt + " *y) " code = declaration + '{' + approx.code + '}' # compile code and then execute it prg = cl.Program(ctx, code).build() prg.sum(queue, x_dev.shape, None, tree_dev.data, x_dev.data, y_dev.data) queue.finish() return y_dev.get() else: raise ValueError("Function requires pyopencl installation.")
def get_tmp_arrays_like(self, **kwargs): """ Allocates required temporary arrays matching those passed via keyword. :returns: A :class:`dict` of named arrays, suitable for passing via dictionary expansion. .. versionadded:: 2020.2 """ tmp_arrays = {} for name in self.dof_names: f = kwargs[name] tmp_name = gen_tmp_name(name) import pyopencl.array as cla if isinstance(f, cla.Array): tmp_arrays[tmp_name] = cla.empty_like(f) elif isinstance(f, np.ndarray): tmp_arrays[tmp_name] = np.empty_like(f) else: raise ValueError(f"Could not generate tmp array for {f}" f"of type {type(f)}") tmp_arrays[tmp_name][...] = 0. return tmp_arrays
def genindices(self, arrayin): """Generate indices for splitting array.""" retval = dict() # run the 'trim' program # need to split if it's too long! splitlist = tuple([x for x in xrange(CLIDT.indexmaxsize, arrayin.shape[0], CLIDT.indexmaxsize)]) indexinc = 0 for chunk in np.vsplit(arrayin, splitlist): chunkarr = cla.to_device(self.queue, np.asarray(chunk, dtype=np.int32)) template = cla.empty_like(chunkarr) event = self.program.trim( self.queue, chunkarr.shape, None, chunkarr.data, template.data, np.int32(self.split) ) try: event.wait() except cl.RuntimeError, inst: errstr = inst.__str__() if errstr == "clWaitForEvents failed: out of resources": print "OpenCL timed out, probably due to the display manager." print "Disable your display manager and try again!" print "If that does not work, rerun with OpenCL disabled." else: raise cl.RuntimeError, inst sys.exit(1) for index, elem in enumerate(template.get()): splitkey = tuple([x for x in elem]) try: retval[splitkey] except KeyError: retval[splitkey] = [] retval[splitkey].append(index + indexinc) indexinc += CLIDT.indexmaxsize
def setup_arrays(self, nrays, nsamples, cutoff): prog_params = (nrays, nsamples, cutoff) if prog_params in self.array_cache: return self.array_cache[prog_params] else: arrays = ArraySet() arrays.scratch = cla.empty(self.queue, (nsamples, nrays), dtype=np.float32, allocator=self.memory_pool) arrays.result = cla.empty(self.queue, (nrays, ), dtype=np.int32, allocator=self.memory_pool) arrays.pre_cutoff = cla.empty(self.queue, (nrays, cutoff), dtype=np.float32, allocator=self.memory_pool) arrays.pre_cutoff_squared = cla.empty_like(arrays.pre_cutoff) arrays.idx = cla.arange(self.queue, 0, cutoff * nrays, 1, dtype=np.int32, allocator=self.memory_pool) self.array_cache[prog_params] = arrays return arrays
def opencl_cross(a, b): a = asarray(a, dtype=float32) b = asarray(b, dtype=float32) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) a_dev = cl_array.to_device(queue, a) b_dev = cl_array.to_device(queue, b) c_dev = cl_array.empty_like(a_dev) code = """ __kernel void crossproduct(__global const float *a, __global const float *b, __global float *c) { int i = get_global_id(0); __global const float * a_ = &a[i * 3]; __global const float * b_ = &b[i * 3]; __global float * c_ = &c[i * 3]; c_[0] = a_[1] * b_[2] - a_[2] * b_[1]; c_[1] = a_[2] * b_[0] - a_[0] * b_[2]; c_[2] = a_[0] * b_[1] - a_[1] * b_[0]; } """ prg = cl.Program(ctx, code).build() prg.crossproduct(queue, a.shape, None, a_dev.data, b_dev.data, c_dev.data) return c_dev.get()
def cam_callback(self, image): img = np.frombuffer(image.raw_data, dtype=np.dtype("uint8")) img = np.reshape(img, (H, W, 4)) img = img[:, :, [0, 1, 2]].copy() # convert RGB frame to YUV rgb = np.reshape(img, (H, W * 3)) rgb_cl = cl_array.to_device(self.queue, rgb) yuv_cl = cl_array.empty_like(rgb_cl) self.krnl(self.queue, (np.int32(self.Wdiv4), np.int32(self.Hdiv4)), None, rgb_cl.data, yuv_cl.data).wait() yuv = np.resize(yuv_cl.get(), np.int32((rgb.size / 2))) eof = self.frame_id * 0.05 # TODO: remove RGB send once the last RGB vipc subscriber is removed self.vipc_server.send(VisionStreamType.VISION_STREAM_RGB_BACK, img.tobytes(), self.frame_id, eof, eof) self.vipc_server.send(VisionStreamType.VISION_STREAM_ROAD, yuv.data.tobytes(), self.frame_id, eof, eof) dat = messaging.new_message('roadCameraState') dat.roadCameraState = { "frameId": image.frame, "transform": [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0] } pm.send('roadCameraState', dat) self.frame_id += 1
def _nsIfft(M, fft): # K = np.fft.ifftn(M, axes=(-3,-2,-1), norm="ortho") tmp = cla.to_device(fft.queue, M) K = cla.empty_like(tmp) fft.FFTH(K, tmp) # K = fft(M) return K.get()
def square(): if not slicer.util.getNode('moving'): load_default_volume() a = slicer.util.array('moving').flatten() ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) mf = cl.mem_flags a_dev = cl_array.to_device(queue, a) dest_dev = cl_array.empty_like(a_dev) prg = cl.Program(ctx, """ __kernel void square(__global const short *a, __global short *c) { int gid = get_global_id(0); c[gid] = a[gid] * a[gid]; } """).build() prg.square(queue, a.shape, None, a_dev.data, dest_dev.data) diff = ( dest_dev - (a_dev*a_dev) ).get() norm = la.norm(diff) print(norm)
def _evaluate(self, valuation, cache): if id(self) not in cache: op = self.ops[0]._evaluate(valuation, cache) val = clarray.empty_like(op) ev = self.eval_kernel(op, val) ev.wait() cache[id(self)] = val return cache[id(self)]
def __call__(self, input_ary, output_ary=None, allocator=None, stream=None): allocator = allocator or input_ary.allocator if output_ary is None: output_ary = input_ary if isinstance(output_ary, (str, unicode)) and output_ary == "new": output_ary = cl_array.empty_like(input_ary, allocator=allocator) if input_ary.shape != output_ary.shape: raise ValueError("input and output must have the same shape") if not input_ary.flags.forc: raise RuntimeError("ScanKernel cannot " "deal with non-contiguous arrays") n, = input_ary.shape if not n: return output_ary unit_size = self.scan_wg_size * self.scan_wg_seq_batches dev = driver.Context.get_device() max_groups = 3*dev.get_attribute( driver.device_attribute.MULTIPROCESSOR_COUNT) from pytools import uniform_interval_splitting interval_size, num_groups = uniform_interval_splitting( n, unit_size, max_groups); block_results = allocator(self.dtype.itemsize*num_groups) dummy_results = allocator(self.dtype.itemsize) # first level scan of interval (one interval per block) self.scan_intervals_knl.prepared_async_call( (num_groups, 1), (self.scan_wg_size, 1, 1), stream, input_ary.gpudata, n, interval_size, output_ary.gpudata, block_results) # second level inclusive scan of per-block results self.scan_intervals_knl.prepared_async_call( (1,1), (self.scan_wg_size, 1, 1), stream, block_results, num_groups, interval_size, block_results, dummy_results) # update intervals with result of second level scan self.final_update_knl.prepared_async_call( (num_groups, 1,), (self.update_wg_size, 1, 1), stream, output_ary.gpudata, n, interval_size, block_results) return output_ary
def __call__(self, input_ary, output_ary=None, allocator=None, stream=None): allocator = allocator or input_ary.allocator if output_ary is None: output_ary = input_ary if isinstance(output_ary, (str, unicode)) and output_ary == "new": output_ary = cl_array.empty_like(input_ary, allocator=allocator) if input_ary.shape != output_ary.shape: raise ValueError("input and output must have the same shape") if not input_ary.flags.forc: raise RuntimeError("ScanKernel cannot " "deal with non-contiguous arrays") n, = input_ary.shape if not n: return output_ary unit_size = self.scan_wg_size * self.scan_wg_seq_batches dev = driver.Context.get_device() max_groups = 3 * dev.get_attribute( driver.device_attribute.MULTIPROCESSOR_COUNT) from pytools import uniform_interval_splitting interval_size, num_groups = uniform_interval_splitting( n, unit_size, max_groups) block_results = allocator(self.dtype.itemsize * num_groups) dummy_results = allocator(self.dtype.itemsize) # first level scan of interval (one interval per block) self.scan_intervals_knl.prepared_async_call( (num_groups, 1), (self.scan_wg_size, 1, 1), stream, input_ary.gpudata, n, interval_size, output_ary.gpudata, block_results) # second level inclusive scan of per-block results self.scan_intervals_knl.prepared_async_call( (1, 1), (self.scan_wg_size, 1, 1), stream, block_results, num_groups, interval_size, block_results, dummy_results) # update intervals with result of second level scan self.final_update_knl.prepared_async_call(( num_groups, 1, ), (self.update_wg_size, 1, 1), stream, output_ary.gpudata, n, interval_size, block_results) return output_ary
def get_state(params, G, P, loc=Loci.CENT, out=None): """Calculate ucon, ucov, bcon, bcov from primitive variables Returns a dict of state variables """ # TODO make this a fusion of kernels? Components are needed in places if out is None: out = {} out['ucon'] = cl_array.empty(params['queue'], G.shapes.grid_vector, dtype=np.float64) out['ucov'] = cl_array.empty_like(out['ucon']) out['bcon'] = cl_array.empty_like(out['ucon']) out['bcov'] = cl_array.empty_like(out['ucon']) ucon_calc(params, G, P, loc, out=out['ucon']) G.lower_grid(out['ucon'], loc, out=out['ucov']) bcon_calc(params, G, P, out['ucon'], out['ucov'], out=out['bcon']) G.lower_grid(out['bcon'], loc, out=out['bcov']) return out
def __call__(self, input_ary, output_ary=None, allocator=None, queue=None): allocator = allocator or input_ary.allocator queue = queue or input_ary.queue or output_ary.queue if output_ary is None: output_ary = input_ary if isinstance(output_ary, (str, unicode)) and output_ary == "new": output_ary = cl_array.empty_like(input_ary, allocator=allocator) if input_ary.shape != output_ary.shape: raise ValueError("input and output must have the same shape") if not input_ary.flags.forc: raise RuntimeError("ScanKernel cannot " "deal with non-contiguous arrays") n, = input_ary.shape if not n: return output_ary unit_size = self.scan_wg_size * self.scan_wg_seq_batches max_groups = 3*max(dev.max_compute_units for dev in self.devices) from pytools import uniform_interval_splitting interval_size, num_groups = uniform_interval_splitting( n, unit_size, max_groups); block_results = allocator(self.dtype.itemsize*num_groups) dummy_results = allocator(self.dtype.itemsize) # first level scan of interval (one interval per block) self.scan_intervals_knl( queue, (num_groups*self.scan_wg_size,), (self.scan_wg_size,), input_ary.data, n, interval_size, output_ary.data, block_results) # second level inclusive scan of per-block results self.scan_intervals_knl( queue, (self.scan_wg_size,), (self.scan_wg_size,), block_results, num_groups, interval_size, block_results, dummy_results) # update intervals with result of second level scan self.final_update_knl( queue, (num_groups*self.update_wg_size,), (self.update_wg_size,), output_ary.data, n, interval_size, block_results) return output_ary
def setup_device(self, imshape): print('Setting up with imshape = %s' % (str(imshape))) self.cached_shape = imshape self.clIm = cla.Array(self.q, imshape, np.float32) self.clm = cla.empty_like(self.clIm) self.clx = cla.empty_like(self.clIm) self.cly = cla.empty_like(self.clIm) self.clO = cla.zeros_like(self.clIm) self.clM = cla.zeros_like(self.clIm) self.clF = cla.empty_like(self.clIm) self.clS = cla.empty_like(self.clIm) self.clThisS = cla.empty_like(self.clIm) self.clScratch = cla.empty_like(self.clIm) self.radial_prg = pyopencl.Program(self.ctx, RADIAL_PROGRAM).build() self.sobel = Sobel(self.ctx, self.q) #self.sepcorr2d = NaiveSeparableCorrelation(self.ctx, self.q) self.sepcorr2d = LocalMemorySeparableCorrelation(self.ctx, self.q) self.accum = ElementwiseKernel(self.ctx, 'float *a, float *b', 'a[i] += b[i]') self.norm_s = ElementwiseKernel(self.ctx, 'float *s, const float nRadii', 's[i] = -1 * s[i] / nRadii', 'norm_s') self.accum_s = ElementwiseKernel(self.ctx, 'float *a, float *b, const float nr', 'a[i] -= b[i] / nr') self.gaussians = {} self.gaussian_prgs = {} self.minmax = MinMaxKernel(self.ctx, self.q) # starburst storage clImageFormat = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT) self.clIm2D = cl.Image(self.ctx, mf.READ_ONLY, clImageFormat, imshape) # Create sampler for sampling image object self.imSampler = cl.Sampler(self.ctx, False, # Non-normalized coordinates cl.addressing_mode.CLAMP_TO_EDGE, cl.filter_mode.LINEAR) self.cl_find_ray_boundaries = FindRayBoundaries(self.ctx, self.q)
def test(self): a = numpy.random.randn(4, 4).astype(numpy.float32) b = numpy.random.randn(4, 4).astype(numpy.float32) c = numpy.random.randn(4, 4).astype(numpy.float32) a_gpu = cl_array.to_device(self.ctx, queue, a) b_gpu = cl_array.to_device(self.ctx, queue, b) c_gpu = cl_array.to_device(self.ctx, queue, c) dest_gpu = cl_array.empty_like(a_gpu)
def __call__(self, input_buf, imgx_buf, imgy_buf, mag_buf): if self.scratch is None or self.scratch.shape != input_buf.shape: self.scratch = cl_array.empty_like(input_buf) self.sepconv_cr(input_buf, self.sobel_c, self.sobel_r, imgx_buf, self.scratch) self.sepconv_rc(input_buf, self.sobel_r, self.sobel_c, imgy_buf, self.scratch) self.mag(mag_buf, imgx_buf, imgy_buf)
def test_derivedFnCoulomb(initials, expected): sut = cl.elementwise.ElementwiseKernel( context, 'double4 *k, double4 *y, double t', 'double4 temp_y = y[0]; k[0] = derivedFn(&temp_y, t)', name='sut', preamble=rk_pd_4d.derivedFnCoulomb) y = cl_array.to_device(queue, initials) k = cl_array.empty_like(y) sut(k, y, 0.0) assert expected == k.get()
def prim_to_flux(params, G, P, D=None, dir=0, loc=Loci.CENT, out=None): """Calculate fluxes of conserved varibles in direction dir, or if dir=0 the variables themselves""" sh = G.shapes if out is None: out = cl_array.empty_like(P) if D is None: D = get_state(params, G, P, loc) global knl_prim_to_flux if knl_prim_to_flux is None: code = replace_prim_names(""" out[RHO,i,j,k] = P[RHO,i,j,k] * ucon[dir,i,j,k] * gdet[i,j] out[UU,i,j,k] = (T[0,i,j,k] + P[RHO,i,j,k] * ucon[dir,i,j,k]) * gdet[i,j] out[U1,i,j,k] = T[1,i,j,k] * gdet[i,j] out[U2,i,j,k] = T[2,i,j,k] * gdet[i,j] out[U3,i,j,k] = T[3,i,j,k] * gdet[i,j] out[B1,i,j,k] = (bcon[1,i,j,k] * ucon[dir,i,j,k] - bcon[dir,i,j,k] * ucon[1,i,j,k]) * gdet[i,j] out[B2,i,j,k] = (bcon[2,i,j,k] * ucon[dir,i,j,k] - bcon[dir,i,j,k] * ucon[2,i,j,k]) * gdet[i,j] out[B3,i,j,k] = (bcon[3,i,j,k] * ucon[dir,i,j,k] - bcon[dir,i,j,k] * ucon[3,i,j,k]) * gdet[i,j] """) if 'electrons' in params and params['electrons']: code += replace_prim_names(""" out[KEL,i,j,k] = P[RHO,i,j,k] * ucon[dir,i,j,k] * gdet[i,j] * P[KEL,i,j,k] out[KTOT,i,j,k] = P[RHO,i,j,k] * ucon[dir,i,j,k] * gdet[i,j] * P[KTOT,i,j,k] """) # TODO also passives knl_prim_to_flux = lp.make_kernel( "[dir, ndim, n1, n2, n3] -> " + sh.isl_grid_scalar, code, [ *primsArrayArgs("P", "out", ghosts=False), *vecArrayArgs("T", "ucon", "bcon", ghosts=False), *gscalarArrayArgs("gdet", ghosts=False), ... ], assumptions=sh.assume_grid + "and 0 <= dir < ndim", default_offset=lp.auto) knl_prim_to_flux = lp.fix_parameters(knl_prim_to_flux, nprim=params['n_prim'], ndim=4) # TODO keep k because of the geom argument knl_prim_to_flux = tune_grid_kernel(knl_prim_to_flux, sh.grid_scalar) print("Compiled prim_to_flux") evt, _ = knl_prim_to_flux(params['queue'], P=P, T=Tmhd_vec(params, G, P, D, dir), gdet=G.gdet_d[loc.value], ucon=D['ucon'], bcon=D['bcon'], dir=dir, out=out) if 'profile' in params and params['profile']: evt.wait() return out
def __call__(self, input_ary, output_ary=None, allocator=None, queue=None): allocator = allocator or input_ary.allocator queue = queue or input_ary.queue or output_ary.queue if output_ary is None: output_ary = input_ary if isinstance(output_ary, (str, unicode)) and output_ary == "new": output_ary = cl_array.empty_like(input_ary, allocator=allocator) if input_ary.shape != output_ary.shape: raise ValueError("input and output must have the same shape") if not input_ary.flags.forc: raise RuntimeError("ScanKernel cannot " "deal with non-contiguous arrays") n, = input_ary.shape if not n: return output_ary unit_size = self.scan_wg_size * self.scan_wg_seq_batches max_groups = 3 * max(dev.max_compute_units for dev in self.devices) from pytools import uniform_interval_splitting interval_size, num_groups = uniform_interval_splitting( n, unit_size, max_groups) block_results = allocator(self.dtype.itemsize * num_groups) dummy_results = allocator(self.dtype.itemsize) # first level scan of interval (one interval per block) self.scan_intervals_knl(queue, (num_groups * self.scan_wg_size, ), (self.scan_wg_size, ), input_ary.data, n, interval_size, output_ary.data, block_results) # second level inclusive scan of per-block results self.scan_intervals_knl(queue, (self.scan_wg_size, ), (self.scan_wg_size, ), block_results, num_groups, interval_size, block_results, dummy_results) # update intervals with result of second level scan self.final_update_knl(queue, (num_groups * self.update_wg_size, ), (self.update_wg_size, ), output_ary.data, n, interval_size, block_results) return output_ary
def test_elwise_kernel(ctx_getter): context = ctx_getter() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand a_gpu = clrand(context, queue, (50,), numpy.float32) b_gpu = clrand(context, queue, (50,), numpy.float32) from pyopencl.elementwise import ElementwiseKernel lin_comb = ElementwiseKernel(context, "float a, float *x, float b, float *y, float *z", "z[i] = a*x[i] + b*y[i]", "linear_combination") c_gpu = cl_array.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
def callback_post(self, context): print("context:", context) queue = cl.CommandQueue(context) nd_data = np.array([[1, 2, 3, 4], [5, 6, 5, 2]], dtype=np.complex64) nd_user_data = np.array([[2, 2, 2, 2], [3, 4, 5, 6]], dtype=np.float32) cl_data = cla.to_device(queue, nd_data) cl_user_data = cla.to_device(queue, nd_user_data) cl_data_transformed = cla.empty_like(cl_data) G = GpyFFT(debug=False) plan = G.create_plan(context, cl_data.shape) plan.strides_in = tuple(x // cl_data.dtype.itemsize for x in cl_data.strides) plan.strides_out = tuple(x // cl_data.dtype.itemsize for x in cl_data_transformed.strides) plan.inplace = False plan.precision = CLFFT_SINGLE plan.set_callback(b'postset', self.callback_kernel_src_postset, 'post', user_data=cl_user_data.data) plan.bake(queue) plan.enqueue_transform((queue,), (cl_data.data,), (cl_data_transformed.data,) ) queue.finish() print('cl_data_transformed:') print(cl_data_transformed) print('fft(nd_data) * nd_user_data') print(np.fft.fftn(nd_data)) assert np.allclose(cl_data_transformed.get(), np.fft.fftn(nd_data) * nd_user_data) del plan
def test_spirv(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) if (ctx._get_cl_version() < (2, 1) or cl.get_cl_header_version() < (2, 1)): from pytest import skip skip("SPIR-V program creation only available in OpenCL 2.1 and higher") n = 50000 a_dev = cl.clrandom.rand(queue, n, np.float32) b_dev = cl.clrandom.rand(queue, n, np.float32) dest_dev = cl_array.empty_like(a_dev) with open("add-vectors-%d.spv" % queue.device.address_bits, "rb") as spv_file: spv = spv_file.read() prg = cl.Program(ctx, spv) prg.sum(queue, a_dev.shape, None, a_dev.data, b_dev.data, dest_dev.data) assert la.norm((dest_dev - (a_dev+b_dev)).get()) < 1e-7
# Use OpenCL To Add Two Random Arrays (Using PyOpenCL Arrays and Elementwise) import pyopencl as cl # Import the OpenCL GPU computing API import pyopencl.array as cl_array # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object) import numpy # Import Numpy number tools context = cl.create_some_context() # Initialize the Context queue = cl.CommandQueue(context) # Instantiate a Queue a = cl_array.to_device(queue, numpy.random.randn(10).astype(numpy.float32)) # Create a random pyopencl array b = cl_array.to_device(queue, numpy.random.randn(10).astype(numpy.float32)) # Create a random pyopencl array c = cl_array.empty_like(a) # Create an empty pyopencl destination array sum = cl.elementwise.ElementwiseKernel(context, "float *a, float *b, float *c", "c[i] = a[i] + b[i]", "sum") # Create an elementwise kernel object # - Arguments: a string formatted as a C argument list # - Operation: a snippet of C that carries out the desired map operatino # - Name: the fuction name as which the kernel is compiled sum(a, b, c) # Call the elementwise kernel print("a: {}".format(a)) print("b: {}".format(b)) print("c: {}".format(c)) # Print all three arrays, to show sum() worked
def gs_gpu(idata,itera=100): """Gerchberg-Saxton algorithm to calculate DOEs using the GPU Calculates the phase distribution in a object plane to obtain an specific amplitude distribution in the target plane. It uses a FFT to calculate the field propagation. The wavefront at the DOE plane is assumed as a plane wave. **ARGUMENTS:** ========== ====================================================== idata numpy array containing the target amplitude distribution itera Maximum number of iterations ========== ====================================================== """ pl=cl.get_platforms()[0] devices=pl.get_devices(device_type=cl.device_type.GPU) ctx = cl.Context(devices=[devices[0]]) queue = cl.CommandQueue(ctx) plan = Plan(idata.shape, queue=queue,dtype=complex128) #no funciona con "complex128" src = str(Template(KERNEL).render( double_support=all( has_double_support(dev) for dev in devices), amd_double_support=all( has_amd_double_support(dev) for dev in devices) )) prg = cl.Program(ctx,src).build() idata_gpu=cl_array.to_device(queue, ifftshift(idata).astype("complex128")) fdata_gpu=cl_array.empty_like(idata_gpu) rdata_gpu=cl_array.empty_like(idata_gpu) plan.execute(idata_gpu.data,fdata_gpu.data) e=1000 ea=1000 for i in range (itera): prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data) plan.execute(fdata_gpu.data,rdata_gpu.data,inverse=True) tr=rdata_gpu.get() rdata=ifftshift(tr) #TODO: This calculation should be done in the GPU e= (abs(rdata)-idata).std() if e>ea: break ea=e prg.norm2(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data) plan.execute(rdata_gpu.data,fdata_gpu.data) fdata=fdata_gpu.get() #~ prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data) fdata=ifftshift(fdata) fdata=exp(1.j*angle(fdata)) #~ fdata=fdata_gpu.get() return fdata
# Kernel for reduce-code krnlRed=ReductionKernel(ctx,numpy.float32,neutral="0", reduce_expr="a+b",map_expr="get_val(x[i])*%10.3f" % length, arguments="__global float *x", preamble=""" float get_val(float x) { return x*x; } """) # Generation of an array where each element is an evaluated integral. tonum=1000000 # Number of elements. # Array to send to the GPU. p_gpu=cl_array.to_device(ctx,queue,sp.linspace(0,tonum,tonum+1).astype(numpy.float32)) res=cl_array.empty_like(p_gpu) # The resultating array # Elementwise (mapping) kernel. krnlMap=ElementwiseKernel(ctx,"float *param, float *res", "res[i]=integrate(param[i])",preamble=""" float integrate(float param) { float sum=0; for (float f=0.0;f<10.0;f+=0.001) { sum+=(f*f-10*f-param); } return sum/1000.0; } """) integrand=krnlRed(vals).get() # Calculate the first integral. krnlMap(p_gpu,res) # Generate the large array.
def empty_like(self, a): arr = cl_array.empty_like(a) self._cl_arrays.append(arr) return arr
import pyopencl as cl import pyopencl.array as cl_array import numpy ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) n = 10 a_gpu = cl_array.to_device( ctx, queue, numpy.random.randn(n).astype(numpy.float32)) b_gpu = cl_array.to_device( ctx, queue, numpy.random.randn(n).astype(numpy.float32)) from pyopencl.elementwise import ElementwiseKernel lin_comb = ElementwiseKernel(ctx, "float a, float *x, " "float b, float *y, " "float *z", "z[i] = a*x[i] + b*y[i]", "linear_combination") c_gpu = cl_array.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) import numpy.linalg as la assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
def test_segmented_scan(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.tools import dtype_to_ctype dtype = np.int32 ctype = dtype_to_ctype(dtype) #for is_exclusive in [False, True]: for is_exclusive in [True, False]: if is_exclusive: output_statement = "out[i] = prev_item" else: output_statement = "out[i] = item" from pyopencl.scan import GenericScanKernel knl = GenericScanKernel(context, dtype, arguments="__global %s *ary, __global char *segflags, " "__global %s *out" % (ctype, ctype), input_expr="ary[i]", scan_expr="across_seg_boundary ? b : (a+b)", neutral="0", is_segment_start_expr="segflags[i]", output_statement=output_statement, options=[]) np.set_printoptions(threshold=2000) from random import randrange from pyopencl.clrandom import rand as clrand for n in scan_test_counts: a_dev = clrand(queue, (n,), dtype=dtype, a=0, b=10) a = a_dev.get() if 10 <= n < 20: seg_boundaries_values = [ [0, 9], [0, 3], [4, 6], ] else: seg_boundaries_values = [] for i in range(10): seg_boundary_count = max(2, min(100, randrange(0, int(0.4*n)))) seg_boundaries = [ randrange(n) for i in range(seg_boundary_count)] if n >= 1029: seg_boundaries.insert(0, 1028) seg_boundaries.sort() seg_boundaries_values.append(seg_boundaries) for seg_boundaries in seg_boundaries_values: #print "BOUNDARIES", seg_boundaries #print a seg_boundary_flags = np.zeros(n, dtype=np.uint8) seg_boundary_flags[seg_boundaries] = 1 seg_boundary_flags_dev = cl_array.to_device( queue, seg_boundary_flags) seg_boundaries.insert(0, 0) result_host = a.copy() for i, seg_start in enumerate(seg_boundaries): if i+1 < len(seg_boundaries): seg_end = seg_boundaries[i+1] else: seg_end = None if is_exclusive: result_host[seg_start+1:seg_end] = np.cumsum( a[seg_start:seg_end][:-1]) result_host[seg_start] = 0 else: result_host[seg_start:seg_end] = np.cumsum( a[seg_start:seg_end]) #print "REF", result_host result_dev = cl_array.empty_like(a_dev) knl(a_dev, seg_boundary_flags_dev, result_dev) #print "RES", result_dev is_correct = (result_dev.get() == result_host).all() if not is_correct: diff = result_dev.get() - result_host print("RES-REF", diff) print("ERRWHERE", np.where(diff)) print(n, list(seg_boundaries)) assert is_correct from gc import collect collect() print("%d excl:%s done" % (n, is_exclusive))
from __future__ import absolute_import from __future__ import print_function import pyopencl as cl import pyopencl.array as cl_array import numpy import numpy.linalg as la a = numpy.random.rand(50000).astype(numpy.float32) b = numpy.random.rand(50000).astype(numpy.float32) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) a_dev = cl_array.to_device(queue, a) b_dev = cl_array.to_device(queue, b) dest_dev = cl_array.empty_like(a_dev) prg = cl.Program(ctx, """ __kernel void sum(__global const float *a, __global const float *b, __global float *c) { int gid = get_global_id(0); c[gid] = a[gid] + b[gid]; } """).build() prg.sum(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) print(la.norm((dest_dev - (a_dev+b_dev)).get()))
def callback_pre(self, context): print("context:", context) queue = cl.CommandQueue(context) nd_data = np.array([[1, 2, 3, 4], [5, 6, 5, 2]], dtype=np.complex64) cl_data = cla.to_device(queue, nd_data) cl_data_transformed = cla.empty_like(cl_data) print("cl_data:") print(cl_data) print('nd_data.shape/strides:', nd_data.shape, nd_data.strides) print('cl_data.shape/strides:', cl_data.shape, cl_data.strides) print('cl_data_transformed.shape/strides:', cl_data_transformed.shape, cl_data_transformed.strides) G = GpyFFT(debug=False) plan = G.create_plan(context, cl_data.shape) plan.strides_in = tuple(x // cl_data.dtype.itemsize for x in cl_data.strides) plan.strides_out = tuple(x // cl_data.dtype.itemsize for x in cl_data_transformed.strides) print('plan.strides_in', plan.strides_in) print('plan.strides_out', plan.strides_out) print('plan.distances', plan.distances) print('plan.batch_size', plan.batch_size) plan.inplace = False plan.precision = CLFFT_SINGLE print('plan.precision:', plan.precision) plan.scale_forward = 1. print('plan.scale_forward:', plan.scale_forward) #print('plan.transpose_result:', plan.transpose_result) nd_user_data = np.array([[2, 2, 2, 2], [3, 4, 5, 6]], dtype=np.float32) cl_user_data = cla.to_device(queue, nd_user_data) print('cl_user_data') print(cl_user_data) plan.set_callback(b'premul', self.callback_kernel_src_premul, 'pre', user_data=cl_user_data.data) plan.bake(queue) print('plan.temp_array_size:', plan.temp_array_size) plan.enqueue_transform((queue,), (cl_data.data,), (cl_data_transformed.data,) ) queue.finish() print('cl_data_transformed:') print(cl_data_transformed) print('fft(nd_data * nd_user_data):') print(np.fft.fftn(nd_data * nd_user_data)) assert np.allclose(cl_data_transformed.get(), np.fft.fftn(nd_data * nd_user_data)) del plan
import numpy as np import gpyfft G = gpyfft.GpyFFT(debug=True) print "clAmdFft Version: %d.%d.%d"%(G.get_version()) context = cl.create_some_context() queue = cl.CommandQueue(context) print "context:", hex(context.obj_ptr) print "queue:", hex(queue.obj_ptr) nd_data = np.array([[1,2,3,4], [5,6,7,8]], dtype = np.complex64) cl_data = cla.to_device(queue, nd_data) cl_data_transformed = cla.empty_like(cl_data) print "cl_data:" print cl_data print 'nd_data.shape/strides', nd_data.shape, nd_data.strides print 'cl_data.shape/strides', cl_data.shape, cl_data.strides print 'cl_data_transformed.shape/strides', cl_data_transformed.shape, cl_data_transformed.strides plan = G.create_plan(context, cl_data.shape) plan.strides_in = tuple(x//cl_data.dtype.itemsize for x in cl_data.strides) plan.strides_out = tuple(x//cl_data.dtype.itemsize for x in cl_data_transformed.strides) print 'plan.strides_in', plan.strides_in print 'plan.strides_out', plan.strides_out
a = np.array(range(10, 1, -1), dtype=np.float32) test_im = np.outer(a, a) row_k = np.array([1, 2, 3]).astype(np.float32) col_k = np.array([5, 6, 7]).astype(np.float32) else: test_im = np.ones([10, 10]).astype(np.float32) row_k = np.array([1, 2, 3]).astype(np.float32) col_k = np.array([2, 4, 5]).astype(np.float32) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) in_buf = cl_array.to_device(queue, test_im) row_buf = cl_array.to_device(queue, row_k) col_buf = cl_array.to_device(queue, col_k) out_buf = cl_array.empty_like(in_buf) imgx_buf = cl_array.empty_like(in_buf) imgy_buf = cl_array.empty_like(in_buf) mag_buf = cl_array.empty_like(in_buf) # Test the Sobel sobel = Sobel(ctx, queue) sobel(in_buf, imgx_buf, imgy_buf, mag_buf) print(imgx_buf.get()) print(mag_buf.get()) # Test the conv #conv = NaiveSeparableCorrelation(ctx, queue) conv = LocalMemorySeparableCorrelation(ctx, queue)
def dilate(): #headURI = 'http://www.slicer.org/slicerWiki/images/4/43/MR-head.nrrd' #labelURI = 'http://boggs.bwh.harvard.edu/tmp/MRHead-label.nrrd' base = '/tmp/hoot/' headURI = base + 'MR-head.nrrd' labelURI = base + 'MR-head-label.nrrd' print("Starting...") if not slicer.util.getNode('MR-head*'): print("Downloading...") vl = slicer.modules.volumes.logic() name = 'MR-head' volumeNode = vl.AddArchetypeVolume(headURI, name, 0) name = 'MR-head-label' labelNode = vl.AddArchetypeVolume(labelURI, name, 1) if volumeNode: storageNode = volumeNode.GetStorageNode() if storageNode: # Automatically select the volume to display appLogic = slicer.app.applicationLogic() selNode = appLogic.GetSelectionNode() selNode.SetReferenceActiveVolumeID(volumeNode.GetID()) selNode.SetReferenceActiveLabelVolumeID(labelNode.GetID()) appLogic.PropagateVolumeSelection(1) node = slicer.util.getNode('MR-head') volume = slicer.util.array('MR-head') oneOverVolumeMax = 1. / volume.max() labelNode = slicer.util.getNode('MR-head-label') labelVolume = slicer.util.array('MR-head-label') print("Creating Context...") ctx = None for platform in cl.get_platforms(): for device in platform.get_devices(): print(cl.device_type.to_string(device.type)) if cl.device_type.to_string(device.type) == "GPU": ctx = cl.Context([device]) break; if not ctx: print ("no GPU context available") ctx = cl.create_some_context() print("Creating Queue...") queue = cl.CommandQueue(ctx) print("Copying volumes...") mf = cl.mem_flags volume_dev = cl_array.to_device(queue, volume) volume_image_dev = cl.image_from_array(ctx, volume,1) label_dev = cl.array.to_device(queue, labelVolume) theta = numpy.zeros_like(volume) theta_dev = cl.array.to_device(queue,theta) thetaNext = numpy.zeros_like(volume) thetaNext_dev = cl.array.to_device(queue,thetaNext) dest_dev = cl_array.empty_like(volume_dev) sampler = cl.Sampler(ctx,False,cl.addressing_mode.REPEAT,cl.filter_mode.LINEAR) print("Building program...") slices,rows,columns = volume.shape prg = cl.Program(ctx, """ #pragma OPENCL EXTENSION cl_khr_fp64: enable __kernel void copy( __global short source[{slices}][{rows}][{columns}], __global short destination[{slices}][{rows}][{columns}]) {{ size_t slice = get_global_id(0); size_t column = get_global_id(1); size_t row = get_global_id(2); if (slice < {slices} && row < {rows} && column < {columns}) {{ destination[slice][row][column] = source [slice][row][column]; }} }} __kernel void dilate( __read_only image3d_t volume, __global short label[{slices}][{rows}][{columns}], sampler_t volumeSampler, __global short dest[{slices}][{rows}][{columns}]) {{ size_t slice = get_global_id(0); size_t column = get_global_id(1); size_t row = get_global_id(2); if (slice >= {slices} || row >= {rows} || column >= {columns}) {{ return; }} int size = 1; int sliceOff, rowOff, columnOff; unsigned int sampleSlice, sampleRow, sampleColumn; short samples = 0; float4 samplePosition; for (sliceOff = -size; sliceOff <= size; sliceOff++) {{ sampleSlice = slice + sliceOff; if (sampleSlice < 0 || sampleSlice >= {slices}) continue; for (rowOff = -size; rowOff <= size; rowOff++) {{ sampleRow = row + rowOff; if (sampleRow < 0 || sampleRow >= {rows}) continue; for (columnOff = -size; columnOff <= size; columnOff++) {{ sampleColumn = column + columnOff; if (sampleColumn < 0 || sampleColumn >= {columns}) continue; if (label[sampleSlice][sampleRow][sampleColumn] != 0) {{ samples++; }} }} }} }} dest[slice][row][column] = samples; }} """.format(slices=slices,rows=rows,columns=columns)).build() def iterate(iterations=10): print("Running!") for iteration in xrange(iterations): prg.dilate(queue, volume.shape, None, volume_image_dev, label_dev.data, sampler, dest_dev.data) prg.copy(queue, volume.shape, None, dest_dev.data, label_dev.data) print("Getting data...") labelVolume[:] = dest_dev.get() print("Rendering...") labelNode.GetImageData().Modified() node.GetImageData().Modified() print("Done!") def grow(iterations=10): for iteration in xrange(iterations): iterate(1) slicer.app.processEvents()
def memoryBlur(): print("Starting...") if not slicer.util.getNode('MRHead*'): print("Downloading...") vl = slicer.modules.volumes.logic() uri = 'http://www.slicer.org/slicerWiki/images/4/43/MR-head.nrrd' name = 'MRHead' volumeNode = vl.AddArchetypeVolume(uri, name, 0) if volumeNode: storageNode = volumeNode.GetStorageNode() if storageNode: # Automatically select the volume to display appLogic = slicer.app.applicationLogic() selNode = appLogic.GetSelectionNode() selNode.SetReferenceActiveVolumeID(volumeNode.GetID()) appLogic.PropagateVolumeSelection(1) node = slicer.util.getNode('MRHead*') volume = slicer.util.array('MRHead*') print("Creating Context...") ctx = None for platform in cl.get_platforms(): for device in platform.get_devices(): print(cl.device_type.to_string(device.type)) if cl.device_type.to_string(device.type) == "GPU": ctx = cl.Context([device]) if not ctx: print ("no GPU context available") ctx = cl.create_some_context() print("Creating Queue...") queue = cl.CommandQueue(ctx) print("Copying volume...") mf = cl.mem_flags volume_dev = cl_array.to_device(queue, volume) dest_dev = cl_array.empty_like(volume_dev) print("Building program...") slices,rows,columns = volume.shape prg = cl.Program(ctx, """ #pragma OPENCL EXTENSION cl_khr_fp64: enable __kernel void blur( __global const short volume[{slices}][{rows}][{columns}], __global short dest[{slices}][{rows}][{columns}]) {{ size_t slice = get_global_id(0); size_t column = get_global_id(1); size_t row = get_global_id(2); int size = 3; int sliceOff, rowOff, columnOff; unsigned int sampleSlice, sampleRow, sampleColumn; double sum = 0; unsigned int samples = 0; for (sliceOff = -size; sliceOff <= size; sliceOff++) {{ sampleSlice = slice + sliceOff; if (sampleSlice < 0 || sampleSlice >= {slices}) continue; for (rowOff = -size; rowOff <= size; rowOff++) {{ sampleRow = row + rowOff; if (sampleRow < 0 || sampleRow >= {rows}) continue; for (columnOff = -size; columnOff <= size; columnOff++) {{ sampleColumn = column + columnOff; if (sampleColumn < 0 || sampleColumn >= {columns}) continue; sum += volume[sampleSlice][sampleRow][sampleColumn]; samples++; }} }} }} dest[slice][row][column] = (short) (sum / samples); }} """.format(slices=slices,rows=rows,columns=columns)).build() print("Running!") prg.blur(queue, volume.shape, None, volume_dev.data, dest_dev.data) print("Getting data...") volume[:] = dest_dev.get() print("Rendering...") node.GetImageData().Modified() print("Done!")
end_val=6.0 start_val=0.0 side_length=2000 print "Solving mom-space with matrix-dimensions:", side_length,"x",side_length # Timing, to see how fast the code is now. t1=time.time() # Create all the necessary arrays. x_vector=numpy.array([i%side_length for i in range(side_length**2)]) y_vector=numpy.array([(i-i%side_length)/side_length for i in range(side_length**2)]) gpu_matrix_x=cl_array.to_device(ctx,queue,(x_vector).astype(numpy.float32)) gpu_matrix_y=cl_array.to_device(ctx,queue,(y_vector).astype(numpy.float32)) gpu_matrix_res=cl_array.empty_like(gpu_matrix_x) # Kernel to generate an identity matrix. krnl_identity_matrix=ElementwiseKernel(ctx,"int *x, int *y, float *res", "res[i]=get_element(x[i],y[i])",preamble=""" float get_element(int x, int y) { return x==y; } """) # Kernel to generate a matrix through integrals depending on row and column. # The integrand is given in the C-function f(). krnl_gaussian_matrix=ElementwiseKernel(ctx,"float *x, float *y, float start, float end, float step, float *res", "res[i]=get_element(x[i],y[i],start,end,step)", preamble="#define PI 3.14159265f\n"+ "".join(open("bessel.cl",'r').readlines())+
if not ctx: print ("preferred context not available") ctx = cl.create_some_context() print("Creating Queue...") queue = cl.CommandQueue(ctx) print("Copying volumes...") mf = cl.mem_flags volume_dev = cl_array.to_device(queue, volume) label_dev = cl.array.to_device(queue, labelVolume) binaryLabels = numpy.logical_not(numpy.logical_not(labelVolume)) theta = float(2**15) * numpy.array(binaryLabels,dtype=numpy.dtype('float32')) theta_dev = cl.array.to_device(queue,theta) thetaNext = numpy.copy(numpy.array(theta, dtype=numpy.dtype('float32'))) thetaNext_dev = cl.array.to_device(queue,thetaNext) labelNext_dev = cl_array.empty_like(label_dev) candidates = labelVolume.copy() candidates_dev = cl.array.to_device(queue,candidates) candidatesNext = candidates.copy() candidatesNext_dev = cl.array.to_device(queue,candidatesNext) candidatesInitialized = False print("label mean ", labelVolume.mean()) print("label_dev mean ", label_dev.get().mean()) print("labelNext_dev mean ", labelNext_dev.get().mean()) print("candidates_dev mean ", candidates_dev.get().mean()) print("theta mean ", theta.mean()) print("thetaNext_dev mean ", thetaNext_dev.get().mean()) print("candidatesNext_dev mean ", candidatesNext_dev.get().mean()) print("Building program...")
def __call__(self, input_dev, row_dev, col_dev, result_dev, scratch_dev=None): if scratch_dev is None: scratch_dev = cla.empty_like(input_dev) row_tile_width = 128 #row_tile_width = 64 col_tile_width = 8 #col_tile_height = 16 col_tile_height = 8 #col_tile_width = 16 #col_tile_height = 48 col_hstride = 8 assert (np.mod(row_dev.shape[0], 2) == 1, "Kernels must be of odd width") row_kernel_radius = row_dev.shape[0] / 2 coallescing_quantum = 16 row_kernel_radius_aligned = ((row_kernel_radius / coallescing_quantum) * coallescing_quantum) if row_kernel_radius_aligned == 0: row_kernel_radius_aligned = coallescing_quantum assert (np.mod(col_dev.shape[0], 2) == 1, "Kernels must be of odd width") col_kernel_radius = col_dev.shape[0] / 2 prg = self.build_program(input_dev.dtype, input_dev.shape, row_kernel_radius, row_kernel_radius_aligned, row_tile_width, col_kernel_radius, col_tile_width, col_tile_height, col_hstride) # Row kernel launch parameters row_local_size = (row_kernel_radius_aligned + row_tile_width + row_kernel_radius, 1) row_group_size = (int_div_up(input_dev.shape[1], row_tile_width), input_dev.shape[0]) row_global_size = (row_local_size[0] * row_group_size[0], row_local_size[1] * row_group_size[1]) # Column kernel launch parameters col_local_size = (min(input_dev.shape[1], col_tile_width), min(input_dev.shape[0], col_hstride)) col_group_size = (int_div_up(input_dev.shape[1], col_tile_width), int_div_up(input_dev.shape[0], col_tile_height)) col_global_size = (col_local_size[0] * col_group_size[0], col_local_size[1] * col_group_size[1]) row_global_size = tuple(int(e) for e in row_global_size) row_local_size = tuple(int(e) for e in row_local_size) col_global_size = tuple(int(e) for e in col_global_size) col_local_size = tuple(int(e) for e in col_local_size) try: prg.separable_convolution_row(self.queue, row_global_size, row_local_size, scratch_dev.data, input_dev.data, row_dev.data) except Exception as ex: print(input_dev.shape) print(row_dev.shape) print(row_global_size) print(row_local_size) raise ex try: prg.separable_convolution_col(self.queue, col_global_size, col_local_size, result_dev.data, scratch_dev.data, col_dev.data) except Exception as e: print(input_dev.shape) print(result_dev) print(scratch_dev) print(col_dev) print(col_dev.shape) raise e