def _fold_exp_and_coh(t_array, w, tz, tau_arr):
    """Run the ``fold_exp`` and ``coh_gauss`` OpenCL kernels on a time grid.

    Relies on the module-level ``ctx``, ``queue``, ``mf``, ``prg``,
    ``coh_no_div`` and ``work_size`` objects.

    Parameters
    ----------
    t_array : numpy.ndarray
        Time points; ``shape[0]`` and ``shape[1]`` size the outputs.
        When ``tz`` is non-zero it is subtracted from this array *in
        place*, so the caller's array is modified.
    w : float
        Width parameter. ``fold_exp`` receives ``w`` and ``coh_gauss``
        receives ``w / 1.4142``, both converted to float32.
    tz : float
        Offset subtracted from ``t_array``.
    tau_arr : numpy.ndarray
        Its element-wise reciprocal is uploaded as float32.

    Returns
    -------
    tuple of numpy.ndarray
        ``(a, b)`` with shapes ``(shape[0], shape[1], tau_arr.size)`` and
        ``(shape[0], shape[1], 3)``. ``b`` is divided by the maximum of
        its absolute value along axis 0.
    """
    if tz != 0.:
        # NOTE(review): in-place subtraction is kept for compatibility;
        # callers see their array shifted by tz.
        t_array -= tz
    shape = t_array.shape
    t_array = t_array.astype(np.float32)
    n_points = t_array.size

    read_only = mf.READ_ONLY | mf.COPY_HOST_PTR
    t_arr_gpu = cl.Buffer(ctx, read_only, hostbuf=t_array)
    tau_buf = cl.Buffer(ctx, read_only,
                        hostbuf=(1 / tau_arr).astype(np.float32))

    out = cl_array.empty(queue, shape=(shape[0], shape[1], tau_arr.size),
                         dtype=np.float32)
    out_coh = cl_array.empty(queue, shape=(shape[0], shape[1], 3),
                             dtype=np.float32)

    # Pad the first launch dimension past n_points to a multiple of the
    # work-group size; the kernels receive n_points as their last argument.
    global_work_size = n_points + (work_size[0] - n_points % work_size[0])
    prg.fold_exp(queue, (global_work_size, tau_arr.size), work_size,
                 t_arr_gpu, np.float32(w), tau_buf, out.data,
                 np.uint32(n_points))
    coh_no_div.coh_gauss(queue, (global_work_size, 3), work_size,
                         t_arr_gpu, np.float32(w / 1.4142), out_coh.data,
                         np.uint32(n_points))
    queue.finish()

    a = out.get(async_=True)
    b = out_coh.get(async_=True)
    # The reads above are non-blocking: wait for them to complete BEFORE
    # touching the host arrays, otherwise the normalisation races with the
    # device-to-host copy.
    queue.finish()
    b /= np.abs(b).max(0)
    return a, b
def get_flux(params, G, P):
    """Compute the fluxes in directions 1-3 and the resulting timestep.

    For each direction the left- and right-going states are reconstructed
    into the module-level scratch arrays ``Pl``/``Pr`` (allocated on the
    first call, together with ``ctop``) and then turned into a net flux
    by ``lr_to_flux``.

    Returns ``(F, ndt)`` where ``F`` is a 4-element list whose entries
    1..3 hold the per-direction fluxes (entry 0 stays ``0``) and ``ndt``
    is ``params['dt_start']`` when ``params['dt_static']`` is truthy,
    otherwise the result of ``ndt_min(params, G, ctop)``.
    """
    global Pl, Pr, ctop

    shapes = G.shapes
    # Only slots 1..3 are ever written.
    fluxes = [0] * 4

    if Pl is None:
        Pl = cl_array.empty(params['queue'], shapes.grid_primitives,
                            dtype=np.float64)
        Pr = cl_array.empty(params['queue'], shapes.grid_primitives,
                            dtype=np.float64)
        ctop = cl_array.empty(params['queue'], shapes.grid_vector,
                              dtype=np.float64)

    faces = ((1, Loci.FACE1), (2, Loci.FACE2), (3, Loci.FACE3))
    for direction, face in faces:
        # Reconstruct left/right states, then convert them to a net flux.
        reconstruct(params, G, P, direction, lout=Pl, rout=Pr)
        fluxes[direction], ctop[direction] = lr_to_flux(
            params, G, Pl, Pr, direction, face)

    ndt = params['dt_start'] if params['dt_static'] else ndt_min(params, G, ctop)
    return fluxes, ndt
def __init__(self, height, width):
    """
    height, width : size of the screen
    """
    # This is the 'Engine', not the 'Viewer'; its size is expected to stay
    # fixed while running.
    self._height = height
    self._width = width
    self._image = np.zeros((self.size[0], self.size[1], 3), dtype=np.uint8)
    self._TM = ThingsManager()

    # OpenCL setup: first device of the first platform.
    first_platform = cl.get_platforms()[0]
    self.device = first_platform.get_devices()[0]
    self.ctx = cl.Context([self.device])
    self.queue = cl.CommandQueue(self.ctx)

    # Host-side colors and their device copies.
    self.bg_color = np.array(colors.COLOR_BACKGROUND, dtype=np.uint8)
    self.wall_color = np.array(colors.COLOR_WALL, dtype=np.uint8)
    self.image_dev = cl_array.empty(self.queue, self.image.shape, np.uint8)
    self.bg_col_dev = cl_array.to_device(self.queue, self.bg_color)
    self.wall_col_dev = cl_array.to_device(self.queue, self.wall_color)

    self.fp_ray_dev = None
    self.delta_vec_dev = None
    self.observation_dev = cl_array.empty(
        self.queue, (2, ec.RayNum, 3), np.uint8)

    # Build the ray kernel shipped next to this module.
    cl_path = path.join(path.dirname(__file__), 'cl_scripts/ray.cl')
    with open(cl_path, 'r') as source_file:
        kernel_source = source_file.read()
    self.program = cl.Program(self.ctx, kernel_source).build()

    # Things must exist before the CollisionManager is created.
    self.initiate_things()
    self._CM = CollisionManager(self.size, self._TM)
def __init__(self, par):
    """Set up the operator from the parameter dictionary *par*.

    Reads the keys ``"C"``, ``"traj"``, ``"NSlice"``, ``"NScan"``,
    ``"dimX"``, ``"dimY"``, ``"NC"``, ``"N"``, ``"Nproj"``, ``"InScale"``
    and ``"ogf"`` from *par*, and uses the first entries of ``par["ctx"]``
    and ``par["queue"]`` as OpenCL context and command queue.

    Allocates the coil buffer and two temporary device arrays, creates the
    ``NUFFT`` helper and builds the ``Program`` from the packaged
    ``kernels/opencl_operator_kernels.c`` source.
    """
    self.C = par["C"]
    self.traj = par["traj"]
    self.NSlice = par["NSlice"]
    self.NScan = par["NScan"]
    self.dimX = par["dimX"]
    self.dimY = par["dimY"]
    self.NC = par["NC"]
    self.fval_min = 0
    self.fval = 0
    self.ctx = par["ctx"][0]
    self.queue = par["queue"][0]
    self.res = []
    self.N = par["N"]
    self.Nproj = par["Nproj"]
    self.incor = par["InScale"].astype(DTYPE)
    self.coil_buf = cl.Buffer(
        self.ctx,
        cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=self.C.data)
    self.tmp_sino = clarray.empty(
        self.queue,
        (self.NScan, self.NC, self.NSlice, self.Nproj, self.N),
        DTYPE, "C")
    self.tmp_result = clarray.empty(
        self.queue,
        (self.NScan, self.NC, self.NSlice, self.dimY, self.dimX),
        DTYPE, "C")
    self.NUFFT = NUFFT(self.ctx, self.queue, par,
                       overgridfactor=par["ogf"])

    # Read the kernel source through a context manager so the file handle
    # is closed deterministically instead of being leaked.
    kernel_path = resource_filename(
        'rrsg_cgreco', 'kernels/opencl_operator_kernels.c')
    with open(kernel_path) as kernel_file:
        kernel_source = kernel_file.read()
    self.prg = Program(self.ctx, kernel_source)
def maxpool2d(q, A, f, stride, out=None, indices=None):
    """Run the ``max_pool`` OpenCL kernel over a 4-D device array.

    Parameters
    ----------
    q
        Command queue used for allocations and the kernel launch.
    A
        Device array whose shape unpacks as ``(n, c, h, w)``.
    f : int
        Window size; passed to the kernel for both window arguments.
    stride : int
        Stride; passed to the kernel for both stride arguments.
    out, indices : optional
        Pre-allocated result arrays of shape ``(n, c, out_h, out_w)``.
        When omitted, ``out`` is allocated with ``A.dtype`` and
        ``indices`` with ``np.int32``.

    Returns
    -------
    tuple
        ``(out, indices)`` after the kernel event has been waited on.
    """
    dtype = dtype_to_ctype(A.dtype)
    n, c, h, w = A.shape
    # Floor division keeps the output sizes integral on Python 3 as well
    # (true division would yield floats, which are not valid array shapes).
    out_h = (h - f) // stride + 1
    out_w = (w - f) // stride + 1
    if out is None:
        out = clarray.empty(q, (n, c, out_h, out_w), dtype=A.dtype)
    if indices is None:
        indices = clarray.empty(q, (n, c, out_h, out_w), dtype=np.int32)

    # Build the kernel once and reuse it from the module-level cache.
    if 'max_pool' not in _kernel_cache:
        prg = cl.Program(clplatf.ctx,
                         _maxpool_template % {'dtype': dtype}).build()
        _kernel_cache['max_pool'] = prg.max_pool
    krnl = _kernel_cache['max_pool']

    # TODO better global and local dimensions (make divisible by 64 etc.)
    ev = krnl(q, (n * c * out_h * out_w, ), None,
              A.data, out.data, indices.data,
              np.int32(h), np.int32(w),
              np.int32(out_h), np.int32(out_w),
              np.int32(f), np.int32(f),
              np.int32(stride), np.int32(stride))
    ev.wait()
    return out, indices
def init_OpenCL_quanti(self, N_var, msg_at_time, return_buffer_only=False):
    """Inits the OpenCL context and transfers all static data to the device.

    The kernel template ``kernels_quanti_template.cl`` located next to this
    module is rendered with ``Nvar=N_var`` and built. Afterwards the
    ``quantize`` and ``quantize_LLR`` kernels are bound, and the device
    buffers ``limit_buff``, ``cluster_buff``, ``LLR_buff`` and
    ``LLR_values_buff`` are created.

    Parameters
    ----------
    N_var : int
        Passed to the kernel template and used as the first dimension of
        ``cluster_buff`` and ``LLR_buff``.
    msg_at_time : int
        Second dimension of ``cluster_buff`` and ``LLR_buff``.
    return_buffer_only : bool
        Stored on the instance unchanged.
    """
    self.context = cl.create_some_context()
    print(self.context.get_info(cl.context_info.DEVICES))

    module_dir = os.path.dirname(os.path.abspath(__file__))
    template_path = os.path.join(module_dir, 'kernels_quanti_template.cl')
    # Use a context manager so the template file is closed right away.
    with open(template_path) as template_file:
        kernelsource = template_file.read()
    rendered_tp = Template(kernelsource).render(Nvar=N_var)
    self.program = cl.Program(self.context, str(rendered_tp)).build()
    self.return_buffer_only = return_buffer_only

    # Set up OpenCL
    self.queue = cl.CommandQueue(self.context)
    self.quantize = self.program.quantize
    self.quantize.set_scalar_arg_dtypes([np.int32, None, None, None])
    self.quantize_LLR = self.program.quantize_LLR
    self.quantize_LLR.set_scalar_arg_dtypes(
        [np.int32, None, None, None, None])

    self.limit_buff = cl_array.to_device(
        self.queue, self.cdf_t_given_x_equals_zero.astype(np.float64))
    self.cluster_buff = cl_array.empty(
        self.queue, (N_var, msg_at_time), dtype=np.int32)
    self.LLR_buff = cl_array.empty(
        self.queue, (N_var, msg_at_time), dtype=np.float64)
    self.LLR_values_buff = cl_array.to_device(
        self.queue, self.output_LLRs.astype(np.float64))
def setup_arrays(self, nrays, nsamples, cutoff):
    """Return the device work arrays for ``(nrays, nsamples, cutoff)``.

    Array sets are memoised in ``self.array_cache`` keyed by the argument
    triple; on a miss a new ``ArraySet`` is allocated from
    ``self.memory_pool`` on ``self.queue``, stored and returned.
    """
    cache_key = (nrays, nsamples, cutoff)
    if cache_key in self.array_cache:
        return self.array_cache[cache_key]

    def pooled(shape, dtype):
        # Uninitialised device array drawn from the shared memory pool.
        return cla.empty(self.queue, shape, dtype=dtype,
                         allocator=self.memory_pool)

    bundle = ArraySet()
    bundle.scratch = pooled((nsamples, nrays), np.float32)
    bundle.result = pooled((nrays,), np.int32)
    bundle.pre_cutoff = pooled((nrays, cutoff), np.float32)
    bundle.pre_cutoff_squared = cla.empty_like(bundle.pre_cutoff)
    bundle.idx = cla.arange(self.queue, 0, cutoff * nrays, 1,
                            dtype=np.int32, allocator=self.memory_pool)

    self.array_cache[cache_key] = bundle
    return bundle
def _fold_exp_and_coh(t_array, w, tz, tau_arr):
    """Evaluate the ``fold_exp`` and ``coh_gauss`` kernels for a time grid.

    Uses the module-level ``ctx``, ``queue``, ``mf``, ``prg``,
    ``coh_no_div`` and ``work_size``.

    Parameters
    ----------
    t_array : numpy.ndarray
        Time points; the outputs are sized from ``shape[0]`` and
        ``shape[1]``. If ``tz`` is non-zero it is subtracted from this
        array in place (the caller's array changes).
    w : float
        Width parameter; ``fold_exp`` gets ``w``, ``coh_gauss`` gets
        ``w / 1.4142`` (both as float32).
    tz : float
        Offset subtracted from ``t_array``.
    tau_arr : numpy.ndarray
        Uploaded as the float32 reciprocal ``1 / tau_arr``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(a, b)``: ``a`` has shape ``(shape[0], shape[1], tau_arr.size)``;
        ``b`` has shape ``(shape[0], shape[1], 3)`` and is scaled by the
        maximum absolute value along axis 0.
    """
    if tz != 0.:
        # NOTE(review): mutates the caller's array; kept for compatibility.
        t_array -= tz
    shape = t_array.shape
    t_array = t_array.astype(np.float32)

    t_arr_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=t_array)
    tau_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                        hostbuf=(1 / tau_arr).astype(np.float32))

    shape = (shape[0], shape[1], tau_arr.size)
    shape_coh = (shape[0], shape[1], 3)
    out = cl_array.empty(queue, shape=shape, dtype=np.float32)
    out_coh = cl_array.empty(queue, shape=shape_coh, dtype=np.float32)

    # First launch dimension: padded past t_array.size to a multiple of
    # the work-group size; the true size is passed as the last argument.
    global_work_size = t_array.size + (
        work_size[0] - t_array.size % work_size[0])
    prg.fold_exp(queue, (global_work_size, tau_arr.size), work_size,
                 t_arr_gpu, np.float32(w), tau_buf, out.data,
                 np.uint32(t_array.size))
    coh_no_div.coh_gauss(queue, (global_work_size, 3), work_size,
                         t_arr_gpu, np.float32(w / 1.4142), out_coh.data,
                         np.uint32(t_array.size))
    queue.finish()

    a = out.get(async_=True)
    b = out_coh.get(async_=True)
    # Both reads are asynchronous; finish them before the host-side
    # normalisation so it never operates on partially transferred data.
    queue.finish()
    b /= np.abs(b).max(0)
    return a, b
def _alloctmparrays(self, inp_shape, outp_shape):
    """Allocate the temporary input/output device arrays.

    For each of the ``self.num_fun`` functions, ``2 * self.num_dev`` buffer
    slots are appended to ``self.inp`` and ``self.outp``. Every array has
    ``self.slices + self.overlap`` as leading dimension followed by the
    trailing dimensions of the corresponding shape; empty input shapes are
    represented by an empty list. Slot ``i`` allocates on
    ``self.queue[4 * (i // 2)]``.
    """
    leading = (self.slices + self.overlap, )
    n_slots = 2 * self.num_dev

    for fun_idx in range(self.num_fun):
        self.inp.append([])
        for slot in range(n_slots):
            self.inp[fun_idx].append([])
            for arg_shape in inp_shape[fun_idx]:
                if len(arg_shape) != 0:
                    entry = clarray.empty(
                        self.queue[4 * (slot // 2)],
                        leading + arg_shape[1:],
                        dtype=self.dtype)
                else:
                    entry = []
                self.inp[fun_idx][slot].append(entry)

    for fun_idx in range(self.num_fun):
        self.outp.append([])
        for slot in range(n_slots):
            self.outp[fun_idx].append(
                clarray.empty(
                    self.queue[4 * (slot // 2)],
                    leading + outp_shape[fun_idx][1:],
                    dtype=self.dtype))
def setup_arrays(self, nrays, nsamples, cutoff):
    """Fetch (or lazily create) the ``ArraySet`` for the given sizes.

    The triple ``(nrays, nsamples, cutoff)`` is the key into
    ``self.array_cache``. On a miss, all arrays are allocated on
    ``self.queue`` with ``self.memory_pool`` as allocator, cached and
    returned.
    """
    key = (nrays, nsamples, cutoff)
    if key in self.array_cache:
        return self.array_cache[key]

    pool = {'allocator': self.memory_pool}
    work = ArraySet()
    work.scratch = cla.empty(self.queue, (nsamples, nrays),
                             dtype=np.float32, **pool)
    work.result = cla.empty(self.queue, (nrays, ),
                            dtype=np.int32, **pool)
    work.pre_cutoff = cla.empty(self.queue, (nrays, cutoff),
                                dtype=np.float32, **pool)
    work.pre_cutoff_squared = cla.empty_like(work.pre_cutoff)
    work.idx = cla.arange(self.queue, 0, cutoff * nrays, 1,
                          dtype=np.int32, **pool)

    self.array_cache[key] = work
    return work
def init_openCL(self, set_mem_pool_None=False):
    """Create the OpenCL context, build the kernels and allocate buffers.

    The template ``IB_kernels.cl`` located next to this module is rendered
    with ``cardinality_T=self.cardinality_T`` and built. The probability
    arrays ``p_x_given_y``, ``p_x_y`` and ``p_y`` are uploaded as float64,
    the work buffers are allocated, and each kernel is bound to an
    attribute with its scalar argument dtypes declared.

    Parameters
    ----------
    set_mem_pool_None : bool
        When true, ``self.mem_pool`` is ``None`` (default allocator);
        otherwise a ``MemoryPool`` over an ``ImmediateAllocator`` on the
        new queue is used for all buffers.
    """
    self.context = cl.create_some_context()
    print('### OPENCL Device #####')
    print(self.context.get_info(cl.context_info.DEVICES))

    module_dir = os.path.dirname(os.path.abspath(__file__))
    # Context manager closes the kernel file instead of leaking the handle.
    with open(os.path.join(module_dir, "IB_kernels.cl")) as kernel_file:
        kernelsource = kernel_file.read()
    rendered_tp = Template(kernelsource).render(
        cardinality_T=self.cardinality_T)
    self.program = cl.Program(self.context, str(rendered_tp)).build()

    self.queue = cl.CommandQueue(self.context)
    if set_mem_pool_None:
        self.mem_pool = None
    else:
        self.mem_pool = cl.tools.MemoryPool(
            cl.tools.ImmediateAllocator(self.queue))

    # Static inputs, uploaded as float64.
    self.p_x_given_y_buffer = cl_array.to_device(
        self.queue, self.p_x_given_y.astype(dtype=np.float64),
        allocator=self.mem_pool)
    self.p_x_and_y_buffer = cl_array.to_device(
        self.queue, self.p_x_y.astype(dtype=np.float64),
        allocator=self.mem_pool)
    self.p_y_buffer = cl_array.to_device(
        self.queue, self.p_y.astype(dtype=np.float64),
        allocator=self.mem_pool)

    # Work buffers.
    self.p_x_and_t_buffer = cl_array.empty(
        self.queue, (self.cardinality_T, self.cardinality_X),
        dtype=np.float64, allocator=self.mem_pool)
    self.p_t_buffer = cl_array.empty(
        self.queue, self.cardinality_T, dtype=np.float64,
        allocator=self.mem_pool)
    self.argmin_buffer = cl_array.empty(
        self.queue, self.cardinality_Y, dtype=np.int32,
        allocator=self.mem_pool)
    self.dkl_mat_buffer = cl_array.empty(
        self.queue, (self.cardinality_Y, self.cardinality_T),
        dtype=np.float64, allocator=self.mem_pool)
    self.start_vec_buffer = cl_array.empty(
        self.queue, self.cardinality_T, dtype=np.int32,
        allocator=self.mem_pool)

    # Kernel handles with their scalar argument types.
    self.dkl_compute_prog = self.program.compute_dkl_mat
    self.dkl_compute_prog.set_scalar_arg_dtypes(
        [np.int32, np.int32, np.int32, None, None, None])

    self.find_argmin_prog = self.program.find_argmin
    self.find_argmin_prog.set_scalar_arg_dtypes(
        [np.int32, np.int32, None, None])

    self.allow_move_prog = self.program.allow_move
    self.allow_move_prog.set_scalar_arg_dtypes(
        [np.int32, None, None, None])

    self.compute_p_x_and_t_parallel_prog = \
        self.program.compute_p_x_and_t_parallel
    self.compute_p_x_and_t_parallel_prog.set_scalar_arg_dtypes(
        [np.int32, np.int32, np.int32, None, None, None, None, None])

    self.compute_p_x_given_t_parallel_prog = \
        self.program.compute_p_x_given_t_parallel
    self.compute_p_x_given_t_parallel_prog.set_scalar_arg_dtypes(
        [np.int32, None, None])

    self.compute_p_t_parallel_prog = self.program.compute_p_t_parallel
    self.compute_p_t_parallel_prog.set_scalar_arg_dtypes(
        [np.int32, None, None])

    self.update_dist_prog = self.program.update_distributions
    self.update_dist_prog.set_scalar_arg_dtypes(
        [np.int32, np.int32, np.int32, None, None, None, None, None])
def empty(n, dtype, backend='cython'):
    """Allocate an uninitialised array on *backend* and wrap it.

    ``'opencl'`` allocates with ``pyopencl.array.empty`` on the queue from
    ``get_queue()``, ``'cuda'`` with ``pycuda.gpuarray.empty``; any other
    backend uses ``np.empty``. The result is passed through
    ``wrap_array(out, backend)``.
    """
    if backend == 'opencl':
        import pyopencl.array as cl_arrays
        from .opencl import get_queue
        return wrap_array(cl_arrays.empty(get_queue(), n, dtype), backend)

    if backend == 'cuda':
        import pycuda.gpuarray as cuda_arrays
        return wrap_array(cuda_arrays.empty(n, dtype), backend)

    return wrap_array(np.empty(n, dtype=dtype), backend)
def empty(n, dtype, backend='cython'):
    """Return an ``Array`` holding uninitialised storage on *backend*.

    For ``'opencl'`` and ``'cuda'`` a device array is allocated and
    attached to a fresh ``Array`` via ``set_dev_array``; every other
    backend yields ``Array(np.empty(n, dtype=dtype))``.
    """
    def _holding(device_buffer):
        # Wrap a device-side allocation in an otherwise empty Array.
        holder = Array()
        holder.set_dev_array(device_buffer)
        return holder

    if backend == 'opencl':
        import pyopencl.array as cl_arrays
        return _holding(cl_arrays.empty(get_queue(), n, dtype))

    if backend == 'cuda':
        import pycuda.gpuarray as cuda_arrays
        return _holding(cuda_arrays.empty(n, dtype))

    return Array(np.empty(n, dtype=dtype))
def init_OpenCL_decoding(self, msg_at_time_, context_=False):
    """Build the decoding kernels and create the device buffers.

    The template ``kernels_min_and_BP.cl`` located next to this module is
    rendered with ``cn_degree=self.d_c_max``, ``vn_degree=self.d_v_max``
    and ``msg_at_time=msg_at_time_`` and then built. Index tables are
    uploaded as int32, the inbox buffers are allocated as float64 and the
    syndrome buffer as int32; finally the kernels are bound to attributes.

    Parameters
    ----------
    msg_at_time_ : int
        Forwarded to the kernel template.
    context_ : optional
        Existing OpenCL context to reuse. When falsy (the default), a new
        one is created with ``cl.create_some_context()``.
    """
    if not context_:
        self.context = cl.create_some_context()
    else:
        self.context = context_
    print(self.context.get_info(cl.context_info.DEVICES))

    module_dir = os.path.dirname(os.path.abspath(__file__))
    kernel_path = os.path.join(module_dir, "kernels_min_and_BP.cl")
    # Context manager closes the kernel file instead of leaking the handle.
    with open(kernel_path) as kernel_file:
        kernelsource = kernel_file.read()
    rendered_tp = Template(kernelsource).render(
        cn_degree=self.d_c_max, vn_degree=self.d_v_max,
        msg_at_time=msg_at_time_)
    self.program = cl.Program(self.context, str(rendered_tp)).build()
    self.queue = cl.CommandQueue(self.context)

    # Index tables (int32).
    self.inbox_memory_start_varnodes_buffer = cl_array.to_device(
        self.queue, self.inbox_memory_start_varnodes.astype(np.int32))
    self.inbox_memory_start_checknodes_buffer = cl_array.to_device(
        self.queue, self.inbox_memory_start_checknodes.astype(np.int32))
    self.degree_varnode_nr_buffer = cl_array.to_device(
        self.queue, self.degree_varnode_nr.astype(np.int32))
    self.degree_checknode_nr_buffer = cl_array.to_device(
        self.queue, self.degree_checknode_nr.astype(np.int32))
    self.target_memorycells_varnodes_buffer = cl_array.to_device(
        self.queue, self.target_memory_cells_varnodes.astype(np.int32))
    self.target_memorycells_checknodes_buffer = cl_array.to_device(
        self.queue, self.target_memory_cells_checknodes.astype(np.int32))

    # Message inboxes and syndrome storage.
    self.checknode_inbox_buffer = cl_array.empty(
        self.queue, self.inbox_memory_checknodes.shape, dtype=np.float64)
    self.varnode_inbox_buffer = cl_array.empty(
        self.queue, self.inbox_memory_varnodes.shape, dtype=np.float64)
    self.syndrom_buffer = cl_array.empty(
        self.queue,
        (self.degree_checknode_nr.shape[0],
         self.inbox_memory_varnodes.shape[-1]),
        dtype=np.int32)

    self.krnl = get_sum_kernel(
        self.context, None, dtype_in=self.varnode_inbox_buffer.dtype)

    # define programs
    self.send_prog = self.program.send_channel_values_to_checknode_inbox
    self.varnode_update_prog = self.program.varnode_update
    self.checknode_update_prog = self.program.checknode_update
    self.calc_syndrom_prog = self.program.calc_syndrome
    self.varoutput_prog = self.program.calc_varnode_output
def __init__(self, sino_shape, slice_shape=None, axis_position=None,
             angles=None, ctx=None, devicetype="all", platformid=None,
             deviceid=None, profile=False):
    """Create the projector/backprojector pair and the device arrays.

    The OpenCL-related arguments are forwarded to
    ``OpenclProcessing.__init__``. A ``Backprojection`` is built from the
    sinogram geometry, and a ``Projection`` from the backprojector's
    ``slice_shape``, ``angles`` and ``num_bins``; both share ``self.ctx``.
    Four zero-filled float32 device arrays (``d_data``, ``d_sino``,
    ``d_x``, ``d_x_old``) are allocated and registered with
    ``add_to_cl_mem``.
    """
    OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                              platformid=platformid, deviceid=deviceid,
                              profile=profile)

    def zeroed(device_array):
        # Clear a freshly allocated device array and hand it back.
        device_array.fill(0.0)
        return device_array

    # Backprojector first: the projector is configured from it.
    backprojector = Backprojection(sino_shape, slice_shape=slice_shape,
                                   axis_position=axis_position,
                                   angles=angles, ctx=self.ctx,
                                   profile=profile)
    self.backprojector = backprojector
    self.projector = Projection(backprojector.slice_shape,
                                backprojector.angles,
                                axis_position=axis_position,
                                detector_width=backprojector.num_bins,
                                normalize=False, ctx=self.ctx,
                                profile=profile)
    self.sino_shape = sino_shape
    self.is_cpu = backprojector.is_cpu

    # Sinogram-shaped arrays.
    self.d_data = zeroed(
        parray.empty(self.queue, sino_shape, dtype=np.float32))
    self.d_sino = zeroed(parray.empty_like(self.d_data))
    # Slice-shaped arrays.
    self.d_x = zeroed(
        parray.empty(self.queue, backprojector.slice_shape,
                     dtype=np.float32))
    self.d_x_old = zeroed(parray.empty_like(self.d_x))

    self.add_to_cl_mem({
        "d_data": self.d_data,
        "d_sino": self.d_sino,
        "d_x": self.d_x,
        "d_x_old": self.d_x_old,
    })
def __init__(self, size, **kwargs):
    """
    Parameters
    ----------
    size : tuple of two int
        (height, width) of the map

    kwargs
    ------
    apple_num : int
        number of total apples in a map
    eat_apple : float
        reward given when apple is eaten.
    hit_wall : float
        punishment(or reward?) given when hit wall.
    """
    # This is the 'Engine', not the 'Viewer'.
    # Required keyword options.
    self._apple_num = kwargs['apple_num']
    self._rewards = {
        'eat_apple': kwargs['eat_apple'],
        'hit_wall': kwargs['hit_wall'],
    }

    # The engine size is expected to stay fixed while running.
    self._height, self._width = size[0], size[1]
    self._image = np.zeros((self.size[0], self.size[1], 3), dtype=np.uint8)
    self._TM = ThingsManager()

    # OpenCL setup: first device of the first platform.
    first_platform = cl.get_platforms()[0]
    self.device = first_platform.get_devices()[0]
    self.ctx = cl.Context([self.device])
    self.queue = cl.CommandQueue(self.ctx)

    # Host-side colors and their device copies.
    self.bg_color = np.array(colors.COLOR_BACKGROUND, dtype=np.uint8)
    self.wall_color = np.array(colors.COLOR_WALL, dtype=np.uint8)
    self.image_dev = cl_array.empty(self.queue, self.image.shape, np.uint8)
    self.bg_col_dev = cl_array.to_device(self.queue, self.bg_color)
    self.wall_col_dev = cl_array.to_device(self.queue, self.wall_color)

    self.fp_ray_dev = None
    self.delta_vec_dev = None
    self.observation_dev = cl_array.empty(
        self.queue, (2, ec.RayNum, 3), np.uint8)

    # Build the ray kernel shipped next to this module.
    cl_path = path.join(path.dirname(__file__), 'cl_scripts/ray.cl')
    with open(cl_path, 'r') as source_file:
        kernel_source = source_file.read()
    self.program = cl.Program(self.ctx, kernel_source).build()

    # Things must exist before the CollisionManager is created.
    self.initiate_things()
    self._CM = CollisionManager(self.size, self._TM)
def test_numpy_integer_shape(ctx_factory):
    """Check that ``cl_array.empty`` accepts numpy integers as shape.

    The test is skipped when ``list(np.int32(17))`` does not raise, i.e.
    when the installed numpy treats a scalar as iterable.
    """
    try:
        list(np.int32(17))
    except Exception:
        # Expected: a numpy scalar is not iterable. ``except Exception``
        # (rather than a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass
    else:
        from pytest import skip
        skip("numpy implementation does not handle scalar correctly.")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # Both a scalar shape and a tuple of numpy integers must be accepted.
    cl_array.empty(queue, np.int32(17), np.float32)
    cl_array.empty(queue, (np.int32(17), np.int32(17)), np.float32)
def test_numpy_integer_shape(ctx_factory):
    """Numpy integers must be usable as shapes for ``cl_array.empty``.

    Skipped when ``list(np.int32(17))`` succeeds instead of raising.
    """
    try:
        list(np.int32(17))
    except Exception:
        scalar_is_iterable = False
    else:
        scalar_is_iterable = True

    if scalar_is_iterable:
        from pytest import skip
        skip("numpy implementation does not handle scalar correctly.")

    queue = cl.CommandQueue(ctx_factory())
    seventeen = np.int32(17)

    # Scalar shape, then a 2-tuple of numpy integers.
    cl_array.empty(queue, seventeen, np.float32)
    cl_array.empty(queue, (seventeen, np.int32(17)), np.float32)
def _init_cl_arrays(self):
    """Create the device-side arrays on ``self.queue``.

    ``self.G`` is cast to ``self.complexdtype`` and uploaded both as-is
    (``cl_G``) and conjugated (``cl_G_conj``). Two zero-initialised work
    arrays of shape ``tuple(self.N12_pad)`` and two uninitialised field
    arrays of shapes ``tuple(self.N1)`` / ``tuple(self.N2)`` are allocated
    with the same dtype.
    """
    queue = self.queue
    ctype = self.complexdtype

    g_host = self.G.astype(ctype)
    self.cl_G = cla.to_device(queue, g_host)
    self.cl_G_conj = cla.to_device(queue, g_host.conj())

    self.cl_work = cla.zeros(queue, tuple(self.N12_pad), ctype)
    self.cl_workF = cla.zeros_like(self.cl_work)

    self.cl_field1 = cla.empty(queue, tuple(self.N1), ctype)
    self.cl_field2 = cla.empty(queue, tuple(self.N2), ctype)
def _prep_gpu(): """ Set up GPU calculation dependencies """ # try to import the necessary libraries fallback = False try: import gpu import string import pyopencl as cl import pyopencl.array as cla from pyfft.cl import Plan except ImportError: fallback = True # check gpu_info try: assert gpu.valid(gpu_info),\ "gpu_info in propagate_distances improperly specified" context, device, queue, platform = gpu_info except AssertionError: fallback = True if fallback: propagate_distances(data, distances, energy_or_wavelength, pixel_pitch, subregion=subregion, silent=silent, band_limit=band_limit, gpu_info=None, im_convert=im_convert) # if everything is OK, allocate memory and build kernels kp = string.join(gpu.__file__.split('/')[:-1], '/')+'/kernels/' build = _build_helper(context, device, kp) phase_multiply = build('propagate_phase_multiply.cl') copy_to_buffer = build('propagate_copy_to_save_buffer.cl') fftplan = Plan((N, N), queue=queue) # put the signals onto the gpu along with buffers for the # various operations rarray = cla.to_device(queue, r.astype(np.float32)) fourier = cla.to_device(queue, data.astype(np.complex64)) phase = cla.empty(queue, (N, N), np.complex64) back = cla.empty(queue, (N, N), np.complex64) store = cla.empty(queue, (nf, rows, cols), np.complex64) # precompute the fourier transform of data. fftplan.execute(fourier.data, wait_for_finish=True) return phase_multiply, copy_to_buffer, fftplan, rarray, fourier,\ phase, back, store, build
def build_scratch(self, imshape):
    """Allocate the scratch arrays for an image of shape *imshape*.

    ``self.array_indices`` becomes an int32 ``arange`` over all elements.
    Starting from the element count, the length is repeatedly divided by
    ``self.runlen``; for every level one float32 array is appended to
    ``self.scratch`` and one int32 array to ``self.index_scratch``.
    ``self.imshape`` records the shape the arrays were built for.
    """
    self.scratch = []
    self.index_scratch = []

    length = int(np.prod(imshape))
    self.array_indices = cla.arange(self.queue, 0, length, 1,
                                    dtype=np.int32)

    # NOTE(review): adding the remainder does not generally round up to a
    # multiple of runlen (that would be ``runlen - remainder``); left
    # unchanged because the consumers of these arrays are not visible here.
    remainder = length % self.runlen
    if remainder != 0:
        length += remainder

    while length > 1:
        # Floor division keeps the length an integer on Python 3 as well;
        # true division would produce a float, which is not a valid shape.
        length //= self.runlen
        self.scratch.append(cla.empty(self.queue, (length, ), np.float32))
        self.index_scratch.append(
            cla.empty(self.queue, (length, ), np.int32))

    self.imshape = imshape
def build_scratch(self, imshape):
    """Create the per-level scratch buffers for images of shape *imshape*.

    An int32 index ``arange`` over all elements is stored in
    ``self.array_indices``. The element count is then divided by
    ``self.runlen`` level by level, and each level gets a float32 entry in
    ``self.scratch`` and an int32 entry in ``self.index_scratch``.
    *imshape* is remembered in ``self.imshape``.
    """
    self.scratch = []
    self.index_scratch = []

    n_items = int(np.prod(imshape))
    self.array_indices = cla.arange(self.queue, 0, n_items, 1,
                                    dtype=np.int32)

    # NOTE(review): ``n_items += n_items % runlen`` does not pad to a
    # multiple of runlen in general; kept as-is since the code that
    # consumes these buffers is not visible here.
    if n_items % self.runlen != 0:
        n_items += n_items % self.runlen

    while n_items > 1:
        # Integer division: on Python 3 ``/=`` would turn the length into
        # a float, which cannot be used as an array shape.
        n_items //= self.runlen
        self.scratch.append(cla.empty(self.queue, (n_items,), np.float32))
        self.index_scratch.append(
            cla.empty(self.queue, (n_items,), np.int32))

    self.imshape = imshape
def _dev_array(self):
    """Return this object's device array, allocating it on first access.

    The array is created with ``array.empty`` on the module-level
    ``_queue``, passing ``self.sparsity.nz`` and ``self.dtype``, and is
    stored under the literal attribute name ``'__dev_array'`` (set through
    ``setattr``, so no name mangling applies).
    """
    slot = '__dev_array'
    if hasattr(self, slot):
        return getattr(self, slot)

    setattr(self, slot,
            array.empty(_queue, self.sparsity.nz, self.dtype))
    return getattr(self, slot)
def __init__(self, ctx, queue, par, kwidth=3, overgridfactor=2,
             fft_dim=(1, 2), klength=200, DTYPE=np.complex64,
             DTYPE_real=np.float32):
    """Set up the PyOpenCL NUFFT.

    Parameters
    ----------
    ctx, queue
        OpenCL context and command queue used for all buffers.
    par : dict
        Reads ``"NScan"``, ``"NC"``, ``"NSlice"``, ``"N"``, ``"traj"``,
        ``"dcf"`` and ``"Nproj"``.
    kwidth : int
        Kernel width passed to ``calckbkernel``; ``self.kwidth`` stores
        half of it.
    overgridfactor : int
        Passed to ``calckbkernel`` and stored.
    fft_dim : tuple of int
        Axes handed to the ``FFT`` plan; ``fft_dim[0]`` also selects the
        dimensions entering ``fft_scale``.
    klength : int
        Passed to ``calckbkernel``.
    DTYPE, DTYPE_real
        Complex and real dtypes for the device arrays.
    """
    print("Setting up PyOpenCL NUFFT.")
    self.DTYPE = DTYPE
    self.DTYPE_real = DTYPE_real
    self.fft_shape = (par["NScan"] * par["NC"] * par["NSlice"],
                      par["N"], par["N"])
    self.traj = par["traj"]
    self.dcf = par["dcf"]
    self.Nproj = par["Nproj"]
    self.ctx = ctx
    self.queue = queue
    self.overgridfactor = overgridfactor

    self.kerneltable, self.kerneltable_FT, self.u = calckbkernel(
        kwidth, overgridfactor, par["N"], klength)
    self.kernelpoints = self.kerneltable.size
    self.fft_scale = DTYPE_real(
        np.sqrt(np.prod(self.fft_shape[fft_dim[0]:])))
    self.deapo = 1 / self.kerneltable_FT.astype(DTYPE_real)
    self.kwidth = kwidth / 2

    read_only = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    self.cl_kerneltable = cl.Buffer(
        self.ctx, read_only,
        hostbuf=self.kerneltable.astype(DTYPE_real).data)
    self.deapo_cl = cl.Buffer(
        self.ctx, read_only, hostbuf=self.deapo.data)

    # From here on dcf and traj refer to the device copies.
    self.dcf = clarray.to_device(self.queue, self.dcf)
    self.traj = clarray.to_device(self.queue, self.traj)
    self.tmp_fft_array = clarray.empty(
        self.queue, self.fft_shape, dtype=DTYPE)

    # Alternating +1/-1 vector of length N.
    self.check = np.ones(par["N"], dtype=DTYPE_real)
    self.check[1::2] = -1
    self.check = clarray.to_device(self.queue, self.check)

    # The FFT plan operates in place on the first par_fft entries of the
    # temporary array.
    self.par_fft = int(self.fft_shape[0] / par["NScan"])
    fft_view = self.tmp_fft_array[0:self.par_fft, ...]
    self.fft = FFT(ctx, queue, fft_view,
                   out_array=self.tmp_fft_array[0:self.par_fft, ...],
                   axes=fft_dim)

    self.gridsize = par["N"]
    self.fwd_NUFFT = self.NUFFT
    self.adj_NUFFT = self.NUFFTH

    # Read the kernel source through a context manager so the file handle
    # is closed deterministically instead of being leaked.
    kernel_path = resource_filename(
        'rrsg_cgreco', 'kernels/opencl_nufft_kernels.c')
    with open(kernel_path) as kernel_file:
        kernel_source = kernel_file.read()
    self.prg = Program(self.ctx, kernel_source)
def get_array(data, queue=None):
    """Return a pyopencl.array.Array for *data*.

    *data* may be a pyopencl.array.Array (returned unchanged), a numpy
    array (converted to the configured complex or float precision and
    uploaded) or a pyopencl.Image (copied into a float32 device array and
    converted if the configured float size differs). *queue* is an OpenCL
    command queue; when falsy, ``cfg.OPENCL.queue`` is used.

    Raises TypeError for any other type of *data*.
    """
    if not queue:
        queue = cfg.OPENCL.queue

    if isinstance(data, cl_array.Array):
        return data

    if isinstance(data, np.ndarray):
        if data.dtype.kind == 'c':
            mismatch = data.dtype.itemsize != cfg.PRECISION.cl_cplx
            target = cfg.PRECISION.np_cplx
        else:
            mismatch = (data.dtype.kind != 'f'
                        or data.dtype.itemsize != cfg.PRECISION.cl_float)
            target = cfg.PRECISION.np_float
        if mismatch:
            data = data.astype(target)
        return cl_array.to_device(queue, data.astype(target))

    if isinstance(data, cl.Image):
        # The device array uses the image shape in reversed order.
        converted = cl_array.empty(queue, data.shape[::-1], np.float32)
        cl.enqueue_copy(queue, converted.data, data, offset=0,
                        origin=(0, 0), region=converted.shape[::-1])
        if converted.dtype.itemsize != cfg.PRECISION.cl_float:
            converted = converted.astype(cfg.PRECISION.np_float)
        return converted

    raise TypeError('Unsupported data type {}'.format(type(data)))
def axis_convolve(X, h, axis=0, queue=None, output=None):
    """Filter along an axis of *X* using filter vector *h*.

    If *h* has odd length, each output sample is aligned with each input
    sample and the output has the same size as *X*. If *h* has even
    length, each output sample is aligned with the mid point of each pair
    of input samples, and the output's shape is increased by one along the
    convolution axis.

    The :py:class:`pyopencl.array.Array` instance holding the device-side
    output is returned. It may be accessed on the host via
    :py:func:`to_array`.

    *axis* selects the axis of convolution; the default is column-wise.

    If *queue* is non-``None``, it should be a
    :py:class:`pyopencl.CommandQueue` instance which is used to perform
    the computation. If ``None``, a default global queue is used.

    If *output* is non-``None``, it should be a
    :py:class:`pyopencl.array.Array` instance which the result is written
    into. If ``None``, a float32 output array is created.
    """
    _check_cl()
    queue = to_queue(queue)
    kern = _convolve_kernel_for_queue(queue.context)

    if output is not None:
        return _apply_kernel(X, h, kern, output, axis=axis)

    # No output supplied: allocate one, one sample longer along *axis*
    # for even-length filters.
    dims = list(X.shape)
    even_length = h.shape[0] % 2 == 0
    if even_length:
        dims[axis] += 1
    output = cl_array.empty(queue, dims, np.float32)
    return _apply_kernel(X, h, kern, output, axis=axis)
def _evaluate(self, valuation, cache):
    """Evaluate this convolution node, memoising the result in *cache*.

    The three operands are evaluated as ``X``, ``W`` and ``b``. ``X`` is
    expanded with ``conv.im2col`` (the result is kept in ``self.col``),
    every sample is multiplied by the reshaped weights, and ``b`` is added
    with ``conv.bcast_add`` when it is not ``None``.

    Parameters
    ----------
    valuation
        Forwarded unchanged to the operands' ``_evaluate``.
    cache : dict
        Maps ``id(node)`` to already computed results.

    Returns
    -------
    The cached or newly computed output of shape
    ``(n, out_c, out_h, out_w)``.
    """
    key = id(self)
    if key not in cache:
        # The queue is only needed on a cache miss.
        q = pl.qs[0]
        X = self.ops[0]._evaluate(valuation, cache)
        W = self.ops[1]._evaluate(valuation, cache)
        b = self.ops[2]._evaluate(valuation, cache)

        out_c, _, kh, kw = W.shape
        n, c, h, w = X.shape
        out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                      cover_all=self.cover_all)
        out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                      cover_all=self.cover_all)

        y = clarray.empty(q, (n, out_c, out_h, out_w), dtype=X.dtype)
        self.col, ev1 = conv.im2col(q, X, kh, kw, self.sy, self.sx,
                                    self.ph, self.pw, self.cover_all)
        W_mat = W.reshape(out_c, -1)
        ev1.wait()  # TODO asynchronize
        col_mats = self.col.reshape(n, -1, out_h * out_w)
        y_mats = y.reshape(n, out_c, -1)

        # ``range`` instead of the Python-2-only ``xrange``; it behaves
        # the same for this loop on both major versions.
        for i in range(n):
            y_mats[i] = linalg.dot(q, W_mat, col_mats[i])

        if b is not None:
            # y += b[:, None, None]
            _, ev3 = conv.bcast_add(q, y, b, y)
            ev3.wait()  # TODO asynchronize

        cache[key] = y
    return cache[key]
def get_fluid_source(params, G, P, D, out=None):
    """Calculate a small fluid source term, added to conserved variables
    for stability.

    The stress tensor is assembled component by component with
    ``Tmhd_vec`` and contracted with the connection coefficients (scaled
    by ``G.gdet`` at ``Loci.CENT``) for indices 1, 2 and 3. The scaled
    connection arrays are uploaded once and cached in the module-level
    ``gcon1_d``, ``gcon2_d`` and ``gcon3_d``.

    Parameters
    ----------
    params : dict
        Must contain ``'queue'``; if ``'profile'`` is present and truthy,
        the last contraction event is waited on before returning.
    G
        Grid object providing ``slices``, ``shapes``, ``conn``, ``gdet``
        and ``dot2D2geom``.
    P, D
        Passed to ``Tmhd_vec``; ``P`` also serves as template for ``out``.
    out : optional
        Destination array; allocated with ``cl_array.empty_like(P)`` when
        omitted. Only the ``U1``, ``U2`` and ``U3`` slices are written
        here, so with a fresh allocation the other entries stay
        uninitialised.

    Returns
    -------
    ``out``
    """
    s = G.slices
    sh = G.shapes

    # T the old fashioned way: TODO Tmhd_full...
    T = cl_array.empty(params['queue'], sh.grid_tensor, dtype=np.float64)
    for mu in range(4):
        Tmhd_vec(params, G, P, D, mu, out=T[mu])

    if out is None:
        out = cl_array.empty_like(P)

    global gcon1_d, gcon2_d, gcon3_d
    if gcon1_d is None:
        gdet_cent = G.gdet[Loci.CENT.value]
        gcon1_d = cl_array.to_device(
            params['queue'], (G.conn[:, :, 1, :, :] * gdet_cent).copy())
        gcon2_d = cl_array.to_device(
            params['queue'], (G.conn[:, :, 2, :, :] * gdet_cent).copy())
        gcon3_d = cl_array.to_device(
            params['queue'], (G.conn[:, :, 3, :, :] * gdet_cent).copy())

    # Contract mhd stress tensor with connection
    evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon1_d, out=out[s.U1])
    evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon2_d, out=out[s.U2])
    # Fixed: the index-3 contraction used to be written to out[s.U2] as
    # well, overwriting the index-2 result and never filling U3.
    evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon3_d, out=out[s.U3])

    if 'profile' in params and params['profile']:
        evt.wait()

    return out
def __init_particle(self):
    """Allocate the particle arrays and run ``rt_init_particles``.

    ``x_gpu`` is allocated uninitialised, ``v_gpu`` is drawn from a
    Philox normal generator (mu=0, sigma=1) and ``t_gpu`` is zeroed. The
    kernel is launched with one work-item per particle on the position
    and velocity buffers, and the call blocks until it has finished.
    """
    print("Info- init particles")

    queue = self.ocl_queue
    n_components = self.dim * self.np
    rng = PhiloxGenerator(self.ocl_ctx)

    # Positions: raw allocation only; the buffer is handed to the init
    # kernel below. (An earlier variant drew them on the host side with
    # gen.normal(mu=0.5, sigma=0.05).)
    self.x_gpu = cl_array.empty(queue, n_components, dtype=self.dtype)

    # Velocities
    self.v_gpu = rng.normal(queue, n_components, self.dtype,
                            mu=0, sigma=1)

    # Times
    self.t_gpu = cl_array.zeros(queue, self.np, dtype=self.dtype)

    init_done = self.ocl_prg.rt_init_particles(
        queue, (self.np, ), None,
        self.x_gpu.data, self.v_gpu.data,
    )
    init_done.wait()
def _calcFwdGNPartLinear(self, x):
    """Return ``(b, grad, sym_grad)`` as host arrays for the current step.

    ``b`` is ``self._step_val`` unless ``self._imagespace is False``, in
    which case it is the result of ``self._FT.FFT`` applied to
    ``self._step_val[:, None, ...] * self.par["C"]``. ``grad`` is the
    output of ``self._grad_op.fwd`` for *x*. ``sym_grad`` is the output of
    ``self._symgrad_op.fwd`` for ``self._v`` when ``self._reg_type`` is
    ``'TGV'``, otherwise ``None``.
    """
    queue = self._queue[0]

    if self._imagespace is not False:
        b = self._step_val
    else:
        b_dev = clarray.empty(queue, self._data_shape, dtype=self._DTYPE)
        weighted = clarray.to_device(
            queue, (self._step_val[:, None, ...] * self.par["C"]))
        self._FT.FFT(b_dev, weighted).wait()
        b = b_dev.get()

    # Gradient of x, computed on the device and fetched back.
    x_dev = clarray.to_device(queue, np.require(x, requirements="C"))
    grad_dev = clarray.to_device(
        queue, np.zeros(x_dev.shape + (4, ), dtype=self._DTYPE))
    grad_dev.add_event(
        self._grad_op.fwd(grad_dev, x_dev,
                          wait_for=grad_dev.events + x_dev.events))
    x_host = x_dev.get()
    grad = grad_dev.get()

    sym_grad = None
    if self._reg_type == 'TGV':
        v_dev = clarray.to_device(queue, self._v)
        sym_dev = clarray.to_device(
            queue, np.zeros(x_host.shape + (8, ), dtype=self._DTYPE))
        sym_dev.add_event(
            self._symgrad_op.fwd(sym_dev, v_dev,
                                 wait_for=sym_dev.events + v_dev.events))
        sym_grad = sym_dev.get()

    return b, grad, sym_grad
def nd_arange(shape, axis=0, start=0, step=1, clq=None):
    """Fill an ND-array along one axis with a stepped range.

    nd_arange((Z, Y, X), axis=2, start=A, step=B) is functionally
    equivalent to:

        np.arange(A, A+X*B, B)[None,None,:] * np.ones((Z, Y, X), np.float32)

    but does the work on the OpenCL device and without relying on
    array-broadcasting which is not supported in PyOpenCL.

    When *clq* is given, the device array is returned. Otherwise a queue
    is created on the module-level ``ctx`` and the result is mapped to
    the host before being returned.
    """
    assert 0 <= axis
    assert axis < len(shape)

    caller_owns_queue = clq is not None
    if not caller_owns_queue:
        clq = cl.CommandQueue(ctx)

    out_dev = cl_array.empty(clq, shape, float32)
    nd_arange_dev(clq, out_dev, axis, start, step)

    if caller_owns_queue:
        return out_dev

    host_view = out_dev.map_to_host()
    clq.finish()
    return host_view
def __init__(self, queue, array, shape=None, dtype=None,
             orginal_cpu_readonly=False, force_release_gpu=False):
    """Wrap an existing host/device array or allocate a new device array.

    Either *array* or both *shape* and *dtype* must be given. A device
    array (``cl_array.Array``) is used as-is; any other array is kept as
    the host copy and uploaded with ``cl_array.to_device``. Without
    *array*, an uninitialised device array of *shape*/*dtype* is created.

    Raises ValueError when *array* is given together with a *shape* or
    *dtype* it does not match.
    """
    # We need to have an array, or the ability to create an array
    assert (array is not None) or (shape is not None and dtype is not None)
    self._queue = queue

    if array is None:
        # Nothing supplied: start with a fresh device-side allocation.
        self._gpu_array = cl_array.empty(queue, shape, dtype=dtype)
        self._cpu_array = None
        self._created_orignal = True
        self._orginaly_on_gpu = True
    else:
        self._array = array
        self._created_orignal = False
        on_device = isinstance(array, cl_array.Array)
        self._orginaly_on_gpu = on_device
        if on_device:
            self._gpu_array = array
            self._cpu_array = None
        else:
            self._cpu_array = array
            self._gpu_array = cl_array.to_device(queue, self._cpu_array)

        shape_conflict = shape is not None and array.shape != shape
        if shape_conflict:
            raise ValueError("Array is not in correct shape")
        dtype_conflict = dtype is not None and array.dtype != dtype
        if dtype_conflict:
            raise ValueError("Array has wrong data type")

    self._cpu_readonly = orginal_cpu_readonly
    self._force_release_gpu = force_release_gpu
def test_index_preservation(ctx_factory):
    """Scan kernels must hand each output statement its own index ``i``.

    For every size in ``scan_test_counts`` a scan with ``input_expr="i"``
    and ``scan_expr="b"`` writes ``item`` to ``out[i]``; the result has to
    equal ``np.arange(n)``. ``GenericDebugScanKernel`` is also exercised
    when the first device is a CPU. Requires ``mako``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    kernel_classes = [GenericScanKernel]
    first_device = context.devices[0]
    if first_device.type & cl.device_type.CPU:
        kernel_classes.append(GenericDebugScanKernel)

    scan_options = dict(
        arguments="__global int *out",
        input_expr="i",
        scan_expr="b",
        neutral="0",
        output_statement=""" out[i] = item; """,
    )

    for kernel_cls in kernel_classes:
        for count in scan_test_counts:
            scan = kernel_cls(context, np.int32, **scan_options)

            result = cl_array.empty(queue, count, dtype=np.int32)
            scan(result)
            assert (result.get() == np.arange(count)).all()

            from gc import collect
            collect()
def allocate_arrays(self):
    """
    Allocate the host and device arrays used by the tests
    """
    image = self.image
    image_shape = image.shape
    queue = self.la.queue

    # Host-side (numpy) arrays and references
    self.grad = np.zeros(image_shape, dtype=np.complex64)
    self.grad2 = np.zeros((2, ) + image_shape, dtype=np.float32)
    self.grad_ref = gradient(image)
    self.div_ref = divergence(self.grad_ref)
    self.image2 = np.zeros_like(image)

    # Device gradient, zeroed through the parray "fill" utility.
    # A plain cl.Buffer(self.la.ctx, cl.mem_flags.READ_WRITE,
    # size=self.image.nbytes*2) would be the direct choice, but platforms
    # not supporting OpenCL 1.2 have a problem with enqueue_fill_buffer.
    zeroed_gradient = parray.empty(queue, image_shape, np.complex64)
    self.gradient_parray = zeroed_gradient
    zeroed_gradient.fill(0)
    self.gradient_buffer = zeroed_gradient.data

    # Device copy of the image
    self.image_parray = parray.to_device(queue, image)
    self.image_buffer = self.image_parray.data

    # Reference gradient packed as complex: component 0 -> real part,
    # component 1 -> imaginary part.
    packed_ref = np.zeros(image_shape, dtype=np.complex64)
    packed_ref.real = np.copy(self.grad_ref[0])
    packed_ref.imag = np.copy(self.grad_ref[1])
    self.grad_ref_parray = parray.to_device(queue, packed_ref)
    self.grad_ref_buffer = self.grad_ref_parray.data
def nFTH(x, fft, par):
    """Apply ``fft.FFTH`` to every ``x[j, k]`` and collect the results.

    The output is a host array of shape ``(par["NScan"], par["NC"],
    par["NSlice"], par["dimY"], par["dimX"])`` with dtype
    ``par["DTYPE"]``. Each ``x[j, k]`` is uploaded with two leading
    singleton axes, transformed with ``scan_offset=j`` into a reusable
    device buffer and copied back. The elapsed time is printed.
    """
    dims = np.shape(x)
    volume_shape = (par["NSlice"], par["dimY"], par["dimX"])

    result = np.zeros((par["NScan"], par["NC"]) + volume_shape,
                      dtype=par["DTYPE"])
    # One device buffer is reused for every (scan, coil) pair.
    device_out = clarray.empty(fft.queue, (1, 1) + volume_shape,
                               dtype=par["DTYPE"])

    started = time.time()
    for scan in range(dims[0]):
        for coil in range(dims[1]):
            chunk = np.require(x[scan, coil, ...][None, None, ...],
                               requirements="C")
            device_in = clarray.to_device(fft.queue, chunk)
            fft.FFTH(device_out, device_in, scan_offset=scan).wait()
            result[scan, coil, ...] = np.squeeze(device_out.get())
    elapsed = time.time() - started

    print("FT took %f s" % elapsed)
    return result
def test_index_preservation(ctx_factory):
    """Check that scan kernels write ``item`` to the matching output index.

    A kernel with ``input_expr="i"`` and ``scan_expr="b"`` is built for each
    kernel class and each size in ``scan_test_counts``; its output must
    equal ``np.arange(n)``.  Skipped when ``mako`` cannot be imported.

    :param ctx_factory: callable returning the OpenCL context to test on.
    """
    from gc import collect
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.scan import GenericScanKernel, GenericDebugScanKernel

    kernel_classes = [GenericScanKernel]
    first_device = context.devices[0]
    # Bit-field test: the debug kernel is only exercised on CPU devices.
    if first_device.type & cl.device_type.CPU:
        kernel_classes.append(GenericDebugScanKernel)

    kernel_options = dict(
        arguments="__global int *out",
        input_expr="i",
        scan_expr="b",
        neutral="0",
        output_statement="""
                        out[i] = item;
                        """,
    )

    for kernel_cls in kernel_classes:
        for count in scan_test_counts:
            scan = kernel_cls(context, np.int32, **kernel_options)

            result = cl_array.empty(queue, count, dtype=np.int32)
            scan(result)

            assert (result.get() == np.arange(count)).all()

            collect()
def __init__(self, idata): # idata: an array of lowercase characters. # Get platform and device (complete) NAME = 'NVIDIA CUDA' platforms = cl.get_platforms() devs = None for platform in platforms: if platform.name == NAME: devs = platform.get_devices() # TODO: # Set up a command queue (complete) self.ctx = cl.Context(devs) self.queue = cl.CommandQueue(self.ctx) # host variables (incomplete) # N = 16 #get rid of N #deprecate self.a = idata #a is a bunch of letters #self.b = np.random.rand(N).astype(np.float32) #deprecate # device memory allocation (incomplete) self.a_gpu = cl_array.to_device(self.queue, self.a) # self.b_gpu = cl_array.to_device(self.queue, self.b) #deprecate self.c_gpu = cl_array.empty(self.queue, self.a.shape, self.a.dtype) # kernel code (incomplete) self.kernel = """
def __init__(self, queue, array, shape=None, dtype=None,
             orginal_cpu_readonly=False, force_release_gpu=False):
    """Wrap an existing array or allocate a fresh device array.

    :param queue: command queue passed to ``cl_array.to_device`` /
        ``cl_array.empty``.
    :param array: existing array (a ``cl_array.Array`` or a host array),
        or ``None`` to allocate a new device array.
    :param shape: expected shape of ``array``; the shape to allocate when
        ``array`` is ``None``.
    :param dtype: expected dtype of ``array``; the dtype to allocate when
        ``array`` is ``None``.
    :param orginal_cpu_readonly: stored as ``_cpu_readonly``.
    :param force_release_gpu: stored as ``_force_release_gpu``.
    :raises ValueError: if ``array`` does not match a given ``shape`` or
        ``dtype``.
    """
    # We need to have an array, or the ability to create an array
    assert (array is not None) or (shape is not None and dtype is not None)

    self._queue = queue

    if array is None:
        self._gpu_array = cl_array.empty(queue, shape, dtype=dtype)
        self._cpu_array = None
        self._created_orignal = True
        self._orginaly_on_gpu = True
    else:
        # Validate first: a mismatching host array must not be uploaded
        # to the device only to be rejected afterwards.
        if shape is not None and array.shape != shape:
            raise ValueError("Array is not in correct shape")
        if dtype is not None and array.dtype != dtype:
            raise ValueError("Array has wrong data type")

        self._array = array
        self._created_orignal = False
        self._orginaly_on_gpu = isinstance(array, cl_array.Array)

        if self._orginaly_on_gpu:
            self._gpu_array = array
            self._cpu_array = None
        else:
            self._cpu_array = array
            self._gpu_array = cl_array.to_device(queue, self._cpu_array)

    self._cpu_readonly = orginal_cpu_readonly
    self._force_release_gpu = force_release_gpu
def _init_cl_arrays(self):
    """Allocate the float32 device array for the far-field intensity.

    The array has the shape of ``self.far_field`` and is created on
    ``self.cl_queue`` with ``self.cl_allocator``.
    """
    intensity_shape = self.far_field.shape
    self.cl_farfield_intensity = cla.empty(
        self.cl_queue,
        shape=intensity_shape,
        dtype=np.float32,
        allocator=self.cl_allocator,
    )
def uniform(self, *args, **kwargs):
    """Create an empty array and fill it through :meth:`fill_uniform`.

    The keyword arguments ``a`` (default 0) and ``b`` (default 1) are
    forwarded to :meth:`fill_uniform`; all remaining arguments go to
    ``cl_array.empty``.
    """
    lower = kwargs.pop("a", 0)
    upper = kwargs.pop("b", 1)

    filled = cl_array.empty(*args, **kwargs)
    self.fill_uniform(filled, queue=filled.queue, a=lower, b=upper)
    return filled
def _allocate_device(self):
    """Allocate the device array if none has been allocated yet.

    Does nothing unless ``self.state`` is
    ``DeviceDataMixin.DEVICE_UNALLOCATED``.  The device array takes the
    shape of ``self._data`` (of its transpose when ``self.soa`` is set);
    afterwards the state becomes ``DeviceDataMixin.HOST``.
    """
    if self.state is not DeviceDataMixin.DEVICE_UNALLOCATED:
        return

    layout = self._data.T if self.soa else self._data
    self._device_data = array.empty(_queue, shape=layout.shape,
                                    dtype=self.dtype)
    self.state = DeviceDataMixin.HOST
def normal(self, *args, **kwargs):
    """Create an empty array and fill it through :meth:`fill_normal`.

    The keyword arguments ``mu`` (default 0) and ``sigma`` (default 1) are
    forwarded to :meth:`fill_normal`; all remaining arguments go to
    ``cl_array.empty``.
    """
    mean = kwargs.pop("mu", 0)
    spread = kwargs.pop("sigma", 1)

    filled = cl_array.empty(*args, **kwargs)
    self.fill_normal(filled, queue=filled.queue, mu=mean, sigma=spread)
    return filled
def __init__(self, queue, num_work_items, luxury=None, seed=None,
             no_warmup=False, use_legacy_init=False, max_work_items=None):
    """Build and run the RANLUXCL initialization kernel.

    :param queue: :class:`pyopencl.CommandQueue` used for initialization.
    :param num_work_items: number of generator states to initialize.
    :param luxury: stored as ``self.luxury``; defaults to 4.
    :param seed: seed passed to the kernel as ``uint32``.  When ``None`` it
        is derived from the current time, reduced modulo ``2 << 30``.
    :param no_warmup: stored as ``self.no_warmup``.
    :param use_legacy_init: stored as ``self.use_legacy_init``.
    :param max_work_items: stored as ``self.max_work_items``.
    """
    if luxury is None:
        luxury = 4

    if seed is None:
        from time import time
        # "%" binds tighter than "<<": without the parentheses the value
        # would be (t % 2) << 30, i.e. only 0 or 2**30.
        seed = int(time()*1e6) % (2 << 30)

    self.context = queue.context
    self.luxury = luxury
    self.num_work_items = num_work_items

    from pyopencl.characterize import has_double_support
    self.support_double = has_double_support(queue.device)

    self.no_warmup = no_warmup
    self.use_legacy_init = use_legacy_init
    self.max_work_items = max_work_items

    src = """
        %(defines)s

        #include <pyopencl-ranluxcl.cl>

        kernel void init_ranlux(unsigned seeds,
            global ranluxcl_state_t *ranluxcltab)
        {
          if (get_global_id(0) < %(num_work_items)d)
            ranluxcl_initialization(seeds, ranluxcltab);
        }
        """ % {
            "defines": self.generate_settings_defines(),
            "num_work_items": num_work_items
        }
    prg = cl.Program(queue.context, src).build()

    # {{{ compute work group size

    wg_size = None

    import sys
    import platform
    # The device type is a bit field, so test it with a mask rather than
    # with equality.
    if ("darwin" in sys.platform
            and "Apple" in queue.device.platform.vendor
            and platform.mac_ver()[0].startswith("10.7")
            and queue.device.type & cl.device_type.CPU):
        wg_size = (1,)

    self.wg_size = wg_size

    # }}}

    # 112 bytes of state per work item, pre-filled before initialization.
    self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
    self.state.fill(17)

    prg.init_ranlux(queue, (num_work_items,), self.wg_size,
                    np.uint32(seed), self.state.data)
def multi_dot(a_gpu, c):
    """Run the ``multi_dot`` kernel of ``prg3`` and return the host result.

    :param a_gpu: device array; its first two axes give the output shape
        and its last axis length is passed to the kernel.
    :param c: host array, uploaded as float32.
    :returns: float32 host array of shape ``a_gpu.shape[:2]``.
    """
    coeffs_dev = cl_array.to_device(queue, c.astype(np.float32))

    n_rows, n_cols = a_gpu.shape[0], a_gpu.shape[1]
    result_dev = cl_array.empty(queue, shape=(n_rows, n_cols),
                                dtype=np.float32)

    event = prg3.multi_dot(
        queue, result_dev.shape, (128, 1),
        a_gpu.data, coeffs_dev.data, result_dev.data,
        np.uint32(a_gpu.shape[-1]), np.uint32(30),
    )
    event.wait()

    return result_dev.get()
def _allocate(self, size, dtype, name=None):
    """Create a new array on whichever path (GPU or CPU) is active.

    :param size: shape of the array.
    :param dtype: element type.
    :param name: handed to ``arrayWrapper`` on the GPU path; unused on the
        CPU path.
    :returns: a wrapped, uninitialized device array when ``self.use_gpu``
        is set, otherwise a zero-filled numpy array.
    """
    if not self.use_gpu:
        return np.zeros(size, dtype)

    import pyopencl.array as cla
    device_array = cla.empty(self.queue, size, dtype)
    return arrayWrapper(device_array, name)
def q2c(X1, X2, X3, queue=None, output=None):
    """Run the q2c kernel on three equally shaped inputs.

    :param X1, X2, X3: input arrays; all three must share one shape.
    :param queue: passed through ``to_queue`` to obtain the command queue.
    :param output: optional device array receiving the result.  When
        omitted a complex64 array is created whose first two extents are
        half those of the input and whose third extent is 6.
    :returns: the output device array.
    :raises ValueError: if the input shapes differ.
    """
    _check_cl()
    queue = to_queue(queue)
    kern = _q2c_kernel_for_queue(queue.context)

    if not (X1.shape == X2.shape and X2.shape == X3.shape):
        raise ValueError('All three X matrices must have the same shape.')

    def _pack4(values, fill):
        # Pad with ``fill`` (or truncate) to exactly four C ints.
        return struct.pack('iiii', *(tuple(values) + (fill,) * 4)[:4])

    def _element_strides(arr):
        # Strides in units of elements instead of bytes, padded with 0.
        itemsize = arr.dtype.itemsize
        return _pack4((s // itemsize for s in arr.strides), 0)

    # Create the output when the caller did not supply one.
    if output is None:
        leading = X1.shape[:2]
        dims = [1, 1, 1]
        dims[:len(leading)] = leading
        dims[0] >>= 1
        dims[1] >>= 1
        dims[2] = 6
        output = cl_array.empty(queue, tuple(dims), np.complex64)

    # Move the inputs to the device if necessary.
    X1_device = to_device(X1, queue)
    X2_device = to_device(X2, queue)
    X3_device = to_device(X3, queue)

    # One work item per output element along at most three axes.
    work_shape = np.array(output.shape[:3])

    # Square work groups when the first two extents are both > 1,
    # otherwise a single row of maximal length.
    max_group = queue.device.max_work_group_size
    if work_shape.shape[0] >= 2 and np.all(work_shape[:2] > 1):
        side = int(np.floor(np.sqrt(max_group)))
        local_shape = (side, side, 1, 1)
    else:
        local_shape = (max_group, 1, 1)
    local_shape = local_shape[:len(work_shape)]

    # Round every global extent up to a multiple of the local extent.
    global_shape = [
        int(np.ceil(extent / float(group)) * group)
        for extent, group in zip(work_shape, local_shape)
    ]

    X_shape = _pack4(X1_device.shape, 1)
    Y_shape = _pack4(output.shape, 1)

    kernel_args = (
        X_shape,
        X1_device.base_data, _element_strides(X1_device),
        np.int32(X1_device.offset),
        X2_device.base_data, _element_strides(X2_device),
        np.int32(X2_device.offset),
        X3_device.base_data, _element_strides(X3_device),
        np.int32(X3_device.offset),
        output.base_data, _element_strides(output), Y_shape,
        np.int32(output.offset),
    )

    # Launch the kernel.
    kern(queue, global_shape, local_shape, *kernel_args)

    return output
def normal(self, *args, **kwargs):
    """Allocate a new array and fill it via :meth:`fill_normal`.

    ``mu`` (default 0) and ``sigma`` (default 1) are taken out of the
    keyword arguments and forwarded to :meth:`fill_normal`; everything
    else is passed on to ``cl_array.empty``.
    """
    fill_args = {
        "mu": kwargs.pop("mu", 0),
        "sigma": kwargs.pop("sigma", 1),
    }

    target = cl_array.empty(*args, **kwargs)
    self.fill_normal(target, queue=target.queue, **fill_args)
    return target
def uniform(self, *args, **kwargs):
    """Allocate a new array and fill it via :meth:`fill_uniform`.

    ``a`` (default 0) and ``b`` (default 1) are taken out of the keyword
    arguments and forwarded to :meth:`fill_uniform`; everything else is
    passed on to ``cl_array.empty``.
    """
    fill_args = {
        "a": kwargs.pop("a", 0),
        "b": kwargs.pop("b", 1),
    }

    target = cl_array.empty(*args, **kwargs)
    self.fill_uniform(target, queue=target.queue, **fill_args)
    return target
def axis_convolve_ifilter(X, h, axis=0, queue=None, output=None):
    """Apply the ifilter kernel along ``axis`` with an element step of 0.5.

    :param X: input array.
    :param h: filter handed to ``_apply_kernel``.
    :param axis: axis along which the kernel is applied.
    :param queue: passed through ``to_queue`` to obtain the command queue.
    :param output: optional result array.  When omitted a float32 array is
        created with the shape of ``X`` but twice the extent along ``axis``.
    :returns: whatever ``_apply_kernel`` returns.
    """
    _check_cl()
    queue = to_queue(queue)
    kern = _ifilter_kernel_for_queue(queue.context)

    if output is None:
        # Twice as many samples along the filtered axis.
        doubled = list(X.shape)
        doubled[axis] = doubled[axis] << 1
        output = cl_array.empty(queue, doubled, np.float32)

    return _apply_kernel(X, h, kern, output, axis=axis, elementstep=0.5)
def set_ggr(self, ggr):
    """Configure the growth-rate plan and the crossing values.

    :param ggr: 2-tuple ``(growth_rate, ncrossings)``.  ``ncrossings`` may
        be a number (crossings become ``np.arange(0, 1, 1/ncrossings)[1:]``),
        a list, tuple or ndarray (used as given), or ``None``.

    Domains must have been set first (``self.can_has_domains``).  Sets
    ``self.plan``, ``self.target``, ``self.optimized_spa``,
    ``self.direction`` and finally ``self.can_has_ggr``.
    """
    assert self.can_has_domains, "must set domains before ggr"
    assert isinstance(ggr, tuple) and len(ggr) == 2, "ggr must be a 2-tuple"

    growth_rate, ncrossings = ggr

    window_length = 10  # can be changed but not exposed for simplicity
    rate = (1+growth_rate)**(1./window_length)-1

    self.plan = self._ggr_make_plan(self.m0, rate, 0.02, 50)
    self.target = 0
    self.optimized_spa = 0.05

    # NOTE(review): the extent of this first-time block was reconstructed
    # from a whitespace-collapsed source -- confirm against the original.
    if not self.can_has_ggr:
        self.next_crossing = 0.0
        self.crossed = False
        self.ggr_tracker = np.zeros((len(self.plan), 3), float)

        self.spa_buffer = cla.empty(self.queue, (self.N, self.N), np.float32)
        self.whenflipped = cla.empty(self.queue, (self.N, self.N), np.int32)

        # build the lookup table for the recency enforcement
        # these parameters can be changed but are not exposed to the user
        # to keep things simple
        rmin, rmax, rrate = 0.05, 2., 0.5
        x = np.arange(len(self.plan)).astype('float')
        recency_need = rmin*rmax*np.exp(rrate*x)/(rmax+rmin*np.exp(rrate*x))
        self.recency_need = cla.to_device(self.queue,
                                          recency_need.astype(np.float32))

        self.set_zero(self.whenflipped)

    # self.crossings are the values of m_out which, when crossed over,
    # generate a signal to save the output to make a movie out of or whatever
    if isinstance(ncrossings, (int, float)):
        self.crossings = np.arange(0, 1, 1./ncrossings)[1:]
    if isinstance(ncrossings, (list, tuple, np.ndarray)):
        self.crossings = ncrossings
    # Identity test: "!= None" is evaluated elementwise on an ndarray and
    # the resulting array cannot be used as an ``if`` condition.
    if ncrossings is not None:
        self.next_crossing = self.crossings[-1]

    self.direction = np.sign(self.m0-self.plan[-1])
    self.can_has_ggr = True
def allocate_space(self, x_peak, y_peak, k_max, order, type):
    """Build the flattened k, k' and step matrices and upload them.

    The contour points and weights come from ``calc.triangle_contour``.
    With ``size = len(points)`` each flattened matrix has ``size**2``
    entries: ``k`` cycles through the points, while ``k_prim`` and ``step``
    repeat each point / weight ``size`` times in a row.

    :param x_peak, y_peak, k_max, order: forwarded to
        ``calc.triangle_contour``; ``k_max`` is also stored on ``self``.
    :param type: element type of the host and device arrays.
    """
    points, weights = calc.triangle_contour(x_peak, y_peak, k_max, order)  # Generate weights.

    self.k_max = k_max
    size = len(points)
    self.size = size
    indices = range(size)

    # Entry i of the flattened k-matrix is points[i % size].
    k_values = [points[col] for _ in indices for col in indices]
    # Entry i of the k_prim- and step-matrices uses index i // size.
    k_prim_values = [points[row] for row in indices for _ in indices]
    step_values = [weights[row] for row in indices for _ in indices]

    host_k = numpy.array(k_values).astype(type)
    host_k_prim = numpy.array(k_prim_values).astype(type)
    host_step = numpy.array(step_values).astype(type)

    # Flush the three matrices to the gpu.
    self.gpu_k = cl_array.to_device(self.ctx, self.queue, host_k)
    self.gpu_k_prim = cl_array.to_device(self.ctx, self.queue, host_k_prim)
    self.gpu_step = cl_array.to_device(self.ctx, self.queue, host_step)

    # Allocate space for results.
    self.gpu_result = cl_array.empty(self.queue, (size**2, 1,), type)
def _coh_gaussian2(t_array, w, tz):
    """Run the ``coh_gauss`` kernel of ``prg2`` and normalize the result.

    :param t_array: array whose first two axes define the output; a
        non-zero ``tz`` is subtracted from it with ``-=``, i.e. on the
        object passed in by the caller.
    :param w: width; passed to the kernel as ``w/1.4142``.
    :param tz: offset subtracted from ``t_array``.
    :returns: float32 host array of shape ``(t.shape[0], t.shape[1], 4)``,
        divided by its absolute maximum along axis 0.
    """
    if tz != 0.:
        t_array -= tz

    n_rows, n_cols = t_array.shape[0], t_array.shape[1]
    t_host = t_array.astype(np.float32)
    t_dev = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=t_host)

    out_dev = cl_array.empty(queue, shape=(n_rows, n_cols, 4),
                             dtype=np.float32)

    event = prg2.coh_gauss(queue, (t_host.size, 3), None, t_dev,
                           np.float32(w/1.4142), out_dev.data)
    event.wait()

    result = out_dev.get()
    result /= np.abs(result).max(0)
    return result
def __init__(self, ctx, a_dev, b_dev, mat_vec_knl):
    """Store the operands and create the queue and work buffers.

    :param ctx: OpenCL context, stored as ``self.context``.
    :param a_dev: device matrix; its shape sizes ``x_dev`` and ``y_host``.
    :param b_dev: device array; its ``nbytes`` sizes the host-pointer buffer.
    :param mat_vec_knl: kernel, stored as ``self.mat_vec_knl``.
    """
    self.context = ctx
    self.mat_vec_knl = mat_vec_knl
    self.a_dev = a_dev
    self.b_dev = b_dev

    self.queue = cl.CommandQueue(ctx, properties=cq_props)

    n_rows = a_dev.shape[0]
    n_cols = a_dev.shape[1]

    # One float32 entry per matrix column.
    self.x_dev = cl_array.empty(self.queue, (n_cols,), dtype=np.float32)

    # Buffer allocated with ALLOC_HOST_PTR and a host array view onto it,
    # one float32 entry per matrix row.
    flags = cl.mem_flags
    self.y_host_buf = cl.Buffer(ctx, flags.ALLOC_HOST_PTR, self.b_dev.nbytes)
    self.y_host = self.y_host_buf.get_host_array(n_rows, dtype=np.float32)
def color_deconvolution(self, rgb, stain):
    """Return stains in normal (non-logarithmic) color space.

    Runs three kernels in sequence: ``opticalDense`` on the flattened
    image, ``gemm_slow`` with the stain matrix, and ``toColorDense`` on
    the product.

    :param rgb: image; reshaped to ``(-1, 3)`` for processing.
    :param stain: stain matrix.
    :returns: host array reshaped to ``rgb.shape``.
    """
    rgb = self.check_contiguous(rgb)
    stain = self.check_contiguous(stain)
    assert(rgb.flags.c_contiguous == stain.flags.c_contiguous)

    queue = cl.CommandQueue(self.ctx)
    pool = self.mem_pool

    # One row per pixel, columns R, G, B.
    pixels = rgb.reshape(-1, 3)
    n_pixels, n_channels = pixels.shape
    n_stains = stain.shape[1]

    pixels_dev = cla.to_device(queue, pixels, allocator=pool)
    stain_dev = cla.to_device(queue, stain, allocator=pool)
    result_dev = cla.empty(queue, (n_pixels, n_stains),
                           dtype=pixels_dev.dtype, order="C",
                           allocator=pool)

    # The first kernel works on the pixels as one flat array.
    self.prg.opticalDense(queue, (pixels.size, 1), None, pixels_dev.data)

    # One work item per element of the result matrix.
    self.prg.gemm_slow(queue, result_dev.shape, None,
                       result_dev.data, pixels_dev.data, stain_dev.data,
                       np.int32(n_channels), np.int32(n_stains))

    self.prg.toColorDense(queue, (result_dev.size, 1), None,
                          result_dev.data)

    # Back to the layout of the input image.
    return result_dev.get().reshape(rgb.shape)
def dot(self, A, B):
    """Multiply ``A`` and ``B`` with the ``gemm_slow`` kernel.

    Output must have same shape as A.
    Incoming RGB matrix "A" should be aligned.

    :param A: left operand; converted to float32 if it is not already.
    :param B: right operand; converted to float32 if it is not already.
    :returns: host array of shape ``(A.shape[0], B.shape[1])``.
    """
    A = self.check_contiguous(A)
    B = self.check_contiguous(B)
    assert(A.flags.c_contiguous == B.flags.c_contiguous)

    queue = cl.CommandQueue(self.ctx)

    # Compare dtypes by value: a dtype instance is never *identical* to
    # the np.float32 type object, so an identity test would always copy.
    if A.dtype != np.float32:
        A = A.astype(np.float32)
    if B.dtype != np.float32:
        B = B.astype(np.float32)

    A_g = cla.to_device(queue, A, self.mem_pool)
    B_g = cla.to_device(queue, B, self.mem_pool)
    C_g = cla.empty(queue, (A.shape[0], B.shape[1]), dtype=A_g.dtype,
                    order="C", allocator=self.mem_pool)

    # One work item per element of the result matrix.
    self.prg.gemm_slow(queue, C_g.shape, None,
                       C_g.data, A_g.data, B_g.data,
                       np.int32(A.shape[1]), np.int32(B.shape[1]))

    return C_g.get()
def _fold_exp(t_array, w, tz, tau_arr):
    """Run the ``fold_exp`` kernel of ``prg`` and return the host result.

    :param t_array: array whose first two axes define the output; a
        non-zero ``tz`` is subtracted from it with ``-=``, i.e. on the
        object passed in by the caller.
    :param w: width, passed to the kernel as float32.
    :param tz: offset subtracted from ``t_array``.
    :param tau_arr: array whose reciprocal is uploaded as float32.
    :returns: float32 host array of shape
        ``(t.shape[0], t.shape[1], tau_arr.size)``.
    """
    if tz != 0.:
        t_array -= tz

    n_rows, n_cols = t_array.shape[0], t_array.shape[1]
    t_host = t_array.astype(np.float32)
    rates_host = (1/tau_arr).astype(np.float32)

    upload_flags = mf.READ_ONLY | mf.COPY_HOST_PTR
    t_dev = cl.Buffer(ctx, upload_flags, hostbuf=t_host)
    rates_dev = cl.Buffer(ctx, upload_flags, hostbuf=rates_host)

    out_dev = cl_array.empty(queue, shape=(n_rows, n_cols, tau_arr.size),
                             dtype=np.float32)

    # Pad the first global extent to the next multiple of the work-group
    # size strictly above the element count; the true element count is
    # passed to the kernel as its last argument.
    group = work_size[0]
    padded_size = (t_host.size // group + 1) * group

    event = prg.fold_exp(queue, (padded_size, tau_arr.size), work_size,
                         t_dev, np.float32(w), rates_dev, out_dev.data,
                         np.uint32(t_host.size))
    event.wait()

    return out_dev.get()
def unmix_stains(self, rgb, stain):
    """Take RGB IHC image and split it to stains like skimage version.

    Runs ``opticalDense`` on the flattened image and then ``gemm_slow``
    with the stain matrix.

    :param rgb: image; reshaped to ``(-1, 3)`` for processing.
    :param stain: stain matrix.
    :returns: host array reshaped to ``rgb.shape``.
    """
    rgb = self.check_contiguous(rgb)
    stain = self.check_contiguous(stain)
    assert(rgb.flags.c_contiguous == stain.flags.c_contiguous)

    queue = cl.CommandQueue(self.ctx)
    pool = self.mem_pool

    # One row per pixel, columns R, G, B.
    pixels = rgb.reshape(-1, 3)
    n_pixels, n_channels = pixels.shape
    n_stains = stain.shape[1]

    pixels_dev = cla.to_device(queue, pixels, allocator=pool)
    stain_dev = cla.to_device(queue, stain, allocator=pool)
    result_dev = cla.empty(queue, (n_pixels, n_stains),
                           dtype=pixels_dev.dtype, order="C",
                           allocator=pool)

    # The first kernel works on the pixels as one flat array.
    self.prg.opticalDense(queue, (pixels.size, 1), None, pixels_dev.data)

    # One work item per element of the result matrix.
    self.prg.gemm_slow(queue, result_dev.shape, None,
                       result_dev.data, pixels_dev.data, stain_dev.data,
                       np.int32(n_channels), np.int32(n_stains))

    # Back to the layout of the input image.
    return result_dev.get().reshape(rgb.shape)
def empty(self, shape, dtype, order="C"):
    """Return an uninitialized ``pyopencl.array`` array on ``self.queue``.

    :param shape: shape of the new array.
    :param dtype: element type of the new array.
    :param order: memory layout, forwarded unchanged (default ``"C"``).
    """
    import pyopencl.array as device_arrays
    return device_arrays.empty(self.queue, shape, dtype, order=order)
def __init__(self, queue, num_work_items=None,
             luxury=None, seed=None, no_warmup=False,
             use_legacy_init=False, max_work_items=None):
    """
    :param queue: :class:`pyopencl.CommandQueue`, only used for initialization
    :param luxury: the "luxury value" of the generator, and should be 0-4,
        where 0 is fastest and 4 produces the best numbers. It can also be
        >=24, in which case it directly sets the p-value of RANLUXCL.
    :param num_work_items: is the number of generators to initialize,
        usually corresponding to the number of work-items in the NDRange
        RANLUXCL will be used with.
        May be `None`, in which case a default value is used.
    :param seed: seed passed to the initialization kernel as ``uint32``.
        May be `None`, in which case it is derived from the current time,
        reduced modulo ``2 << 30``.
    :param max_work_items: should reflect the maximum number of work-items
        that will be used on any parallel instance of RANLUXCL. So for
        instance if we are launching 5120 work-items on GPU1 and 10240
        work-items on GPU2, GPU1's RANLUXCLTab would be generated by
        calling ranluxcl_intialization with numWorkitems = 5120 while
        GPU2's RANLUXCLTab would use numWorkitems = 10240. However
        maxWorkitems must be at least 10240 for both GPU1 and GPU2, and it
        must be set to the same value for both. (may be `None`)

    .. versionchanged:: 2013.1
        Added default value for `num_work_items`.
    """
    if luxury is None:
        luxury = 4

    if num_work_items is None:
        # Default: a fixed number of generators per compute unit, fewer
        # on CPU devices.
        if queue.device.type & cl.device_type.CPU:
            num_work_items = 8 * queue.device.max_compute_units
        else:
            num_work_items = 64 * queue.device.max_compute_units

    if seed is None:
        from time import time
        # "%" binds tighter than "<<": without the parentheses the value
        # would be (t % 2) << 30, i.e. only 0 or 2**30.
        seed = int(time()*1e6) % (2 << 30)

    self.context = queue.context
    self.luxury = luxury
    self.num_work_items = num_work_items

    from pyopencl.characterize import has_double_support
    self.support_double = has_double_support(queue.device)

    self.no_warmup = no_warmup
    self.use_legacy_init = use_legacy_init
    self.max_work_items = max_work_items

    src = """
        %(defines)s

        #include <pyopencl-ranluxcl.cl>

        kernel void init_ranlux(unsigned seeds,
            global ranluxcl_state_t *ranluxcltab)
        {
          if (get_global_id(0) < %(num_work_items)d)
            ranluxcl_initialization(seeds, ranluxcltab);
        }
        """ % {
            "defines": self.generate_settings_defines(),
            "num_work_items": num_work_items
        }
    prg = cl.Program(queue.context, src).build()

    # {{{ compute work group size

    wg_size = None

    import sys
    import platform
    if ("darwin" in sys.platform
            and "Apple" in queue.device.platform.vendor
            and platform.mac_ver()[0].startswith("10.7")
            and queue.device.type & cl.device_type.CPU):
        wg_size = (1,)

    self.wg_size = wg_size

    # }}}

    # 112 bytes of state per work item, pre-filled before initialization.
    self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
    self.state.fill(17)

    prg.init_ranlux(queue, (num_work_items,), self.wg_size,
                    np.uint32(seed), self.state.data)