def get_binned_data_stereographic(self, limits=((-1, 1), (-1, 1)), points=500):
    """Stereographically project measured ray endpoints and bin them.

    The projection onto the xy plane is computed by the
    ``stereograph_project`` kernel on the CL device, which is a lot faster
    when you have loads of data. The projected points are then binned on
    the host with ``np.histogram2d``.

    :param limits: binning range as ``((xmin, xmax), (ymin, ymax))``.
    :param points: number of bins along each axis.
    :return: ``(H, x_coord, y_coord)`` as produced by ``np.histogram2d``;
        the same tuple is stored in ``self.hist_data``.
    """
    (pos0, pwr0) = self.get_measured_rays()
    queue = self.queue

    # Kernel inputs.
    pos0_dev = cl_array.to_device(queue, pos0.astype(np.float32))
    pwr0_dev = cl_array.to_device(queue, pwr0.astype(np.float32))
    pivot = cl_array.to_device(queue, np.array([0, 0, 0, 0], dtype=np.float32))
    R_dev = cl_array.to_device(
        queue,
        np.array([[1, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 0]]).astype(np.float32))

    # Kernel outputs, shaped like the power array.
    x_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)
    y_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)
    pwr_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)

    evt = self.prg.stereograph_project(
        queue, pwr0.shape, None,
        pos0_dev.data, pwr0_dev.data, R_dev.data, pivot.data,
        x_dev.data, y_dev.data, pwr_dev.data)
    evt.wait()

    x = x_dev.get()
    y = y_dev.get()
    pwr = np.float64(pwr_dev.get())

    # Divide the weights by the area of one bin so that the histogram holds
    # power per unit area rather than power per bin.
    dx = np.float64(limits[0][1] - limits[0][0]) / np.float64(points)
    dy = np.float64(limits[1][1] - limits[1][0]) / np.float64(points)
    pwr = pwr / (dx * dy)

    (H, x_coord, y_coord) = np.histogram2d(
        x=x.flatten(), y=y.flatten(), bins=points, range=limits,
        weights=pwr.flatten())
    self.hist_data = (H, x_coord, y_coord)
    return self.hist_data
def test_random(ctx_factory):
    """Check that RanluxGenerator output stays within the requested bounds."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import RanluxGenerator

    # Only test double precision when the device can handle it.
    dtypes = [np.float32]
    if has_double_support(context.devices[0]):
        dtypes = [np.float32, np.float64]

    gen = RanluxGenerator(queue, 5120)

    for ary_size in (300, 301, 302, 303, 10007):
        for dtype in dtypes:
            # Default range: strictly between 0 and 1.
            ran = cl_array.zeros(queue, ary_size, dtype)
            gen.fill_uniform(ran)
            host = ran.get()
            assert (0 < host).all()
            assert (host < 1).all()

            gen.synchronize(queue)

            # Explicit range: strictly between 4 and 7.
            ran = cl_array.zeros(queue, ary_size, dtype)
            gen.fill_uniform(ran, a=4, b=7)
            host = ran.get()
            assert (4 < host).all()
            assert (host < 7).all()

            # Only checks that drawing normal variates does not fail.
            ran = gen.normal(queue, (10007,), dtype, mu=4, sigma=3)

    # Integer uniforms: lower bound inclusive, upper bound exclusive.
    dtypes = [np.int32]
    for dtype in dtypes:
        ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300)
        host = ran.get()
        assert (200 <= host).all()
        assert (host < 300).all()
def init_buffers(self, kernels):
    """Allocate the host and device buffers used by the convolution.

    :param kernels: nested mapping ``kernels[cell][centre]``; every
        ``(cell, centre)`` pair is handed to ``self.kernels_to_buffers``
        and gets its own host back buffer.
    :raises Exception: if ``kernels`` is ``None`` or has no keys.
    """
    if kernels is None or not kernels.keys():
        raise Exception('No kernels found for OpenCL convolution')

    def host_float_buffer():
        # float32 host buffer with the same shape as the source image buffer
        return numpy.zeros_like(self.source_host_buffer, dtype=numpy.float32)

    def gpu_float_buffer():
        return cl_array.zeros(self.queue, self.array_size, numpy.float32)

    # Source image: flat uint8 buffer on the host plus its device twin.
    self.source_host_buffer = numpy.zeros(self.image_width * self.image_height,
                                          dtype=numpy.uint8)
    self.source_gpu_buffer = cl_array.zeros(self.queue, self.array_size,
                                            numpy.uint8)

    # Two scratch buffers on each side, keyed by TMP1 / TMP2.
    self.temporal_host_buffers = {}
    self.temporal_host_buffers[TMP1] = host_float_buffer()
    self.temporal_host_buffers[TMP2] = host_float_buffer()

    self.temporal_gpu_buffers = {}
    self.temporal_gpu_buffers[TMP1] = gpu_float_buffer()
    self.temporal_gpu_buffers[TMP2] = gpu_float_buffer()

    # Destination of the filtered image.
    self.filtered_host_buffer = host_float_buffer()
    self.filtered_gpu_buffer = gpu_float_buffer()

    # Per-(cell, centre) kernel buffers and host back buffers.
    self.kernel_host_buffers = {}
    self.kernel_gpu_buffers = {}
    self.filtered_host_back_buffers = {}

    for cell in kernels.keys():
        self.kernel_host_buffers[cell] = {}
        self.kernel_gpu_buffers[cell] = {}
        self.filtered_host_back_buffers[cell] = {}

        for centre in kernels[cell].keys():
            self.kernels_to_buffers(kernels, cell, centre)
            self.filtered_host_back_buffers[cell][centre] = host_float_buffer()
def _allocate_arrays(self):
    """Allocate the CPU and GPU work arrays sized after ``self._target``."""
    real_shape = self._target.shape

    # The transform arrays keep only half (plus one) of the first axis.
    self._ft_shape = (real_shape[0] // 2 + 1,) + tuple(real_shape[1:])
    self._shape = real_shape

    # Host-side results.
    self._lcc = np.zeros(real_shape, dtype=np.float32)
    self._rot = np.zeros(real_shape, dtype=np.int32)

    # Real-valued device arrays.
    real_names = ('_target2', '_rot_template', '_rot_mask', '_rot_mask2',
                  '_gcc', '_ave', '_ave2', '_glcc')
    for attr in real_names:
        device_array = cl_array.zeros(self._queue, self._shape, dtype=np.float32)
        setattr(self, attr, device_array)
    self._grot = cl_array.zeros(self._queue, self._shape, dtype=np.int32)

    # Complex device arrays, stored as ``_ft_<name>``.
    complex_names = ('target', 'target2', 'template', 'mask', 'mask2',
                     'gcc', 'ave', 'ave2', 'lcc')
    for suffix in complex_names:
        host_zeros = np.zeros(self._ft_shape, dtype=np.complex64)
        setattr(self, '_ft_' + suffix,
                cl_array.to_device(self._queue, host_zeros))
def get_binned_data_angular(self, limits=((-1, 1), (-1, 1)), points=500):
    """Azimuth/elevation map measured ray endpoints to a circle and bin them.

    The mapping is computed by the ``angular_project`` kernel on the CL
    device. It linearly maps elevation to the circle's radius and azimuth
    to phi, which is nice for cross-section plots of directivity. The
    mapped points are then binned on the host with ``np.histogram2d``.

    :param limits: binning range as ``((xmin, xmax), (ymin, ymax))``.
    :param points: number of bins along each axis.
    :return: ``(H, x_coord, y_coord)`` as produced by ``np.histogram2d``;
        the same tuple is stored in ``self.hist_data``.
    """
    (pos0, pwr0) = self.get_measured_rays()
    queue = self.queue

    # Kernel inputs.
    pos0_dev = cl_array.to_device(queue, pos0.astype(np.float32))
    pwr0_dev = cl_array.to_device(queue, pwr0.astype(np.float32))
    pivot = cl_array.to_device(queue, np.array([0, 0, 0, 0], dtype=np.float32))
    R_dev = cl_array.to_device(
        queue,
        np.array([[1, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 0]]).astype(np.float32))

    # Kernel outputs, shaped like the power array.
    x_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)
    y_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)
    pwr_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)

    evt = self.prg.angular_project(
        queue, pwr0.shape, None,
        pos0_dev.data, pwr0_dev.data, R_dev.data, pivot.data,
        x_dev.data, y_dev.data, pwr_dev.data)
    evt.wait()

    x = x_dev.get()
    y = y_dev.get()
    pwr = np.float64(pwr_dev.get())

    # Divide the weights by the area of one bin so that the histogram holds
    # power per unit area rather than power per bin.
    dx = np.float64(limits[0][1] - limits[0][0]) / np.float64(points)
    dy = np.float64(limits[1][1] - limits[1][0]) / np.float64(points)
    pwr = pwr / (dx * dy)

    (H, x_coord, y_coord) = np.histogram2d(
        x=x.flatten(), y=y.flatten(), bins=points, range=limits,
        weights=pwr.flatten())
    self.hist_data = (H, x_coord, y_coord)
    return self.hist_data
def _setupVariables(self, x, data):
    """Upload ``x`` and ``data`` and allocate the device work arrays.

    Everything is placed on ``self._queue[0]``.

    :return: ``(step_out, tmp_results, step_in, data)`` where the first
        three are dicts of device arrays and ``data`` is the uploaded
        measurement cast to ``self._DTYPE``.
    """
    queue = self._queue[0]
    data = clarray.to_device(queue, data.astype(self._DTYPE))

    x_dev = clarray.to_device(queue, x)
    step_in = {
        "x": x_dev,
        "xold": clarray.to_device(queue, x),
        "xk": x_dev.copy(),
    }
    step_out = {"x": clarray.zeros_like(x_dev)}

    tmp_results = {
        "gradFx": x_dev.copy(),
        "DADA": clarray.zeros_like(x_dev),
        "DAd": clarray.zeros_like(x_dev),
        "d": data.copy(),
        "Ax": clarray.zeros_like(data),
        "temp_reg": clarray.zeros_like(x_dev),
        # Same shape as x with one extra trailing axis of length 4.
        "gradx": clarray.zeros(queue, x_dev.shape + (4,), dtype=self._DTYPE),
        # Real-valued, extra trailing axis of length 2.
        "reg_norm": clarray.zeros(
            queue, x_dev.shape + (2,), dtype=self._DTYPE_real),
        "reg": clarray.zeros(queue, x_dev.shape, dtype=self._DTYPE_real),
    }

    return (step_out, tmp_results, step_in, data)
def _allocate_arrays(self):
    """Create every host and device array the search needs.

    All shapes derive from ``self._target.shape``; the complex arrays use
    ``self._ft_shape``, whose first axis is ``shape[0] // 2 + 1``.
    """
    shape = self._target.shape
    self._ft_shape = tuple([shape[0] // 2 + 1, *shape[1:]])
    self._shape = shape

    # CPU side
    self._lcc = np.zeros(shape, dtype=np.float32)
    self._rot = np.zeros(shape, dtype=np.int32)

    # GPU side, float32
    for name in ['_target2', '_rot_template', '_rot_mask', '_rot_mask2',
                 '_gcc', '_ave', '_ave2', '_glcc']:
        setattr(self, name,
                cl_array.zeros(self._queue, self._shape, dtype=np.float32))

    # GPU side, int32
    self._grot = cl_array.zeros(self._queue, self._shape, dtype=np.int32)

    # GPU side, complex64 (uploaded from zero-filled host arrays)
    for name in ['target', 'target2', 'template', 'mask', 'mask2',
                 'gcc', 'ave', 'ave2', 'lcc']:
        zeros_host = np.zeros(self._ft_shape, dtype=np.complex64)
        setattr(self, '_ft_' + name,
                cl_array.to_device(self._queue, zeros_host))
def sum_labeled(src, labels, n=None, clq=None):
    """Sum ``src`` per label using ``sum_labeled_dev``.

    :param src: values to sum. A boolean array is cast to ``uint8`` and
        uploaded together with ``labels``; anything else is passed to the
        kernel wrapper unchanged.
    :param labels: label array matching ``src``.
    :param n: number of labels; defaults to ``labels.max() + 1``.
    :param clq: command queue; a new one is created on ``ctx`` if omitted.
    :return: the per-label sums, mapped to the host when the inputs were
        uploaded here, otherwise the device array itself.
    """
    if clq is None:
        clq = cl.CommandQueue(ctx)

    # ``numpy.bool`` was removed in NumPy 1.24; ``numpy.bool_`` is the
    # scalar type that has always backed the boolean dtype.
    return_dev = False
    if src.dtype == numpy.bool_:
        src = src.astype(numpy.uint8)
        src_dev = cl_array.to_device(clq, src)
        labels_dev = cl_array.to_device(clq, labels)
    else:
        # NOTE(review): every non-bool input is treated as already being
        # on the device -- confirm that callers never pass a non-bool
        # host array here.
        return_dev = True
        src_dev = src
        labels_dev = labels

    if n is None:
        n = labels_dev.max() + 1

    # Intermediate per-work-item sums and the final per-label result.
    tmp_dev = cl_array.zeros(clq, (TOTAL_ITEMS, n), float32)
    dst_dev = cl_array.zeros(clq, (n, ), float32)
    sum_labeled_dev(clq, src_dev, labels_dev, tmp_dev, dst_dev)

    if return_dev:
        return dst_dev

    result = dst_dev.map_to_host()
    clq.finish()
    return result
def test_rotate_grid3d(self):
    """Run ``rotate_grid3d`` with an identity and a 90 degree z rotation."""
    kernel = self.p.program.rotate_grid3d
    side = 2 * self.values["llength"] + 1
    gws = (side,) * 3
    padding = [0] * 7  # the 3x3 matrix is padded to 16 floats

    # --- Identity rotation on a grid filled with ones ---
    rotmat = np.asarray([1, 0, 0,
                         0, 1, 0,
                         0, 0, 1] + padding, dtype=np.float32)
    self.cl_grid = cl_array.zeros(self.queue, self.shape, dtype=np.float32)
    self.cl_grid.fill(1)
    self.cl_out = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

    kernel(self.queue, gws, None, self.cl_grid.data, rotmat, self.cl_out.data)

    answer = [
        [[1.0, 1.0, 1.0], [1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [1.0, 0.0, 0.0]],
        [[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
        [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
        [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
        [[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
    ]
    self.assertTrue(np.allclose(answer, self.cl_out.get()))

    # --- 90 degree rotation around the z-axis on two set voxels ---
    rotmat = np.asarray([0, -1, 0,
                         1, 0, 0,
                         0, 0, 1] + padding, dtype=np.float32)
    grid = np.zeros(self.shape, dtype=np.float32)
    grid[0, 0, 0] = 1
    grid[0, 0, 1] = 1
    self.cl_grid = cl_array.to_device(self.queue, grid)
    self.cl_out.fill(0)

    kernel(self.queue, gws, None, self.cl_grid.data, rotmat, self.cl_out.data)

    expected = np.zeros_like(grid)
    expected[0, 0, 0] = 1
    expected[0, 1, 0] = 1
    self.assertTrue(np.allclose(expected, self.cl_out.get()))
def __init__(self, decomp, context, queue, grid_shape, dtype):
    """Set up forward and backward gpyfft plans between ``fx`` and ``fk``.

    :param decomp: stored as ``self.decomp``.
    :param context: OpenCL context handed to ``gpyfft.FFT``.
    :param queue: command queue used for the arrays and plans.
    :param grid_shape: shape of the position-space array ``self.fx``.
    :param dtype: dtype of ``self.fx``; the transform is treated as
        real-to-complex when its kind is ``"f"``.
    """
    self.decomp = decomp
    self.grid_shape = grid_shape
    self.dtype = np.dtype(dtype)
    is_real = self.dtype.kind == "f"
    self.is_real = is_real

    from pystella.fourier import get_complex_dtype_with_matching_prec
    cdtype = get_complex_dtype_with_matching_prec(self.dtype)
    self.cdtype = cdtype
    from pystella.fourier import get_real_dtype_with_matching_prec
    self.rdtype = get_real_dtype_with_matching_prec(self.dtype)

    # Position-space and momentum-space work arrays.
    self.fx = cla.zeros(queue, grid_shape, dtype)
    self.fk = cla.zeros(queue, self.shape(is_real), cdtype)

    from gpyfft import FFT
    plan_options = dict(real=is_real, scale_forward=1, scale_backward=1)
    self.forward = FFT(context, queue, self.fx, out_array=self.fk,
                       **plan_options)
    self.backward = FFT(context, queue, self.fk, out_array=self.fx,
                        **plan_options)

    # One empty slice specification per axis.
    slc = ((), (), ())
    self.sub_k = get_sliced_momenta(grid_shape, self.dtype, slc, queue)
def test_clashvol(self):
    """Compare the GPU clash volume with a NumPy FFT reference."""
    # ``randint(high)`` draws from [0, high), so the bound must be the
    # number of rotations itself; ``shape[0] + 1`` could yield an index
    # one past the end and raise IndexError intermittently.
    NROT = np.random.randint(self.rotations.shape[0])
    rotmat = self.rotations[NROT]

    # CPU reference: rotate the ligand surface, then correlate with rcore.
    cpu_lsurf = np.zeros_like(self.im_lsurf.array)
    disvis.libdisvis.rotate_image3d(self.im_lsurf.array, self.vlength,
                                    np.linalg.inv(rotmat), self.im_center,
                                    cpu_lsurf)
    cpu_clashvol = numpy.fft.irfftn(
        numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rcore.array),
        s=self.shape)

    # GPU inputs.
    gpu_rcore = cl_array.to_device(
        self.queue, np.asarray(self.rcore.array, dtype=np.float32))
    gpu_im_lsurf = cl.image_from_array(
        self.queue.context, np.asarray(self.im_lsurf.array, dtype=np.float32))
    gpu_lsurf = cl_array.zeros(self.queue, self.shape, dtype=np.float32)
    self.kernels.rotate_image3d(self.queue, self.sampler, gpu_im_lsurf,
                                rotmat, gpu_lsurf, self.im_center)

    # GPU work arrays.
    gpu_ft_lsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
    gpu_ft_rcore = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
    gpu_ft_clashvol = cl_array.zeros(self.queue, self.ft_shape,
                                     dtype=np.complex64)
    gpu_clashvol = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

    # Same correlation on the device.
    self.kernels.rfftn(self.queue, gpu_rcore, gpu_ft_rcore)
    self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf)
    self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rcore,
                                 gpu_ft_clashvol)
    self.kernels.irfftn(self.queue, gpu_ft_clashvol, gpu_clashvol)

    self.assertTrue(np.allclose(cpu_clashvol, gpu_clashvol.get(), atol=0.8))
def __init__(self, shape, do_checks=False, ctx=None, devicetype="all",
             platformid=None, deviceid=None, profile=False):
    """
    Create a "Linear Algebra" plan for a given image shape.

    :param shape: shape of the image (num_rows, num_columns)
    :param do_checks (optional): if True, memory and data type checks are
        performed when possible.
    :param ctx: actual working context, left to None for automatic
        initialization from device type or platformid/deviceid
    :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
    :param platformid: integer with the platform_identifier, as given by
        clinfo
    :param deviceid: Integer with the device identifier, as given by clinfo
    :param profile: switch on profiling to be able to profile at the kernel
        level, store profiling elements (makes code slightly slower)
    """
    OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                              platformid=platformid, deviceid=deviceid,
                              profile=profile)

    # Device arrays: a complex gradient and a real image, both image-sized.
    self.d_gradient = parray.zeros(self.queue, shape, np.complex64)
    self.d_image = parray.zeros(self.queue, shape, np.float32)
    device_arrays = {
        "d_gradient": self.d_gradient,
        "d_image": self.d_image,
    }
    self.add_to_cl_mem(device_arrays)

    self.wg2D = None
    self.shape = shape
    # The ND-range is (columns, rows), i.e. the image shape reversed.
    n_rows, n_cols = int(self.shape[0]), int(self.shape[1])
    self.ndrange2D = (n_cols, n_rows)
    self.do_checks = bool(do_checks)
    OpenclProcessing.compile_kernels(self, self.kernel_files)
def _alloctmparrays(self, inp_shape, outp_shape):
    """Append per-function, per-buffer temporary device arrays.

    Fills ``self.inp`` and ``self.outp``. Every allocated array has
    ``self.slices + self.overlap`` entries along its first axis and keeps
    the remaining axes of the corresponding shape. An empty input shape
    gets an empty list as placeholder instead of an array.
    """
    block_size = self.slices + self.overlap
    n_buffers = 2 * self.num_dev

    for j in range(self.num_fun):
        self.inp.append([])
        for i in range(n_buffers):
            self.inp[j].append([])
            queue = self.queue[4 * (i // 2)]
            for shape in inp_shape[j]:
                if len(shape) != 0:
                    entry = clarray.zeros(
                        queue,
                        (block_size,) + shape[1:],
                        dtype=self.dtype)
                else:
                    entry = []
                self.inp[j][i].append(entry)

    for j in range(self.num_fun):
        self.outp.append([])
        for i in range(n_buffers):
            self.outp[j].append(
                clarray.zeros(
                    self.queue[4 * (i // 2)],
                    (block_size,) + outp_shape[j][1:],
                    dtype=self.dtype))
def _gpu_init(self):
    """Initialize all the data for the GPU-accelerated search."""
    self.gpu_data = {}
    gpu = self.gpu_data
    host = self.data
    queue = self.queue
    context = queue.context

    # Move the data to the GPU as float32.
    gpu['rcore'] = cl_array.to_device(queue, float32array(host['rcore'].array))
    gpu['rsurf'] = cl_array.to_device(queue, float32array(host['rsurf'].array))

    # The scanning chain is stored as an Image, as this is faster to rotate.
    gpu['im_lsurf'] = cl.image_from_array(
        context, float32array(host['lsurf'].array))
    gpu['sampler'] = cl.Sampler(context, False, cl.addressing_mode.CLAMP,
                                cl.filter_mode.LINEAR)

    if self.distance_restraints:
        gpu['restraints'] = cl_array.to_device(
            queue, float32array(host['restraints']))

    # Float work arrays shaped like rcore.
    for name in ('lsurf', 'clashvol', 'intervol'):
        gpu[name] = cl_array.zeros_like(gpu['rcore'])

    # Integer work arrays shaped like the search grid.
    gpu['interspace'] = cl_array.zeros(queue, host['shape'], dtype=np.int32)
    for name in ('restspace', 'access_interspace', 'best_access_interspace'):
        gpu[name] = cl_array.zeros_like(gpu['interspace'])

    # Arrays for counting. Reductions are typically tedious on the GPU, so
    # the workgroup size is fixed here to allocate the correct amount of
    # data: one partial histogram per workgroup.
    WORKGROUPSIZE = 32
    nsubhists = int(np.ceil(gpu['rcore'].size/WORKGROUPSIZE))
    nrestraints = host['nrestraints']
    gpu['subhists'] = cl_array.zeros(
        queue, (nsubhists, nrestraints + 1), dtype=np.float32)
    gpu['viol_counter'] = cl_array.zeros(
        queue, (nsubhists, nrestraints, nrestraints), dtype=np.float32)

    # Complex arrays: the first axis shrinks to shape[0] // 2 + 1.
    ft_shape = list(host['shape'])
    ft_shape[0] = host['shape'][0]//2 + 1
    gpu['ft_shape'] = ft_shape
    gpu['ft_rcore'] = cl_array.zeros(queue, ft_shape, dtype=np.complex64)
    for name in ('ft_rsurf', 'ft_lsurf', 'ft_clashvol', 'ft_intervol'):
        gpu[name] = cl_array.zeros_like(gpu['ft_rcore'])

    # Other miscellaneous data, copied over unchanged.
    for key in ('nrot', 'max_clash', 'min_interaction'):
        gpu[key] = host[key]

    # Kernels, with the FFT plans attached.
    kernels = Kernels(context)
    gpu['k'] = kernels
    kernels.rfftn = pyclfft.RFFTn(context, host['shape'])
    kernels.irfftn = pyclfft.iRFFTn(context, host['shape'])

    # Initial calculations: transform the two fixed receptor grids once.
    kernels.rfftn(queue, gpu['rcore'], gpu['ft_rcore'])
    kernels.rfftn(queue, gpu['rsurf'], gpu['ft_rsurf'])
def test_split_slabs(ctx_factory, vanilla, split, parameters):
    """The split kernel must produce the same output as the vanilla one."""
    queue = cl.CommandQueue(ctx_factory())

    def run(kernel):
        # Each kernel gets its own zeroed 8-element int32 array as ``a``.
        buf = clarray.zeros(queue, 8, dtype=np.int32)
        _, (result, ) = kernel(queue, a=buf, **parameters)
        return result

    expect = run(vanilla)
    actual = run(split)

    assert np.array_equal(expect.get(), actual.get())
def initArrays(self):
    """Allocate the per-cell species, cell type and growth arrays."""
    queue = self.queue
    per_cell = (self.maxCells,)
    per_cell_species = (self.maxCells, self.nSpecies)

    # Species levels and rates exist on the device only.
    self.specLevel_dev = cl_array.zeros(queue, per_cell_species,
                                        dtype=numpy.float32)
    self.specRate_dev = cl_array.zeros(queue, per_cell_species,
                                       dtype=numpy.float32)

    # Cell type and effective growth have a host copy and a device copy.
    self.celltype = numpy.zeros(per_cell, dtype=numpy.int32)
    self.celltype_dev = cl_array.zeros(queue, per_cell, dtype=numpy.int32)
    self.effgrow = numpy.zeros(per_cell, dtype=numpy.float32)
    self.effgrow_dev = cl_array.zeros(queue, per_cell, dtype=numpy.float32)
def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False):
    """Check that uniform floats from ``rng_class`` stay inside their range."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    device = queue.device
    if (device.platform.vendor == "The pocl project"
            and device.type & cl.device_type.GPU
            and rng_class is RanluxGenerator):
        pytest.xfail("ranlux test fails on POCL + Nvidia,"
                "at least the Titan V, as of pocl 1.6, 2021-01-20")

    dtypes = [np.float32]
    if has_double_support(context.devices[0]):
        dtypes = [np.float32, np.float64]

    is_ranlux = rng_class is RanluxGenerator
    # Ranlux is constructed from a queue, the other generators from a context.
    gen = rng_class(queue, 5120) if is_ranlux else rng_class(context)

    for dtype in dtypes:
        print(dtype)

        # Default range [0, 1].
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran)

        if plot_hist:
            import matplotlib.pyplot as pt
            pt.hist(ran.get(), 30)
            pt.show()

        assert (0 <= ran.get()).all()
        assert (ran.get() <= 1).all()

        if is_ranlux:
            gen.synchronize(queue)

        # Explicit range [4, 7]; print the offenders before failing.
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran, a=4, b=7)
        ran_host = ran.get()

        for cond in (4 <= ran_host, ran_host <= 7):
            good = cond.all()
            if not good:
                print(np.where(~cond))
                print(ran_host[~cond])
            assert good

        # Normal variates are only drawn (and optionally plotted).
        ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)

        if plot_hist:
            import matplotlib.pyplot as pt
            pt.hist(ran.get(), 30)
            pt.show()
def test_zero_size_array(ctx_factory, empty_shape):
    """Allocate, fill, add and round-trip arrays of an empty shape."""
    queue = cl.CommandQueue(ctx_factory())

    zeros = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    ones = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    ones.fill(1)

    total = zeros + ones

    # Device -> host -> device must work as well.
    total_host = total.get()
    cl_array.to_device(queue, total_host)
def zeros(n, dtype, backend='cython'):
    """Return a wrapped zero-filled array of size ``n`` for ``backend``.

    ``'opencl'`` and ``'cuda'`` allocate on the device through pyopencl
    and pycuda respectively; any other backend gets a NumPy array.
    """
    if backend == 'cuda':
        import pycuda.gpuarray as gpuarray
        return wrap_array(gpuarray.zeros(n, dtype), backend)

    if backend == 'opencl':
        import pyopencl.array as gpuarray
        from .opencl import get_queue
        return wrap_array(gpuarray.zeros(get_queue(), n, dtype), backend)

    return wrap_array(np.zeros(n, dtype=dtype), backend)
def test_copy_buffer_rect(ctx_factory):
    """Copy a 2x3 block of ones into a 4x5 buffer at offset (1, 1)."""
    queue = cl.CommandQueue(ctx_factory())

    source = cl_array.zeros(queue, (2, 3), "f")
    target = cl_array.zeros(queue, (4, 5), "f")
    source.fill(1)

    # The region is passed as the source shape reversed.
    region = source.shape[::-1]
    cl.enqueue_copy(
        queue, target.data, source.data,
        src_origin=(0, 0), dst_origin=(1, 1),
        region=region)
def execute(self, n_it=1, **kwargs):
    """Run ``n_it`` iterations and reduce every ``n_out``-th step.

    :param n_it: number of calls to the ``iterate`` kernel.
    :param kwargs: ``n_out`` (default 10) sets how often the
        ``reduce_decay`` kernel is run, e.g. 10 means every 10th iteration.
    :return: ``(time_axis, n_ex, self.p)`` where ``n_ex`` is the ratio of
        the two reduced sums per output step and ``self.p`` is the final
        grid reshaped to ``(ng, ng, ng)``.
    """
    n_out = kwargs.get('n_out', 10)

    queue = self.queue
    program = self.program
    local_size = self.local_size
    n_local = 512
    ng = self.ng

    # Bookkeeping for the output steps.
    i_out = 0
    total_out = (n_it // n_out + 1)
    time_axis = np.arange(total_out, dtype=np.float32) * self.t_step
    # NOTE(review): n_excited is filled here but never read afterwards.
    n_excited = np.zeros(total_out, dtype=np.float32)
    n_excited[0] = 1.0

    # n_local partial sums per output step for each of the two reductions.
    partial_shape = (n_local * total_out, )
    tmp_1 = cl_array.zeros(queue, partial_shape, dtype=np.float32)
    tmp_2 = cl_array.zeros(queue, partial_shape, dtype=np.float32)

    p = self.p_gp
    n = self.n_gp
    b = self.b_gp
    d = self.d_gp
    k = self.k_gp

    for time_i in range(n_it):
        # The two grid buffers are exchanged on odd iterations only.
        if time_i % 2 > 0:
            p, n = n, p
        program.iterate(queue, self.global_size_3d, local_size, n, p, d, k, b)

        if time_i % n_out == 0:
            program.reduce_decay(
                queue, self.global_size, self.local_size,
                p, k,
                cl.LocalMemory(n_local * 32), cl.LocalMemory(n_local * 32),
                np.int32(self.global_size[0]), np.int32(n_local),
                np.int32(i_out), np.float32(time_i),
                tmp_1.data, tmp_2.data)
            i_out += 1
        self.it += 1

    # Finish the reductions on the host: sum the partials of each step.
    dc = (tmp_1.map_to_host()).reshape((total_out, n_local)).sum(axis=1)
    ds = (tmp_2.map_to_host()).reshape((total_out, n_local)).sum(axis=1)
    n_ex = dc / ds

    cl.enqueue_copy(queue, self.p_np, self.p_gp)
    self.p = self.p_np.reshape((ng, ng, ng), order='C')
    return time_axis, n_ex, self.p
def zeros(n, dtype, backend='cython'):
    """Return an ``Array`` of ``n`` zeros allocated for ``backend``.

    For ``'opencl'`` and ``'cuda'`` the zeros live in a device array that
    is attached through ``set_dev_array``; every other backend wraps a
    NumPy array directly.
    """
    if backend not in ('opencl', 'cuda'):
        return Array(np.zeros(n, dtype=dtype))

    if backend == 'opencl':
        import pyopencl.array as gpuarray
        dev_array = gpuarray.zeros(get_queue(), n, dtype)
    else:
        import pycuda.gpuarray as gpuarray
        dev_array = gpuarray.zeros(n, dtype)

    wrapped_array = Array()
    wrapped_array.set_dev_array(dev_array)
    return wrapped_array
def test_CPU_vs_GPU_adj(self):
    """The adjoint operator must agree between the two queues/operators."""

    def run_adjoint(queue, operator):
        # Upload the input, apply the adjoint, map the result back once
        # the kernel event has completed.
        inp = clarray.to_device(queue, self.symdivin)
        out = clarray.zeros(queue, self.symgradin.shape, dtype=DTYPE)
        out.add_event(operator.adj(out, inp))
        return out.map_to_host(wait_for=out.events)

    outadj_CPU = run_adjoint(self.queue, self.symgrad)
    outadj_GPU = run_adjoint(self.queue_GPU, self.symgrad_GPU)

    np.testing.assert_allclose(outadj_CPU, outadj_GPU, rtol=RTOL, atol=ATOL)
def test_touch(self):
    """Compare the GPU ``touch`` kernel with a NumPy reference.

    Besides asserting equality, the test writes ``cpu_interspace.mrc``,
    ``gpu_interspace.mrc`` and ``diff.mrc`` to the working directory and
    prints the voxel counts for inspection.
    """
    MAX_CLASH = 100 + 0.9
    MIN_INTER = 300 + 0.9

    # The test always uses the first rotation. The random index computed
    # previously was never used, and its upper bound was one too large.
    rotmat = self.rotations[0]

    # --- CPU reference ---
    cpu_lsurf = np.zeros_like(self.im_lsurf.array)
    disvis.libdisvis.rotate_image3d(self.im_lsurf.array, self.vlength,
                                    np.linalg.inv(rotmat), self.im_center,
                                    cpu_lsurf)
    cpu_clashvol = numpy.fft.irfftn(
        numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rcore.array))

    # --- GPU clash volume ---
    gpu_rcore = cl_array.to_device(
        self.queue, np.asarray(self.rcore.array, dtype=np.float32))
    gpu_im_lsurf = cl.image_from_array(
        self.queue.context, np.asarray(self.im_lsurf.array, dtype=np.float32))
    gpu_lsurf = cl_array.zeros(self.queue, self.shape, dtype=np.float32)
    self.kernels.rotate_image3d(self.queue, self.sampler, gpu_im_lsurf,
                                rotmat, gpu_lsurf, self.im_center)

    gpu_ft_lsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
    gpu_ft_rcore = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
    gpu_ft_clashvol = cl_array.zeros(self.queue, self.ft_shape,
                                     dtype=np.complex64)
    gpu_clashvol = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

    self.kernels.rfftn(self.queue, gpu_rcore, gpu_ft_rcore)
    self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf)
    self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rcore,
                                 gpu_ft_clashvol)
    self.kernels.irfftn(self.queue, gpu_ft_clashvol, gpu_clashvol)

    # --- Interaction volume, CPU and GPU ---
    cpu_intervol = numpy.fft.irfftn(
        numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rsurf.array))

    gpu_rsurf = cl_array.to_device(
        self.queue, np.asarray(self.rsurf.array, dtype=np.float32))
    gpu_ft_rsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
    gpu_ft_intervol = cl_array.zeros(self.queue, self.ft_shape,
                                     dtype=np.complex64)
    gpu_intervol = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

    cpu_interspace = np.zeros(self.shape, dtype=np.int32)
    gpu_interspace = cl_array.zeros(self.queue, self.shape, dtype=np.int32)

    self.kernels.rfftn(self.queue, gpu_rsurf, gpu_ft_rsurf)
    self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf)
    self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rsurf,
                                 gpu_ft_intervol)
    self.kernels.irfftn(self.queue, gpu_ft_intervol, gpu_intervol)

    # --- Touch: few clashes AND enough interaction ---
    self.kernels.touch(self.queue, gpu_clashvol, MAX_CLASH, gpu_intervol,
                       MIN_INTER, gpu_interspace)
    np.logical_and(cpu_clashvol < MAX_CLASH, cpu_intervol > MIN_INTER,
                   cpu_interspace)

    # Dump both results and their difference for visual inspection.
    spacing = self.im_lsurf.voxelspacing
    origin = self.im_lsurf.origin
    disvis.volume.Volume(cpu_interspace, spacing,
                         origin).tofile('cpu_interspace.mrc')
    disvis.volume.Volume(gpu_interspace.get(), spacing,
                         origin).tofile('gpu_interspace.mrc')
    disvis.volume.Volume(cpu_interspace - gpu_interspace.get(), spacing,
                         origin).tofile('diff.mrc')

    print()
    print(cpu_interspace.sum(), gpu_interspace.get().sum())
    print(np.abs(cpu_interspace - gpu_interspace.get()).sum())

    self.assertTrue(np.allclose(gpu_interspace.get(), cpu_interspace))
def _allocate_arrays(self):
    """Allocate the device arrays for frames, sums and the final output."""
    queue = self.queue

    # Stack of input frames: (nframes,) + frame shape.
    frames_shape = (self.nframes, ) + self.shape
    self.d_frames = parray.zeros(queue, frames_shape, self.dtype)
    self._old_d_frames = None

    # Two accumulators of identical shape but different dtypes.
    self.d_sums = parray.zeros(queue, self.output_shape, self.sums_dtype)
    self.d_sums_f = parray.zeros(queue, self.output_shape, self.output_dtype)

    # Result: one row per bin, one column per frame.
    self.d_output = parray.zeros(queue, (self.n_bins, self.nframes),
                                 np.float32)
def test_copy_buffer_rect(ctx_factory):
    """Rectangular copy of a small ones-array into a larger zero array."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    _xfail_if_pocl_gpu(queue.device, "rectangular copies")

    small = cl_array.zeros(queue, (2, 3), "f")
    large = cl_array.zeros(queue, (4, 5), "f")
    small.fill(1)

    copy_args = dict(
        src_origin=(0, 0),
        dst_origin=(1, 1),
        # region is the source shape reversed
        region=small.shape[::-1],
    )
    cl.enqueue_copy(queue, large.data, small.data, **copy_args)
def init_data_periodic(self):
    """Allocate the additional arrays needed for periodic simulations."""
    queue = self.queue
    # Nine entries per grid square.
    neighbour_shape = (self.n_sqs*9,)

    # Connectivity of the periodic grid (host and device copies).
    self.needle_sq_neighbour_inds = numpy.zeros(neighbour_shape, numpy.int32)
    self.needle_sq_neighbour_inds_dev = cl_array.zeros(
        queue, neighbour_shape, numpy.int32)
    self.needle_sq_neighbour_offset_inds = numpy.zeros(
        neighbour_shape, numpy.int32)
    self.needle_sq_neighbour_offset_inds_dev = cl_array.zeros(
        queue, neighbour_shape, numpy.int32)

    # Offset vectors for computing cell images.
    self.offset_vecs = numpy.zeros((9,), vec.float4)
    self.offset_vecs_dev = cl_array.zeros(queue, (9,), vec.float4)
def initArrays(self):
    """Allocate the host and device arrays for grid coupling and species."""
    queue = self.queue
    f32 = numpy.float32
    i32 = numpy.int32

    # Eight grid entries per cell.
    cell_corner_shape = (self.maxCells, 8)
    self.gridIdxs = numpy.zeros(cell_corner_shape, dtype=i32)
    self.gridIdxs_dev = cl_array.zeros(queue, cell_corner_shape, dtype=i32)
    self.triWts = numpy.zeros(cell_corner_shape, dtype=f32)
    self.triWts_dev = cl_array.zeros(queue, cell_corner_shape, dtype=f32)

    # Signal rates per cell, grid entry and signal.
    sig_rate_shape = (self.maxCells, 8, self.nSignals)
    self.cellSigRates = numpy.zeros(sig_rate_shape, dtype=f32)
    self.cellSigRates_dev = cl_array.zeros(queue, sig_rate_shape, dtype=f32)

    # Signal levels per cell and on the grid.
    sig_level_shape = (self.maxCells, self.nSignals)
    self.cellSigLevels = numpy.zeros(sig_level_shape, dtype=f32)
    self.cellSigLevels_dev = cl_array.zeros(queue, sig_level_shape, dtype=f32)
    self.signalLevel_dev = cl_array.zeros(queue, self.gridDim, dtype=f32)

    # Species levels and rates per cell (device only).
    species_shape = (self.maxCells, self.nSpecies)
    self.specLevel_dev = cl_array.zeros(queue, species_shape, dtype=f32)
    self.specRate_dev = cl_array.zeros(queue, species_shape, dtype=f32)

    # Cell types.
    self.celltype = numpy.zeros((self.maxCells, ), dtype=i32)
    self.celltype_dev = cl_array.zeros(queue, (self.maxCells, ), dtype=i32)
def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False):
    """Uniform floats drawn by ``rng_class`` must lie inside their range."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    double_ok = has_double_support(context.devices[0])
    dtypes = [np.float32, np.float64] if double_ok else [np.float32]

    # Ranlux is built from a queue, every other generator from a context.
    if rng_class is RanluxGenerator:
        gen = rng_class(queue, 5120)
    else:
        gen = rng_class(context)

    def maybe_plot(device_array):
        if plot_hist:
            import matplotlib.pyplot as pt
            pt.hist(device_array.get(), 30)
            pt.show()

    for dtype in dtypes:
        print(dtype)

        # Default range [0, 1].
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran)
        maybe_plot(ran)

        assert (0 <= ran.get()).all()
        assert (ran.get() <= 1).all()

        if rng_class is RanluxGenerator:
            gen.synchronize(queue)

        # Explicit range [4, 7]; show the offending values before failing.
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran, a=4, b=7)
        ran_host = ran.get()

        for cond in (4 <= ran_host, ran_host <= 7):
            good = cond.all()
            if not good:
                print(np.where(~cond))
                print(ran_host[~cond])
            assert good

        # Normal variates are only drawn (and optionally plotted).
        ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)
        maybe_plot(ran)
def _allocate_memory(self):
    """Allocate the filter, the sinogram work arrays and the copy buffers."""
    # One complex filter coefficient per entry of the last transform axis.
    filter_shape = (self.sino_f_shape[-1],)
    self.d_filter_f = parray.zeros(self.queue, filter_shape, np.complex64)
    self.is_cpu = (self.device.type == "CPU")

    if self.fft_backend != "opencl":
        # When using the numpy backend, arrays are not pre-allocated
        self.d_sino_padded = np.zeros(self.sino_padded_shape, "f")
        self.d_sino_f = np.zeros(self.sino_f_shape, np.complex64)
    else:
        # These are already allocated by FFT() if using the opencl backend
        self.d_sino_padded = self.fft.data_in
        self.d_sino_f = self.fft.data_out

    # Device/host pair needed for rectangular memcpy in certain cases.
    self.tmp_sino_device = parray.zeros(self.queue, self.sino_shape, "f")
    self.tmp_sino_host = np.zeros(self.sino_shape, "f")
def test_dot_fwdgrad(self):
    """Evaluate the forward gradient of a dot product (no value check)."""
    x = expr.Variable('x')
    y = expr.Variable('y')
    dotprod = op.Dot(x, y)

    # Random float32 inputs of length 10 for both operands.
    valuation = pl.valuation()
    length = (10, )
    nx = np.random.uniform(0, 1, length).astype(np.float32)
    ny = np.random.uniform(0, 1, length).astype(np.float32)
    valuation['x'] = nx
    valuation['y'] = ny

    # Seed direction: ones for x, zeros for y.
    queue = pl.qs[0]
    xw = clarray.zeros(queue, length, dtype=np.float32) + 1.0
    yw = clarray.zeros(queue, length, dtype=np.float32)

    gddot = dotprod.fwd_grad({'x': xw, 'y': yw}, valuation)
    ddot = gddot.get()
def __init__(self, sino_shape, slice_shape=None, axis_position=None,
             angles=None, ctx=None, devicetype="all", platformid=None,
             deviceid=None, profile=False):
    """Build a matching projector/backprojector pair and the work arrays.

    :param sino_shape: shape of the sinogram; also the shape of
        ``d_data`` and ``d_sino``.
    :param slice_shape: forwarded to ``Backprojection``.
    :param axis_position: forwarded to both operators.
    :param angles: forwarded to ``Backprojection``.
    :param ctx, devicetype, platformid, deviceid, profile: forwarded to
        ``OpenclProcessing.__init__``.
    """
    OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                              platformid=platformid, deviceid=deviceid,
                              profile=profile)

    # The backprojector is created first; the projector is then configured
    # from the backprojector's slice shape, angles and number of bins.
    backprojector = Backprojection(sino_shape, slice_shape=slice_shape,
                                   axis_position=axis_position,
                                   angles=angles, ctx=self.ctx,
                                   profile=profile)
    self.backprojector = backprojector
    self.projector = Projection(backprojector.slice_shape,
                                backprojector.angles,
                                axis_position=axis_position,
                                detector_width=backprojector.num_bins,
                                normalize=False, ctx=self.ctx,
                                profile=profile)
    self.sino_shape = sino_shape
    self.is_cpu = backprojector.is_cpu

    # Sinogram-sized and slice-sized device arrays.
    self.d_data = parray.zeros(self.queue, sino_shape, dtype=np.float32)
    self.d_sino = parray.zeros_like(self.d_data)
    self.d_x = parray.zeros(self.queue, backprojector.slice_shape,
                            dtype=np.float32)
    self.d_x_old = parray.zeros_like(self.d_x)

    names = ("d_data", "d_sino", "d_x", "d_x_old")
    self.add_to_cl_mem({name: getattr(self, name) for name in names})
def pad(image, region=None, out=None, value=0, queue=None, block=False):
    """Pad a 2D *image*.

    *region* is the region to pad as (y_0, x_0, height, width). If not
    specified, the next power of two dimensions are used and the image is
    centered in the padded one. The final image dimensions are height x
    width and the filling starts at (y_0, x_0), *out* is the pyopencl Array
    instance, if not specified it will be created. *out* is also returned.
    *value* is the padded value. If *block* is True, wait for the copy to
    finish.
    """
    if region is None:
        shape = tuple([next_power_of_two(n) for n in image.shape])
        # Floor division keeps the offsets integral. With "/" they become
        # floats on Python 3 and end up in the copy origin below.
        y_0 = (shape[0] - image.shape[0]) // 2
        x_0 = (shape[1] - image.shape[1]) // 2
        region = (y_0, x_0) + shape
    if queue is None:
        queue = cfg.OPENCL.queue
    if out is None:
        out = cl_array.zeros(queue, (region[2], region[3]),
                             dtype=image.dtype) + value
    image = g_util.get_array(image, queue=queue)

    # The rectangular copy takes its x offset and row width in bytes, so
    # both are scaled by the item size.
    n_bytes = image.dtype.itemsize
    y_0, x_0, height, width = region
    src_origin = (0, 0, 0)
    dst_origin = (n_bytes * x_0, y_0, 0)
    region = (n_bytes * image.shape[1], image.shape[0], 1)
    LOG.debug('pad, shape: %s, src_origin: %s, dst_origin: %s, region: %s',
              image.shape, src_origin, dst_origin, region)

    _copy_rect(image, out, src_origin, dst_origin, region, queue, block=block)

    return out
def transfer_many(objects, shape, pixel_size, energy, exponent=False, offset=None, queue=None,
                  out=None, t=None, check=True, block=False):
    """Compute the combined transmission of several *objects*.

    The exponents of all objects are summed. If *exponent* is True only
    that sum is returned, otherwise it is passed through ``exp`` first.
    Use *shape* (y, x), *pixel_size*, *energy*, *offset* as (y, x), OpenCL
    command *queue*, *out* array and time *t*. The sampling is checked if
    *check* is True, and *block* is forwarded to every object's
    ``transfer`` call. Objects raising NotImplementedError are skipped
    with a debug message. The returned *out* array is different from the
    input one because of the pyopencl.clmath behavior.
    """
    queue = cfg.OPENCL.queue if queue is None else queue
    if out is None:
        out = cl_array.zeros(queue, shape, cfg.PRECISION.np_cplx)
    # Scratch array handed to each object as its output buffer.
    u_sample = cl_array.Array(queue, shape, cfg.PRECISION.np_cplx)
    lam = energy_to_wavelength(energy)

    for sample in objects:
        try:
            out += sample.transfer(shape, pixel_size, energy, exponent=True,
                                   offset=offset, t=t, queue=queue,
                                   out=u_sample, check=False, block=block)
        except NotImplementedError:
            LOG.debug('%s does not support real space transfer', sample)

    if check:
        if not is_wavefield_sampling_ok(out, queue=queue):
            LOG.error('Insufficient transmission function sampling')

    # Apply the exponent
    if exponent:
        return out
    return clmath.exp(out, queue=queue)
def allocate_arrays(self):
    """Allocate the host (numpy) and device (parray) arrays used by the tests."""
    image_shape = self.image.shape
    queue = self.la.queue

    # Host side: empty outputs plus the reference gradient and divergence
    self.grad = np.zeros(image_shape, dtype=np.complex64)
    self.grad2 = np.zeros((2,) + image_shape, dtype=np.float32)
    self.grad_ref = gradient(self.image)
    self.div_ref = divergence(self.grad_ref)
    self.image2 = np.zeros_like(self.image)

    # Device side. A plain cl.Buffer(self.la.ctx, cl.mem_flags.READ_WRITE,
    # size=self.image.nbytes*2) would be preferable, but platforms without OpenCL 1.2 have a
    # problem with enqueue_fill_buffer, so the buffer is taken from a zero-filled parray.
    self.gradient_parray = parray.zeros(queue, image_shape, np.complex64)
    self.gradient_buffer = self.gradient_parray.data

    # Same approach for the image itself
    self.image_parray = parray.to_device(queue, self.image)
    self.image_buffer = self.image_parray.data

    # Reference gradient packed into one complex array:
    # component 0 goes to the real part, component 1 to the imaginary part
    packed_ref = np.zeros(image_shape, dtype=np.complex64)
    packed_ref.real = np.copy(self.grad_ref[0])
    packed_ref.imag = np.copy(self.grad_ref[1])
    self.grad_ref_parray = parray.to_device(queue, packed_ref)
    self.grad_ref_buffer = self.grad_ref_parray.data
def test_2d_real_to_complex_double(self, ctx):
    """Check a double precision 2D real-to-complex FFT against ``np.fft.rfft2``."""
    if not has_double(ctx):  # TODO: find better way to skip test
        return

    queue = cl.CommandQueue(ctx)

    M = 64
    N = 32

    nd_data = np.arange(M * N, dtype=np.float64)
    nd_data.shape = (M, N)
    cl_data = cla.to_device(queue, nd_data)

    # Real-to-complex output is (M, N // 2 + 1), the same shape np.fft.rfft2 produces
    cl_data_transformed = cla.zeros(queue, (M, N // 2 + 1), dtype=np.complex128)

    transform = FFT(ctx, queue, cl_data, cl_data_transformed, axes=(1, 0))
    transform.enqueue()

    # Call get() so that the transformed data is printed and not the bound method object
    result = cl_data_transformed.get()
    expected = np.fft.rfft2(nd_data)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-8, atol=1e-8)
def test_2d_real_to_complex(self, ctx):
    """Check a single precision 2D real-to-complex FFT against ``np.fft.rfft2``."""
    queue = cl.CommandQueue(ctx)

    M = 64
    N = 32

    nd_data = np.arange(M * N, dtype=np.float32)
    nd_data.shape = (M, N)
    cl_data = cla.to_device(queue, nd_data)

    # Real-to-complex output is (M, N // 2 + 1), the same shape np.fft.rfft2 produces
    cl_data_transformed = cla.zeros(queue, (M, N // 2 + 1), dtype=np.complex64)

    transform = FFT(ctx, queue, cl_data, cl_data_transformed, axes=(1, 0))
    transform.enqueue()

    # Call get() so that the transformed data is printed and not the bound method object
    result = cl_data_transformed.get()
    expected = np.fft.rfft2(nd_data)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
def build(self, coords, values, base):
    """Use OpenCL to build the arrays.

    *coords*, *values* and *base* are copied to the device and handed to the ``nearest``
    kernel together with an int32 result buffer of ``base.shape[0]`` elements, the number of
    coordinates, ``self.nnear`` and ``self.usemajority``. The kernel is launched with
    ``base.shape`` as the global size.

    If waiting for the kernel fails with the "out of resources" error, a hint is printed and
    the process exits with status 1; any other ``cl.RuntimeError`` is re-raised.

    NOTE(review): the result buffer is neither read back nor returned in this block - confirm
    against the callers.
    """
    lenbase = base.shape[0]
    lencoords = coords.shape[0]
    coords_array = cla.to_device(self.queue, coords)
    values_array = cla.to_device(self.queue, values)
    base_array = cla.to_device(self.queue, base)
    template_array = cla.zeros(self.queue, lenbase, dtype=np.int32)
    event = self.program.nearest(
        self.queue,
        base.shape,
        None,
        coords_array.data,
        values_array.data,
        base_array.data,
        template_array.data,
        np.int32(lencoords),
        self.nnear,
        self.usemajority,
    )
    try:
        event.wait()
    except cl.RuntimeError as inst:
        if str(inst) != "clWaitForEvents failed: out of resources":
            # Not the known time-out: propagate unchanged, keeping the traceback
            raise
        print("OpenCL timed out, probably due to the display manager.")
        print("Disable your display manager and try again!")
        print("If that does not work, rerun with OpenCL disabled.")
        sys.exit(1)
def test_fancy_indexing(ctx_factory):
    """Assignment through an index array must give the same result on device and host."""
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    queue = cl.CommandQueue(ctx_factory())

    # Host reference
    # NOTE(review): the source holds two values while the index array holds three entries;
    # recent numpy versions may reject such an assignment - confirm the intended lengths.
    host_dest = np.zeros((4,), np.int32)
    host_idx = np.arange(3, 0, -1, dtype=np.int32)
    host_src = np.arange(8, 10, dtype=np.int32)
    host_dest[host_idx] = host_src

    # Same operation on the device
    dev_dest = cl_array.zeros(queue, (4,), np.int32)
    dev_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32)
    dev_src = cl_array.arange(queue, 8, 10, dtype=np.int32)
    dev_dest[dev_idx] = dev_src

    assert np.all(host_dest == dev_dest.get())

    # Change the indices so that one of them repeats and assign again
    for position, new_index in ((1, 3), (2, 2)):
        dev_idx[position] = new_index
    for position, new_index in ((1, 3), (2, 2)):
        host_idx[position] = new_index

    host_dest[host_idx] = host_src
    dev_dest[dev_idx] = dev_src

    assert np.all(host_dest == dev_dest.get())
def compute_slices(self, shape, pixel_size, queue=None, out=None, offset=None):
    """Compute slices with *shape* as (z, y, x) and *pixel_size*. *queue* and *out* are used
    for the output, defaults are ``cfg.OPENCL.queue`` and a new uint8 array of *shape*.
    *offset* is the starting point offset as (x, y, z). *out* is returned.
    """
    queue = cfg.OPENCL.queue if queue is None else queue
    if out is None:
        out = cl_array.zeros(queue, shape, dtype=np.uint8)

    pixel_size = make_tuple(pixel_size, num_dims=2)
    v_1, v_2, v_3 = self._make_inputs(queue, pixel_size)
    psm = pixel_size.simplified.magnitude
    ps_y, ps_x = psm[0], psm[1]
    max_dx = self.max_triangle_x_diff.simplified.magnitude / ps_x

    if offset is not None:
        # x and z are divided by the x pixel size, y by the y pixel size
        off = offset.simplified.magnitude
        kernel_offset = gutil.make_vfloat3(off[0] / ps_x, off[1] / ps_y, off[2] / ps_x)
    else:
        kernel_offset = gutil.make_vfloat3(0, 0, 0)

    # Global size is (x, z); the y extent is passed to the kernel as an argument
    global_size = (shape[2], shape[0])
    cfg.OPENCL.programs['mesh'].compute_slices(queue,
                                               global_size,
                                               None,
                                               v_1.data,
                                               v_2.data,
                                               v_3.data,
                                               out.data,
                                               np.int32(shape[1]),
                                               np.int32(self.num_triangles),
                                               kernel_offset,
                                               cfg.PRECISION.np_float(max_dx))

    return out
def _transfer(
    self,
    shape,
    pixel_size,
    energy,
    offset,
    exponent=False,
    t=None,
    queue=None,
    out=None,
    check=True,
    block=False,
):
    """Transfer function implementation based on a refractive index."""
    if out is not None:
        # transfer_many adds values into *out*, so it has to start from zeros
        out.fill(0)
    else:
        out = cl_array.zeros(queue, shape, dtype=cfg.PRECISION.np_cplx)

    forwarded = dict(
        offset=offset,
        exponent=exponent,
        queue=queue,
        out=out,
        t=t,
        check=check,
        block=block,
    )

    return transfer_many(self.bodies, shape, pixel_size, energy, **forwarded)
def __init__(self, label, rng, input, nin, nout, W=None, b=None,
             activation_fn=op.Sigmoid):
    """Build the layer expression ``activation_fn(dot(input, W) + b)``.

    *label* is appended to the variable names 'W' and 'b'. If *W* is not given, it is drawn
    with ``rng.uniform`` from +-sqrt(6 / (nin + nout)) with shape (nin, nout) as float32,
    multiplied by 4 for the sigmoid activation and copied to the device. If *b* is not given,
    a zeroed float32 device array of *nout* elements is used. If *activation_fn* is None, the
    output is the linear expression itself.
    """
    queue = pl.qs[0]
    self.input = input

    if W is None:
        bound = np.sqrt(6. / (nin + nout))
        initial = np.asarray(rng.uniform(low=-bound, high=bound, size=(nin, nout)),
                             dtype=np.float32)
        if activation_fn == op.Sigmoid:
            initial *= 4
        W = clarray.to_device(queue, initial)

    if b is None:
        b = clarray.zeros(queue, (nout, ), np.float32)

    self.W = W
    self.b = b

    weight_var = expr.Variable('W' + label)
    bias_var = expr.Variable('b' + label)
    linear = op.Add(op.Dot(self.input, weight_var), bias_var)

    if activation_fn is None:
        self.output = linear
    else:
        self.output = activation_fn(linear)

    self.params = [(weight_var.name, self.W), (bias_var.name, self.b)]
def zeros_cl(queue, shape):
    """
    Allocate a float32 array of zeros directly in device memory.

    Parameters
    ----------
    queue
        PyOpenCL queue.
    shape : tuple
        Dimensions of the array.

    Returns
    -------
    pyopencl.array.Array
        Array of zeros with dtype float32.

    Examples
    --------
    >>> a = zeros_cl(queue, (3, 2))
    >>> a.shape
    (3, 2)
    >>> type(a)
    <class 'pyopencl.array.Array'>
    """
    zeroed = cl_array.zeros(queue, shape, dtype=float32)
    return zeroed
def pad(image, region=None, out=None, value=0, queue=None, block=False):
    """Place a 2D *image* inside a larger array filled with *value*.

    *region* is (y_0, x_0, height, width): the result is height x width and the image is
    copied in starting at (y_0, x_0). Without *region* the result has the next power of two
    dimensions and the image is centered in it. *out* is the pyopencl Array to fill, it is
    created when not given and it is always returned. *queue* defaults to
    ``cfg.OPENCL.queue``. If *block* is True, wait for the copy to finish.
    """
    if region is None:
        padded_shape = tuple(map(next_power_of_two, image.shape))
        top = (padded_shape[0] - image.shape[0]) // 2
        left = (padded_shape[1] - image.shape[1]) // 2
        region = (top, left) + padded_shape

    queue = cfg.OPENCL.queue if queue is None else queue

    if out is None:
        out_shape = (region[2], region[3])
        out = cl_array.zeros(queue, out_shape, dtype=image.dtype) + value

    image = g_util.get_array(image, queue=queue)
    itemsize = image.dtype.itemsize
    # Unpacking all four entries also checks that *region* has the documented layout
    top, left, _height, _width = region

    # Horizontal quantities are given in bytes, vertical ones in rows
    src_origin = (0, 0, 0)
    dst_origin = (itemsize * left, top, 0)
    copy_region = (itemsize * image.shape[1], image.shape[0], 1)

    LOG.debug(
        "pad, shape: %s, src_origin: %s, dst_origin: %s, region: %s",
        image.shape,
        src_origin,
        dst_origin,
        copy_region,
    )
    _copy_rect(image, out, src_origin, dst_origin, copy_region, queue, block=block)

    return out
def test_map_to_host(ctx_factory):
    """Writes through the device array and through its host mapping must both be visible."""
    if _PYPY:
        pytest.skip("numpypy: no array creation from __array_interface__")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # On GPUs allocate with ALLOC_HOST_PTR, otherwise use the default allocator
    allocator = None
    if context.devices[0].type & cl.device_type.GPU:
        flags = cl.mem_flags.READ_WRITE | cl.mem_flags.ALLOC_HOST_PTR
        allocator = cl_tools.DeferredAllocator(context, flags)

    a_dev = cl_array.zeros(queue, (5, 6, 7,), dtype=np.float32, allocator=allocator)

    # One write from the device side ...
    a_dev[3, 2, 1] = 10
    # ... and one through the host mapping
    a_host = a_dev.map_to_host()
    a_host[1, 2, 3] = 10

    # Keep a copy of the mapped data before the mapping is released
    a_host_saved = a_host.copy()
    a_host.base.release(queue)

    a_dev.finish()

    print("DEV[HOST_WRITE]", a_dev.get()[1, 2, 3])
    print("HOST[DEV_WRITE]", a_host_saved[3, 2, 1])

    assert (a_host_saved == a_dev.get()).all()
def ones_cl(queue, shape):
    """
    Allocate a float32 array of ones directly in device memory.

    Parameters
    ----------
    queue
        PyOpenCL queue.
    shape : tuple
        Dimensions of the array.

    Returns
    -------
    pyopencl.array.Array
        Array of ones with dtype float32.

    Examples
    --------
    >>> a = ones_cl(queue, (3, 2))
    >>> a.shape
    (3, 2)
    >>> type(a)
    <class 'pyopencl.array.Array'>
    """
    # Start from a zeroed array and overwrite every element with 1.0
    ones = cl_array.zeros(queue, shape, dtype=float32)
    ones.fill(1.0)
    return ones
def CalcF(ctx, queue, m2, r2):
    """Run the ``CalcF`` kernel on *m2* and *r2* and return the result as a host array.

    The grid is square with side ``m2.shape[0]`` and the work-group size is fixed at 16 x 16.
    NOTE(review): the inputs are uploaded as they are while the result buffer is float32;
    presumably they are expected to be float32 already - confirm against the callers.
    """
    # Square problem: both dimensions come from the first axis of m2
    side = m2.shape[0]
    xdim = side
    ydim = side

    # Compiled kernel for this size
    kernel = get_kernel(ctx, xdim)

    # Upload the inputs and allocate the output on the device
    m2_dev = cl_array.to_device(queue, m2)
    r2_dev = cl_array.to_device(queue, r2)
    result_dev = cl_array.zeros(queue, (ydim, xdim), np.float32)

    # One work-item per matrix element, in fixed 16 x 16 work-groups
    global_size = (ydim, xdim)
    local_size = (16, 16)

    done = kernel.CalcF(queue, global_size, local_size,
                        result_dev.data, m2_dev.data, r2_dev.data)
    done.wait()

    # Download the result, then let the queue drain
    host_result = result_dev.get()
    queue.finish()

    return host_result
def _transfer_fourier(
    self, shape, pixel_size, energy, t=None, queue=None, out=None, block=False
):
    """Return *out* unchanged, or a new zeroed complex array of *shape* if *out* is None.

    *pixel_size*, *energy*, *t* and *block* are accepted but not used here.
    """
    if out is not None:
        return out

    return cl_array.zeros(queue, shape, cfg.PRECISION.np_cplx)
def __init__(self, npixels_x, npixels_y, max_iterations):
    """Initialize the renderer.

    Stores the image size and the iteration limit, creates an OpenCL context and command
    queue, allocates a flat float32 result array with one element per pixel and builds the
    ``kernel_main`` kernel from the file 'julia.cl'. A RuntimeError is raised when the
    program cannot be compiled or the kernel is missing.
    """
    self.npixels_x = npixels_x
    self.npixels_y = npixels_y
    self.max_iterations = max_iterations

    # OpenCL context, queue and result storage
    self.context = cl.create_some_context()
    self.queue = cl.CommandQueue(self.context)
    n_pixels = self.npixels_x * self.npixels_y
    self.cl_res = cl_array.zeros(self.queue, (n_pixels,), np.float32)

    # Read and build the kernel program; any failure in either step is reported as a
    # compilation error
    with open('julia.cl') as source:
        try:
            kernel_code = source.read()
            self.program = cl.Program(self.context, kernel_code).build()
        except Exception as err:  # cl.cffi_cl.RuntimeError as err:
            raise RuntimeError('Could not compile program: {0}'.format(err))

    kernel = self.program.kernel_main
    self.kernel = kernel
    if not kernel:
        raise RuntimeError('Could not load program kernel (does file exist?)')

    # First argument is left untyped (None), the two scalars are float32 and int32
    self.kernel.set_scalar_arg_dtypes([None, np.float32, np.int32])
def test_subset_minmax(ctx_getter):
    """Compare ``cl_array.subset_min`` over an index subset with ``numpy.min``.

    Every pyopencl call receives the command queue first and the double support check
    receives the device, matching the other tests of this file.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    l_a = 200000
    gran = 5
    l_m = l_a - l_a // gran + 1

    if has_double_support(context.devices[0]):
        dtypes = [numpy.float64, numpy.float32, numpy.int32]
    else:
        dtypes = [numpy.float32, numpy.int32]

    for dtype in dtypes:
        a_gpu = clrand(queue, (l_a,), dtype)
        a = a_gpu.get()

        meaningful_indices_gpu = cl_array.zeros(queue, l_m, dtype=numpy.int32)
        meaningful_indices = meaningful_indices_gpu.get()

        # Index 0 followed by every index which is not a multiple of *gran*
        j = 0
        for i in range(len(meaningful_indices)):
            meaningful_indices[i] = j
            j = j + 1
            if j % gran == 0:
                j = j + 1

        meaningful_indices_gpu = cl_array.to_device(queue, meaningful_indices)
        b = a[meaningful_indices]

        min_a = numpy.min(b)
        min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()

        assert min_a_gpu == min_a
def _project(self, shape, pixel_size, offset, t=None, queue=None, out=None, block=False):
    """Projection implementation.

    The mesh is transformed and sorted first. If its extrema overlap the field of view given
    by *offset*, *shape* and *pixel_size*, the ``compute_thickness`` kernel is run on the
    overlapping pixel window and writes into *out*; otherwise *out* is returned untouched.
    *out* is created as a zeroed array of *shape* when it is not given. If *block* is True,
    wait for the kernel to finish.
    """
    def get_crop(index, fov):
        # Overlap of the object extent and the field of view along one axis, relative to the
        # offset (which is stored in reversed axis order)
        lower = max(self.extrema[index][0], fov[index][0])
        upper = min(self.extrema[index][1], fov[index][1])
        shift = offset[::-1][index]
        return lower - shift, upper - shift

    def get_px_value(value, round_func, ps):
        # Physical length -> whole number of pixels
        return int(round_func(get_magnitude(value / ps)))

    # Move to the desired location, apply the T matrix and resort the triangles
    self.transform()
    self.sort()

    psm = pixel_size.simplified.magnitude
    fov_end = offset + shape * pixel_size
    fov = np.concatenate((offset.simplified.magnitude[::-1],
                          fov_end.simplified.magnitude[::-1])).reshape(2, 2).transpose() * q.m

    if out is None:
        out = cl_array.zeros(queue, shape, dtype=cfg.PRECISION.np_float)

    inside_fov = (self.extrema[0][0] < fov[0][1] and self.extrema[0][1] > fov[0][0] and
                  self.extrema[1][0] < fov[1][1] and self.extrema[1][1] > fov[1][0])
    if not inside_fov:
        # Nothing of the object is visible, leave the output as it is
        return out

    # Object inside FOV: pixel window covered by it
    x_min, x_max = get_crop(0, fov)
    y_min, y_max = get_crop(1, fov)
    x_min_px = get_px_value(x_min, np.floor, pixel_size[1])
    x_max_px = get_px_value(x_max, np.ceil, pixel_size[1])
    y_min_px = get_px_value(y_min, np.floor, pixel_size[0])
    y_max_px = get_px_value(y_max, np.ceil, pixel_size[0])
    width = min(x_max_px - x_min_px, shape[1])
    height = min(y_max_px - y_min_px, shape[0])
    compute_offset = cl_array.vec.make_int2(x_min_px, y_min_px)

    v_1, v_2, v_3 = self._make_inputs(queue, pixel_size)
    to_float = cfg.PRECISION.np_float
    max_dx = self.max_triangle_x_diff.simplified.magnitude / psm[1]
    # Use the same pixel size as for the x-axis, which will work for objects "not too far"
    # from the imaging plane
    min_z = self.extrema[2][0].simplified.magnitude / psm[1]
    offset_px = gutil.make_vfloat2(*(offset / pixel_size).simplified.magnitude[::-1])

    ev = cfg.OPENCL.programs['mesh'].compute_thickness(queue,
                                                       (width, height),
                                                       None,
                                                       v_1.data,
                                                       v_2.data,
                                                       v_3.data,
                                                       out.data,
                                                       np.int32(self.num_triangles),
                                                       np.int32(shape[1]),
                                                       compute_offset,
                                                       offset_px,
                                                       to_float(psm[1]),
                                                       to_float(max_dx),
                                                       to_float(min_z),
                                                       np.int32(self.iterations))
    if block:
        ev.wait()

    return out
def init_data(self):
    """Create the zeroed per-cell arrays, each one on the host and on the OpenCL device.

    All arrays have ``self.max_cells`` elements. Cell centers are float4, areas, volumes,
    old volumes and growth rates are float32.
    """
    cell_geom = (self.max_cells,)

    def on_host(dtype):
        # Zeroed numpy array, one element per cell
        return numpy.zeros(cell_geom, dtype)

    def on_device(dtype):
        # Zeroed device array, one element per cell
        return cl_array.zeros(self.queue, cell_geom, dtype)

    # cell data
    self.cell_centers = on_host(vec.float4)
    self.cell_centers_dev = on_device(vec.float4)

    # cell geometry
    self.cell_areas_dev = on_device(numpy.float32)
    self.cell_areas = on_host(numpy.float32)
    self.cell_vols_dev = on_device(numpy.float32)
    self.cell_vols = on_host(numpy.float32)
    self.cell_old_vols_dev = on_device(numpy.float32)
    self.cell_old_vols = on_host(numpy.float32)
    self.cell_growth_rates = on_host(numpy.float32)
    self.cell_growth_rates_dev = on_device(numpy.float32)
def _project(self, shape, pixel_size, offset, t=None, queue=None, out=None, block=False):
    """Projection function implementation. *shape* and *pixel_size* are 2D.

    The projections of all ``self.bodies`` are added into *out*, which is created as a zeroed
    array when it is not given, and *out* is returned.
    """
    if out is None:
        out = cl_array.zeros(queue, shape, dtype=cfg.PRECISION.np_float)

    for body in self.bodies:
        # Every body projects into its own array (out=None), the sum is accumulated here
        contribution = body.project(shape, pixel_size, offset=offset, t=t, queue=queue,
                                    out=None, block=block)
        out += contribution

    return out
def _allocate_device(self):
    """Allocate the zeroed device array if it has not been allocated yet.

    For structure-of-arrays data (``self.soa``) the device shape is the reversed host shape.
    After the allocation the state is set to ``DeviceDataMixin.HOST``.
    """
    if self.state is not DeviceDataMixin.DEVICE_UNALLOCATED:
        return

    device_shape = tuple(reversed(self.shape)) if self.soa else self.shape
    self._device_data = array.zeros(_queue, shape=device_shape, dtype=self.dtype)
    self.state = DeviceDataMixin.HOST
def main():
    """Add two random 1024 x 1024 float32 matrices on the OpenCL device and time it.

    The measured time covers the upload, the kernel execution and the download of the
    result. The result and the elapsed time are printed.
    """
    # NOTE(review): the positional argument of create_some_context is presumably its
    # `interactive` flag, so 0 turns the prompt off rather than selecting device 0 - confirm
    # against the pyopencl documentation.
    ctx = cl.create_some_context(0)
    queue = cl.CommandQueue(ctx)

    # Define dimensions
    ydim = 1024
    xdim = 1024

    # Create the two random input matrices
    matrix = np.random.random((ydim, xdim))
    matrix = np.float32(matrix)
    matrix2 = np.random.random((ydim, xdim))
    matrix2 = np.float32(matrix2)

    # Get the compiled kernel
    kernel = get_kernel(ctx, xdim)

    # Start timing
    t1 = time.time()

    # Move data to the GPU
    gpu_matrix = cl_array.to_device(queue, matrix)
    gpu_matrix2 = cl_array.to_device(queue, matrix2)
    gpu_result = cl_array.zeros(queue, (ydim, xdim), np.float32)

    # Define grid shape (the same as the matrix dimensions)
    grid_shape = (ydim, xdim)

    # Fixed work-group shape
    group_shape = (16, 16)

    # Execute the kernel and wait for it to finish
    event = kernel.add(queue, grid_shape, group_shape,
                       gpu_result.data, gpu_matrix.data, gpu_matrix2.data)
    event.wait()

    # Move the result from GPU to CPU
    result = gpu_result.get()

    # Measure end time
    t2 = time.time()

    # Print result and execution time; print with one parenthesized argument behaves the
    # same on Python 2 and 3
    print(result)
    print("Elapsed: %f seconds " % (t2 - t1))

    # Let the queue drain before leaving
    queue.finish()
def test_vector_fill(ctx_factory):
    """Fill and zero-allocate arrays with the float4 vector dtype."""
    queue = cl.CommandQueue(ctx_factory())

    vec = cl_array.vec
    float4 = vec.float4

    # Filling with a vector value must keep the vector dtype on the way back to the host
    filled = cl_array.Array(queue, 100, dtype=float4)
    filled.fill(vec.make_float4(0.0, 0.0, 1.0, 0.0))
    host_copy = filled.get()
    assert host_copy.dtype == float4

    # Zero allocation with a vector dtype only has to run through
    a_gpu = cl_array.zeros(queue, 100, dtype=float4)
def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
             ctx=None, devicetype="all", platformid=None, deviceid=None,
             profile=False):
    """Set up the OpenCL processing base, a backprojector, a matching projector and the
    device arrays.

    *sino_shape*, *slice_shape*, *axis_position* and *angles* are passed to the
    backprojector; the projector takes its slice shape, angles and detector width from the
    backprojector. *ctx*, *devicetype*, *platformid*, *deviceid* and *profile* are passed to
    ``OpenclProcessing.__init__``.
    """
    OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                              platformid=platformid, deviceid=deviceid,
                              profile=profile)

    # Both operators share the context and the profiling setting
    shared = dict(ctx=self.ctx, profile=profile)

    # Create a backprojector
    self.backprojector = Backprojection(
        sino_shape,
        slice_shape=slice_shape,
        axis_position=axis_position,
        angles=angles,
        **shared
    )
    backprojector = self.backprojector

    # Create a projector
    self.projector = Projection(
        backprojector.slice_shape,
        backprojector.angles,
        axis_position=axis_position,
        detector_width=backprojector.num_bins,
        normalize=False,
        **shared
    )

    self.sino_shape = sino_shape
    self.is_cpu = backprojector.is_cpu

    # Arrays: two of the sinogram shape, two of the slice shape
    self.d_data = parray.zeros(self.queue, sino_shape, dtype=np.float32)
    self.d_sino = parray.zeros_like(self.d_data)
    self.d_x = parray.zeros(self.queue, backprojector.slice_shape, dtype=np.float32)
    self.d_x_old = parray.zeros_like(self.d_x)

    device_arrays = {
        "d_data": self.d_data,
        "d_sino": self.d_sino,
        "d_x": self.d_x,
        "d_x_old": self.d_x_old,
    }
    self.add_to_cl_mem(device_arrays)