Example #1
0
	def get_binned_data_stereographic(self,limits=((-1,1),(-1,1)),points=500):
		"""Stereographically project the measured ray endpoints and bin them.

		The projection is computed on the CL device by the
		``stereograph_project`` kernel; the binning itself is done on the
		host with ``np.histogram2d``.

		limits -- ((xmin,xmax),(ymin,ymax)) extent of the histogram.
		points -- number of bins along each axis.

		Returns the tuple (H, x_edges, y_edges) produced by
		``np.histogram2d`` and stores the same tuple in ``self.hist_data``.
		The histogram weights are the projected ray powers divided by the
		bin area dx*dy.
		"""
		(pos0,pwr0) = self.get_measured_rays()
		n_items = pwr0.shape

		# Upload the inputs and allocate one output slot per ray (all float32).
		pos0_dev = cl_array.to_device(self.queue,pos0.astype(np.float32))
		x_dev    = cl_array.zeros(self.queue,n_items,dtype=np.float32)
		y_dev    = cl_array.zeros(self.queue,n_items,dtype=np.float32)
		pwr0_dev = cl_array.to_device(self.queue,pwr0.astype(np.float32))
		pwr_dev  = cl_array.zeros(self.queue,n_items,dtype=np.float32)
		pivot    = cl_array.to_device(self.queue,np.array([0,0,0,0],dtype=np.float32))

		# 4x4 matrix whose upper-left 3x3 part is the identity; passed to the
		# kernel together with the zero pivot.
		# NOTE(review): presumably "no rotation about the origin" - confirm in the kernel source.
		R_dev = cl_array.to_device(self.queue,np.array([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,0]]).astype(np.float32))

		evt = self.prg.stereograph_project(self.queue, n_items, None, pos0_dev.data,pwr0_dev.data,R_dev.data,pivot.data,x_dev.data,y_dev.data,pwr_dev.data)
		evt.wait()

		x = x_dev.get()
		y = y_dev.get()
		pwr = np.float64(pwr_dev.get())

		# Divide by the bin area so that H holds power per unit area.
		dx = np.float64(limits[0][1]-limits[0][0])/np.float64(points)
		dy = np.float64(limits[1][1]-limits[1][0])/np.float64(points)
		pwr = pwr / (dx * dy)

		self.hist_data = np.histogram2d(x=x.flatten(),y=y.flatten(),bins=points,range=limits,weights=pwr.flatten())
		return self.hist_data
Example #2
0
def test_random(ctx_factory):
    """Check that RanluxGenerator samples stay inside the requested ranges.

    Uniform floats are checked for (0, 1) and (4, 7) on several array
    sizes; uniform int32 values are checked for [200, 300).
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import RanluxGenerator

    # float64 is only exercised when the first device supports it
    dtypes = [np.float32]
    if has_double_support(context.devices[0]):
        dtypes.append(np.float64)

    gen = RanluxGenerator(queue, 5120)

    for ary_size in (300, 301, 302, 303, 10007):
        for dtype in dtypes:
            unit_ran = cl_array.zeros(queue, ary_size, dtype)
            gen.fill_uniform(unit_ran)
            assert (0 < unit_ran.get()).all()
            assert (unit_ran.get() < 1).all()

            gen.synchronize(queue)

            scaled_ran = cl_array.zeros(queue, ary_size, dtype)
            gen.fill_uniform(scaled_ran, a=4, b=7)
            assert (4 < scaled_ran.get()).all()
            assert (scaled_ran.get() < 7).all()

            # only checks that drawing normal samples runs
            gen.normal(queue, (10007,), dtype, mu=4, sigma=3)

    int_ran = gen.uniform(queue, (10000007,), np.int32, a=200, b=300)
    assert (200 <= int_ran.get()).all()
    assert (int_ran.get() < 300).all()
Example #3
0
  def init_buffers(self, kernels):
    """Allocate the host and OpenCL device buffers used by the convolution.

    kernels -- mapping of cell -> {centre -> kernel}; it is only indexed
               here and handed on to ``self.kernels_to_buffers`` for every
               (cell, centre) pair.

    Raises Exception when ``kernels`` is None or empty.
    """
    if not kernels:
      raise Exception('No kernels found for OpenCL convolution')

    # Source image: uint8 on both host and device.
    self.source_host_buffer = numpy.zeros(self.image_width*self.image_height, dtype=numpy.uint8)
    self.source_gpu_buffer  = cl_array.zeros(self.queue, self.array_size, numpy.uint8)

    # Two float32 scratch buffers on each side, keyed by TMP1/TMP2.
    self.temporal_host_buffers = {}
    self.temporal_gpu_buffers = {}
    for key in (TMP1, TMP2):
      self.temporal_host_buffers[key] = numpy.zeros_like(self.source_host_buffer, dtype=numpy.float32)
    for key in (TMP1, TMP2):
      self.temporal_gpu_buffers[key] = cl_array.zeros(self.queue, self.array_size, numpy.float32)

    # Float32 result buffers.
    self.filtered_host_buffer = numpy.zeros_like(self.source_host_buffer, dtype=numpy.float32)
    self.filtered_gpu_buffer = cl_array.zeros(self.queue, self.array_size, numpy.float32)

    # Per-(cell, centre) kernel buffers and host-side result copies.
    self.kernel_host_buffers = {}
    self.kernel_gpu_buffers = {}
    self.filtered_host_back_buffers = {}

    for cell in kernels:
      self.kernel_host_buffers[cell] = {}
      self.kernel_gpu_buffers[cell] = {}
      self.filtered_host_back_buffers[cell] = {}
      for centre in kernels[cell]:
        self.kernels_to_buffers(kernels, cell, centre)
        self.filtered_host_back_buffers[cell][centre] = numpy.zeros_like(self.source_host_buffer,
                                                                         dtype=numpy.float32)
Example #4
0
        def _allocate_arrays(self):
            """Allocate the host result arrays and all device work arrays.

            Shapes are taken from ``self._target``; the complex ``_ft_*``
            arrays use the same shape with the first axis cut to n // 2 + 1.
            """
            full_shape = self._target.shape
            self._ft_shape = (full_shape[0] // 2 + 1,) + tuple(full_shape[1:])
            self._shape = full_shape

            # Host-side arrays
            self._lcc = np.zeros(full_shape, dtype=np.float32)
            self._rot = np.zeros(full_shape, dtype=np.int32)

            # Real-valued device arrays
            real_names = ('_target2', '_rot_template', '_rot_mask',
                          '_rot_mask2', '_gcc', '_ave', '_ave2', '_glcc')
            for attr in real_names:
                zeros_dev = cl_array.zeros(self._queue, self._shape,
                                           dtype=np.float32)
                setattr(self, attr, zeros_dev)
            self._grot = cl_array.zeros(self._queue, self._shape,
                                        dtype=np.int32)

            # Complex device arrays, uploaded from zero-filled host arrays
            ft_names = ('target', 'target2', 'template', 'mask', 'mask2',
                        'gcc', 'ave', 'ave2', 'lcc')
            for suffix in ft_names:
                host_zeros = np.zeros(self._ft_shape, dtype=np.complex64)
                setattr(self, '_ft_' + suffix,
                        cl_array.to_device(self._queue, host_zeros))
Example #5
0
	def get_binned_data_angular(self,limits=((-1,1),(-1,1)),points=500):
		"""Azimuth/elevation map the measured ray endpoints to a circle and bin them.

		The mapping is computed on the CL device by the ``angular_project``
		kernel (elevation is mapped linearly to the circle's radius and
		azimuth to phi, which is nice for cross-section plots of
		directivity); the binning itself is done on the host with
		``np.histogram2d``.

		limits -- ((xmin,xmax),(ymin,ymax)) extent of the histogram.
		points -- number of bins along each axis.

		Returns the tuple (H, x_edges, y_edges) produced by
		``np.histogram2d`` and stores the same tuple in ``self.hist_data``.
		The histogram weights are the projected ray powers divided by the
		bin area dx*dy.
		"""
		(pos0,pwr0) = self.get_measured_rays()
		n_items = pwr0.shape

		# Upload the inputs and allocate one output slot per ray (all float32).
		pos0_dev = cl_array.to_device(self.queue,pos0.astype(np.float32))
		x_dev    = cl_array.zeros(self.queue,n_items,dtype=np.float32)
		y_dev    = cl_array.zeros(self.queue,n_items,dtype=np.float32)
		pwr0_dev = cl_array.to_device(self.queue,pwr0.astype(np.float32))
		pwr_dev  = cl_array.zeros(self.queue,n_items,dtype=np.float32)
		pivot    = cl_array.to_device(self.queue,np.array([0,0,0,0],dtype=np.float32))

		# 4x4 matrix whose upper-left 3x3 part is the identity; passed to the
		# kernel together with the zero pivot.
		# NOTE(review): presumably "no rotation about the origin" - confirm in the kernel source.
		R_dev = cl_array.to_device(self.queue,np.array([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,0]]).astype(np.float32))

		evt = self.prg.angular_project(self.queue, n_items, None, pos0_dev.data,pwr0_dev.data,R_dev.data,pivot.data,x_dev.data,y_dev.data,pwr_dev.data)
		evt.wait()

		x = x_dev.get()
		y = y_dev.get()
		pwr = np.float64(pwr_dev.get())

		# Divide by the bin area so that H holds power per unit area.
		dx = np.float64(limits[0][1]-limits[0][0])/np.float64(points)
		dy = np.float64(limits[1][1]-limits[1][0])/np.float64(points)
		pwr = pwr / (dx * dy)

		self.hist_data = np.histogram2d(x=x.flatten(),y=y.flatten(),bins=points,range=limits,weights=pwr.flatten())
		return self.hist_data
Example #6
0
    def _setupVariables(self, x, data):
        """Upload ``x`` and ``data`` and allocate the per-step device arrays.

        Returns ``(step_out, tmp_results, step_in, data)`` where the first
        three are dicts of device arrays on ``self._queue[0]`` and ``data``
        is the uploaded copy of the input cast to ``self._DTYPE``.
        """
        queue = self._queue[0]
        data = clarray.to_device(queue, data.astype(self._DTYPE))

        x_dev = clarray.to_device(queue, x)
        step_in = {
            "x": x_dev,
            "xold": clarray.to_device(queue, x),
            "xk": x_dev.copy(),
        }

        step_out = {"x": clarray.zeros_like(x_dev)}

        x_shape = x_dev.shape
        tmp_results = {
            "gradFx": x_dev.copy(),
            "DADA": clarray.zeros_like(x_dev),
            "DAd": clarray.zeros_like(x_dev),
            "d": data.copy(),
            "Ax": clarray.zeros_like(data),
            "temp_reg": clarray.zeros_like(x_dev),
            # x-shaped with a trailing axis of length 4
            "gradx": clarray.zeros(queue, x_shape + (4,), dtype=self._DTYPE),
            # real-valued, x-shaped with a trailing axis of length 2
            "reg_norm": clarray.zeros(
                queue, x_shape + (2,), dtype=self._DTYPE_real
            ),
            "reg": clarray.zeros(queue, x_shape, dtype=self._DTYPE_real),
        }
        return (step_out, tmp_results, step_in, data)
Example #7
0
        def _allocate_arrays(self):
            """Create the CPU result arrays and every GPU work array.

            All real arrays share the shape of ``self._target``; the
            complex ``_ft_*`` arrays use ``self._ft_shape``, which is the
            target shape with its first axis replaced by n // 2 + 1.
            """

            def gpu_zeros(dtype):
                # Zero-filled device array with the target's shape
                return cl_array.zeros(self._queue, self._shape, dtype=dtype)

            # Shapes
            target_shape = self._target.shape
            self._ft_shape = (target_shape[0] // 2 + 1,) + tuple(target_shape[1:])
            self._shape = target_shape

            # CPU arrays
            self._lcc = np.zeros(target_shape, dtype=np.float32)
            self._rot = np.zeros(target_shape, dtype=np.int32)

            # GPU arrays (real)
            for name in ('_target2', '_rot_template', '_rot_mask', '_rot_mask2',
                         '_gcc', '_ave', '_ave2', '_glcc'):
                setattr(self, name, gpu_zeros(np.float32))
            self._grot = gpu_zeros(np.int32)

            # GPU arrays (complex), uploaded from zero-filled host arrays
            for name in ('target', 'target2', 'template', 'mask', 'mask2',
                         'gcc', 'ave', 'ave2', 'lcc'):
                blank = np.zeros(self._ft_shape, dtype=np.complex64)
                setattr(self, '_ft_' + name,
                        cl_array.to_device(self._queue, blank))
Example #8
0
def sum_labeled(src, labels, n=None, clq=None):
    """Run ``sum_labeled_dev`` on ``src``/``labels`` and return its result.

    With ``clq=None`` the inputs are treated as host arrays: a command
    queue is created on the module-level ``ctx``, a boolean ``src`` is
    converted to uint8, both arrays are uploaded, and the result is
    returned as a host array. With a queue given, ``src`` and ``labels``
    are used as device arrays as-is and the device result is returned.

    ``n`` is the length of the result array; it defaults to
    ``labels.max() + 1``.
    """
    return_dev = clq is not None
    if return_dev:
        src_dev = src
        labels_dev = labels
    else:
        clq = cl.CommandQueue(ctx)
        # numpy.bool_ is the boolean scalar type in every NumPy release;
        # the bare ``numpy.bool`` alias does not exist in NumPy 1.24-1.26.
        if src.dtype == numpy.bool_:
            src = src.astype(numpy.uint8)
        src_dev = cl_array.to_device(clq, src)
        labels_dev = cl_array.to_device(clq, labels)

    if n is None:
        n = labels_dev.max() + 1
    # One row of partial sums per work item, reduced into dst_dev.
    # NOTE(review): presumably how sum_labeled_dev uses tmp_dev - confirm.
    tmp_dev = cl_array.zeros(clq, (TOTAL_ITEMS, n), float32)
    dst_dev = cl_array.zeros(clq, (n, ), float32)

    sum_labeled_dev(clq, src_dev, labels_dev, tmp_dev, dst_dev)

    if return_dev:
        return dst_dev

    result = dst_dev.map_to_host()
    clq.finish()
    return result
Example #9
0
    def test_rotate_grid3d(self):
        """Run the ``rotate_grid3d`` kernel for two rotations and check the output.

        First an identity rotation of an all-ones grid, then a 90 degree
        rotation around the z-axis of a grid with two voxels set.
        """
        rotate = self.p.program.rotate_grid3d
        side = 2 * self.values["llength"] + 1
        gws = (side,) * 3

        # Identity rotation: 3x3 matrix flattened and padded to 16 floats
        identity = [1, 0, 0, 0, 1, 0, 0, 0, 1] + [0] * 7
        rotmat = np.asarray(identity, dtype=np.float32)
        self.cl_grid = cl_array.zeros(self.queue, self.shape, dtype=np.float32)
        self.cl_grid.fill(1)
        self.cl_out = cl_array.zeros(self.queue, self.shape, dtype=np.float32)
        rotate(self.queue, gws, None, self.cl_grid.data, rotmat, self.cl_out.data)
        expected = [
            [[1.0, 1.0, 1.0], [1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [1.0, 0.0, 0.0]],
            [[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
            [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
            [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
            [[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
        ]

        self.assertTrue(np.allclose(expected, self.cl_out.get()))

        # 90 degree rotation around z-axis
        quarter_turn = [0, -1, 0, 1, 0, 0, 0, 0, 1] + [0] * 7
        rotmat = np.asarray(quarter_turn, dtype=np.float32)
        host_grid = np.zeros(self.shape, dtype=np.float32)
        for voxel in ((0, 0, 0), (0, 0, 1)):
            host_grid[voxel] = 1
        self.cl_grid = cl_array.to_device(self.queue, host_grid)
        self.cl_out.fill(0)
        rotate(self.queue, gws, None, self.cl_grid.data, rotmat, self.cl_out.data)

        expected = np.zeros_like(host_grid)
        for voxel in ((0, 0, 0), (0, 1, 0)):
            expected[voxel] = 1
        self.assertTrue(np.allclose(expected, self.cl_out.get()))
Example #10
0
    def __init__(self, decomp, context, queue, grid_shape, dtype):
        """Allocate the transform arrays and build the two gpyfft plans.

        ``self.fx`` has shape ``grid_shape`` and the given ``dtype``;
        ``self.fk`` has shape ``self.shape(is_real)`` and the matching
        complex dtype. ``self.forward`` transforms fx into fk and
        ``self.backward`` transforms fk into fx.
        """
        from pystella.fourier import (
            get_complex_dtype_with_matching_prec,
            get_real_dtype_with_matching_prec,
        )

        self.decomp = decomp
        self.grid_shape = grid_shape
        self.dtype = np.dtype(dtype)
        # a floating-point (kind "f") dtype selects the real transform
        is_real = self.dtype.kind == "f"
        self.is_real = is_real

        cdtype = get_complex_dtype_with_matching_prec(self.dtype)
        self.cdtype = cdtype
        self.rdtype = get_real_dtype_with_matching_prec(self.dtype)

        self.fx = cla.zeros(queue, grid_shape, dtype)
        self.fk = cla.zeros(queue, self.shape(is_real), cdtype)

        from gpyfft import FFT
        plan_options = dict(real=is_real, scale_forward=1, scale_backward=1)
        self.forward = FFT(context, queue, self.fx, out_array=self.fk,
                           **plan_options)
        self.backward = FFT(context, queue, self.fk, out_array=self.fx,
                            **plan_options)

        # one empty slice specification per axis
        slc = ((),) * 3
        self.sub_k = get_sliced_momenta(grid_shape, self.dtype, slc, queue)
Example #11
0
    def test_clashvol(self):
        """Compare the GPU clash volume with a NumPy FFT reference.

        A randomly chosen rotation from ``self.rotations`` is applied to
        the scanning surface on the CPU and on the GPU; both results are
        cross-correlated with the receptor core via real FFTs and the two
        clash volumes must agree within ``atol=0.8``.
        """

        # randint's upper bound is exclusive, so every draw is a valid
        # index into self.rotations.
        nrot = np.random.randint(self.rotations.shape[0])
        rotmat = self.rotations[nrot]

        # CPU reference
        cpu_lsurf = np.zeros_like(self.im_lsurf.array)
        disvis.libdisvis.rotate_image3d(self.im_lsurf.array, self.vlength, np.linalg.inv(rotmat), self.im_center, cpu_lsurf)

        cpu_clashvol = numpy.fft.irfftn(numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rcore.array), s=self.shape)

        # GPU computation
        gpu_rcore = cl_array.to_device(self.queue, np.asarray(self.rcore.array, dtype=np.float32))
        gpu_im_lsurf = cl.image_from_array(self.queue.context, np.asarray(self.im_lsurf.array, dtype=np.float32))
        gpu_lsurf = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

        self.kernels.rotate_image3d(self.queue, self.sampler, gpu_im_lsurf, rotmat, gpu_lsurf, self.im_center)

        gpu_ft_lsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_ft_rcore = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_ft_clashvol = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_clashvol = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

        self.kernels.rfftn(self.queue, gpu_rcore, gpu_ft_rcore)
        self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf)
        self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rcore, gpu_ft_clashvol)
        self.kernels.irfftn(self.queue, gpu_ft_clashvol, gpu_clashvol)

        self.assertTrue(np.allclose(cpu_clashvol, gpu_clashvol.get(), atol=0.8))
Example #12
0
    def __init__(self, shape, do_checks=False, ctx=None, devicetype="all", platformid=None, deviceid=None, profile=False):
        """
        Build a "Linear Algebra" plan bound to one image shape.

        :param shape: image shape, given as (num_rows, num_columns)
        :param do_checks (optional): when True, memory and data type checks
                    are carried out where possible.
        :param ctx: working context to use; leave as None to have one
                    initialized from the device type or platformid/deviceid
        :param devicetype: kind of device: "CPU", "GPU", "ACC" or "ALL"
        :param platformid: integer platform identifier, as reported by clinfo
        :param deviceid: integer device identifier, as reported by clinfo
        :param profile: enable profiling at the kernel level and store the
                        profiling elements (makes code slightly slower)

        """
        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                                  platformid=platformid, deviceid=deviceid,
                                  profile=profile)

        # Device arrays, also registered through add_to_cl_mem
        device_arrays = {
            "d_gradient": parray.zeros(self.queue, shape, np.complex64),
            "d_image": parray.zeros(self.queue, shape, np.float32),
        }
        self.d_gradient = device_arrays["d_gradient"]
        self.d_image = device_arrays["d_image"]
        self.add_to_cl_mem(device_arrays)

        self.wg2D = None
        self.shape = shape
        # 2D range is ordered (columns, rows)
        self.ndrange2D = tuple(int(self.shape[axis]) for axis in (1, 0))
        self.do_checks = bool(do_checks)
        OpenclProcessing.compile_kernels(self, self.kernel_files)
Example #13
0
    def _alloctmparrays(self,
                        inp_shape,
                        outp_shape):
        """Append zero-filled device arrays to ``self.inp`` and ``self.outp``.

        For each of the ``self.num_fun`` functions, ``2*self.num_dev`` sets
        of arrays are allocated. Every array takes its shape from the
        matching entry of ``inp_shape``/``outp_shape`` with the first axis
        replaced by ``self.slices + self.overlap``. An empty input shape
        yields an empty list instead of an array.
        """
        leading = (self.slices + self.overlap, )
        n_slots = 2 * self.num_dev

        for fun in range(self.num_fun):
            self.inp.append([])
            for slot in range(n_slots):
                self.inp[fun].append([])
                for shape in inp_shape[fun]:
                    if len(shape) != 0:
                        # two consecutive slots share queue 4*(slot // 2)
                        entry = clarray.zeros(
                            self.queue[4 * (slot // 2)],
                            leading + shape[1:],
                            dtype=self.dtype)
                    else:
                        entry = []
                    self.inp[fun][slot].append(entry)

        for fun in range(self.num_fun):
            self.outp.append([])
            for slot in range(n_slots):
                out_array = clarray.zeros(
                    self.queue[4 * (slot // 2)],
                    leading + outp_shape[fun][1:],
                    dtype=self.dtype)
                self.outp[fun].append(out_array)
Example #14
0
    def _gpu_init(self):
        """Set up all device arrays and kernels for the GPU-accelerated search.

        Everything is stored in the ``self.gpu_data`` dict, built from
        ``self.data`` on ``self.queue``.
        """
        gpu = self.gpu_data = {}
        data = self.data
        queue = self.queue
        context = queue.context

        # Upload the input volumes as float32
        for name in ('rcore', 'rsurf'):
            gpu[name] = cl_array.to_device(queue, float32array(data[name].array))
        # The scanning chain is kept as an Image, as this is faster to rotate
        gpu['im_lsurf'] = cl.image_from_array(context, float32array(data['lsurf'].array))
        gpu['sampler'] = cl.Sampler(context, False, cl.addressing_mode.CLAMP,
                                    cl.filter_mode.LINEAR)

        if self.distance_restraints:
            gpu['restraints'] = cl_array.to_device(queue, float32array(data['restraints']))

        # Real-valued work arrays shaped like rcore
        for name in ('lsurf', 'clashvol', 'intervol'):
            gpu[name] = cl_array.zeros_like(gpu['rcore'])
        # int32 work arrays shaped like the grid
        gpu['interspace'] = cl_array.zeros(queue, data['shape'], dtype=np.int32)
        for name in ('restspace', 'access_interspace', 'best_access_interspace'):
            gpu[name] = cl_array.zeros_like(gpu['interspace'])

        # Counting arrays: one sub-histogram per workgroup, so the
        # workgroup size fixes how many rows have to be allocated
        WORKGROUPSIZE = 32
        nsubhists = int(np.ceil(gpu['rcore'].size/WORKGROUPSIZE))
        nrestraints = data['nrestraints']
        gpu['subhists'] = cl_array.zeros(queue, (nsubhists, nrestraints + 1), dtype=np.float32)
        gpu['viol_counter'] = cl_array.zeros(queue, (nsubhists, nrestraints, nrestraints), dtype=np.float32)

        # Complex arrays: grid shape with the first axis cut to n//2 + 1
        ft_shape = list(data['shape'])
        gpu['ft_shape'] = ft_shape
        ft_shape[0] = data['shape'][0]//2 + 1
        gpu['ft_rcore'] = cl_array.zeros(queue, ft_shape, dtype=np.complex64)
        for name in ('ft_rsurf', 'ft_lsurf', 'ft_clashvol', 'ft_intervol'):
            gpu[name] = cl_array.zeros_like(gpu['ft_rcore'])

        # Other miscellaneous data, copied over unchanged
        for name in ('nrot', 'max_clash', 'min_interaction'):
            gpu[name] = data[name]

        # Kernels and FFT plans
        kernels = Kernels(context)
        gpu['k'] = kernels
        kernels.rfftn = pyclfft.RFFTn(context, data['shape'])
        kernels.irfftn = pyclfft.iRFFTn(context, data['shape'])

        # Initial calculations: transform the two fixed volumes once
        kernels.rfftn(queue, gpu['rcore'], gpu['ft_rcore'])
        kernels.rfftn(queue, gpu['rsurf'], gpu['ft_rsurf'])
Example #15
0
def test_split_slabs(ctx_factory, vanilla, split, parameters):
    """Check that ``split`` produces the same output array as ``vanilla``."""
    queue = cl.CommandQueue(ctx_factory())

    # each callable gets its own zeroed 8-element int32 array
    vanilla_in = clarray.zeros(queue, 8, dtype=np.int32)
    split_in = clarray.zeros(queue, 8, dtype=np.int32)

    _, (vanilla_out, ) = vanilla(queue, a=vanilla_in, **parameters)
    _, (split_out, ) = split(queue, a=split_in, **parameters)

    assert np.array_equal(vanilla_out.get(), split_out.get())
    def initArrays(self):
        """Allocate zeroed per-cell arrays on the host and on the device."""
        per_cell = (self.maxCells,)
        per_cell_species = (self.maxCells, self.nSpecies)

        # species levels and rates live on the device only
        self.specLevel_dev = cl_array.zeros(self.queue, per_cell_species,
                                            dtype=numpy.float32)
        self.specRate_dev = cl_array.zeros(self.queue, per_cell_species,
                                           dtype=numpy.float32)

        # cell type: host array plus device counterpart
        self.celltype = numpy.zeros(per_cell, dtype=numpy.int32)
        self.celltype_dev = cl_array.zeros(self.queue, per_cell,
                                           dtype=numpy.int32)

        # effective growth: host array plus device counterpart
        self.effgrow = numpy.zeros(per_cell, dtype=numpy.float32)
        self.effgrow_dev = cl_array.zeros(self.queue, per_cell,
                                          dtype=numpy.float32)
Example #17
0
def test_random_float_in_range(ctx_factory,
                               rng_class,
                               ary_size,
                               plot_hist=False):
    """Check that uniform float samples from ``rng_class`` stay in range.

    Samples are checked against [0, 1] and [4, 7] for every supported
    float dtype; normal samples are only drawn, not checked. With
    ``plot_hist`` set, histograms of the samples are shown.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    is_ranlux = rng_class is RanluxGenerator

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU \
            and is_ranlux:
        pytest.xfail("ranlux test fails on POCL + Nvidia,"
                     "at least the Titan V, as of pocl 1.6, 2021-01-20")

    def show_hist(dev_ary):
        # Optional visual check of the sample distribution
        if plot_hist:
            import matplotlib.pyplot as pt
            pt.hist(dev_ary.get(), 30)
            pt.show()

    dtypes = [np.float32]
    if has_double_support(context.devices[0]):
        dtypes.append(np.float64)

    gen = rng_class(queue, 5120) if is_ranlux else rng_class(context)

    for dtype in dtypes:
        print(dtype)
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran)

        show_hist(ran)

        assert (0 <= ran.get()).all()
        assert (ran.get() <= 1).all()

        if is_ranlux:
            gen.synchronize(queue)

        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran, a=4, b=7)
        ran_host = ran.get()

        for in_range in (4 <= ran_host, ran_host <= 7):
            all_ok = in_range.all()
            if not all_ok:
                # report where the offending samples are and what they hold
                outliers = ~in_range
                print(np.where(outliers))
                print(ran_host[outliers])
            assert all_ok

        ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)

        show_hist(ran)
    def initArrays(self):
        """Create the zero-filled species, cell-type and growth arrays."""
        n_cells = self.maxCells

        # (maxCells, nSpecies) float32 arrays, device only
        for attr in ("specLevel_dev", "specRate_dev"):
            setattr(self, attr,
                    cl_array.zeros(self.queue, (n_cells, self.nSpecies),
                                   dtype=numpy.float32))

        # (maxCells,) arrays: a host array and its "_dev" device twin
        for attr, dtype in (("celltype", numpy.int32),
                            ("effgrow", numpy.float32)):
            setattr(self, attr, numpy.zeros((n_cells,), dtype=dtype))
            setattr(self, attr + "_dev",
                    cl_array.zeros(self.queue, (n_cells,), dtype=dtype))
Example #19
0
def test_zero_size_array(ctx_factory, empty_shape):
    """Exercise fill, add, get and to_device on arrays of shape ``empty_shape``."""
    queue = cl.CommandQueue(ctx_factory())

    lhs = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    rhs = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    rhs.fill(1)

    # round trip: add on the device, fetch to the host, upload again
    total_host = (lhs + rhs).get()
    cl_array.to_device(queue, total_host)
Example #20
0
def zeros(n, dtype, backend='cython'):
    """Return a wrapped zero-filled array of length/shape ``n``.

    ``backend`` selects where the array lives: 'opencl' and 'cuda'
    allocate on the device, anything else gives a NumPy array. The
    result is passed through ``wrap_array`` together with ``backend``.
    """
    if backend == 'opencl':
        import pyopencl.array as gpuarray
        from .opencl import get_queue
        return wrap_array(gpuarray.zeros(get_queue(), n, dtype), backend)

    if backend == 'cuda':
        import pycuda.gpuarray as gpuarray
        return wrap_array(gpuarray.zeros(n, dtype), backend)

    return wrap_array(np.zeros(n, dtype=dtype), backend)
Example #21
0
def test_copy_buffer_rect(ctx_factory):
    """Enqueue a rectangular copy of a 2x3 array into a 4x5 array at (1, 1)."""
    queue = cl.CommandQueue(ctx_factory())

    src = cl_array.zeros(queue, (2, 3), "f")
    dst = cl_array.zeros(queue, (4, 5), "f")
    src.fill(1)

    # the region is given as the source shape in reversed axis order
    copy_region = tuple(reversed(src.shape))
    cl.enqueue_copy(
            queue, dst.data, src.data,
            src_origin=(0, 0), dst_origin=(1, 1),
            region=copy_region)
Example #22
0
    def execute(self, n_it=1, **kwargs):
        """Run ``n_it`` iterations of the ``iterate`` kernel and collect decay samples.

        :param n_it: number of iterations to run
        :param n_out: (keyword, default 10) the ``reduce_decay`` kernel is
                      launched on every iteration whose index is a multiple
                      of ``n_out``, writing one output sample each time
        :return: tuple ``(time_axis, n_ex, self.p)`` where ``n_ex`` is the
                 per-sample ratio of the two reduced sums and ``self.p`` is
                 ``self.p_np`` reshaped to ``(ng, ng, ng)`` after copying
                 ``self.p_gp`` back to the host

        ``self.it`` is incremented once per iteration.
        """

        # Sampling interval: reduce_decay is launched on every n_out-th
        # iteration; the accumulated samples are read back once, after the loop.
        n_out = kwargs.get('n_out', 10)

        queue = self.queue
        prg = self.program
        local_size = self.local_size  # an earlier variant used (n_local,)
        # Number of partial sums stored per output sample (see the reshape below)
        n_local = 512
        ng = self.ng

        # Output bookkeeping: i_out is the index of the next sample to write
        i_out = 0
        total_out = (n_it // n_out + 1)
        # NOTE(review): samples are taken every n_out iterations, but this axis
        # advances by a single t_step per sample - confirm this is intended.
        time_axis = np.arange(total_out, dtype=np.float32) * self.t_step
        # NOTE(review): n_excited is filled in here but never read afterwards.
        n_excited = np.zeros(total_out, dtype=np.float32)
        n_excited[0] = 1.0

        # Device scratch buffers: n_local partial sums for each output sample
        tmp_1 = cl_array.zeros(queue, (n_local * total_out, ),
                               dtype=np.float32)
        tmp_2 = cl_array.zeros(queue, (n_local * total_out, ),
                               dtype=np.float32)

        # Device-side arguments handed to the kernels
        p = self.p_gp
        n = self.n_gp

        b = self.b_gp
        d = self.d_gp
        k = self.k_gp

        #prg.copy3d(queue, self.global_size, None,
        #           n.data, p.data, b).wait()
        for time_i in range(n_it):
            # NOTE(review): the two buffers are swapped on odd iterations only,
            # so the pairing changes every second step rather than every
            # step - confirm this is the intended ping-pong scheme.
            if time_i % 2 > 0:
                p, n = n, p
            prg.iterate(queue, self.global_size_3d, local_size, n, p, d, k, b)
            if time_i % n_out == 0:
                # Two local-memory scratch areas of n_local * 32 bytes each;
                # the partial sums for sample i_out go to tmp_1 and tmp_2.
                prg.reduce_decay(queue, self.global_size, self.local_size, p,
                                 k, cl.LocalMemory(n_local * 32),
                                 cl.LocalMemory(n_local * 32),
                                 np.int32(self.global_size[0]),
                                 np.int32(n_local), np.int32(i_out),
                                 np.float32(time_i), tmp_1.data, tmp_2.data)
                i_out += 1
            self.it += 1

        # Finish the reduction on the host: sum the n_local partials per sample
        dc = (tmp_1.map_to_host()).reshape((total_out, n_local)).sum(axis=1)
        ds = (tmp_2.map_to_host()).reshape((total_out, n_local)).sum(axis=1)
        # NOTE(review): rows never written by reduce_decay stay zero, so this
        # division can yield nan/inf for those samples.
        n_ex = dc / ds
        # NOTE(review): this always reads back self.p_gp, whichever of the two
        # buffers was written last - confirm.
        cl.enqueue_copy(queue, self.p_np, self.p_gp)
        self.p = self.p_np.reshape((ng, ng, ng), order='C')
        return time_axis, n_ex, self.p
Example #23
0
def zeros(n, dtype, backend='cython'):
    """Return an ``Array`` holding ``n`` zeros of the given ``dtype``.

    For the 'opencl' and 'cuda' backends the zeros are allocated on the
    device and attached with ``set_dev_array``; every other backend gets
    an ``Array`` built directly from a NumPy array.
    """
    if backend not in ('opencl', 'cuda'):
        return Array(np.zeros(n, dtype=dtype))

    if backend == 'opencl':
        import pyopencl.array as gpuarray
        dev_array = gpuarray.zeros(get_queue(), n, dtype)
    else:
        import pycuda.gpuarray as gpuarray
        dev_array = gpuarray.zeros(n, dtype)

    result = Array()
    result.set_dev_array(dev_array)
    return result
 def test_CPU_vs_GPU_adj(self):
     """Check that the adjoint operator gives matching results on both queues."""

     def run_adjoint(queue, operator):
         # Upload the input, apply operator.adj, and map the result to the host
         inp_dev = clarray.to_device(queue, self.symdivin)
         out_dev = clarray.zeros(queue, self.symgradin.shape, dtype=DTYPE)
         out_dev.add_event(operator.adj(out_dev, inp_dev))
         return out_dev.map_to_host(wait_for=out_dev.events)

     outadj_CPU = run_adjoint(self.queue, self.symgrad)
     outadj_GPU = run_adjoint(self.queue_GPU, self.symgrad_GPU)

     np.testing.assert_allclose(outadj_CPU, outadj_GPU, rtol=RTOL, atol=ATOL)
Example #25
0
    def test_touch(self):
        """Compare the GPU ``touch`` kernel with a NumPy reference.

        The clash and interaction volumes are computed by FFT
        cross-correlation on the CPU and on the GPU, always using the
        first rotation in ``self.rotations``. The interaction space (clash
        volume below MAX_CLASH and interaction volume above MIN_INTER)
        must be identical on both sides.

        The test also writes ``cpu_interspace.mrc``, ``gpu_interspace.mrc``
        and ``diff.mrc`` to the working directory and prints summary
        counts for debugging.
        """

        MAX_CLASH = 100 + 0.9
        MIN_INTER = 300 + 0.9

        rotmat = self.rotations[0]
        cpu_lsurf = np.zeros_like(self.im_lsurf.array)
        disvis.libdisvis.rotate_image3d(self.im_lsurf.array, self.vlength, np.linalg.inv(rotmat), self.im_center, cpu_lsurf)

        # s=self.shape keeps the output shape right when the last axis has
        # odd length (irfftn would otherwise assume an even length).
        cpu_clashvol = numpy.fft.irfftn(numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rcore.array), s=self.shape)

        gpu_rcore = cl_array.to_device(self.queue, np.asarray(self.rcore.array, dtype=np.float32))
        gpu_im_lsurf = cl.image_from_array(self.queue.context, np.asarray(self.im_lsurf.array, dtype=np.float32))
        gpu_lsurf = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

        self.kernels.rotate_image3d(self.queue, self.sampler, gpu_im_lsurf, rotmat, gpu_lsurf, self.im_center)

        gpu_ft_lsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_ft_rcore = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_ft_clashvol = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_clashvol = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

        # Clash volume on the GPU
        self.kernels.rfftn(self.queue, gpu_rcore, gpu_ft_rcore)
        self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf)
        self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rcore, gpu_ft_clashvol)
        self.kernels.irfftn(self.queue, gpu_ft_clashvol, gpu_clashvol)

        cpu_intervol = numpy.fft.irfftn(numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rsurf.array), s=self.shape)

        gpu_rsurf = cl_array.to_device(self.queue, np.asarray(self.rsurf.array, dtype=np.float32))

        gpu_ft_rsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_ft_intervol = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_intervol = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

        cpu_interspace = np.zeros(self.shape, dtype=np.int32)
        gpu_interspace = cl_array.zeros(self.queue, self.shape, dtype=np.int32)

        # Interaction volume on the GPU
        self.kernels.rfftn(self.queue, gpu_rsurf, gpu_ft_rsurf)
        self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf)
        self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rsurf, gpu_ft_intervol)
        self.kernels.irfftn(self.queue, gpu_ft_intervol, gpu_intervol)

        self.kernels.touch(self.queue, gpu_clashvol, MAX_CLASH, gpu_intervol, MIN_INTER, gpu_interspace)

        # CPU reference, written in place into cpu_interspace
        np.logical_and(cpu_clashvol < MAX_CLASH, cpu_intervol > MIN_INTER, cpu_interspace)

        # Debugging output: dump both volumes and their difference
        disvis.volume.Volume(cpu_interspace, self.im_lsurf.voxelspacing, self.im_lsurf.origin).tofile('cpu_interspace.mrc')
        disvis.volume.Volume(gpu_interspace.get(), self.im_lsurf.voxelspacing, self.im_lsurf.origin).tofile('gpu_interspace.mrc')
        disvis.volume.Volume(cpu_interspace - gpu_interspace.get(), self.im_lsurf.voxelspacing, self.im_lsurf.origin).tofile('diff.mrc')
        print()
        print(cpu_interspace.sum(), gpu_interspace.get().sum())
        print(np.abs(cpu_interspace - gpu_interspace.get()).sum())

        self.assertTrue(np.allclose(gpu_interspace.get(), cpu_interspace))
Example #26
0
 def _allocate_arrays(self):
     """Allocate the zero-filled device arrays for frames, sums and output."""
     queue = self.queue

     # one slab of shape self.shape per frame
     frames_shape = (self.nframes, ) + self.shape
     self.d_frames = parray.zeros(queue, frames_shape, self.dtype)
     self._old_d_frames = None

     # two arrays of the output shape, differing only in dtype
     self.d_sums = parray.zeros(queue, self.output_shape, self.sums_dtype)
     self.d_sums_f = parray.zeros(queue, self.output_shape, self.output_dtype)

     result_shape = (self.n_bins, self.nframes)
     self.d_output = parray.zeros(queue, result_shape, np.float32)
Example #27
0
def test_copy_buffer_rect(ctx_factory):
    """Enqueue a rectangular buffer-to-buffer copy between two device arrays.

    A (2, 3) array filled with ones is copied into a zeroed (4, 5) array.
    Only the enqueue itself is exercised; no resulting values are asserted.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    _xfail_if_pocl_gpu(queue.device, "rectangular copies")

    source = cl_array.zeros(queue, (2, 3), "f")
    target = cl_array.zeros(queue, (4, 5), "f")
    source.fill(1)

    # The copy region is the source shape in reversed order.
    copy_region = source.shape[::-1]
    cl.enqueue_copy(queue, target.data, source.data,
                    src_origin=(0, 0), dst_origin=(1, 1),
                    region=copy_region)
Example #28
0
	def init_data_periodic(self):
		"""
		Allocate the additional host/device arrays used by periodic simulations.

		Every host array gets a device counterpart (``*_dev``) of the same
		shape and dtype, allocated on ``self.queue``.
		"""
		# Connectivity of periodic grid: nine int32 entries per square
		inds_shape = (self.n_sqs*9,)
		self.needle_sq_neighbour_inds = numpy.zeros(inds_shape, numpy.int32)
		self.needle_sq_neighbour_inds_dev = cl_array.zeros(self.queue, inds_shape, numpy.int32)
		self.needle_sq_neighbour_offset_inds = numpy.zeros(inds_shape, numpy.int32)
		self.needle_sq_neighbour_offset_inds_dev = cl_array.zeros(self.queue, inds_shape, numpy.int32)

		# Offset vectors for computing cell images: nine float4 entries
		vecs_shape = (9,)
		self.offset_vecs = numpy.zeros(vecs_shape, vec.float4)
		self.offset_vecs_dev = cl_array.zeros(self.queue, vecs_shape, vec.float4)
    def initArrays(self):
        """Allocate the zero-initialised host arrays and device arrays.

        Host arrays are plain numpy arrays; attributes ending in ``_dev``
        are device arrays created on ``self.queue``.
        """
        f32 = numpy.float32
        i32 = numpy.int32

        # (maxCells, 8) index/weight tables, host and device.
        cell8 = (self.maxCells, 8)
        self.gridIdxs = numpy.zeros(cell8, dtype=i32)
        self.gridIdxs_dev = cl_array.zeros(self.queue, cell8, dtype=i32)
        self.triWts = numpy.zeros(cell8, dtype=f32)
        self.triWts_dev = cl_array.zeros(self.queue, cell8, dtype=f32)

        # Per-cell signal rates and levels, host and device.
        cell8_sig = (self.maxCells, 8, self.nSignals)
        self.cellSigRates = numpy.zeros(cell8_sig, dtype=f32)
        self.cellSigRates_dev = cl_array.zeros(self.queue, cell8_sig, dtype=f32)
        cell_sig = (self.maxCells, self.nSignals)
        self.cellSigLevels = numpy.zeros(cell_sig, dtype=f32)
        self.cellSigLevels_dev = cl_array.zeros(self.queue, cell_sig, dtype=f32)

        # Device-only arrays.
        self.signalLevel_dev = cl_array.zeros(self.queue, self.gridDim, dtype=f32)
        cell_spec = (self.maxCells, self.nSpecies)
        self.specLevel_dev = cl_array.zeros(self.queue, cell_spec, dtype=f32)
        self.specRate_dev = cl_array.zeros(self.queue, cell_spec, dtype=f32)

        # One int32 entry per cell, host and device.
        per_cell = (self.maxCells, )
        self.celltype = numpy.zeros(per_cell, dtype=i32)
        self.celltype_dev = cl_array.zeros(self.queue, per_cell, dtype=i32)
Example #30
0
def test_random_float_in_range(ctx_factory,
                               rng_class,
                               ary_size,
                               plot_hist=False):
    """Check that uniform random fills stay inside their requested range.

    For each supported float dtype the generator fills an array with the
    default uniform range (asserted to lie in [0, 1]) and with ``a=4, b=7``
    (asserted to lie in [4, 7]); a normal sample is drawn last.  When
    *plot_hist* is true, histograms are shown with matplotlib.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = [np.float32]
    if has_double_support(context.devices[0]):
        dtypes.append(np.float64)

    uses_ranlux = rng_class is RanluxGenerator
    gen = rng_class(queue, 5120) if uses_ranlux else rng_class(context)

    def show_histogram(device_ary):
        # matplotlib is imported lazily: only needed when plotting.
        import matplotlib.pyplot as pt
        pt.hist(device_ary.get(), 30)
        pt.show()

    for dtype in dtypes:
        print(dtype)
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran)

        if plot_hist:
            show_histogram(ran)

        assert (0 <= ran.get()).all()
        assert (ran.get() <= 1).all()

        if uses_ranlux:
            gen.synchronize(queue)

        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran, a=4, b=7)
        ran_host = ran.get()

        for in_range in (4 <= ran_host, ran_host <= 7):
            all_ok = in_range.all()
            if not all_ok:
                # Report where the bound is violated, and by which values.
                print(np.where(~in_range))
                print(ran_host[~in_range])
            assert all_ok

        ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)

        if plot_hist:
            show_histogram(ran)
Example #31
0
 def _allocate_memory(self):
     """Allocate the filter, sinogram and temporary arrays.

     With the ``"opencl"`` FFT backend the padded sinogram and its
     transform reuse the buffers owned by ``self.fft``; otherwise they
     are allocated as host numpy arrays.
     """
     n_filter = self.sino_f_shape[-1]
     self.d_filter_f = parray.zeros(self.queue, (n_filter,), np.complex64)
     self.is_cpu = (self.device.type == "CPU")
     if self.fft_backend != "opencl":
         # When using the numpy backend, arrays are not pre-allocated
         self.d_sino_padded = np.zeros(self.sino_padded_shape, "f")
         self.d_sino_f = np.zeros(self.sino_f_shape, np.complex64)
     else:
         # These are already allocated by FFT() if using the opencl backend
         self.d_sino_padded = self.fft.data_in
         self.d_sino_f = self.fft.data_out
     # Device/host pair needed for rectangular memcpy in certain cases.
     self.tmp_sino_device = parray.zeros(self.queue, self.sino_shape, "f")
     self.tmp_sino_host = np.zeros(self.sino_shape, "f")
    def test_dot_fwdgrad(self):
        """Run the forward-mode gradient of ``Dot(x, y)`` on random inputs.

        The seed for ``x`` is all ones and the seed for ``y`` all zeros;
        the result is fetched to the host (nothing is asserted here).
        """
        dotprod = op.Dot(expr.Variable('x'), expr.Variable('y'))

        valuation = pl.valuation()
        vec_shape = (10, )
        x_host = np.random.uniform(0, 1, vec_shape).astype(np.float32)
        y_host = np.random.uniform(0, 1, vec_shape).astype(np.float32)
        valuation['x'] = x_host
        valuation['y'] = y_host

        # Directional seeds: differentiate with respect to x only.
        seeds = {
            'x': clarray.zeros(pl.qs[0], vec_shape, dtype=np.float32) + 1.0,
            'y': clarray.zeros(pl.qs[0], vec_shape, dtype=np.float32),
        }

        grad_dev = dotprod.fwd_grad(seeds, valuation)
        grad_host = grad_dev.get()
Example #33
0
    def __init__(self,
                 sino_shape,
                 slice_shape=None,
                 axis_position=None,
                 angles=None,
                 ctx=None,
                 devicetype="all",
                 platformid=None,
                 deviceid=None,
                 profile=False):
        """Set up the OpenCL context, a backprojector/projector pair and buffers.

        The projector is configured from the backprojector's slice shape,
        angles and number of bins.  Four float32 device arrays are allocated
        (sinogram-shaped ``d_data``/``d_sino`` and slice-shaped
        ``d_x``/``d_x_old``) and registered through ``add_to_cl_mem``.
        """
        OpenclProcessing.__init__(
            self, ctx=ctx, devicetype=devicetype, platformid=platformid,
            deviceid=deviceid, profile=profile)

        # Create a backprojector
        self.backprojector = Backprojection(
            sino_shape, slice_shape=slice_shape, axis_position=axis_position,
            angles=angles, ctx=self.ctx, profile=profile)
        backproj = self.backprojector

        # Create a projector matching the backprojector geometry
        self.projector = Projection(
            backproj.slice_shape, backproj.angles,
            axis_position=axis_position, detector_width=backproj.num_bins,
            normalize=False, ctx=self.ctx, profile=profile)

        self.sino_shape = sino_shape
        self.is_cpu = backproj.is_cpu

        # Arrays
        self.d_data = parray.zeros(self.queue, sino_shape, dtype=np.float32)
        self.d_sino = parray.zeros_like(self.d_data)
        self.d_x = parray.zeros(self.queue, backproj.slice_shape, dtype=np.float32)
        self.d_x_old = parray.zeros_like(self.d_x)

        device_arrays = {
            "d_data": self.d_data,
            "d_sino": self.d_sino,
            "d_x": self.d_x,
            "d_x_old": self.d_x_old,
        }
        self.add_to_cl_mem(device_arrays)
Example #34
0
def pad(image, region=None, out=None, value=0, queue=None, block=False):
    """Pad a 2D *image*. *region* is the region to pad as (y_0, x_0, height, width). If not
    specified, the next power of two dimensions are used and the image is centered in the padded
    one. The final image dimensions are height x width and the filling starts at (y_0, x_0), *out*
    is the pyopencl Array instance, if not specified it will be created. *out* is also returned.
    *value* is the padded value. If *block* is True, wait for the copy to finish.
    """
    if region is None:
        shape = tuple(next_power_of_two(n) for n in image.shape)
        # Floor division keeps the offsets integral: they are used below as
        # copy origins, and true division would yield floats on Python 3.
        y_0 = (shape[0] - image.shape[0]) // 2
        x_0 = (shape[1] - image.shape[1]) // 2
        region = (y_0, x_0) + shape
    if queue is None:
        queue = cfg.OPENCL.queue
    if out is None:
        # Zero array plus *value* gives an array filled with the pad value.
        out = cl_array.zeros(queue, (region[2], region[3]), dtype=image.dtype) + value
    image = g_util.get_array(image, queue=queue)

    n_bytes = image.dtype.itemsize
    y_0, x_0, height, width = region
    # The x components of the origins and region are expressed in bytes.
    src_origin = (0, 0, 0)
    dst_origin = (n_bytes * x_0, y_0, 0)
    region = (n_bytes * image.shape[1], image.shape[0], 1)
    LOG.debug('pad, shape: %s, src_origin: %s, dst_origin: %s, region: %s', image.shape,
              src_origin, dst_origin, region)

    _copy_rect(image, out, src_origin, dst_origin, region, queue, block=block)

    return out
Example #35
0
def transfer_many(objects, shape, pixel_size, energy, exponent=False, offset=None, queue=None,
                  out=None, t=None, check=True, block=False):
    """Accumulate the transmission exponents of several *objects*.

    Each object's exponent is added into *out* (created as a zeroed complex
    array if not given); objects raising NotImplementedError are skipped with
    a debug message. *shape* is (y, x); *pixel_size*, *energy*, *offset* (y, x)
    and time *t* are passed on to every object. If *check* is True the sampling
    of the accumulated exponent is verified and an error is logged when it is
    insufficient. If *exponent* is True the summed exponent is returned,
    otherwise its exponential. If *block* is True, wait for the OpenCL kernels.
    The returned array can be different from the input *out* because of the
    pyopencl.clmath behavior.
    """
    queue = cfg.OPENCL.queue if queue is None else queue
    if out is None:
        out = cl_array.zeros(queue, shape, cfg.PRECISION.np_cplx)
    # Scratch buffer handed to every object as its output array.
    scratch = cl_array.Array(queue, shape, cfg.PRECISION.np_cplx)
    # NOTE(review): the wavelength is not used below; the call is retained
    # unchanged in case it validates *energy* -- confirm before removing.
    energy_to_wavelength(energy)

    for sample in objects:
        try:
            out += sample.transfer(shape, pixel_size, energy, exponent=True, offset=offset,
                                   t=t, queue=queue, out=scratch, check=False, block=block)
        except NotImplementedError:
            LOG.debug('%s does not support real space transfer', sample)

    sampling_ok = not check or is_wavefield_sampling_ok(out, queue=queue)
    if not sampling_ok:
        LOG.error('Insufficient transmission function sampling')

    if exponent:
        return out

    # Apply the exponent
    return clmath.exp(out, queue=queue)
Example #36
0
 def allocate_arrays(self):
     """
     Allocate the host and device arrays used by the tests.

     Reference gradient/divergence are computed on the host from
     ``self.image``; device copies are created on ``self.la.queue``.
     """
     img_shape = self.image.shape

     # numpy images
     self.grad = np.zeros(img_shape, dtype=np.complex64)
     self.grad2 = np.zeros((2,) + img_shape, dtype=np.float32)
     self.grad_ref = gradient(self.image)
     self.div_ref = divergence(self.grad_ref)
     self.image2 = np.zeros_like(self.image)

     # Device images.
     # A raw cl.Buffer(self.la.ctx, cl.mem_flags.READ_WRITE, size=self.image.nbytes*2)
     # would be the natural choice, but platforms not supporting OpenCL 1.2 have a
     # problem with enqueue_fill_buffer, so the parray "fill" utility is used instead.
     self.gradient_parray = parray.zeros(self.la.queue, img_shape, np.complex64)
     self.gradient_buffer = self.gradient_parray.data
     # Same approach for the image
     self.image_parray = parray.to_device(self.la.queue, self.image)
     self.image_buffer = self.image_parray.data

     # Reference gradient packed as complex: component 0 -> real, 1 -> imag
     packed_ref = np.zeros(img_shape, dtype=np.complex64)
     packed_ref.real = np.copy(self.grad_ref[0])
     packed_ref.imag = np.copy(self.grad_ref[1])
     self.grad_ref_parray = parray.to_device(self.la.queue, packed_ref)
     self.grad_ref_buffer = self.grad_ref_parray.data
Example #37
0
    def test_2d_real_to_complex_double(self, ctx):
        """Compare a double-precision 2D real-to-complex FFT with numpy.

        The test returns early (without failing) when ``has_double(ctx)``
        is false.  The device result is compared against ``np.fft.rfft2``
        with tolerances of 1e-8.
        """
        if not has_double(ctx): #TODO: find better way to skip test
            return
        queue = cl.CommandQueue(ctx)

        M = 64
        N = 32

        nd_data = np.arange(M*N, dtype=np.float64)
        nd_data.shape = (M, N)
        cl_data = cla.to_device(queue, nd_data)

        # Real-to-complex output keeps N//2+1 coefficients along the last axis.
        cl_data_transformed = cla.zeros(queue, (M, N//2+1), dtype = np.complex128)

        transform = FFT(ctx, queue,
                        cl_data,
                        cl_data_transformed,
                        axes = (1,0),
                        )

        transform.enqueue()

        # Fetch the data itself; the original printed the bound ``get`` method.
        result = cl_data_transformed.get()
        expected = np.fft.rfft2(nd_data)
        print(result)
        print(expected)

        assert np.allclose(result, expected, rtol=1e-8, atol=1e-8)
Example #38
0
    def test_2d_real_to_complex(self, ctx):
        """Compare a single-precision 2D real-to-complex FFT with numpy.

        The device result is compared against ``np.fft.rfft2`` with
        tolerances of 1e-3.
        """
        queue = cl.CommandQueue(ctx)

        M = 64
        N = 32

        nd_data = np.arange(M*N, dtype=np.float32)
        nd_data.shape = (M, N)
        cl_data = cla.to_device(queue, nd_data)

        # Real-to-complex output keeps N//2+1 coefficients along the last axis.
        cl_data_transformed = cla.zeros(queue, (M, N//2+1), dtype = np.complex64)

        transform = FFT(ctx, queue,
                        cl_data,
                        cl_data_transformed,
                        axes = (1,0),
                        )

        transform.enqueue()

        # Fetch the data itself; the original printed the bound ``get`` method.
        result = cl_data_transformed.get()
        expected = np.fft.rfft2(nd_data)
        print(result)
        print(expected)

        assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
Example #39
0
 def build(self, coords, values, base):
     """Use OpenCL to build the arrays.

     Uploads *coords*, *values* and *base* to the device, allocates an int32
     output with one entry per row of *base* and runs the ``nearest`` kernel
     with one work item per row of *base*.

     If waiting on the kernel fails with the "out of resources" error, a hint
     is printed and the process exits with status 1; any other
     ``cl.RuntimeError`` is re-raised.
     """
     lenbase = base.shape[0]
     lencoords = coords.shape[0]
     coords_array = cla.to_device(self.queue, coords)
     values_array = cla.to_device(self.queue, values)
     base_array = cla.to_device(self.queue, base)
     template_array = cla.zeros(self.queue, (lenbase), dtype=np.int32)
     event = self.program.nearest(
         self.queue,
         base.shape,
         None,
         coords_array.data,
         values_array.data,
         base_array.data,
         template_array.data,
         np.int32(lencoords),
         self.nnear,
         self.usemajority,
     )
     try:
         event.wait()
     except cl.RuntimeError as inst:
         # ``except X as e`` and single-argument print(...) are valid on
         # both Python 2.6+ and Python 3.
         errstr = str(inst)
         if errstr == "clWaitForEvents failed: out of resources":
             print("OpenCL timed out, probably due to the display manager.")
             print("Disable your display manager and try again!")
             print("If that does not work, rerun with OpenCL disabled.")
         else:
             # Bare raise re-raises the caught error with its traceback.
             raise
         sys.exit(1)
Example #40
0
def test_fancy_indexing(ctx_factory):
    """Check that index-array assignment on device arrays matches numpy.

    The same scatter assignment is done on a numpy array and on a device
    array, first with a descending index vector and then after overwriting
    two index entries; the destinations must agree both times.
    """
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # Host reference.
    # NOTE(review): the index vector has 3 entries but the source only 2;
    # confirm the installed numpy accepts this assignment.
    host_dest = np.zeros((4,), np.int32)
    host_idx = np.arange(3, 0, -1, dtype=np.int32)
    host_src = np.arange(8, 10, dtype=np.int32)
    host_dest[host_idx] = host_src

    # Same operation on the device.
    dev_dest = cl_array.zeros(queue, (4,), np.int32)
    dev_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32)
    dev_src = cl_array.arange(queue, 8, 10, dtype=np.int32)
    dev_dest[dev_idx] = dev_src

    assert np.all(host_dest == dev_dest.get())

    # Modify the indices on both sides and scatter again.
    dev_idx[1] = 3
    dev_idx[2] = 2

    host_idx[1] = 3
    host_idx[2] = 2

    host_dest[host_idx] = host_src
    dev_dest[dev_idx] = dev_src

    assert np.all(host_dest == dev_dest.get())
Example #41
0
    def compute_slices(self, shape, pixel_size, queue=None, out=None, offset=None):
        """Compute slices with *shape* as (z, y, x) and *pixel_size*. *queue* defaults to the
        configured OpenCL queue and *out* to a freshly zeroed uint8 array of *shape*; *out* is
        returned. *offset* is the starting point offset as (x, y, z).
        """
        queue = cfg.OPENCL.queue if queue is None else queue
        if out is None:
            out = cl_array.zeros(queue, shape, dtype=np.uint8)

        pixel_size = make_tuple(pixel_size, num_dims=2)
        v_1, v_2, v_3 = self._make_inputs(queue, pixel_size)
        psm = pixel_size.simplified.magnitude
        max_dx = self.max_triangle_x_diff.simplified.magnitude / psm[1]

        if offset is not None:
            # Convert the offset to pixels: components 0 and 2 are scaled by
            # psm[1], component 1 by psm[0].
            raw = offset.simplified.magnitude
            offset = gutil.make_vfloat3(raw[0] / psm[1], raw[1] / psm[0], raw[2] / psm[1])
        else:
            offset = gutil.make_vfloat3(0, 0, 0)

        # Global work size is (shape[2], shape[0]); shape[1] is passed as an argument.
        global_size = (shape[2], shape[0])
        cfg.OPENCL.programs['mesh'].compute_slices(
            queue, global_size, None,
            v_1.data, v_2.data, v_3.data, out.data,
            np.int32(shape[1]), np.int32(self.num_triangles),
            offset, cfg.PRECISION.np_float(max_dx))

        return out
Example #42
0
    def _transfer(
        self,
        shape,
        pixel_size,
        energy,
        offset,
        exponent=False,
        t=None,
        queue=None,
        out=None,
        check=True,
        block=False,
    ):
        """Transfer function implementation based on a refractive index.

        Delegates to ``transfer_many`` over ``self.bodies`` with a zeroed
        accumulation array (a new one, or the given *out* reset to zero).
        """
        if out is not None:
            # transfer_many adds values, so an existing array must be cleared first
            out.fill(0)
        else:
            out = cl_array.zeros(queue, shape, dtype=cfg.PRECISION.np_cplx)

        options = dict(
            offset=offset,
            exponent=exponent,
            queue=queue,
            out=out,
            t=t,
            check=check,
            block=block,
        )
        return transfer_many(self.bodies, shape, pixel_size, energy, **options)
Example #43
0
    def __init__(self,
                 label,
                 rng,
                 input,
                 nin,
                 nout,
                 W=None,
                 b=None,
                 activation_fn=op.Sigmoid):
        """Build a layer computing ``activation_fn(dot(input, W) + b)``.

        When *W* is not given it is drawn uniformly from
        ``+-sqrt(6 / (nin + nout))`` (scaled by 4 for ``op.Sigmoid``) and
        uploaded to the device; when *b* is not given it is a zero vector
        of length *nout*.  With ``activation_fn=None`` the output is the
        linear expression itself.  Variable names are ``'W' + label`` and
        ``'b' + label``.
        """
        queue = pl.qs[0]
        self.input = input

        if W is None:
            bound = np.sqrt(6. / (nin + nout))
            init_w = np.asarray(
                rng.uniform(low=-bound, high=bound, size=(nin, nout)),
                dtype=np.float32)
            if activation_fn == op.Sigmoid:
                init_w *= 4
            W = clarray.to_device(queue, init_w)
        if b is None:
            b = clarray.zeros(queue, (nout, ), np.float32)

        self.W = W
        self.b = b

        w_var = expr.Variable('W' + label)
        b_var = expr.Variable('b' + label)
        linear = op.Add(op.Dot(self.input, w_var), b_var)
        if activation_fn is None:
            self.output = linear
        else:
            self.output = activation_fn(linear)
        self.params = [(w_var.name, self.W), (b_var.name, self.b)]
Example #44
0
def zeros_cl(queue, shape):
    """ Allocate a float32 array of zeros directly in device memory.

    Parameters
    ----------
    queue
        PyOpenCL queue.
    shape : tuple
        Dimensions of the array.

    Returns
    -------
    gpuarray
        Device array filled with zeros.

    Examples
    --------
    >>> a = zeros_cl(queue, (3, 2))
    >>> print(a)
    [[ 0.,  0.],
     [ 0.,  0.],
     [ 0.,  0.]]

    >>> type(a)
    <class 'pyopencl.array.Array'>

    """

    zeroed = cl_array.zeros(queue, shape, dtype=float32)
    return zeroed
Example #45
0
def pad(image, region=None, out=None, value=0, queue=None, block=False):
    """Copy a 2D *image* into a larger array filled with *value*.

    *region* is (y_0, x_0, height, width): the output is height x width and the
    image is placed with its top-left corner at (y_0, x_0). Without *region*
    the output dimensions are the next powers of two and the image is centered.
    *out* is the pyopencl Array to write into; it is created when not given and
    is always returned. If *block* is True, wait for the copy to finish.
    """
    if region is None:
        padded_shape = tuple([next_power_of_two(n) for n in image.shape])
        top = (padded_shape[0] - image.shape[0]) // 2
        left = (padded_shape[1] - image.shape[1]) // 2
        region = (top, left) + padded_shape
    queue = cfg.OPENCL.queue if queue is None else queue
    if out is None:
        # Zero array plus *value* gives an array filled with the pad value.
        out = cl_array.zeros(queue, (region[2], region[3]), dtype=image.dtype) + value
    image = g_util.get_array(image, queue=queue)

    itemsize = image.dtype.itemsize
    y_0, x_0, height, width = region
    # The x components of the origins and of the copy region are in bytes.
    src_origin = (0, 0, 0)
    dst_origin = (itemsize * x_0, y_0, 0)
    copy_region = (itemsize * image.shape[1], image.shape[0], 1)
    LOG.debug("pad, shape: %s, src_origin: %s, dst_origin: %s, region: %s",
              image.shape, src_origin, dst_origin, copy_region)

    _copy_rect(image, out, src_origin, dst_origin, copy_region, queue, block=block)

    return out
Example #46
0
def test_map_to_host(ctx_factory):
    """Check that writes through a host mapping and device writes both land.

    One element is written on the device and another through the mapped
    host view; after releasing the mapping, the saved host copy must equal
    the device contents.
    """
    if _PYPY:
        pytest.skip("numpypy: no array creation from __array_interface__")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # On GPU devices allocate with ALLOC_HOST_PTR; otherwise use the default.
    allocator = None
    if context.devices[0].type & cl.device_type.GPU:
        flags = cl.mem_flags.READ_WRITE | cl.mem_flags.ALLOC_HOST_PTR
        allocator = cl_tools.DeferredAllocator(context, flags)

    dev_ary = cl_array.zeros(queue, (5, 6, 7,), dtype=np.float32, allocator=allocator)
    dev_ary[3, 2, 1] = 10
    mapped = dev_ary.map_to_host()
    mapped[1, 2, 3] = 10

    # Keep a copy of the host view before the mapping is released.
    host_snapshot = mapped.copy()
    mapped.base.release(queue)

    dev_ary.finish()

    print("DEV[HOST_WRITE]", dev_ary.get()[1, 2, 3])
    print("HOST[DEV_WRITE]", host_snapshot[3, 2, 1])

    assert (host_snapshot == dev_ary.get()).all()
Example #47
0
def ones_cl(queue, shape):
    """ Allocate a float32 array of ones directly in device memory.

    The array is created zero-filled on the device and then filled with 1.0.

    Parameters
    ----------
    queue
        PyOpenCL queue.
    shape : tuple
        Dimensions of the array.

    Returns
    -------
    gpuarray
        Device array filled with ones.

    Examples
    --------
    >>> a = ones_cl(queue, (3, 2))
    >>> print(a)
    [[ 1.,  1.],
     [ 1.,  1.],
     [ 1.,  1.]]

    >>> type(a)
    <class 'pyopencl.array.Array'>

    """

    filled = cl_array.zeros(queue, shape, dtype=float32)
    filled.fill(1.0)

    return filled
def CalcF(ctx, queue, m2, r2):
    """Run the ``CalcF`` kernel on *m2* and *r2* and return the host result.

    Both grid dimensions are taken from ``m2.shape[0]``, and the result is a
    float32 array of that square shape.
    NOTE(review): presumably m2 and r2 are square float32 matrices of equal
    size -- confirm against the callers.
    """
    # Both dimensions come from the first axis of m2
    side = m2.shape[0]
    xdim = ydim = side

    # Get the compiled kernel
    kernel = get_kernel(ctx, xdim)

    # Move data to the GPU and allocate the output
    m2_dev = cl_array.to_device(queue, m2)
    r2_dev = cl_array.to_device(queue, r2)
    result_dev = cl_array.zeros(queue, (ydim, xdim), np.float32)

    # One work item per matrix element, in fixed 16x16 work groups
    global_size = (ydim, xdim)
    local_size = (16, 16)

    done = kernel.CalcF(queue, global_size, local_size,
                        result_dev.data, m2_dev.data, r2_dev.data)
    done.wait()

    host_result = result_dev.get()
    queue.finish()

    return host_result
Example #49
0
    def _transfer_fourier(
        self, shape, pixel_size, energy, t=None, queue=None, out=None, block=False
    ):
        if out is None:
            out = cl_array.zeros(queue, shape, cfg.PRECISION.np_cplx)

        return out
Example #50
0
    def __init__(self, npixels_x, npixels_y, max_iterations):
        """
        Initialize the renderer

        Stores the image size and iteration limit, creates an OpenCL context
        and queue, allocates a flat float32 result array with one entry per
        pixel and builds the kernel from ``julia.cl`` in the current
        directory.  Raises RuntimeError if the program cannot be built or
        the kernel is not available.
        """
        self.npixels_x = npixels_x
        self.npixels_y = npixels_y
        self.max_iterations = max_iterations

        # Initialize OpenCL
        self.context = cl.create_some_context()
        self.queue = cl.CommandQueue(self.context)

        n_pixels = self.npixels_x*self.npixels_y
        self.cl_res = cl_array.zeros(self.queue, (n_pixels,), np.float32)

        # Set up program kernel
        with open('julia.cl') as kernel_file:
            try:
                program = cl.Program(self.context, kernel_file.read())
                self.program = program.build()
            except Exception as exc: #cl.cffi_cl.RuntimeError as exc:
                raise RuntimeError('Could not compile program: {0}'.format(exc))
            self.kernel = self.program.kernel_main
        if not self.kernel:
            raise RuntimeError('Could not load program kernel (does file exist?)')

        # First kernel argument is a buffer (None), then a float32 and an int32 scalar
        scalar_dtypes = [None, np.float32, np.int32]
        self.kernel.set_scalar_arg_dtypes(scalar_dtypes)
Example #51
0
    def test_subset_minmax(ctx_getter):
        """Check ``subset_min`` against numpy on a subset of a random array.

        The subset consists of consecutive indices, skipping every index
        that is a non-zero multiple of ``gran``.
        """
        context = ctx_getter()
        queue = cl.CommandQueue(context)

        from pyopencl.clrandom import rand as clrand

        l_a = 200000
        gran = 5
        l_m = l_a - l_a // gran + 1

        # NOTE(review): has_double_support, cl_array.zeros and
        # cl_array.to_device are called here without a queue/device
        # argument -- confirm this matches the API version in use.
        dtypes = [numpy.float32, numpy.int32]
        if has_double_support():
            dtypes.insert(0, numpy.float64)

        for dtype in dtypes:
            a_gpu = clrand(context, queue, (l_a,), dtype)
            a = a_gpu.get()

            meaningful_indices_gpu = cl_array.zeros(l_m, dtype=numpy.int32)
            meaningful_indices = meaningful_indices_gpu.get()

            # Fill with 0, 1, 2, ... while jumping over multiples of gran.
            next_index = 0
            for slot in range(len(meaningful_indices)):
                meaningful_indices[slot] = next_index
                next_index += 1
                if next_index % gran == 0:
                    next_index += 1

            meaningful_indices_gpu = cl_array.to_device(meaningful_indices)
            subset = a[meaningful_indices]

            expected_min = numpy.min(subset)
            device_min = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()

            assert device_min == expected_min
Example #52
0
    def _project(self, shape, pixel_size, offset, t=None, queue=None, out=None, block=False):
        """Projection implementation.

        Transforms and sorts the mesh, then, if the object's extrema overlap
        the field of view spanned by *offset* and ``shape * pixel_size``,
        launches the ``compute_thickness`` kernel of the ``'mesh'`` program
        over the overlapping pixel window. *out* is created as a zeroed array
        of *shape* when not given and is always returned; when the object is
        outside the field of view no kernel is launched. If *block* is True,
        wait for the kernel to finish. *t* is not used in this method.
        """
        def get_crop(index, fov):
            # Intersect the object's extent with the field of view along axis
            # *index* and express the result relative to the reversed offset.
            # Reads the enclosing *offset* at call time, i.e. before it is
            # rebound to a pixel-unit vector further below.
            minimum = max(self.extrema[index][0], fov[index][0])
            maximum = min(self.extrema[index][1], fov[index][1])

            return minimum - offset[::-1][index], maximum - offset[::-1][index]

        def get_px_value(value, round_func, ps):
            # Convert a physical length to an integer pixel count using the
            # given rounding function (floor or ceil in this method).
            return int(round_func(get_magnitude(value / ps)))

        # Move to the desired location, apply the T matrix and resort the triangles
        self.transform()
        self.sort()

        psm = pixel_size.simplified.magnitude
        fov = offset + shape * pixel_size
        # Rearranged so that fov[i] is the (start, end) pair of axis i, with the
        # axes in reversed order with respect to *offset*.
        fov = np.concatenate((offset.simplified.magnitude[::-1],
                              fov.simplified.magnitude[::-1])).reshape(2, 2).transpose() * q.m
        if out is None:
            out = cl_array.zeros(queue, shape, dtype=cfg.PRECISION.np_float)

        if (self.extrema[0][0] < fov[0][1] and self.extrema[0][1] > fov[0][0] and
            self.extrema[1][0] < fov[1][1] and self.extrema[1][1] > fov[1][0]):
            # Object inside FOV
            x_min, x_max = get_crop(0, fov)
            y_min, y_max = get_crop(1, fov)
            x_min_px = get_px_value(x_min, np.floor, pixel_size[1])
            x_max_px = get_px_value(x_max, np.ceil, pixel_size[1])
            y_min_px = get_px_value(y_min, np.floor, pixel_size[0])
            y_max_px = get_px_value(y_max, np.ceil, pixel_size[0])
            # Work size is limited to the output dimensions
            width = min(x_max_px - x_min_px, shape[1])
            height = min(y_max_px - y_min_px, shape[0])
            compute_offset = cl_array.vec.make_int2(x_min_px, y_min_px)
            v_1, v_2, v_3 = self._make_inputs(queue, pixel_size)
            max_dx = self.max_triangle_x_diff.simplified.magnitude / psm[1]
            # Use the same pixel size as for the x-axis, which will work for objects "not too far"
            # from the imaging plane
            min_z = self.extrema[2][0].simplified.magnitude / psm[1]
            # From here on *offset* is a device float2 in pixel units, reversed order
            offset = gutil.make_vfloat2(*(offset / pixel_size).simplified.magnitude[::-1])

            ev = cfg.OPENCL.programs['mesh'].compute_thickness(queue,
                                                               (width, height),
                                                               None,
                                                               v_1.data,
                                                               v_2.data,
                                                               v_3.data,
                                                               out.data,
                                                               np.int32(self.num_triangles),
                                                               np.int32(shape[1]),
                                                               compute_offset,
                                                               offset,
                                                               cfg.PRECISION.np_float(psm[1]),
                                                               cfg.PRECISION.np_float(max_dx),
                                                               cfg.PRECISION.np_float(min_z),
                                                               np.int32(self.iterations))
            if block:
                ev.wait()

        return out
Example #53
0
def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False):
    """Check that uniform random fills stay inside their requested range.

    For each supported float dtype the generator fills an array with the
    default uniform range (asserted to lie in [0, 1]) and with ``a=4, b=7``
    (asserted to lie in [4, 7]); a normal sample is drawn last.  When
    *plot_hist* is true, histograms are shown with matplotlib.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = [np.float32]
    if has_double_support(context.devices[0]):
        dtypes.append(np.float64)

    is_ranlux = rng_class is RanluxGenerator
    if is_ranlux:
        gen = rng_class(queue, 5120)
    else:
        gen = rng_class(context)

    def plot(device_ary):
        # matplotlib is imported lazily: only needed when plotting.
        import matplotlib.pyplot as pt
        pt.hist(device_ary.get(), 30)
        pt.show()

    for dtype in dtypes:
        print(dtype)
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran)

        if plot_hist:
            plot(ran)

        assert (0 <= ran.get()).all()
        assert (ran.get() <= 1).all()

        if is_ranlux:
            gen.synchronize(queue)

        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran, a=4, b=7)
        ran_host = ran.get()

        lower_ok = 4 <= ran_host
        upper_ok = ran_host <= 7
        for cond in (lower_ok, upper_ok):
            good = cond.all()
            if not good:
                # Report where the bound is violated, and by which values.
                print(np.where(~cond))
                print(ran_host[~cond])
            assert good

        ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)

        if plot_hist:
            plot(ran)
    def init_data(self):
        """Allocate the per-cell host arrays and their device counterparts.

        All arrays have ``self.max_cells`` entries; attributes ending in
        ``_dev`` live on the device and are created on ``self.queue``.
        """
        n_cells = (self.max_cells,)
        f32 = numpy.float32

        # cell data: centers as float4
        self.cell_centers = numpy.zeros(n_cells, vec.float4)
        self.cell_centers_dev = cl_array.zeros(self.queue, n_cells, vec.float4)

        # cell geometry: areas, volumes and previous volumes
        self.cell_areas_dev = cl_array.zeros(self.queue, n_cells, f32)
        self.cell_areas = numpy.zeros(n_cells, f32)
        self.cell_vols_dev = cl_array.zeros(self.queue, n_cells, f32)
        self.cell_vols = numpy.zeros(n_cells, f32)
        self.cell_old_vols_dev = cl_array.zeros(self.queue, n_cells, f32)
        self.cell_old_vols = numpy.zeros(n_cells, f32)

        # growth rates
        self.cell_growth_rates = numpy.zeros(n_cells, f32)
        self.cell_growth_rates_dev = cl_array.zeros(self.queue, n_cells, f32)
Example #55
0
    def _project(self, shape, pixel_size, offset, t=None, queue=None, out=None, block=False):
        """Projection function implementation. *shape* and *pixel_size* are 2D."""
        if out is None:
            out = cl_array.zeros(queue, shape, dtype=cfg.PRECISION.np_float)
        for body in self.bodies:
            out += body.project(shape, pixel_size, offset=offset, t=t, queue=queue, out=None,
                                block=block)

        return out
Example #56
0
 def _allocate_device(self):
     """Allocate the zeroed device array if it has not been allocated yet.

     Nothing happens unless the state is ``DEVICE_UNALLOCATED``.  With
     ``self.soa`` set, the device array uses the reversed shape.  After
     allocation the state becomes ``DeviceDataMixin.HOST``.
     """
     if self.state is not DeviceDataMixin.DEVICE_UNALLOCATED:
         return

     shape = tuple(reversed(self.shape)) if self.soa else self.shape
     self._device_data = array.zeros(_queue, shape=shape, dtype=self.dtype)
     self.state = DeviceDataMixin.HOST
def main():
    """Add two random 1024x1024 float32 matrices on an OpenCL device and time it.

    Both operands are uploaded, ``kernel.add`` is launched over the whole
    matrix, and the result is read back. The result and the elapsed
    wall-clock time are then printed. The timed span covers upload, kernel
    execution and download; the ``get_kernel`` call happens before the timer
    starts.
    """
    # The first positional argument of create_some_context is the
    # ``interactive`` flag, not a device index: passing a false value only
    # suppresses the interactive device prompt.
    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)

    # Define dimensions
    ydim = 1024
    xdim = 1024

    # Create the two random float32 operands
    matrix = np.random.random((ydim, xdim)).astype(np.float32)
    matrix2 = np.random.random((ydim, xdim)).astype(np.float32)

    # Get the compiled kernel
    kernel = get_kernel(ctx, xdim)

    # Start timing
    t1 = time.time()

    # Move data to the device and allocate the output buffer
    gpu_matrix = cl_array.to_device(queue, matrix)
    gpu_matrix2 = cl_array.to_device(queue, matrix2)
    gpu_result = cl_array.zeros(queue, (ydim, xdim), np.float32)

    # Global work size: one work-item per matrix element
    grid_shape = (ydim, xdim)

    # Work-group size; each global dimension must be a multiple of it
    group_shape = (16, 16)  # (32, 16)

    # Execute the kernel
    event = kernel.add(queue,
                       grid_shape, group_shape,
                       gpu_result.data,
                       gpu_matrix.data,
                       gpu_matrix2.data)

    # Wait for the kernel to finish
    event.wait()

    # Move the result from the device back to the host
    result = gpu_result.get()

    # Measure end time
    t2 = time.time()

    # Print result and execution time. The single-argument call form prints
    # the same text on Python 2 and Python 3.
    print(result)
    print("Elapsed: %f seconds " % (t2 - t1))

    # Block until everything enqueued on the queue has completed. This does
    # not release device memory by itself.
    queue.finish()
Example #58
0
def test_vector_fill(ctx_factory):
    """Check that device arrays with a float4 vector dtype can be filled.

    Fills a 100-element float4 array with a constant vector, verifies that
    the dtype survives the transfer back to the host, and finally checks
    that ``zeros`` accepts the same vector dtype.
    """
    queue = cl.CommandQueue(ctx_factory())
    float4 = cl_array.vec.float4

    filled_dev = cl_array.Array(queue, 100, dtype=float4)
    filled_dev.fill(cl_array.vec.make_float4(0.0, 0.0, 1.0, 0.0))
    host_copy = filled_dev.get()
    assert host_copy.dtype == float4

    # Allocation with a vector dtype through zeros() must not raise.
    filled_dev = cl_array.zeros(queue, 100, dtype=float4)
Example #59
0
    def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
                 ctx=None, devicetype="all", platformid=None, deviceid=None,
                 profile=False
                 ):
        """Set up the OpenCL processing base, a projector pair and work buffers.

        :param sino_shape: forwarded to ``Backprojection`` and used as the
            shape of the ``d_data`` / ``d_sino`` device arrays.
        :param slice_shape: forwarded to ``Backprojection``; the projector and
            the ``d_x`` buffers use ``self.backprojector.slice_shape``.
        :param axis_position: forwarded to both ``Backprojection`` and
            ``Projection``.
        :param angles: forwarded to ``Backprojection``; the projector reuses
            ``self.backprojector.angles``.
        :param ctx: forwarded to ``OpenclProcessing.__init__``.
        :param devicetype: forwarded to ``OpenclProcessing.__init__``.
        :param platformid: forwarded to ``OpenclProcessing.__init__``.
        :param deviceid: forwarded to ``OpenclProcessing.__init__``.
        :param profile: forwarded to ``OpenclProcessing.__init__`` and to both
            operators.
        """
        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                                  platformid=platformid, deviceid=deviceid,
                                  profile=profile)

        # Create a backprojector
        # It is given self.ctx (not the raw ``ctx`` argument).
        self.backprojector = Backprojection(
            sino_shape,
            slice_shape=slice_shape,
            axis_position=axis_position,
            angles=angles,
            ctx=self.ctx,
            profile=profile
        )
        # Create a projector
        # Its geometry (slice shape, angles, detector width) is taken from the
        # backprojector's attributes rather than from the raw arguments.
        self.projector = Projection(
            self.backprojector.slice_shape,
            self.backprojector.angles,
            axis_position=axis_position,
            detector_width=self.backprojector.num_bins,
            normalize=False,
            ctx=self.ctx,
            profile=profile
        )
        self.sino_shape = sino_shape
        self.is_cpu = self.backprojector.is_cpu
        # Arrays
        # Zero-filled float32 device arrays: two of sinogram shape and two of
        # slice shape.
        self.d_data = parray.zeros(self.queue, sino_shape, dtype=np.float32)
        self.d_sino = parray.zeros_like(self.d_data)
        self.d_x = parray.zeros(self.queue,
                                self.backprojector.slice_shape,
                                dtype=np.float32)
        self.d_x_old = parray.zeros_like(self.d_x)

        # Hand the four buffers to add_to_cl_mem under their attribute names.
        self.add_to_cl_mem({
            "d_data": self.d_data,
            "d_sino": self.d_sino,
            "d_x": self.d_x,
            "d_x_old": self.d_x_old,
        })