Example #1
def test_nan_arithmetic(ctx_getter):
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    def make_nan_contaminated_vector(size):
        shape = (size,)
        a = numpy.random.randn(*shape).astype(numpy.float32)
        #for i in range(0, shape[0], 3):
            #a[i] = float('nan')
        from random import randrange
        for i in range(size//10):
            a[randrange(0, size)] = float('nan')
        return a

    size = 1 << 20

    a = make_nan_contaminated_vector(size)
    a_gpu = cl_array.to_device(context, queue, a)
    b = make_nan_contaminated_vector(size)
    b_gpu = cl_array.to_device(context, queue, b)

    ab = a*b
    ab_gpu = (a_gpu*b_gpu).get()

    for i in range(size):
        assert numpy.isnan(ab[i]) == numpy.isnan(ab_gpu[i])
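This test uses the older three-argument to_device(context, queue, ary) signature; current pyopencl's cl_array.to_device takes only the queue and the host array. A minimal round-trip sketch with the two-argument form (context creation here is illustrative, not part of the test above):

import numpy
import pyopencl as cl
import pyopencl.array as cl_array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

host = numpy.random.randn(1024).astype(numpy.float32)
dev = cl_array.to_device(queue, host)   # copy host -> device
doubled = (dev * 2).get()               # compute on device, copy back
assert numpy.allclose(doubled, host * 2)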
Example #2
    def _make_inputs(self, queue, pixel_size):
        mf = cl.mem_flags
        v_1 = cl_array.to_device(queue, self._make_vertices(0, pixel_size[1]))
        v_2 = cl_array.to_device(queue, self._make_vertices(1, pixel_size[0]))
        v_3 = cl_array.to_device(queue, self._make_vertices(2, pixel_size[1]))

        return v_1, v_2, v_3
Example #3
 def build(self, coords, values, base):
     """Use OpenCL to build the arrays."""
     lenbase = base.shape[0]
     lencoords = coords.shape[0]
     coords_array = cla.to_device(self.queue, coords)
     values_array = cla.to_device(self.queue, values)
     base_array = cla.to_device(self.queue, base)
     template_array = cla.zeros(self.queue, (lenbase), dtype=np.int32)
     event = self.program.nearest(
         self.queue,
         base.shape,
         None,
         coords_array.data,
         values_array.data,
         base_array.data,
         template_array.data,
         np.int32(lencoords),
         self.nnear,
         self.usemajority,
     )
     try:
         event.wait()
     except cl.RuntimeError as inst:
         errstr = str(inst)
         if errstr == "clWaitForEvents failed: out of resources":
             print("OpenCL timed out, probably due to the display manager.")
             print("Disable your display manager and try again!")
             print("If that does not work, rerun with OpenCL disabled.")
         else:
             raise
         sys.exit(1)
Example #4
    def compute_preconditioners(self):
        """
        Create a diagonal preconditioner for the projection and backprojection
        operator.
        Each term of the diagonal is the sum of the projector/backprojector
        along rows [1], i.e. the projection/backprojection of an array of ones.

        [1] Jens Gregor and Thomas Benson,
            Computational Analysis and Improvement of SIRT,
            IEEE transactions on medical imaging, vol. 27, no. 7,  2008
        """

        # r_{i,i} = 1/(sum_j a_{i,j})
        slice_ones = np.ones(self.backprojector.slice_shape, dtype=np.float32)
        R = 1./self.projector.projection(slice_ones)  # could be all done on GPU, but I want extra checks
        R[np.logical_not(np.isfinite(R))] = 1.  # in case the rotation axis is off-centered
        self.d_R = parray.to_device(self.queue, R)
        # c_{j,j} = 1/(sum_i a_{i,j})
        sino_ones = np.ones(self.sino_shape, dtype=np.float32)
        C = 1./self.backprojector.backprojection(sino_ones)
        C[np.logical_not(np.isfinite(C))] = 1.  # in case the rotation axis is off-centered
        self.d_C = parray.to_device(self.queue, C)

        self.add_to_cl_mem({
            "d_R": self.d_R,
            "d_C": self.d_C
        })
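The preconditioner formula in the docstring above can be checked with a toy dense operator; the matrix A below is only a stand-in for the projector and is not part of the class shown here:

import numpy as np

A = np.array([[1., 2., 0.],
              [0., 1., 3.]])           # toy "projector": 2 rays, 3 pixels
R = 1. / A.dot(np.ones(A.shape[1]))    # r_ii = 1 / sum_j a_ij  (row sums)
C = 1. / A.T.dot(np.ones(A.shape[0]))  # c_jj = 1 / sum_i a_ij  (column sums)
print(R)  # [0.33333333 0.25      ]
print(C)  # [1.         0.33333333 0.33333333]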
Example #5
    def computeEnergy(self, x, y, z, q):

        xd = cl_array.to_device(self.queue, x)
        yd = cl_array.to_device(self.queue, y)
        zd = cl_array.to_device(self.queue, z)
        qd = cl_array.to_device(self.queue, q)
        coulombEnergy = cl_array.zeros_like(xd)
        prec = x.dtype
        if prec == numpy.float32:
            self.compEnergyF.calc_potential_energy(self.queue,
                    (x.size, ), None,
                    xd.data, yd.data, zd.data,
                    qd.data, coulombEnergy.data, numpy.int32(len(x)),
                    numpy.float32(self.k),numpy.float32(self.impactFact),
                    g_times_l = False)
        elif prec == numpy.float64:
            self.compEnergyD.calc_potential_energy(self.queue,
                    (x.size, ), None,
                    xd.data, yd.data, zd.data,
                    qd.data, coulombEnergy.data, numpy.int32(len(x)) ,
                    numpy.float64(self.k),numpy.float64(self.impactFact),
                    g_times_l = False)
        else:
            print("Unknown float type.")

        return numpy.sum(coulombEnergy.get(self.queue))
Example #6
    def computeEnergy(self, x, y, z, q):

        coulombEnergy = cl_array.zeros_like(q)
        xd = cl_array.to_device(self.queue, x)
        yd = cl_array.to_device(self.queue, y)
        zd = cl_array.to_device(self.queue, z)
        qd = cl_array.to_device(self.queue, q)
        prec = x.dtype
        if prec == numpy.float32:
            self.compEnergyF.calc_potential_energy(
                self.queue, (x.size, ),
                None,
                xd.data,
                yd.data,
                zd.data,
                qd.data,
                coulombEnergy.data,
                g_times_l=False)
        elif prec == numpy.float64:
            self.compEnergyD.calc_potential_energy(
                self.queue, (x.size, ),
                None,
                xd.data,
                yd.data,
                zd.data,
                qd.data,
                coulombEnergy.data,
                g_times_l=False)
        else:
            print("Unknown float type.")

        return numpy.sum(coulombEnergy.get(self.queue))
Example #7
    def test_count_1(self):
        
        nrepeats = 3
        shape = [5, 5, 5]

        np_interspace = randint(2, size=shape).astype(np.int32)
        np_access_interspace = randint(nrepeats, size=shape).astype(np.int32)
        np_count = np.ones([nrepeats] + shape, dtype=np.float32)
        weight = 0.5

        expected = np.ones_like(np_count)
        tmp = expected[0]
        tmp[np_interspace == 1] += weight
        for i in range(1, nrepeats):
            tmp = expected[i]
            tmp[np_access_interspace == i] += weight


        cl_interspace = cl_array.to_device(self.queue, np_interspace)
        cl_access_interspace = cl_array.to_device(self.queue, np_access_interspace)
        cl_count = cl_array.to_device(self.queue, np_count)

        self.kernels.count(self.queue, cl_interspace, cl_access_interspace, weight, cl_count)

        self.assertTrue(np.allclose(expected, cl_count.get()))
Example #8
def test_fancy_indexing(ctx_factory):
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 2 ** 20 + 2**18 + 22
    numpy_dest = np.zeros(n, dtype=np.int32)
    numpy_idx = np.arange(n, dtype=np.int32)
    np.random.shuffle(numpy_idx)
    numpy_src = 20000+np.arange(n, dtype=np.int32)

    cl_dest = cl_array.to_device(queue, numpy_dest)
    cl_idx = cl_array.to_device(queue, numpy_idx)
    cl_src = cl_array.to_device(queue, numpy_src)

    numpy_dest[numpy_idx] = numpy_src
    cl_dest[cl_idx] = cl_src

    assert np.array_equal(numpy_dest, cl_dest.get())

    numpy_dest = numpy_src[numpy_idx]
    cl_dest = cl_src[cl_idx]

    assert np.array_equal(numpy_dest, cl_dest.get())
Example #9
    def compute_preconditioners(self):
        """
        Create a diagonal preconditioner for the projection and backprojection
        operator.
        Each term of the diagonal is the sum of the projector/backprojector
        along rows [2],
        i.e. the projection/backprojection of an array of ones.

        [2] T. Pock, A. Chambolle,
            Diagonal preconditioning for first order primal-dual algorithms in
            convex optimization,
            International Conference on Computer Vision, 2011
        """

        # Compute the diagonal preconditioner "Sigma"
        slice_ones = np.ones(self.backprojector.slice_shape, dtype=np.float32)
        Sigma_k = 1./self.projector.projection(slice_ones)
        Sigma_k[np.logical_not(np.isfinite(Sigma_k))] = 1.
        self.d_Sigma_k = parray.to_device(self.queue, Sigma_k)
        self.d_Sigma_kp1 = self.d_Sigma_k + 1  # TODO: memory vs computation
        self.Sigma_grad = 1/2.0  # For discrete gradient, sum|D_i,j| = 2 along lines or cols

        # Compute the diagonal preconditioner "Tau"
        sino_ones = np.ones(self.sino_shape, dtype=np.float32)
        C = self.backprojector.backprojection(sino_ones)
        Tau = 1./(C + 2.)
        self.d_Tau = parray.to_device(self.queue, Tau)

        self.add_to_cl_mem({
            "d_Sigma_k": self.d_Sigma_k,
            "d_Sigma_kp1": self.d_Sigma_kp1,
            "d_Tau": self.d_Tau
        })
Example #10
	def get_binned_data_angular(self,limits=((-1,1),(-1,1)),points=500):
		""" Azimuth/elevation map measured ray endpoints to a circle and bin them on the CL DEV. This linearly maps elevation to the circle's radius and azimuth to phi. nice for cross-section plots of directivity. Binning is done with points number of points within limits=((xmin,xmax),(ymin,ymax))."""
		(pos0,pwr0) = self.get_measured_rays()
		pos0_dev = cl_array.to_device(self.queue,pos0.astype(np.float32))
		x_dev	 = cl_array.zeros(self.queue,pwr0.shape,dtype=np.float32)
		y_dev	 = cl_array.zeros(self.queue,pwr0.shape,dtype=np.float32)
		pwr0_dev = cl_array.to_device(self.queue,pwr0.astype(np.float32))
		pwr_dev  = cl_array.zeros(self.queue,pwr0.shape,dtype=np.float32)
		pivot    = cl_array.to_device(self.queue,np.array([0,0,0,0],dtype=np.float32))
			
		time1 = time()
		R_dev = cl_array.to_device(self.queue,np.array([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,0]]).astype(np.float32))
		evt = self.prg.angular_project(self.queue, pwr0.shape, None, pos0_dev.data,pwr0_dev.data,R_dev.data,pivot.data,x_dev.data,y_dev.data,pwr_dev.data)
			
			
		evt.wait()
			
		x=x_dev.get()
		y=y_dev.get()
		pwr=np.float64(pwr_dev.get())
	
		time2 = time()
		dx = np.float64(limits[0][1]-limits[0][0])/np.float64(points)
		dy = np.float64(limits[1][1]-limits[1][0])/np.float64(points)
		pwr = pwr / (dx * dy)
		
		(H,x_coord,y_coord)=np.histogram2d(x=x.flatten(),y=y.flatten(),bins=points,range=limits,weights=pwr.flatten())
		self.hist_data = (H,x_coord,y_coord)
		return self.hist_data
Example #11
        def __init__(self, target, queue, laplace=False):
            super(GPUCorrelator, self).__init__(target, laplace=laplace)
            self._queue = queue
            self._ctx = self._queue.context
            self._gpu = self._queue.device


            self._allocate_arrays()
            self._build_ffts()
            self._generate_kernels()

            target = self._target
            if self._laplace:
                target = self._laplace_filter(self._target)
            # move some arrays to the GPU
            self._gtarget = cl_array.to_device(self._queue, target.astype(np.float32))
            self._lcc_mask = cl_array.to_device(self._queue, self._lcc_mask.astype(np.int32))
            # Do some one-time precalculations
            self._rfftn(self._gtarget, self._ft_target)
            self._k.multiply(self._gtarget, self._gtarget, self._target2)
            self._rfftn(self._target2, self._ft_target2)

            self._gcenter = np.asarray(list(self._center) + [0], dtype=np.float32)
            self._gshape = np.asarray(
                    list(self._target.shape) + [np.product(self._target.shape)],
                    dtype=np.int32)
Example #12
	def get_binned_data_stereographic(self,limits=((-1,1),(-1,1)),points=500): #project data stereographically onto xy plane and bin it
		""" stereographically project measured ray endpoints and bin them on the CL DEV. This is a lot faster when you have loads of data. Binning is done with points number of points within limits=((xmin,xmax),(ymin,ymax))."""
		(pos0,pwr0) = self.get_measured_rays()
		pos0_dev = cl_array.to_device(self.queue,pos0.astype(np.float32))
		x_dev	 = cl_array.zeros(self.queue,pwr0.shape,dtype=np.float32)
		y_dev	 = cl_array.zeros(self.queue,pwr0.shape,dtype=np.float32)
		pwr0_dev = cl_array.to_device(self.queue,pwr0.astype(np.float32))
		pwr_dev  = cl_array.zeros(self.queue,pwr0.shape,dtype=np.float32)
		pivot    = cl_array.to_device(self.queue,np.array([0,0,0,0],dtype=np.float32))
			
		time1 = time()
		R_dev = cl_array.to_device(self.queue,np.array([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,0]]).astype(np.float32))
		evt = self.prg.stereograph_project(self.queue, pwr0.shape, None, pos0_dev.data,pwr0_dev.data,R_dev.data,pivot.data,x_dev.data,y_dev.data,pwr_dev.data)
			
			
		evt.wait()
			
		x=x_dev.get()
		y=y_dev.get()
		pwr=np.float64(pwr_dev.get())
	
		time2 = time()
		dx = np.float64(limits[0][1]-limits[0][0])/np.float64(points)
		dy = np.float64(limits[1][1]-limits[1][0])/np.float64(points)
		pwr = pwr / (dx * dy)
		
		(H,x_coord,y_coord)=np.histogram2d(x=x.flatten(),y=y.flatten(),bins=points,range=limits,weights=pwr.flatten())
		self.hist_data = (H,x_coord,y_coord)
		return self.hist_data
Example #13
    def __init__(self, ctx, queue, dtype=np.float32):
        self.ctx = ctx
        self.queue = queue
        sobel_c = np.array([1., 0., -1.]).astype(dtype)
        sobel_r = np.array([1., 2., 1.]).astype(dtype)
        self.sobel_c = cl_array.to_device(self.queue, sobel_c)
        self.sobel_r = cl_array.to_device(self.queue, sobel_r)

        self.scratch = None

        self.sepconv_rc = LocalMemorySeparableCorrelation(self.ctx, self.queue, sobel_r, sobel_c)
        self.sepconv_cr = LocalMemorySeparableCorrelation(self.ctx, self.queue, sobel_c, sobel_r)

        TYPE = ""
        if dtype == np.float32:
            TYPE = "float"
        elif dtype == np.uint8:
            TYPE = "unsigned char"
        elif dtype == np.uint16:
            TYPE = "unsigned short"

        self.mag = ElementwiseKernel(ctx,
                                    "float *result, %s *imgx, %s *imgy" % (TYPE, TYPE),
                                    "result[i] = sqrt((float)imgx[i]*imgx[i] + (float)imgy[i]*imgy[i])",
                                    "mag")
Example #14
 def allocate_arrays(self):
     """
     Allocate various types of arrays for the tests
     """
     # numpy images
     self.grad = np.zeros(self.image.shape, dtype=np.complex64)
     self.grad2 = np.zeros((2,) + self.image.shape, dtype=np.float32)
     self.grad_ref = gradient(self.image)
     self.div_ref = divergence(self.grad_ref)
     self.image2 = np.zeros_like(self.image)
     # Device images
     self.gradient_parray = parray.zeros(self.la.queue, self.image.shape, np.complex64)
     # we should be using cl.Buffer(self.la.ctx, cl.mem_flags.READ_WRITE, size=self.image.nbytes*2),
     # but platforms not supporting OpenCL 1.2 have a problem with enqueue_fill_buffer,
     # so we use the parray "fill" utility
     self.gradient_buffer = self.gradient_parray.data
     # Do the same for image
     self.image_parray = parray.to_device(self.la.queue, self.image)
     self.image_buffer = self.image_parray.data
     # Refs
     tmp = np.zeros(self.image.shape, dtype=np.complex64)
     tmp.real = np.copy(self.grad_ref[0])
     tmp.imag = np.copy(self.grad_ref[1])
     self.grad_ref_parray = parray.to_device(self.la.queue, tmp)
     self.grad_ref_buffer = self.grad_ref_parray.data
Example #15
def test_pthomas():
    nz = 3
    ny = 4
    nx = 5

    a = np.random.rand(nx)
    b = np.random.rand(nx)
    c = np.random.rand(nx)
    d = np.random.rand(nz, ny, nx)
    d_copy = d.copy()

    solver = pthomas.PThomas(context, queue, (nz, ny, nx))
    a_d = cl_array.to_device(queue, a)
    b_d = cl_array.to_device(queue, b)
    c_d = cl_array.to_device(queue, c)
    c2_d = cl_array.to_device(queue, c)
    d_d = cl_array.to_device(queue, d)
    evt = solver.solve(a_d, b_d, c_d, c2_d, d_d)
    d = d_d.get()

    for i in range(nz):
        for j in range(ny):
            x_true = scipy_solve_banded(a, b, c, d_copy[i,j,:])
            assert_allclose(x_true, d[i,j,:])
    print('pass')
Example #16
def CalcF(ctx, queue, m2, r2):

    # Define dimensions
    xdim = ydim = m2.shape[0]

    #    m2 = np.float32(m2)
    #    r2 = np.float32(r2)

    # Get the compiled kernel
    kernel = get_kernel(ctx, xdim)

    # Move data to the GPU

    gpu_m2 = cl_array.to_device(queue, m2)
    gpu_r2 = cl_array.to_device(queue, r2)
    gpu_result = cl_array.zeros(queue, (ydim, xdim), np.float32)

    # Define grid shape (the same as the matrix dimensions)
    grid_shape = (ydim, xdim)

    # Get group shape based on the matrix dimensions and the actual hardware
    group_shape = (16, 16)

    event = kernel.CalcF(queue, grid_shape, group_shape, gpu_result.data, gpu_m2.data, gpu_r2.data)

    event.wait()
    result = gpu_result.get()
    queue.finish()

    return result
Example #17
def get_array(data, queue=None):
    """Get pyopencl.array.Array from *data* which can be a numpy array, a pyopencl.array.Array or a
    pyopencl.Image. *queue* is an OpenCL command queue.
    """
    if not queue:
        queue = cfg.OPENCL.queue

    if isinstance(data, cl_array.Array):
        result = data
    elif isinstance(data, np.ndarray):
        if data.dtype.kind == 'c':
            if data.dtype.itemsize != cfg.PRECISION.cl_cplx:
                data = data.astype(cfg.PRECISION.np_cplx)
            result = cl_array.to_device(queue, data.astype(cfg.PRECISION.np_cplx))
        else:
            if data.dtype.kind != 'f' or data.dtype.itemsize != cfg.PRECISION.cl_float:
                data = data.astype(cfg.PRECISION.np_float)
            result = cl_array.to_device(queue, data.astype(cfg.PRECISION.np_float))
    elif isinstance(data, cl.Image):
        result = cl_array.empty(queue, data.shape[::-1], np.float32)
        cl.enqueue_copy(queue, result.data, data, offset=0, origin=(0, 0),
                        region=result.shape[::-1])
        if result.dtype.itemsize != cfg.PRECISION.cl_float:
            result = result.astype(cfg.PRECISION.np_float)
    else:
        raise TypeError('Unsupported data type {}'.format(type(data)))

    return result
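A stripped-down sketch of the same precision normalization without the cfg module, assuming a fixed float32/complex64 precision (the helper name is ours, not part of the library above):

import numpy as np
import pyopencl.array as cl_array

def to_device_normalized(queue, data):
    """Coerce host data to a single, known precision before uploading."""
    data = np.asarray(data)
    if data.dtype.kind == 'c':
        data = data.astype(np.complex64)
    else:
        data = data.astype(np.float32)
    return cl_array.to_device(queue, data)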
Example #18
    def _gpu_init(self):
        """Method to initialize all the data for GPU-accelerate search"""

        self.gpu_data = {}
        g = self.gpu_data
        d = self.data
        q = self.queue

        # move data to the GPU. All should be float32, as this is the native
        # length for GPUs
        g['rcore'] = cl_array.to_device(q, float32array(d['rcore'].array))
        g['rsurf'] = cl_array.to_device(q, float32array(d['rsurf'].array))
        # Make the scanning chain object an Image, as this is faster to rotate
        g['im_lsurf'] = cl.image_from_array(q.context, float32array(d['lsurf'].array))
        g['sampler'] = cl.Sampler(q.context, False, cl.addressing_mode.CLAMP,
                                  cl.filter_mode.LINEAR)

        if self.distance_restraints:
            g['restraints'] = cl_array.to_device(q, float32array(d['restraints']))

        # Allocate arrays on the GPU
        g['lsurf'] = cl_array.zeros_like(g['rcore'])
        g['clashvol'] = cl_array.zeros_like(g['rcore'])
        g['intervol'] = cl_array.zeros_like(g['rcore'])
        g['interspace'] = cl_array.zeros(q, d['shape'], dtype=np.int32)
        g['restspace'] = cl_array.zeros_like(g['interspace'])
        g['access_interspace'] = cl_array.zeros_like(g['interspace'])
        g['best_access_interspace'] = cl_array.zeros_like(g['interspace'])

        # arrays for counting
        # Reductions are typically tedious on GPU, and we need to define the
        # workgroupsize to allocate the correct amount of data
        WORKGROUPSIZE = 32
        nsubhists = int(np.ceil(g['rcore'].size/WORKGROUPSIZE))
        g['subhists'] = cl_array.zeros(q, (nsubhists, d['nrestraints'] + 1), dtype=np.float32)
        g['viol_counter'] = cl_array.zeros(q, (nsubhists, d['nrestraints'], d['nrestraints']), dtype=np.float32)

        # complex arrays
        g['ft_shape'] = list(d['shape'])
        g['ft_shape'][0] = d['shape'][0]//2 + 1
        g['ft_rcore'] = cl_array.zeros(q, g['ft_shape'], dtype=np.complex64)
        g['ft_rsurf'] = cl_array.zeros_like(g['ft_rcore'])
        g['ft_lsurf'] = cl_array.zeros_like(g['ft_rcore'])
        g['ft_clashvol'] = cl_array.zeros_like(g['ft_rcore'])
        g['ft_intervol'] = cl_array.zeros_like(g['ft_rcore'])

        # other miscellaneous data
        g['nrot'] = d['nrot']
        g['max_clash'] = d['max_clash']
        g['min_interaction'] = d['min_interaction']

        # kernels
        g['k'] = Kernels(q.context)
        g['k'].rfftn = pyclfft.RFFTn(q.context, d['shape'])
        g['k'].irfftn = pyclfft.iRFFTn(q.context, d['shape'])

        # initial calculations
        g['k'].rfftn(q, g['rcore'], g['ft_rcore'])
        g['k'].rfftn(q, g['rsurf'], g['ft_rsurf'])
Example #19
def gs_mod_gpu(idata,itera=10,osize=256):
    
    
    cut=osize//2
    
    pl=cl.get_platforms()[0]
    devices=pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue, dtype=complex128)  # does not work with "complex128"
    
    src = str(Template(KERNEL).render(
        double_support=all(
            has_double_support(dev) for dev in devices),
        amd_double_support=all(
            has_amd_double_support(dev) for dev in devices)
        ))
    prg = cl.Program(ctx,src).build() 
    

    idata_gpu=cl_array.to_device(queue, ifftshift(idata).astype("complex128"))
    fdata_gpu=cl_array.empty_like(idata_gpu)
    rdata_gpu=cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data,fdata_gpu.data)
    
    mask=exp(2.j*pi*random(idata.shape))
    mask[512-cut:512+cut,512-cut:512+cut]=0
    
    
    idata_gpu=cl_array.to_device(queue, ifftshift(idata+mask).astype("complex128"))
    fdata_gpu=cl_array.empty_like(idata_gpu)
    rdata_gpu=cl_array.empty_like(idata_gpu)
    error_gpu=cl_array.to_device(ctx, queue, zeros(idata_gpu.shape).astype("double"))
    plan.execute(idata_gpu.data,fdata_gpu.data)
    
    e=1000
    ea=1000
    for i in range (itera):
        prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data)
        plan.execute(fdata_gpu.data,rdata_gpu.data,inverse=True)
        #~ prg.norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut))
        norm1=prg.norm1
        norm1.set_scalar_arg_dtypes([None, None, None, int32])
        norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut))
        
        e= sqrt(cl_array.sum(error_gpu).get())/(2*cut)

        #~ if e>ea: 
           #~ 
            #~ break
        #~ ea=e
        plan.execute(rdata_gpu.data,fdata_gpu.data)
    
    fdata=fdata_gpu.get()
    fdata=ifftshift(fdata)
    fdata=exp(1.j*angle(fdata))
    return fdata
Example #20
    def test_touch(self):

        MAX_CLASH = 100 + 0.9
        MIN_INTER = 300 + 0.9

        NROT = np.random.randint(self.rotations.shape[0] + 1)
        rotmat = self.rotations[0]
        cpu_lsurf = np.zeros_like(self.im_lsurf.array)
        disvis.libdisvis.rotate_image3d(self.im_lsurf.array, self.vlength, np.linalg.inv(rotmat), self.im_center, cpu_lsurf)

        cpu_clashvol = numpy.fft.irfftn(numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rcore.array))

        gpu_rcore = cl_array.to_device(self.queue, np.asarray(self.rcore.array, dtype=np.float32))
        gpu_im_lsurf = cl.image_from_array(self.queue.context, np.asarray(self.im_lsurf.array, dtype=np.float32))
        gpu_lsurf = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

        self.kernels.rotate_image3d(self.queue, self.sampler, gpu_im_lsurf, rotmat, gpu_lsurf, self.im_center)

        gpu_ft_lsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_ft_rcore = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_ft_clashvol = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_clashvol = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

        self.kernels.rfftn(self.queue, gpu_rcore, gpu_ft_rcore)
        self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf)
        self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rcore, gpu_ft_clashvol)
        self.kernels.irfftn(self.queue, gpu_ft_clashvol, gpu_clashvol)
        
        cpu_intervol = numpy.fft.irfftn(numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rsurf.array))

        gpu_rsurf = cl_array.to_device(self.queue, np.asarray(self.rsurf.array, dtype=np.float32))

        gpu_ft_rsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_ft_intervol = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64)
        gpu_intervol = cl_array.zeros(self.queue, self.shape, dtype=np.float32)

        cpu_interspace = np.zeros(self.shape, dtype=np.int32)
        gpu_interspace = cl_array.zeros(self.queue, self.shape, dtype=np.int32)

        self.kernels.rfftn(self.queue, gpu_rsurf, gpu_ft_rsurf)
        self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf)
        self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rsurf, gpu_ft_intervol)
        self.kernels.irfftn(self.queue, gpu_ft_intervol, gpu_intervol)

        self.kernels.touch(self.queue, gpu_clashvol, MAX_CLASH, gpu_intervol, MIN_INTER, gpu_interspace)

        np.logical_and(cpu_clashvol < MAX_CLASH, cpu_intervol > MIN_INTER, cpu_interspace)

        disvis.volume.Volume(cpu_interspace, self.im_lsurf.voxelspacing, self.im_lsurf.origin).tofile('cpu_interspace.mrc')
        disvis.volume.Volume(gpu_interspace.get(), self.im_lsurf.voxelspacing, self.im_lsurf.origin).tofile('gpu_interspace.mrc')
        disvis.volume.Volume(cpu_interspace - gpu_interspace.get(), self.im_lsurf.voxelspacing, self.im_lsurf.origin).tofile('diff.mrc')
        print()
        print(cpu_interspace.sum(), gpu_interspace.get().sum())
        print(np.abs(cpu_interspace - gpu_interspace.get()).sum())
                           

        self.assertTrue(np.allclose(gpu_interspace.get(), cpu_interspace))
Example #21
def main():
    # Allocate the first GPU
    ctx = cl.create_some_context(0)  # non-interactive context creation (intended to grab the GPU)
    queue = cl.CommandQueue(ctx)
    
    # Define dimensions
    ydim = 1024
    xdim = 1024

    # Create random matrix
    matrix = np.random.random((ydim, xdim))
    matrix = np.float32(matrix)

    # Create random matrix2
    matrix2 = np.random.random((ydim, xdim))
    matrix2 = np.float32(matrix2)

    # Get the compiled kernel
    kernel = get_kernel(ctx, xdim)

    # Start timing
    t1 = time.time()
    
    # Move data to the GPU
    gpu_matrix = cl_array.to_device(queue, matrix)
    gpu_matrix2 = cl_array.to_device(queue, matrix2)
    gpu_result = cl_array.zeros(queue, (ydim, xdim), np.float32)

    # Define grid shape (the same as the matrix dimensions)
    grid_shape = (ydim, xdim)
    
    # Get group shape based on the matrix dimensions and the actual hardware
    group_shape = (16,16)#(32,16)
    
    # Execute the kernel
    event = kernel.add(queue, 
                       grid_shape, group_shape, 
                       gpu_result.data, 
                       gpu_matrix.data, 
                       gpu_matrix2.data)
                       
    # Wait for the kernel to finish
    event.wait()
    
    # Move the result from GPU to CPU
    result = gpu_result.get()
    
    # Measure end time
    t2 = time.time()

    # Print result and execution time
    print(result)
    print("Elapsed: %f seconds" % (t2 - t1))

    # Free the GPU resource
    queue.finish()
Example #22
    def test(self):
        a = numpy.random.randn(4, 4).astype(numpy.float32)
        b = numpy.random.randn(4, 4).astype(numpy.float32)
        c = numpy.random.randn(4, 4).astype(numpy.float32)

        a_gpu = cl_array.to_device(self.ctx, queue, a)
        b_gpu = cl_array.to_device(self.ctx, queue, b)
        c_gpu = cl_array.to_device(self.ctx, queue, c)

        dest_gpu = cl_array.empty_like(a_gpu)
Example #23
 def sum_solutions(self, line_da, x_R_d, x_UH, x_LH, alpha, beta):
     x_UH_d = cl_array.to_device(self.queue, x_UH)
     x_LH_d = cl_array.to_device(self.queue, x_LH)
     alpha_d = cl_array.to_device(self.queue, alpha)
     beta_d = cl_array.to_device(self.queue, beta)
     evt = self.sum_solutions_kernel(self.queue, (line_da.nx, line_da.ny, line_da.nz), None,
             x_R_d.data, x_UH_d.data,
                 x_LH_d.data, alpha_d.data, beta_d.data,
                     np.int32(line_da.nx), np.int32(line_da.ny),
                         np.int32(line_da.nz))
Example #24
    def prepare_dev_data(self):
        ldis = self.ldis

        # differentiation matrix
        drds_dev = np.empty((ldis.Np, ldis.Np, 2), dtype=np.float32)
        drds_dev[:, :, 0] = ldis.Dr.T
        drds_dev[:, :, 1] = ldis.Ds.T
        mf = cl.mem_flags
        self.diffmatrices_img = cl.Image(
            self.ctx,
            mf.READ_ONLY | mf.COPY_HOST_PTR,
            cl.ImageFormat(cl.channel_order.RG, cl.channel_type.FLOAT),
            shape=drds_dev.shape[:2],
            hostbuf=drds_dev,
        )

        # geometric coefficients
        drdx_dev = np.empty((self.K, self.dimensions ** 2), dtype=np.float32)
        drdx_dev[:, 0] = self.rx[:, 0]
        drdx_dev[:, 1] = self.ry[:, 0]
        drdx_dev[:, 2] = self.sx[:, 0]
        drdx_dev[:, 3] = self.sy[:, 0]
        self.drdx_dev = cl_array.to_device(self.queue, drdx_dev)

        # lift matrix
        lift_dev = np.zeros((ldis.Np, ldis.Nfp, 4), dtype=np.float32)
        partitioned_lift = ldis.LIFT.reshape(ldis.Np, -1, ldis.Nfaces)

        lift_dev[:, :, : ldis.Nfaces] = partitioned_lift

        self.lift_img = cl.Image(
            self.ctx,
            mf.READ_ONLY | mf.COPY_HOST_PTR,
            cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT),
            shape=(ldis.Nfp, ldis.Np),
            hostbuf=lift_dev,
        )

        # surface info
        surfinfo_dev = np.empty((self.K, 6, ldis.Nafp), dtype=np.float32)

        el_p, face_i_p = divmod(self.vmapP.reshape(-1, ldis.Nafp), ldis.Np)
        el_m, face_i_m = divmod(self.vmapM.reshape(-1, ldis.Nafp), ldis.Np)

        ind_p = el_p * self.block_size + face_i_p
        ind_m = el_m * self.block_size + face_i_m

        surfinfo_dev[:, 0, :] = ind_m
        surfinfo_dev[:, 1, :] = ind_p
        surfinfo_dev[:, 2, :] = self.Fscale
        surfinfo_dev[:, 3, :] = np.where(ind_m == ind_p, -1, 1)
        surfinfo_dev[:, 4, :] = self.nx
        surfinfo_dev[:, 5, :] = self.ny

        self.surfinfo_dev = cl_array.to_device(self.queue, surfinfo_dev)
Example #25
    def test_multiply(self):
        np_in1 = np.arange(10, dtype=np.float32)
        np_in2 = np.arange(10, dtype=np.float32)
        np_out = np_in1 * np_in2

        cl_in1 = cl_array.to_device(self.queue, np_in1)
        cl_out = cl_array.to_device(self.queue, np.zeros(10, dtype=np.float32))
        cl_in2 = cl_array.to_device(self.queue, np_in2)

        self.k.multiply(cl_in1, cl_in2, cl_out)
        self.assertTrue(np.allclose(np_out, cl_out.get()))
Example #26
def _build_block_index(discr,
                       nblks=10,
                       factor=1.0,
                       method='elements',
                       use_tree=True):

    from pytential.linalg.proxy import (
            partition_by_nodes, partition_by_elements)

    if method == 'elements':
        factor = 1.0

    if method == 'nodes':
        nnodes = discr.nnodes
    else:
        nnodes = discr.mesh.nelements
    max_particles_in_box = nnodes // nblks

    # create index ranges
    if method == 'nodes':
        indices = partition_by_nodes(discr,
                                     use_tree=use_tree,
                                     max_nodes_in_box=max_particles_in_box)
    elif method == 'elements':
        indices = partition_by_elements(discr,
                                        use_tree=use_tree,
                                        max_elements_in_box=max_particles_in_box)
    else:
        raise ValueError('unknown method: {}'.format(method))

    # randomly pick a subset of points
    if abs(factor - 1.0) > 1.0e-14:
        with cl.CommandQueue(discr.cl_context) as queue:
            indices = indices.get(queue)

            indices_ = np.empty(indices.nblocks, dtype=np.object)
            for i in range(indices.nblocks):
                iidx = indices.block_indices(i)
                isize = int(factor * len(iidx))
                isize = max(1, min(isize, len(iidx)))

                indices_[i] = np.sort(
                        np.random.choice(iidx, size=isize, replace=False))

            ranges_ = to_device(queue,
                    np.cumsum([0] + [r.shape[0] for r in indices_]))
            indices_ = to_device(queue, np.hstack(indices_))

            indices = BlockIndexRanges(discr.cl_context,
                                       indices_.with_queue(None),
                                       ranges_.with_queue(None))

    return indices
Example #27
def test_multiply_array(ctx_getter):
    """Test the multiplication of two arrays."""

    context = ctx_getter()
    queue = cl.CommandQueue(context)

    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)

    a_gpu = cl_array.to_device(context, queue, a)
    b_gpu = cl_array.to_device(context, queue, a)

    a_squared = (b_gpu*a_gpu).get()

    assert (a*a == a_squared).all()
Example #28
    def send_arrays_to_device(self, field, field_temp,
                              field_interaction, factor):
        """ Move numpy arrays onto compute device. """
        self.shape = field.shape

        self.buf_field = cl_array.to_device(
            self.queue, field.astype(self.np_complex))
        self.buf_temp = cl_array.to_device(
            self.queue, field_temp.astype(self.np_complex))
        self.buf_interaction = cl_array.to_device(
            self.queue, field_interaction.astype(self.np_complex))

        self.buf_factor = cl_array.to_device(
            self.queue, factor.astype(self.np_complex))
Example #29
 def work(x,y,n):
     ctx = cl.create_some_context()
     queue = cl.CommandQueue(ctx)
     arr =cl_array.to_device(queue, numpy.zeros(n).astype(numpy.float16))
     ris = cl_array.to_device(queue,numpy.zeros(1).astype(numpy.float32))
     summ = ElementwiseKernel(ctx,
         "float a,float b, float *x,float *c ",
         "c[0]=a+b ") 
     prod = ElementwiseKernel(ctx,
         "float a,float b, float *x, float *c ",
         "c[0]=a*b ")
     summ (x,y,arr,ris)
     prod(x,y,arr,ris)   
     return ris
Example #30
File: lab1.py Project: spetz911/CL
def alter_sum():
	ctx = cl_init()
	queue = cl.CommandQueue(ctx)

	n = 10**6
	a_gpu = cl_array.to_device(
		    queue, np.random.randn(n).astype(np.float32))
	b_gpu = cl_array.to_device(
		    queue, np.random.randn(n).astype(np.float32))

	cl_sum = cl_array.sum(a_gpu).get()
	numpy_sum = np.sum(a_gpu.get())

	print(cl_sum, numpy_sum)
Example #31
def test_divide_scalar(ctx_factory):
    """Test the division of an array and a scalar."""

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
    a_gpu = cl_array.to_device(queue, a)

    result = (a_gpu / 2).get()
    assert (a / 2 == result).all()

    result = (2 / a_gpu).get()
    assert (np.abs(2 / a - result) < 1e-5).all()
Example #32
    def _allocate_memory(self, mode):
        self.mode = mode or "reflect"
        option_array_names = {
            "allocate_input_array": "data_in",
            "allocate_output_array": "data_out",
            "allocate_tmp_array": "data_tmp",
        }
        # Nonseparable transforms do not need tmp array
        if not (self.separable):
            self.extra_options["allocate_tmp_array"] = False
        # Allocate arrays
        for option_name, array_name in option_array_names.items():
            if self.extra_options[option_name]:
                value = parray.empty(self.queue, self.shape, np.float32)
                value.fill(np.float32(0.0))
            else:
                value = None
            setattr(self, array_name, value)

        if isinstance(self.kernel, np.ndarray):
            self.d_kernel = parray.to_device(self.queue, self.kernel)
        else:
            if not (isinstance(self.kernel, parray.Array)):
                raise ValueError(
                    "kernel must be either numpy array or pyopencl array")
            self.d_kernel = self.kernel
        self._old_input_ref = None
        self._old_output_ref = None
        if self.use_textures:
            self._allocate_textures()
        self._c_modes_mapping = {
            "periodic": 2,
            "wrap": 2,
            "nearest": 1,
            "replicate": 1,
            "reflect": 0,
            "constant": 3,
        }
        mp = self._c_modes_mapping
        if self.mode.lower() not in mp:
            raise ValueError("""
                Mode %s is not available for textures. Available modes are:
                %s
                """ % (self.mode, str(mp.keys())))
        # TODO
        if not (self.use_textures) and self.mode.lower() == "constant":
            raise NotImplementedError(
                "mode='constant' is not implemented without textures yet")
        #
        self._c_conv_mode = mp[self.mode]
Example #33
def test_nan_arithmetic(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    def make_nan_contaminated_vector(size):
        shape = (size, )
        a = np.random.randn(*shape).astype(np.float32)
        from random import randrange
        for i in range(size // 10):
            a[randrange(0, size)] = float('nan')
        return a

    size = 1 << 20

    a = make_nan_contaminated_vector(size)
    a_gpu = cl_array.to_device(queue, a)
    b = make_nan_contaminated_vector(size)
    b_gpu = cl_array.to_device(queue, b)

    ab = a * b
    ab_gpu = (a_gpu * b_gpu).get()

    assert (np.isnan(ab) == np.isnan(ab_gpu)).all()
Example #34
    def process(self, ibuf):
        if isinstance(ibuf, np.ndarray):
            ibuf = cla.to_device(self._queue, ibuf)
        sz = ibuf.shape[0]
        k2 = False
        if len(ibuf.shape) > 1:
            if ibuf.shape[1] == 1:
                pass
            elif ibuf.shape[1] == 2:
                k2 = True
            else:
                raise ValueError('invalid dimensionality')

        max_lag = self._nlags * self._lag_step + self._lag_base
        max_pre = max_lag + self._win_length
        offset = max_pre
        count = sz - offset
        obuf = cla.empty(self._queue,
                         ((count + self._interval - 1) // self._interval,
                          self._nlags // 64),
                         dtype=splice_point)

        if k2:
            ev = self._program.eval_state_2(self._queue, (self._nlags, ),
                                            (64, ),
                                            ibuf.data,
                                            obuf.data,
                                            np.int32(offset),
                                            self._win_length,
                                            self._lag_base,
                                            self._lag_step,
                                            self._interval,
                                            np.int32(count),
                                            wait_for=None)
        else:
            ev = self._program.eval_state_1(self._queue, (self._nlags, ),
                                            (64, ),
                                            ibuf.data,
                                            obuf.data,
                                            np.int32(offset),
                                            self._win_length,
                                            self._lag_base,
                                            self._lag_step,
                                            self._interval,
                                            np.int32(count),
                                            wait_for=None)

        ev.wait()
        return obuf.get()
Example #35
    def solve(self, A, B, x0=None, tol=10e-6, iters=300):
        r""" Solve linear system of equations by a Jacobi
		iterative method.
		@param A Linear system matrix.
		@param B Linear system independent term.
		@param x0 Initial approximation of the solution.
		@param tol Relative error tolerance: \n
		\$ \vert\vert B - A \, x \vert \vert_\infty /
		\vert\vert B \vert \vert_\infty \$
		@param iters Maximum number of iterations.
		"""
        # Create/set OpenCL buffers
        self.setBuffers(A, B, x0)
        # Get dimensions for OpenCL execution
        n = np.uint32(len(B))
        gSize = (clUtils.globalSize(n), )
        # Get a norm to compare against later to validate the result
        B_cl = cl_array.to_device(self.context, self.queue, B)
        bnorm2 = self.dot(B_cl, B_cl).get()
        FreeCAD.Console.PrintMessage(bnorm2)
        FreeCAD.Console.PrintMessage("\n")
        # Iterate while the result converges or maximum number
        # of iterations is reached.
        for i in range(0, iters):
            # Compute residues
            kernelargs = (self.A, self.B, self.X0, self.R.data, n)
            # Test if the final result has been reached
            self.program.r(self.queue, gSize, None, *(kernelargs))
            rnorm2 = self.dot(self.R, self.R).get()
            FreeCAD.Console.PrintMessage("\t")
            FreeCAD.Console.PrintMessage(rnorm2)
            FreeCAD.Console.PrintMessage("\n")
            if np.sqrt(rnorm2 / bnorm2) <= tol:
                break
            # Iterate
            kernelargs = (self.A, self.R.data, self.AR.data, n)
            self.program.dot_mat_vec(self.queue, gSize, None, *(kernelargs))
            AR_R = self.dot(self.AR, self.R).get()
            AR_AR = self.dot(self.AR, self.AR).get()
            kernelargs = (self.A, self.R.data, self.X, self.X0, AR_R, AR_AR, n)
            self.program.minres(self.queue, gSize, None, *(kernelargs))
            # Swap variables
            swap = self.X
            self.X = self.X0
            self.X0 = swap
        # Return result computed
        x = np.zeros((n), dtype=np.float32)
        cl.enqueue_read_buffer(self.queue, self.X0, x).wait()
        return (x, np.sqrt(rnorm2 / bnorm2), i)
Example #36
def test_divide_inplace_array(ctx_factory):
    """Test inplace division of arrays."""

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = (np.uint8, np.uint16, np.uint32,
                  np.int8, np.int16, np.int32,
                  np.float32, np.complex64)
    from pyopencl.characterize import has_double_support
    if has_double_support(queue.device):
        dtypes = dtypes + (np.float64, np.complex128)

    from itertools import product

    for dtype_a, dtype_b in product(dtypes, repeat=2):
        print(dtype_a, dtype_b)
        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b)

        a_gpu = cl_array.to_device(queue, a)
        b_gpu = cl_array.to_device(queue, b)

        # ensure the same behavior as inplace numpy.ndarray division
        try:
            a_gpu /= b_gpu
        except TypeError:
            # pass for now, as numpy casts differently for in-place and out-place
            # true_divide
            pass
            # with np.testing.assert_raises(TypeError):
            #     a /= b
        else:
            a /= b
            assert (np.abs(a_gpu.get() - a) < 1e-3).all()
            assert a_gpu.dtype is a.dtype
Example #37
def test_ones_matrix_arange_vector():
    inp_layer = np.arange(inp_size).astype(np.float32)
    inp_layer = pycl_array.to_device(clsingle.queue, inp_layer)
    matrix = clsingle.ones((out_size, inp_size))
    out_layer = clsingle.zeros(out_size)

    code.program.matrix_vector_mul(clsingle.queue, (out_size, TS), (WPT, TS),
                                   inp_size, RESET_OUTPUT, inp_layer.data,
                                   matrix.data, out_layer.data)

    approx_val = pytest.approx(np.sum(inp_layer.get()))
    out_layer = out_layer.get()

    for i in range(out_size):
        assert out_layer[i] == approx_val
Example #38
def shuffle(x_data, rows, cols):
    """
    With an odd number of rows, one row will be left unshuffled
    :param x_data:
    :param rows:
    :param cols:
    :param swaps_g:
    :return:
    """
    swaps_np = np.arange(rows, dtype=cltypes.uint)
    np.random.shuffle(swaps_np)
    swaps_g = array.to_device(queue, swaps_np, allocator=read_only_arr)
    e1 = shuffle_krnl(queue, (cols, len(swaps_np) // 2), None, x_data, swaps_g.data)
    e1.wait()
    return swaps_g
Example #39
    def cg_solve(self, x, iters):
        x = clarray.to_device(self.queue, np.require(x, requirements="C"))
        b = clarray.empty(self.queue,
                          (self.NScan, 1, self.NSlice, self.dimY, self.dimX),
                          DTYPE, "C")
        Ax = clarray.empty(self.queue,
                           (self.NScan, 1, self.NSlice, self.dimY, self.dimX),
                           DTYPE, "C")
        data = clarray.to_device(self.queue, self.data)

        self.operator_rhs(b, data)
        res = b
        p = res
        delta = np.linalg.norm(res.get())**2/np.linalg.norm(b.get())**2
        self.res.append(delta)
        print("Initial Residuum: ", delta)

        for i in range(iters):
            self.operator_lhs(Ax, p)
            Ax = Ax + self.reco_par["lambd"]*p
            alpha = (clarray.vdot(res, res)/(clarray.vdot(p, Ax))).real.get()
            x[i+1] = (x[i] + alpha*p)
            res_new = res - alpha*Ax
            delta = np.linalg.norm(res_new.get())**2/np.linalg.norm(b.get())**2
            self.res.append(delta)
            if delta < self.reco_par["tol"]:
                print("Converged after %i iterations to %1.3e." % (i, delta))
                return x.get()[:i+1, ...]
            if not np.mod(i, 1):
                print("Residuum at iter %i : %1.3e" % (i, delta), end='\r')

            beta = (clarray.vdot(res_new, res_new) /
                    clarray.vdot(res, res)).real.get()
            p = res_new+beta*p
            (res, res_new) = (res_new, res)
        return x.get()
Example #40
def test_outoforderqueue_reductions(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")
    # 0/1 values to avoid accumulated rounding error
    a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32'))
    a[800000] = 10  # all<5 looks true until near the end
    a_gpu = cl_array.to_device(queue, a)
    b1 = cl_array.sum(a_gpu).get()
    b2 = cl_array.dot(a_gpu, 3 - a_gpu).get()
    b3 = (a_gpu < 5).all().get()
    assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0
Example #41
    def test_adj_inplace(self):
        inpgrad = clarray.to_device(self.queue, self.symgradin)
        inpdiv = clarray.to_device(self.queue, self.symdivin)

        outgrad = clarray.zeros_like(inpdiv)
        outdiv = clarray.zeros_like(inpgrad)

        outgrad.add_event(self.symgrad.fwd(outgrad, inpgrad))
        outdiv.add_event(self.symgrad.adj(outdiv, inpdiv))

        outgrad = outgrad.get()
        outdiv = outdiv.get()

        a1 = np.vdot(outgrad[..., :3].flatten(),
                     self.symdivin[..., :3].flatten())/self.symgradin.size*4
        a2 = 2*np.vdot(outgrad[..., 3:6].flatten(),
                       self.symdivin[..., 3:6].flatten())/self.symgradin.size*4
        a = a1+a2
        b = np.vdot(self.symgradin.flatten(),
                    -outdiv.flatten())/self.symgradin.size*4

        print("Adjointness: %.2e +1j %.2e" % ((a - b).real, (a - b).imag))

        np.testing.assert_allclose(a, b, rtol=RTOL, atol=ATOL)
Example #42
    def test_1d_out_of_place(self, ctx):
        queue = cl.CommandQueue(ctx)
        
        nd_data = np.arange(32, dtype=np.complex64)
        cl_data = cla.to_device(queue, nd_data)
        cl_data_transformed = cla.zeros_like(cl_data)
        
        transform = FFT(ctx, queue,
                        cl_data,
                        cl_data_transformed
        )
        transform.enqueue()

        assert np.allclose(cl_data_transformed.get(),
                           np.fft.fft(nd_data))
Example #43
    def setupClVariables(self):
        self.nrOfDetectionAngleSteps = self.configReader.nrOfDetectionAngleSteps
        self.host_mostRecentMembraneCoordinatesX = np.zeros(
            shape=self.nrOfDetectionAngleSteps, dtype=np.float64)
        self.dev_mostRecentMembraneCoordinatesX = cl_array.to_device(
            self.managementQueue, self.host_mostRecentMembraneCoordinatesX)
        self.host_mostRecentMembraneCoordinatesY = np.zeros(
            shape=self.nrOfDetectionAngleSteps, dtype=np.float64)
        self.dev_mostRecentMembraneCoordinatesY = cl_array.to_device(
            self.managementQueue, self.host_mostRecentMembraneCoordinatesY)

        self.host_mostRecentMembraneNormalVectorsX = np.zeros(
            shape=self.nrOfDetectionAngleSteps, dtype=np.float64)
        self.dev_mostRecentMembraneNormalVectorsX = cl_array.to_device(
            self.managementQueue, self.host_mostRecentMembraneNormalVectorsX)
        self.host_mostRecentMembraneNormalVectorsY = np.zeros(
            shape=self.nrOfDetectionAngleSteps, dtype=np.float64)
        self.dev_mostRecentMembraneNormalVectorsY = cl_array.to_device(
            self.managementQueue, self.host_mostRecentMembraneNormalVectorsY)

        self.host_contourCenter = np.zeros(1, cl.array.vec.double2)
        self.dev_mostRecentContourCenter = cl_array.to_device(
            self.managementQueue, self.host_contourCenter)
        pass
Example #44
    def __init__(self,
                 data: Union[cl.array.Array, list, np.ndarray],
                 gpu: bool = False) -> None:
        """Initialize variables."""

        self._gpu: bool = gpu

        if isinstance(data, list):
            self._data: np.ndarray = np.array(data, dtype=np.float32)

            if self._gpu:
                self._data = clarray.to_device(QUEUE, self._data)

        elif isinstance(data, np.ndarray):
            if data.dtype != np.float32:
                # NOTE: The NumPy array has to be converted into a list first.
                #       Otherwise, the operations on cpu and gpu produce
                #       different results. This behavior can be caused by many
                #       reasons including OpenCL and even the operating system
                #       itself. Some research is needed to figure out cause and
                #       eliminate extra work for rebuilding the array.
                self._data: np.ndarray = np.array(data.tolist(), np.float32)
            else:
                self._data: np.ndarray = data

            if self._gpu:
                self._data = clarray.to_device(QUEUE, self._data)

        elif isinstance(data, cl.array.Array):
            self._data: cl.array.Array = data
            self._gpu: bool = True

        else:
            raise TypeError(
                "Expected `list`, `np.ndarray`, or `pyopencl.array.Array` got "
                f"`{type(data)}`")
Example #45
    def __init__(self, ctx, queue, shape, coeffs):
        '''
        Create context for the Cyclic Reduction Solver
        that solves a "near-toeplitz"
        tridiagonal system with
        diagonals:
        a = (_, ai, ai .... an)
        b[:] = (b1, bi, bi, bi... bn)
        c[:] = (c1, ci, ci, ... _)

        Parameters
        ----------
        ctx: PyOpenCL context
        queue: PyOpenCL command queue
        shape: The size of the tridiagonal system.
        coeffs: A list of coefficients that make up the tridiagonal matrix:
            [b1, c1, ai, bi, ci, an, bn]
        '''
        self.ctx = ctx
        self.queue = queue
        self.device = self.ctx.devices[0]
        self.platform = self.device.platform
        self.nz, self.ny, self.nx = shape
        self.coeffs = coeffs

        mf = cl.mem_flags

        # check that system_size is a power of 2:
        assert int(np.log2(self.nx)) == np.log2(self.nx)

        # compute coefficients a, b, etc.,
        a, b, c, k1, k2, b_first, k1_first, k1_last = self._precompute_coefficients(
        )

        self.a_d = cl_array.to_device(queue, a)
        self.b_d = cl_array.to_device(queue, b)
        self.c_d = cl_array.to_device(queue, c)
        self.k1_d = cl_array.to_device(queue, k1)
        self.k2_d = cl_array.to_device(queue, k2)
        self.b_first_d = cl_array.to_device(queue, b_first)
        self.k1_first_d = cl_array.to_device(queue, k1_first)
        self.k1_last_d = cl_array.to_device(queue, k1_last)

        self.forward_reduction, self.back_substitution = kernels.get_funcs(
            self.ctx, 'kernels.cl', 'globalForwardReduction',
            'globalBackSubstitution')
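For reference, the coefficient list [b1, c1, ai, bi, ci, an, bn] described in the docstring expands to a dense tridiagonal matrix as in the small sketch below (a checking aid only; the solver above never forms this matrix):

import numpy as np

def dense_near_toeplitz(n, coeffs):
    b1, c1, ai, bi, ci, an, bn = coeffs
    A = np.zeros((n, n))
    A[0, 0], A[0, 1] = b1, c1
    for i in range(1, n - 1):
        A[i, i - 1], A[i, i], A[i, i + 1] = ai, bi, ci
    A[-1, -2], A[-1, -1] = an, bn
    return A

print(dense_near_toeplitz(4, [2., -1., 1., -2., 1., 1., 2.]))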
Example #46
def clfftn(data):
    """ OpenCL FFT 3D
    """
    clear_first_arg_caches()
    #ctx = cl.create_some_context(interactive=False)
    #queue = cl.CommandQueue(ctx)
    ctx, queue = clinit()
    plan = Plan(data.shape, normalize=True, queue=queue)
    # forward transform on device
    gpu_data = cl_array.to_device(queue, data)
    # forward transform
    plan.execute(gpu_data.data)
    #result = gpu_data.get()
    result = gpu_data.get()
    return result
Example #47
def mhd_gamma_calc(queue, G, P, loc=Loci.CENT, out=None):
    """Find relativistic gamma-factor w.r.t. normal observer"""
    s = G.slices
    sh = G.shapes

    global g3
    if g3 is None:
        g3 = cl_array.to_device(queue, G.gcov[loc.value, 1:, 1:].copy())

    if out is None:
        out = cl_array.empty(queue, sh.grid_scalar, dtype=np.float64)

    evt, _ = G.dot2geom2(queue, g=g3, u=P[s.U3VEC], v=P[s.U3VEC], out=out)
    out = clm.sqrt(1. + out)
    return out
Example #48
def test_zero_size_array(ctx_factory, empty_shape):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    if queue.device.platform.name == "Intel(R) OpenCL":
        pytest.xfail("size-0 arrays fail on Intel CL")

    a = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    b = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    b.fill(1)
    c = a + b
    c_host = c.get()
    cl_array.to_device(queue, c_host)

    assert c.flags.c_contiguous == c_host.flags.c_contiguous
    assert c.flags.f_contiguous == c_host.flags.f_contiguous

    for order in "CF":
        c_flat = c.reshape(-1, order=order)
        c_host_flat = c_host.reshape(-1, order=order)
        assert c_flat.shape == c_host_flat.shape
        assert c_flat.strides == c_host_flat.strides
        assert c_flat.flags.c_contiguous == c_host_flat.flags.c_contiguous
        assert c_flat.flags.f_contiguous == c_host_flat.flags.f_contiguous
Example #49
 def __init__(self, ary, backend=None):
     self.backend = get_backend(backend)
     self.data = ary
     self._convert = False
     if self.backend == 'opencl':
         use_double = get_config().use_double
         self._dtype = np.float64 if use_double else np.float32
         if np.issubdtype(self.data.dtype, np.floating):
             self._convert = True
         from pyopencl.array import to_device
         from .opencl import get_queue
         self.q = get_queue()
         self.dev = to_device(self.q, self._get_data())
     else:
         self.dev = self.data
Example #50
def project_metaballs_naive(metaballs,
                            shape,
                            pixel_size,
                            offset=None,
                            z_step=None,
                            queue=None,
                            out=None,
                            block=False):
    """Project a list of :class:`.MetaBall` on an image plane with *shape*, *pixel_size*. *z_step*
    is the physical step in the z-dimension, if not specified it is the same as *pixel_size*.
    *offset* is the physical spatial body offset as (y, x). Use OpenCL *queue* and *out* pyopencl
    Array instance for returning the result. If *block* is True, wait for the kernel to finish.
    """
    def get_extrema(sgn):
        func = np.max if sgn > 0 else np.min
        x_ps = util.make_tuple(pixel_size)[1]
        res = [(ball.position[2] + sgn *
                (2 * ball.radius + x_ps)).simplified.magnitude
               for ball in metaballs]

        return func(res)

    if offset is None:
        offset = (0, 0) * q.m
    if not queue:
        queue = cfg.OPENCL.queue
    if out is None:
        out = cl_array.Array(queue, shape, cfg.PRECISION.np_float)

    string = ''.join([body.pack() for body in metaballs])
    data = np.fromstring(string, dtype=np.float32)
    data = cl_array.to_device(queue, data)
    n, m = shape
    ps = util.make_tuple(pixel_size.simplified.magnitude)
    z_step = ps[1] if z_step is None else z_step.simplified.magnitude

    z_range = get_extrema(-1), get_extrema(1)
    offset = g_util.make_vfloat2(*offset.simplified.magnitude[::-1])

    ev = cfg.OPENCL.programs['geometry'].naive_metaballs(
        cfg.OPENCL.queue, (m, n), None, out.data, data.data,
        np.int32(len(metaballs)), offset, g_util.make_vfloat2(*z_range),
        cfg.PRECISION.np_float(z_step), g_util.make_vfloat2(*ps[::-1]),
        np.int32(True))
    if block:
        ev.wait()

    return out
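A hedged usage sketch, assuming the syris-style environment used above is already initialised (cfg.OPENCL set up) and that ball_a and ball_b are MetaBall instances from the surrounding package; the numbers are illustrative only.

import quantities as q

# Hypothetical call: project two metaballs onto a 512 x 512 plane with 1 um pixels.
projection = project_metaballs_naive([ball_a, ball_b], (512, 512), 1 * q.um, block=True)
image = projection.get()  # projected thickness map on the host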
Example #51
0
def test_hankel_01_complex(ctx_factory, ref_src):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if not has_double_support(ctx.devices[0]):
        from pytest import skip
        skip(
            "no double precision support--cannot test complex bessel function")

    n = 10**6
    np.random.seed(11)
    z = (np.logspace(-5, 2, n) * np.exp(1j * 2 * np.pi * np.random.rand(n)))

    def get_err(check, ref):
        return np.max(np.abs(check - ref)) / np.max(np.abs(ref))

    if ref_src == "pyfmmlib":
        pyfmmlib = pytest.importorskip("pyfmmlib")
        h0_ref, h1_ref = pyfmmlib.hank103_vec(z, ifexpon=1)
    elif ref_src == "scipy":
        spec = pytest.importorskip("scipy.special")
        h0_ref = spec.hankel1(0, z)
        h1_ref = spec.hankel1(1, z)

    else:
        raise ValueError("ref_src")

    z_dev = cl_array.to_device(queue, z)

    h0_dev, h1_dev = clmath.hankel_01(z_dev)

    rel_err_h0 = np.abs(h0_dev.get() - h0_ref) / np.abs(h0_ref)
    rel_err_h1 = np.abs(h1_dev.get() - h1_ref) / np.abs(h1_ref)

    max_rel_err_h0 = np.max(rel_err_h0)
    max_rel_err_h1 = np.max(rel_err_h1)

    print("H0", max_rel_err_h0)
    print("H1", max_rel_err_h1)

    assert max_rel_err_h0 < 4e-13
    assert max_rel_err_h1 < 2e-13

    if 0:
        import matplotlib.pyplot as pt
        pt.loglog(np.abs(z), rel_err_h0)
        pt.loglog(np.abs(z), rel_err_h1)
        pt.show()
Example #52
0
    def step(self, delta_time):

        if pause:
            return

        centers = np.ndarray((self.galaxy_count, 4), dtype=np.float32)
        for i in range(self.galaxy_count):
            centers[i][:3] = self.galaxies[i].position
            centers[i][3] = self.galaxies[i].mass

        centers_buffer = clarray.to_device(cl_queue, centers)

        gl.glFlush()
        gl.glFinish()

        for i, galaxy in enumerate(self.galaxies):
            cl.enqueue_acquire_gl_objects(cl_queue, [
                galaxy.body_positions_cl_buffer,
                galaxy.body_velocities_cl_buffer
            ])
            kernel_step(cl_queue, (galaxy.body_count, ), None,
                        galaxy.body_positions_cl_buffer,
                        galaxy.body_velocities_cl_buffer, centers_buffer.data,
                        np.uint(galaxy.body_count), np.uint(self.galaxy_count),
                        np.float32(self.dt * delta_time), np.float32(self.G))
            cl.enqueue_release_gl_objects(cl_queue, [
                galaxy.body_positions_cl_buffer,
                galaxy.body_velocities_cl_buffer
            ])
            cl_queue.finish()

        centers = [
            mathutils.Vector((galaxy.position.x, galaxy.position.y,
                              galaxy.position.z, galaxy.mass))
            for galaxy in self.galaxies
        ]

        for i in range(self.galaxy_count):
            this_galaxy = self.galaxies[i]
            f = mathutils.Vector((0, 0, 0))
            for j in self.others(i):
                delta_pos = mathutils.Vector(centers[j] - centers[i]).xyz
                # softened squared distance, so close encounters do not blow up the force
                length_sq = max(1.0, delta_pos.length_squared)
                f += (delta_pos.normalized() * self.G * centers[i][3] *
                      centers[j][3] / length_sq)
            this_galaxy.velocity += f * delta_time * self.dt
            this_galaxy.position += this_galaxy.velocity * delta_time * self.dt
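The host-side loop above accumulates the softened pairwise attraction between galaxy centres; a vectorized numpy sketch of the same idea follows, assuming positions of shape (n, 3) and masses of shape (n,). It is illustrative only and folds the softening into the full r^2 term.

import numpy as np

def center_forces(positions, masses, G):
    """Pairwise gravitational forces between centres with a softened squared distance."""
    delta = positions[None, :, :] - positions[:, None, :]  # r_j - r_i for every pair
    dist_sq = np.maximum(1.0, np.sum(delta**2, axis=-1))   # softened |r|^2
    np.fill_diagonal(dist_sq, np.inf)                      # exclude self-interaction
    coeff = G * masses[:, None] * masses[None, :] / dist_sq**1.5
    return np.sum(coeff[..., None] * delta, axis=1)        # net force on each centre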
Example #53
0
    def setUp(self):
        parser = tmpArgs()
        parser.streamed = False
        parser.devices = -1
        parser.use_GPU = True

        par = {}
        pyqmri.pyqmri._setupOCL(parser, par)
        setupPar(par)
        if DTYPE == np.complex128:
            file = resource_filename(
                        'pyqmri', 'kernels/OpenCL_Kernels_double.c')
        else:
            file = resource_filename(
                        'pyqmri', 'kernels/OpenCL_Kernels.c')

        prg = []
        for j in range(len(par["ctx"])):
            with open(file) as myfile:
                prg.append(Program(
                    par["ctx"][j],
                    myfile.read()))
        prg = prg[0]

        self.op = pyqmri.operator.OperatorImagespace(
            par, prg,
            DTYPE=DTYPE,
            DTYPE_real=DTYPE_real)
        self.opinfwd = np.random.randn(par["unknowns"], par["NSlice"],
                                       par["dimY"], par["dimX"]) +\
            1j * np.random.randn(par["unknowns"], par["NSlice"],
                                 par["dimY"], par["dimX"])
        self.opinadj = np.random.randn(par["NScan"], 1, par["NSlice"],
                                       par["dimY"], par["dimX"]) +\
            1j * np.random.randn(par["NScan"], 1, par["NSlice"],
                                 par["dimY"], par["dimX"])
        self.model_gradient = np.random.randn(par["unknowns"], par["NScan"],
                                              par["NSlice"],
                                              par["dimY"], par["dimX"]) + \
            1j * np.random.randn(par["unknowns"], par["NScan"],
                                 par["NSlice"],
                                 par["dimY"], par["dimX"])

        self.model_gradient = self.model_gradient.astype(DTYPE)
        self.opinfwd = self.opinfwd.astype(DTYPE)
        self.opinadj = self.opinadj.astype(DTYPE)
        self.queue = par["queue"][0]
        self.grad_buf = clarray.to_device(self.queue, self.model_gradient)
Example #54
0
def transfer(thickness,
             refractive_index,
             wavelength,
             exponent=False,
             queue=None,
             out=None,
             check=True,
             block=False):
    """Transfer *thickness* (can be either a numpy or pyopencl array) with *refractive_index* and
    given *wavelength*. If *exponent* is True, compute the exponent of the function without applying
    the wavenumber. Use command *queue* for computation and *out* pyopencl array. If *block* is
    True, wait for the kernel to finish. If *check* is True, the function is checked for aliasing
    artefacts. Returned *out* array is different from the input one because of the pyopencl.clmath
    behavior.
    """
    if queue is None:
        queue = cfg.OPENCL.queue

    if isinstance(thickness, cl_array.Array):
        thickness_mem = thickness
    else:
        prep = thickness.simplified.magnitude.astype(cfg.PRECISION.np_float)
        thickness_mem = cl_array.to_device(queue, prep)

    if out is None:
        out = cl_array.Array(queue, thickness_mem.shape, cfg.PRECISION.np_cplx)

    if exponent or check:
        wavenumber = cfg.PRECISION.np_float(2 * np.pi /
                                            wavelength.simplified.magnitude)
        ev = cfg.OPENCL.programs['physics'].transmission_add(
            queue, thickness_mem.shape[::-1], None,
            out.data, thickness_mem.data,
            cfg.PRECISION.np_cplx(refractive_index), wavenumber, np.int32(1))
        if check and not is_wavefield_sampling_ok(out, queue=queue):
            LOG.error('Insufficient transmission function sampling')
        if not exponent:
            # Apply the exponent
            out = clmath.exp(out, queue=queue)
    else:
        ev = cfg.OPENCL.programs['physics'].transfer(
            queue, thickness_mem.shape[::-1], None, out.data,
            thickness_mem.data, cfg.PRECISION.np_cplx(refractive_index),
            cfg.PRECISION.np_float(wavelength.simplified.magnitude))
    if block:
        ev.wait()

    return out
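A hedged usage sketch of transfer, assuming the syris-style configuration above is already initialised; the thickness map, refractive index and wavelength are illustrative values only.

import numpy as np
import quantities as q

# Hypothetical inputs: a flat 10 um thickness map and a weakly absorbing material.
thickness = 10 * np.ones((256, 256)) * q.um
ref_index = 1e-7 + 1e-10j
u = transfer(thickness, ref_index, 0.5 * q.angstrom, block=True)
print(u.get()[0, 0])  # complex transmission at one pixel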
Example #55
0
def test_outoforderqueue_copy(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")
    a = np.random.rand(10**6).astype(np.dtype('float32'))
    a_gpu = cl_array.to_device(queue, a)
    c_gpu = a_gpu**2 - 7
    b_gpu = c_gpu.copy()  # testing that this waits for and creates events
    b_gpu *= 10
    queue.finish()
    b1 = b_gpu.get()
    b = 10 * (a**2 - 7)
    assert np.abs(b1 - b).mean() < 1e-5
Example #56
0
    def probabilities(self):
        """Gets the squared absolute value of each of the amplitudes"""
        out = pycl_array.to_device(
            self.queue,
            np.zeros(2**self.num_qubits, dtype=np.float32)
        )

        program.calculate_probabilities(
            self.queue,
            out.shape,
            None,
            self.buffer.data,
            out.data
        )

        return out.get()
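The same quantity can be sanity-checked on the host: the probabilities are simply the squared magnitudes of the state-vector amplitudes. A small self-contained sketch, with an assumed two-qubit state:

import numpy as np

# |psi> = (|00> + |11>) / sqrt(2): the probabilities must be [0.5, 0, 0, 0.5].
amplitudes = np.array([1 / np.sqrt(2), 0, 0, 1 / np.sqrt(2)], dtype=np.complex64)
probabilities = np.abs(amplitudes) ** 2
assert np.isclose(probabilities.sum(), 1.0)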
Example #57
0
    def _init_reference_field(self, scale_ref=1):
        # clear object patches
        for subfield, mask in zip(self.object_multiareafield.subfields,
                                  self.object_multiareafield.subfields_masks):
            np.copyto(subfield.field, 0, where=mask)

        # obtain reference field
        self.propagator_object_to_farfield.propagator_full_field.propagate()

        self.object_field_ref = self.object_multiareafield.field.copy()
        self.far_field.field *= scale_ref
        self.far_field_ref = self.far_field.copy()

        self.cl_far_field_ref = cla.to_device(self.cl_queue,
                                              self.far_field_ref.field,
                                              allocator=self.cl_allocator)
Example #58
0
    def test_grad_outofplace(self):
        gradx = np.zeros_like(self.gradin)
        grady = np.zeros_like(self.gradin)
        gradz = np.zeros_like(self.gradin)

        gradx[..., :-1] = np.diff(self.gradin, axis=-1)
        grady[..., :-1, :] = np.diff(self.gradin, axis=-2)
        gradz[:, :-1, ...] = np.diff(self.gradin, axis=-3) * self.dz

        grad = np.stack((gradx, grady, gradz), axis=-1)

        inp = clarray.to_device(self.queue, self.gradin)
        outp = self.grad.fwdoop(inp)
        outp = outp.get()

        np.testing.assert_allclose(outp[..., :-1], grad, rtol=RTOL, atol=ATOL)
Example #59
0
def test_outoforderqueue_clmath(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
                                properties=cl.command_queue_properties.
                                OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")
    a = np.random.rand(10**6).astype(np.dtype('float32'))
    a_gpu = cl_array.to_device(queue, a)
    # testing that clmath functions wait for and create events
    b_gpu = clmath.fabs(clmath.sin(a_gpu * 5))
    queue.finish()
    b1 = b_gpu.get()
    b = np.abs(np.sin(a * 5))
    assert np.abs(b1 - b).mean() < 1e-5
Example #60
0
def test_identity_matrix_random_vector():
    inp_layer = clsingle.random(inp_size)
    matrix = [np.arange(out_size) == i for i in range(out_size)]
    matrix = np.array(matrix).astype(np.float32)
    matrix = pycl_array.to_device(clsingle.queue, matrix)
    out_layer = clsingle.ones(out_size)

    code.program.matrix_vector_mul(clsingle.queue, (out_size, TS), (WPT, TS),
                                   inp_size, RESET_OUTPUT, inp_layer.data,
                                   matrix.data, out_layer.data)

    out_layer = out_layer.get()
    inp_layer = inp_layer.get()

    for i in range(out_size):
        assert out_layer[i] == pytest.approx(inp_layer[i])
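The property being tested can also be checked on the host, independent of the OpenCL kernel; a minimal numpy sketch with an assumed vector length:

import numpy as np

# Multiplying by an identity matrix must return the input vector unchanged.
v = np.random.rand(8).astype(np.float32)
assert np.allclose(np.eye(8, dtype=np.float32) @ v, v)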