Example 1
def kspacegaussian_filter_pyfftCL(ksp, sigma):
    """Multiply complex64 k-space data by a Gaussian window on the GPU,
    then inverse-FFT it back to image space."""
    clear_first_arg_caches()
    sz = ksp.shape
    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)
    data_dev = cl_array.to_device(queue, ksp)
    FACTOR = 1.0
    program = cl.Program(ctx, """
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#include "pyopencl-complex.h" 
__kernel void gauss_kernel(__global cfloat_t *dest)
{
  uint x = get_global_id(0);uint y = get_global_id(1);uint z = get_global_id(2);
  uint dim1= %d;
  uint dim2= %d;
  uint dim3= %d;                    
  float sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  float factor = %f;            
  float TWOPISQ = 19.739208802178716f;  // 2 * pi^2

  ulong idx = z*dim1*dim2 + y*dim1 + x;
  // Centred, normalised frequency coordinate along each axis
  float i = (float)(x);
      i = (i - (float)floor((float)(dim1)/2.0f))/(float)(dim1);
  float j = (float)(y);
      j = (j - (float)floor((float)(dim2)/2.0f))/(float)(dim2);
  float k = (float)(z);
      k = (k - (float)floor((float)(dim3)/2.0f))/(float)(dim3);

  float weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = dest[idx].x * weight;
  dest[idx].y = dest[idx].y * weight; 
  
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR)).build()
    gauss_kernel = program.gauss_kernel
    # Apply the Gaussian weighting in place on the device
    gauss_kernel(queue, sz, None, data_dev.data).wait()
    # Inverse FFT on the same queue; the plan shape must match the data
    # (pyfft requires power-of-two dimensions)
    plan = Plan(sz, normalize=True, queue=queue)
    plan.execute(data_dev.data, inverse=True)
    result = data_dev.get()
    result = np.fft.fftshift(result)
    queue.finish()
    return result
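A hedged usage sketch for the filter above; the 256-cubed shape and unit sigma are illustrative only, and pyfft requires every dimension to be a power of two:

import numpy as np

shape = (256, 256, 256)
ksp = (np.random.rand(*shape) + 1j * np.random.rand(*shape)).astype(np.complex64)
sigma = np.ones(3)  # one Gaussian width per axis
image = kspacegaussian_filter_pyfftCL(ksp, sigma)
print(np.abs(image).max())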
Example 2
    def __init__(self, in_scale, in_matrix, queue):
        self.scale = tuple(in_scale)
        # Create the FFT plan for this scale
        self.plan = Plan(self.scale, queue=queue)
        # Transfer the input matrix to the device
        self.data = in_matrix
        self.gpu_data = cl_array.to_device(queue, self.data)
Example 3
    def test_fft(self):
        data = gpu_util.get_array(np.random.normal(100, 100,
                                                   size=(4, 4)).astype(cfg.PRECISION.np_float))
        orig = gpu_util.get_host(data)
        data = ip.fft_2(data)
        ip.ifft_2(data)
        np.testing.assert_almost_equal(orig, data.get().real, decimal=4)

        # With a plan
        from pyfft.cl import Plan
        plan = Plan((4, 4), queue=cfg.OPENCL.queue)
        data = ip.fft_2(np.copy(orig), plan=plan)
        ip.ifft_2(data, plan=plan)
        np.testing.assert_almost_equal(orig, data.get().real, decimal=4)

        # Test double precision
        syris.init(double_precision=True, device_index=0)
        data = gpu_util.get_array(np.random.normal(100, 100,
                                                   size=(4, 4)).astype(cfg.PRECISION.np_float))
        gt = np.fft.fft2(data.get())
        data = ip.fft_2(data)
        np.testing.assert_almost_equal(gt, data.get(), decimal=4)

        gt = np.fft.ifft2(data.get())
        data = ip.ifft_2(data)
        np.testing.assert_almost_equal(gt, data.get(), decimal=4)
Example 4
    def __call__(self, domain, field):
        # Set up the plan for calculating fast Fourier transforms:
        self.plan = Plan(domain.total_samples, queue=self.queue)

        field_temp = np.empty_like(field)
        field_interaction = np.empty_like(field)

        from pyofss.modules.linearity import Linearity
        dispersion = Linearity(beta=[0.0, 0.0, 0.0, 1.0], sim_type="default")
        factor = dispersion(domain)

        self.send_arrays_to_device(
            field, field_temp, field_interaction, factor)

        stepsize = self.length / self.total_steps
        zs = np.linspace(0.0, self.length, self.total_steps + 1)

        #start = time.clock()
        for z in zs[:-1]:
            self.cl_rk4ip(self.buf_field, self.buf_temp,
                          self.buf_interaction, self.buf_factor, stepsize)
        #stop = time.clock()

        #cl_result = self.buf_field.get()
        #print("cl_result: %e" % ((stop - start) / 1000.0))

        return self.buf_field.get()
Example 5
    def __init__(self, total_samples, dorf, length=None, total_steps=None,
                 name="ocl_fibre"):
        self.name = name

        self.queue = None
        self.np_float = None
        self.np_complex = None
        self.prg = None
        self.cl_initialise(dorf)

        self.plan = Plan(total_samples, queue=self.queue)

        self.buf_field = None
        self.buf_temp = None
        self.buf_interaction = None
        self.buf_factor = None

        self.shape = None

        self.cached_factor = False
        # Force usage of cached version of function:
        self.cl_linear = self.cl_linear_cached

        self.length = length
        self.total_steps = total_steps
Example 6
def gs_mod_gpu(idata, itera=10, osize=256):

    cut = osize // 2

    pl = cl.get_platforms()[0]
    devices = pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue,
                dtype=complex128)  # does not work with "complex128"

    src = str(
        Template(KERNEL).render(
            double_support=all(has_double_support(dev) for dev in devices),
            amd_double_support=all(
                has_amd_double_support(dev) for dev in devices)))
    prg = cl.Program(ctx, src).build()

    idata_gpu = cl_array.to_device(queue,
                                   ifftshift(idata).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data, fdata_gpu.data)

    mask = exp(2.j * pi * random(idata.shape))
    # Zero the central window; the hard-coded 512 assumes a 1024x1024 idata
    mask[512 - cut:512 + cut, 512 - cut:512 + cut] = 0

    idata_gpu = cl_array.to_device(
        queue,
        ifftshift(idata + mask).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    error_gpu = cl_array.to_device(queue,
                                   zeros(idata_gpu.shape).astype("double"))
    plan.execute(idata_gpu.data, fdata_gpu.data)

    e = 1000
    ea = 1000
    for i in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)
        #~ prg.norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut))
        norm1 = prg.norm1
        norm1.set_scalar_arg_dtypes([None, None, None, int32])
        norm1(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data,
              error_gpu.data, int32(cut))

        e = sqrt(cl_array.sum(error_gpu).get()) / (2 * cut)

        #~ if e>ea:
        #~
        #~ break
        #~ ea=e
        plan.execute(rdata_gpu.data, fdata_gpu.data)

    fdata = fdata_gpu.get()
    fdata = ifftshift(fdata)
    fdata = exp(1.j * angle(fdata))
    return fdata
Example 7
def _get_plan(itype, otype, inlen):
    try:
        theplan = _plans[(itype, otype, inlen)]
    except KeyError:
        theplan = Plan(inlen, dtype=itype, queue=pycbc.scheme.mgr.state.queue,
                       normalize=False, fast_math=True)
        _plans.update({(itype, otype, inlen): theplan})

    return theplan
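The try/except pattern above builds a plan only on the first request for a given (itype, otype, inlen) key and reuses it afterwards. A minimal standalone sketch of the same caching idea, assuming only pyfft and pyopencl (the queue handling here is illustrative, not pycbc's scheme manager):

import numpy as np
import pyopencl as cl
from pyfft.cl import Plan

_plans = {}

def get_cached_plan(dtype, inlen, queue):
    # Build the plan once per (dtype, length); later calls reuse it
    key = (dtype, inlen)
    if key not in _plans:
        _plans[key] = Plan(inlen, dtype=dtype, queue=queue, normalize=False)
    return _plans[key]

ctx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(ctx)
p1 = get_cached_plan(np.complex64, 1024, queue)
p2 = get_cached_plan(np.complex64, 1024, queue)
assert p1 is p2  # same plan object, no rebuild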
Example 8
def _ocl_fft_gpu_inplace(ocl_arr, inverse=False, plan=None):

    assert_bufs_type(np.complex64, ocl_arr)

    if plan is None:
        plan = Plan(ocl_arr.shape, queue=get_device().queue)

    plan.execute(ocl_arr.data, ocl_arr.data, inverse=inverse)
Example 9
    def __init__(self, in_size, kernel_size, batch_size, context, queue):
        self.sizes = []
        for i in xrange(len(in_size)):
            self.sizes.append(get_power_of_two(in_size[i] + kernel_size[i] +
                                               1))
        self.sizes = tuple(self.sizes)

        self.ctx = context
        self.queue = queue
        self.plan = Plan(self.sizes, queue=self.queue)
        self.in_array = cl.array.zeros(
            self.queue,
            (batch_size, self.sizes[0], self.sizes[1], self.sizes[2]),
            numpy.complex64)
        self.kernel = cl.array.zeros(
            self.queue,
            (batch_size, self.sizes[0], self.sizes[1], self.sizes[2]),
            numpy.complex64)
        self.result_buffer = numpy.zeros(self.in_array.shape, numpy.complex64)
        self.kernel_center = []
        for i in xrange(len(kernel_size)):
            self.kernel_center.append(kernel_size[i] / 2)
        self.kernel_center = tuple(self.kernel_center)

        self.halves = []
        for i in xrange(len(kernel_size)):
            self.halves.append(numpy.ceil(kernel_size[i] / 2.0))
        self.halves = tuple(self.halves)

        self.padding_locations = []
        for i in xrange(len(self.sizes)):
            # without this check, even-sized kernels produce an incorrect edge in the result
            if kernel_size[i] % 2 == 0:
                self.padding_locations.append(
                    -1 * ((in_size[i] - self.sizes[i]) / 2))
                self.padding_locations.append(
                    -1 * ((self.sizes[i] - in_size[i]) / 2))
            else:
                self.padding_locations.append((self.sizes[i] - in_size[i]) / 2)
                self.padding_locations.append((in_size[i] - self.sizes[i]) / 2)
        self.padding_locations = tuple(self.padding_locations)

        self.valid_locations = []
        for i in xrange(len(self.sizes)):
            self.valid_locations.append(self.padding_locations[(i * 2)] +
                                        self.halves[i] - 1)
            self.valid_locations.append(self.padding_locations[(i * 2)] +
                                        self.halves[i] + in_size[i] -
                                        kernel_size[i])
        self.valid_locations = tuple(self.valid_locations)

        self.full_locations = []
        for i in xrange(len(self.sizes)):
            offset = self.sizes[i] - (in_size[i] + kernel_size[i] - 1)
            self.full_locations.append(offset / 2)
            self.full_locations.append(-offset / 2)

        self.batch_size = batch_size
Example 10
def clifftn(data):
    """ OpenCL inverse FFT 3D
    """
    clear_first_arg_caches()
    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)
    plan = Plan(data.shape, normalize=True, queue=queue)
    # Transfer to the device, then run the inverse transform in place
    gpu_data = cl_array.to_device(queue, data)
    plan.execute(gpu_data.data, inverse=True)
    result = gpu_data.get()
    return result
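A quick round-trip check pairing this helper with the forward clfftn from Example 13 below; the shape is a power of two as pyfft requires, and the tolerance is an assumption:

import numpy as np

data = np.random.rand(64, 64, 64).astype(np.complex64)
spectrum = clfftn(data)       # forward 3D FFT on the device (Example 13)
restored = clifftn(spectrum)  # inverse 3D FFT on the device
print(np.allclose(data, restored, atol=1e-4))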
Example 11
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):

    assert_bufs_type(np.complex64, ocl_arr)

    if plan is None:
        plan = Plan(ocl_arr.shape, queue=get_device().queue)

    if res_arr is None:
        res_arr = OCLArray.empty(ocl_arr.shape, np.complex64)

    plan.execute(ocl_arr.data, res_arr.data, inverse=inverse)

    return res_arr
Example 12
def _ocl_fft_numpy(arr, inverse=False, plan=None):
    if plan is None:
        plan = Plan(arr.shape, queue=get_device().queue)

    if arr.dtype != np.complex64:
        logger.info("converting %s to complex64, might slow things down..." %
                    arr.dtype)

    ocl_arr = OCLArray.from_array(arr.astype(np.complex64, copy=False))

    _ocl_fft_gpu_inplace(ocl_arr, inverse=inverse, plan=plan)

    return ocl_arr.get()
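A hedged usage sketch for the numpy-level wrapper above, assuming the gputools-style get_device() setup used by these helpers; a float32 input is cast to complex64, as the log message warns:

import numpy as np

arr = np.random.rand(128, 128).astype(np.float32)  # cast to complex64 inside
spectrum = _ocl_fft_numpy(arr)                     # forward FFT, returns a numpy array
restored = _ocl_fft_numpy(spectrum, inverse=True).real
# If the default plan does not normalize the inverse, rescale:
# restored /= arr.size
print(np.allclose(arr, restored, atol=1e-4))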
Example 13
def clfftn(data):
    """ OpenCL FFT 3D
    """
    clear_first_arg_caches()
    ctx, queue = clinit()
    plan = Plan(data.shape, normalize=True, queue=queue)
    # copy data to the device
    gpu_data = cl_array.to_device(queue, data)
    # forward transform in place
    plan.execute(gpu_data.data)
    result = gpu_data.get()
    return result
Example 14
def _fft_2(data, inverse=False, plan=None, queue=None, block=True):
    """Execute FFT on *data*, which is first converted to a pyopencl array and retyped to
    complex.
    """
    data = g_util.get_array(data, queue=queue)
    if data.dtype != cfg.PRECISION.np_cplx:
        data = data.astype(cfg.PRECISION.np_cplx)

    if not plan:
        if not queue:
            queue = cfg.OPENCL.queue
        if queue not in cfg.OPENCL.fft_plans:
            cfg.OPENCL.fft_plans[queue] = {}
        if data.shape not in cfg.OPENCL.fft_plans[queue]:
            LOG.debug('Creating FFT Plan for {} and shape {}'.format(queue, data.shape))
            cfg.OPENCL.fft_plans[queue][data.shape] = Plan(data.shape,
                                                           dtype=cfg.PRECISION.np_cplx,
                                                           queue=queue)
        plan = cfg.OPENCL.fft_plans[queue][data.shape]

    LOG.debug('fft_2, shape: %s, inverse: %s', data.shape, inverse)
    plan.execute(data.data, inverse=inverse, wait_for_finish=block)

    return data
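A hedged usage sketch for the cached-plan wrapper above, assuming syris has been initialised so that cfg.OPENCL.queue and cfg.PRECISION are populated (as in the test in Example 3):

import numpy as np

data = np.random.rand(256, 256)            # retyped to the complex precision inside
spectrum = _fft_2(data)                    # forward FFT; plan created and cached
restored = _fft_2(spectrum, inverse=True)  # inverse FFT; the cached plan is reused
print(np.allclose(data, restored.get().real, atol=1e-4))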
Example 15
imggauss = kspacegaussian_filter_CL(ksp, np.ones(3), ctx)
print 'PyFFT + OpenCL Gaussian filter:'
toc()
tic()
print 'Complex K-space filter + Numpy IFFT'
kspgauss2 = KSP.kspacegaussian_filter2(ksp, 1)
image_filtered = simpleifft(procpar, dims, hdr, kspgauss2, args)
toc()

# PYFFT

tic()
ctx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(ctx)
w = h = k = 512
plan = Plan((w, h, k), normalize=True, queue=queue)
gpu_data = cl_array.to_device(queue, ksp)
plan.execute(gpu_data.data, inverse=True)
result = gpu_data.get()
toc()
result = np.fft.fftshift(result)
print "PyFFT OpenCL IFFT time and first three results:"
print "%s sec, %s" % (toc(), str(np.abs(result[:3, 0, 0])))

tic()
reference = np.fft.fftshift(np.fft.ifftn(ksp))
print "Numpy IFFTN time and first three results:"
print "%s sec, %s" % (toc(), str(np.abs(reference[:3, 0, 0])))

print "Calulating L1 norm "
print np.linalg.norm(result - reference) / np.linalg.norm(reference)
Example 16
def gs_gpu(idata, itera=100):
    """Gerchberg-Saxton algorithm to calculate DOEs using the GPU

    Calculates the phase distribution in an object plane that produces a
    specific amplitude distribution in the target plane. It uses an
    FFT to calculate the field propagation.
    The wavefront at the DOE plane is assumed to be a plane wave.

    **ARGUMENTS:**

        ========== ======================================================
        idata      numpy array containing the target amplitude distribution
        itera      maximum number of iterations
        ========== ======================================================
    """

    pl = cl.get_platforms()[0]
    devices = pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue,
                dtype=complex128)  # does not work with "complex128"

    src = str(
        Template(KERNEL).render(
            double_support=all(has_double_support(dev) for dev in devices),
            amd_double_support=all(
                has_amd_double_support(dev) for dev in devices)))
    prg = cl.Program(ctx, src).build()

    idata_gpu = cl_array.to_device(queue,
                                   ifftshift(idata).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data, fdata_gpu.data)

    e = 1000
    ea = 1000
    for i in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)
        tr = rdata_gpu.get()
        rdata = ifftshift(tr)

        #TODO: This calculation should be done in the GPU
        e = (abs(rdata) - idata).std()
        if e > ea:
            break
        ea = e

        prg.norm2(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data)

        plan.execute(rdata_gpu.data, fdata_gpu.data)

    fdata = fdata_gpu.get()

    #~ prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data)
    fdata = ifftshift(fdata)
    fdata = exp(1.j * angle(fdata))

    #~ fdata=fdata_gpu.get()
    return fdata
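For reference, the same iteration written with plain NumPy FFTs; a minimal sketch of the algorithm rather than the GPU kernels above, with norm and norm2 replaced by their effect (keep only the phase in the DOE plane, impose the target amplitude in the image plane):

import numpy as np

def gs_cpu(idata, itera=100):
    target = np.fft.ifftshift(idata)        # match the GPU code's layout
    fdata = np.fft.fft2(target)
    for _ in range(itera):
        fdata = np.exp(1j * np.angle(fdata))           # DOE plane: phase only
        rdata = np.fft.ifft2(fdata)                    # propagate to image plane
        rdata = target * np.exp(1j * np.angle(rdata))  # impose target amplitude
        fdata = np.fft.fft2(rdata)                     # propagate back
    return np.exp(1j * np.angle(np.fft.ifftshift(fdata)))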
Example 17
def fft_plan(shape):
    """returns an OpenCL/pyfft plan for the given shape"""
    return Plan(shape, queue=get_device().queue)
Example 18
def myfunc(d_g):
    from pyfft.cl import Plan
    from gputools import get_device
    plan = Plan(d_g.shape, queue=get_device().queue, fast_math=True)
    plan.execute(d_g.data, d_g.data)
    plan.execute(d_g.data, d_g.data, inverse=True)
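A hedged usage sketch for myfunc, assuming gputools' OCLArray for the device buffer; whether the round trip needs rescaling by the element count depends on the plan's normalize default, so the check divides out the empirical scale:

import numpy as np
from gputools import OCLArray

data = np.random.rand(64, 64).astype(np.complex64)
d_g = OCLArray.from_array(data)
myfunc(d_g)  # forward then inverse FFT in place
restored = d_g.get()
# Divide out any uniform scaling left by an unnormalized inverse
restored /= np.abs(restored).mean() / np.abs(data).mean()
print(np.allclose(data, restored, atol=1e-3))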
Example 19
freq_hz = freq * 1000000.0

blocklen = (32 * 1024)  

# bring up OpenCL
from pyfft.cl import Plan
import numpy

import pyopencl as cl
import pyopencl.array as clarray
import pyopencl.clmath as clmath

ctx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(ctx)

plan = Plan(blocklen, queue=queue, dtype=numpy.complex64)

lowpass_filter_b, lowpass_filter_a = sps.butter(8, (4.5/(freq/2)), 'low')

# stubs for later
f_deemp_b = []
f_deemp_a = []

# default deemphasis constants
deemp_t1 = .825
deemp_t2 = 2.35

# audio filters
Baudiorf = sps.firwin(65, 3.5 / (freq / 2), window='hamming', pass_zero=True)

afreq = freq / 4
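A hedged sketch of what the plan above is typically used for here: transforming one blocklen-sized chunk of samples on the device (the zero block is a stand-in for real input):

block = numpy.zeros(blocklen, dtype=numpy.complex64)  # stand-in for one input block
gpu_block = clarray.to_device(queue, block)
plan.execute(gpu_block.data)  # forward FFT in place
spectrum = gpu_block.get()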
Example 20
	def __init__(self, k0, nx, ny, h, d=None, l=10, dz=None,
			w=0.39, propcorr=None, phasetol=None, context=None):
		'''
		Initialize a split-step engine over an nx-by-ny grid with
		isotropic step size h. The unitless wave number is k0. The wave
		is advanced in steps of dz or (if dz is not provided) h.

		If d is provided, it is a 4-tuple that describes the
		directivity of any source as d = (dx, dy, dz, w), where (dx,
		dy, dz) is the directivity axis and w is the beam width
		parameter. Otherwise, all sources are treated as point sources.

		If l is specified and greater than zero, it is the width of a
		Hann window used to attenuate the field along each edge.

		The parameter w (as a multiplier 1 / w**2) governs the
		high-order spectral cross term.

		If propcorr is specified, it should be a tuple of Booleans of
		the form (hospec, hospat) that determines whether corrections
		involving high-order spectral or spatial terms, respectively,
		are used in the propagator by default. If propcorr is
		unspecified, both corrections are used.

		The parameter phasetol specifies the maximum permissible phase
		deviation (in fractions of pi), relative to propagation through
		the homogeneous background, incurred by propagating through the
		inhomogeneous medium. The number of steps per slab will be
		adjusted so that the phase shift incurred by propagating
		through materials with the most extreme sound speeds will never
		exceed phasetol.
		'''
		# Ensure that the phase tolerance is not too small
		# Otherwise, number of propagation steps will blow up uncontrollably
		if phasetol is not None and abs(phasetol) < 1e-6:
			raise ValueError('Phase tolerance must be greater than 1e-6')

		# Copy the parameters
		self.grid = nx, ny
		self.h, self.k0, self.l = h, k0, l
		self.w = np.float32(1. / w**2)
		self.phasetol = phasetol
		# Specify the use of corrective terms
		if propcorr is not None:
			self.propcorr = tuple(propcorr)
		else: self.propcorr = (True, True)

		# Set the step length
		self.dz = dz if dz else h

		# Grab the provided context or create a default
		self.context = util.grabcontext(context)

		# Build the program for the context
		t = Template(filename=self._kernel, output_encoding='ascii')
		src = t.render(grid = self.grid, k0=k0, h=h, d=d, l=l)
		self.prog = cl.Program(self.context, src).build()

		# Create a command queue for forward propagation calculations
		self.fwdque = cl.CommandQueue(self.context)
		# Create command queues for transfers
		self.recvque = cl.CommandQueue(self.context)
		self.sendque = cl.CommandQueue(self.context)

		# Create an FFT plan in the OpenCL propagation queue
		# Reorder the axes to conform with row-major ordering
		self.fftplan = Plan((ny, nx), queue=self.fwdque)

		grid = self.grid
		def newbuffer():
			nbytes = cutil.prod(grid) * np.complex64().nbytes
			flags = cl.mem_flags.READ_WRITE
			return util.SyncBuffer(self.context, flags, size=nbytes)
		# Buffers to store the propagating (twice) and backward fields
		self.fld = [newbuffer() for i in range(3)]
		# Scratch space used during computations
		self.scratch = [newbuffer() for i in range(3)]
		# The index of refraction gets two buffers for transmission
		self.obj = [newbuffer() for i in range(2)]
		# Two buffers are used for the Goertzel FFT of the contrast source
		self.goertzbuf = [newbuffer() for i in range(2)]
		# The sound speed extrema for the current slab are stored here
		self.speedlim = [1., 1.]
		# Initialize buffer to hold results of advance()
		self.result = newbuffer()

		# By default, volume fields will be transferred from the device
		self._goertzel = False

		# Initialize refractive index and fields
		self.reset()

		# By default, device exchange happens on the full grid
		self.rectxfer = util.RectangularTransfer(grid, grid, np.complex64, alloc_host=False)
Example 21
def fft_plan(shape, **kwargs):
    """returns an OpenCL/pyfft plan for the given shape

    kwargs are passed through to pyfft.cl.Plan
    """
    return Plan(shape, queue=get_device().queue, **kwargs)
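A short usage sketch, assuming gputools' OCLArray and a power-of-two shape as pyfft requires; normalize and fast_math are passed through to Plan:

import numpy as np
from gputools import OCLArray

plan = fft_plan((256, 256), normalize=True, fast_math=True)
d_g = OCLArray.from_array(np.ones((256, 256), np.complex64))
plan.execute(d_g.data)                # forward FFT in place
plan.execute(d_g.data, inverse=True)  # inverse FFT; normalize=True restores scale
print(np.allclose(d_g.get(), 1.0))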