Example 1
def kspacegaussian_filter_pyfftCL(ksp, sigma):
    clear_first_arg_caches()
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    #api = cluda.ocl_api()
    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)
    queue.flush()
    data_dev = cl_array.to_device(queue, ksp)
    w = h = k = 512
    plan = Plan((w, h, k), normalize=True, queue=queue)
    FACTOR = 1.0
    program = cl.Program(ctx, """
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#include "pyopencl-complex.h" 
__kernel void gauss_kernel(__global cfloat_t *dest) //, __global cfloat_t *src)
{
  uint x = get_global_id(0);uint y = get_global_id(1);uint z = get_global_id(2);
  uint dim1= %d;
  uint dim2= %d;
  uint dim3= %d;                    
  float sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  float factor = %f;            
  float TWOPISQ = 19.739208802178716; // 2*pi*pi

  ulong idx = z*dim1*dim2 + y*dim1 + x;
  float i = (float)(x);  //(x / dim3) / dim2);
      i = (i - (float)floor((float)(dim1)/2.0f))/(float)(dim1);
  float j = (float)y; //(x / dim3);
      //if((int)j > dim2) {j=(float)fmod(j, (float)dim2);};
      j = (j - (float)floor((float)(dim2)/2.0f))/(float)(dim2);
  //Account for large global index (stored as ulong) before performing modulus
  //double pre_k=fmod((double)(x) , (double) dim3);
  float k = (float) z; // pre_k;
      k = (k - (float)floor((float)(dim3)/2.0f))/(float)(dim3);

  float weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = dest[idx].x * weight;
  dest[idx].y = dest[idx].y * weight; 
  
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR)).build()
    gauss_kernel = program.gauss_kernel
    #data_dev = thr.empty_like(ksp_dev)
    gauss_kernel(queue, sz, None, data_dev.data).wait()  # , data_dev.data
    ksp_out = data_dev.get()
    queue.flush()
    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)
    w = h = k = 512
    plan = Plan((w, h, k), normalize=True, queue=queue)
    data2_dev = cl_array.to_device(queue, ksp_out)
    plan.execute(data2_dev.data, inverse=True)
    result = data2_dev.get()
    result = np.fft.fftshift(result)
    queue.finish()
    return result  # ,ksp_out
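These snippets are excerpts from larger modules and omit their imports. Below is a minimal sketch of the setup Example 1 (and most of the snippets on this page) assumes; the exact module layout of the original projects may differ.

import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
from pyfft.cl import Plan
from pyopencl.tools import clear_first_arg_caches  # cache-clearing helper called at the top of kspacegaussian_filter_pyfftCL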
Example 2
def _ocl_fft_gpu_inplace(ocl_arr, inverse=False, plan=None):

    assert_bufs_type(np.complex64, ocl_arr)

    if plan is None:
        plan = Plan(ocl_arr.shape, queue=get_device().queue)

    plan.execute(ocl_arr.data, ocl_arr.data, inverse=inverse)
Example 3
def _ocl_fft_gpu_inplace(ocl_arr,inverse = False, plan = None):

    assert_bufs_type(np.complex64,ocl_arr)

    if plan is None:
        plan = Plan(ocl_arr.shape, queue = get_device().queue)

    plan.execute(ocl_arr.data,ocl_arr.data, inverse = inverse)
Example 4
def clifftn(data):
    clear_first_arg_caches()
    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)
    plan = Plan(data.shape, normalize=True, queue=queue)
    # copy the data to the device, then run the inverse transform
    gpu_data = cl_array.to_device(queue, data)
    plan.execute(gpu_data.data, inverse=True)
    result = gpu_data.get()
    return result
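clifftn above mirrors clfftn from Example 9. A hedged round-trip sketch using both helpers, assuming a working OpenCL context and the imports listed under Example 1:

import numpy as np

data = np.random.rand(64, 64, 64).astype(np.complex64)
spectrum = clfftn(data)       # forward transform (Example 9)
restored = clifftn(spectrum)  # inverse transform (this example)
assert np.allclose(data, restored, atol=1e-4)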
Example 5
def _ocl_fft_gpu(ocl_arr,res_arr = None,inverse = False, plan = None):

    assert_bufs_type(np.complex64,ocl_arr)

    if plan is None:
        plan = Plan(ocl_arr.shape, queue = get_device().queue)

    if res_arr is None:
        res_arr = OCLArray.empty(ocl_arr.shape,np.complex64)
        
    plan.execute(ocl_arr.data,res_arr.data, inverse = inverse)

    return res_arr
Example 6
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):

    assert_bufs_type(np.complex64, ocl_arr)

    if plan is None:
        plan = Plan(ocl_arr.shape, queue=get_device().queue)

    if res_arr is None:
        res_arr = OCLArray.empty(ocl_arr.shape, np.complex64)

    plan.execute(ocl_arr.data, res_arr.data, inverse=inverse)

    return res_arr
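Examples 5 and 6 run the transform out of place via plan.execute(src, dst). The same pattern with plain PyOpenCL arrays, as a sketch independent of the gputools OCLArray type:

import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
from pyfft.cl import Plan

ctx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(ctx)
src = cl_array.to_device(queue, np.ones((64, 64), dtype=np.complex64))
dst = cl_array.empty_like(src)
plan = Plan(src.shape, queue=queue)
plan.execute(src.data, dst.data)                # forward, out of place
plan.execute(dst.data, src.data, inverse=True)  # inverse back into src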
Example 7
def gs_mod_gpu(idata, itera=10, osize=256):

    cut = osize // 2

    pl = cl.get_platforms()[0]
    devices = pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue,
                dtype=complex128)  # does not work with "complex128"

    src = str(
        Template(KERNEL).render(
            double_support=all(has_double_support(dev) for dev in devices),
            amd_double_support=all(
                has_amd_double_support(dev) for dev in devices)))
    prg = cl.Program(ctx, src).build()

    idata_gpu = cl_array.to_device(queue,
                                   ifftshift(idata).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data, fdata_gpu.data)

    mask = exp(2.j * pi * random(idata.shape))
    mask[512 - cut:512 + cut, 512 - cut:512 + cut] = 0

    idata_gpu = cl_array.to_device(
        queue,
        ifftshift(idata + mask).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    error_gpu = cl_array.to_device(queue,
                                   zeros(idata_gpu.shape).astype("double"))
    plan.execute(idata_gpu.data, fdata_gpu.data)

    e = 1000
    ea = 1000
    for i in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)
        #~ prg.norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut))
        norm1 = prg.norm1
        norm1.set_scalar_arg_dtypes([None, None, None, int32])
        norm1(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data,
              error_gpu.data, int32(cut))

        e = sqrt(cl_array.sum(error_gpu).get()) / (2 * cut)

        #~ if e>ea:
        #~
        #~ break
        #~ ea=e
        plan.execute(rdata_gpu.data, fdata_gpu.data)

    fdata = fdata_gpu.get()
    fdata = ifftshift(fdata)
    fdata = exp(1.j * angle(fdata))
    return fdata
Example 8
    def _prep_gpu():
        """ Set up GPU calculation dependencies """

        # try to import the necessary libraries
        fallback = False
        try:
            import gpu
            import string
            import pyopencl as cl
            import pyopencl.array as cla
            from pyfft.cl import Plan
        except ImportError:
            fallback = True
            
        # check gpu_info
        try:
            assert gpu.valid(gpu_info),\
            "gpu_info in propagate_distances improperly specified"
            
            context, device, queue, platform = gpu_info
        except AssertionError:
            fallback = True
            
        if fallback:
            # fall back to the CPU implementation and return its result
            return propagate_distances(data, distances, energy_or_wavelength,
                                       pixel_pitch, subregion=subregion,
                                       silent=silent, band_limit=band_limit,
                                       gpu_info=None, im_convert=im_convert)
    
        # if everything is OK, allocate memory and build kernels
        kp = string.join(gpu.__file__.split('/')[:-1], '/')+'/kernels/'
        build = _build_helper(context, device, kp)
        phase_multiply = build('propagate_phase_multiply.cl')
        copy_to_buffer = build('propagate_copy_to_save_buffer.cl')
        fftplan = Plan((N, N), queue=queue)

        # put the signals onto the gpu along with buffers for the
        # various operations
        rarray = cla.to_device(queue, r.astype(np.float32))
        fourier = cla.to_device(queue, data.astype(np.complex64))
        phase = cla.empty(queue, (N, N), np.complex64)
        back = cla.empty(queue, (N, N), np.complex64)
        store = cla.empty(queue, (nf, rows, cols), np.complex64)
        
        # precompute the fourier transform of data. 
        fftplan.execute(fourier.data, wait_for_finish=True)

        return phase_multiply, copy_to_buffer, fftplan, rarray, fourier,\
               phase, back, store, build
Example 9
def clfftn(data):
    """ OpenCL FFT 3D
    """
    clear_first_arg_caches()
    #ctx = cl.create_some_context(interactive=False)
    #queue = cl.CommandQueue(ctx)
    ctx, queue = clinit()
    plan = Plan(data.shape, normalize=True, queue=queue)
    # forward transform on device
    gpu_data = cl_array.to_device(queue, data)
    # forward transform
    plan.execute(gpu_data.data)
    #result = gpu_data.get()
    result = gpu_data.get()
    return result
Example 10
def gs_mod_gpu(idata,itera=10,osize=256):
    
    
    cut=osize//2
    
    pl=cl.get_platforms()[0]
    devices=pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue,dtype=complex128) # does not work with "complex128"
    
    src = str(Template(KERNEL).render(
        double_support=all(
            has_double_support(dev) for dev in devices),
        amd_double_support=all(
            has_amd_double_support(dev) for dev in devices)
        ))
    prg = cl.Program(ctx,src).build() 
    

    idata_gpu=cl_array.to_device(queue, ifftshift(idata).astype("complex128"))
    fdata_gpu=cl_array.empty_like(idata_gpu)
    rdata_gpu=cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data,fdata_gpu.data)
    
    mask=exp(2.j*pi*random(idata.shape))
    mask[512-cut:512+cut,512-cut:512+cut]=0
    
    
    idata_gpu=cl_array.to_device(queue, ifftshift(idata+mask).astype("complex128"))
    fdata_gpu=cl_array.empty_like(idata_gpu)
    rdata_gpu=cl_array.empty_like(idata_gpu)
    error_gpu=cl_array.to_device(queue, zeros(idata_gpu.shape).astype("double"))
    plan.execute(idata_gpu.data,fdata_gpu.data)
    
    e=1000
    ea=1000
    for i in range (itera):
        prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data)
        plan.execute(fdata_gpu.data,rdata_gpu.data,inverse=True)
        #~ prg.norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut))
        norm1=prg.norm1
        norm1.set_scalar_arg_dtypes([None, None, None, int32])
        norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut))
        
        e= sqrt(cl_array.sum(error_gpu).get())/(2*cut)

        #~ if e>ea: 
           #~ 
            #~ break
        #~ ea=e
        plan.execute(rdata_gpu.data,fdata_gpu.data)
    
    fdata=fdata_gpu.get()
    fdata=ifftshift(fdata)
    fdata=exp(1.j*angle(fdata))
    return fdata
Example 11
class FastFourierTransform:
    def __init__(self, in_scale, in_matrix, queue):
        self.scale = tuple(in_scale)
        # create plan
        self.plan = Plan(self.scale, queue=queue)
        # prepare data
        self.data = in_matrix
        self.gpu_data = cl_array.to_device(queue, self.data)

    def fourier_transform(self):
        # forward transform
        self.plan.execute(self.gpu_data.data)
        # print self.gpu_data.get()
        # inverse transform
        self.plan.execute(self.gpu_data.data, inverse=True)
        result_ = self.gpu_data.get()
        print(result_)
        error = numpy.abs(
            numpy.sum(numpy.abs(self.data) - numpy.abs(result_)) /
            self.data.size)
        if error > 1e-6:
            s = 'error occurs'
            raise ValueError('invalid value: %s' % s)
        return result_
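A hedged usage sketch for the FastFourierTransform wrapper above, following the setup of Example 12 below:

import numpy
import pyopencl as cl
import pyopencl.array as cl_array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
data = numpy.ones((16, 16), dtype=numpy.complex64)
fft = FastFourierTransform((16, 16), data, queue)
result = fft.fourier_transform()  # forward + inverse; raises ValueError if the round-trip error exceeds 1e-6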
Example 12
from pyfft.cl import Plan
import numpy
import pyopencl as cl
import pyopencl.array as cl_array

# initialize context
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# create plan
plan = Plan((16, 16), queue=queue)

# prepare data
data = numpy.ones((16, 16), dtype=numpy.complex64)
gpu_data = cl_array.to_device(queue, data)
print gpu_data

# forward transform
plan.execute(gpu_data.data)
result = gpu_data.get()
print result

# inverse transform
plan.execute(gpu_data.data, inverse=True)
result = gpu_data.get()
error = numpy.abs(numpy.sum(numpy.abs(data) - numpy.abs(result)) / data.size)
print error < 1e-6
Example 13
def myfunc(d_g):
    from pyfft.cl import Plan
    from gputools import get_device
    plan = Plan(d_g.shape, queue = get_device().queue, fast_math = True)
    plan.execute(d_g.data,d_g.data)
    plan.execute(d_g.data,d_g.data, inverse = True)
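A hedged usage sketch for myfunc, assuming gputools is installed and provides OCLArray.from_array:

import numpy as np
from gputools import OCLArray

d_g = OCLArray.from_array(np.ones((256, 256), dtype=np.complex64))
myfunc(d_g)          # forward then inverse FFT, in place on the device array
result = d_g.get()   # back on the host; roughly the input again, since the plan normalizes the inverse as in Example 12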
Example 14
class OpenclFibre(object):
    """
    This optical module is similar to Fibre, but uses PyOpenCl (Python
    bindings around OpenCL) to generate parallelised code.
    """
    def __init__(self, total_samples, dorf, length=None, total_steps=None,
                 name="ocl_fibre"):
        self.name = name

        self.queue = None
        self.np_float = None
        self.np_complex = None
        self.prg = None
        self.cl_initialise(dorf)

        self.plan = Plan(total_samples, queue=self.queue)

        self.buf_field = None
        self.buf_temp = None
        self.buf_interaction = None
        self.buf_factor = None

        self.shape = None
        self.plan = None

        self.cached_factor = False
        # Force usage of cached version of function:
        self.cl_linear = self.cl_linear_cached

        self.length = length
        self.total_steps = total_steps

    def __call__(self, domain, field):
        # Setup plan for calculating fast Fourier transforms:
        self.plan = Plan(domain.total_samples, queue=self.queue)

        field_temp = np.empty_like(field)
        field_interaction = np.empty_like(field)

        from pyofss.modules.linearity import Linearity
        dispersion = Linearity(beta=[0.0, 0.0, 0.0, 1.0], sim_type="default")
        factor = dispersion(domain)

        self.send_arrays_to_device(
            field, field_temp, field_interaction, factor)

        stepsize = self.length / self.total_steps
        zs = np.linspace(0.0, self.length, self.total_steps + 1)

        #start = time.clock()
        for z in zs[:-1]:
            self.cl_rk4ip(self.buf_field, self.buf_temp,
                          self.buf_interaction, self.buf_factor, stepsize)
        #stop = time.clock()

        #cl_result = self.buf_field.get()
        #print("cl_result: %e" % ((stop - start) / 1000.0))

        return self.buf_field.get()

    def cl_initialise(self, dorf="float"):
        """ Initialise opencl related parameters. """
        float_conversions = {"float": np.float32, "double": np.float64}
        complex_conversions = {"float": np.complex64, "double": np.complex128}

        self.np_float = float_conversions[dorf]
        self.np_complex = complex_conversions[dorf]

        for platform in cl.get_platforms():
            if platform.name == "NVIDIA CUDA":
                print("Using compiler optimisations suitable for Nvidia GPUs")
                compiler_options = "-cl-mad-enable -cl-fast-relaxed-math"
            else:
                compiler_options = ""

        ctx = cl.create_some_context(interactive=False)
        self.queue = cl.CommandQueue(ctx)

        substitutions = {"dorf": dorf}
        code = OPENCL_OPERATIONS.substitute(substitutions)
        self.prg = cl.Program(ctx, code).build(options=compiler_options)

    @staticmethod
    def print_device_info():
        """ Output information on each OpenCL platform and device. """
        for platform in cl.get_platforms():
            print "=" * 60
            print "Platform information:"
            print "Name: ", platform.name
            print "Profile: ", platform.profile
            print "Vender: ", platform.vendor
            print "Version: ", platform.version

            for device in platform.get_devices():
                print "-" * 60
                print "Device information:"
                print "Name: ", device.name
                print "Type: ", cl.device_type.to_string(device.type)
                print "Memory: ", device.global_mem_size // (1024 ** 2), "MB"
                print "Max clock speed: ", device.max_clock_frequency, "MHz"
                print "Compute units: ", device.max_compute_units

            print "=" * 60

    def send_arrays_to_device(self, field, field_temp,
                              field_interaction, factor):
        """ Move numpy arrays onto compute device. """
        self.shape = field.shape

        self.buf_field = cl_array.to_device(
            self.queue, field.astype(self.np_complex))
        self.buf_temp = cl_array.to_device(
            self.queue, field_temp.astype(self.np_complex))
        self.buf_interaction = cl_array.to_device(
            self.queue, field_interaction.astype(self.np_complex))

        self.buf_factor = cl_array.to_device(
            self.queue, factor.astype(self.np_complex))

    def cl_copy(self, first_buffer, second_buffer):
        """ Copy contents of one buffer into another. """
        self.prg.cl_copy(self.queue, self.shape, None,
                         first_buffer.data, second_buffer.data)

    def cl_linear(self, field_buffer, stepsize, factor_buffer):
        """ Linear part of step. """
        self.plan.execute(field_buffer.data, inverse=True)
        self.prg.cl_linear(self.queue, self.shape, None, field_buffer.data,
                           factor_buffer.data, self.np_float(stepsize))
        self.plan.execute(field_buffer.data)

    def cl_linear_cached(self, field_buffer, stepsize, factor_buffer):
        """ Linear part of step (cached version). """
        if (self.cached_factor is False):
            print "Caching factor"
            self.prg.cl_cache(self.queue, self.shape, None,
                              factor_buffer.data, self.np_float(stepsize))
            self.cached_factor = True

        self.plan.execute(field_buffer.data, inverse=True)
        self.prg.cl_linear_cached(self.queue, self.shape, None,
                                  field_buffer.data, factor_buffer.data)
        self.plan.execute(field_buffer.data)

    def cl_nonlinear(self, field_buffer, stepsize, gamma=100.0):
        """ Nonlinear part of step. """
        self.prg.cl_nonlinear(self.queue, self.shape, None, field_buffer.data,
                              self.np_float(gamma), self.np_float(stepsize))

    def cl_sum(self, first_buffer, first_factor, second_buffer, second_factor):
        """ Calculate weighted summation. """
        self.prg.cl_sum(self.queue, self.shape, None,
                        first_buffer.data, self.np_float(first_factor),
                        second_buffer.data, self.np_float(second_factor))

    def cl_rk4ip(self, field, field_temp, field_interaction, factor, stepsize):
        """ Runge-Kutta in the interaction picture method using OpenCL. """
        inv_six = 1.0 / 6.0
        inv_three = 1.0 / 3.0
        half_step = 0.5 * stepsize

        self.cl_copy(field_temp, field)
        self.cl_linear(field, half_step, factor)

        self.cl_copy(field_interaction, field)
        self.cl_nonlinear(field_temp, stepsize)
        self.cl_linear(field_temp, half_step, factor)

        self.cl_sum(field, 1.0, field_temp, inv_six)
        self.cl_sum(field_temp, 0.5, field_interaction, 1.0)
        self.cl_nonlinear(field_temp, stepsize)

        self.cl_sum(field, 1.0, field_temp, inv_three)
        self.cl_sum(field_temp, 0.5, field_interaction, 1.0)
        self.cl_nonlinear(field_temp, stepsize)

        self.cl_sum(field, 1.0, field_temp, inv_three)
        self.cl_sum(field_temp, 1.0, field_interaction, 1.0)
        self.cl_linear(field_interaction, half_step, factor)

        self.cl_linear(field, half_step, factor)
        self.cl_nonlinear(field_temp, stepsize)

        self.cl_sum(field, 1.0, field_temp, inv_six)
Example 15
toc()
tic()
print 'Complex K-space filter + Numpy IFFT'
kspgauss2 = KSP.kspacegaussian_filter2(ksp, 1)
image_filtered = simpleifft(procpar, dims, hdr, kspgauss2, args)
toc()

# PYFFT

tic()
ctx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(ctx)
w = h = k = 512
plan = Plan((w, h, k), normalize=True, queue=queue)
gpu_data = cl_array.to_device(queue, ksp)
plan.execute(gpu_data.data, inverse=True)
result = gpu_data.get()
toc()
result = np.fft.fftshift(result)
print "PyFFT OpenCL IFFT time and first three results:"
print "%s sec, %s" % (toc(), str(np.abs(result[:3, 0, 0])))

tic()
reference = np.fft.fftshift(np.fft.ifftn(ksp))
print "Numpy IFFTN time and first three results:"
print "%s sec, %s" % (toc(), str(np.abs(reference[:3, 0, 0])))

print "Calulating L1 norm "
print np.linalg.norm(result - reference) / np.linalg.norm(reference)

import matplotlib
Example 16
def gs_gpu(idata, itera=100):
    """Gerchberg-Saxton algorithm to calculate DOEs using the GPU
    
    Calculates the phase distribution in a object plane to obtain an 
    specific amplitude distribution in the target plane. It uses a 
    FFT to calculate the field propagation.
    The wavefront at the DOE plane is assumed as a plane wave.
    
    **ARGUMENTS:**
	
		========== ======================================================
		idata      numpy array containing the target amplitude distribution 
        itera      Maximum number of iterations
		========== ======================================================
    """

    pl = cl.get_platforms()[0]
    devices = pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue,
                dtype=complex128)  # does not work with "complex128"

    src = str(
        Template(KERNEL).render(
            double_support=all(has_double_support(dev) for dev in devices),
            amd_double_support=all(
                has_amd_double_support(dev) for dev in devices)))
    prg = cl.Program(ctx, src).build()

    idata_gpu = cl_array.to_device(queue,
                                   ifftshift(idata).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data, fdata_gpu.data)

    e = 1000
    ea = 1000
    for i in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)
        tr = rdata_gpu.get()
        rdata = ifftshift(tr)

        #TODO: This calculation should be done in the GPU
        e = (abs(rdata) - idata).std()
        if e > ea:
            break
        ea = e

        prg.norm2(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data)

        plan.execute(rdata_gpu.data, fdata_gpu.data)

    fdata = fdata_gpu.get()

    #~ prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data)
    fdata = ifftshift(fdata)
    fdata = exp(1.j * angle(fdata))

    #~ fdata=fdata_gpu.get()
    return fdata
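A hedged usage sketch for gs_gpu; it assumes the module-level KERNEL template and the has_double_support helpers referenced by the function are importable from the same package:

from numpy import zeros, angle

target = zeros((1024, 1024))
target[384:640, 384:640] = 1.0     # square target amplitude
field = gs_gpu(target, itera=50)   # unit-modulus complex field
doe_phase = angle(field)           # phase distribution for the DOE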
Example 17
class Convolver:
    """ Class that computes the necessary information to perform a
    convolution and provides the actual convolution function. Can handle
    2d or 3d convolutions. """

    def __init__(self, insize, kernelsize, batchsize, context, queue):
        self.sizes = []
        for i in xrange(len(insize)):
            self.sizes.append(getPowerOfTwo(insize[i] + kernelsize[i] + 1))
        self.sizes = tuple(self.sizes)

        self.ctx = context
        self.queue = queue
        self.plan = Plan(self.sizes, queue=self.queue)
        self.inarray = cl.array.zeros(
            self.queue, (batchsize, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64
        )
        self.kernel = cl.array.zeros(
            self.queue, (batchsize, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64
        )
        self.result_buffer = numpy.zeros(self.inarray.shape, numpy.complex64)

        self.kernel_center = []
        for i in xrange(len(kernelsize)):
            self.kernel_center.append(kernelsize[i] / 2)
        self.kernel_center = tuple(self.kernel_center)

        self.halves = []
        for i in xrange(len(kernelsize)):
            self.halves.append(numpy.ceil(kernelsize[i] / 2.0))
        self.halves = tuple(self.halves)

        self.padding_locations = []
        for i in xrange(len(self.sizes)):
            # without this if even kernels result in an incorrect edge in the result
            if kernelsize[i] % 2 == 0:
                self.padding_locations.append(-1 * ((insize[i] - self.sizes[i]) / 2))
                self.padding_locations.append(-1 * ((self.sizes[i] - insize[i]) / 2))
            else:
                self.padding_locations.append((self.sizes[i] - insize[i]) / 2)
                self.padding_locations.append((insize[i] - self.sizes[i]) / 2)
        self.padding_locations = tuple(self.padding_locations)

        self.valid_locations = []
        for i in xrange(len(self.sizes)):
            self.valid_locations.append(self.padding_locations[(i * 2)] + self.halves[i] - 1)
            self.valid_locations.append(self.padding_locations[(i * 2)] + self.halves[i] + insize[i] - kernelsize[i])
        self.valid_locations = tuple(self.valid_locations)

        self.full_locations = []
        for i in xrange(len(self.sizes)):
            offset = self.sizes[i] - (insize[i] + kernelsize[i] - 1)
            self.full_locations.append(offset / 2)
            self.full_locations.append(-offset / 2)

        self.batch_size = batchsize

    def convolution(self, A, kernel, type="valid"):
        inarray = numpy.zeros((self.batch_size, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64)
        inarray[
            :,
            self.padding_locations[0] : self.padding_locations[1],
            self.padding_locations[2] : self.padding_locations[3],
            self.padding_locations[4] : self.padding_locations[5],
        ] = A
        self.inarray = cl.array.to_device(self.queue, inarray)
        kernel_buffer = numpy.zeros((self.batch_size, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64)
        kernel_buffer[:, : self.halves[0], : self.halves[1], : self.halves[2]] = kernel[
            self.kernel_center[0] :, self.kernel_center[1] :, self.kernel_center[2] :
        ]
        kernel_buffer[:, : self.halves[0], : self.halves[1], -self.kernel_center[2] :] = kernel[
            self.kernel_center[0] :, self.kernel_center[1] :, : self.kernel_center[2]
        ]
        kernel_buffer[:, : self.halves[0], -self.kernel_center[1] :, : self.halves[2]] = kernel[
            self.kernel_center[0] :, : self.kernel_center[1], self.kernel_center[2] :
        ]
        kernel_buffer[:, : self.halves[0], -self.kernel_center[1] :, -self.kernel_center[2] :] = kernel[
            self.kernel_center[0] :, : self.kernel_center[1], : self.kernel_center[2]
        ]
        if kernel.shape[0] > 1:
            kernel_buffer[:, -self.kernel_center[0] :, : self.halves[1], : self.halves[2]] = kernel[
                : self.kernel_center[0], self.kernel_center[1] :, self.kernel_center[2] :
            ]
            kernel_buffer[:, -self.kernel_center[0] :, : self.halves[1], -self.kernel_center[2] :] = kernel[
                : self.kernel_center[0], self.kernel_center[1] :, : self.kernel_center[2]
            ]
            kernel_buffer[:, -self.kernel_center[0] :, -self.kernel_center[1] :, : self.halves[2]] = kernel[
                : self.kernel_center[0], : self.kernel_center[1], self.kernel_center[2] :
            ]
            kernel_buffer[:, -self.kernel_center[0] :, -self.kernel_center[1] :, -self.kernel_center[2] :] = kernel[
                : self.kernel_center[0], : self.kernel_center[1], : self.kernel_center[2]
            ]
        self.kernel = cl.array.to_device(self.queue, kernel_buffer)

        # fourier transform, pointwise multiply, then invert => convolution
        self.plan.execute(self.inarray.data, batch=self.batch_size)

        self.plan.execute(self.kernel.data, batch=self.batch_size)

        self.result_buffer = self.inarray * self.kernel
        self.plan.execute(self.result_buffer.data, inverse=True, batch=self.batch_size)
        result = self.result_buffer.get().astype(float)

        if type == "same":
            return result[
                :,
                self.padding_locations[0] : self.padding_locations[1],
                self.padding_locations[2] : self.padding_locations[3],
                self.padding_locations[4] : self.padding_locations[5],
            ]
        elif type == "full":
            return result[
                :,
                self.full_locations[0] : self.full_locations[1],
                self.full_locations[2] : self.full_locations[3],
                self.full_locations[4] : self.full_locations[5],
            ]
        elif type == "valid":
            return result[
                :,
                self.valid_locations[0] : self.valid_locations[1],
                self.valid_locations[2] : self.valid_locations[3],
                self.valid_locations[4] : self.valid_locations[5],
            ]
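A hedged usage sketch for the Convolver class above (Python 2 and an older NumPy that still accepts float slice indices, as the class body assumes). The getPowerOfTwo helper the class relies on is not shown on this page; the definition below is only an assumption that it rounds up to the next power of two:

import numpy
import pyopencl as cl
import pyopencl.array  # Convolver refers to cl.array.*

def getPowerOfTwo(n):  # assumed helper: smallest power of two >= n
    p = 1
    while p < n:
        p *= 2
    return p

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
conv = Convolver(insize=(32, 32, 32), kernelsize=(5, 5, 5), batchsize=1,
                 context=ctx, queue=queue)
A = numpy.random.rand(1, 32, 32, 32)
kern = numpy.random.rand(5, 5, 5)
out = conv.convolution(A, kern, type="same")  # FFT both, pointwise multiply, inverse FFT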
Example 18
class ComputationalContext:
    def __init__(self, field):
        self.ocl = OpenCLSettings.Instance()
        self.dispIndex = DispersionIndex.Instance()
        self.grid = Grid.Instance()
        self.physConst = PhysicalEnvironment.Instance()
        self.ocl = OpenCLSettings.Instance()
        self.errors = ComputationalErrors()

        self.need_update_energy = False
        self.energy = numpy.empty(1).astype(numpy.float64)
        self.layer = numpy.int32(0)
        self.Z = numpy.int32(0)
        self.z_limit = Settings.z
        self.dz = Settings.dz
        self.current_dz = numpy.float64()
        self.calculated_dz = Settings.dz

        if Settings.z_strategy == "Uniform":
            self.z_step_strategy = UniformZStepStrategy()
        else:
            self.z_step_strategy = AdaptiveZStepStrategy()

        self.global_iteration_number = numpy.int32(0)
        self.unlinear_iterations = numpy.zeros(self.grid.space_size, dtype=numpy.int32)

        self.E_next = numpy.zeros((self.grid.space_size, self.grid.time_size), dtype=numpy.float64)
        self.D = numpy.zeros(self.grid.time_size, dtype=numpy.float64)
        self.K = numpy.zeros(self.grid.time_size, dtype=numpy.float64)

        self.field_shape = field.shape
        self.test_field = numpy.zeros((self.grid.space_size, self.grid.time_size), dtype=numpy.complex128)

        self.mf = cl.mem_flags
        self.A1_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.grid.space_grid.nbytes)
        self.A2_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.grid.space_grid.nbytes)
        self.A3_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.grid.space_grid.nbytes)
        self.D_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.D.nbytes)
        self.K_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.K.nbytes)
        self.space_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE | self.mf.COPY_HOST_PTR, hostbuf=self.grid.space_grid)
        self.space_delta_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE | self.mf.COPY_HOST_PTR, hostbuf=self.grid.space_delta)
        self.plan1D = Plan(self.grid.time_size, dtype=numpy.complex128, queue=self.ocl.queue)
        self.field_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE | self.mf.COPY_HOST_PTR, hostbuf=field)
        self.unlinear_iterations_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.unlinear_iterations.nbytes)

        self.local_buf = cl.LocalMemory(8*self.ocl.n_threads)
        self.buffers, self.sizes = self.__compute_partial_buffers(self.grid.space_size * self.grid.time_size,
                                                                  self.ocl.n_threads)

        self.nlnr_computing_time = numpy.float64(0)
        self.diff_computing_time = numpy.float64(0)
        self.disp_computing_time = numpy.float64(0)
        self.copy_computing_time = numpy.float64(0)
        self.energy_computing_time = numpy.float64(0)

    def compute_K(self, K):
        ng = self.dispIndex.n(self.physConst.w) + self.dispIndex.dng(self.physConst.w)
        for i in range(self.grid.time_size):
            K[i] = self.grid.freq_grid[i] * \
                   (self.dispIndex.n(numpy.abs(self.grid.freq_grid[i] * self.physConst.w)) - ng) / self.physConst.dnL

    def compute_D(self, D):
        for i in range(self.grid.time_size):
            if numpy.abs(self.grid.freq_grid[i]) < 10e-8:
                D[i] = 0
            else:
                D[i] = self.physConst.D / self.grid.freq_grid[i]

    def fill_data(self):
        self.compute_D(self.D)
        self.compute_K(self.K)
        self.ocl.initial_prg.ComputeA(self.ocl.queue, self.grid.space_grid.shape, None,
                                      self.A1_buf, self.A2_buf, self.A3_buf, self.space_buf, self.space_delta_buf,
                                      self.grid.space_size)

        cl.enqueue_copy(self.ocl.queue, self.D_buf, self.D)
        cl.enqueue_copy(self.ocl.queue, self.K_buf, self.K)

    def is_stop(self):
        return self.Z >= self.z_limit

    def update_z(self):
        self.Z += self.current_dz
        self.layer += 1

    def update_dz(self):
        if self.calculated_dz < 0:
            self.calculated_dz = Settings.dz

        dz = self.calculated_dz
        if self.z_limit < self.Z + dz:
            dz = self.z_limit - self.Z

        self.current_dz = dz

        if self.current_dz < 0 and numpy.abs(self.current_dz) < 1e-10:
            self.current_dz = 0

    def update_energy(self):
        if self.need_update_energy:
            self.compute_energy()
        self.need_update_energy = False

    def compute_energy(self):
        evt = self.ocl.error_prg.CalculateEnergy(self.ocl.queue, (self.sizes[0],), (self.ocl.n_threads,),
                                                 self.field_buf, self.space_buf, self.space_delta_buf,
                                                 self.buffers[0], self.local_buf, self.grid.time_size, self.grid.space_size)
        evt.wait()
        self.energy_computing_time += evt.profile.end - evt.profile.start

        for i in range(1, len(self.buffers)):
            evt = self.ocl.error_prg.Reduce(self.ocl.queue, (self.sizes[i],), (min(self.ocl.n_threads, self.sizes[i]),),
                                            self.buffers[i-1], self.buffers[i], self.local_buf)
            evt.wait()
            self.energy_computing_time += evt.profile.end - evt.profile.start

        evt = cl.enqueue_copy(self.ocl.queue, self.energy, self.buffers[-1])
        evt.wait()
        self.copy_computing_time += evt.profile.end - evt.profile.start

    def do_step(self, dz):
        iteration_number = 0
        while True:
            self.need_update_energy = True

            if dz < 0:
                self.calculated_dz = self.z_step_strategy.calculate_dz(self.calculated_dz, self.global_iteration_number,
                                                                       self.errors.get_error())
                self.update_dz()
            else:
                self.current_dz = dz

            self.linear()
            self.nonlinear()

            if iteration_number > 50:
                break
            iteration_number += 1

            if not (dz < 0 and self.z_step_strategy.need_update_dz(self.global_iteration_number,
                                                                   self.errors.get_error())):
                break

        self.update_z()

    def linear(self):
        self.errors.linear_error.begin(self)
        # Inverse Fourier transform
        cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf)

        self.plan1D.execute(self.field_buf, batch=self.grid.space_size, inverse=True)

        cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf)

        if Settings.use_difraction:
            # Apply the diffraction operator
            dif_evt = self.ocl.linear_prg.Diff(self.ocl.queue, (self.grid.time_size,), None,
                                               self.field_buf, self.A1_buf, self.A2_buf, self.A3_buf,
                                               self.space_buf, self.space_delta_buf, self.D_buf,
                                               self.grid.space_size, self.grid.time_size, self.current_dz)
            dif_evt.wait()
            self.diff_computing_time += dif_evt.profile.end - dif_evt.profile.start

        cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf)

        # Apply the dispersion operator
        disp_evt = self.ocl.linear_prg.Disp(self.ocl.queue, self.field_shape, None,
                                            self.K_buf, self.field_buf, self.current_dz, self.grid.space_size,
                                            self.grid.time_size)
        disp_evt.wait()

        cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf)

        self.disp_computing_time += disp_evt.profile.end - disp_evt.profile.start

        # Forward Fourier transform
        self.plan1D.execute(self.field_buf, batch=self.grid.space_size, inverse=False)

        self.need_update_energy = True
        self.errors.linear_error.end(self)

    def nonlinear(self):
        if Settings.use_cubic:
            self.errors.nonlinear_error.begin(self)

            dt = self.grid.time_delta[1]
            k = numpy.float64(self.physConst.G * self.current_dz / dt / 24.0)
            max_error = numpy.float64(1e-6)
            iteration = numpy.int32(0)

            cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf)

            evt = self.ocl.nonlinear_prg.CubicUnlinean1DSolve(self.ocl.queue, (self.grid.space_size, 1), None,
                                                              self.field_buf, self.unlinear_iterations_buf,
                                                              k, dt, max_error, iteration, self.grid.time_size)

            evt.wait()
            self.nlnr_computing_time += evt.profile.end - evt.profile.start

            cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf)

            cl.enqueue_copy(self.ocl.queue, self.unlinear_iterations, self.unlinear_iterations_buf)
            self.global_iteration_number = numpy.ndarray.max(self.unlinear_iterations)

            self.need_update_energy = True
            self.errors.nonlinear_error.end(self)

    def copy_from_buffer(self, field):
        evt = cl.enqueue_copy(self.ocl.queue, field, self.field_buf)
        evt.wait()
        self.copy_computing_time += evt.profile.end - evt.profile.start

    def __compute_partial_buffers(self, size, n_threads):
        partial_sums = list()
        sizes = list()
        break_flag = True

        sizes.append(size)
        while break_flag:
            if size / n_threads == 0:
                break_flag = False
                size = 1
            else:
                size = size / n_threads
            buf_np = numpy.empty(size, dtype=numpy.float64)
            buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, size=buf_np.nbytes)
            partial_sums.append(buf)
            sizes.append(size)

        return partial_sums, sizes
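ComputationalContext runs a 1D plan over every spatial row with plan1D.execute(..., batch=space_size). A minimal sketch of that batched 1D pattern, detached from the class's settings objects:

import numpy
import pyopencl as cl
import pyopencl.array as cl_array
from pyfft.cl import Plan

ctx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(ctx)
space_size, time_size = 128, 256
field = numpy.ones((space_size, time_size), dtype=numpy.complex128)
gpu_field = cl_array.to_device(queue, field)
plan1d = Plan(time_size, dtype=numpy.complex128, queue=queue)  # double precision; needs a device with fp64 support
plan1d.execute(gpu_field.data, batch=space_size, inverse=True)  # one inverse 1D FFT per row
plan1d.execute(gpu_field.data, batch=space_size)                # one forward 1D FFT per row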
Example 19
class file_creator():
    def __init__(self):
        # create an OpenCL context
        try:
            self.ctx = cl.create_some_context()
        except ValueError:
            print "error %s\nExiting." % (ValueError)

        # create an OpenCL command queue
        self.queue = cl.CommandQueue(self.ctx)
        #self.cd = computedevice
        # create an event
        self.thread_event = threading.Event()
        self.duration = 15

    def python_ifft_guardinterval_func(self, ofdmmode, guardinterval, inputdata):
        # create a fft plan
        self.fftplan = Plan(ofdmmode,dtype=numpy.complex64,normalize=True, fast_math=True,context=self.ctx, queue=self.queue)

        # opencl buffer holding data for the ifft - including pilots # 8k or 2k size ???
        fftbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdmmode*8)

        # size of the guard-interval destination buffer
        self.dest_buf_size = ofdmmode*(1+guardinterval)

        # opencl buffer holding the computed data
        dest_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=int(self.dest_buf_size*8) )

        self.ret_buf = numpy.array(numpy.zeros(self.dest_buf_size), dtype=numpy.complex64)

        cl.enqueue_copy(self.queue, fftbuffer, inputdata).wait()
        self.fftplan.execute(fftbuffer, data_out=None, inverse=True, batch=1, wait_for_finish=True)

        cl.enqueue_copy(self.queue, dest_buf, fftbuffer, byte_count=int(8*ofdmmode), src_offset=0, dest_offset=int(ofdmmode*guardinterval)).wait()
        cl.enqueue_copy(self.queue, dest_buf, fftbuffer, byte_count=int(ofdmmode*guardinterval), src_offset=int(ofdmmode - ofdmmode*guardinterval) ,dest_offset=0).wait()
        cl.enqueue_copy(self.queue, self.ret_buf, dest_buf).wait()

        return self.ret_buf


    def test_execution_time(self):
        print "test_execution_time:"
        print "TODO"

    def test_stop(self):
        self.thread_event.set()

    def load_kernel(self, filename, kernelname):
        print "Kernel \"%s\" from file \"%s\" :" % (kernelname,filename)
        mf = cl.mem_flags
        #read in the OpenCL source file as a string
        self.f = open(filename, 'r')
        fstr = "".join(self.f.readlines())
        self.program = cl.Program(self.ctx, fstr)
        self.program.build()
        self.f.close()
        #create the opencl kernel
        return cl.Kernel(self.program,kernelname)

    def test_algorithmA(self, ofdm_mode, guardinterval):
        print "\n**************************"
        print "test ofdm numpy ifft with fftshift"
        passed = 0
        linecnt = 1
        g = 0
        size = 0
        
        if ofdm_mode == 8192:
            size = 6817
            print "8k mode"
            if guardinterval == 0.25:
                self.fd_input = open('test_bench_ofdm_input_8K_1_4.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_4.csv', 'r')
                print "1/4 guard interval"
                g = guardinterval
            if guardinterval == 0.125:
                self.fd_input = open('test_bench_ofdm_input_8K_1_8.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_8.csv', 'r')
                print "1/8 guard interval"
                g = guardinterval
            if guardinterval == 0.0625:
                self.fd_input = open('test_bench_ofdm_input_8K_1_16.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_16.csv', 'r')
                print "1/16 guard interval"
                g = guardinterval
            if guardinterval == 0.03125:
                self.fd_input = open('test_bench_ofdm_input_8K_1_32.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_32.csv', 'r')
                print "1/32 guard interval"
                g = guardinterval
                
        elif ofdm_mode == 2048:
            size = 1705
            print "2k mode"
            if guardinterval == 0.25:
                self.fd_input = open('test_bench_ofdm_input_2K_1_4.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_4.csv', 'r')
                print "1/4 guard interval"
                g = guardinterval
            if guardinterval == 0.125:
                self.fd_input = open('test_bench_ofdm_input_2K_1_8.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_8.csv', 'r')
                print "1/8 guard interval"
                g = guardinterval
            if guardinterval == 0.0625:
                self.fd_input = open('test_bench_ofdm_input_2K_1_16.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_16.csv', 'r')
                print "1/16 guard interval"
                g = guardinterval
            if guardinterval == 0.03125:
                self.fd_input = open('test_bench_ofdm_input_2K_1_32.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_32.csv', 'r')
                print "1/32 guard interval"
                g = guardinterval

             
        if g == 0:
            print "wrong guardinterval specified"
            return

        if size == 0:
            print "wrong ofdm_mode"
            return
        

        for line in self.fd_input:
            data_to_encode = [float(0) + 1j*float(0)] * ofdm_mode
            counter = (ofdm_mode-size-1)/2+1
            #counter = 0
            for tmp in line.split(","):
                if string.find(tmp, " + ") > -1:
                    data_to_encode[counter]=(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i","")))
                if string.find(tmp, " - ") > -1:
                    data_to_encode[counter]=(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i","")))
                counter += 1
            data_to_encode = numpy.array(data_to_encode, dtype=numpy.complex128)

            reference_data = []
            for tmp in self.fd_output.readline().split(","):
                if string.find(tmp, " + ") > -1:
                    reference_data.append(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i","")))
                if string.find(tmp, " - ") > -1:
                    reference_data.append(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i","")))

            reference_data = numpy.array(reference_data, dtype=numpy.complex128)


            encoded_data = numpy.fft.ifft(numpy.fft.fftshift(data_to_encode))

            # add guard interval
            tmp = []
            for i in range(0,int(ofdm_mode*g)):
                tmp.append(encoded_data[(ofdm_mode*(1-g))+i])
            for i in range(0,ofdm_mode):
                tmp.append(encoded_data[i])
            encoded_data = tmp

            if numpy.allclose(reference_data, encoded_data, rtol=1.0000000000000001e-04, atol=1e-06):
                passed += 1
                print "Test %d PASSED" % linecnt
            else:
                print "Test %d FAILED" % linecnt
                print "input data:"
                print data_to_encode
                print "encoded data[0]:"
                print encoded_data[0]
                print "reference data[0]:"
                print reference_data[0]
                print "error data:"
                print reference_data - encoded_data
            linecnt += 1
        print "%d pass out of %d" % (passed, linecnt-1)
        self.fd_input.close()
        self.fd_output.close()
        if passed == (linecnt-1):
            print "All ofdm ifft tests PASS\n"
            return True
        else:
            print "at least one ofdm ifft test FAILED\n"
            return False

    def test_algorithmB(self, ofdm_mode, guardinterval):
        print "\n**************************"
        print "test ofdm numpy ifft w/o fftshift"
        passed = 0
        linecnt = 1
        g = 0
        size = 0
        
        if ofdm_mode == 8192:
            size = 6817
            print "8k mode"
            if guardinterval == 0.25:
                self.fd_input = open('test_bench_ofdm_input_8K_1_4.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_4.csv', 'r')
                print "1/4 guard interval"
                g = guardinterval
            if guardinterval == 0.125:
                self.fd_input = open('test_bench_ofdm_input_8K_1_8.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_8.csv', 'r')
                print "1/8 guard interval"
                g = guardinterval
            if guardinterval == 0.0625:
                self.fd_input = open('test_bench_ofdm_input_8K_1_16.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_16.csv', 'r')
                print "1/16 guard interval"
                g = guardinterval
            if guardinterval == 0.03125:
                self.fd_input = open('test_bench_ofdm_input_8K_1_32.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_32.csv', 'r')
                print "1/32 guard interval"
                g = guardinterval
                
        elif ofdm_mode == 2048:
            size = 1705
            print "2k mode"
            if guardinterval == 0.25:
                self.fd_input = open('test_bench_ofdm_input_2K_1_4.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_4.csv', 'r')
                print "1/4 guard interval"
                g = guardinterval
            if guardinterval == 0.125:
                self.fd_input = open('test_bench_ofdm_input_2K_1_8.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_8.csv', 'r')
                print "1/8 guard interval"
                g = guardinterval
            if guardinterval == 0.0625:
                self.fd_input = open('test_bench_ofdm_input_2K_1_16.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_16.csv', 'r')
                print "1/16 guard interval"
                g = guardinterval
            if guardinterval == 0.03125:
                self.fd_input = open('test_bench_ofdm_input_2K_1_32.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_32.csv', 'r')
                print "1/32 guard interval"
                g = guardinterval

        if g == 0:
            print "wrong guardinterval specified"
            return
        
        if size == 0:
            print "wrong ofdm_mode"
            return
        
        for line in self.fd_input:
            data_to_encode = [float(0) + 1j*float(0)] * ofdm_mode
            #counter = (2048-size-1)/2+1
            counter = 0
            for tmp in line.split(","):
                if string.find(tmp, " + ") > -1:
                    data_to_encode[counter]=(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i","")))
                if string.find(tmp, " - ") > -1:
                    data_to_encode[counter]=(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i","")))
                counter += 1
            data_to_encode = numpy.array(data_to_encode, dtype=numpy.complex128)

            reference_data = []
            for tmp in self.fd_output.readline().split(","):
                if string.find(tmp, " + ") > -1:
                    reference_data.append(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i","")))
                if string.find(tmp, " - ") > -1:
                    reference_data.append(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i","")))

            reference_data = numpy.array(reference_data, dtype=numpy.complex128)

            # do fftshift
            tmp = [float(0) + 1j*float(0)] * ofdm_mode
            for i in range(0,size):
                tmp[(ofdm_mode-size+1)/2+i] = data_to_encode[i]
            for i in range(0,ofdm_mode/2):
                data_to_encode[i] = tmp[ofdm_mode/2+i]
                data_to_encode[i+ofdm_mode/2] = tmp[i]


            encoded_data = numpy.fft.ifft(data_to_encode)

            # add guard interval
            tmp = []
            for i in range(0,int(ofdm_mode*g)):
                tmp.append(encoded_data[(ofdm_mode*(1-g))+i])
            for i in range(0,ofdm_mode):
                tmp.append(encoded_data[i])
            encoded_data = tmp

            if numpy.allclose(reference_data, encoded_data, rtol=1.0000000000000001e-04, atol=1e-06):
                passed += 1
                print "Test %d PASSED" % linecnt
            else:
                print "Test %d FAILED" % linecnt
                print "input data:"
                #print data_to_encode
                print "encoded data[0]:"
                print encoded_data[0]
                print "reference data[0]:"
                print reference_data[0]
                print "error data:"
                #print reference_data - encoded_data
            linecnt += 1
        print "%d pass out of %d" % (passed, linecnt-1)
        self.fd_input.close()
        self.fd_output.close()
        if passed == (linecnt-1):
            print "All ofdm ifft tests PASS\n"
            return True
        else:
            print "at least one ofdm ifft test FAILED\n"
            return False

    def test_algorithmC(self, ofdm_mode, guardinterval):
        print "\n**************************"
        print "test ofdm opencl ifft w/o fftshift"
        passed = 0
        linecnt = 1
        g = 0
        size = 0
        # create a fft plan
        self.fftplan = Plan(ofdm_mode,dtype=numpy.complex128,normalize=True, fast_math=True,context=self.ctx, queue=self.queue)
        # opencl buffer holding data for the ifft - including pilots # 8k or 2k size ???
        fftbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdm_mode*16)

        # size of the guard-interval destination buffer
        dest_buf_size = ofdm_mode*(1+guardinterval)

        # opencl buffer holding the computed data
        dest_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=int(dest_buf_size*16) )

        encoded_data = numpy.array(numpy.zeros(dest_buf_size), dtype=numpy.complex128)
        
        if ofdm_mode == 8192:
            size = 6817
            print "8k mode"
            if guardinterval == 0.25:
                self.fd_input = open('test_bench_ofdm_input_8K_1_4.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_4.csv', 'r')
                print "1/4 guard interval"
                g = guardinterval
            if guardinterval == 0.125:
                self.fd_input = open('test_bench_ofdm_input_8K_1_8.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_8.csv', 'r')
                print "1/8 guard interval"
                g = guardinterval
            if guardinterval == 0.0625:
                self.fd_input = open('test_bench_ofdm_input_8K_1_16.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_16.csv', 'r')
                print "1/16 guard interval"
                g = guardinterval
            if guardinterval == 0.03125:
                self.fd_input = open('test_bench_ofdm_input_8K_1_32.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_32.csv', 'r')
                print "1/32 guard interval"
                g = guardinterval
                
        elif ofdm_mode == 2048:
            size = 1705
            print "2k mode"
            if guardinterval == 0.25:
                self.fd_input = open('test_bench_ofdm_input_2K_1_4.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_4.csv', 'r')
                print "1/4 guard interval"
                g = guardinterval
            if guardinterval == 0.125:
                self.fd_input = open('test_bench_ofdm_input_2K_1_8.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_8.csv', 'r')
                print "1/8 guard interval"
                g = guardinterval
            if guardinterval == 0.0625:
                self.fd_input = open('test_bench_ofdm_input_2K_1_16.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_16.csv', 'r')
                print "1/16 guard interval"
                g = guardinterval
            if guardinterval == 0.03125:
                self.fd_input = open('test_bench_ofdm_input_2K_1_32.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_32.csv', 'r')
                print "1/32 guard interval"
                g = guardinterval

        if g == 0:
            print "wrong guardinterval specified"
            return
        
        if size == 0:
            print "wrong ofdm_mode"
            return
        
        for line in self.fd_input:
            data_to_encode = [float(0) + 1j*float(0)] * ofdm_mode
            #counter = (2048-size-1)/2+1
            counter = 0
            for tmp in line.split(","):
                if string.find(tmp, " + ") > -1:
                    data_to_encode[counter]=(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i","")))
                if string.find(tmp, " - ") > -1:
                    data_to_encode[counter]=(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i","")))
                counter += 1
            data_to_encode = numpy.array(data_to_encode, dtype=numpy.complex128)

            reference_data = []
            for tmp in self.fd_output.readline().split(","):
                if string.find(tmp, " + ") > -1:
                    reference_data.append(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i","")))
                if string.find(tmp, " - ") > -1:
                    reference_data.append(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i","")))

            reference_data = numpy.array(reference_data, dtype=numpy.complex128)

            # do fftshift
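            # The first loop below centres the `size` active carriers inside the
            # ofdm_mode-point symbol (zero-padding both edges); the second loop swaps
            # the two halves so DC ends up in bin 0 before the inverse FFT
            # (for the even lengths used here, fftshift and ifftshift coincide).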
            tmp = [float(0) + 1j*float(0)] * ofdm_mode
            for i in range(0,size):
                tmp[(ofdm_mode-size+1)/2+i] = data_to_encode[i]
            for i in range(0,ofdm_mode/2):
                data_to_encode[i] = tmp[ofdm_mode/2+i]
                data_to_encode[i+ofdm_mode/2] = tmp[i]



            cl.enqueue_copy(self.queue, fftbuffer, data_to_encode).wait()
            self.fftplan.execute(fftbuffer, data_out=None, inverse=True, batch=1, wait_for_finish=True)

            # add guard interval
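            # Cyclic prefix: the whole ofdm_mode-sample symbol is copied in after a
            # prefix region of ofdm_mode*guardinterval samples, then the last
            # ofdm_mode*guardinterval samples are copied to the front of dest_buf.
            # Offsets and counts are in bytes (16 bytes per complex128 sample).
            # Note: current PyOpenCL releases name this keyword `dst_offset`; the
            # `dest_offset` spelling below may need renaming depending on the version.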
            cl.enqueue_copy(self.queue, dest_buf, fftbuffer, byte_count=int(16*ofdm_mode), src_offset=0, dest_offset=int(ofdm_mode*guardinterval*16)).wait()
            cl.enqueue_copy(self.queue, dest_buf, fftbuffer, byte_count=int(ofdm_mode*guardinterval*16), src_offset=int(ofdm_mode - ofdm_mode*guardinterval)*16 ,dest_offset=0).wait()
            cl.enqueue_copy(self.queue, encoded_data, dest_buf).wait()


            if numpy.allclose(reference_data, encoded_data, rtol=1e-04, atol=1e-06):
                passed += 1
                print "Test %d PASSED" % linecnt
            else:
                print "Test %d FAILED" % linecnt
                print "input data:"
                #print data_to_encode
                print "encoded data[0]:"
                print encoded_data[0]
                print "reference data[0]:"
                print reference_data[0]
                print "error data:"
                #print reference_data - encoded_data
            linecnt += 1
        print "%d pass out of %d" % (passed, linecnt-1)
        self.fd_input.close()
        self.fd_output.close()
        if passed == (linecnt-1):
            print "All ofdm ifft tests PASS\n"
            return True
        else:
            print "at least one ofdm ifft test FAILED\n"
            return False

    def test_algorithmD(self, ofdm_mode, guardinterval):
        print "\n**************************"
        print "test ofdm opencl ifft w/o fftshift http://ochafik.com/ dft"
        passed = 0
        linecnt = 1
        g = 0
        size = 0
        # create a kernel
        kernel = self.load_kernel("DiscreteFourierTransformProgram.cl", "dft")

        # size of guard-interval destination buffer
        dest_buf_size = ofdm_mode*(1+guardinterval)

        self.inputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdm_mode*16)
        # opencl buffer
        self.outputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdm_mode*16)

        # opencl buffer holding the computed data
        dest_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=int(dest_buf_size*16) )

        encoded_data = numpy.zeros(int(dest_buf_size), dtype=numpy.complex128)
        
        if ofdm_mode == 8192:
            size = 6817
            print "8k mode"
            if guardinterval == 0.25:
                self.fd_input = open('test_bench_ofdm_input_8K_1_4.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_4.csv', 'r')
                print "1/4 guard interval"
                g = guardinterval
            if guardinterval == 0.125:
                self.fd_input = open('test_bench_ofdm_input_8K_1_8.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_8.csv', 'r')
                print "1/8 guard interval"
                g = guardinterval
            if guardinterval == 0.0625:
                self.fd_input = open('test_bench_ofdm_input_8K_1_16.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_16.csv', 'r')
                print "1/16 guard interval"
                g = guardinterval
            if guardinterval == 0.03125:
                self.fd_input = open('test_bench_ofdm_input_8K_1_32.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_32.csv', 'r')
                print "1/32 guard interval"
                g = guardinterval
                
        elif ofdm_mode == 2048:
            size = 1705
            print "2k mode"
            if guardinterval == 0.25:
                self.fd_input = open('test_bench_ofdm_input_2K_1_4.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_4.csv', 'r')
                print "1/4 guard interval"
                g = guardinterval
            if guardinterval == 0.125:
                self.fd_input = open('test_bench_ofdm_input_2K_1_8.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_8.csv', 'r')
                print "1/8 guard interval"
                g = guardinterval
            if guardinterval == 0.0625:
                self.fd_input = open('test_bench_ofdm_input_2K_1_16.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_16.csv', 'r')
                print "1/16 guard interval"
                g = guardinterval
            if guardinterval == 0.03125:
                self.fd_input = open('test_bench_ofdm_input_2K_1_32.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_32.csv', 'r')
                print "1/32 guard interval"
                g = guardinterval

        if g == 0:
            print "wrong guardinterval specified"
            return
        
        if size == 0:
            print "wrong ofdm_mode"
            return
        
        for line in self.fd_input:
            data_to_encode = [float(0) + 1j*float(0)] * ofdm_mode

            counter = 0
            for tmp in line.split(","):
                if string.find(tmp, " + ") > -1:
                    data_to_encode[counter]=(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i","")))
                if string.find(tmp, " - ") > -1:
                    data_to_encode[counter]=(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i","")))
                counter += 1
            data_to_encode = numpy.array(data_to_encode, dtype=numpy.complex128)

            reference_data = []
            for tmp in self.fd_output.readline().split(","):
                if string.find(tmp, " + ") > -1:
                    reference_data.append(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i","")))
                if string.find(tmp, " - ") > -1:
                    reference_data.append(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i","")))

            reference_data = numpy.array(reference_data, dtype=numpy.complex128)

            # do fftshift
            tmp = [float(0) + 1j*float(0)] * ofdm_mode
            for i in range(0,size):
                tmp[(ofdm_mode-size+1)/2+i] = data_to_encode[i]
            for i in range(0,ofdm_mode/2):
                data_to_encode[i] = tmp[ofdm_mode/2+i]
                data_to_encode[i+ofdm_mode/2] = tmp[i]



            cl.enqueue_copy(self.queue, self.inputbuffer, data_to_encode)

            kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(ofdm_mode),numpy.int32(-1))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode),),None).wait()
            cl.enqueue_copy(self.queue, dest_buf, self.outputbuffer, byte_count=int(16*ofdm_mode), src_offset=0, dest_offset=int(ofdm_mode*guardinterval*16)).wait()
            cl.enqueue_copy(self.queue, dest_buf, self.outputbuffer, byte_count=int(ofdm_mode*guardinterval*16), src_offset=int(ofdm_mode - ofdm_mode*guardinterval)*16 ,dest_offset=0).wait()
            cl.enqueue_copy(self.queue, encoded_data, dest_buf)

            if numpy.allclose(reference_data, encoded_data, rtol=1e-04, atol=1e-06):
                passed += 1
                print "Test %d PASSED" % linecnt
            else:
                print "Test %d FAILED" % linecnt
                print "input data:"
                print data_to_encode
                print "encoded data[0]:"
                print encoded_data[0]
                print "reference data[0]:"
                print reference_data[0]
                print "error data:"
                #print reference_data - encoded_data
            linecnt += 1
        print "%d pass out of %d" % (passed, linecnt-1)
        self.fd_input.close()
        self.fd_output.close()
        if passed == (linecnt-1):
            print "All ofdm ifft tests PASS\n"
            return True
        else:
            print "at least one ofdm ifft test FAILED\n"
            return False

    def test_algorithmE(self, ofdm_mode, guardinterval):
        print "\n**************************"
        print "test ofdm opencl ifft bealto.com radix 2 fft"
        passed = 0
        linecnt = 1
        g = 0
        size = 0
        
        if ofdm_mode == 8192:
            size = 6817
            print "8k mode"
            if guardinterval == 0.25:
                self.fd_input = open('test_bench_ofdm_input_8K_1_4.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_4.csv', 'r')
                print "1/4 guard interval"
                g = guardinterval
            if guardinterval == 0.125:
                self.fd_input = open('test_bench_ofdm_input_8K_1_8.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_8.csv', 'r')
                print "1/8 guard interval"
                g = guardinterval
            if guardinterval == 0.0625:
                self.fd_input = open('test_bench_ofdm_input_8K_1_16.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_16.csv', 'r')
                print "1/16 guard interval"
                g = guardinterval
            if guardinterval == 0.03125:
                self.fd_input = open('test_bench_ofdm_input_8K_1_32.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_8K_1_32.csv', 'r')
                print "1/32 guard interval"
                g = guardinterval
                
        elif ofdm_mode == 2048:
            size = 1705
            print "2k mode"
            if guardinterval == 0.25:
                self.fd_input = open('test_bench_ofdm_input_2K_1_4.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_4.csv', 'r')
                print "1/4 guard interval"
                g = guardinterval
            if guardinterval == 0.125:
                self.fd_input = open('test_bench_ofdm_input_2K_1_8.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_8.csv', 'r')
                print "1/8 guard interval"
                g = guardinterval
            if guardinterval == 0.0625:
                self.fd_input = open('test_bench_ofdm_input_2K_1_16.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_16.csv', 'r')
                print "1/16 guard interval"
                g = guardinterval
            if guardinterval == 0.03125:
                self.fd_input = open('test_bench_ofdm_input_2K_1_32.csv', 'r')
                self.fd_output = open('test_bench_ofdm_output_2K_1_32.csv', 'r')
                print "1/32 guard interval"
                g = guardinterval

        if g == 0:
            print "wrong guardinterval specified"
            return
        
        if size == 0:
            print "wrong ofdm_mode"
            return
            
        kernel = self.load_kernel("../FFT.cl", "fftRadix2Kernel")
        swapkernel = self.load_kernel("../FFT.cl", "fftswaprealimag")

        # size of guard-interval destination buffer
        dest_buf_size = ofdm_mode*(1+guardinterval)
        self.tmpbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=size*8)
        self.inputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdm_mode*8)
        # opencl buffer
        self.outputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdm_mode*8)

        # opencl buffer holding the computed data
        dest_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=int(dest_buf_size*8) )

        encoded_data = numpy.zeros(int(dest_buf_size), dtype=numpy.complex64)
        
      
        
        for line in self.fd_input:
            data_to_encode = numpy.array([float(0) + 1j*float(0)] * size, dtype=numpy.complex64)
            #data_to_encode = numpy.array([float(0) + 1j*float(0)] * ofdm_mode, dtype=numpy.complex64)

            cl.enqueue_copy(self.queue, self.inputbuffer, data_to_encode)
            cl.enqueue_copy(self.queue, self.outputbuffer, data_to_encode)

            counter = 0
            for tmp in line.split(","):
                if string.find(tmp, " + ") > -1:
                    data_to_encode[counter]=(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i","")))
                if string.find(tmp, " - ") > -1:
                    data_to_encode[counter]=(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i","")))
                counter += 1
            data_to_encode = numpy.array(data_to_encode, dtype=numpy.complex64)

            reference_data = []
            for tmp in self.fd_output.readline().split(","):
                if string.find(tmp, " + ") > -1:
                    reference_data.append(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i","")))
                if string.find(tmp, " - ") > -1:
                    reference_data.append(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i","")))

            reference_data = numpy.array(reference_data, dtype=numpy.complex64)
            
            # do fftshift
            cl.enqueue_copy(self.queue, self.inputbuffer,numpy.array([float(0) + 1j*float(0)] * ofdm_mode, dtype=numpy.complex64))
            cl.enqueue_copy(self.queue, self.tmpbuffer, data_to_encode)
            cl.enqueue_copy(self.queue, self.inputbuffer, self.tmpbuffer, byte_count=((size-1)/2)*8,src_offset=0,dest_offset=(ofdm_mode-(size-1)/2)*8)
            cl.enqueue_copy(self.queue, self.inputbuffer, self.tmpbuffer, byte_count=((size+1)/2)*8,src_offset=((size-1)/2)*8,dest_offset=0)

            swapkernel.set_args(self.inputbuffer)
            cl.enqueue_nd_range_kernel(self.queue,swapkernel,(int(ofdm_mode),),None).wait()

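            # The fftswaprealimag passes before and after the FFT stages implement the
            # usual trick of computing an inverse FFT with a forward-FFT kernel: swap
            # the real and imaginary parts, run the forward transform, swap back (the
            # overall 1/N scale is assumed to be handled inside the kernels).
            # Each fftRadix2Kernel launch below is one radix-2 stage, with
            # p = 1, 2, 4, ..., ofdm_mode/2, ping-ponging between inputbuffer and
            # outputbuffer: 11 passes for the 2K mode, 13 for the 8K mode.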
            kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(1))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(2))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(4))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(8))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(16))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(32))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(64))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(128))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(256))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(512))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(1024))
            cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            if ofdm_mode == 8192:
                kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(2048))
                cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

                kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(4096))
                cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait()

            swapkernel.set_args(self.outputbuffer)
            cl.enqueue_nd_range_kernel(self.queue,swapkernel,(int(ofdm_mode),),None).wait()

            cl.enqueue_copy(self.queue, dest_buf, self.outputbuffer, byte_count=int(8*ofdm_mode), src_offset=0, dest_offset=int(ofdm_mode*guardinterval*8)).wait()
            cl.enqueue_copy(self.queue, dest_buf, self.outputbuffer, byte_count=int(ofdm_mode*guardinterval*8), src_offset=int(ofdm_mode - ofdm_mode*guardinterval)*8 ,dest_offset=0).wait()
            cl.enqueue_copy(self.queue, encoded_data, dest_buf)


            if numpy.allclose(reference_data, encoded_data, rtol=1e-04, atol=1e-06):
                passed += 1
                print "Test %d PASSED" % linecnt
            else:
                print "Test %d FAILED" % linecnt
                print "input data:"
                print data_to_encode
                print "encoded data[0]:"
                print encoded_data[0]
                print "reference data[0]:"
                print reference_data[0]
                print "error data:"
                #print reference_data - encoded_data

            linecnt += 1
        print "%d pass out of %d" % (passed, linecnt-1)
        self.fd_input.close()
        self.fd_output.close()
        if passed == (linecnt-1):
            print "All ofdm ifft tests PASS\n"
            return True
        else:
            print "at least one ofdm ifft test FAILED\n"
            return False
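All three OFDM test methods above validate the same reference computation. Below is a minimal NumPy sketch of that computation, assuming the CSV carriers are already parsed into a complex array; the helper name ofdm_ifft_reference is hypothetical, and numpy.fft.ifft includes the 1/N factor that the OpenCL plans must also apply for the comparison against the reference files to hold.

import numpy as np

def ofdm_ifft_reference(carriers, ofdm_mode, guardinterval):
    # Hypothetical NumPy equivalent of the OpenCL pipeline exercised above:
    # centre the active carriers, move DC to bin 0, inverse FFT, then prepend
    # the cyclic prefix (guard interval).
    size = len(carriers)
    symbol = np.zeros(ofdm_mode, dtype=np.complex128)
    start = (ofdm_mode - size + 1) // 2
    symbol[start:start + size] = carriers
    time_domain = np.fft.ifft(np.fft.ifftshift(symbol))
    guard = int(ofdm_mode * guardinterval)
    return np.concatenate((time_domain[-guard:], time_domain))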
Esempio n. 20
0
def myfunc(d_g):
    from pyfft.cl import Plan
    from gputools import get_device
    plan = Plan(d_g.shape, queue=get_device().queue, fast_math=True)
    plan.execute(d_g.data, d_g.data)
    plan.execute(d_g.data, d_g.data, inverse=True)
Esempio n. 21
0
def gs_gpu(idata,itera=100):
    """Gerchberg-Saxton algorithm to calculate DOEs using the GPU
    
    Calculates the phase distribution in a object plane to obtain an 
    specific amplitude distribution in the target plane. It uses a 
    FFT to calculate the field propagation.
    The wavefront at the DOE plane is assumed as a plane wave.
    
    **ARGUMENTS:**
	
		========== ======================================================
		idata      numpy array containing the target amplitude distribution 
        itera      Maximum number of iterations
		========== ======================================================
    """ 
    
    pl=cl.get_platforms()[0]
    devices=pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue, dtype=complex128)  # does not work with "complex128"
    
    src = str(Template(KERNEL).render(
        double_support=all(
            has_double_support(dev) for dev in devices),
        amd_double_support=all(
            has_amd_double_support(dev) for dev in devices)
        ))
    prg = cl.Program(ctx,src).build() 
    
    idata_gpu=cl_array.to_device(queue, ifftshift(idata).astype("complex128"))
    fdata_gpu=cl_array.empty_like(idata_gpu)
    rdata_gpu=cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data,fdata_gpu.data)
    
    e=1000
    ea=1000
    for i in range (itera):
        prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data)
        plan.execute(fdata_gpu.data,rdata_gpu.data,inverse=True)
        tr=rdata_gpu.get()
        rdata=ifftshift(tr)
        
        
        #TODO: This calculation should be done in the GPU
        e= (abs(rdata)-idata).std()
        if e>ea: 
            break
        ea=e
        
        prg.norm2(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data)
        
        plan.execute(rdata_gpu.data,fdata_gpu.data)
    
    fdata=fdata_gpu.get()
    
    #~ prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data)
    fdata=ifftshift(fdata)
    fdata=exp(1.j*angle(fdata))
    
    #~ fdata=fdata_gpu.get()
    return fdata
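For comparison, here is a CPU-only sketch of the same Gerchberg-Saxton loop using numpy.fft. The `norm` and `norm2` kernels are assumed to keep only the phase and to impose the target amplitude, respectively, and the fftshift bookkeeping of the GPU version is left out for brevity; the helper name gs_cpu is hypothetical.

import numpy as np

def gs_cpu(idata, itera=100):
    # Sketch of the Gerchberg-Saxton iteration for a phase-only DOE.
    fdata = np.fft.fft2(idata.astype(np.complex128))
    ea = np.inf
    for _ in range(itera):
        fdata = np.exp(1j * np.angle(fdata))           # DOE plane: unit amplitude, keep phase
        rdata = np.fft.ifft2(fdata)                     # propagate back to the target plane
        e = (np.abs(rdata) - idata).std()
        if e > ea:                                      # stop once the error starts growing
            break
        ea = e
        rdata = idata * np.exp(1j * np.angle(rdata))    # impose the target amplitude
        fdata = np.fft.fft2(rdata)
    return np.exp(1j * np.angle(fdata))                 # phase-only DOE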
Esempio n. 22
0
class SplitStep(object):
	'''
	An OpenCL version of the split-step parabolic equation solver.
	'''

	_kernel = util.srcpath(__file__, 'clsrc', 'splitstep.mako')

	def __init__(self, k0, nx, ny, h, d=None, l=10, dz=None,
			w=0.39, propcorr=None, phasetol=None, context=None):
		'''
		Initialize a split-step engine over an nx-by-ny grid with
		isotropic step size h. The unitless wave number is k0. The wave
		is advanced in steps of dz or (if dz is not provided) h.

		If d is provided, it is a 4-tuple that describes the
		directivity of any source as d = (dx, dy, dz, w), where (dx,
		dy, dz) is the directivity axis and w is the beam width
		parameter. Otherwise, all sources are treated as point sources.

		If l is specified and greater than zero, it is the width of a
		Hann window used to attenuate the field along each edge.

		The parameter w (as a multiplier 1 / w**2) governs the
		high-order spectral cross term.

		If propcorr is specified, it should be a tuple of Booleans of
		the form (hospec, hospat) that determines whether corrections
		involving high-order spectral or spatial terms, respectively,
		are used in the propagator by default. If propcorr is
		unspecified, both corrections are used.

		The parameter phasetol specifies the maximum permissible phase
		deviation (in fractions of pi), relative to propagation through
		the homogeneous background, incurred by propagating through the
		inhomogeneous medium. The number of steps per slab will be
		adjusted so that the phase shift incurred by propagating
		through materials with the most extreme sound speeds will never
		exceed phasetol.
		'''
		# Ensure that the phase tolerance is not too small
		# Otherwise, number of propagation steps will blow up uncontrollably
		if phasetol is not None and abs(phasetol) < 1e-6:
			raise ValueError('Phase tolerance must be greater than 1e-6')

		# Copy the parameters
		self.grid = nx, ny
		self.h, self.k0, self.l = h, k0, l
		self.w = np.float32(1. / w**2)
		self.phasetol = phasetol
		# Specify the use of corrective terms
		if propcorr is not None:
			self.propcorr = tuple(propcorr)
		else: self.propcorr = (True, True)

		# Set the step length
		self.dz = dz if dz else h

		# Grab the provided context or create a default
		self.context = util.grabcontext(context)

		# Build the program for the context
		t = Template(filename=self._kernel, output_encoding='ascii')
		src = t.render(grid = self.grid, k0=k0, h=h, d=d, l=l)
		self.prog = cl.Program(self.context, src).build()

		# Create a command queue for forward propagation calculations
		self.fwdque = cl.CommandQueue(self.context)
		# Create command queues for transfers
		self.recvque = cl.CommandQueue(self.context)
		self.sendque = cl.CommandQueue(self.context)

		# Create an FFT plan in the OpenCL propagation queue
		# Reorder the axes to conform with row-major ordering
		self.fftplan = Plan((ny, nx), queue=self.fwdque)

		grid = self.grid
		def newbuffer():
			nbytes = cutil.prod(grid) * np.complex64().nbytes
			flags = cl.mem_flags.READ_WRITE
			return util.SyncBuffer(self.context, flags, size=nbytes)
		# Buffers to store the propagating (twice) and backward fields
		self.fld = [newbuffer() for i in range(3)]
		# Scratch space used during computations
		self.scratch = [newbuffer() for i in range(3)]
		# The index of refraction gets two buffers for transmission
		self.obj = [newbuffer() for i in range(2)]
		# Two buffers are used for the Goertzel FFT of the contrast source
		self.goertzbuf = [newbuffer() for i in range(2)]
		# The sound speed extrema for the current slab are stored here
		self.speedlim = [1., 1.]
		# Initialize buffer to hold results of advance()
		self.result = newbuffer()

		# By default, volume fields will be transfered from the device
		self._goertzel = False

		# Initialize refractive index and fields
		self.reset()

		# By default, device exchange happens on the full grid
		self.rectxfer = util.RectangularTransfer(grid, grid, np.complex64, alloc_host=False)


	def slicecoords(self):
		'''
		Return the meshgrid coordinate arrays for an x-y slab in the
		current engine. The origin is in the center of the slab.
		'''
		g = self.grid
		cg = np.ogrid[:g[0], :g[1]]
		return [(c - 0.5 * (n - 1.)) * self.h for c, n in zip(cg, g)]


	def reset(self, propcorr = None, goertzel = False):
		'''
		Reset the propagating and backward fields to zero and the prior
		refractive index buffer to unity.

		If propcorr is provided, it should be tuple as described in the
		docstring for __init__(). This will change the default behavior
		of corrective terms in the propagator.

		If goertzel is True, calls to advance() with shift=True will
		not copy the shifted, combined field from the device. Instead,
		the field will be used in a Goertzel algorithm to compute
		(slice by slice) the Fourier transform, restricted to the unit
		sphere, of induced scattering sources.
		'''
		if propcorr is not None: self.propcorr = tuple(propcorr)
		grid = self.grid
		z = np.zeros(grid, dtype=np.complex64)
		for a in self.fld + self.obj + self.goertzbuf:
			cl.enqueue_copy(self.fwdque, a, z, is_blocking=False)
		self.speedlim = [1., 1.]
		self._goertzel = goertzel


	def setroi(self, rgrid):
		'''
		Set a region of interest that will limit device transfers
		within the computational grid.
		'''
		self.rectxfer = util.RectangularTransfer(self.grid, rgrid, np.complex64, alloc_host=False)


	def setincident(self, srcloc, idx = 0):
		'''
		Set the value of the CL field buffer at index idx to the
		incident field at a location srcloc represented as a 3-tuple.

		The field plane is always assumed to have a z-height of 0; the
		z coordinate of srcloc is therefore the height of the source
		above the field plane. The transverse origin (x, y) = (0, 0)
		corresponds to the midpoint of the field plane.
		'''
		inc = self.fld[idx]
		sx, sy, dz = [np.float32(s) for s in srcloc]
		self.prog.green3d(self.fwdque, self.grid, None, inc, sx, sy, dz)


	def objupdate(self, obj):
		'''
		Update the rolling buffer with the index of refraction,
		corresponding to an object contrast obj, for the next slab.

		The transmission queue is used for updates to facilitate
		concurrency with forward propagation algorithms.
		'''
		# Transfers occur in the transmission queue for concurrency
		prog, queue, grid = self.prog, self.recvque, self.grid
		# Roll the buffer so the next slab is second
		nxt, cur = self.obj
		self.obj = [cur, nxt]

		# Figure approximate sound speed extrema in the upcoming slab
		# If the imaginary part of the wave number is negligible
		# compared to the real, maximum sound speed corresponds to the
		# minimum contrast and vice versa
		if self.phasetol:
			ctextrema = [np.max(obj.real), np.min(obj.real)]
			self.speedlim = [1 / np.sqrt(ctv + 1.) for ctv in ctextrema]

		# Ensure buffer is not used by prior calculations
		nxt.sync(queue)
		# Transfer the object contrast into the next-slab buffer
		evt = self.rectxfer.todevice(queue, nxt, obj)

		# Return buffers of the current and next slabs and a transfer event
		return cur, nxt, evt


	def propagate(self, fld = None, dz = None, idx = 0, corr = None):
		'''
		Propagate the field stored in the device buffer fld (or, if fld
		is None, the current in-device field at index idx) a step dz
		(or, if dz is None, the default step size) through the
		currently represented medium.

		If corr is not None, it should be a tuple as described in the
		reset() docstring to control the use of corrective terms in the
		spectral propagator. Otherwise, the instance default is used.
		'''
		prog, grid = self.prog, self.grid
		fwdque = self.fwdque
		hospec, hospat = corr if corr is not None else self.propcorr

		# Point to the field, scratch buffers, and refractive index
		if fld is None: fld = self.fld[idx]
		u, v, x = self.scratch
		obj = self.obj[0]

		# These constants will be used in field computations
		one = np.float32(1.)
		dz = np.float32(dz if dz is not None else self.dz)

		# Attenuate the boundaries using a Hann window, if desired
		if self.l > 0:
			prog.attenx(fwdque, (self.l, grid[1]), None, fld)
			prog.atteny(fwdque, (grid[0], self.l), None, fld)

		# Multiply, in v, the field by the contrast
		if hospec: prog.ctmul(fwdque, grid, None, v, obj, fld)
		# Multiply, in u, the field by the high-order spatial operator
		if hospat: prog.hospat(fwdque, grid, None, u, obj, fld)

		# From here, the field should be spectral
		self.fftplan.execute(fld)

		# Compute high-order spatial corrections or set the buffer to NULL
		if hospat:
			# With high-order spatial terms, transform u as well
			self.fftplan.execute(u)
			# Compute the scaled, spectral Laplacians of u and the field (in x)
			prog.laplacian(fwdque, grid, None, u, u)
			prog.laplacian(fwdque, grid, None, x, fld)
			# Apply the high-order spatial operator to x
			self.fftplan.execute(x, inverse=True)
			prog.hospat(fwdque, grid, None, x, obj, x)
			self.fftplan.execute(x)
			# Add x to u to get the high-order spatial corrections
			prog.caxpy(fwdque, grid, None, x, one, u, x)
		else: x = None

		# Compute high-order spectral corrections or set buffers to NULL
		if hospec:
			# Apply, in u, the high-order spectral operator to the field
			prog.hospec(fwdque, grid, None, u, fld)
			# Multiply u by the contrast in the spatial domain
			self.fftplan.execute(u, inverse=True)
			prog.ctmul(fwdque, grid, None, u, obj, u)
			# Let v = v + u / w**2 in the spatial domain
			prog.caxpy(fwdque, grid, None, v, self.w, u, v)
			# Transform u and v into the spectral domain
			self.fftplan.execute(u)
			self.fftplan.execute(v)
			# Apply the high-order spectral operator to the new v
			prog.hospec(fwdque, grid, None, v, v)
		else: u, v = None, None

		# Add all appropriate high-order corrections to the field
		if hospat or hospec:
			prog.corrfld(fwdque, grid, None, fld, u, v, x, dz)

		# Propagate the field through a homogeneous slab
		prog.propagate(fwdque, grid, None, fld, dz)

		# Take the inverse FFT of the field and the Laplacian
		self.fftplan.execute(fld, inverse=True)

		# Multiply by the phase screen, returning the event
		return prog.screen(fwdque, grid, None, fld, obj, dz)


	def advance(self, obj, shift=False, corr=None, shcorr=None):
		'''
		Propagate a field through the current slab and transmit it
		through an interface with the next slab characterized by object
		contrast obj. The transmission overwrites the refractive index
		of the current slab with the interface reflection coefficients.

		If shift is True, the forward field is shifted by half a slab to
		agree with full-wave solutions and includes a
		backward-traveling contribution caused by reflection from the
		interface with the next slab.

		The relevant result (either the forward field or the
		half-shifted combined field) is copied into a device-side
		buffer for later retrieval and handling.

		If corr is not None, it should be a tuple as specified in the
		reset() docstring to override the default use of corrective
		terms in the spectral propagator.

		The argument shcorr is interpreted exactly as corr, but is used
		instead of corr for the propagation used to shift the field to
		the center of the slab.
		'''
		prog, grid = self.prog, self.grid
		fwdque, recvque, sendque = self.fwdque, self.recvque, self.sendque

		# Point to the field components
		fwd, bck, buf = [f for f in self.fld]

		if shift:
			# Ensure that a prior copy isn't using the buffer
			buf.sync(fwdque)
			# Copy the forward field for shifting if necessary
			cl.enqueue_copy(fwdque, buf, fwd)

		# Copy the sound speed extrema for the current slab
		speedlim = list(self.speedlim)
		# Push the next slab to its buffer (overwrites speed extrema)
		ocur, onxt, obevt = self.objupdate(obj)

		if self.phasetol is not None:
			# Figure maximum propagation distance to not
			# exceed maximum permissible phase deviation
			dzl = []
			for spd in speedlim:
				# Sign governs the sign of the phase deviation,
				# which is irrelevant, so ignore it here
				spdiff = max(abs(spd - 1.), 1e-8)
				# Preventing spdiff from reaching zero limits
				# maximum permissible propagation distance
				dzl.append(abs(0.5 * self.phasetol * spd / spdiff))
			# Subdivide the slab into maximum propagation distance
			nsteps = max(1, int(np.round(self.dz / min(dzl))))
		else: nsteps = 1
		dz = self.dz / nsteps

		# Ensure that no prior copy is using the field buffer
		fwd.sync(fwdque)

		# Propagate the forward field through the slab on the fwdque
		for i in range(nsteps): self.propagate(fwd, dz, corr=corr)

		# Ensure next slab has been received before handling interface
		cl.enqueue_barrier(fwdque, wait_for=[obevt])

		# Compute transmission through the interface
		# The reflected field is only of interest if a shift is desired
		transevt = prog.txreflect(fwdque, grid, None,
				fwd, bck if shift else None, ocur, onxt)
		# Hold the current contrast slab until the transmission is done
		ocur.attachevent(transevt)

		if shift:
			# Add the forward and backward fields
			prog.caxpy(fwdque, grid, None, buf, np.float32(1.), buf, bck)
			# Propagate the combined field a half step
			# Save the propagation event for delaying result copies
			pevt = self.propagate(buf, 0.5 * self.dz, corr=shcorr)

			# Handle Goertzel iterations to compute the Fourier
			# transform of the contrast source on the unit sphere
			if self._goertzel:
				# Compute the FFT of the source in the XY plane
				crt = self.scratch[0]
				prog.ctmul(fwdque, grid, None, crt, ocur, buf)
				self.fftplan.execute(crt)
				# Compute the next Goertzel iteration
				pn1, pn2 = self.goertzbuf
				dz = np.float32(self.dz)
				# The final argument (slab count) is not yet used
				nz = np.int32(0)
				prog.goertzelfft(fwdque, grid, None, pn1, pn2, crt, dz, nz)
				# Cycle the Goertzel buffers
				self.goertzbuf = [pn2, pn1]
			else:
				# Copy the shifted field into the result buffer
				# No result sync necessary, all mods occur on sendque
				evt = cl.enqueue_copy(sendque, self.result, buf, wait_for=[pevt])
				# Attach the copy event to the source buffer
				buf.attachevent(evt)
		else: 
			# Copy the forward field into the result buffer
			# Wait for transmissions to finish for consistency
			evt = cl.enqueue_copy(sendque, self.result, fwd, wait_for=[transevt])
			# Attach the copy event to the field buffer
			fwd.attachevent(evt)


	def getresult(self, hbuf):
		'''
		Wait for the intra-device transfer of the previous result to
		the result buffer, and initiate a device-to-host copy of the
		valid result buffer into hbuf.

		An event corresponding to the transfer is returned.
		'''
		sendque = self.sendque
		# Initiate the rectangular transfer on the transfer queue
		# No sync necessary, all mods to result buffer occur on sendque
		evt = self.rectxfer.fromdevice(sendque, self.result, hbuf)[1]
		# Attach the copy event to the result buffer
		self.result.attachevent(evt)
		return evt


	def goertzelfft(self, nz = 0):
		'''
		Finish Goertzel iterations carried out in repeated calls to
		advance() and copy the positive and negative hemispheres of the
		Fourier transform of the contrast source to successive planes
		of a Numpy array.

		If nz is specified, it is the number of slabs involved in the
		Fourier transform and is used to properly scale the output of
		the Goertzel algorithm. When nz = 0, no scaling is performed.

		Copies are synchronous and are done on the forward propagation
		queue.
		'''
		prog, grid = self.prog, self.grid
		fwdque = self.fwdque
		hemispheres = np.zeros(list(grid) + [2], dtype=np.complex64, order='F')
		# If the spectral field hasn't been computed, just return zeros
		if not self._goertzel: return hemispheres
		# Finalize the Goertzel iteration
		pn1, pn2 = self.goertzbuf
		dz = np.float32(self.dz)
		nz = np.int32(nz)
		# Pass None as the contrast current to signal final iteration
		# After this, pn1 is the positive hemisphere, pn2 is the negative
		prog.goertzelfft(fwdque, grid, None, pn1, pn2, None, dz, nz)
		# Copy the two hemispheres into planes of an array
		cl.enqueue_copy(fwdque, hemispheres[:,:,0:1], pn1, is_blocking=False)
		cl.enqueue_copy(fwdque, hemispheres[:,:,1:2], pn2, is_blocking=True)
		return hemispheres
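The propagate() method above amounts to one split-step Fourier slab: transform, multiply by a spectral propagator for the homogeneous background, transform back, then multiply by a phase screen for the contrast. The following is a paraxial NumPy sketch of that basic step; the high-order spectral/spatial corrections and the Hann edge attenuation are omitted, and the relation n = sqrt(1 + obj) between contrast and refractive index is an assumption consistent with the sound-speed estimate in objupdate().

import numpy as np

def split_step_slab(u, obj, k0, h, dz):
    # Minimal paraxial split-step: homogeneous propagation in the spectral
    # domain followed by a phase screen for the contrast obj (same shape as u).
    ny, nx = u.shape
    kx = 2 * np.pi * np.fft.fftfreq(nx, d=h)
    ky = 2 * np.pi * np.fft.fftfreq(ny, d=h)
    kxg, kyg = np.meshgrid(kx, ky)
    U = np.fft.fft2(u)
    U *= np.exp(-1j * dz * (kxg**2 + kyg**2) / (2.0 * k0))   # homogeneous slab
    u = np.fft.ifft2(U)
    n = np.sqrt(1.0 + obj)            # contrast -> refractive index (assumption)
    return u * np.exp(1j * k0 * dz * (n - 1.0))              # phase screen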
Esempio n. 23
0
class Convolve:
    """ Class that computes the necessary information to perform a
    convolution and provides the actual convolution function. Can handle
    2d or 3d convolutions. """
    def __init__(self, in_size, kernel_size, batch_size, context, queue):
        self.sizes = []
        for i in xrange(len(in_size)):
            self.sizes.append(get_power_of_two(in_size[i] + kernel_size[i] +
                                               1))
        self.sizes = tuple(self.sizes)

        self.ctx = context
        self.queue = queue
        self.plan = Plan(self.sizes, queue=self.queue)
        self.in_array = cl.array.zeros(
            self.queue,
            (batch_size, self.sizes[0], self.sizes[1], self.sizes[2]),
            numpy.complex64)
        self.kernel = cl.array.zeros(
            self.queue,
            (batch_size, self.sizes[0], self.sizes[1], self.sizes[2]),
            numpy.complex64)
        self.result_buffer = numpy.zeros(self.in_array.shape, numpy.complex64)
        self.kernel_center = []
        for i in xrange(len(kernel_size)):
            self.kernel_center.append(kernel_size[i] / 2)
        self.kernel_center = tuple(self.kernel_center)

        self.halves = []
        for i in xrange(len(kernel_size)):
            self.halves.append(int(numpy.ceil(kernel_size[i] / 2.0)))
        self.halves = tuple(self.halves)

        self.padding_locations = []
        for i in xrange(len(self.sizes)):
            # without this branch, even-sized kernels produce an incorrect edge in the result
            if kernel_size[i] % 2 == 0:
                self.padding_locations.append(
                    -1 * ((in_size[i] - self.sizes[i]) / 2))
                self.padding_locations.append(
                    -1 * ((self.sizes[i] - in_size[i]) / 2))
            else:
                self.padding_locations.append((self.sizes[i] - in_size[i]) / 2)
                self.padding_locations.append((in_size[i] - self.sizes[i]) / 2)
        self.padding_locations = tuple(self.padding_locations)

        self.valid_locations = []
        for i in xrange(len(self.sizes)):
            self.valid_locations.append(self.padding_locations[(i * 2)] +
                                        self.halves[i] - 1)
            self.valid_locations.append(self.padding_locations[(i * 2)] +
                                        self.halves[i] + in_size[i] -
                                        kernel_size[i])
        self.valid_locations = tuple(self.valid_locations)

        self.full_locations = []
        for i in xrange(len(self.sizes)):
            offset = self.sizes[i] - (in_size[i] + kernel_size[i] - 1)
            self.full_locations.append(offset / 2)
            self.full_locations.append(-offset / 2)

        self.batch_size = batch_size

    def convolution(self, input_matrix, kernel, type_='valid'):
        in_array = numpy.zeros(
            (self.batch_size, self.sizes[0], self.sizes[1], self.sizes[2]),
            numpy.complex64)
        in_array[:, self.padding_locations[0]:self.padding_locations[1],
                 self.padding_locations[2]:self.padding_locations[3], self.
                 padding_locations[4]:self.padding_locations[5]] = input_matrix

        self.in_array = cl.array.to_device(self.queue, in_array)

        kernel_buffer = numpy.zeros(
            (self.batch_size, self.sizes[0], self.sizes[1], self.sizes[2]),
            numpy.complex64)

        kernel_buffer[:, :self.halves[0], :self.halves[1], :self.halves[2]] = \
            kernel[self.kernel_center[0]:, self.kernel_center[1]:, self.kernel_center[2]:]

        kernel_buffer[:, :self.halves[0], :self.halves[1], -self.kernel_center[2]:] = \
            kernel[self.kernel_center[0]:, self.kernel_center[1]:, :self.kernel_center[2]]

        kernel_buffer[:, :self.halves[0], -self.kernel_center[1]:, :self.halves[2]] = \
            kernel[self.kernel_center[0]:, :self.kernel_center[1], self.kernel_center[2]:]

        kernel_buffer[:, :self.halves[0], -self.kernel_center[1]:, -self.kernel_center[2]:] = \
            kernel[self.kernel_center[0]:, :self.kernel_center[1], :self.kernel_center[2]]

        if kernel.shape[0] > 1:
            kernel_buffer[:, -self.kernel_center[0]:, :self.halves[1], :self.halves[2]] = \
                kernel[:self.kernel_center[0], self.kernel_center[1]:, self.kernel_center[2]:]

            kernel_buffer[:, -self.kernel_center[0]:, :self.halves[1], -self.kernel_center[2]:] = \
                kernel[:self.kernel_center[0], self.kernel_center[1]:, :self.kernel_center[2]]

            kernel_buffer[:, -self.kernel_center[0]:, -self.kernel_center[1]:, :self.halves[2]] = \
                kernel[:self.kernel_center[0], :self.kernel_center[1], self.kernel_center[2]:]

            kernel_buffer[:, -self.kernel_center[0]:, -self.kernel_center[1]:, -self.kernel_center[2]:] = \
                kernel[:self.kernel_center[0], :self.kernel_center[1], :self.kernel_center[2]]

        self.kernel = cl.array.to_device(self.queue, kernel_buffer)

        # Fourier transform both operands, multiply point-wise, then inverse transform => convolution
        self.plan.execute(self.in_array.data, batch=self.batch_size)

        self.plan.execute(self.kernel.data, batch=self.batch_size)

        self.result_buffer = self.in_array * self.kernel
        self.plan.execute(self.result_buffer.data,
                          inverse=True,
                          batch=self.batch_size)
        result = self.result_buffer.get().astype(float)

        if type_ == 'same':
            return result[:,
                          self.padding_locations[0]:self.padding_locations[1],
                          self.padding_locations[2]:self.padding_locations[3],
                          self.padding_locations[4]:self.padding_locations[5]]
        elif type_ == 'full':
            return result[:, self.full_locations[0]:self.full_locations[1],
                          self.full_locations[2]:self.full_locations[3],
                          self.full_locations[4]:self.full_locations[5]]
        elif type_ == 'valid':
            return result[:, self.valid_locations[0]:self.valid_locations[1],
                          self.valid_locations[2]:self.valid_locations[3],
                          self.valid_locations[4]:self.valid_locations[5]]
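convolution() above relies on the usual FFT identity: transform both zero-padded operands, multiply point-wise, inverse transform, then crop. A quick NumPy check of that identity in 3-D, assuming SciPy is available for the comparison:

import numpy as np
from scipy.signal import fftconvolve   # used only for the cross-check

a = np.random.rand(8, 9, 10)
k = np.random.rand(3, 3, 3)
shape = [s1 + s2 - 1 for s1, s2 in zip(a.shape, k.shape)]   # linear-convolution size
full = np.real(np.fft.ifftn(np.fft.fftn(a, shape) * np.fft.fftn(k, shape)))
assert np.allclose(full, fftconvolve(a, k, mode='full'))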