def kspacegaussian_filter_pyfftCL(ksp, sigma):
    clear_first_arg_caches()
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    #api = cluda.ocl_api()
    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)
    queue.flush()
    data_dev = cl_array.to_device(queue, ksp)
    w = h = k = 512
    plan = Plan((w, h, k), normalize=True, queue=queue)
    FACTOR = 1.0
    program = cl.Program(ctx, """
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#include "pyopencl-complex.h"
__kernel void gauss_kernel(__global cfloat_t *dest) //, __global cfloat_t *src)
{
    uint x = get_global_id(0); uint y = get_global_id(1); uint z = get_global_id(2);
    uint dim1 = %d;
    uint dim2 = %d;
    uint dim3 = %d;
    float sigma[3];
    sigma[0] = %f; sigma[1] = %f; sigma[2] = %f;
    float factor = %f;
    float TWOPISQ = 19.739208802178716;  // 2 * pi^2
    ulong idx = z*dim1*dim2 + y*dim1 + x;
    // Centre and normalise each index to the range [-0.5, 0.5)
    float i = (float)(x);
    i = (i - (float)floor((float)(dim1)/2.0f)) / (float)(dim1);
    float j = (float)y;
    j = (j - (float)floor((float)(dim2)/2.0f)) / (float)(dim2);
    float k = (float)z;
    k = (k - (float)floor((float)(dim3)/2.0f)) / (float)(dim3);
    float weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
    dest[idx].x = dest[idx].x * weight;
    dest[idx].y = dest[idx].y * weight;
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR)).build()
    gauss_kernel = program.gauss_kernel
    #data_dev = thr.empty_like(ksp_dev)
    # Apply the Gaussian weighting to the k-space data in place
    gauss_kernel(queue, sz, None, data_dev.data).wait()
    ksp_out = data_dev.get()
    queue.flush()

    # Inverse FFT of the filtered k-space on a fresh context/queue
    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)
    w = h = k = 512
    plan = Plan((w, h, k), normalize=True, queue=queue)
    data2_dev = cl_array.to_device(queue, ksp_out)
    plan.execute(data2_dev.data, inverse=True)
    result = data2_dev.get()
    result = np.fft.fftshift(result)
    queue.finish()
    return result  # ,ksp_out
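A minimal driver for the filter above, assuming the module-level imports it relies on (numpy as np, pyopencl as cl, pyopencl.array as cl_array, pyfft.cl.Plan and a clear_first_arg_caches helper) are in place; the random test volume and the isotropic sigma are purely illustrative:

import numpy as np

# The plan inside the function is hard-coded to 512^3, so the test volume
# must match that shape (roughly 1 GiB of complex64 data).
ksp = (np.random.randn(512, 512, 512) +
       1j * np.random.randn(512, 512, 512)).astype(np.complex64)

# Per-axis Gaussian widths; sigma is indexed as sigma[0..2] in the kernel.
image = kspacegaussian_filter_pyfftCL(ksp, np.array([1.0, 1.0, 1.0]))
print(image.shape, image.dtype)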
def _ocl_fft_gpu_inplace(ocl_arr, inverse=False, plan=None):
    assert_bufs_type(np.complex64, ocl_arr)

    if plan is None:
        plan = Plan(ocl_arr.shape, queue=get_device().queue)

    plan.execute(ocl_arr.data, ocl_arr.data, inverse=inverse)
def clifftn(data):
    """ OpenCL inverse FFT 3D """
    clear_first_arg_caches()
    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)
    plan = Plan(data.shape, normalize=True, queue=queue)
    # move data to the device, then run the inverse transform in place
    gpu_data = cl_array.to_device(queue, data)
    plan.execute(gpu_data.data, inverse=True)
    result = gpu_data.get()
    return result
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):
    assert_bufs_type(np.complex64, ocl_arr)

    if plan is None:
        plan = Plan(ocl_arr.shape, queue=get_device().queue)

    if res_arr is None:
        res_arr = OCLArray.empty(ocl_arr.shape, np.complex64)

    plan.execute(ocl_arr.data, res_arr.data, inverse=inverse)
    return res_arr
def gs_mod_gpu(idata, itera=10, osize=256):
    cut = osize // 2

    pl = cl.get_platforms()[0]
    devices = pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue, dtype=complex128)  # does not work with "complex128"

    src = str(
        Template(KERNEL).render(
            double_support=all(has_double_support(dev) for dev in devices),
            amd_double_support=all(has_amd_double_support(dev) for dev in devices)))
    prg = cl.Program(ctx, src).build()

    idata_gpu = cl_array.to_device(queue, ifftshift(idata).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data, fdata_gpu.data)

    mask = exp(2.j * pi * random(idata.shape))
    mask[512 - cut:512 + cut, 512 - cut:512 + cut] = 0

    idata_gpu = cl_array.to_device(queue, ifftshift(idata + mask).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    error_gpu = cl_array.to_device(queue, zeros(idata_gpu.shape).astype("double"))
    plan.execute(idata_gpu.data, fdata_gpu.data)

    e = 1000
    ea = 1000
    for i in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)
        norm1 = prg.norm1
        norm1.set_scalar_arg_dtypes([None, None, None, int32])
        norm1(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data,
              error_gpu.data, int32(cut))
        e = sqrt(cl_array.sum(error_gpu).get()) / (2 * cut)
        #~ if e > ea:
        #~     break
        #~ ea = e
        plan.execute(rdata_gpu.data, fdata_gpu.data)

    fdata = fdata_gpu.get()
    fdata = ifftshift(fdata)
    fdata = exp(1.j * angle(fdata))
    return fdata
def _prep_gpu():
    """ Set up GPU calculation dependencies """

    # try to import the necessary libraries
    fallback = False
    try:
        import gpu
        import string
        import pyopencl as cl
        import pyopencl.array as cla
        from pyfft.cl import Plan
    except ImportError:
        fallback = True

    # check gpu_info
    try:
        assert gpu.valid(gpu_info),\
            "gpu_info in propagate_distances improperly specified"
        context, device, queue, platform = gpu_info
    except AssertionError:
        fallback = True

    if fallback:
        propagate_distances(data, distances, energy_or_wavelength, pixel_pitch,
                            subregion=subregion, silent=silent,
                            band_limit=band_limit, gpu_info=None,
                            im_convert=im_convert)

    # if everything is OK, allocate memory and build kernels
    kp = string.join(gpu.__file__.split('/')[:-1], '/') + '/kernels/'
    build = _build_helper(context, device, kp)
    phase_multiply = build('propagate_phase_multiply.cl')
    copy_to_buffer = build('propagate_copy_to_save_buffer.cl')
    fftplan = Plan((N, N), queue=queue)

    # put the signals onto the gpu along with buffers for the
    # various operations
    rarray = cla.to_device(queue, r.astype(np.float32))
    fourier = cla.to_device(queue, data.astype(np.complex64))
    phase = cla.empty(queue, (N, N), np.complex64)
    back = cla.empty(queue, (N, N), np.complex64)
    store = cla.empty(queue, (nf, rows, cols), np.complex64)

    # precompute the fourier transform of data.
    fftplan.execute(fourier.data, wait_for_finish=True)

    return phase_multiply, copy_to_buffer, fftplan, rarray, fourier,\
        phase, back, store, build
def clfftn(data):
    """ OpenCL FFT 3D """
    clear_first_arg_caches()
    #ctx = cl.create_some_context(interactive=False)
    #queue = cl.CommandQueue(ctx)
    ctx, queue = clinit()
    plan = Plan(data.shape, normalize=True, queue=queue)
    # move data to the device
    gpu_data = cl_array.to_device(queue, data)
    # forward transform in place
    plan.execute(gpu_data.data)
    result = gpu_data.get()
    return result
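Taken together, clifftn above and clfftn form a forward/inverse pair. A small round-trip check follows; the 64^3 random volume and the tolerance are illustrative, and both helpers are assumed to have access to this module's clinit / clear_first_arg_caches setup:

import numpy as np

vol = (np.random.rand(64, 64, 64) +
       1j * np.random.rand(64, 64, 64)).astype(np.complex64)

spectrum = clfftn(vol)        # forward 3-D FFT on the device
restored = clifftn(spectrum)  # normalized inverse 3-D FFT on the device
print(np.allclose(restored, vol, atol=1e-4))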
class FastFourierTransform:

    def __init__(self, in_scale, in_matrix, queue):
        self.scale = tuple(in_scale)
        # create plan
        self.plan = Plan(self.scale, queue=queue)
        # prepare data
        self.data = in_matrix
        self.gpu_data = cl_array.to_device(queue, self.data)

    def fourier_transform(self):
        # forward transform
        self.plan.execute(self.gpu_data.data)
        # print self.gpu_data.get()

        # inverse transform
        self.plan.execute(self.gpu_data.data, inverse=True)
        result_ = self.gpu_data.get()
        print(result_)

        error = numpy.abs(
            numpy.sum(numpy.abs(self.data) - numpy.abs(result_)) / self.data.size)
        if error > 1e-6:
            s = 'error occurs'
            raise ValueError('invalid value: %s' % s)
        return result_
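A short usage sketch for the class above; the 16x16 array of ones and the PyOpenCL context setup are illustrative assumptions, mirroring the standalone example that follows:

import numpy
import pyopencl as cl
import pyopencl.array as cl_array

ctx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(ctx)

data = numpy.ones((16, 16), dtype=numpy.complex64)
fft = FastFourierTransform((16, 16), data, queue)
roundtrip = fft.fourier_transform()  # forward + inverse; should reproduce `data`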
from pyfft.cl import Plan
import numpy
import pyopencl as cl
import pyopencl.array as cl_array

# initialize context
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# create plan
plan = Plan((16, 16), queue=queue)

# prepare data
data = numpy.ones((16, 16), dtype=numpy.complex64)
gpu_data = cl_array.to_device(queue, data)
print(gpu_data)

# forward transform
plan.execute(gpu_data.data)
result = gpu_data.get()
print(result)

# inverse transform
plan.execute(gpu_data.data, inverse=True)
result = gpu_data.get()

error = numpy.abs(numpy.sum(numpy.abs(data) - numpy.abs(result)) / data.size)
print(error < 1e-6)
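For reference, a pyfft plan can also transform several arrays in one call through the batch argument of Plan.execute, which the convolution and split-step examples further down rely on. A minimal sketch, assuming the same plan, context and queue as in the example above:

# Eight independent 16x16 transforms stored contiguously in one buffer.
batch = 8
stack = numpy.ones((batch, 16, 16), dtype=numpy.complex64)
gpu_stack = cl_array.to_device(queue, stack)

plan.execute(gpu_stack.data, batch=batch)                 # forward, in place
plan.execute(gpu_stack.data, batch=batch, inverse=True)   # and back again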
class OpenclFibre(object): """ This optical module is similar to Fibre, but uses PyOpenCl (Python bindings around OpenCL) to generate parallelised code. """ def __init__(self, total_samples, dorf, length=None, total_steps=None, name="ocl_fibre"): self.name = name self.queue = None self.np_float = None self.np_complex = None self.prg = None self.cl_initialise(dorf) self.plan = Plan(total_samples, queue=self.queue) self.buf_field = None self.buf_temp = None self.buf_interaction = None self.buf_factor = None self.shape = None self.plan = None self.cached_factor = False # Force usage of cached version of function: self.cl_linear = self.cl_linear_cached self.length = length self.total_steps = total_steps def __call__(self, domain, field): # Setup plan for calculating fast Fourier transforms: self.plan = Plan(domain.total_samples, queue=self.queue) field_temp = np.empty_like(field) field_interaction = np.empty_like(field) from pyofss.modules.linearity import Linearity dispersion = Linearity(beta=[0.0, 0.0, 0.0, 1.0], sim_type="default") factor = dispersion(domain) self.send_arrays_to_device( field, field_temp, field_interaction, factor) stepsize = self.length / self.total_steps zs = np.linspace(0.0, self.length, self.total_steps + 1) #start = time.clock() for z in zs[:-1]: self.cl_rk4ip(self.buf_field, self.buf_temp, self.buf_interaction, self.buf_factor, stepsize) #stop = time.clock() #cl_result = self.buf_field.get() #print("cl_result: %e" % ((stop - start) / 1000.0)) return self.buf_field.get() def cl_initialise(self, dorf="float"): """ Initialise opencl related parameters. """ float_conversions = {"float": np.float32, "double": np.float64} complex_conversions = {"float": np.complex64, "double": np.complex128} self.np_float = float_conversions[dorf] self.np_complex = complex_conversions[dorf] for platform in cl.get_platforms(): if platform.name == "NVIDIA CUDA": print("Using compiler optimisations suitable for Nvidia GPUs") compiler_options = "-cl-mad-enable -cl-fast-relaxed-math" else: compiler_options = "" ctx = cl.create_some_context(interactive=False) self.queue = cl.CommandQueue(ctx) substitutions = {"dorf": dorf} code = OPENCL_OPERATIONS.substitute(substitutions) self.prg = cl.Program(ctx, code).build(options=compiler_options) @staticmethod def print_device_info(): """ Output information on each OpenCL platform and device. """ for platform in cl.get_platforms(): print "=" * 60 print "Platform information:" print "Name: ", platform.name print "Profile: ", platform.profile print "Vender: ", platform.vendor print "Version: ", platform.version for device in platform.get_devices(): print "-" * 60 print "Device information:" print "Name: ", device.name print "Type: ", cl.device_type.to_string(device.type) print "Memory: ", device.global_mem_size // (1024 ** 2), "MB" print "Max clock speed: ", device.max_clock_frequency, "MHz" print "Compute units: ", device.max_compute_units print "=" * 60 def send_arrays_to_device(self, field, field_temp, field_interaction, factor): """ Move numpy arrays onto compute device. """ self.shape = field.shape self.buf_field = cl_array.to_device( self.queue, field.astype(self.np_complex)) self.buf_temp = cl_array.to_device( self.queue, field_temp.astype(self.np_complex)) self.buf_interaction = cl_array.to_device( self.queue, field_interaction.astype(self.np_complex)) self.buf_factor = cl_array.to_device( self.queue, factor.astype(self.np_complex)) def cl_copy(self, first_buffer, second_buffer): """ Copy contents of one buffer into another. 
""" self.prg.cl_copy(self.queue, self.shape, None, first_buffer.data, second_buffer.data) def cl_linear(self, field_buffer, stepsize, factor_buffer): """ Linear part of step. """ self.plan.execute(field_buffer.data, inverse=True) self.prg.cl_linear(self.queue, self.shape, None, field_buffer.data, factor_buffer.data, self.np_float(stepsize)) self.plan.execute(field_buffer.data) def cl_linear_cached(self, field_buffer, stepsize, factor_buffer): """ Linear part of step (cached version). """ if (self.cached_factor is False): print "Caching factor" self.prg.cl_cache(self.queue, self.shape, None, factor_buffer.data, self.np_float(stepsize)) self.cached_factor = True self.plan.execute(field_buffer.data, inverse=True) self.prg.cl_linear_cached(self.queue, self.shape, None, field_buffer.data, factor_buffer.data) self.plan.execute(field_buffer.data) def cl_nonlinear(self, field_buffer, stepsize, gamma=100.0): """ Nonlinear part of step. """ self.prg.cl_nonlinear(self.queue, self.shape, None, field_buffer.data, self.np_float(gamma), self.np_float(stepsize)) def cl_sum(self, first_buffer, first_factor, second_buffer, second_factor): """ Calculate weighted summation. """ self.prg.cl_sum(self.queue, self.shape, None, first_buffer.data, self.np_float(first_factor), second_buffer.data, self.np_float(second_factor)) def cl_rk4ip(self, field, field_temp, field_interaction, factor, stepsize): """ Runge-Kutta in the interaction picture method using OpenCL. """ inv_six = 1.0 / 6.0 inv_three = 1.0 / 3.0 half_step = 0.5 * stepsize self.cl_copy(field_temp, field) self.cl_linear(field, half_step, factor) self.cl_copy(field_interaction, field) self.cl_nonlinear(field_temp, stepsize) self.cl_linear(field_temp, half_step, factor) self.cl_sum(field, 1.0, field_temp, inv_six) self.cl_sum(field_temp, 0.5, field_interaction, 1.0) self.cl_nonlinear(field_temp, stepsize) self.cl_sum(field, 1.0, field_temp, inv_three) self.cl_sum(field_temp, 0.5, field_interaction, 1.0) self.cl_nonlinear(field_temp, stepsize) self.cl_sum(field, 1.0, field_temp, inv_three) self.cl_sum(field_temp, 1.0, field_interaction, 1.0) self.cl_linear(field_interaction, half_step, factor) self.cl_linear(field, half_step, factor) self.cl_nonlinear(field_temp, stepsize) self.cl_sum(field, 1.0, field_temp, inv_six)
toc()

tic()
print('Complex K-space filter + Numpy IFFT')
kspgauss2 = KSP.kspacegaussian_filter2(ksp, 1)
image_filtered = simpleifft(procpar, dims, hdr, kspgauss2, args)
toc()

# PYFFT
tic()
#ctx = cl.create_some_context(interactive=False)
#queue = cl.CommandQueue(ctx)
w = h = k = 512
plan = Plan((w, h, k), normalize=True, queue=queue)
gpu_data = cl_array.to_device(queue, ksp)
plan.execute(gpu_data.data, inverse=True)
result = gpu_data.get()
toc()
result = np.fft.fftshift(result)
print("PyFFT OpenCL IFFT time and first three results:")
print("%s sec, %s" % (toc(), str(np.abs(result[:3, 0, 0]))))

tic()
reference = np.fft.fftshift(np.fft.ifftn(ksp))
print("Numpy IFFTN time and first three results:")
print("%s sec, %s" % (toc(), str(np.abs(reference[:3, 0, 0]))))

print("Calculating L1 norm")
print(np.linalg.norm(result - reference) / np.linalg.norm(reference))

import matplotlib
def gs_gpu(idata, itera=100):
    """Gerchberg-Saxton algorithm to calculate DOEs using the GPU

    Calculates the phase distribution in an object plane to obtain a
    specific amplitude distribution in the target plane. It uses an FFT
    to calculate the field propagation. The wavefront at the DOE plane
    is assumed to be a plane wave.

    **ARGUMENTS:**

        ========== ======================================================
        idata      numpy array containing the target amplitude distribution
        itera      Maximum number of iterations
        ========== ======================================================
    """
    pl = cl.get_platforms()[0]
    devices = pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue, dtype=complex128)  # does not work with "complex128"

    src = str(
        Template(KERNEL).render(
            double_support=all(has_double_support(dev) for dev in devices),
            amd_double_support=all(has_amd_double_support(dev) for dev in devices)))
    prg = cl.Program(ctx, src).build()

    idata_gpu = cl_array.to_device(queue, ifftshift(idata).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data, fdata_gpu.data)

    e = 1000
    ea = 1000
    for i in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)
        tr = rdata_gpu.get()
        rdata = ifftshift(tr)

        #TODO: This calculation should be done in the GPU
        e = (abs(rdata) - idata).std()
        if e > ea:
            break
        ea = e

        prg.norm2(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data)
        plan.execute(rdata_gpu.data, fdata_gpu.data)

    fdata = fdata_gpu.get()
    #~ prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
    fdata = ifftshift(fdata)
    fdata = exp(1.j * angle(fdata))
    #~ fdata=fdata_gpu.get()
    return fdata
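For readers without an OpenCL device, the iteration above can be sketched in plain NumPy. This is only an illustrative reference under the assumption that the norm kernel enforces unit amplitude in the Fourier plane and norm2 enforces the target amplitude in the object plane; it is not the library's GPU implementation:

import numpy as np

def gs_cpu_sketch(idata, itera=100):
    """Hypothetical NumPy analogue of the gs_gpu loop, for comparison only."""
    target = np.fft.ifftshift(idata).astype(np.complex128)
    fdata = np.fft.fft2(target)
    prev_err = np.inf
    for _ in range(itera):
        fdata = np.exp(1j * np.angle(fdata))      # Fourier plane: keep phase only
        rdata = np.fft.ifft2(fdata)
        err = (np.abs(np.fft.ifftshift(rdata)) - idata).std()
        if err > prev_err:                        # stop once the error starts growing
            break
        prev_err = err
        # object plane: enforce the target amplitude, keep the current phase
        rdata = np.abs(target) * np.exp(1j * np.angle(rdata))
        fdata = np.fft.fft2(rdata)
    return np.exp(1j * np.angle(np.fft.ifftshift(fdata)))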
class Convolver: """ Class that computes the necessary information to perform a convolution and provides the actual convolution function. Can handle 2d or 3d convolutions. """ def __init__(self, insize, kernelsize, batchsize, context, queue): self.sizes = [] for i in xrange(len(insize)): self.sizes.append(getPowerOfTwo(insize[i] + kernelsize[i] + 1)) self.sizes = tuple(self.sizes) self.ctx = context self.queue = queue self.plan = Plan(self.sizes, queue=self.queue) self.inarray = cl.array.zeros( self.queue, (batchsize, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64 ) self.kernel = cl.array.zeros( self.queue, (batchsize, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64 ) self.result_buffer = numpy.zeros(self.inarray.shape, numpy.complex64) self.kernel_center = [] for i in xrange(len(kernelsize)): self.kernel_center.append(kernelsize[i] / 2) self.kernel_center = tuple(self.kernel_center) self.halves = [] for i in xrange(len(kernelsize)): self.halves.append(numpy.ceil(kernelsize[i] / 2.0)) self.halves = tuple(self.halves) self.padding_locations = [] for i in xrange(len(self.sizes)): # without this if even kernels result in an incorrect edge in the result if kernelsize[i] % 2 == 0: self.padding_locations.append(-1 * ((insize[i] - self.sizes[i]) / 2)) self.padding_locations.append(-1 * ((self.sizes[i] - insize[i]) / 2)) else: self.padding_locations.append((self.sizes[i] - insize[i]) / 2) self.padding_locations.append((insize[i] - self.sizes[i]) / 2) self.padding_locations = tuple(self.padding_locations) self.valid_locations = [] for i in xrange(len(self.sizes)): self.valid_locations.append(self.padding_locations[(i * 2)] + self.halves[i] - 1) self.valid_locations.append(self.padding_locations[(i * 2)] + self.halves[i] + insize[i] - kernelsize[i]) self.valid_locations = tuple(self.valid_locations) self.full_locations = [] for i in xrange(len(self.sizes)): offset = self.sizes[i] - (insize[i] + kernelsize[i] - 1) self.full_locations.append(offset / 2) self.full_locations.append(-offset / 2) self.batch_size = batchsize def convolution(self, A, kernel, type="valid"): inarray = numpy.zeros((self.batch_size, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64) inarray[ :, self.padding_locations[0] : self.padding_locations[1], self.padding_locations[2] : self.padding_locations[3], self.padding_locations[4] : self.padding_locations[5], ] = A self.inarray = cl.array.to_device(self.queue, inarray) kernel_buffer = numpy.zeros((self.batch_size, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64) kernel_buffer[:, : self.halves[0], : self.halves[1], : self.halves[2]] = kernel[ self.kernel_center[0] :, self.kernel_center[1] :, self.kernel_center[2] : ] kernel_buffer[:, : self.halves[0], : self.halves[1], -self.kernel_center[2] :] = kernel[ self.kernel_center[0] :, self.kernel_center[1] :, : self.kernel_center[2] ] kernel_buffer[:, : self.halves[0], -self.kernel_center[1] :, : self.halves[2]] = kernel[ self.kernel_center[0] :, : self.kernel_center[1], self.kernel_center[2] : ] kernel_buffer[:, : self.halves[0], -self.kernel_center[1] :, -self.kernel_center[2] :] = kernel[ self.kernel_center[0] :, : self.kernel_center[1], : self.kernel_center[2] ] if kernel.shape[0] > 1: kernel_buffer[:, -self.kernel_center[0] :, : self.halves[1], : self.halves[2]] = kernel[ : self.kernel_center[0], self.kernel_center[1] :, self.kernel_center[2] : ] kernel_buffer[:, -self.kernel_center[0] :, : self.halves[1], -self.kernel_center[2] :] = kernel[ : self.kernel_center[0], 
self.kernel_center[1] :, : self.kernel_center[2] ] kernel_buffer[:, -self.kernel_center[0] :, -self.kernel_center[1] :, : self.halves[2]] = kernel[ : self.kernel_center[0], : self.kernel_center[1], self.kernel_center[2] : ] kernel_buffer[:, -self.kernel_center[0] :, -self.kernel_center[1] :, -self.kernel_center[2] :] = kernel[ : self.kernel_center[0], : self.kernel_center[1], : self.kernel_center[2] ] self.kernel = cl.array.to_device(self.queue, kernel_buffer) # fourier transform, pointwise multiply, then invert => convolution self.plan.execute(self.inarray.data, batch=self.batch_size) self.plan.execute(self.kernel.data, batch=self.batch_size) self.result_buffer = self.inarray * self.kernel self.plan.execute(self.result_buffer.data, inverse=True, batch=self.batch_size) result = self.result_buffer.get().astype(float) if type == "same": return result[ :, self.padding_locations[0] : self.padding_locations[1], self.padding_locations[2] : self.padding_locations[3], self.padding_locations[4] : self.padding_locations[5], ] elif type == "full": return result[ :, self.full_locations[0] : self.full_locations[1], self.full_locations[2] : self.full_locations[3], self.full_locations[4] : self.full_locations[5], ] elif type == "valid": return result[ :, self.valid_locations[0] : self.valid_locations[1], self.valid_locations[2] : self.valid_locations[3], self.valid_locations[4] : self.valid_locations[5], ]
class ComputationalContext: def __init__(self, field): self.ocl = OpenCLSettings.Instance() self.dispIndex = DispersionIndex.Instance() self.grid = Grid.Instance() self.physConst = PhysicalEnvironment.Instance() self.ocl = OpenCLSettings.Instance() self.errors = ComputationalErrors() self.need_update_energy = False self.energy = numpy.empty(1).astype(numpy.float64) self.layer = numpy.int32(0) self.Z = numpy.int32(0) self.z_limit = Settings.z self.dz = Settings.dz self.current_dz = numpy.float64() self.calculated_dz = Settings.dz if Settings.z_strategy == "Uniform": self.z_step_strategy = UniformZStepStrategy() else: self.z_step_strategy = AdaptiveZStepStrategy() self.global_iteration_number = numpy.int32(0) self.unlinear_iterations = numpy.zeros(self.grid.space_size, dtype=numpy.int32) self.E_next = numpy.zeros((self.grid.space_size, self.grid.time_size), dtype=numpy.float64) self.D = numpy.zeros(self.grid.time_size, dtype=numpy.float64) self.K = numpy.zeros(self.grid.time_size, dtype=numpy.float64) self.field_shape = field.shape self.test_field = numpy.zeros((self.grid.space_size, self.grid.time_size), dtype=numpy.complex128) self.mf = cl.mem_flags self.A1_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.grid.space_grid.nbytes) self.A2_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.grid.space_grid.nbytes) self.A3_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.grid.space_grid.nbytes) self.D_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.D.nbytes) self.K_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.K.nbytes) self.space_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE | self.mf.COPY_HOST_PTR, hostbuf=self.grid.space_grid) self.space_delta_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE | self.mf.COPY_HOST_PTR, hostbuf=self.grid.space_delta) self.plan1D = Plan(self.grid.time_size, dtype=numpy.complex128, queue=self.ocl.queue) self.field_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE | self.mf.COPY_HOST_PTR, hostbuf=field) self.unlinear_iterations_buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, self.unlinear_iterations.nbytes) self.local_buf = cl.LocalMemory(8*self.ocl.n_threads) self.buffers, self.sizes = self.__compute_partial_buffers(self.grid.space_size * self.grid.time_size, self.ocl.n_threads) self.nlnr_computing_time = numpy.float64(0) self.diff_computing_time = numpy.float64(0) self.disp_computing_time = numpy.float64(0) self.copy_computing_time = numpy.float64(0) self.energy_computing_time = numpy.float64(0) def compute_K(self, K): ng = self.dispIndex.n(self.physConst.w) + self.dispIndex.dng(self.physConst.w) for i in range(self.grid.time_size): K[i] = self.grid.freq_grid[i] * \ (self.dispIndex.n(numpy.abs(self.grid.freq_grid[i] * self.physConst.w)) - ng) / self.physConst.dnL def compute_D(self, D): for i in range(self.grid.time_size): if numpy.abs(self.grid.freq_grid[i]) < 10e-8: D[i] = 0 else: D[i] = self.physConst.D / self.grid.freq_grid[i] def fill_data(self): self.compute_D(self.D) self.compute_K(self.K) self.ocl.initial_prg.ComputeA(self.ocl.queue, self.grid.space_grid.shape, None, self.A1_buf, self.A2_buf, self.A3_buf, self.space_buf, self.space_delta_buf, self.grid.space_size) cl.enqueue_copy(self.ocl.queue, self.D_buf, self.D) cl.enqueue_copy(self.ocl.queue, self.K_buf, self.K) def is_stop(self): return self.Z >= self.z_limit def update_z(self): self.Z += self.current_dz self.layer += 1 def update_dz(self): if self.calculated_dz < 0: self.calculated_dz = Settings.dz dz = self.calculated_dz if self.z_limit < self.Z + dz: dz = 
self.z_limit - self.Z self.current_dz = dz if self.current_dz < 0 and numpy.abs(self.current_dz) < 1e-10: self.current_dz = 0 def update_energy(self): if self.need_update_energy: self.compute_energy() self.need_update_energy = False def compute_energy(self): evt = self.ocl.error_prg.CalculateEnergy(self.ocl.queue, (self.sizes[0],), (self.ocl.n_threads,), self.field_buf, self.space_buf, self.space_delta_buf, self.buffers[0], self.local_buf, self.grid.time_size, self.grid.space_size) evt.wait() self.energy_computing_time += evt.profile.end - evt.profile.start for i in range(1, len(self.buffers)): evt = self.ocl.error_prg.Reduce(self.ocl.queue, (self.sizes[i],), (min(self.ocl.n_threads, self.sizes[i]),), self.buffers[i-1], self.buffers[i], self.local_buf) evt.wait() self.energy_computing_time += evt.profile.end - evt.profile.start evt = cl.enqueue_copy(self.ocl.queue, self.energy, self.buffers[-1]) evt.wait() self.copy_computing_time += evt.profile.end - evt.profile.start def do_step(self, dz): iteration_number = 0 while True: self.need_update_energy = True if dz < 0: self.calculated_dz = self.z_step_strategy.calculate_dz(self.calculated_dz, self.global_iteration_number, self.errors.get_error()) self.update_dz() else: self.current_dz = dz self.linear() self.nonlinear() if iteration_number > 50: break iteration_number += 1 if not (dz < 0 and self.z_step_strategy.need_update_dz(self.global_iteration_number, self.errors.get_error())): break self.update_z() def linear(self): self.errors.linear_error.begin(self) # Обратное преобразование Фурье cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf) self.plan1D.execute(self.field_buf, batch=self.grid.space_size, inverse=True) cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf) if Settings.use_difraction: # Применяем оператор дифракции dif_evt = self.ocl.linear_prg.Diff(self.ocl.queue, (self.grid.time_size,), None, self.field_buf, self.A1_buf, self.A2_buf, self.A3_buf, self.space_buf, self.space_delta_buf, self.D_buf, self.grid.space_size, self.grid.time_size, self.current_dz) dif_evt.wait() self.diff_computing_time += dif_evt.profile.end - dif_evt.profile.start cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf) # Применяем оператор дисперсии disp_evt = self.ocl.linear_prg.Disp(self.ocl.queue, self.field_shape, None, self.K_buf, self.field_buf, self.current_dz, self.grid.space_size, self.grid.time_size) disp_evt.wait() cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf) self.disp_computing_time += disp_evt.profile.end - disp_evt.profile.start # Прямое преобразование Фурье self.plan1D.execute(self.field_buf, batch=self.grid.space_size, inverse=False) self.need_update_energy = True self.errors.linear_error.end(self) def nonlinear(self): if Settings.use_cubic: self.errors.nonlinear_error.begin(self) dt = self.grid.time_delta[1] k = numpy.float64(self.physConst.G * self.current_dz / dt / 24.0) max_error = numpy.float64(1e-6) iteration = numpy.int32(0) cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf) evt = self.ocl.nonlinear_prg.CubicUnlinean1DSolve(self.ocl.queue, (self.grid.space_size, 1), None, self.field_buf, self.unlinear_iterations_buf, k, dt, max_error, iteration, self.grid.time_size) evt.wait() self.nlnr_computing_time += evt.profile.end - evt.profile.start cl.enqueue_copy(self.ocl.queue, self.test_field, self.field_buf) cl.enqueue_copy(self.ocl.queue, self.unlinear_iterations, self.unlinear_iterations_buf) self.global_iteration_number = numpy.ndarray.max(self.unlinear_iterations) 
self.need_update_energy = True self.errors.nonlinear_error.end(self) def copy_from_buffer(self, field): evt = cl.enqueue_copy(self.ocl.queue, field, self.field_buf) evt.wait() self.copy_computing_time += evt.profile.end - evt.profile.start def __compute_partial_buffers(self, size, n_threads): partial_sums = list() sizes = list() break_flag = True sizes.append(size) while break_flag: if size / n_threads == 0: break_flag = False size = 1 else: size = size / n_threads buf_np = numpy.empty(size, dtype=numpy.float64) buf = cl.Buffer(self.ocl.ctx, self.mf.READ_WRITE, size=buf_np.nbytes) partial_sums.append(buf) sizes.append(size) return partial_sums, sizes
class file_creator(): def __init__(self): # create a opencl context try: self.ctx = cl.create_some_context() except ValueError: print "error %s\nExiting." % (ValueError) # create a opencl command queue self.queue = cl.CommandQueue(self.ctx) #self.cd = computedevice # create a event self.thread_event = threading.Event() self.duration = 15 def python_ifft_guardinterval_func(self, ofdmmode, guardinterval, inputdata): # create a fft plan self.fftplan = Plan(ofdmmode,dtype=numpy.complex64,normalize=True, fast_math=True,context=self.ctx, queue=self.queue) # opencl buffer holding data for the ifft - including pilots # 8k or 2k size ??? fftbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdmmode*8) #size of guiardinterval destination buffer self.dest_buf_size = ofdmmode*(1+guardinterval) # opencl buffer holding the computed data dest_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=int(self.dest_buf_size*8) ) self.ret_buf = numpy.array(numpy.zeros(self.dest_buf_size), dtype=numpy.complex64) cl.enqueue_copy(self.queue, fftbuffer, inputdata).wait() self.fftplan.execute(fftbuffer, data_out=None, inverse=True, batch=1, wait_for_finish=True) cl.enqueue_copy(self.queue, dest_buf, fftbuffer, byte_count=int(8*ofdmmode), src_offset=0, dest_offset=int(ofdmmode*guardinterval)).wait() cl.enqueue_copy(self.queue, dest_buf, fftbuffer, byte_count=int(ofdmmode*guardinterval), src_offset=int(ofdmmode - ofdmmode*guardinterval) ,dest_offset=0).wait() cl.enqueue_copy(self.queue, self.ret_buf, dest_buf).wait() return self.ret_buf def test_execution_time(self): print "test_execution_time:" print "TODO" def test_stop(self): self.thread_event.set() def load_kernel(self, filename, kernelname): print "Kernel \"%s\" from file \"%s\" :" % (kernelname,filename) mf = cl.mem_flags #read in the OpenCL source file as a string self.f = open(filename, 'r') fstr = "".join(self.f.readlines()) self.program = cl.Program(self.ctx, fstr) self.program.build() self.f.close() #create the opencl kernel return cl.Kernel(self.program,kernelname) def test_algorithmA(self, ofdm_mode, guardinterval): print "\n**************************" print "test ofdm numpy ifft with fftshift" passed = 0 linecnt = 1 g = 0 size = 0 if ofdm_mode == 8192: size = 6817 print "8k mode" if guardinterval == 0.25: self.fd_input = open('test_bench_ofdm_input_8K_1_4.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_4.csv', 'r') print "1/4 guard interval" g = guardinterval if guardinterval == 0.125: self.fd_input = open('test_bench_ofdm_input_8K_1_8.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_8.csv', 'r') print "1/8 guard interval" g = guardinterval if guardinterval == 0.0625: self.fd_input = open('test_bench_ofdm_input_8K_1_16.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_16.csv', 'r') print "1/16 guard interval" g = guardinterval if guardinterval == 0.03125: self.fd_input = open('test_bench_ofdm_input_8K_1_32.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_32.csv', 'r') print "1/32 guard interval" g = guardinterval elif ofdm_mode == 2048: size = 1705 print "2k mode" if guardinterval == 0.25: self.fd_input = open('test_bench_ofdm_input_2K_1_4.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_4.csv', 'r') print "1/4 guard interval" g = guardinterval if guardinterval == 0.125: self.fd_input = open('test_bench_ofdm_input_2K_1_8.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_8.csv', 'r') print "1/8 guard interval" g = guardinterval if guardinterval == 0.0625: 
self.fd_input = open('test_bench_ofdm_input_2K_1_16.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_16.csv', 'r') print "1/16 guard interval" g = guardinterval if guardinterval == 0.03125: self.fd_input = open('test_bench_ofdm_input_2K_1_32.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_32.csv', 'r') print "1/32 guard interval" g = guardinterval if g == 0: print "wrong guardinterval specified" return if size == 0: print "wrong ofdm_mode" return for line in self.fd_input: data_to_encode = [float(0) + 1j*float(0)] * ofdm_mode counter = (ofdm_mode-size-1)/2+1 #counter = 0 for tmp in line.split(","): if string.find(tmp, " + ") > -1: data_to_encode[counter]=(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i",""))) if string.find(tmp, " - ") > -1: data_to_encode[counter]=(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i",""))) counter += 1 data_to_encode = numpy.array(data_to_encode, dtype=numpy.complex128) reference_data = [] for tmp in self.fd_output.readline().split(","): if string.find(tmp, " + ") > -1: reference_data.append(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i",""))) if string.find(tmp, " - ") > -1: reference_data.append(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i",""))) reference_data = numpy.array(reference_data, dtype=numpy.complex128) encoded_data = numpy.fft.ifft(numpy.fft.fftshift(data_to_encode)) # add guard interval tmp = [] for i in range(0,int(ofdm_mode*g)): tmp.append(encoded_data[(ofdm_mode*(1-g))+i]) for i in range(0,ofdm_mode): tmp.append(encoded_data[i]) encoded_data = tmp if numpy.allclose(reference_data, encoded_data, rtol=1.0000000000000001e-04, atol=1e-06): passed += 1 print "Test %d PASSED" % linecnt else: print "Test %d FAILED" % linecnt print "input data:" print data_to_encode print "encoded data[0]:" print encoded_data[0] print "reference data[0]:" print reference_data[0] print "error data:" print reference_data - encoded_data linecnt += 1 print "%d pass out of %d" % (passed, linecnt-1) self.fd_input.close() self.fd_output.close() if passed == (linecnt-1): print "All ofdm ifft tests PASS\n" return True else: print "at least one ofdm ifft test FAILED\n" return False def test_algorithmB(self, ofdm_mode, guardinterval): print "\n**************************" print "test ofdm numpy ifft w/o fftshift" passed = 0 linecnt = 1 g = 0 size = 0 if ofdm_mode == 8192: size = 6817 print "8k mode" if guardinterval == 0.25: self.fd_input = open('test_bench_ofdm_input_8K_1_4.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_4.csv', 'r') print "1/4 guard interval" g = guardinterval if guardinterval == 0.125: self.fd_input = open('test_bench_ofdm_input_8K_1_8.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_8.csv', 'r') print "1/8 guard interval" g = guardinterval if guardinterval == 0.0625: self.fd_input = open('test_bench_ofdm_input_8K_1_16.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_16.csv', 'r') print "1/16 guard interval" g = guardinterval if guardinterval == 0.03125: self.fd_input = open('test_bench_ofdm_input_8K_1_32.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_32.csv', 'r') print "1/32 guard interval" g = guardinterval elif ofdm_mode == 2048: size = 1705 print "2k mode" if guardinterval == 0.25: self.fd_input = open('test_bench_ofdm_input_2K_1_4.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_4.csv', 'r') print "1/4 guard interval" g = guardinterval 
if guardinterval == 0.125: self.fd_input = open('test_bench_ofdm_input_2K_1_8.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_8.csv', 'r') print "1/8 guard interval" g = guardinterval if guardinterval == 0.0625: self.fd_input = open('test_bench_ofdm_input_2K_1_16.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_16.csv', 'r') print "1/16 guard interval" g = guardinterval if guardinterval == 0.03125: self.fd_input = open('test_bench_ofdm_input_2K_1_32.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_32.csv', 'r') print "1/32 guard interval" g = guardinterval if g == 0: print "wrong guardinterval specified" return if size == 0: print "wrong ofdm_mode" return for line in self.fd_input: data_to_encode = [float(0) + 1j*float(0)] * ofdm_mode #counter = (2048-size-1)/2+1 counter = 0 for tmp in line.split(","): if string.find(tmp, " + ") > -1: data_to_encode[counter]=(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i",""))) if string.find(tmp, " - ") > -1: data_to_encode[counter]=(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i",""))) counter += 1 data_to_encode = numpy.array(data_to_encode, dtype=numpy.complex128) reference_data = [] for tmp in self.fd_output.readline().split(","): if string.find(tmp, " + ") > -1: reference_data.append(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i",""))) if string.find(tmp, " - ") > -1: reference_data.append(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i",""))) reference_data = numpy.array(reference_data, dtype=numpy.complex128) # do fftshift tmp = [float(0) + 1j*float(0)] * ofdm_mode for i in range(0,size): tmp[(ofdm_mode-size+1)/2+i] = data_to_encode[i] for i in range(0,ofdm_mode/2): data_to_encode[i] = tmp[ofdm_mode/2+i] data_to_encode[i+ofdm_mode/2] = tmp[i] encoded_data = numpy.fft.ifft(data_to_encode) # add guard interval tmp = [] for i in range(0,int(ofdm_mode*g)): tmp.append(encoded_data[(ofdm_mode*(1-g))+i]) for i in range(0,ofdm_mode): tmp.append(encoded_data[i]) encoded_data = tmp if numpy.allclose(reference_data, encoded_data, rtol=1.0000000000000001e-04, atol=1e-06): passed += 1 print "Test %d PASSED" % linecnt else: print "Test %d FAILED" % linecnt print "input data:" #print data_to_encode print "encoded data[0]:" print encoded_data[0] print "reference data[0]:" print reference_data[0] print "error data:" #print reference_data - encoded_data linecnt += 1 print "%d pass out of %d" % (passed, linecnt-1) self.fd_input.close() self.fd_output.close() if passed == (linecnt-1): print "All ofdm ifft tests PASS\n" return True else: print "at least one ofdm ifft test FAILED\n" return False def test_algorithmC(self, ofdm_mode, guardinterval): print "\n**************************" print "test ofdm opencl ifft w/o fftshift" passed = 0 linecnt = 1 g = 0 size = 0 # create a fft plan self.fftplan = Plan(ofdm_mode,dtype=numpy.complex128,normalize=True, fast_math=True,context=self.ctx, queue=self.queue) # opencl buffer holding data for the ifft - including pilots # 8k or 2k size ??? 
fftbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdm_mode*16) #size of guiardinterval destination buffer dest_buf_size = ofdm_mode*(1+guardinterval) # opencl buffer holding the computed data dest_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=int(dest_buf_size*16) ) encoded_data = numpy.array(numpy.zeros(dest_buf_size), dtype=numpy.complex128) if ofdm_mode == 8192: size = 6817 print "8k mode" if guardinterval == 0.25: self.fd_input = open('test_bench_ofdm_input_8K_1_4.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_4.csv', 'r') print "1/4 guard interval" g = guardinterval if guardinterval == 0.125: self.fd_input = open('test_bench_ofdm_input_8K_1_8.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_8.csv', 'r') print "1/8 guard interval" g = guardinterval if guardinterval == 0.0625: self.fd_input = open('test_bench_ofdm_input_8K_1_16.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_16.csv', 'r') print "1/16 guard interval" g = guardinterval if guardinterval == 0.03125: self.fd_input = open('test_bench_ofdm_input_8K_1_32.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_32.csv', 'r') print "1/32 guard interval" g = guardinterval elif ofdm_mode == 2048: size = 1705 print "2k mode" if guardinterval == 0.25: self.fd_input = open('test_bench_ofdm_input_2K_1_4.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_4.csv', 'r') print "1/4 guard interval" g = guardinterval if guardinterval == 0.125: self.fd_input = open('test_bench_ofdm_input_2K_1_8.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_8.csv', 'r') print "1/8 guard interval" g = guardinterval if guardinterval == 0.0625: self.fd_input = open('test_bench_ofdm_input_2K_1_16.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_16.csv', 'r') print "1/16 guard interval" g = guardinterval if guardinterval == 0.03125: self.fd_input = open('test_bench_ofdm_input_2K_1_32.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_32.csv', 'r') print "1/32 guard interval" g = guardinterval if g == 0: print "wrong guardinterval specified" return if size == 0: print "wrong ofdm_mode" return for line in self.fd_input: data_to_encode = [float(0) + 1j*float(0)] * ofdm_mode #counter = (2048-size-1)/2+1 counter = 0 for tmp in line.split(","): if string.find(tmp, " + ") > -1: data_to_encode[counter]=(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i",""))) if string.find(tmp, " - ") > -1: data_to_encode[counter]=(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i",""))) counter += 1 data_to_encode = numpy.array(data_to_encode, dtype=numpy.complex128) reference_data = [] for tmp in self.fd_output.readline().split(","): if string.find(tmp, " + ") > -1: reference_data.append(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i",""))) if string.find(tmp, " - ") > -1: reference_data.append(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i",""))) reference_data = numpy.array(reference_data, dtype=numpy.complex128) # do fftshift tmp = [float(0) + 1j*float(0)] * ofdm_mode for i in range(0,size): tmp[(ofdm_mode-size+1)/2+i] = data_to_encode[i] for i in range(0,ofdm_mode/2): data_to_encode[i] = tmp[ofdm_mode/2+i] data_to_encode[i+ofdm_mode/2] = tmp[i] cl.enqueue_copy(self.queue, fftbuffer, data_to_encode).wait() self.fftplan.execute(fftbuffer, data_out=None, inverse=True, batch=1, wait_for_finish=True) # add guard interval cl.enqueue_copy(self.queue, 
dest_buf, fftbuffer, byte_count=int(16*ofdm_mode), src_offset=0, dest_offset=int(ofdm_mode*guardinterval*16)).wait() cl.enqueue_copy(self.queue, dest_buf, fftbuffer, byte_count=int(ofdm_mode*guardinterval*16), src_offset=int(ofdm_mode - ofdm_mode*guardinterval)*16 ,dest_offset=0).wait() cl.enqueue_copy(self.queue, encoded_data, dest_buf).wait() if numpy.allclose(reference_data, encoded_data, rtol=1.0000000000000001e-04, atol=1e-06): passed += 1 print "Test %d PASSED" % linecnt else: print "Test %d FAILED" % linecnt print "input data:" #print data_to_encode print "encoded data[0]:" print encoded_data[0] print "reference data[0]:" print reference_data[0] print "error data:" #print reference_data - encoded_data linecnt += 1 print "%d pass out of %d" % (passed, linecnt-1) self.fd_input.close() self.fd_output.close() if passed == (linecnt-1): print "All ofdm ifft tests PASS\n" return True else: print "at least one ofdm ifft test FAILED\n" return False def test_algorithmD(self, ofdm_mode, guardinterval): print "\n**************************" print "test ofdm opencl ifft w/o fftshift http://ochafik.com/ dft" passed = 0 linecnt = 1 g = 0 size = 0 # create a kernel kernel = self.load_kernel("DiscreteFourierTransformProgram.cl", "dft") #size of guiardinterval destination buffer dest_buf_size = ofdm_mode*(1+guardinterval) self.inputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdm_mode*16) # opencl buffer self.outputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdm_mode*16) # opencl buffer holding the computed data dest_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=int(dest_buf_size*16) ) encoded_data = numpy.array(numpy.zeros(dest_buf_size), dtype=numpy.complex128) if ofdm_mode == 8192: size = 6817 print "8k mode" if guardinterval == 0.25: self.fd_input = open('test_bench_ofdm_input_8K_1_4.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_4.csv', 'r') print "1/4 guard interval" g = guardinterval if guardinterval == 0.125: self.fd_input = open('test_bench_ofdm_input_8K_1_8.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_8.csv', 'r') print "1/8 guard interval" g = guardinterval if guardinterval == 0.0625: self.fd_input = open('test_bench_ofdm_input_8K_1_16.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_16.csv', 'r') print "1/16 guard interval" g = guardinterval if guardinterval == 0.03125: self.fd_input = open('test_bench_ofdm_input_8K_1_32.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_32.csv', 'r') print "1/32 guard interval" g = guardinterval elif ofdm_mode == 2048: size = 1705 print "2k mode" if guardinterval == 0.25: self.fd_input = open('test_bench_ofdm_input_2K_1_4.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_4.csv', 'r') print "1/4 guard interval" g = guardinterval if guardinterval == 0.125: self.fd_input = open('test_bench_ofdm_input_2K_1_8.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_8.csv', 'r') print "1/8 guard interval" g = guardinterval if guardinterval == 0.0625: self.fd_input = open('test_bench_ofdm_input_2K_1_16.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_16.csv', 'r') print "1/16 guard interval" g = guardinterval if guardinterval == 0.03125: self.fd_input = open('test_bench_ofdm_input_2K_1_32.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_32.csv', 'r') print "1/32 guard interval" g = guardinterval if g == 0: print "wrong guardinterval specified" return if size == 0: print "wrong ofdm_mode" return for line in self.fd_input: 
data_to_encode = [float(0) + 1j*float(0)] * ofdm_mode counter = 0 for tmp in line.split(","): if string.find(tmp, " + ") > -1: data_to_encode[counter]=(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i",""))) if string.find(tmp, " - ") > -1: data_to_encode[counter]=(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i",""))) counter += 1 data_to_encode = numpy.array(data_to_encode, dtype=numpy.complex128) reference_data = [] for tmp in self.fd_output.readline().split(","): if string.find(tmp, " + ") > -1: reference_data.append(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i",""))) if string.find(tmp, " - ") > -1: reference_data.append(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i",""))) reference_data = numpy.array(reference_data, dtype=numpy.complex128) # do fftshift tmp = [float(0) + 1j*float(0)] * ofdm_mode for i in range(0,size): tmp[(ofdm_mode-size+1)/2+i] = data_to_encode[i] for i in range(0,ofdm_mode/2): data_to_encode[i] = tmp[ofdm_mode/2+i] data_to_encode[i+ofdm_mode/2] = tmp[i] cl.enqueue_copy(self.queue, self.inputbuffer, data_to_encode) kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(ofdm_mode),numpy.int32(-1)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode),),None).wait() cl.enqueue_copy(self.queue, dest_buf, self.outputbuffer, byte_count=int(16*ofdm_mode), src_offset=0, dest_offset=int(ofdm_mode*guardinterval*16)).wait() cl.enqueue_copy(self.queue, dest_buf, self.outputbuffer, byte_count=int(ofdm_mode*guardinterval*16), src_offset=int(ofdm_mode - ofdm_mode*guardinterval)*16 ,dest_offset=0).wait() cl.enqueue_copy(self.queue, encoded_data, dest_buf) if numpy.allclose(reference_data, encoded_data, rtol=1.0000000000000001e-04, atol=1e-06): passed += 1 print "Test %d PASSED" % linecnt else: print "Test %d FAILED" % linecnt print "input data:" print data_to_encode print "encoded data[0]:" print encoded_data[0] print "reference data[0]:" print reference_data[0] print "error data:" #print reference_data - encoded_data linecnt += 1 print "%d pass out of %d" % (passed, linecnt-1) self.fd_input.close() self.fd_output.close() if passed == (linecnt-1): print "All ofdm ifft tests PASS\n" return True else: print "at least one ofdm ifft test FAILED\n" return False def test_algorithmE(self, ofdm_mode, guardinterval): print "\n**************************" print "test ofdm opencl ifft bealto.com radix 2 fft" passed = 0 linecnt = 1 g = 0 size = 0 if ofdm_mode == 8192: size = 6817 print "8k mode" if guardinterval == 0.25: self.fd_input = open('test_bench_ofdm_input_8K_1_4.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_4.csv', 'r') print "1/4 guard interval" g = guardinterval if guardinterval == 0.125: self.fd_input = open('test_bench_ofdm_input_8K_1_8.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_8.csv', 'r') print "1/8 guard interval" g = guardinterval if guardinterval == 0.0625: self.fd_input = open('test_bench_ofdm_input_8K_1_16.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_16.csv', 'r') print "1/16 guard interval" g = guardinterval if guardinterval == 0.03125: self.fd_input = open('test_bench_ofdm_input_8K_1_32.csv', 'r') self.fd_output = open('test_bench_ofdm_output_8K_1_32.csv', 'r') print "1/32 guard interval" g = guardinterval elif ofdm_mode == 2048: size = 1705 print "2k mode" if guardinterval == 0.25: self.fd_input = open('test_bench_ofdm_input_2K_1_4.csv', 'r') self.fd_output = 
open('test_bench_ofdm_output_2K_1_4.csv', 'r') print "1/4 guard interval" g = guardinterval if guardinterval == 0.125: self.fd_input = open('test_bench_ofdm_input_2K_1_8.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_8.csv', 'r') print "1/8 guard interval" g = guardinterval if guardinterval == 0.0625: self.fd_input = open('test_bench_ofdm_input_2K_1_16.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_16.csv', 'r') print "1/16 guard interval" g = guardinterval if guardinterval == 0.03125: self.fd_input = open('test_bench_ofdm_input_2K_1_32.csv', 'r') self.fd_output = open('test_bench_ofdm_output_2K_1_32.csv', 'r') print "1/32 guard interval" g = guardinterval if g == 0: print "wrong guardinterval specified" return if size == 0: print "wrong ofdm_mode" return kernel = self.load_kernel("../FFT.cl", "fftRadix2Kernel") swapkernel = self.load_kernel("../FFT.cl", "fftswaprealimag") #size of guiardinterval destination buffer dest_buf_size = ofdm_mode*(1+guardinterval) self.tmpbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=size*8) self.inputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdm_mode*8) # opencl buffer self.outputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=ofdm_mode*8) # opencl buffer holding the computed data dest_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=int(dest_buf_size*8) ) encoded_data = numpy.array(numpy.zeros(dest_buf_size), dtype=numpy.complex64) for line in self.fd_input: data_to_encode = numpy.array([float(0) + 1j*float(0)] * size, dtype=numpy.complex64) #data_to_encode = numpy.array([float(0) + 1j*float(0)] * ofdm_mode, dtype=numpy.complex64) cl.enqueue_copy(self.queue, self.inputbuffer, data_to_encode) cl.enqueue_copy(self.queue, self.outputbuffer, data_to_encode) counter = 0 for tmp in line.split(","): if string.find(tmp, " + ") > -1: data_to_encode[counter]=(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i",""))) if string.find(tmp, " - ") > -1: data_to_encode[counter]=(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i",""))) counter += 1 data_to_encode = numpy.array(data_to_encode, dtype=numpy.complex64) reference_data = [] for tmp in self.fd_output.readline().split(","): if string.find(tmp, " + ") > -1: reference_data.append(float(tmp.split(" + ")[0]) + 1j * float(string.replace(tmp.split(" + ")[1],"i",""))) if string.find(tmp, " - ") > -1: reference_data.append(float(tmp.split(" - ")[0]) - 1j * float(string.replace(tmp.split(" - ")[1],"i",""))) reference_data = numpy.array(reference_data, dtype=numpy.complex64) # do fftshift cl.enqueue_copy(self.queue, self.inputbuffer,numpy.array([float(0) + 1j*float(0)] * ofdm_mode, dtype=numpy.complex64)) cl.enqueue_copy(self.queue, self.tmpbuffer, data_to_encode) cl.enqueue_copy(self.queue, self.inputbuffer, self.tmpbuffer, byte_count=((size-1)/2)*8,src_offset=0,dest_offset=(ofdm_mode-(size-1)/2)*8) cl.enqueue_copy(self.queue, self.inputbuffer, self.tmpbuffer, byte_count=((size+1)/2)*8,src_offset=((size-1)/2)*8,dest_offset=0) swapkernel.set_args(self.inputbuffer) cl.enqueue_nd_range_kernel(self.queue,swapkernel,(int(ofdm_mode),),None).wait() kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(1)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(2)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() kernel.set_args(self.inputbuffer, self.outputbuffer, 
numpy.int32(4)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(8)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(16)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(32)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(64)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(128)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(256)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(512)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(1024)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() if ofdm_mode == 8192: kernel.set_args(self.outputbuffer, self.inputbuffer, numpy.int32(2048)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() kernel.set_args(self.inputbuffer, self.outputbuffer, numpy.int32(4096)) cl.enqueue_nd_range_kernel(self.queue,kernel,(int(ofdm_mode/2),),None).wait() swapkernel.set_args(self.outputbuffer) cl.enqueue_nd_range_kernel(self.queue,swapkernel,(int(ofdm_mode),),None).wait() cl.enqueue_copy(self.queue, dest_buf, self.outputbuffer, byte_count=int(8*ofdm_mode), src_offset=0, dest_offset=int(ofdm_mode*guardinterval*8)).wait() cl.enqueue_copy(self.queue, dest_buf, self.outputbuffer, byte_count=int(ofdm_mode*guardinterval*8), src_offset=int(ofdm_mode - ofdm_mode*guardinterval)*8 ,dest_offset=0).wait() cl.enqueue_copy(self.queue, encoded_data, dest_buf) if numpy.allclose(reference_data, encoded_data, rtol=1.0000000000000001e-04, atol=1e-06): passed += 1 print "Test %d PASSED" % linecnt else: print "Test %d FAILED" % linecnt print "input data:" print data_to_encode print "encoded data[0]:" print encoded_data[0] print "reference data[0]:" print reference_data[0] print "error data:" #print reference_data - encoded_data linecnt += 1 print "%d pass out of %d" % (passed, linecnt-1) self.fd_input.close() self.fd_output.close() if passed == (linecnt-1): print "All ofdm ifft tests PASS\n" return True else: print "at least one ofdm ifft test FAILED\n" return False
def myfunc(d_g):
    # Round-trip FFT on a GPU array: build a pyfft plan on the gputools device
    # queue, then run a forward and an inverse transform in place.
    from pyfft.cl import Plan
    from gputools import get_device

    plan = Plan(d_g.shape, queue=get_device().queue, fast_math=True)
    plan.execute(d_g.data, d_g.data)
    plan.execute(d_g.data, d_g.data, inverse=True)
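A possible round-trip check for myfunc, assuming gputools is available; the shape and tolerance are illustrative. Assuming the plan is left unnormalized (the normalize flag is not passed here, unlike the plans above), the round trip returns the input scaled by the number of elements.

import numpy as np
from gputools import OCLArray

a = (np.random.rand(64, 64, 64) + 1j * np.random.rand(64, 64, 64)).astype(np.complex64)
d_g = OCLArray.from_array(a)
myfunc(d_g)
# Divide by the element count to undo the unnormalized forward/inverse pair.
print(np.allclose(d_g.get() / a.size, a, atol=1e-3))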
def gs_gpu(idata, itera=100):
    """Gerchberg-Saxton algorithm to calculate DOEs using the GPU

    Calculates the phase distribution in an object plane needed to obtain a
    specific amplitude distribution in the target plane. It uses an FFT to
    calculate the field propagation. The wavefront at the DOE plane is
    assumed to be a plane wave.

    **ARGUMENTS:**

        ========== ======================================================
        idata      numpy array containing the target amplitude distribution
        itera      Maximum number of iterations
        ========== ======================================================
    """
    pl = cl.get_platforms()[0]
    devices = pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue, dtype=complex128)  # does not work with the string "complex128"
    src = str(Template(KERNEL).render(
        double_support=all(has_double_support(dev) for dev in devices),
        amd_double_support=all(has_amd_double_support(dev) for dev in devices)))
    prg = cl.Program(ctx, src).build()

    idata_gpu = cl_array.to_device(queue, ifftshift(idata).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data, fdata_gpu.data)

    e = 1000
    ea = 1000
    for i in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)
        tr = rdata_gpu.get()
        rdata = ifftshift(tr)

        # TODO: This calculation should be done in the GPU
        e = (abs(rdata) - idata).std()
        if e > ea:
            break
        ea = e

        prg.norm2(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data)
        plan.execute(rdata_gpu.data, fdata_gpu.data)

    fdata = fdata_gpu.get()
    #~ prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
    fdata = ifftshift(fdata)
    fdata = exp(1.j * angle(fdata))
    #~ fdata = fdata_gpu.get()
    return fdata
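For reference, a minimal NumPy-only sketch of the same Gerchberg-Saxton loop. The roles assumed for the norm and norm2 kernels (unit-amplitude constraint in the DOE plane, target-amplitude constraint in the image plane) are inferred from the algorithm, not read from KERNEL.

import numpy as np

def gs_cpu(idata, itera=100):
    idata = np.asarray(idata, dtype=np.float64)
    fdata = np.fft.fft2(np.fft.ifftshift(idata).astype(np.complex128))
    ea = np.inf
    for _ in range(itera):
        # DOE-plane constraint: keep only the phase (assumed role of prg.norm).
        fdata = np.exp(1j * np.angle(fdata))
        rdata = np.fft.ifft2(fdata)
        # Amplitude error in the image plane against the target distribution.
        e = (np.abs(np.fft.fftshift(rdata)) - idata).std()
        if e > ea:
            break
        ea = e
        # Image-plane constraint: impose the target amplitude (assumed role of prg.norm2).
        rdata = np.fft.ifftshift(idata) * np.exp(1j * np.angle(rdata))
        fdata = np.fft.fft2(rdata)
    return np.exp(1j * np.angle(np.fft.fftshift(fdata)))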
class SplitStep(object): ''' An OpenCL version of the split-step parabolic equation solver. ''' _kernel = util.srcpath(__file__, 'clsrc', 'splitstep.mako') def __init__(self, k0, nx, ny, h, d=None, l=10, dz=None, w=0.39, propcorr=None, phasetol=None, context=None): ''' Initialize a split-step engine over an nx-by-ny grid with isotropic step size h. The unitless wave number is k0. The wave is advanced in steps of dz or (if dz is not provided) h. If d is provided, it is a 4-tuple that describes the directivity of any source as d = (dx, dy, dz, w), where (dx, dy, dz) is the directivity axis and w is the beam width parameter. Otherwise, all sources are treated as point sources. If l is specified and greater than zero, it is the width of a Hann window used to attenuate the field along each edge. The parameter w (as a multiplier 1 / w**2) governs the high-order spectral cross term. If propcorr is specified, it should be a tuple of Booleans of the form (hospec, hospat) that determines whether corrections involving high-order spectral or spatial terms, respectively, are used in the propagator by default. If propcorr is unspecified, both corrections are used. The parameter phasetol specifies the maximum permissible phase deviation (in fractions of pi), relative to propagation through the homogeneous background, incurred by propagating through the inhomogeneous medium. The number of steps per slab will be adjusted so that the phase shift incurred by propagating through materials with the most extreme sound speeds will never exceed phasetol. ''' # Ensure that the phase tolerance is not too small # Otherwise, number of propagation steps will blow up uncontrollably if phasetol is not None and abs(phasetol) < 1e-6: raise ValueError('Phase tolerance must be greater than 1e-6') # Copy the parameters self.grid = nx, ny self.h, self.k0, self.l = h, k0, l self.w = np.float32(1. / w**2) self.phasetol = phasetol # Specify the use of corrective terms if propcorr is not None: self.propcorr = tuple(propcorr) else: self.propcorr = (True, True) # Set the step length self.dz = dz if dz else h # Grab the provided context or create a default self.context = util.grabcontext(context) # Build the program for the context t = Template(filename=self._kernel, output_encoding='ascii') src = t.render(grid = self.grid, k0=k0, h=h, d=d, l=l) self.prog = cl.Program(self.context, src).build() # Create a command queue for forward propagation calculations self.fwdque = cl.CommandQueue(self.context) # Create command queues for transfers self.recvque = cl.CommandQueue(self.context) self.sendque = cl.CommandQueue(self.context) # Create an FFT plan in the OpenCL propagation queue # Reorder the axes to conform with row-major ordering self.fftplan = Plan((ny, nx), queue=self.fwdque) grid = self.grid def newbuffer(): nbytes = cutil.prod(grid) * np.complex64().nbytes flags = cl.mem_flags.READ_WRITE return util.SyncBuffer(self.context, flags, size=nbytes) # Buffers to store the propagating (twice) and backward fields self.fld = [newbuffer() for i in range(3)] # Scratch space used during computations self.scratch = [newbuffer() for i in range(3)] # The index of refraction gets two buffers for transmission self.obj = [newbuffer() for i in range(2)] # Two buffers are used for the Goertzel FFT of the contrast source self.goertzbuf = [newbuffer() for i in range(2)] # The sound speed extrema for the current slab are stored here self.speedlim = [1., 1.] 
# Initialize buffer to hold results of advance() self.result = newbuffer() # By default, volume fields will be transfered from the device self._goertzel = False # Initialize refractive index and fields self.reset() # By default, device exchange happens on the full grid self.rectxfer = util.RectangularTransfer(grid, grid, np.complex64, alloc_host=False) def slicecoords(self): ''' Return the meshgrid coordinate arrays for an x-y slab in the current engine. The origin is in the center of the slab. ''' g = self.grid cg = np.ogrid[:g[0], :g[1]] return [(c - 0.5 * (n - 1.)) * self.h for c, n in zip(cg, g)] def reset(self, propcorr = None, goertzel = False): ''' Reset the propagating and backward fields to zero and the prior refractive index buffer to unity. If propcorr is provided, it should be tuple as described in the docstring for __init__(). This will change the default behavior of corrective terms in the propagator. If goertzel is True, calls to advance() with shift=True will not copy the shifted, combined field from the device. Instead, the field will be used in a Goertzel algorithm to compute (slice by slice) the Fourier transform, restricted to the unit sphere, of induced scattering sources. ''' if propcorr is not None: self.propcorr = tuple(propcorr) grid = self.grid z = np.zeros(grid, dtype=np.complex64) for a in self.fld + self.obj + self.goertzbuf: cl.enqueue_copy(self.fwdque, a, z, is_blocking=False) self.speedlim = [1., 1.] self._goertzel = goertzel def setroi(self, rgrid): ''' Set a region of interest that will limit device transfers within the computational grid. ''' self.rectxfer = util.RectangularTransfer(self.grid, rgrid, np.complex64, alloc_host=False) def setincident(self, srcloc, idx = 0): ''' Set the value of the CL field buffer at index idx to the incident field at a location srcloc represented as a 3-tuple. The field plane is always assumed to have a z-height of 0; the z coordinate of srcloc is therefore the height of the source above the field plane. The transverse origin (x, y) = (0, 0) corresponds to the midpoint of the field plane. ''' inc = self.fld[idx] sx, sy, dz = [np.float32(s) for s in srcloc] self.prog.green3d(self.fwdque, self.grid, None, inc, sx, sy, dz) def objupdate(self, obj): ''' Update the rolling buffer with the index of refraction, corresponding to an object contrast obj, for the next slab. The transmission queue is used for updates to facilitate concurrency with forward propagation algorithms. ''' # Transfers occur in the transmission queue for concurrency prog, queue, grid = self.prog, self.recvque, self.grid # Roll the buffer so the next slab is second nxt, cur = self.obj self.obj = [cur, nxt] # Figure approximate sound speed extrema in the upcoming slab # If the imaginary part of the wave number is negligible # compared to the real, maximum sound speed corresponds to the # minimum contrast and vice versa if self.phasetol: ctextrema = [np.max(obj.real), np.min(obj.real)] self.speedlim = [1 / np.sqrt(ctv + 1.) 
for ctv in ctextrema] # Ensure buffer is not used by prior calculations nxt.sync(queue) # Transfer the object contrast into the next-slab buffer evt = self.rectxfer.todevice(queue, nxt, obj) # Return buffers of the current and next slabs and a transfer event return cur, nxt, evt def propagate(self, fld = None, dz = None, idx = 0, corr = None): ''' Propagate the field stored in the device buffer fld (or, if fld is None, the current in-device field at index idx) a step dz (or, if dz is None, the default step size) through the currently represented medium. If corr is not None, it should be a tuple as described in the reset() docstring to control the use of corrective terms in the spectral propagator. Otherwise, the instance default is used. ''' prog, grid = self.prog, self.grid fwdque = self.fwdque hospec, hospat = corr if corr is not None else self.propcorr # Point to the field, scratch buffers, and refractive index if fld is None: fld = self.fld[idx] u, v, x = self.scratch obj = self.obj[0] # These constants will be used in field computations one = np.float32(1.) dz = np.float32(dz if dz is not None else self.dz) # Attenuate the boundaries using a Hann window, if desired if self.l > 0: prog.attenx(fwdque, (self.l, grid[1]), None, fld) prog.atteny(fwdque, (grid[0], self.l), None, fld) # Multiply, in v, the field by the contrast if hospec: prog.ctmul(fwdque, grid, None, v, obj, fld) # Multiply, in u, the field by the high-order spatial operator if hospat: prog.hospat(fwdque, grid, None, u, obj, fld) # From here, the field should be spectral self.fftplan.execute(fld) # Compute high-order spatial corrections or set the buffer to NULL if hospat: # With high-order spatial terms, transform u as well self.fftplan.execute(u) # Compute the scaled, spectral Laplacians of u and the field (in x) prog.laplacian(fwdque, grid, None, u, u) prog.laplacian(fwdque, grid, None, x, fld) # Apply the high-order spatial operator to x self.fftplan.execute(x, inverse=True) prog.hospat(fwdque, grid, None, x, obj, x) self.fftplan.execute(x) # Add x to u to get the high-order spatial corrections prog.caxpy(fwdque, grid, None, x, one, u, x) else: x = None # Compute high-order spectral corrections or set buffers to NULL if hospec: # Apply, in u, the high-order spectral operator to the field prog.hospec(fwdque, grid, None, u, fld) # Multiply u by the contrast in the spatial domain self.fftplan.execute(u, inverse=True) prog.ctmul(fwdque, grid, None, u, obj, u) # Let v = v + u / w**2 in the spatial domain prog.caxpy(fwdque, grid, None, v, self.w, u, v) # Transform u and v into the spectral domain self.fftplan.execute(u) self.fftplan.execute(v) # Apply the high-order spectral operator to the new v prog.hospec(fwdque, grid, None, v, v) else: u, v = None, None # Add all appropriate high-order corrections to the field if hospat or hospec: prog.corrfld(fwdque, grid, None, fld, u, v, x, dz) # Propagate the field through a homogeneous slab prog.propagate(fwdque, grid, None, fld, dz) # Take the inverse FFT of the field and the Laplacian self.fftplan.execute(fld, inverse=True) # Multiply by the phase screen, returning the event return prog.screen(fwdque, grid, None, fld, obj, dz) def advance(self, obj, shift=False, corr=None, shcorr=None): ''' Propagate a field through the current slab and transmit it through an interface with the next slab characterized by object contrast obj. The transmission overwrites the refractive index of the current slab with the interface reflection coefficients. 
If shift is True, the forward is shifted by half a slab to agree with full-wave solutions and includes a backward-traveling contribution caused by reflection from the interface with the next slab. The relevant result (either the forward field or the half-shifted combined field) is copied into a device-side buffer for later retrieval and handling. If corr is not None, it should be a tuple as specified in the reset() docstring to override the default use of corrective terms in the spectral propagator. The argument shcorr is interpreted exactly as corr, but is used instead of corr for the propagation used to shift the field to the center of the slab. ''' prog, grid = self.prog, self.grid fwdque, recvque, sendque = self.fwdque, self.recvque, self.sendque # Point to the field components fwd, bck, buf = [f for f in self.fld] if shift: # Ensure that a prior copy isn't using the buffer buf.sync(fwdque) # Copy the forward field for shifting if necessary cl.enqueue_copy(fwdque, buf, fwd) # Copy the sound speed extrema for the current slab speedlim = list(self.speedlim) # Push the next slab to its buffer (overwrites speed extrema) ocur, onxt, obevt = self.objupdate(obj) if self.phasetol is not None: # Figure maximum propagation distance to not # exceed maximum permissible phase deviation dzl = [] for spd in speedlim: # Sign governs the sign of the phase deviation, # which is irrelevant, so ignore it here spdiff = max(abs(spd - 1.), 1e-8) # Preventing spdiff from reaching zero limits # maximum permissible propagation distance dzl.append(abs(0.5 * self.phasetol * spd / spdiff)) # Subdivide the slab into maximum propagation distance nsteps = max(1, int(np.round(self.dz / min(dzl)))) else: nsteps = 1 dz = self.dz / nsteps # Ensure that no prior copy is using the field buffer fwd.sync(fwdque) # Propagate the forward field through the slab on the fwdque for i in range(nsteps): self.propagate(fwd, dz, corr=corr) # Ensure next slab has been received before handling interface cl.enqueue_barrier(fwdque, wait_for=[obevt]) # Compute transmission through the interface # The reflected field is only of interest if a shift is desired transevt = prog.txreflect(fwdque, grid, None, fwd, bck if shift else None, ocur, onxt) # Hold the current contrast slab until the transmission is done ocur.attachevent(transevt) if shift: # Add the forward and backward fields prog.caxpy(fwdque, grid, None, buf, np.float32(1.), buf, bck) # Propagate the combined field a half step # Save the propagation event for delaying result copies pevt = self.propagate(buf, 0.5 * self.dz, corr=shcorr) # Handle Goertzel iterations to compute the Fourier # transform of the contrast source on the unit sphere if self._goertzel: # Compute the FFT of the source in the XY plane crt = self.scratch[0] prog.ctmul(fwdque, grid, None, crt, ocur, buf) self.fftplan.execute(crt) # Compute the next Goertzel iteration pn1, pn2 = self.goertzbuf dz = np.float32(self.dz) # The final argument (slab count) is not yet used nz = np.int32(0) prog.goertzelfft(fwdque, grid, None, pn1, pn2, crt, dz, nz) # Cycle the Goertzel buffers self.goertzbuf = [pn2, pn1] else: # Copy the shifted field into the result buffer # No result sync necessary, all mods occur on sendque evt = cl.enqueue_copy(sendque, self.result, buf, wait_for=[pevt]) # Attach the copy event to the source buffer buf.attachevent(evt) else: # Copy the forward field into the result buffer # Wait for transmissions to finish for consistency evt = cl.enqueue_copy(sendque, self.result, fwd, wait_for=[transevt]) # Attach 
the copy event to the field buffer fwd.attachevent(evt) def getresult(self, hbuf): ''' Wait for the intra-device transfer of the previous result to the result buffer, and initiate a device-to-host copy of the valid result buffer into hbuf. An event corresponding to the transfer is returned. ''' sendque = self.sendque # Initiate the rectangular transfer on the transfer queue # No sync necessary, all mods to result buffer occur on sendque evt = self.rectxfer.fromdevice(sendque, self.result, hbuf)[1] # Attach the copy event to the result buffer self.result.attachevent(evt) return evt def goertzelfft(self, nz = 0): ''' Finish Goertzel iterations carried out in repeated calls to advance() and copy the positive and negative hemispheres of the Fourier transform of the contrast source to successive planes of a Numpy array. If nz is specified, it is the number of slabs involved in the Fourier transform and is used to properly scale the output of the Goertzel algorithm. When nz = 0, no scaling is performed. Copies are synchronous and are done on the forward propagation queue. ''' prog, grid = self.prog, self.grid fwdque = self.fwdque hemispheres = np.zeros(list(grid) + [2], dtype=np.complex64, order='F') # If the spectral field hasn't been computed, just return zeros if not self._goertzel: return hemispheres # Finalize the Goertzel iteration pn1, pn2 = self.goertzbuf dz = np.float32(self.dz) nz = np.int32(nz) # Pass None as the contrast current to signal final iteration # After this, pn1 is the positive hemisphere, pn2 is the negative prog.goertzelfft(fwdque, grid, None, pn1, pn2, None, dz, nz) # Copy the two hemispheres into planes of an array cl.enqueue_copy(fwdque, hemispheres[:,:,0:1], pn1, is_blocking=False) cl.enqueue_copy(fwdque, hemispheres[:,:,1:2], pn2, is_blocking=True) return hemispheres
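The propagate() method above interleaves FFTs with spectral and spatial multiplications; stripped of the high-order corrections and edge attenuation, one split-step update amounts to a homogeneous spectral propagator followed by a spatial phase screen for the contrast. A compact NumPy sketch of that basic update follows; the exact propagator and screen expressions used in splitstep.mako are not shown in this file, so the standard parabolic-equation forms below are an assumption.

import numpy as np

def split_step_slice(fld, obj, k0, h, dz):
    """Advance a 2-D field one step dz through a slab with contrast obj."""
    ny, nx = fld.shape
    kx = 2 * np.pi * np.fft.fftfreq(nx, d=h)
    ky = 2 * np.pi * np.fft.fftfreq(ny, d=h)
    kxg, kyg = np.meshgrid(kx, ky)
    # Homogeneous spectral propagator exp(i*kz*dz), kz = sqrt(k0^2 - kt^2);
    # evanescent components (kt > k0) decay through the complex square root.
    kz = np.sqrt((k0**2 - kxg**2 - kyg**2).astype(complex))
    fld = np.fft.ifft2(np.fft.fft2(fld) * np.exp(1j * kz * dz))
    # Inhomogeneous phase screen for the contrast: exp(i*k0*dz*(sqrt(1+obj) - 1)).
    return fld * np.exp(1j * k0 * dz * (np.sqrt(1. + obj) - 1.))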
class Convolve: """ Class that computes the necessary information to perform a convolution and provides the actual convolution function. Can handle 2d or 3d convolutions. """ def __init__(self, in_size, kernel_size, batch_size, context, queue): self.sizes = [] for i in xrange(len(in_size)): self.sizes.append(get_power_of_two(in_size[i] + kernel_size[i] + 1)) self.sizes = tuple(self.sizes) self.ctx = context self.queue = queue self.plan = Plan(self.sizes, queue=self.queue) self.in_array = cl.array.zeros( self.queue, (batch_size, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64) self.kernel = cl.array.zeros( self.queue, (batch_size, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64) self.result_buffer = numpy.zeros(self.in_array.shape, numpy.complex64) self.kernel_center = [] for i in xrange(len(kernel_size)): self.kernel_center.append(kernel_size[i] / 2) self.kernel_center = tuple(self.kernel_center) self.halves = [] for i in xrange(len(kernel_size)): self.halves.append(numpy.ceil(kernel_size[i] / 2.0)) self.halves = tuple(self.halves) self.padding_locations = [] for i in xrange(len(self.sizes)): # without this if even kernels result in an incorrect edge in the result if kernel_size[i] % 2 == 0: self.padding_locations.append( -1 * ((in_size[i] - self.sizes[i]) / 2)) self.padding_locations.append( -1 * ((self.sizes[i] - in_size[i]) / 2)) else: self.padding_locations.append((self.sizes[i] - in_size[i]) / 2) self.padding_locations.append((in_size[i] - self.sizes[i]) / 2) self.padding_locations = tuple(self.padding_locations) self.valid_locations = [] for i in xrange(len(self.sizes)): self.valid_locations.append(self.padding_locations[(i * 2)] + self.halves[i] - 1) self.valid_locations.append(self.padding_locations[(i * 2)] + self.halves[i] + in_size[i] - kernel_size[i]) self.valid_locations = tuple(self.valid_locations) self.full_locations = [] for i in xrange(len(self.sizes)): offset = self.sizes[i] - (in_size[i] + kernel_size[i] - 1) self.full_locations.append(offset / 2) self.full_locations.append(-offset / 2) self.batch_size = batch_size def convolution(self, input_matrix, kernel, type_='valid'): in_array = numpy.zeros( (self.batch_size, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64) in_array[:, self.padding_locations[0]:self.padding_locations[1], self.padding_locations[2]:self.padding_locations[3], self. 
padding_locations[4]:self.padding_locations[5]] = input_matrix self.in_array = cl.array.to_device(self.queue, in_array) kernel_buffer = numpy.zeros( (self.batch_size, self.sizes[0], self.sizes[1], self.sizes[2]), numpy.complex64) kernel_buffer[:, :self.halves[0], :self.halves[1], :self.halves[2]] = \ kernel[self.kernel_center[0]:, self.kernel_center[1]:, self.kernel_center[2]:] kernel_buffer[:, :self.halves[0], :self.halves[1], -self.kernel_center[2]:] = \ kernel[self.kernel_center[0]:, self.kernel_center[1]:, :self.kernel_center[2]] kernel_buffer[:, :self.halves[0], -self.kernel_center[1]:, :self.halves[2]] = \ kernel[self.kernel_center[0]:, :self.kernel_center[1], self.kernel_center[2]:] kernel_buffer[:, :self.halves[0], -self.kernel_center[1]:, -self.kernel_center[2]:] = \ kernel[self.kernel_center[0]:, :self.kernel_center[1], :self.kernel_center[2]] if kernel.shape[0] > 1: kernel_buffer[:, -self.kernel_center[0]:, :self.halves[1], :self.halves[2]] = \ kernel[:self.kernel_center[0], self.kernel_center[1]:, self.kernel_center[2]:] kernel_buffer[:, -self.kernel_center[0]:, :self.halves[1], -self.kernel_center[2]:] = \ kernel[:self.kernel_center[0], self.kernel_center[1]:, :self.kernel_center[2]] kernel_buffer[:, -self.kernel_center[0]:, -self.kernel_center[1]:, :self.halves[2]] = \ kernel[:self.kernel_center[0], :self.kernel_center[1], self.kernel_center[2]:] kernel_buffer[:, -self.kernel_center[0]:, -self.kernel_center[1]:, -self.kernel_center[2]:] = \ kernel[:self.kernel_center[0], :self.kernel_center[1], :self.kernel_center[2]] self.kernel = cl.array.to_device(self.queue, kernel_buffer) # fourier transform, point wise multiply, then invert => convolution self.plan.execute(self.in_array.data, batch=self.batch_size) self.plan.execute(self.kernel.data, batch=self.batch_size) self.result_buffer = self.in_array * self.kernel self.plan.execute(self.result_buffer.data, inverse=True, batch=self.batch_size) result = self.result_buffer.get().astype(float) if type_ == 'same': return result[:, self.padding_locations[0]:self.padding_locations[1], self.padding_locations[2]:self.padding_locations[3], self.padding_locations[4]:self.padding_locations[5]] elif type_ == 'full': return result[:, self.full_locations[0]:self.full_locations[1], self.full_locations[2]:self.full_locations[3], self.full_locations[4]:self.full_locations[5]] elif type_ == 'valid': return result[:, self.valid_locations[0]:self.valid_locations[1], self.valid_locations[2]:self.valid_locations[3], self.valid_locations[4]:self.valid_locations[5]]
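A possible usage sketch for Convolve, assuming a PyOpenCL context and queue have been created. The grid sizes, the batch of one, and the comparison against scipy.signal.fftconvolve are illustrative; the orientation and offset conventions of the OpenCL path may differ slightly from SciPy's, so the printed residual is a diagnostic rather than an exact-match test.

import numpy
import pyopencl as cl
from scipy.signal import fftconvolve

ctx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(ctx)

in_size, kernel_size, batch = (16, 16, 16), (3, 3, 3), 1
conv = Convolve(in_size, kernel_size, batch, ctx, queue)

data = numpy.random.rand(batch, *in_size).astype(numpy.float32)
kern = numpy.random.rand(*kernel_size).astype(numpy.float32)

gpu_valid = conv.convolution(data, kern, type_='valid')
cpu_valid = fftconvolve(data[0], kern, mode='valid')
# Report the worst-case difference between the GPU and CPU results.
print(numpy.abs(gpu_valid[0] - cpu_valid).max())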