def kspacegaussian_filter_pyfftCL(ksp, sigma):
    """Gaussian-filter 3-D k-space data on the GPU and return the image.

    The data is uploaded once, multiplied in place by a Gaussian weight in
    an OpenCL kernel, inverse-transformed with a pyfft plan and finally
    passed through ``np.fft.fftshift`` on the host.

    Changes relative to the previous implementation:

    * the FFT plan is built from ``ksp.shape`` instead of a hard-coded
      512 x 512 x 512, so other shapes accepted by pyfft work as well;
    * one context/queue is used for both the kernel and the FFT, which
      removes an unused plan and a device -> host -> device round trip;
    * the input is converted to contiguous complex64, the type the kernel
      (``cfloat_t``) and the default pyfft plan operate on.

    Parameters
    ----------
    ksp : array-like, 3-D
        K-space data; converted to complex64 if it is not already.
    sigma : sequence of 3 numbers
        Gaussian widths substituted into the kernel as ``sigma[0..2]``.

    Returns
    -------
    numpy.ndarray
        The fftshift-ed, normalised inverse FFT of the filtered data.
    """
    clear_first_arg_caches()
    ksp = np.ascontiguousarray(ksp, dtype=np.complex64)
    sz = ksp.shape

    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)
    data_dev = cl_array.to_device(queue, ksp)

    FACTOR = 1.0  # substituted into the kernel, where it is currently unused
    # NOTE(review): the kernel addresses memory as z*dim1*dim2 + y*dim1 + x
    # (x fastest) with dim1 = sz[0], while a C-ordered numpy array has its
    # last axis fastest. For cubic data with equal sigmas this makes no
    # difference; for other inputs the axis mapping should be confirmed.
    program = cl.Program(ctx, """
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#include "pyopencl-complex.h"
__kernel void gauss_kernel(__global cfloat_t *dest) //, __global cfloat_t *src)
{
  uint x = get_global_id(0);uint y = get_global_id(1);uint z = get_global_id(2);
  uint dim1= %d;
  uint dim2= %d;
  uint dim3= %d;
  float sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  float factor = %f;
  float TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592;
  ulong idx = z*dim1*dim2 + y*dim1 + x;
  float i = (float)(x); //(x / dim3) / dim2);
  i = (i - (float)floor((float)(dim1)/2.0f))/(float)(dim1);
  float j = (float)y; //(x / dim3);
  //if((int)j > dim2) {j=(float)fmod(j, (float)dim2);};
  j = (j - (float)floor((float)(dim2)/2.0f))/(float)(dim2);
  //Account for large global index (stored as ulong) before performing modulus
  //double pre_k=fmod((double)(x) , (double) dim3);
  float k = (float) z; // pre_k;
  k = (k - (float)floor((float)(dim3)/2.0f))/(float)(dim3);
  float weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = dest[idx].x * weight;
  dest[idx].y = dest[idx].y * weight;
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR)).build()

    # One work-item per voxel; wait so the filtered data is complete
    # before the FFT kernels are scheduled.
    program.gauss_kernel(queue, sz, None, data_dev.data).wait()

    # Inverse FFT in place on the already-resident device buffer.
    plan = Plan(sz, normalize=True, queue=queue)
    plan.execute(data_dev.data, inverse=True)
    result = data_dev.get()
    queue.finish()

    return np.fft.fftshift(result)
def __init__(self, in_scale, in_matrix, queue):
    """Store the transform shape, build its FFT plan and upload the data.

    in_scale  -- iterable giving the plan shape (stored as a tuple)
    in_matrix -- host array kept as ``self.data`` and copied to the device
    queue     -- OpenCL command queue used for the plan and the upload
    """
    shape = tuple(in_scale)
    self.scale = shape

    # FFT plan bound to the caller's queue
    self.plan = Plan(shape, queue=queue)

    # Keep the host array and a device-side copy of it
    self.data = in_matrix
    self.gpu_data = cl_array.to_device(queue, in_matrix)
def test_fft(self):
    """Check ip.fft_2 / ip.ifft_2 round trips and agreement with numpy."""
    samples = np.random.normal(100, 100, size=(4, 4))
    data = gpu_util.get_array(samples.astype(cfg.PRECISION.np_float))
    orig = gpu_util.get_host(data)

    # Forward then inverse without an explicit plan; the return value of
    # ifft_2 is ignored, the check reads `data` itself afterwards.
    data = ip.fft_2(data)
    ip.ifft_2(data)
    np.testing.assert_almost_equal(orig, data.get().real, decimal=4)

    # Same round trip with a caller-supplied plan
    from pyfft.cl import Plan
    explicit_plan = Plan((4, 4), queue=cfg.OPENCL.queue)
    data = ip.fft_2(np.copy(orig), plan=explicit_plan)
    ip.ifft_2(data, plan=explicit_plan)
    np.testing.assert_almost_equal(orig, data.get().real, decimal=4)

    # Double precision: compare each direction against numpy's FFT
    syris.init(double_precision=True, device_index=0)
    samples = np.random.normal(100, 100, size=(4, 4))
    data = gpu_util.get_array(samples.astype(cfg.PRECISION.np_float))

    expected = np.fft.fft2(data.get())
    data = ip.fft_2(data)
    np.testing.assert_almost_equal(expected, data.get(), decimal=4)

    expected = np.fft.ifft2(data.get())
    data = ip.ifft_2(data)
    np.testing.assert_almost_equal(expected, data.get(), decimal=4)
def __call__(self, domain, field):
    """Propagate *field* over ``self.length`` in ``self.total_steps`` steps.

    A fresh FFT plan is built for ``domain.total_samples``, the working
    arrays are sent to the device, and ``cl_rk4ip`` is invoked once per
    step. Returns the content of ``self.buf_field`` read back to the host.
    """
    # Plan used for the fast Fourier transforms:
    self.plan = Plan(domain.total_samples, queue=self.queue)

    scratch = np.empty_like(field)
    interaction = np.empty_like(field)

    from pyofss.modules.linearity import Linearity
    linearity = Linearity(beta=[0.0, 0.0, 0.0, 1.0], sim_type="default")
    factor = linearity(domain)

    self.send_arrays_to_device(field, scratch, interaction, factor)

    stepsize = self.length / self.total_steps
    positions = np.linspace(0.0, self.length, self.total_steps + 1)

    # One RK4IP step per interval; the position itself is not used.
    for _ in positions[:-1]:
        self.cl_rk4ip(self.buf_field, self.buf_temp,
                      self.buf_interaction, self.buf_factor, stepsize)

    return self.buf_field.get()
def __init__(self, total_samples, dorf, length=None, total_steps=None,
             name="ocl_fibre"):
    """Initialise the OpenCL state and the FFT plan for this module.

    Fix: the plan built from ``total_samples`` used to be overwritten by a
    later ``self.plan = None``, so the work of constructing it was thrown
    away. The plan is now kept.

    total_samples -- size passed to the pyfft ``Plan``
    dorf          -- forwarded to ``self.cl_initialise``
                     (presumably selects float/double precision -- confirm)
    length        -- stored as ``self.length``
    total_steps   -- stored as ``self.total_steps``
    name          -- stored as ``self.name``
    """
    self.name = name

    # Placeholders set before cl_initialise runs; it is expected to fill
    # at least self.queue, which the plan below needs.
    self.queue = None
    self.np_float = None
    self.np_complex = None
    self.prg = None
    self.cl_initialise(dorf)

    self.plan = Plan(total_samples, queue=self.queue)

    # Device buffers are created later.
    self.buf_field = None
    self.buf_temp = None
    self.buf_interaction = None
    self.buf_factor = None

    self.shape = None

    self.cached_factor = False
    # Force usage of cached version of function:
    self.cl_linear = self.cl_linear_cached

    self.length = length
    self.total_steps = total_steps
def gs_mod_gpu(idata, itera=10, osize=256):
    """Iterative phase retrieval on the GPU with a random outer-field mask.

    A random unit-amplitude phase mask, zeroed inside a centred square of
    side ``osize``, is added to ``idata``; the sum is transformed back and
    forth ``itera`` times using the ``norm`` and ``norm1`` kernels from
    ``KERNEL``.

    Changes relative to the previous implementation:

    * ``error_gpu`` is uploaded with ``to_device(queue, array)``, matching
      the other uploads in this function (it used to pass ``ctx`` as an
      extra first argument);
    * the zeroed square is centred on ``idata.shape // 2`` instead of a
      hard-coded 512 (identical for 1024 x 1024 input) and its lower
      bounds are clamped at 0;
    * an initial upload and FFT whose results were immediately overwritten
      were removed;
    * the ``norm1`` kernel object is configured once, outside the loop.

    idata -- 2-D numpy array with the target distribution
    itera -- number of iterations
    osize -- side length of the central square left free of the mask

    Returns a complex array ``exp(1j * phase)`` of the final field.
    """
    cut = osize // 2

    pl = cl.get_platforms()[0]
    devices = pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    # Original note (translated from Spanish): 'does not work with "complex128"'
    plan = Plan(idata.shape, queue=queue, dtype=complex128)

    src = str(
        Template(KERNEL).render(
            double_support=all(has_double_support(dev) for dev in devices),
            amd_double_support=all(
                has_amd_double_support(dev) for dev in devices)))
    prg = cl.Program(ctx, src).build()

    # Random phase everywhere except a centred (2*cut) x (2*cut) square.
    mask = exp(2.j * pi * random(idata.shape))
    row_mid = idata.shape[0] // 2
    col_mid = idata.shape[1] // 2
    mask[max(row_mid - cut, 0):row_mid + cut,
         max(col_mid - cut, 0):col_mid + cut] = 0

    idata_gpu = cl_array.to_device(
        queue, ifftshift(idata + mask).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    error_gpu = cl_array.to_device(
        queue, zeros(idata_gpu.shape).astype("double"))
    plan.execute(idata_gpu.data, fdata_gpu.data)

    # Configure the scalar argument type once instead of on every pass.
    norm1 = prg.norm1
    norm1.set_scalar_arg_dtypes([None, None, None, int32])

    for _ in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)
        norm1(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data,
              error_gpu.data, int32(cut))

        # Error estimate; only the (currently disabled) early-stop check
        # would consume it, so it does not affect the returned value.
        e = sqrt(cl_array.sum(error_gpu).get()) / (2 * cut)

        plan.execute(rdata_gpu.data, fdata_gpu.data)

    fdata = ifftshift(fdata_gpu.get())
    return exp(1.j * angle(fdata))
def _get_plan(itype, otype, inlen):
    """Return the cached FFT plan for (itype, otype, inlen), creating it once.

    ``otype`` only takes part in the cache key; the plan itself is built
    from ``inlen`` and ``itype``.
    """
    key = (itype, otype, inlen)
    if key not in _plans:
        _plans[key] = Plan(inlen, dtype=itype,
                           queue=pycbc.scheme.mgr.state.queue,
                           normalize=False, fast_math=True)
    return _plans[key]
def _ocl_fft_gpu_inplace(ocl_arr, inverse=False, plan=None):
    """Run an FFT on *ocl_arr*, writing the result back into the same buffer.

    The array must pass ``assert_bufs_type(np.complex64, ...)``. When no
    plan is given, one is created for the array's shape on the queue of
    ``get_device()``.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    fft = plan
    if fft is None:
        fft = Plan(ocl_arr.shape, queue=get_device().queue)

    # Same buffer as input and output -> in-place transform
    buf = ocl_arr.data
    fft.execute(buf, buf, inverse=inverse)
def __init__(self, in_size, kernel_size, batch_size, context, queue):
    """Prepare plan, device arrays and slice offsets for batched FFT convolution.

    NOTE: this block uses ``xrange``, i.e. it is Python 2 code; ``/`` on
    the size values is kept exactly as written (floor division when the
    sizes are ints -- assumed, confirm against callers).

    in_size     -- per-axis input size (three axes are indexed below)
    kernel_size -- per-axis kernel size
    batch_size  -- leading dimension of the device arrays
    context     -- OpenCL context, stored as ``self.ctx``
    queue       -- OpenCL command queue used for the plan and the arrays
    """
    # Transform size per axis as returned by get_power_of_two
    self.sizes = tuple(
        [get_power_of_two(in_size[i] + kernel_size[i] + 1)
         for i in xrange(len(in_size))])

    self.ctx = context
    self.queue = queue
    self.plan = Plan(self.sizes, queue=self.queue)

    batched_shape = (batch_size, self.sizes[0], self.sizes[1], self.sizes[2])
    self.in_array = cl.array.zeros(self.queue, batched_shape, numpy.complex64)
    self.kernel = cl.array.zeros(self.queue, batched_shape, numpy.complex64)
    self.result_buffer = numpy.zeros(self.in_array.shape, numpy.complex64)

    self.kernel_center = tuple(
        [kernel_size[i] / 2 for i in xrange(len(kernel_size))])
    self.halves = tuple(
        [numpy.ceil(kernel_size[i] / 2.0) for i in xrange(len(kernel_size))])

    # (start, stop) offset pair per axis for placing the input in the
    # padded array.
    pad = []
    for i in xrange(len(self.sizes)):
        if kernel_size[i] % 2 == 0:
            # without this if even kernels result in an incorrect edge in
            # the result
            pad.append(-1 * ((in_size[i] - self.sizes[i]) / 2))
            pad.append(-1 * ((self.sizes[i] - in_size[i]) / 2))
        else:
            pad.append((self.sizes[i] - in_size[i]) / 2)
            pad.append((in_size[i] - self.sizes[i]) / 2)
    self.padding_locations = tuple(pad)

    # (start, stop) pair per axis, derived from the padding start and the
    # half kernel size.
    valid = []
    for i in xrange(len(self.sizes)):
        base = self.padding_locations[(i * 2)] + self.halves[i]
        valid.append(base - 1)
        valid.append(base + in_size[i] - kernel_size[i])
    self.valid_locations = tuple(valid)

    # (start, stop) pair per axis; unlike the others this stays a list.
    self.full_locations = []
    for i in xrange(len(self.sizes)):
        offset = self.sizes[i] - (in_size[i] + kernel_size[i] - 1)
        self.full_locations.extend([offset / 2, -offset / 2])

    self.batch_size = batch_size
def clifftn(data):
    """Normalised inverse FFT of *data* computed with pyfft on OpenCL.

    Fix: the function referenced ``gpu_data`` without ever uploading
    ``data`` to the device. The upload is now done here, mirroring the
    forward-transform helper ``clfftn``.

    data -- host array; its shape defines the plan
    Returns the transformed array read back from the device.
    """
    clear_first_arg_caches()
    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)
    plan = Plan(data.shape, normalize=True, queue=queue)

    # Copy the input to the device; the transform runs in place on it.
    gpu_data = cl_array.to_device(queue, data)

    # Inverse transform:
    plan.execute(gpu_data.data, inverse=True)
    return gpu_data.get()
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):
    """FFT of *ocl_arr* written into *res_arr*, which is returned.

    The input must pass ``assert_bufs_type(np.complex64, ...)``. A missing
    plan is created for the input shape on the queue of ``get_device()``;
    a missing output is allocated as an empty complex64 ``OCLArray`` of
    the same shape.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    fft = plan if plan is not None else Plan(ocl_arr.shape,
                                             queue=get_device().queue)
    out = res_arr if res_arr is not None else OCLArray.empty(ocl_arr.shape,
                                                            np.complex64)

    fft.execute(ocl_arr.data, out.data, inverse=inverse)
    return out
def _ocl_fft_numpy(arr, inverse=False, plan=None):
    """FFT of a host array via the GPU; returns a host array.

    The input is converted to complex64 if needed (a log message is
    emitted in that case), uploaded, transformed in place and read back.
    """
    fft = plan
    if fft is None:
        fft = Plan(arr.shape, queue=get_device().queue)

    needs_cast = arr.dtype != np.complex64
    if needs_cast:
        logger.info("converting %s to complex64, might slow things down..." % arr.dtype)

    # copy=False avoids a copy when the dtype already matches
    as_complex = arr.astype(np.complex64, copy=False)
    device_arr = OCLArray.from_array(as_complex)
    _ocl_fft_gpu_inplace(device_arr, inverse=inverse, plan=fft)
    return device_arr.get()
def clfftn(data):
    """Forward FFT of *data* on OpenCL (pyfft), normalised plan.

    The context and queue come from ``clinit()``; the array is uploaded,
    transformed in place and copied back to the host.
    """
    clear_first_arg_caches()
    ctx, queue = clinit()
    fft = Plan(data.shape, normalize=True, queue=queue)

    # Upload, then transform in place on the device
    device_data = cl_array.to_device(queue, data)
    fft.execute(device_data.data)

    return device_data.get()
def _fft_2(data, inverse=False, plan=None, queue=None, block=True):
    """Run an FFT on *data* after converting it to a complex pyopencl array.

    If no *plan* is passed, plans are cached in ``cfg.OPENCL.fft_plans``
    per queue and per shape; *queue* falls back to ``cfg.OPENCL.queue``.
    *block* is forwarded as ``wait_for_finish``. The (possibly retyped)
    device array is returned.
    """
    data = g_util.get_array(data, queue=queue)
    cplx = cfg.PRECISION.np_cplx
    if data.dtype != cplx:
        data = data.astype(cplx)

    if not plan:
        queue = queue or cfg.OPENCL.queue
        per_queue = cfg.OPENCL.fft_plans.setdefault(queue, {})
        shape = data.shape
        if shape not in per_queue:
            LOG.debug('Creating FFT Plan for {} and shape {}'.format(queue, shape))
            per_queue[shape] = Plan(shape, dtype=cfg.PRECISION.np_cplx,
                                    queue=queue)
        plan = per_queue[shape]

    LOG.debug('fft_2, shape: %s, inverse: %s', data.shape, inverse)
    plan.execute(data.data, inverse=inverse, wait_for_finish=block)

    return data
# Benchmark fragment (Python 2 print statements): times the OpenCL Gaussian
# filter, a numpy-based filter + IFFT, a plain pyfft IFFT and numpy's ifftn,
# then compares the last two. `ksp`, `ctx`, `queue`, `procpar`, `dims`,
# `hdr`, `args`, `tic` and `toc` are defined earlier in the script.
imggauss = kspacegaussian_filter_CL(ksp, np.ones(3), ctx)
print 'PyFFT +OpenCL Gaussian filter:'
toc()
tic()
print 'Complex K-space filter + Numpy IFFT'
kspgauss2 = KSP.kspacegaussian_filter2(ksp, 1)
image_filtered = simpleifft(procpar, dims, hdr, kspgauss2, args)
toc()
# PYFFT
tic()
#ctx = cl.create_some_context(interactive=False)
#queue = cl.CommandQueue(ctx)
# The plan shape is fixed at 512^3 -- assumes ksp has that shape (TODO confirm)
w = h = k = 512
plan = Plan((w, h, k), normalize=True, queue=queue)
gpu_data = cl_array.to_device(queue, ksp)
# In-place inverse transform on the device buffer
plan.execute(gpu_data.data, inverse=True)
result = gpu_data.get()
toc()
result = np.fft.fftshift(result)
print "PyFFT OpenCL IFFT time and first three results:"
# NOTE(review): toc() is called a second time here, after the fftshift
print "%s sec, %s" % (toc(), str(np.abs(result[:3, 0, 0])))
tic()
reference = np.fft.fftshift(np.fft.ifftn(ksp))
print "Numpy IFFTN time and first three results:"
print "%s sec, %s" % (toc(), str(np.abs(reference[:3, 0, 0])))
print "Calulating L1 norm "
# np.linalg.norm with default arguments is the 2-norm of the flattened
# array, so this prints a relative L2 error despite the label above.
print np.linalg.norm(result - reference) / np.linalg.norm(reference)
def gs_gpu(idata, itera=100):
    """Gerchberg-Saxton algorithm to calculate DOEs using the GPU

    Computes the phase distribution in the object plane that yields a
    given amplitude distribution in the target plane. Field propagation
    is done with an FFT, and the wavefront at the DOE plane is assumed to
    be a plane wave. Iteration stops early as soon as the error grows
    compared with the previous pass.

    **ARGUMENTS:**

        ========== ======================================================
        idata      numpy array containing the target amplitude distribution
        itera      Maximum number of iterations
        ========== ======================================================

    Returns a complex array ``exp(1j * phase)``.
    """
    platform = cl.get_platforms()[0]
    devices = platform.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    # Original note (translated from Spanish): 'does not work with "complex128"'
    plan = Plan(idata.shape, queue=queue, dtype=complex128)

    has_double = all(has_double_support(dev) for dev in devices)
    has_amd_double = all(has_amd_double_support(dev) for dev in devices)
    src = str(Template(KERNEL).render(double_support=has_double,
                                      amd_double_support=has_amd_double))
    prg = cl.Program(ctx, src).build()

    idata_gpu = cl_array.to_device(queue,
                                   ifftshift(idata).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data, fdata_gpu.data)

    previous_error = 1000
    for _ in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)

        # TODO: This calculation should be done in the GPU
        rdata = ifftshift(rdata_gpu.get())
        error = (abs(rdata) - idata).std()
        if error > previous_error:
            break
        previous_error = error

        prg.norm2(queue, rdata_gpu.shape, None, rdata_gpu.data,
                  idata_gpu.data)
        plan.execute(rdata_gpu.data, fdata_gpu.data)

    shifted = ifftshift(fdata_gpu.get())
    return exp(1.j * angle(shifted))
def fft_plan(shape):
    """Return a pyfft OpenCL plan for *shape*.

    The plan is bound to the queue of the device returned by
    ``get_device()``.
    """
    device_queue = get_device().queue
    return Plan(shape, queue=device_queue)
def myfunc(d_g):
    """Run a forward and then an inverse in-place FFT on the device array *d_g*."""
    from pyfft.cl import Plan
    from gputools import get_device

    fft = Plan(d_g.shape, queue=get_device().queue, fast_math=True)

    # forward, input buffer doubles as output
    fft.execute(d_g.data, d_g.data)
    # and back again
    fft.execute(d_g.data, d_g.data, inverse=True)
# Module-level setup: sample-rate constants, an OpenCL FFT plan and filter
# coefficients. `freq` and `sps` are defined earlier in the file
# (presumably freq is in MHz and sps is scipy.signal -- confirm).
freq_hz = freq * 1000000.0
# Length passed to the FFT plan below
blocklen = (32 * 1024)
# bring up OpenCL
from pyfft.cl import Plan
import numpy
import pyopencl as cl
import pyopencl.array as clarray
import pyopencl.clmath as clmath
ctx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(ctx)
# 1-D complex64 plan of length blocklen on the queue created above
plan = Plan(blocklen, queue=queue, dtype=numpy.complex64)
# Low-pass coefficients; the cutoff 4.5 is expressed relative to freq/2
lowpass_filter_b, lowpass_filter_a = sps.butter(8, (4.5/(freq/2)), 'low')
# stubs for later
f_deemp_b = []
f_deemp_a = []
# default deemp constants
deemp_t1 = .825
deemp_t2 = 2.35
# audio filters
Baudiorf = sps.firwin(65, 3.5 / (freq / 2), window='hamming', pass_zero=True)
afreq = freq / 4
def __init__(self, k0, nx, ny, h, d=None, l=10, dz=None, w=0.39,
             propcorr=None, phasetol=None, context=None):
    '''
    Set up a split-step engine on an nx-by-ny grid with isotropic step
    size h and unitless wave number k0. The wave is advanced in steps of
    dz, or of h when dz is not given.

    d, when given, is a 4-tuple (dx, dy, dz, w) describing source
    directivity: (dx, dy, dz) is the directivity axis and w the beam
    width parameter. Without it all sources are point sources.

    l, when greater than zero, is the width of a Hann window used to
    attenuate the field along each edge.

    w enters as the multiplier 1 / w**2 governing the high-order spectral
    cross term.

    propcorr, when given, is a tuple of Booleans (hospec, hospat) that
    selects whether high-order spectral and spatial corrections,
    respectively, are used in the propagator by default. Both are used
    when it is omitted.

    phasetol is the maximum permissible phase deviation (in fractions of
    pi), relative to propagation through the homogeneous background,
    incurred by propagating through the inhomogeneous medium. The number
    of steps per slab is adjusted so that the phase shift for the most
    extreme sound speeds never exceeds phasetol.

    Raises ValueError when phasetol is given with magnitude below 1e-6.
    '''
    # A tiny tolerance would make the number of propagation steps blow up
    if phasetol is not None and abs(phasetol) < 1e-6:
        raise ValueError('Phase tolerance must be greater than 1e-6')

    # Plain parameter copies
    self.grid = nx, ny
    self.h, self.k0, self.l = h, k0, l
    self.w = np.float32(1. / w**2)
    self.phasetol = phasetol

    # Corrective terms default to both enabled
    self.propcorr = (True, True) if propcorr is None else tuple(propcorr)

    # Step length falls back to the grid spacing
    self.dz = dz or h

    # Use the supplied context or a default one
    self.context = util.grabcontext(context)

    # Render and build the OpenCL program for this context
    template = Template(filename=self._kernel, output_encoding='ascii')
    source = template.render(grid=self.grid, k0=k0, h=h, d=d, l=l)
    self.prog = cl.Program(self.context, source).build()

    # One queue for forward propagation, two for transfers
    self.fwdque = cl.CommandQueue(self.context)
    self.recvque = cl.CommandQueue(self.context)
    self.sendque = cl.CommandQueue(self.context)

    # FFT plan on the propagation queue; axes are reordered to conform
    # with row-major ordering
    self.fftplan = Plan((ny, nx), queue=self.fwdque)

    grid = self.grid
    # Every device buffer holds one complex64 value per grid point
    buffer_bytes = cutil.prod(grid) * np.complex64().nbytes

    def make_buffer():
        return util.SyncBuffer(self.context, cl.mem_flags.READ_WRITE,
                               size=buffer_bytes)

    # Propagating (twice) and backward fields
    self.fld = [make_buffer() for _ in range(3)]
    # Scratch space used during computations
    self.scratch = [make_buffer() for _ in range(3)]
    # The index of refraction gets two buffers for transmission
    self.obj = [make_buffer() for _ in range(2)]
    # Two buffers for the Goertzel FFT of the contrast source
    self.goertzbuf = [make_buffer() for _ in range(2)]
    # Sound speed extrema for the current slab
    self.speedlim = [1., 1.]
    # Holds results of advance()
    self.result = make_buffer()

    # By default, volume fields will be transferred from the device
    self._goertzel = False

    # Initialize refractive index and fields
    self.reset()

    # By default, device exchange happens on the full grid
    self.rectxfer = util.RectangularTransfer(grid, grid, np.complex64,
                                             alloc_host=False)
def fft_plan(shape, **kwargs):
    """Return a pyfft OpenCL plan for *shape*.

    The plan is bound to the queue of the device returned by
    ``get_device()``; any extra keyword arguments are passed straight
    through to ``pyfft.cl.Plan``.
    """
    device_queue = get_device().queue
    return Plan(shape, queue=device_queue, **kwargs)