def prepare_video_cuda():
    # Things go *a lot* faster when you have the memory structures pre-allocated
    cs['plan1'] = fft.Plan(blocklen, np.float32, np.complex64)
    cs['plan1i'] = fft.Plan(blocklen, np.complex64, np.complex64)

    cs['fft1_out'] = gpuarray.empty((blocklen // 2) + 1, np.complex64)
    cs['filtered1'] = gpuarray.empty(blocklen, np.complex64)

    cs['fm_demod'] = gpuarray.empty(blocklen, np.float32)
    cs['postlpf'] = gpuarray.empty(blocklen, np.float32)

    cs['fft2_out'] = gpuarray.empty((blocklen // 2) + 1, np.complex64)
    cs['clipped_gpu'] = gpuarray.empty(blocklen, np.uint16)

    cs['plan2'] = fft.Plan(blocklen, np.float32, np.complex64)
    cs['plan2i'] = fft.Plan(blocklen, np.complex64, np.float32)

    # CUDA functions.  The fewer setups we need, the faster it goes.
    cs['doclamp16'] = mod.get_function("clamp16")
    cs['doanglediff'] = mod.get_function("anglediff")

    # GPU-stored frequency-domain filters
    cs['filt_post'] = FFTtoGPU(SysParams['fft_post'])
    cs['filt_video'] = FFTtoGPU(SysParams['fft_video'])
    cs['filt_video_inner'] = FFTtoGPU(SysParams['fft_video_inner'])
def prepare_audio_cuda():
    cs['plan1'] = fft.Plan(blocklen, np.float32, np.complex64)
    cs['plan1i'] = fft.Plan(ablocklen, np.complex64, np.complex64)

    cs['fft1_out'] = gpuarray.empty(blocklen, np.complex64)
    cs['ifft1_out'] = gpuarray.empty(ablocklen, np.complex64)

    cs['fm_left'] = gpuarray.empty(ablocklen, np.complex64)
    cs['fm_right'] = gpuarray.empty(ablocklen, np.complex64)

    cs['left_clipped'] = gpuarray.empty(ablocklen, np.float32)
    cs['right_clipped'] = gpuarray.empty(ablocklen, np.float32)

    cs['left_fft1'] = gpuarray.empty(blocklen // 2 + 1, np.complex64)
    cs['right_fft1'] = gpuarray.empty(blocklen // 2 + 1, np.complex64)

    cs['left_fft2'] = gpuarray.empty(ablocklen // 2 + 1, np.complex64)
    cs['right_fft2'] = gpuarray.empty(ablocklen // 2 + 1, np.complex64)

    cs['left_out'] = gpuarray.empty(ablocklen, np.float32)
    cs['right_out'] = gpuarray.empty(ablocklen, np.float32)

    cs['plan2'] = fft.Plan(ablocklen, np.float32, np.complex64)
    cs['plan2i'] = fft.Plan(ablocklen, np.complex64, np.float32)

    cs['outlen'] = outlen = ablocklen // 20
    cs['scaledout'] = gpuarray.empty(outlen * 2, np.float32)
    cs['left_scaledout'] = gpuarray.empty(outlen, np.float32)
    cs['right_scaledout'] = gpuarray.empty(outlen, np.float32)

    cs['doanglediff_mac'] = mod.get_function("anglediff_mac")
    cs['doaudioscale'] = mod.get_function("audioscale")

    cs['filt_audiolpf'] = FFTtoGPU(SysParams['fft_audiolpf'])
    cs['filt_audio_left'] = FFTtoGPU(SysParams['fft_audio_left'])
    cs['filt_audio_right'] = FFTtoGPU(SysParams['fft_audio_right'])
def cufft_conv(x, y):
    x = x.astype(np.complex64)
    y = y.astype(np.complex64)

    if (x.shape != y.shape):
        return -1

    plan = fft.Plan(x.shape, np.complex64, np.complex64)
    inverse_plan = fft.Plan(x.shape, np.complex64, np.complex64)

    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)

    x_fft = gpuarray.empty_like(x_gpu, dtype=np.complex64)
    y_fft = gpuarray.empty_like(y_gpu, dtype=np.complex64)
    out_gpu = gpuarray.empty_like(x_gpu, dtype=np.complex64)

    fft.fft(x_gpu, x_fft, plan)
    fft.fft(y_gpu, y_fft, plan)

    linalg.multiply(x_fft, y_fft, overwrite=True)

    fft.ifft(y_fft, out_gpu, inverse_plan, scale=True)

    conv_out = out_gpu.get()

    x_gpu.gpudata.free()
    y_gpu.gpudata.free()
    x_fft.gpudata.free()
    y_fft.gpudata.free()
    out_gpu.gpudata.free()

    return conv_out
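# --- Hypothetical usage sketch (illustration only) ---
# A minimal check of cufft_conv against a NumPy FFT reference. It assumes a
# working pycuda/scikit-cuda install and that the module-level names used by
# cufft_conv (np, fft, gpuarray, linalg) are bound as below, with
# linalg.init() called after pycuda.autoinit has created the CUDA context.
import numpy as np
import pycuda.autoinit  # noqa: F401  (creates the CUDA context)
import pycuda.gpuarray as gpuarray
import skcuda.fft as fft
import skcuda.linalg as linalg


def demo_cufft_conv():
    linalg.init()
    a = np.random.rand(1024)
    b = np.random.rand(1024)
    gpu_out = cufft_conv(a, b)  # circular convolution computed on the GPU
    cpu_out = np.fft.ifft(np.fft.fft(a) * np.fft.fft(b))
    assert np.allclose(gpu_out, cpu_out, atol=1e-3)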
def propagate_eager(self, wavelength, wavefront):
    """
    'Not-Too-Good' version of the propagation on the GPU
    (lots of Memory issues...)

    Remove in the future

    :param wavelength:
    :param wavefront:
    :return:
    """
    N = self.N_PIX

    free, total = cuda.mem_get_info()
    print("Free: %.2f percent" % (free / total * 100))

    # Pupil Plane -> Image Slicer
    complex_pupil = self.pupil_masks[wavelength] * np.exp(
        1j * 2 * np.pi * self.pupil_masks[wavelength] / wavelength)
    complex_pupil_gpu = gpuarray.to_gpu(np.asarray(complex_pupil, np.complex64))
    plan = cu_fft.Plan(complex_pupil_gpu.shape, np.complex64, np.complex64)
    cu_fft.fft(complex_pupil_gpu, complex_pupil_gpu, plan, scale=True)

    # Add N_slices copies to be Masked
    complex_slicer_cpu = complex_pupil_gpu.get()
    complex_pupil_gpu.gpudata.free()
    free, total = cuda.mem_get_info()
    print("*Free: %.2f percent" % (free / total * 100))

    complex_slicer_cpu = np.stack([complex_slicer_cpu] * self.N_slices)
    complex_slicer_gpu = gpuarray.to_gpu(complex_slicer_cpu)
    slicer_masks_gpu = gpuarray.to_gpu(self.slicer_masks_fftshift)
    clinalg.multiply(slicer_masks_gpu, complex_slicer_gpu, overwrite=True)
    slicer_masks_gpu.gpudata.free()
    free, total = cuda.mem_get_info()
    print("**Free: %.2f percent" % (free / total * 100))

    # Slicer -> Pupil Mirror
    plan = cu_fft.Plan((N, N), np.complex64, np.complex64, self.N_slices)
    cu_fft.ifft(complex_slicer_gpu, complex_slicer_gpu, plan, scale=True)
    mirror_mask_gpu = gpuarray.to_gpu(self.pupil_mirror_masks_fft)
    clinalg.multiply(mirror_mask_gpu, complex_slicer_gpu, overwrite=True)

    # Pupil Mirror -> Slits
    cu_fft.fft(complex_slicer_gpu, complex_slicer_gpu, plan)
    slits = complex_slicer_gpu.get()
    complex_slicer_gpu.gpudata.free()
    mirror_mask_gpu.gpudata.free()
    slit = fftshift(np.sum((np.abs(slits)) ** 2, axis=0))

    free, total = cuda.mem_get_info()
    print("***Free: %.2f percent" % (free / total * 100))

    return slit
def __init__(self, nx, ny):
    shapeX = [ny, nx]
    shapeK = [ny, nx // 2 + 1]

    self.shapeX = shapeX
    self.shapeK = shapeK

    self.fftplan = skfft.Plan(self.shapeX, np.float32, np.complex64)
    self.ifftplan = skfft.Plan(self.shapeX, np.complex64, np.float32)

    self.coef_norm = nx * ny
def __init__(self, mesh, context=None):
    '''
    Args:
        mesh    The mesh on which the solver will operate.
                The dimensionality is deduced from mesh.dimension.
    '''
    # create the mesh grid and compute the Green's function on it
    self.mesh = mesh
    self._context = context
    mesh_shape = self.mesh.shape  # nz, ny, (nx)
    mesh_shape2 = [2 * n for n in mesh_shape]  # 2*nz, 2*ny, (2*nx)
    mesh_distances = list(reversed(self.mesh.distances))  # dz, dy, dx
    self.fgreentr = gpuarray.empty(mesh_shape2, dtype=np.complex128)
    self.tmpspace = gpuarray.zeros_like(self.fgreentr)
    sizeof_complex = np.dtype(np.complex128).itemsize

    # dimensionality function dispatch
    dim = self.mesh.dimension
    self._fgreen = getattr(self, '_fgreen' + str(dim) + 'd')
    self._mirror = getattr(self, '_mirror' + str(dim) + 'd')
    copy_fn = {'3d': get_Memcpy3D_d2d, '2d': get_Memcpy2D_d2d}
    memcpy_nd = copy_fn[str(dim) + 'd']
    dim_args = self.mesh.shape

    self._cpyrho2tmp = memcpy_nd(
        src=None, dst=self.tmpspace,  # None because src (rho) is not yet known
        src_pitch=self.mesh.nx * sizeof_complex,
        dst_pitch=2 * self.mesh.nx * sizeof_complex,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=self.mesh.ny,
        dst_height=2 * self.mesh.ny)
    self._cpytmp2rho = memcpy_nd(
        src=self.tmpspace, dst=None,  # None because dst (rho) is not yet known
        src_pitch=2 * self.mesh.nx * sizeof_complex,
        dst_pitch=self.mesh.nx * sizeof_complex,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=2 * self.mesh.ny,
        dst_height=self.mesh.ny)

    mesh_arr = [-mesh_distances[i] / 2
                + np.arange(mesh_shape[i] + 1) * mesh_distances[i]
                for i in xrange(self.mesh.dimension)]  # mesh_arr is [mz, my, mx]
    mesh_grids = np.meshgrid(*mesh_arr, indexing='ij')
    fgreen = self._fgreen(*mesh_grids)
    fgreen = self._mirror(fgreen)

    self.plan_forward = cu_fft.Plan(self.tmpspace.shape,
                                    in_dtype=np.complex128,
                                    out_dtype=np.complex128)
    self.plan_backward = cu_fft.Plan(self.tmpspace.shape,
                                     in_dtype=np.complex128,
                                     out_dtype=np.complex128)
    cu_fft.fft(gpuarray.to_gpu(fgreen), self.fgreentr, plan=self.plan_forward)
def set_focal_series(self, fs, defoci=None):
    """
    Setting the focal series also sets important parameters such as the
    padding and defocus values.

    Parameters
    ----------
    fs : Hyperspy Image
        Mandatory! A focal series with all the appropriate parameters.
        Padding is read from metadata.
    defoci : array
        Optional, use it to explicitly set defocus values. Must have a
        dimension equal to the navigation axis of fs.
    """
    # Real space parameters
    Nz, Ny, Nx = fs.data.shape

    if defoci is None:
        defoci = fs.axes_manager.navigation_axes[0].axis

    Nz = len(defoci)
    self.zdim = defoci
    self.k2 = fs.get_fourier_space()
    self.Nz = Nz

    # In the middle of the focal series is the reference plane
    Nz_half = np.ceil(Nz / 2).astype('int') - 1

    # Pad mask
    Npy, Npx = fs.metadata.Signal.pad_tuple
    mask = np.zeros((Nz, Ny, Nx), dtype=np.bool)
    mask[:, Npy[0]:(Ny - Npy[1]), Npx[0]:(Nx - Npx[1])] = True

    # Set some parameters
    self.new_wave = fs._get_signal_signal()
    self.new_wave.metadata = fs.metadata.deepcopy()
    self.shape = (Nz, Ny, Nx)

    # Allocate things ...
    if self.using_gpu:
        # ... in GPU!
        self.Iexp = to_gpu_f(fs.data)
        self.mask = to_gpu_b(mask)
        # - The plans for FFT
        self.pft3dcc = cu_fft.Plan((Ny, Nx), np.complex64, np.complex64, Nz)
        self.pft2dcc = cu_fft.Plan((Ny, Nx), np.complex64, np.complex64)
    else:
        # ... in CPU!
        self.Iexp = fs.data.astype('complex64')
        self.mask = mask
def filter(self):
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    import skcuda.linalg as linalg
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context

    cuda.init()
    context = make_default_context()
    device = context.get_device()

    signal = self.series[0]
    window = self.series[1]

    linalg.init()

    nfft = determine_size(len(signal) + len(window) - 1)

    # Move data to GPU
    sig_zero_pad = np.zeros(nfft, dtype=self.precision['float'])
    win_zero_pad = np.zeros(nfft, dtype=self.precision['float'])
    sig_gpu = gpuarray.zeros(sig_zero_pad.shape, dtype=self.precision['float'])
    win_gpu = gpuarray.zeros(win_zero_pad.shape, dtype=self.precision['float'])
    sig_zero_pad[0:len(signal)] = signal
    win_zero_pad[0:len(window)] = window
    sig_gpu.set(sig_zero_pad)
    win_gpu.set(win_zero_pad)

    # Plan forwards
    sig_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex'])
    win_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex'])
    sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape, self.precision['float'],
                                   self.precision['complex'])
    win_plan_forward = cu_fft.Plan(win_fft_gpu.shape, self.precision['float'],
                                   self.precision['complex'])
    cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
    cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

    # Convolve
    out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
    linalg.scale(2.0, out_fft)

    # Plan inverse
    out_gpu = gpuarray.zeros_like(out_fft)
    plan_inverse = cu_fft.Plan(out_fft.shape, self.precision['complex'],
                               self.precision['complex'])
    cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)
    out_np = np.zeros(len(out_gpu), self.precision['complex'])
    out_gpu.get(out_np)

    context.pop()

    return out_np
def cuda_efftn(H, axes, forward):
    hShape = H.shape
    hDim = len(hShape)
    fftDim = len(axes)

    # Reshape 'axes' to be the array's end dimensions and ensure contiguity
    H = np.ascontiguousarray(
        np.moveaxis(H, axes, np.arange(hDim - fftDim, hDim, 1)))

    # Calculate number of batches
    batchSize = 1
    for i in range(hDim - fftDim):
        batchSize *= H.shape[i]

    # Reshape to accommodate batching
    H = np.reshape(
        H, (batchSize, H.shape[hDim - 3], H.shape[hDim - 2], H.shape[hDim - 1]))

    # Pass array to the GPU and perform the FFT/iFFT on each batch
    H_gpu = gpuarray.to_gpu(H)
    plan = skfft.Plan(H_gpu.shape[1:fftDim + 1], H.dtype, H.dtype, H_gpu.shape[0])
    if forward:
        skfft.fft(H_gpu, H_gpu, plan)
    else:
        skfft.ifft(H_gpu, H_gpu, plan, True)

    # Reshape to original dimensions
    H = np.moveaxis(H_gpu.get(), 0, fftDim)
    H = np.reshape(H, hShape)

    return H
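# --- Hypothetical usage sketch (illustration only) ---
# Checks cuda_efftn against numpy.fft.fftn for a 3-D complex64 array with all
# three axes transformed, assuming the surrounding module binds np to numpy,
# gpuarray to pycuda.gpuarray and skfft to skcuda.fft, and that a CUDA
# context exists (e.g. via pycuda.autoinit).
def demo_cuda_efftn():
    H = (np.random.rand(8, 8, 8) + 1j * np.random.rand(8, 8, 8)).astype(np.complex64)
    Hf = cuda_efftn(H, [0, 1, 2], True)  # forward transform over all axes
    assert np.allclose(Hf, np.fft.fftn(H), rtol=1e-3, atol=1e-2)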
def ifft2_gpu(y, fftshift=False):
    """
    C2C inverse FFT compatible with numpy.fft.ifft2.
    The input y is a 2D complex numpy array.
    """
    # Get the shape of the initial numpy array
    n1, n2 = y.shape

    # From numpy array to GPUarray. For R2C only the first n2/2+1 non-redundant
    # FFT coefficients would be needed; for C2C the input and output have the
    # same dimensions.
    # if fftshift is False:
    #     y2 = np.asarray(y[:, 0:n2//2+1], np.complex64)
    # else:
    #     y2 = np.asarray(np.fft.ifftshift(y)[:, 0:n2//2+1], np.complex64)
    if fftshift:
        y2 = np.fft.ifftshift(y)
    else:
        y2 = y
    ygpu = gpuarray.to_gpu(y2)

    # Initialise empty output GPUarray
    x = gpuarray.empty((n1, n2), np.complex128)

    # Inverse FFT
    plan_backward = cu_fft.Plan((n1, n2), np.complex128, np.complex128)
    cu_fft.ifft(ygpu, x, plan_backward)

    # Must divide by the total number of pixels in the image to get the
    # normalisation right
    xout = x.get() / n1 / n2

    return xout
def ifft2_gpu(y, fftshift=False):
    '''This function produces an output compatible with numpy.fft.ifft2 for
    inputs with the Hermitian symmetry of a real image's FFT.
    The input y is a 2D complex numpy array.'''

    # Convert the input array to complex64
    if y.dtype != 'complex64':
        y = y.astype('complex64')

    # Get the shape of the initial numpy array
    n1, n2 = y.shape

    # From numpy array to GPUarray. Take only the first n2/2+1 non-redundant FFT coefficients
    if fftshift is False:
        y2 = np.asarray(y[:, 0:n2 // 2 + 1], np.complex64)
    else:
        y2 = np.asarray(np.fft.ifftshift(y)[:, :n2 // 2 + 1], np.complex64)
    ygpu = gpuarray.to_gpu(y2)

    # Initialise empty output GPUarray
    x = gpuarray.empty((n1, n2), np.float32)

    # Inverse FFT
    plan_backward = cu_fft.Plan((n1, n2), np.complex64, np.float32)
    cu_fft.ifft(ygpu, x, plan_backward)

    # Must divide by the total number of pixels in the image to get the normalisation right
    xout = x.get() / n1 / n2

    return xout
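# --- Hypothetical usage sketch (illustration only) ---
# Round-trips a real image through numpy.fft.fft2 and the C2R ifft2_gpu
# directly above, assuming np, gpuarray and cu_fft are bound at module level
# (numpy, pycuda.gpuarray, skcuda.fft) and a CUDA context exists.  The C2R
# path only reproduces numpy.fft.ifft2 when the input has the Hermitian
# symmetry of a real image's FFT, as it does here.
def demo_ifft2_gpu():
    img = np.random.rand(256, 256)
    y = np.fft.fft2(img)
    rec = ifft2_gpu(y)  # real-valued reconstruction of img
    assert np.allclose(rec, img, atol=1e-3)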
def setup_mesh(self, mesh):
    '''Create the meshgrid, compute and store the integrated Green's function
    from the mesh distances.
    Only accepts meshes with the same shape as self.mesh.
    '''
    assert (mesh.shape == self.mesh.shape)
    self.mesh = mesh
    mesh_arr = [
        -mesh.distances[i] / 2
        + np.arange(mesh.shape_r[i] + 1.) * mesh.distances[i]
        for i in [1, 0]
    ]  # mesh_arr is [my, mx]
    mesh_grids = np.meshgrid(*mesh_arr, indexing='ij')  # choose my, mx
    fgreen2 = self._fgreen(*mesh_grids)
    fgreen2 = self._mirror(fgreen2)
    # Tiling in the 3rd dimension leads to a MemoryError (uses a huge amount
    # of memory!):
    # fgreen = np.tile(fgreen, (mesh.nz, 2*mesh.ny, 2*mesh.nx))
    if self.save_memory:
        plan_2d = cu_fft.Plan([2 * self.mesh.ny, 2 * self.mesh.nx],
                              in_dtype=np.complex128,
                              out_dtype=np.complex128)
        cu_fft.fft(gpuarray.to_gpu(fgreen2), self.fgreentr, plan=plan_2d)
    else:
        fgreen = np.empty(shape=(mesh.nz, 2 * mesh.ny, 2 * mesh.nx),
                          dtype=np.complex128)
        for nn in range(mesh.nz):
            fgreen[nn, :, :] = fgreen2
        cu_fft.fft(gpuarray.to_gpu(fgreen), self.fgreentr, plan=self.plan_forward)
def do_ffts(img):
    if img.dtype != 'float32':
        img = img.astype('float32')

    sx, sy = img.shape  # or the convention (width x height) vs (rows x columns)
                        # is sure to bite you

    ## Prepare and run CUDA FFT on image
    # See https://www.idtools.com.au/gpu-accelerated-fft-compatible-numpy/
    time0 = time.time()

    # Initialise CUDA input GPUArray
    x_gpu = gpuarray.to_gpu(img)

    # Initialise output GPUarray
    # N/2+1 non-redundant coefficients of a length-N input signal
    y_gpu = gpuarray.empty((sx, sy // 2 + 1), np.complex64)

    # Plan and run the CUDA FFT
    plan_fft = cu_fft.Plan((sx, sy), np.float32, np.complex64)
    cu_fft.fft(x_gpu, y_gpu, plan_fft)
    gpu_fft = y_gpu.get()

    gpu_time = time.time() - time0
    print(f'GPU FFT preparation, execution and retrieval in {gpu_time:6.4f} s')

    ## Run the NumPy FFT
    time0 = time.time()
    cpu_fft = np.fft.fft2(img)
    cpu_time = time.time() - time0
    print(f'NumPy CPU FFT in {cpu_time:6.4f} s')

    return (gpu_fft, cpu_fft, gpu_time, cpu_time)
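# --- Hypothetical usage sketch (illustration only) ---
# Runs do_ffts on a random image and checks that the GPU R2C result matches
# the non-redundant half of the full NumPy FFT, assuming the module-level
# names used by do_ffts (np, time, gpuarray, cu_fft) are bound to numpy,
# time, pycuda.gpuarray and skcuda.fft, and that a CUDA context exists.
def demo_do_ffts():
    img = np.random.rand(256, 256).astype(np.float32)
    gpu_fft, cpu_fft, gpu_time, cpu_time = do_ffts(img)
    # Only the first sy//2 + 1 columns of the full CPU FFT are non-redundant.
    assert np.allclose(gpu_fft, cpu_fft[:, :256 // 2 + 1], rtol=1e-3, atol=1e-1)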
def __init__(self, mesh, context=None, save_memory=True):
    '''
    Args:
        mesh         The mesh on which the solver will operate.
                     The dimensionality is deduced from mesh.dimension.
        save_memory  Decide whether to store all slices of the transformed
                     Green's function (more memory but faster) or only one
                     slice (saves memory but slower, default).
    '''
    # create the mesh grid and compute the Green's function on it
    if (mesh.dimension != 3):
        print('Error: Use a 3d mesh for the 2.5d algorithm! Abort.')
        return None
    self.is_25D = True
    self.save_memory = save_memory
    self.mesh = mesh
    self._context = context
    nz, ny, nx = mesh.shape
    mesh_shape2 = [2 * n for n in mesh.shape]  # 2*nz, 2*ny, (2*nx)
    if save_memory:
        self.fgreentr = gpuarray.empty((2 * ny, 2 * nx), dtype=np.complex128)
        self._solve_kernel = self._solve_kernel_slow
    else:
        self.fgreentr = gpuarray.empty((nz, 2 * ny, 2 * nx), dtype=np.complex128)
        self._solve_kernel = self._solve_kernel_fast
    self.tmpspace = gpuarray.zeros((nz, 2 * ny, 2 * nx), dtype=np.complex128)
    sizeof_complex = np.dtype(np.complex128).itemsize

    # dimensionality function dispatch
    self._fgreen = getattr(self, '_fgreen25d')
    self._mirror = getattr(self, '_mirror2d')
    # copy_fn = {'3d': get_Memcpy3D_d2d, '2d': get_Memcpy2D_d2d}
    memcpy_nd = get_Memcpy3D_d2d
    # memcpy_nd = copy_fn[str(dim) + 'd']
    dim_args = mesh.shape

    self._cpyrho2tmp = memcpy_nd(
        src=None, dst=self.tmpspace,  # None because src (rho) is not yet known
        src_pitch=mesh.nx * sizeof_complex,
        dst_pitch=2 * mesh.nx * sizeof_complex,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=mesh.ny,
        dst_height=2 * mesh.ny)
    self._cpytmp2rho = memcpy_nd(
        src=self.tmpspace, dst=None,  # None because dst (rho) is not yet known
        src_pitch=2 * mesh.nx * sizeof_complex,
        dst_pitch=mesh.nx * sizeof_complex,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=2 * mesh.ny,
        dst_height=mesh.ny)

    self.plan_forward = cu_fft.Plan([2 * mesh.ny, 2 * mesh.nx],
                                    in_dtype=np.complex128,
                                    out_dtype=np.complex128,
                                    batch=mesh.nz)
    self.plan_backward = self.plan_forward
    self.setup_mesh(mesh)
def test_fft_float32_to_complex64_1d(self):
    x = np.asarray(np.random.rand(self.N), np.float32)
    xf = np.fft.rfftn(x)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
    plan = fft.Plan(x.shape, np.float32, np.complex64)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
def test_multiple_streams(self):
    x = np.asarray(np.random.rand(self.N), np.float32)
    xf = np.fft.rfftn(x)
    y = np.asarray(np.random.rand(self.N), np.float32)
    yf = np.fft.rfftn(y)
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)
    xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
    yf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
    stream0 = drv.Stream()
    stream1 = drv.Stream()
    plan1 = fft.Plan(x.shape, np.float32, np.complex64, stream=stream0)
    plan2 = fft.Plan(y.shape, np.float32, np.complex64, stream=stream1)
    fft.fft(x_gpu, xf_gpu, plan1)
    fft.fft(y_gpu, yf_gpu, plan2)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
    assert np.allclose(yf, yf_gpu.get(), atol=atol_float32)
def test_ifft_complex128_to_float64_1d(self):
    x = np.asarray(np.random.rand(self.N), np.float64)
    xf = np.asarray(np.fft.rfftn(x), np.complex128)
    xf_gpu = gpuarray.to_gpu(xf)
    x_gpu = gpuarray.empty(self.N, np.float64)
    plan = fft.Plan(x.shape, np.complex128, np.float64)
    fft.ifft(xf_gpu, x_gpu, plan, True)
    assert np.allclose(x, x_gpu.get(), atol=atol_float64)
def test_batch_fft_float32_to_complex64_2d(self):
    x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float32)
    xf = np.fft.rfftn(x, axes=(1, 2))
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty((self.B, self.N, self.M // 2 + 1), np.complex64)
    plan = fft.Plan([self.N, self.M], np.float32, np.complex64, batch=self.B)
    fft.fft(x_gpu, xf_gpu, plan)
    np.testing.assert_allclose(xf, xf_gpu.get(), atol=atol_float32)
def _get_inv_plan(itype, otype, outlen, batch=1):
    try:
        theplan = _reverse_plans[(itype, otype, outlen, batch)]
    except KeyError:
        theplan = cu_fft.Plan((outlen, ), itype, otype, batch=batch)
        # store the full key (including batch) so later lookups actually hit the cache
        _reverse_plans.update({(itype, otype, outlen, batch): theplan})

    return theplan
def _get_fwd_plan(itype, otype, inlen, batch=1):
    try:
        theplan = _forward_plans[(itype, otype, inlen, batch)]
    except KeyError:
        theplan = cu_fft.Plan((inlen, ), itype, otype, batch=batch)
        # store the full key (including batch) so later lookups actually hit the cache
        _forward_plans.update({(itype, otype, inlen, batch): theplan})

    return theplan
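# --- Hypothetical usage sketch (illustration only) ---
# Illustrates the plan caching above, assuming the surrounding module defines
# the cache dictionaries (_forward_plans = {}, _reverse_plans = {}), binds
# cu_fft to skcuda.fft, and has a CUDA context (e.g. via pycuda.autoinit).
def demo_plan_cache():
    import numpy as np
    p1 = _get_fwd_plan(np.float32, np.complex64, 4096)
    p2 = _get_fwd_plan(np.float32, np.complex64, 4096)
    assert p1 is p2  # second call with identical parameters reuses the cached plan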
def test_batch_fft_float64_to_complex128_1d(self):
    x = np.asarray(np.random.rand(self.B, self.N), np.float64)
    xf = np.fft.rfft(x, axis=1)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty((self.B, self.N // 2 + 1), np.complex128)
    plan = fft.Plan(x.shape[1], np.float64, np.complex128, batch=self.B)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
def filter_fft_cuda(signal: np.array, window: np.array, prec: dict):
    """
    Computes the low-pass filter using the pycuda/skcuda FFT method.
    Also auto-inits the pycuda library.

    :param signal: The input series
    :param window: The input window
    :param prec: The precision entry
    :return: The filtered signal
    """
    import pycuda.autoinit  # Here because it initialises a new cuda environment every trial.
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    import skcuda.linalg as linalg
    linalg.init()

    nfft = determine_size(len(signal) + len(window) - 1)

    # Move data to GPU
    sig_zero_pad = np.zeros(nfft, dtype=prec['float'])
    win_zero_pad = np.zeros(nfft, dtype=prec['float'])
    sig_gpu = gpuarray.zeros(sig_zero_pad.shape, dtype=prec['float'])
    win_gpu = gpuarray.zeros(win_zero_pad.shape, dtype=prec['float'])
    sig_zero_pad[0:len(signal)] = signal
    win_zero_pad[0:len(window)] = window
    sig_gpu.set(sig_zero_pad)
    win_gpu.set(win_zero_pad)

    # Plan forwards
    sig_fft_gpu = gpuarray.zeros(nfft, dtype=prec['complex'])
    win_fft_gpu = gpuarray.zeros(nfft, dtype=prec['complex'])
    sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape, prec['float'], prec['complex'])
    win_plan_forward = cu_fft.Plan(win_fft_gpu.shape, prec['float'], prec['complex'])
    cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
    cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

    # Convolve
    out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
    linalg.scale(2.0, out_fft)

    # Plan inverse
    out_gpu = gpuarray.zeros_like(out_fft)
    plan_inverse = cu_fft.Plan(out_fft.shape, prec['complex'], prec['complex'])
    cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)
    out_np = np.zeros(len(out_gpu), prec['complex'])
    out_gpu.get(out_np)

    return out_np
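# --- Hypothetical usage sketch (illustration only) ---
# Calls filter_fft_cuda with single-precision data, assuming the project
# helper determine_size() (which picks an FFT length of at least
# len(signal) + len(window) - 1) is available and a CUDA device is present.
# Because of the linalg.scale(2.0, ...) step above, the first
# len(signal) + len(window) - 1 samples of the complex output correspond to
# roughly twice the linear convolution of signal and window.
def demo_filter_fft_cuda():
    import numpy as np
    prec = {'float': np.float32, 'complex': np.complex64}
    signal = np.random.rand(10000).astype(np.float32)
    window = np.hanning(101).astype(np.float32)  # a simple low-pass window
    out = filter_fft_cuda(signal, window, prec)
    return out.real[:len(signal) + len(window) - 1]  # filtered series (see note above)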
def test_fft_float64_to_complex128_2d(self):
    x = np.asarray(np.random.rand(self.N, self.M), np.float64)
    xf = np.fft.rfftn(x)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty((self.N, self.M // 2 + 1), np.complex128)
    plan = fft.Plan(x.shape, np.float64, np.complex128)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
def test_work_area(self):
    x = np.asarray(np.random.rand(self.N), np.float32)
    xf = np.fft.rfftn(x)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
    plan = fft.Plan(x.shape, np.float32, np.complex64, auto_allocate=False)
    work_area = gpuarray.empty((plan.worksize, ), np.uint8)
    plan.set_work_area(work_area)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
def test_batch_ifft_complex128_to_float64_1d(self):
    # Note that since rfftn returns a Fortran-ordered array, it
    # needs to be reformatted as a C-ordered array before being
    # passed to gpuarray.to_gpu:
    x = np.asarray(np.random.rand(self.B, self.N), np.float64)
    xf = np.asarray(np.fft.rfft(x, axis=1), np.complex128)
    xf_gpu = gpuarray.to_gpu(np.ascontiguousarray(xf))
    x_gpu = gpuarray.empty((self.B, self.N), np.float64)
    plan = fft.Plan(x.shape[1], np.complex128, np.float64, batch=self.B)
    fft.ifft(xf_gpu, x_gpu, plan, True)
    assert np.allclose(x, x_gpu.get(), atol=atol_float64)
def test_ifft_complex64_to_float32_2d(self):
    # Note that since rfftn returns a Fortran-ordered array, it
    # needs to be reformatted as a C-ordered array before being
    # passed to gpuarray.to_gpu:
    x = np.asarray(np.random.rand(self.N, self.M), np.float32)
    xf = np.asarray(np.fft.rfftn(x), np.complex64)
    xf_gpu = gpuarray.to_gpu(np.ascontiguousarray(xf))
    x_gpu = gpuarray.empty((self.N, self.M), np.float32)
    plan = fft.Plan(x.shape, np.complex64, np.float32)
    fft.ifft(xf_gpu, x_gpu, plan, True)
    assert np.allclose(x, x_gpu.get(), atol=atol_float32)
def allocate_grid(self, **kwargs):
    self.nf = kwargs.get('nf', self.nf)
    assert (self.nf is not None)
    self.n = int(self.sigma * self.nf)

    self.ghat_g = gpuarray.zeros(self.n, dtype=self.complex_type)
    self.cu_plan = cufft.Plan(self.n, self.complex_type, self.complex_type,
                              stream=self.stream)

    return self
def irfft(a, normalize=True, nthreads=0):
    if is_memory_enough(a):
        arg = gpuarray.to_gpu(a)
        shape = [s for s in a.shape]
        shape[-1] = (shape[-1] - 1) * 2
        rtype = G_CTYPES[a.dtype.type]
        afg = gpuarray.empty(shape, rtype)
        plan = fft.Plan(shape, a.dtype.type, rtype)
        # honour the normalize flag: scale=True applies the numpy-style 1/N
        # normalisation on the inverse transform
        fft.ifft(arg, afg, plan, scale=normalize)
        return afg.get()
    else:
        return _irfft(a)
def rfft(a, nthreads=0):
    if is_memory_enough(a):
        arg = gpuarray.to_gpu(a)
        shape = [s for s in a.shape]
        shape[-1] = shape[-1] // 2 + 1
        ctype = G_RTYPES[a.dtype.type]
        afg = gpuarray.empty(shape, ctype)
        # the R2C plan takes the real-domain shape (a.shape), not the
        # truncated complex output shape
        plan = fft.Plan(a.shape, a.dtype.type, ctype)
        fft.fft(arg, afg, plan)
        return afg.get()
    else:
        return _rfft(a)
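# --- Hypothetical self-contained sketch (illustration only) ---
# The rfft/irfft helpers above wrap the same R2C forward / C2R inverse pattern
# shown here with scikit-cuda directly; this assumes a working pycuda/skcuda
# install (pycuda.autoinit provides the CUDA context).
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
import skcuda.fft as fft


def demo_rfft_roundtrip(n=4096):
    x = np.random.rand(n).astype(np.float32)

    # Forward R2C transform: n real samples -> n//2 + 1 complex coefficients.
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty(n // 2 + 1, np.complex64)
    fwd_plan = fft.Plan(x.shape, np.float32, np.complex64)
    fft.fft(x_gpu, xf_gpu, fwd_plan)
    assert np.allclose(xf_gpu.get(), np.fft.rfft(x), rtol=1e-3, atol=1e-1)

    # Inverse C2R transform; scale=True applies the 1/n normalisation so the
    # round trip reproduces the input, matching numpy.fft.irfft.
    y_gpu = gpuarray.empty(n, np.float32)
    inv_plan = fft.Plan(x.shape, np.complex64, np.float32)
    fft.ifft(xf_gpu, y_gpu, inv_plan, scale=True)
    assert np.allclose(y_gpu.get(), x, atol=1e-4)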
def __init__(self, Nx, xmax, **kwargs):
    """__init__(self, Nx, xmax, **kwargs) -> None

    Initialize this class.

    Parameters
    ----------
    Nx : int
        the number of grid points
    xmax : float
        the maximum of space in the x direction.
        The spatial resolution is defined as 2*xmax/Nx.
    kwargs : options
        fft_type : str (default : 'numpy')
            'numpy' : use fft methods in the numpy module
            'fftw'  : use fft methods in the pyfftw module if supported
            'cufft' : use fft methods in the cufft module if supported
        others : see the following classes or methods:
            space class
            InitPhoton()
            InitMSFT()
            InitAxes()
            InitDensity()
    """
    space.__init__(self, Nx, xmax, **kwargs)
    self.InitPhoton(**kwargs)
    self.InitMSFT(**kwargs)
    self.InitAxes(**kwargs)
    self.InitDensity(**kwargs)

    # Check the validity of `fft_type`
    if kwargs.get('fft_type') is None:
        self.__fft_type = 'cufft'
    elif kwargs.get('fft_type') not in ['numpy', 'fftw', 'cufft']:
        raise ValueError('Invalid value for the keyword "fft_type".')
    else:
        self.__fft_type = kwargs.get('fft_type')

    # Check the validity of cufft
    if self.__fft_type == 'cufft':
        if found_cufft is True:
            # CUDA is available
            buff = self.mesh_space()[0].shape
            self.__x_gpu = gpuarray.empty(buff, np.complex64)
            self.__xf_gpu = gpuarray.empty(buff, np.complex64)
            self.__plan = cu_fft.Plan(buff, np.complex64, np.complex64)
        else:
            # fall back to fftw
            self.__fft_type = 'fftw'

    # Check the validity of fftw
    if self.__fft_type == 'fftw':
        if found_pyfftw is False:
            # fall back to numpy
            self.__fft_type = 'numpy'