def test_batch_fft_float64_to_complex128_1d(self):
    """Compare a batched 1-D float64 -> complex128 GPU FFT with ``np.fft.rfft``."""
    batch, length = self.B, self.N
    host_in = np.asarray(np.random.rand(batch, length), np.float64)
    expected = np.fft.rfft(host_in, axis=1)

    dev_in = gpuarray.to_gpu(host_in)
    # The output buffer holds length // 2 + 1 complex coefficients per row.
    dev_out = gpuarray.empty((batch, length // 2 + 1), np.complex128)
    row_plan = fft.Plan(host_in.shape[1], np.float64, np.complex128,
                        batch=batch)
    fft.fft(dev_in, dev_out, row_plan)

    assert np.allclose(expected, dev_out.get(), atol=atol_float64)
def test_fft_float32_to_complex64_2d(self):
    """Compare a 2-D float32 -> complex64 GPU FFT with ``np.fft.rfftn``."""
    rows, cols = self.N, self.M
    host_in = np.asarray(np.random.rand(rows, cols), np.float32)
    expected = np.fft.rfftn(host_in)

    dev_in = gpuarray.to_gpu(host_in)
    # Only the last axis is halved in the output buffer.
    dev_out = gpuarray.empty((rows, cols // 2 + 1), np.complex64)
    plan_2d = fft.Plan(host_in.shape, np.float32, np.complex64)
    fft.fft(dev_in, dev_out, plan_2d)

    assert np.allclose(expected, dev_out.get(), atol=atol_float32)
def test_batch_fft_float64_to_complex128_2d(self):
    """Compare a batched 2-D float64 -> complex128 GPU FFT with ``np.fft.rfftn``."""
    batch, rows, cols = self.B, self.N, self.M
    host_in = np.asarray(np.random.rand(batch, rows, cols), np.float64)
    # Reference: transform the two trailing axes of every batch entry.
    expected = np.fft.rfftn(host_in, axes=(1, 2))

    dev_in = gpuarray.to_gpu(host_in)
    dev_out = gpuarray.empty((batch, rows, cols // 2 + 1), np.complex128)
    batched_plan = fft.Plan([rows, cols], np.float64, np.complex128,
                            batch=batch)
    fft.fft(dev_in, dev_out, batched_plan)

    assert np.allclose(expected, dev_out.get(), atol=atol_float64)
def test_fft_float64_to_complex128_1d(self):
    """Compare a 1-D float64 -> complex128 GPU FFT with ``np.fft.rfftn``."""
    length = self.N
    host_in = np.asarray(np.random.rand(length), np.float64)
    expected = np.fft.rfftn(host_in)

    dev_in = gpuarray.to_gpu(host_in)
    dev_out = gpuarray.empty(length // 2 + 1, np.complex128)
    plan_1d = fft.Plan(host_in.shape, np.float64, np.complex128)
    fft.fft(dev_in, dev_out, plan_1d)

    assert np.allclose(expected, dev_out.get(), atol=atol_float64)
def test_batch_fft_float32_to_complex64_2d(self):
    """Compare a batched 2-D float32 -> complex64 GPU FFT with ``np.fft.rfftn``."""
    batch, rows, cols = self.B, self.N, self.M
    host_in = np.asarray(np.random.rand(batch, rows, cols), np.float32)
    # Reference: transform the two trailing axes of every batch entry.
    expected = np.fft.rfftn(host_in, axes=(1, 2))

    dev_in = gpuarray.to_gpu(host_in)
    dev_out = gpuarray.empty((batch, rows, cols // 2 + 1), np.complex64)
    batched_plan = fft.Plan([rows, cols], np.float32, np.complex64,
                            batch=batch)
    fft.fft(dev_in, dev_out, batched_plan)

    assert np.allclose(expected, dev_out.get(), atol=atol_float32)
def test_batch_fft_float32_to_complex64_1d(self):
    """Compare a batched 1-D float32 -> complex64 GPU FFT with ``np.fft.rfft``."""
    batch, length = self.B, self.N
    host_in = np.asarray(np.random.rand(batch, length), np.float32)
    expected = np.fft.rfft(host_in, axis=1)

    dev_in = gpuarray.to_gpu(host_in)
    dev_out = gpuarray.empty((batch, length // 2 + 1), np.complex64)
    row_plan = fft.Plan(host_in.shape[1], np.float32, np.complex64,
                        batch=batch)
    fft.fft(dev_in, dev_out, row_plan)

    assert np.allclose(expected, dev_out.get(), atol=atol_float32)
def test1():
    """Run a 128-point complex64 FFT on the GPU and print whether it matches NumPy."""
    N = 128
    host_in = np.asarray(np.random.rand(N), np.complex64)
    reference = np.fft.fft(host_in)

    dev_in = gpuarray.to_gpu(host_in)
    dev_out = gpuarray.empty(N, np.complex64)
    c2c_plan = Plan(host_in.shape, np.complex64, np.complex64)
    fft(dev_in, dev_out, c2c_plan)

    matches = np.allclose(reference[0:N], dev_out.get(), atol=1e-3)
    print(matches)
def test3():
    """Run a 128^3 complex64 3-D FFT on the GPU and print whether it matches NumPy."""
    N = 128
    cube = (N, N, N)
    host_in = np.asarray(np.random.rand(*cube), np.complex64)
    reference = np.fft.fftn(host_in, s=None, axes=(0, 1, 2))

    dev_in = gpuarray.to_gpu(host_in)
    dev_out = gpuarray.empty(cube, np.complex64)
    c2c_plan = Plan(host_in.shape, np.complex64, np.complex64)
    fft(dev_in, dev_out, c2c_plan)

    matches = np.allclose(reference[0:N, 0:N, 0:N], dev_out.get(), atol=1e-2)
    print(matches)
def _solve_kernel_slow(self):
    '''Memory-saving solve kernel, used when save_memory is True.

    Only a single slice of the transformed Green's function
    (``self.fgreentr``) is stored, so the multiplication is applied to
    ``self.tmpspace`` one slice along the first axis at a time, between
    an in-place forward and an in-place inverse FFT.
    '''
    work = self.tmpspace
    cu_fft.fft(work, work, plan=self.plan_forward)
    for iz in xrange(self.mesh.nz):
        current_slice = work[iz, :, :]
        work[iz, :, :] = current_slice * self.fgreentr
    cu_fft.ifft(work, work, plan=self.plan_backward)
def test_work_area(self):
    """Check a float32 -> complex64 FFT that uses a caller-supplied work area."""
    length = self.N
    host_in = np.asarray(np.random.rand(length), np.float32)
    expected = np.fft.rfftn(host_in)

    dev_in = gpuarray.to_gpu(host_in)
    dev_out = gpuarray.empty(length // 2 + 1, np.complex64)

    # The plan is created without its own work area; a byte buffer of
    # plan.worksize elements is attached before the transform is run.
    manual_plan = fft.Plan(host_in.shape, np.float32, np.complex64,
                           auto_allocate=False)
    scratch = gpuarray.empty((manual_plan.worksize,), np.uint8)
    manual_plan.set_work_area(scratch)

    fft.fft(dev_in, dev_out, manual_plan)
    assert np.allclose(expected, dev_out.get(), atol=atol_float32)
def test_work_area(self):
    """Check a float32 -> complex64 FFT that uses a caller-supplied work area."""
    length = self.N
    host_in = np.asarray(np.random.rand(length), np.float32)
    expected = np.fft.rfftn(host_in)

    dev_in = gpuarray.to_gpu(host_in)
    dev_out = gpuarray.empty(length // 2 + 1, np.complex64)

    # The plan is created without its own work area; a byte buffer of
    # plan.worksize elements is attached before the transform is run.
    manual_plan = fft.Plan(host_in.shape, np.float32, np.complex64,
                           auto_allocate=False)
    scratch = gpuarray.empty((manual_plan.worksize,), np.uint8)
    manual_plan.set_work_area(scratch)

    fft.fft(dev_in, dev_out, manual_plan)
    assert np.allclose(expected, dev_out.get(), atol=atol_float32)
def inplaceFractShift(img, dx, dy, PhaseShiftFunc, bInverse=False):
    """Shift ``img`` in place by (dx, dy) via a phase ramp in Fourier space.

    The image is transformed into the module-level buffer ``FT``,
    ``PhaseShiftFunc`` is applied to that buffer together with the
    module-level ``kxx``/``kyy`` arrays and the shifts cast to float32,
    and the scaled inverse transform is written back into ``img``.

    Nothing is done when both shifts are zero. ``bInverse`` is accepted
    but not referenced in the body.
    """
    global plan, FT

    if dx == 0 and dy == 0:
        return

    # Presumably (re)builds ``plan`` and ``FT`` for this image shape --
    # Cache is defined outside this block.
    Cache(img.shape)

    cu_fft.fft(img, FT, plan)
    shift_x = np.float32(dx)
    shift_y = np.float32(dy)
    PhaseShiftFunc(FT, kxx, kyy, shift_x, shift_y)
    cu_fft.ifft(FT, img, plan, True)
def process_video_cuda(data):
    """Run one block of samples through the GPU video demodulation chain.

    On the first call the video filters and the CUDA state held in the
    module-level dict ``cs`` (buffers, FFT plans, kernels) are prepared.
    The block then goes through two FFT -> filter -> IFFT passes with a
    kernel launch after each one.

    :param data: input samples; converted to float32 before upload.
    :return: the clamped output array with 512 samples removed from
        each end.
    """
    global cs, cs_first

    # Lazy one-time initialisation of filters and GPU resources.
    if cs_first:
        prepare_video_filters(SysParams)
        prepare_video_cuda()
        cs_first = False

    fdata = np.float32(data)
    gpudata = gpuarray.to_gpu(fdata)

    # first fft->ifft cycle applies pre-decoding filtering (low pass
    # filters, CAV/CLV emphasis) and very importantly, performs the
    # Hilbert transform
    fft.fft(gpudata, cs['fft1_out'], cs['plan1'])

    if Inner:
        cs['fft1_out'] *= cs['filt_video_inner']
    else:
        cs['fft1_out'] *= cs['filt_video']

    fft.ifft(cs['fft1_out'], cs['filtered1'], cs['plan1i'], True)

    cs['doanglediff'](cs['fm_demod'], cs['filtered1'],
                      block=(1024, 1, 1), grid=(blocklenk, 1))

    # post-processing: output low-pass filtering and deemphasis
    fft.fft(cs['fm_demod'], cs['fft2_out'], cs['plan2'])
    cs['fft2_out'] *= cs['filt_post']
    fft.ifft(cs['fft2_out'], cs['postlpf'], cs['plan2i'], True)

    cs['doclamp16'](cs['clipped_gpu'], cs['postlpf'],
                    np.float32(-SysParams['output_minfreq']),
                    np.float32(SysParams['output_scale']),
                    block=(1024, 1, 1), grid=(blocklenk, 1))

    output_16 = cs['clipped_gpu'].get()

    # Drop the first and last `chop` samples of the block.
    chop = 512
    return output_16[chop:len(output_16) - chop]
def process_video_cuda(data):
    """Run one block of samples through the GPU video demodulation chain.

    On the first call the video filters and the CUDA state held in the
    module-level dict ``cs`` (buffers, FFT plans, kernels) are prepared.
    The block then goes through two FFT -> filter -> IFFT passes with a
    kernel launch after each one.

    :param data: input samples; converted to float32 before upload.
    :return: the clamped output array with 512 samples removed from
        each end.
    """
    global cs, cs_first

    # Lazy one-time initialisation of filters and GPU resources.
    if cs_first:
        prepare_video_filters()
        prepare_video_cuda()
        cs_first = False

    fdata = np.float32(data)
    gpudata = gpuarray.to_gpu(fdata)

    # first fft->ifft cycle applies pre-decoding filtering (low pass
    # filters, CAV/CLV emphasis) and very importantly, performs the
    # Hilbert transform
    fft.fft(gpudata, cs["fft1_out"], cs["plan1"])

    if Inner:
        cs["fft1_out"] *= cs["filt_video_inner"]
    else:
        cs["fft1_out"] *= cs["filt_video"]

    fft.ifft(cs["fft1_out"], cs["filtered1"], cs["plan1i"], True)

    cs["doanglediff"](cs["fm_demod"], cs["filtered1"],
                      block=(1024, 1, 1), grid=(blocklenk, 1))

    # post-processing: output low-pass filtering and deemphasis
    fft.fft(cs["fm_demod"], cs["fft2_out"], cs["plan2"])
    cs["fft2_out"] *= cs["filt_post"]
    fft.ifft(cs["fft2_out"], cs["postlpf"], cs["plan2i"], True)

    cs["doclamp16"](
        cs["clipped_gpu"],
        cs["postlpf"],
        np.float32(-SP["output_minfreq"]),
        np.float32(SP["output_scale"]),
        block=(1024, 1, 1),
        grid=(blocklenk, 1),
    )

    output_16 = cs["clipped_gpu"].get()

    # Drop the first and last `chop` samples of the block.
    chop = 512
    return output_16[chop : len(output_16) - chop]
def propagate_eager(self, wavelength, wavefront):
    """
    'Not-Too-Good' version of the propagation on the GPU (lots of
    memory issues...). Remove in the future.

    Chain: pupil plane -> image slicer (FFT), slicer masks applied to
    N_slices copies, slicer -> pupil mirror (inverse FFT), mirror masks
    applied, pupil mirror -> slits (FFT). The free-memory fraction of
    the device is printed at several points along the way.

    :param wavelength: key into ``self.pupil_masks``; also divides the
        phase term.
    :param wavefront: not referenced in the body.
        NOTE(review): the phase is built from the pupil mask itself --
        confirm whether ``wavefront`` was meant to be used there.
    :return: fft-shifted sum over slices of the squared modulus of the
        slit-plane field.
    """

    def _report_free(fmt):
        # Print the fraction of device memory that is currently free.
        free, total = cuda.mem_get_info()
        print(fmt % (free / total * 100))

    N = self.N_PIX
    _report_free("Free: %.2f percent")

    # Pupil Plane -> Image Slicer
    pupil = self.pupil_masks[wavelength]
    complex_pupil = pupil * np.exp(1j * 2 * np.pi * pupil / wavelength)
    complex_pupil_gpu = gpuarray.to_gpu(
        np.asarray(complex_pupil, np.complex64))
    plan = cu_fft.Plan(complex_pupil_gpu.shape, np.complex64, np.complex64)
    cu_fft.fft(complex_pupil_gpu, complex_pupil_gpu, plan, scale=True)

    # Bring the result back and release the device buffer before the
    # N_slices copies are uploaded.
    complex_slicer_cpu = complex_pupil_gpu.get()
    complex_pupil_gpu.gpudata.free()
    _report_free("*Free: %.2f percent")

    # Add N_slices copies to be masked
    complex_slicer_cpu = np.stack([complex_slicer_cpu] * self.N_slices)
    complex_slicer_gpu = gpuarray.to_gpu(complex_slicer_cpu)
    slicer_masks_gpu = gpuarray.to_gpu(self.slicer_masks_fftshift)
    clinalg.multiply(slicer_masks_gpu, complex_slicer_gpu, overwrite=True)
    slicer_masks_gpu.gpudata.free()
    _report_free("**Free: %.2f percent")

    # Slicer -> Pupil Mirror (one batched plan reused for both transforms)
    plan = cu_fft.Plan((N, N), np.complex64, np.complex64, self.N_slices)
    cu_fft.ifft(complex_slicer_gpu, complex_slicer_gpu, plan, scale=True)
    mirror_mask_gpu = gpuarray.to_gpu(self.pupil_mirror_masks_fft)
    clinalg.multiply(mirror_mask_gpu, complex_slicer_gpu, overwrite=True)

    # Pupil Mirror -> Slits
    cu_fft.fft(complex_slicer_gpu, complex_slicer_gpu, plan)
    slits = complex_slicer_gpu.get()
    complex_slicer_gpu.gpudata.free()
    mirror_mask_gpu.gpudata.free()

    intensity = np.sum((np.abs(slits))**2, axis=0)
    slit = fftshift(intensity)
    _report_free("***Free: %.2f percent")
    return slit
def cross_correlate(plan, normalize=True):
    """Cross-correlate the plan's volume with its padded template on the GPU.

    Both arrays are transformed with ``plan.fwd_plan``, the template
    spectrum is conjugated in place, the product of the two spectra is
    inverse-transformed (scaled) into ``plan.ccc_map``, and the map is
    divided by ``float32(plan.p.get()) * plan.stdV``.

    ``normalize`` is accepted but not referenced in the body.
    """
    forward = plan.fwd_plan
    fft(plan.volume, plan.volume_fft, forward)
    fft(plan.templatePadded, plan.template_fft, forward)

    conj(plan.template_fft, overwrite=True)
    spectrum_product = plan.volume_fft * plan.template_fft
    ifft(spectrum_product, plan.ccc_map, plan.inv_plan, scale=True)

    denominator = np.float32(plan.p.get()) * plan.stdV
    plan.ccc_map /= denominator
def FractShift(src, dest, dx, dy, PhaseShiftFunc):
    """Write a copy of ``src`` shifted by (dx, dy) into ``dest``.

    ``src`` is transformed into the module-level buffer ``FT``, a phase
    shift is applied to that buffer, and the scaled inverse transform is
    written to ``dest``.

    When both shifts are zero the function returns immediately and
    ``dest`` is left untouched.

    NOTE(review): ``PhaseShiftFunc`` is not called here; the phase ramp
    is applied by the module-level ``PhaseShift``. Confirm whether the
    parameter was meant to be used instead.
    """
    global plan, FT

    if dx == 0 and dy == 0:
        return

    Cache(src.shape)

    # The fourth positional argument of cu_fft.fft is the `scale` flag.
    # Passing the (truthy) PhaseShiftFunc there scaled the forward
    # transform as well as the inverse one, so the forward transform is
    # now unscaled and only the inverse below applies the normalisation.
    cu_fft.fft(src, FT, plan)
    PhaseShift(FT, dx, dy)
    cu_fft.ifft(FT, dest, plan, True)
def __init__(self, mesh, context=None):
    '''Set up buffers, copy helpers, FFT plans and the transformed
    Green's function for the given mesh.

    Args:
        mesh: The mesh on which the solver will operate. The
            dimensionality is deduced from mesh.dimension
        context: stored on the instance as ``_context``.
    '''
    self.mesh = mesh
    self._context = context

    mesh_shape = self.mesh.shape                          # nz, ny, (nx)
    doubled_shape = [2*n for n in mesh_shape]             # 2*nz, 2*ny, (2*nx)
    mesh_distances = list(reversed(self.mesh.distances))  # dz, dy, dx

    self.fgreentr = gpuarray.empty(doubled_shape, dtype=np.complex128)
    self.tmpspace = gpuarray.zeros_like(self.fgreentr)
    sizeof_complex = np.dtype(np.complex128).itemsize

    # dimensionality function dispatch
    dim = self.mesh.dimension
    suffix = str(dim) + 'd'
    self._fgreen = getattr(self, '_fgreen' + suffix)
    self._mirror = getattr(self, '_mirror' + suffix)
    copy_fn = {'3d': get_Memcpy3D_d2d, '2d': get_Memcpy2D_d2d}
    memcpy_nd = copy_fn[suffix]

    # Row pitches in bytes: rho has nx entries per row, the doubled
    # work buffer has 2*nx.
    dim_args = self.mesh.shape
    rho_pitch = self.mesh.nx*sizeof_complex
    tmp_pitch = 2*self.mesh.nx*sizeof_complex
    rho_height = self.mesh.ny
    tmp_height = 2*self.mesh.ny

    # src is None because rho is not yet known
    self._cpyrho2tmp = memcpy_nd(
        src=None, dst=self.tmpspace,
        src_pitch=rho_pitch, dst_pitch=tmp_pitch,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=rho_height, dst_height=tmp_height)
    # dst is None because rho is not yet known
    self._cpytmp2rho = memcpy_nd(
        src=self.tmpspace, dst=None,
        src_pitch=tmp_pitch, dst_pitch=rho_pitch,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=tmp_height, dst_height=rho_height)

    # One coordinate array per axis: n+1 points spaced by the cell
    # size, starting half a cell below zero. Order is [mz, my, mx].
    mesh_arr = []
    for axis in xrange(self.mesh.dimension):
        spacing = mesh_distances[axis]
        mesh_arr.append(-spacing/2 + np.arange(mesh_shape[axis]+1) * spacing)
    mesh_grids = np.meshgrid(*mesh_arr, indexing='ij')

    fgreen = self._fgreen(*mesh_grids)
    fgreen = self._mirror(fgreen)

    self.plan_forward = cu_fft.Plan(self.tmpspace.shape,
                                    in_dtype=np.complex128,
                                    out_dtype=np.complex128)
    self.plan_backward = cu_fft.Plan(self.tmpspace.shape,
                                     in_dtype=np.complex128,
                                     out_dtype=np.complex128)
    cu_fft.fft(gpuarray.to_gpu(fgreen), self.fgreentr,
               plan=self.plan_forward)
def rfft(a, nthreads=0):
    """Real-to-complex FFT of ``a`` on the GPU, with a fallback.

    If ``is_memory_enough(a)`` is false the computation is delegated to
    ``_rfft(a)``. Otherwise ``a`` is uploaded, transformed with a
    scikit-cuda plan and the result is copied back to the host.

    Parameters
    ----------
    a : numpy.ndarray
        Real input array; its dtype must be a key of ``G_RTYPES``, which
        supplies the matching complex output dtype.
    nthreads : int
        Not referenced in the body.

    Returns
    -------
    numpy.ndarray
        Complex array whose last axis has ``a.shape[-1] // 2 + 1``
        entries.

    Notes
    -----
    NOTE(review): for input with more than one dimension the plan
    transforms every axis (like ``np.fft.rfftn``) -- confirm this
    matches what ``_rfft`` does.
    """
    if not is_memory_enough(a):
        return _rfft(a)

    real_type = a.dtype.type
    complex_type = G_RTYPES[real_type]

    # Only the last axis is halved in the real-to-complex output.
    out_shape = list(a.shape)
    out_shape[-1] = out_shape[-1] // 2 + 1

    a_gpu = gpuarray.to_gpu(a)
    out_gpu = gpuarray.empty(out_shape, complex_type)

    # The plan must describe the real input shape. It was previously
    # built from the halved output shape, i.e. for a shorter transform
    # than the data uploaded.
    plan = fft.Plan(a.shape, real_type, complex_type)
    fft.fft(a_gpu, out_gpu, plan)
    return out_gpu.get()
def filter(self):
    """Convolve ``self.series[0]`` with ``self.series[1]`` via GPU FFTs.

    A fresh default CUDA context is created for the call and always
    popped again, including when an error occurs part-way through.

    Both series are zero-padded to ``determine_size(len(signal) +
    len(window) - 1)`` samples, transformed real-to-complex, multiplied,
    scaled by 2.0 and transformed back with a scaled complex-to-complex
    inverse FFT.

    :return: host array of dtype ``self.precision['complex']`` with the
        padded length.
    """
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    import skcuda.linalg as linalg
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context

    cuda.init()
    context = make_default_context()
    try:
        real_t = self.precision['float']
        complex_t = self.precision['complex']

        signal = self.series[0]
        window = self.series[1]
        linalg.init()
        nfft = determine_size(len(signal) + len(window) - 1)

        # Move data to GPU
        sig_zero_pad = np.zeros(nfft, dtype=real_t)
        win_zero_pad = np.zeros(nfft, dtype=real_t)
        sig_gpu = gpuarray.zeros(sig_zero_pad.shape, dtype=real_t)
        win_gpu = gpuarray.zeros(win_zero_pad.shape, dtype=real_t)
        sig_zero_pad[0:len(signal)] = signal
        win_zero_pad[0:len(window)] = window
        sig_gpu.set(sig_zero_pad)
        win_gpu.set(win_zero_pad)

        # Plan forwards. The complex buffers are nfft long and start out
        # zeroed.
        sig_fft_gpu = gpuarray.zeros(nfft, dtype=complex_t)
        win_fft_gpu = gpuarray.zeros(nfft, dtype=complex_t)
        sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape, real_t, complex_t)
        win_plan_forward = cu_fft.Plan(win_fft_gpu.shape, real_t, complex_t)
        cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
        cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

        # Convolve
        out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
        # NOTE(review): the factor 2.0 presumably compensates for the
        # half spectrum produced by the real-to-complex transform --
        # confirm.
        linalg.scale(2.0, out_fft)

        # Plan inverse
        out_gpu = gpuarray.zeros_like(out_fft)
        plan_inverse = cu_fft.Plan(out_fft.shape, complex_t, complex_t)
        cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)
        out_np = np.zeros(len(out_gpu), complex_t)
        out_gpu.get(out_np)
        return out_np
    finally:
        # Release the context even if any step above raised.
        context.pop()
def RunCorrection(neib,ROI,DifPad,rspace,kspace,exitWave,buffer_exitWave,finalObj,offsetx,offsety,objsizex,roisizex,CopyFromROI,ExitwaveAndBuffer,ApplyDifPad,cufftplan,aperture,fcachevector):
    """Compute an error for every offset in a (2*neib+1)^2 neighbourhood.

    For each (jpos, ipos) offset the region of ``finalObj`` at the
    shifted position is copied into ``rspace``, the exit wave and its
    buffer copy are built, the wave is transformed into ``kspace``,
    ``ApplyDifPad`` is applied there, and the scaled inverse transform
    is written back into ``exitWave``. The error is the sum of the
    squared modulus of the difference between the new exit wave and the
    buffered one.

    Returns ``GetMin(errors, neib)``. ``ROI`` is not referenced in the
    body.
    """
    errors = []
    offsets = range(-neib, neib + 1)

    for jpos in offsets:
        for ipos in offsets:
            row = np.int32(offsety + jpos)
            col = np.int32(offsetx + ipos)
            CopyFromROI(rspace, finalObj, row, col, roisizex, objsizex)
            ExitwaveAndBuffer(exitWave, buffer_exitWave, aperture, rspace)

            # Compute exitwaves
            cu_fft.fft(exitWave, kspace, cufftplan)        # kspace = wave at detector
            ApplyDifPad(kspace, DifPad, fcachevector)      # replace amplitudes
            cu_fft.ifft(kspace, exitWave, cufftplan, True) # new exitwave

            residual = exitWave - buffer_exitWave
            errori = np.sum((abs(residual)**2).get())
            errors.append(errori + 0)

    return GetMin(errors, neib)
def thunk():
    """Run the batched real-to-complex FFT for the current inputs.

    Reads the data array from ``inputs[0][0]`` and the transform shape
    from ``inputs[1][0]``, (re)allocates ``outputs[0][0]`` when its
    shape does not fit, (re)creates the cached plan when the input shape
    has changed, and runs the transform inside the input's GPU context.
    """
    data = inputs[0][0]
    input_shape = data.shape
    s = inputs[1][0]

    # Since padding is not supported, assert s matches input shape.
    assert (input_shape[1:] == s).all()

    # Output shape: batch, then the transform dims with the last one
    # reduced to n // 2 + 1 (DFT of real input is symmetric, no need to
    # store redundant coefficients), then a trailing axis of length 2
    # for real/imag.
    dims = [input_shape[0]]
    dims.extend(s)
    dims[-1] = dims[-1] // 2 + 1
    dims.append(2)
    output_shape = tuple(dims)

    z = outputs[0]

    # only allocate if there is no previous allocation of the
    # right size.
    needs_alloc = z[0] is None or z[0].shape != output_shape
    if needs_alloc:
        z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
                           dtype="float32")

    input_pycuda = inputs[0][0]
    # The output stays a float32 array with a trailing axis of 2;
    # skcuda.fft treats it as if it were complex64 anyway.
    output_pycuda = z[0]

    with input_pycuda.context:
        # only initialise plan if necessary
        plan_is_stale = (plan[0] is None
                         or plan_input_shape[0] != input_shape)
        if plan_is_stale:
            plan_input_shape[0] = input_shape
            plan[0] = fft.Plan(s, np.float32, np.complex64,
                               batch=input_shape[0])

        # Sync GPU variables before computation
        input_pycuda.sync()
        output_pycuda.sync()

        fft.fft(input_pycuda, output_pycuda, plan[0])

        # Sync results to ensure output contains completed computation
        pycuda.driver.Context.synchronize()
def __init__(self, volume, template, gpu):
    """Allocate the GPU buffers and FFT plans for one volume/template pair.

    The volume is uploaded and its forward transform is stored in
    ``volume_fft``; zero-filled buffers shaped like the volume are
    created for the template spectrum, the correlation map and a padded
    volume. The uploaded copy of the volume is released before the
    inverse plan is created.
    """
    self.gpu = gpu

    dev_volume = gu.to_gpu(volume)
    vol_shape, vol_dtype = volume.shape, volume.dtype

    # Forward transform of the volume.
    self.fwd_plan = Plan(vol_shape, vol_dtype, np.complex64)
    self.volume_fft = gu.zeros_like(dev_volume, dtype=np.complex64)
    fft(dev_volume, self.volume_fft, self.fwd_plan)

    # Work buffers, all shaped like the volume.
    self.template_fft = gu.zeros_like(dev_volume, dtype=np.complex64)
    self.ccc_map = gu.zeros_like(dev_volume, dtype=np.float32)
    self.norm_volume = gu.prod(dev_volume.shape)
    self.padded_volume = gu.zeros_like(dev_volume, dtype=np.float32)
    del dev_volume

    self.inv_plan = Plan(vol_shape, np.complex64, vol_dtype)
    self.template = Volume(template)
def fft_2d(x, N, M, batch_size):
    """Batched in-place 2-D complex FFT on the GPU.

    :param x: host array holding ``batch_size`` images of shape (N, M).
        It is converted to contiguous complex128 before upload because
        the plan below is a complex128 plan.
    :param N: number of rows of each image.
    :param M: number of columns of each image.
    :param batch_size: number of images transformed by the plan.
    :return: host array with the transformed data.
    """
    # The plan is hard-wired to complex128; force the dtype here so the
    # uploaded buffer always matches it. Input that is already
    # contiguous complex128 is not copied.
    x = np.ascontiguousarray(x, dtype=np.complex128)

    x_gpu = gpuarray.to_gpu(x)
    plan = cu_fft.Plan((N, M), np.complex128, np.complex128, batch_size)
    cu_fft.fft(x_gpu, x_gpu, plan)
    return x_gpu.get()
def fft2c2c_cuda(x, axes=(0, 1)):
    """Complex-to-complex FFT over the leading ``len(axes)`` axes of ``x``.

    ``x`` is converted to complex64. Only the number of entries in
    ``axes`` is used: the transform always runs over the first
    ``len(axes)`` dimensions, and any remaining trailing dimensions are
    handled as a batch through the plan's stride/distance arguments.

    Returns the transformed array on the host, same shape as ``x``.
    """
    rank = len(axes)
    x = np.array(x).astype(np.complex64)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty(x.shape, np.complex64)

    transform_shape = x.shape[0:rank]
    trailing_shape = x.shape[rank:len(x.shape)]

    if len(x.shape) <= rank:
        plan = Plan(transform_shape, np.complex64, np.complex64)
    else:
        batch = np.prod(trailing_shape)
        embed = np.array(transform_shape).astype(np.int32)
        # Positional arguments after the dtypes: batch, stream, mode,
        # then (inembed, istride, idist) and (onembed, ostride, odist).
        plan = Plan(transform_shape, np.complex64, np.complex64,
                    batch, None, 1,
                    embed, batch, 1,
                    embed, batch, 1)

    fft(x_gpu, xf_gpu, plan)
    return xf_gpu.get()
def fft2_gpu(x, fftshift=False):
    """
    R2C FFT

    Produce an output that is compatible with numpy.fft.fft2.

    The GPU real-to-complex transform only returns the non-redundant
    half of the spectrum (``n2 // 2 + 1`` columns). The remaining
    columns are rebuilt on the host from the Hermitian symmetry of the
    DFT of real input, ``X[k1, k2] == conj(X[-k1, -k2])``.

    :param x: 2-D numpy array; converted to float64 if necessary.
    :param fftshift: anything other than the literal ``False`` applies
        ``np.fft.fftshift`` to the result.
    :return: complex128 array of shape ``x.shape``.
    """
    # convert the input array to double precision float
    if x.dtype != "float64":
        x = x.astype(np.float64)

    n1, n2 = x.shape
    xgpu = gpuarray.to_gpu(x)

    # For real to complex transformations, the fft function computes
    # N/2+1 non-redundant coefficients of a length-N input signal
    ysize = n2 // 2 + 1
    y = gpuarray.empty((n1, ysize), np.complex128)

    plan_forward = cu_fft.Plan((n1, n2), np.float64, np.complex128)
    cu_fft.fft(xgpu, y, plan_forward)
    left = y.get()

    # Rebuild the missing columns. Flipping both axes and rolling the
    # rows by one maps index (k1, k2) to (-k1, -k2); the conjugate is
    # what makes the mirrored values equal to the true coefficients.
    mirrored = np.conj(np.fliplr(np.flipud(left)))
    if n2 % 2 == 0:
        # even: neither the k2 == 0 nor the Nyquist column is repeated
        mirrored = mirrored[:, 1:-1]
    else:
        # odd: only the k2 == 0 column is not repeated
        mirrored = mirrored[:, :-1]
    right = np.roll(mirrored, 1, axis=0)

    yout = np.hstack((left, right))
    if fftshift is not False:
        yout = np.fft.fftshift(yout)
    return yout
def test_multiple_streams(self):
    """Run two float32 -> complex64 FFTs whose plans use different streams."""
    length = self.N
    n_coeffs = length // 2 + 1

    x = np.asarray(np.random.rand(length), np.float32)
    expected_x = np.fft.rfftn(x)
    y = np.asarray(np.random.rand(length), np.float32)
    expected_y = np.fft.rfftn(y)

    x_dev = gpuarray.to_gpu(x)
    y_dev = gpuarray.to_gpu(y)
    x_out = gpuarray.empty(n_coeffs, np.complex64)
    y_out = gpuarray.empty(n_coeffs, np.complex64)

    # One stream per plan.
    first_stream = drv.Stream()
    second_stream = drv.Stream()
    x_plan = fft.Plan(x.shape, np.float32, np.complex64, stream=first_stream)
    y_plan = fft.Plan(y.shape, np.float32, np.complex64, stream=second_stream)

    fft.fft(x_dev, x_out, x_plan)
    fft.fft(y_dev, y_out, y_plan)

    assert np.allclose(expected_x, x_out.get(), atol=atol_float32)
    assert np.allclose(expected_y, y_out.get(), atol=atol_float32)
def test_multiple_streams(self):
    """Run two float32 -> complex64 FFTs whose plans use different streams."""
    length = self.N
    n_coeffs = length // 2 + 1

    x = np.asarray(np.random.rand(length), np.float32)
    expected_x = np.fft.rfftn(x)
    y = np.asarray(np.random.rand(length), np.float32)
    expected_y = np.fft.rfftn(y)

    x_dev = gpuarray.to_gpu(x)
    y_dev = gpuarray.to_gpu(y)
    x_out = gpuarray.empty(n_coeffs, np.complex64)
    y_out = gpuarray.empty(n_coeffs, np.complex64)

    # One stream per plan.
    first_stream = drv.Stream()
    second_stream = drv.Stream()
    x_plan = fft.Plan(x.shape, np.float32, np.complex64, stream=first_stream)
    y_plan = fft.Plan(y.shape, np.float32, np.complex64, stream=second_stream)

    fft.fft(x_dev, x_out, x_plan)
    fft.fft(y_dev, y_out, y_plan)

    assert np.allclose(expected_x, x_out.get(), atol=atol_float32)
    assert np.allclose(expected_y, y_out.get(), atol=atol_float32)
def filter_fft_cuda(signal: np.array, window: np.array, prec: dict):
    """
    Filter ``signal`` with ``window`` by multiplying their FFTs on the GPU.

    Importing ``pycuda.autoinit`` inside the function initialises CUDA
    as a side effect of the call.

    :param signal: The input series
    :param window: The input window
    :param prec: The precision entry; its 'float' and 'complex' items
        give the real and complex dtypes used
    :return: The filtered signal, a complex host array of the padded
        length
    """
    import pycuda.autoinit  # Here because it initialises a new cuda environment every trial.
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    import skcuda.linalg as linalg

    linalg.init()
    nfft = determine_size(len(signal) + len(window) - 1)
    real_t = prec['float']

    def _upload_padded(series):
        # Zero-pad `series` to nfft samples on the host and copy it into
        # a device array of the same length.
        padded = np.zeros(nfft, dtype=real_t)
        device = gpuarray.zeros(padded.shape, dtype=real_t)
        padded[0:len(series)] = series
        device.set(padded)
        return device

    # Move data to GPU
    sig_gpu = _upload_padded(signal)
    win_gpu = _upload_padded(window)

    # Plan forwards
    complex_t = prec['complex']
    sig_fft_gpu = gpuarray.zeros(nfft, dtype=complex_t)
    win_fft_gpu = gpuarray.zeros(nfft, dtype=complex_t)
    sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape, real_t, complex_t)
    win_plan_forward = cu_fft.Plan(win_fft_gpu.shape, real_t, complex_t)
    cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
    cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

    # Convolve
    out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
    linalg.scale(2.0, out_fft)

    # Plan inverse
    out_gpu = gpuarray.zeros_like(out_fft)
    plan_inverse = cu_fft.Plan(out_fft.shape, complex_t, complex_t)
    cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)

    out_np = np.zeros(len(out_gpu), complex_t)
    out_gpu.get(out_np)
    return out_np
def setup_mesh(self, mesh):
    '''Replace the solver's mesh and recompute the transformed
    integrated Green's function from the new mesh distances.

    Only accepts meshes with the same shape as self.mesh.
    '''
    assert (mesh.shape == self.mesh.shape)
    self.mesh = mesh

    # One coordinate array per axis, visiting the axes in reverse index
    # order so that mesh_arr is [mz, my, mx].
    mesh_arr = []
    for axis in reversed(range(mesh.dimension)):
        spacing = mesh.distances[axis]
        points = np.arange(mesh.shape_r[axis] + 1.)
        mesh_arr.append(-spacing/2 + points * spacing)

    mesh_grids = np.meshgrid(*mesh_arr, indexing='ij')
    fgreen = self._mirror(self._fgreen(*mesh_grids))
    cu_fft.fft(gpuarray.to_gpu(fgreen), self.fgreentr,
               plan=self.plan_forward)
def fft2_gpu(x, fftshift=False):
    '''Produce an output that is compatible with numpy.fft.fft2.

    The input x is a 2D numpy array; it is converted to single
    precision before the transform.

    The GPU real-to-complex transform only returns the non-redundant
    half of the spectrum (n2 // 2 + 1 columns). The remaining columns
    are rebuilt on the host from the Hermitian symmetry of the DFT of
    real input, X[k1, k2] == conj(X[-k1, -k2]).

    Anything other than the literal False for fftshift applies
    np.fft.fftshift to the result. Returns a complex128 array of shape
    x.shape.
    '''
    # Convert the input array to single precision float
    if x.dtype != 'float32':
        x = x.astype('float32')

    n1, n2 = x.shape
    xgpu = gpuarray.to_gpu(x)

    # For real to complex transformations, the fft function computes
    # N/2+1 non-redundant coefficients of a length-N input signal.
    y = gpuarray.empty((n1, n2 // 2 + 1), np.complex64)

    plan_forward = cu_fft.Plan((n1, n2), np.float32, np.complex64)
    cu_fft.fft(xgpu, y, plan_forward)

    # Single device-to-host copy, reused for both halves.
    left = y.get()

    # Rebuild the missing columns. Flipping both axes and rolling the
    # rows by one maps index (k1, k2) to (-k1, -k2); the conjugate is
    # what makes the mirrored values equal to the true coefficients.
    mirrored = np.conj(np.fliplr(np.flipud(left)))
    if n2 % 2 == 0:
        # even: neither the k2 == 0 nor the Nyquist column is repeated
        mirrored = mirrored[:, 1:-1]
    else:
        # odd: only the k2 == 0 column is not repeated
        mirrored = mirrored[:, :-1]
    right = np.roll(mirrored, 1, axis=0)

    yout = np.hstack((left, right))
    if fftshift is not False:
        yout = np.fft.fftshift(yout)
    return yout.astype('complex128')
def Simulate(self, period, width):
    """Generate a signal for every data file and dump debugging traces.

    For each entry of ``self.DatFiles`` the ``MakeSignal`` kernel is run
    on a buffer derived from ``gpu_time``, the buffer is transformed
    with the entry's FFT plan, and a copy of the spectrum with
    randomised phases (DC bin left unchanged) is transformed back on the
    host.

    The first 10000 samples of the generated signal and of the
    phase-randomised signal are written to ``realsig.dat`` and
    ``ransig.dat``; both files are overwritten for each data file.

    :param period: converted to float64.
    :param width: converted to float64; the kernel receives
        ``(period * width) ** 2``.
    """
    period = np.float64(period)
    width = np.float64(width)

    def _save_head(filename, samples):
        # Write (index, value) rows for at most the first 10000 samples.
        # An explicit 2-column array is built because np.savetxt cannot
        # consume a zip object on Python 3.
        head = samples[:10000]
        np.savetxt(filename, np.column_stack((np.arange(len(head)), head)))

    for dat in self.DatFiles:
        # NOTE(review): subtracting 0*period presumably just yields a
        # fresh array instead of aliasing gpu_time -- confirm.
        dat.gpu_pulsar_signal = dat.gpu_time - 0*period
        self.MakeSignal(dat.gpu_pulsar_signal, period,
                        ((period*width)**2),
                        grid=(dat.Tblocks, 1),
                        block=(dat.block_size, 1, 1))

        s = dat.gpu_pulsar_signal.get()
        _save_head("realsig.dat", s)

        fft.fft(dat.gpu_pulsar_signal, dat.gpu_pulsar_fft, dat.Plan)

        # Unit-modulus factors with uniformly random phase; the first
        # bin is left untouched.
        ranPhases = np.random.uniform(0, 1, len(dat.gpu_pulsar_fft))
        CompRan = np.cos(2*np.pi*ranPhases) + 1j*np.sin(2*np.pi*ranPhases)
        CompRan[0] = 1 + 0j

        OComp = dat.gpu_pulsar_fft.get()
        NComp = OComp*CompRan

        s = np.fft.irfft(NComp)
        _save_head("ransig.dat", s)
def fft2_gpu_c2c(x, fftshift=True):
    """
    C2C FFT

    2-D complex-to-complex FFT of ``x`` on the GPU, divided by the
    number of elements ``n1 * n2``. The result therefore corresponds to
    ``numpy.fft.fft2(x)`` scaled by ``1 / (n1 * n2)``.

    :param x: 2-D numpy array; converted to complex128 if necessary.
    :param fftshift: when truthy, ``np.fft.fftshift`` is applied to the
        result.
    :return: complex128 host array with the shape of ``x``.
    """
    if x.dtype != np.complex128:
        x = x.astype(np.complex128)

    rows, cols = x.shape
    grid_shape = (rows, cols)

    dev_in = gpuarray.to_gpu(x)
    dev_out = gpuarray.empty(grid_shape, np.complex128)

    forward_plan = cu_fft.Plan(grid_shape, np.complex128, np.complex128)
    cu_fft.fft(dev_in, dev_out, forward_plan)

    # Divide by the total number of pixels in the image to get the
    # normalisation right.
    spectrum = dev_out.get() / rows / cols
    return np.fft.fftshift(spectrum) if fftshift else spectrum
def poisson_solve(self, rho):
    '''Solve the Poisson equation for the given charge distribution.

    Args:
        rho: Charge distribution (same dimensions as mesh)
    Returns:
        Phi (same dimensions as rho)
    '''
    rho = rho.astype(np.complex128)

    # Point both copy helpers at the complex copy of rho.
    rho_buffer = rho.gpudata
    self._cpyrho2tmp.set_src_device(rho_buffer)
    self._cpytmp2rho.set_dst_device(rho_buffer)

    work = self.tmpspace
    # set to 0 since it might be filled with the old potential
    work.fill(0)
    self._cpyrho2tmp()

    # Multiply with the transformed Green's function in Fourier space.
    cu_fft.fft(work, work, plan=self.plan_forward)
    spectrum_product = work * self.fgreentr
    cu_fft.ifft(spectrum_product, work, plan=self.plan_backward)

    # store the result in the rho gpuarray to save space
    self._cpytmp2rho()

    # scale (cuFFT is unscaled)
    dim = self.mesh.dimension
    fft_norm = 2**dim * self.mesh.n_nodes
    prefactor = self.mesh.volume_elem/(2**(dim-1)*np.pi*epsilon_0)

    phi = rho.real/fft_norm
    phi *= prefactor
    return phi
def cu_lpf(stimulus, dt, freq):
    """
    CUDA implementation of low-pass-filter.

    The input is transformed real-to-complex, every coefficient from the
    cut-off bin upwards is set to zero, and the spectrum is transformed
    back.

    stimulus: ndarray or GPUArray
        The input to be filtered. A host array is uploaded first; a
        GPUArray is used as is.
    dt: float
        The sampling interval of the input.
    freq: float
        The cut-off frequency of the low pass filter. The resulting bin
        index is clamped to the valid range, so a cut-off above the
        highest available bin leaves the spectrum untouched and a
        negative cut-off zeroes all of it.

    Returns the filtered signal as a host array. The inverse transform
    is called with scaling disabled.
    NOTE(review): confirm that callers expect the unscaled result.
    """
    num = len(stimulus)
    num_fft = int(num / 2 + 1)
    idtype = stimulus.dtype
    odtype = np.complex128 if idtype == np.float64 else np.complex64

    if not isinstance(stimulus, gpuarray.GPUArray):
        d_stimulus = gpuarray.to_gpu(stimulus)
    else:
        d_stimulus = stimulus

    plan = Plan(stimulus.shape, idtype, odtype)
    d_fstimulus = gpuarray.empty(num_fft, odtype)
    fft(d_stimulus, d_fstimulus, plan)

    # Index of the first coefficient to zero, kept inside
    # [0, num_fft] so the memset below can neither start before the
    # buffer nor receive a negative length.
    df = 1.0 / dt / num
    idx = int(freq // df)
    idx = min(max(idx, 0), num_fft)

    n_zeroed = num_fft - idx
    if n_zeroed > 0:
        # memset_d32 counts 32-bit words; `unit` is the number of words
        # per complex element.
        itemsize = d_fstimulus.dtype.itemsize
        unit = int(itemsize / 4)
        offset = int(d_fstimulus.gpudata) + itemsize * idx
        cuda.memset_d32(offset, 0, unit * n_zeroed)

    plan = Plan(stimulus.shape, odtype, idtype)
    d_lpf_stimulus = gpuarray.empty(num, idtype)
    ifft(d_fstimulus, d_lpf_stimulus, plan, False)

    return d_lpf_stimulus.get()
def funcfftw(F, *args, **kwargs):
    """funcfftw(F, *args, **kwargs) -> numpy.2darray

    Apply a 2D Fourier transform with the requested backend.

    Parameters
    ----------
    F : numpy.2darray
    args : options
        For the 'cufft' backend, ``args[0]`` is the plan passed to
        ``cu_fft.fft``.
    kwargs : options
        ``fft_type`` selects the backend: 'fftw' (used only if pyfftw
        was found), 'cufft' (used only if cufft was found). Anything
        else falls back to ``fft2``.
    """
    backend = kwargs.get('fft_type')

    if found_pyfftw is True and backend == 'fftw':
        pyfftw.forget_wisdom()
        transform = pyfftw.builders.fft2(F,
                                         overwrite_input=True,
                                         planner_effort='FFTW_ESTIMATE',
                                         threads=CPU_COUNT)
        return transform()

    if found_cufft is True and backend == 'cufft':
        # The GPU path works in single precision.
        dev_in = gpuarray.to_gpu(F.astype(np.complex64))
        dev_out = gpuarray.empty(F.shape, np.complex64)
        cu_fft.fft(dev_in, dev_out, args[0])
        return dev_out.get()

    return fft2(F)
def __init__(self, mesh, context=None, save_memory=True):
    '''Set up buffers, copy helpers, FFT plans and the transformed
    Green's function for the 2.5d algorithm.

    Args:
        mesh: The mesh on which the solver will operate. It must be a
            3d mesh; otherwise an error message is printed and
            initialisation stops early.
        context: stored on the instance as ``_context``.
        save_memory: Decide whether to store all slices of the
            transformed greens function (more memory but faster) or
            save 1 slice only (saves memory but slower, default)
    '''
    if (mesh.dimension != 3):
        # NOTE(review): returning here leaves a partially initialised
        # object; kept because callers may rely on it not raising.
        print('Error: Use a 3d mesh for the 2.5d algorithm!. Abort.')
        return None

    self.is_25D = True
    self.mesh = mesh
    self._context = context

    nz, ny, nx = self.mesh.shape
    mesh_distances = list(reversed(self.mesh.distances))  # dz, dy, dx

    if save_memory:
        self.fgreentr = gpuarray.empty((2*ny, 2*nx), dtype=np.complex128)
        self._solve_kernel = self._solve_kernel_slow
    else:
        self.fgreentr = gpuarray.empty((nz, 2*ny, 2*nx),
                                       dtype=np.complex128)
        self._solve_kernel = self._solve_kernel_fast
    self.tmpspace = gpuarray.zeros((nz, 2*ny, 2*nx), dtype=np.complex128)
    sizeof_complex = np.dtype(np.complex128).itemsize

    # The Green's function and the mirroring are 2d; the copies are 3d.
    self._fgreen = getattr(self, '_fgreen25d')
    self._mirror = getattr(self, '_mirror2d')
    memcpy_nd = get_Memcpy3D_d2d

    dim_args = self.mesh.shape
    # src is None because rho is not yet known
    self._cpyrho2tmp = memcpy_nd(
        src=None, dst=self.tmpspace,
        src_pitch=self.mesh.nx*sizeof_complex,
        dst_pitch=2*self.mesh.nx*sizeof_complex,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=self.mesh.ny,
        dst_height=2*self.mesh.ny)
    # dst is None because rho is not yet known
    self._cpytmp2rho = memcpy_nd(
        src=self.tmpspace, dst=None,
        src_pitch=2*self.mesh.nx*sizeof_complex,
        dst_pitch=self.mesh.nx*sizeof_complex,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=2*self.mesh.ny,
        dst_height=self.mesh.ny)

    # Coordinate arrays for the two transverse axes only: [my, mx]
    mesh_arr = [-mesh_distances[i]/2
                + np.arange(mesh.shape[i]+1) * mesh_distances[i]
                for i in [1, 2]]
    mesh_grids = np.meshgrid(*mesh_arr, indexing='ij')
    fgreen2 = self._fgreen(*mesh_grids)
    fgreen2 = self._mirror(fgreen2)

    # Batched 2d plans: one transform per slice along the first axis.
    self.plan_forward = cu_fft.Plan([2*self.mesh.ny, 2*self.mesh.nx],
                                    in_dtype=np.complex128,
                                    out_dtype=np.complex128,
                                    batch=self.mesh.nz)
    self.plan_backward = cu_fft.Plan([2*self.mesh.ny, 2*self.mesh.nx],
                                     in_dtype=np.complex128,
                                     out_dtype=np.complex128,
                                     batch=self.mesh.nz)

    if save_memory:
        # Only one transformed slice is kept.
        plan_2d = cu_fft.Plan([2*self.mesh.ny, 2*self.mesh.nx],
                              in_dtype=np.complex128,
                              out_dtype=np.complex128)
        cu_fft.fft(gpuarray.to_gpu(fgreen2), self.fgreentr, plan=plan_2d)
    else:
        # The full stack of identical slices is built only on the path
        # that uses it. Filling a preallocated array avoids the
        # np.tile temporary that ran out of memory.
        fgreen = np.empty(shape=(mesh.nz, 2*mesh.ny, 2*mesh.nx),
                          dtype=np.complex128)
        fgreen[...] = fgreen2
        cu_fft.fft(gpuarray.to_gpu(fgreen), self.fgreentr,
                   plan=self.plan_forward)
# Demo script: round-trip FFT checks on the GPU.
# NOTE(review): `gpuarray` is used below but not imported in the lines
# visible here -- presumably imported elsewhere in the file.
import numpy as np
import skcuda.fft as cu_fft

print('Testing fft/ifft..')
N = 1024
M = N//2

# Host reference: forward and inverse 2-D FFT of a random real array.
x = np.asarray(np.random.rand(N, M), np.float32)
xf = np.fft.fft2(x)
y = np.real(np.fft.ifft2(xf))

# GPU real-to-complex forward transform; the output buffer has
# M // 2 + 1 columns.
x_gpu = gpuarray.to_gpu(x)
xf_gpu = gpuarray.empty((x.shape[0], x.shape[1]//2+1), np.complex64)
plan_forward = cu_fft.Plan(x_gpu.shape, np.float32, np.complex64)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

# GPU complex-to-real inverse transform with scaling enabled, compared
# against the host round trip.
y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(x_gpu.shape, np.complex64, np.float32)
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)
print('Success status: %r' % np.allclose(y, y_gpu.get(), atol=1e-6))

print('Testing in-place fft..')
# In-place complex-to-complex round trip: input and output are the same
# device array and one plan serves both directions.
x = np.asarray(np.random.rand(N, M) + 1j * np.random.rand(N, M), np.complex64)
x_gpu = gpuarray.to_gpu(x)
plan = cu_fft.Plan(x_gpu.shape, np.complex64, np.complex64)
cu_fft.fft(x_gpu, x_gpu, plan)
cu_fft.ifft(x_gpu, x_gpu, plan, True)
def process_audio_cuda(data):
    """Run one block of samples through the GPU audio demodulation chain.

    On the first call the audio filters and CUDA state are prepared.
    The block is transformed once, the left and right channel filters
    are applied to the first ``ablocklen // 2 + 1`` coefficients, each
    channel is transformed back and passed to the ``doanglediff_mac``
    kernel, low-pass filtered in a second FFT/IFFT pass, and finally
    both channels are combined by the ``doaudioscale`` kernel.

    :param data: input samples; converted to float32 before upload.
    :return: tuple ``(output, consumed)`` where ``output`` is the scaled
        result with 256 samples removed from each end and ``consumed``
        is ``len(output) * 80 / 2``.
    """
    global cs, csa_first

    # Lazy one-time initialisation of filters and GPU resources.
    if csa_first:
        prepare_audio_filters()
        prepare_audio_cuda()
        csa_first = False

    fdata = np.float32(data)
    gpudata = gpuarray.to_gpu(fdata)

    fft.fft(gpudata, cs["fft1_out"], cs["plan1"])

    # Per-channel filtering; only the first ablocklen // 2 + 1
    # coefficients are kept for the inverse transform.
    n_keep = (ablocklen // 2) + 1
    cs["left_fft1"] = (cs["fft1_out"] * cs["filt_audio_left"])[0:n_keep]
    cs["right_fft1"] = (cs["fft1_out"] * cs["filt_audio_right"])[0:n_keep]

    fft.ifft(cs["left_fft1"], cs["fm_left"], cs["plan1i"], True)
    fft.ifft(cs["right_fft1"], cs["fm_right"], cs["plan1i"], True)

    cs["doanglediff_mac"](
        cs["left_clipped"],
        cs["fm_left"],
        np.float32((afreq_hz / 1.0 / np.pi)),
        np.float32(-SysParams["audio_lfreq"]),
        block=(1024, 1, 1),
        grid=(ablocklenk, 1),
    )
    cs["doanglediff_mac"](
        cs["right_clipped"],
        cs["fm_right"],
        np.float32((afreq_hz / 1.0 / np.pi)),
        np.float32(-SysParams["audio_rfreq"]),
        block=(1024, 1, 1),
        grid=(ablocklenk, 1),
    )

    # Second pass: low-pass filter each channel.
    fft.fft(cs["left_clipped"], cs["left_fft2"], cs["plan2"])
    fft.fft(cs["right_clipped"], cs["right_fft2"], cs["plan2"])

    cs["left_fft2"] *= cs["filt_audiolpf"]
    cs["right_fft2"] *= cs["filt_audiolpf"]

    fft.ifft(cs["left_fft2"], cs["left_out"], cs["plan2i"], True)
    fft.ifft(cs["right_fft2"], cs["right_out"], cs["plan2i"], True)

    aclip = 256
    outlen = ablocklen

    cs["doaudioscale"](
        cs["scaledout"],
        cs["left_out"],
        cs["right_out"],
        np.float32(20),
        np.float32(0),
        block=(32, 1, 1),
        grid=(outlen // 32, 1),
    )

    # Drop `aclip` samples from each end of the block.
    output = cs["scaledout"].get()[aclip:-aclip]

    return output, len(output) * 80 / 2
def _solve_kernel_fast(self):
    '''Fast solve kernel, used when save_memory is False.

    Transforms ``self.tmpspace`` in place, multiplies it with the
    stored transformed Green's function in a single operation and
    writes the inverse transform of the product back into
    ``self.tmpspace``.
    '''
    work = self.tmpspace
    cu_fft.fft(work, work, plan=self.plan_forward)
    spectrum_product = work * self.fgreentr
    cu_fft.ifft(spectrum_product, work, plan=self.plan_backward)
def execute(self):
    """Run the forward FFT from ``self.invec`` into ``self.outvec`` using ``self.plan``."""
    source, target = self.invec, self.outvec
    cu_fft.fft(source, target, self.plan)
modSquared = mod.get_function("modSquared") psiNonlinear = mod2.get_function("test") modSquared.prepare(["P", "P", "I"]) psiNonlinear.prepare("FFFPPPI") block = (16, 16, 1) grid = (64, 64) for n in np.arange(N_RUNS): start = time.time() for step in xrange(N_TIMESTEPS): # print step # Implementing split-step method # Update wavefunction and resovoir, record density cu_fft.fft(psi_gpu, psi_gpu, plan_forward) psi_gpu *= kineticFactorHalf_gpu cu_fft.ifft(psi_gpu, psi_gpu, plan_inverse, scale=True) # currentDensity_gpu = abs(psi_gpu) ** 2 # currentDensity_gpu = psi_gpu.real **2 + psi_gpu.imag ** 2 currentDensity_gpu = (psi_gpu * psi_gpu.conj()).real # modSquared.prepared_call(grid, block, psi_gpu.gpudata, # currentDensity_gpu.gpudata, 1024) # n_gpu *= cumath.exp(-gammaRdt_gpu + Rdt_gpu * currentDensity_gpu) n_gpu *= cumath.exp(misc.add(- gammaRdt_gpu, - misc.multiply(Rdt_gpu, currentDensity_gpu))) n_gpu += Pdt_gpu psi_gpu *= cumath.exp( misc.add( misc.add(misc.multiply(expFactorPolFirst_gpu, n_gpu),
def fft(invec, outvec, prec, itype, otype):
    """Forward FFT of ``invec`` into ``outvec`` on the GPU.

    The plan is obtained from ``_get_fwd_plan`` using the dtypes of the
    two vectors and the length of the input; the transform operates on
    the vectors' ``data`` attributes. ``prec``, ``itype`` and ``otype``
    are accepted but not referenced in the body.
    """
    in_dtype = invec.dtype
    out_dtype = outvec.dtype
    length = len(invec)
    cuplan = _get_fwd_plan(in_dtype, out_dtype, length)
    cu_fft.fft(invec.data, outvec.data, cuplan)