def fft_multiply_repeated(h_fft, x, cuda_dict=dict(use_cuda=False)):
    """Do FFT multiplication by a filter function (possibly using CUDA).

    Parameters
    ----------
    h_fft : 1-d array or gpuarray
        The filtering array to apply.
    x : 1-d array
        The array to filter.
    cuda_dict : dict
        Dictionary constructed using setup_cuda_multiply_repeated().

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    if not cuda_dict["use_cuda"]:
        # do the fourier-domain operations
        x = np.real(ifft(h_fft * fft(x), overwrite_x=True)).ravel()
    else:
        # do the fourier-domain operations, results in second param
        cuda_dict["x"].set(x.astype(np.float64))
        cudafft.fft(cuda_dict["x"], cuda_dict["x_fft"], cuda_dict["fft_plan"])
        cuda_multiply_inplace_c128(h_fft, cuda_dict["x_fft"])
        # If we wanted to do it locally instead of using our own kernel:
        # cuda_seg_fft.set(cuda_seg_fft.get() * h_fft)
        cudafft.ifft(cuda_dict["x_fft"], cuda_dict["x"],
                     cuda_dict["ifft_plan"], False)
        x = np.array(cuda_dict["x"].get(), dtype=x.dtype, subok=True,
                     copy=False)
    return x
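# NOTE: setup_cuda_multiply_repeated() is referenced but not shown here.
# A minimal sketch of what the CUDA branch above expects it to build --
# device buffers plus cached r2c/c2r plans for one FFT length -- might
# look like this (the helper below is an assumption, not the original):
import numpy as np
from pycuda import gpuarray
import scikits.cuda.fft as cudafft

def setup_cuda_multiply_repeated_sketch(n_fft):
    # one FFT-length worth of device storage plus cached plans, matching
    # the keys used by fft_multiply_repeated() above
    return dict(
        use_cuda=True,
        x=gpuarray.zeros(n_fft, np.float64),
        x_fft=gpuarray.zeros(n_fft // 2 + 1, np.complex128),
        fft_plan=cudafft.Plan(n_fft, np.float64, np.complex128),
        ifft_plan=cudafft.Plan(n_fft, np.complex128, np.float64),
    )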
def thunk():
    input_shape = inputs[0][0].shape

    # construct output shape
    output_shape = list(input_shape)
    # DFT of real input is symmetric, no need to store
    # redundant coefficients
    output_shape[-1] = output_shape[-1] // 2 + 1
    # extra dimension with length 2 for real/imag
    output_shape += [2]
    output_shape = tuple(output_shape)

    z = outputs[0]

    # only allocate if there is no previous allocation of the
    # right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = CudaNdarray.zeros(output_shape)

    input_pycuda = to_gpuarray(inputs[0][0])
    # I thought we'd need to change the type on output_pycuda
    # so it is complex64, but as it turns out scikits.cuda.fft
    # doesn't really care either way and treats the array as
    # if it is complex64 anyway.
    output_pycuda = to_gpuarray(z[0])

    # only initialise plan if necessary
    if plan[0] is None or plan_input_shape[0] != input_shape:
        plan_input_shape[0] = input_shape
        plan[0] = fft.Plan(input_shape[1:], np.float32, np.complex64,
                           batch=input_shape[0])

    fft.fft(input_pycuda, output_pycuda, plan[0])
def thunk():
    input_shape = inputs[0][0].shape
    output_shape = input_shape

    z = outputs[0]

    # only allocate if there is no previous allocation of the
    # right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = CudaNdarray.zeros(output_shape)

    input_pycuda = to_gpuarray(inputs[0][0])
    # I thought we'd need to change the type on output_pycuda
    # so it is complex64, but as it turns out scikits.cuda.fft
    # doesn't really care either way and treats the array as
    # if it is complex64 anyway.
    output_pycuda = to_gpuarray(z[0])

    # only initialise plan if necessary
    if plan[0] is None or plan_input_shape[0] != input_shape:
        plan_input_shape[0] = input_shape
        plan[0] = fft.Plan(input_shape[1:-1], np.complex64, np.complex64,
                           batch=input_shape[0])

    fft.fft(input_pycuda, output_pycuda, plan[0])
    compute_map[node.outputs[0]][0] = True
def gpu_r2c_fft(in1, is_gpuarray=False, store_on_gpu=False):
    """
    This function makes use of the scikits implementation of the FFT for
    GPUs to take the real-to-complex FFT.

    INPUTS:
    in1             (no default):    The array on which the FFT is to be performed.
    is_gpuarray     (default=False): Boolean specifier for whether or not input is on the gpu.
    store_on_gpu    (default=False): Boolean specifier for whether the result is to be left on the gpu or not.

    OUTPUTS:
    gpu_out1        The gpu array containing the result.
    OR
    gpu_out1.get()  The result pulled off the gpu array.
    """
    if is_gpuarray:
        gpu_in1 = in1
    else:
        gpu_in1 = gpuarray.to_gpu_async(in1.astype(np.float32))

    output_size = np.array(in1.shape)
    # r2c FFT stores only the non-redundant half of the last axis
    output_size[1] = output_size[1] // 2 + 1

    gpu_out1 = gpuarray.empty([output_size[0], output_size[1]],
                              np.complex64)
    gpu_plan = Plan(gpu_in1.shape, np.float32, np.complex64)

    fft(gpu_in1, gpu_out1, gpu_plan)

    if store_on_gpu:
        return gpu_out1
    else:
        return gpu_out1.get()
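# NOTE: a brief usage sketch for gpu_r2c_fft() above (shapes chosen for
# illustration). The plan covers both axes, so this is a 2-d r2c FFT with
# the last axis halved to N // 2 + 1 non-redundant coefficients:
import numpy as np

a = np.random.rand(512, 1024).astype(np.float32)
a_fft = gpu_r2c_fft(a)                          # (512, 513) complex64 ndarray
assert a_fft.shape == (512, 1024 // 2 + 1)
a_fft_gpu = gpu_r2c_fft(a, store_on_gpu=True)   # leave the result on the device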
def thunk():
    input_shape = inputs[0][0].shape

    # construct output shape
    output_shape = tuple(input_shape)
    # print 'FFT shapes:', input_shape, '->', output_shape
    # print 'Batch size:', input_shape[0]
    # print 'Core shape:', input_shape[1:-1]

    z = outputs[0]

    # only allocate if there is no previous allocation of the right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = CudaNdarray.zeros(output_shape)

    input_pycuda = to_gpuarray(inputs[0][0])
    # I thought we'd need to change the type on output_pycuda
    # so it is complex64, but as it turns out scikits.cuda.fft
    # doesn't really care either way and treats the array as
    # if it is complex64 anyway.
    output_pycuda = to_gpuarray(z[0])

    # only initialise plan if necessary
    if plan[0] is None or plan_input_shape[0] != input_shape:
        plan_input_shape[0] = input_shape
        plan[0] = fft.Plan(shape=input_shape[1:-1],  # exclude batch dim and complex dim
                           in_dtype=np.complex64,
                           out_dtype=np.complex64,
                           batch=input_shape[0])

    fft.fft(input_pycuda, output_pycuda, plan[0])
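# NOTE: the three thunks above share one pattern: the cuFFT plan is cached
# in a one-element list closed over by the thunk and rebuilt only when the
# input shape changes, since plan creation costs far more than the
# transform itself. A standalone sketch of that pattern (outside Theano,
# names assumed):
import numpy as np
import scikits.cuda.fft as fft

_plan = [None]        # mutable cells, as in the thunks above
_plan_shape = [None]

def batched_c2c_fft(x_gpu, out_gpu):
    # x_gpu, out_gpu: (batch, n) complex64 gpuarrays
    shape = x_gpu.shape
    if _plan[0] is None or _plan_shape[0] != shape:
        _plan_shape[0] = shape
        _plan[0] = fft.Plan(shape[1:], np.complex64, np.complex64,
                            batch=shape[0])
    fft.fft(x_gpu, out_gpu, _plan[0])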
def test_fft_float64_to_complex128(self):
    x = np.asarray(np.random.rand(self.N), np.float64)
    xf = np.fft.fft(x)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty(self.N / 2 + 1, np.complex128)
    plan = fft.Plan(x.shape, np.float64, np.complex128)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf[0:self.N / 2 + 1], xf_gpu.get(),
                       atol=atol_float64)
def test_batch_fft_float64_to_complex128_1d(self):
    x = np.asarray(np.random.rand(self.B, self.N), np.float64)
    xf = np.fft.rfft(x, axis=1)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty((self.B, self.N / 2 + 1), np.complex128)
    plan = fft.Plan(x.shape[1], np.float64, np.complex128, batch=self.B)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
def test_batch_fft_float32_to_complex64_1d(self):
    x = np.asarray(np.random.rand(self.B, self.N), np.float32)
    xf = np.fft.rfft(x, axis=1)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty((self.B, self.N / 2 + 1), np.complex64)
    plan = fft.Plan(x.shape[1], np.float32, np.complex64, batch=self.B)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
def test_fft_float32_to_complex64_2d(self):
    x = np.asarray(np.random.rand(self.N, self.M), np.float32)
    xf = np.fft.rfftn(x)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty((self.N, self.M / 2 + 1), np.complex64)
    plan = fft.Plan(x.shape, np.float32, np.complex64)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
def test_fft_float64_to_complex128_1d(self):
    x = np.asarray(np.random.rand(self.N), np.float64)
    xf = np.fft.rfftn(x)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex128)
    plan = fft.Plan(x.shape, np.float64, np.complex128)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
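# NOTE: the tests above and below alternate between np.fft.fft plus a
# slice and np.fft.rfft/rfftn as the CPU reference. For real input these
# agree, because the full DFT is conjugate-symmetric and rfft simply
# returns the first N // 2 + 1 bins:
import numpy as np

x = np.random.rand(16)
assert np.allclose(np.fft.rfft(x), np.fft.fft(x)[:16 // 2 + 1])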
def test_batch_fft_float64_to_complex128_2d(self):
    x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float64)
    xf = np.fft.rfftn(x, axes=(1, 2))
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty((self.B, self.N, self.M / 2 + 1),
                            np.complex128)
    plan = fft.Plan([self.N, self.M], np.float64, np.complex128,
                    batch=self.B)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
def rfft2(self, i, o=None, cache=True):
    shape = i.shape[:-2]
    rshape = i.shape[-2:]
    # r2c output: last axis holds only the non-redundant half
    cshape = (rshape[0], rshape[1] / 2 + 1)
    batch = np.prod(shape, dtype=np.int)
    plan = self.get_plan(cache, rshape, self.rtype, self.ctype, batch)
    if o is None:
        o = self.context.empty(shape + cshape, self.ctype)
    cu_fft.fft(i, o, plan, scale=False)
    return o
def test_multiple_streams(self):
    x = np.asarray(np.random.rand(self.N), np.float32)
    xf = np.fft.fft(x)
    y = np.asarray(np.random.rand(self.N), np.float32)
    yf = np.fft.fft(y)
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)
    xf_gpu = gpuarray.empty(self.N / 2 + 1, np.complex64)
    yf_gpu = gpuarray.empty(self.N / 2 + 1, np.complex64)
    stream0 = drv.Stream()
    stream1 = drv.Stream()
    plan1 = fft.Plan(x.shape, np.float32, np.complex64, stream=stream0)
    plan2 = fft.Plan(y.shape, np.float32, np.complex64, stream=stream1)
    fft.fft(x_gpu, xf_gpu, plan1)
    fft.fft(y_gpu, yf_gpu, plan2)
    assert np.allclose(xf[0:self.N / 2 + 1], xf_gpu.get(),
                       atol=atol_float32)
    assert np.allclose(yf[0:self.N / 2 + 1], yf_gpu.get(),
                       atol=atol_float32)
def test_multiple_streams(self):
    x = np.asarray(np.random.rand(self.N), np.float32)
    xf = np.fft.rfftn(x)
    y = np.asarray(np.random.rand(self.N), np.float32)
    yf = np.fft.rfftn(y)
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)
    xf_gpu = gpuarray.empty(self.N / 2 + 1, np.complex64)
    yf_gpu = gpuarray.empty(self.N / 2 + 1, np.complex64)
    stream0 = drv.Stream()
    stream1 = drv.Stream()
    plan1 = fft.Plan(x.shape, np.float32, np.complex64, stream=stream0)
    plan2 = fft.Plan(y.shape, np.float32, np.complex64, stream=stream1)
    fft.fft(x_gpu, xf_gpu, plan1)
    fft.fft(y_gpu, yf_gpu, plan2)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
    assert np.allclose(yf, yf_gpu.get(), atol=atol_float32)
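# NOTE: these unit tests reference fixture attributes (self.N, self.M,
# self.B) and tolerance constants defined elsewhere. A minimal harness
# that would make them runnable might look like this (the sizes and
# tolerances below are assumptions, not the originals):
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
import scikits.cuda.fft as fft

atol_float32 = 1e-6
atol_float64 = 1e-8

class TestFFT(object):
    N = 128   # 1-d transform length
    M = 64    # second dimension for the 2-d tests
    B = 10    # batch size for the batched tests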
def convol(self, data1, data2):
    self.init()
    self.ctx.push()
    plan = self.__class__.plans[self.shape]
    data1_gpu = self.__class__.data1_gpus[self.shape]
    data2_gpu = self.__class__.data2_gpus[self.shape]
    data1_gpu.set(data1.astype(numpy.complex128))
    cu_fft.fft(data1_gpu, data1_gpu, plan)
    data2_gpu.set(data2.astype(numpy.complex128))
    cu_fft.fft(data2_gpu, data2_gpu, plan)
    # data1_gpu *= data2_gpu.conj()
    self.multconj(data1_gpu, data2_gpu)
    cu_fft.ifft(data1_gpu, data1_gpu, plan, True)
    # self.ctx.synchronize()
    res = data1_gpu.get().real
    self.ctx.pop()
    return res
def correlate(self, data1, data2):
    self.init()
    with self.__class__.sem:
        self.ctx.push()
        plan = self.__class__.plans[self.shape]
        data1_gpu = self.__class__.data1_gpus[self.shape]
        data2_gpu = self.__class__.data2_gpus[self.shape]
        data1_gpu.set(data1.astype(numpy.complex128))
        cu_fft.fft(data1_gpu, data1_gpu, plan)
        data2_gpu.set(data2.astype(numpy.complex128))
        cu_fft.fft(data2_gpu, data2_gpu, plan)
        # data1_gpu *= data2_gpu.conj()
        self.multconj(data1_gpu, data2_gpu)
        cu_fft.ifft(data1_gpu, data1_gpu, plan, True)
        # self.ctx.synchronize()
        res = data1_gpu.get().real
        self.ctx.pop()
    return res
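# NOTE: multconj() above stands in for the commented-out
# data1_gpu *= data2_gpu.conj(). A plausible definition using PyCUDA's
# ElementwiseKernel (an assumption -- the original kernel is not shown):
from pycuda.elementwise import ElementwiseKernel

multconj = ElementwiseKernel(
    "pycuda::complex<double> *a, pycuda::complex<double> *b",
    "a[i] = a[i] * pycuda::conj(b[i])",   # in-place a *= conj(b)
    "multconj_kernel",
    preamble="#include <pycuda-complex.hpp>",
)
# usage: multconj(data1_gpu, data2_gpu) on complex128 gpuarrays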
def cufft(data, shape=None, inverse=False):
    if shape:
        data = pad2(data, shape)
    # reuse a cached plan for this shape if one exists
    plan = CUFFT_PLANS.get(data.shape)
    if not plan:
        plan = cu_fft.Plan(data.shape, np.complex64, np.complex64)
        CUFFT_PLANS[data.shape] = plan
    gpu_data = gpuarray.to_gpu(np.cast[np.complex64](data))
    if inverse:
        cu_fft.ifft(gpu_data, gpu_data, plan)
    else:
        cu_fft.fft(gpu_data, gpu_data, plan)
    r = gpu_data.get()
    return r
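# NOTE: usage sketch for cufft() above (pad2 and CUFFT_PLANS are assumed
# to be defined alongside it). The inverse call does not pass scale=True,
# so a forward/inverse round trip through this helper comes back
# unnormalized (off by a factor of the array size):
import numpy as np

img = np.random.rand(200, 200)
F = cufft(img, shape=(256, 256))   # forward, zero-padded to 256x256
g = cufft(F, inverse=True)         # reuses the cached plan for this shape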
def fft(invec, outvec, prec, itype, otype):
    cuplan = _get_fwd_plan(invec.dtype, outvec.dtype, len(invec))
    cu_fft.fft(invec.data, outvec.data, cuplan)
def cuda_gridvis(sub_array, f, settings, plan, chan):
    """
    Grid the visibilities parallelized by pixel.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
        by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()

    if sub_array == 1:
        Antennas = 40
    else:
        Antennas = 60

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180. / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    # determine the file type (uvfits or fitsidi)
    h_u = np.ndarray(shape=(Antennas * (Antennas - 1) // 2, 1), dtype='float64')
    h_v = np.ndarray(shape=(Antennas * (Antennas - 1) // 2, 1), dtype='float64')
    h_re = np.ndarray(shape=(Antennas * (Antennas - 1) // 2, 1), dtype='float32')
    h_im = np.ndarray(shape=(Antennas * (Antennas - 1) // 2, 1), dtype='float32')

    # get visibility data and values of UVW
    if settings['vfile'].find('.uvfits') != -1:
        freq = 3.45E11  # np.float32(f[0].header['CRVAL4'])
        light_speed = 299792458.
        good = np.where(f[0].data.data[:, 0, 0, chan, 0, 0] != 0)
        h_u = np.float32(light_speed * f[0].data.par('uu')[good])
        print "h_u", h_u.shape
        h_v = np.float32(light_speed * f[0].data.par('vv')[good])
        gcount = np.int32(np.size(h_u))
        ## assume data is unpolarized
        h_re = np.float32(f[0].data.data[good, 0, 0, chan, 0, 0])
        h_im = np.float32(f[0].data.data[good, 0, 0, chan, 0, 1])

    freq = 1702500000.
    light_speed = 299792458.  # Speed of light
    ## assume data is unpolarized
    # print chan
    print 'GCOUNT', gcount

    # h_ : host, d_ : device
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(np.array(h_u, dtype='float32'))
    d_v = gpu.to_gpu(np.array(h_v, dtype='float32'))
    d_re = gpu.to_gpu(np.array(h_re, dtype='float32'))
    d_im = gpu.to_gpu(np.array(h_im, dtype='float32'))
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

    ## define kernel parameters
    if imsize == 1024:
        blocksize2D = (8, 16, 1)
        gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                      np.int(np.ceil(1. * nx / blocksize2D[1])))
        blocksizeF2D = (16, 16, 1)
        gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                       np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
        blocksize1D = (256, 1, 1)
    else:
        blocksize2D = (16, 32, 1)
        gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                      np.int(np.ceil(1. * nx / blocksize2D[1])))
        blocksizeF2D = (32, 32, 1)
        gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                       np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
        blocksize1D = (512, 1, 1)
    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #  - shared memory solution? I tried...
    #  - better coalesced memory access? I tried...
    #  - reorganizing and indexing UV data beforehand?
    #    (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #  - storing V(u,v) in texture memory?
    #  Each pixel in the uv plane loops over the data and checks whether it
    #  falls inside the convolution support. This kernel also calculates the
    #  point spread function and the local sampling from the data (for
    #  applying the weights later).
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du,
                       gcount, umax, vmax,
                       block=blocksize2D, grid=gridsize2D)

    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
def cuda_gridvis(settings, plan):
    """
    Grid the visibilities parallelized by pixel.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
        by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180 / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    ## grab data
    f = pyfits.open(settings['vfile'])

    # determine the file type (uvfits or fitsidi)
    if settings['vfile'].find('.fitsidi') != -1:
        ## quickly figure out what data is not flagged
        freq = 3.45E11  # np.float32(f[7].header['CRVAL3'])
        # good = np.where(f[0].data.data[:,0,0,0,0,0,0] != 0)
        # h_u = np.float32(freq*f[0].data.par('uu')[good])
        # h_v = np.float32(freq*f[0].data.par('vv')[good])
        light_speed = 299792458.  # Speed of light
        h_u = np.ndarray(shape=(780, 1), dtype='float64')
        h_v = np.ndarray(shape=(780, 1), dtype='float64')
        h_re = np.ndarray(shape=(780, 1), dtype='float32')
        h_im = np.ndarray(shape=(780, 1), dtype='float32')
        h_u = np.float64(light_speed * f[0].data[:].UU)
        h_v = np.float64(light_speed * f[0].data[:].VV)
        for bl in range(0, 780):
            # gcount += np.int32(np.size(h_u[bl]))
            ## assume data is unpolarized
            # h_re = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,0]+f[0].data.data[good,0,0,0,0,1,0]))
            # h_im = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,1]+f[0].data.data[good,0,0,0,0,1,1]))
            h_re[bl] = np.float32(f[0].data[:].data[bl][0][0][0][0][0][0])
            h_im[bl] = np.float32(f[0].data[:].data[bl][0][0][0][0][0][1])
        ## make GPU arrays
        h_u = np.float32(h_u.ravel())
        h_v = np.float32(h_v.ravel())
        gcount = np.int32(np.size(h_u))
        # gcount = len(gcount.ravel())
        h_re = np.float32(h_re.ravel())
        h_im = np.float32(h_im.ravel())
        print len(h_re), len(h_im)
    elif settings['vfile'].find('.uvfits') != -1:
        freq = 3.45E11  # np.float32(f[0].header['CRVAL4'])
        light_speed = 299792458.
        good = np.where(f[0].data.data[:, 0, 0, 0, 0, 0] != 0)
        h_u = np.float32(light_speed * f[0].data.par('uu')[good])
        h_v = np.float32(light_speed * f[0].data.par('vv')[good])
        gcount = np.int32(np.size(h_u))
        ## assume data is unpolarized
        h_re = np.float32(f[0].data.data[good, 0, 0, 0, 0, 0])
        h_im = np.float32(f[0].data.data[good, 0, 0, 0, 0, 1])
        print h_u

    # h_ : host, d_ : device
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(h_u)
    d_v = gpu.to_gpu(h_v)
    d_re = gpu.to_gpu(h_re)
    d_im = gpu.to_gpu(h_im)
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

    ## define kernel parameters
    blocksize2D = (8, 16, 1)
    gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                  np.int(np.ceil(1. * nx / blocksize2D[1])))
    blocksizeF2D = (16, 16, 1)
    gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                   np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
    blocksize1D = (256, 1, 1)
    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #  - shared memory solution? I tried...
    #  - better coalesced memory access? I tried...
    #  - reorganizing and indexing UV data beforehand?
    #    (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #  - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du,
                       gcount, umax, vmax,
                       block=blocksize2D, grid=gridsize2D)

    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
def sample_defrost_gpu(lat, func, gamma, m2_eff):
    """Calculates a sample of random values in the lattice.

    lat = Lattice
    func = name of Cuda kernel
    n = size of cubic lattice
    gamma = -0.25 or +0.25
    m2_eff = effective mass

    This uses FFTW for the 1-d kernel transform and CuFFT for the 3-d
    lattice transforms.
    """
    import scikits.cuda.fft as fft
    import fftw3

    # Various constants:
    mpl = lat.mpl
    n = lat.n
    nn = lat.nn
    os = 16
    nos = n * pow(os, 2)
    dk = lat.dk
    dx = lat.dx
    dkos = dk / (2. * os)
    dxos = dx / os
    kcut = nn * dk / 2.0
    norm = 0.5 / (math.sqrt(2 * pi * dk**3.) * mpl) * (dkos / dxos)

    ker = np.empty(nos, dtype=lat.prec_real)
    fft1 = fftw3.Plan(ker, ker, direction='forward', flags=['measure'],
                      realtypes=['realodd 10'])
    for k in xrange(nos):
        kk = (k + 0.5) * dkos
        ker[k] = kk * (kk**2. + m2_eff)**gamma * math.exp(-(kk / kcut)**2.)
    fft1.execute()
    fftw3.destroy_plan(fft1)
    for k in xrange(nos):
        ker[k] = norm * ker[k] / (k + 1)

    Fk_gpu = gpuarray.zeros((n / 2 + 1, n, n), dtype=lat.prec_complex)
    ker_gpu = gpuarray.to_gpu(ker)
    tmp_gpu = gpuarray.zeros((n, n, n), dtype=lat.prec_real)

    plan = fft.Plan(tmp_gpu.shape, lat.prec_real, lat.prec_complex)
    plan2 = fft.Plan(tmp_gpu.shape, lat.prec_complex, lat.prec_real)

    func(tmp_gpu, ker_gpu, np.uint32(nn), np.float64(os),
         np.uint32(lat.dimx), np.uint32(lat.dimy), np.uint32(lat.dimz),
         block=lat.cuda_block_1, grid=lat.cuda_grid)

    fft.fft(tmp_gpu, Fk_gpu, plan)

    if lat.test == True:
        print 'Testing mode on! Set testQ to False to disable this.\n'
        np.random.seed(1)

    rr1 = (np.random.normal(size=Fk_gpu.shape) +
           np.random.normal(size=Fk_gpu.shape) * 1j)
    Fk = Fk_gpu.get()
    Fk *= rr1
    Fk_gpu = gpuarray.to_gpu(Fk)

    fft.ifft(Fk_gpu, tmp_gpu, plan2)
    res = (tmp_gpu.get()).astype(lat.prec_real)
    res *= 1. / lat.VL

    return res
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
import scikits.cuda.fft as cu_fft

print 'Testing fft/ifft..'
N = 4096 * 16
batch_size = 16

x = np.asarray(np.random.rand(batch_size, N), np.float32)
xf = np.fft.fft(x)
y = np.real(np.fft.ifft(xf))

x_gpu = gpuarray.to_gpu(x)
xf_gpu = gpuarray.empty((batch_size, N / 2 + 1), np.complex64)
plan_forward = cu_fft.Plan(N, np.float32, np.complex64, batch_size)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(N, np.complex64, np.float32, batch_size)
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)

print 'Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6)

print 'Testing in-place fft..'
x = np.asarray(np.random.rand(batch_size, N) +
               1j * np.random.rand(batch_size, N), np.complex64)
x_gpu = gpuarray.to_gpu(x)
plan = cu_fft.Plan(N, np.complex64, np.complex64, batch_size)
cu_fft.fft(x_gpu, x_gpu, plan)
def fft_resample(x, W, new_len, npad, to_remove,
                 cuda_dict=dict(use_cuda=False)):
    """Do FFT resampling with a filter function (possibly using CUDA).

    Parameters
    ----------
    x : 1-d array
        The array to resample.
    W : 1-d array or gpuarray
        The filtering function to apply.
    new_len : int
        The size of the output array (before removing padding).
    npad : int
        Amount of padding to apply before resampling.
    to_remove : int
        Number of samples to remove after resampling.
    cuda_dict : dict
        Dictionary constructed using setup_cuda_multiply_repeated().

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    # add some padding at beginning and end to make this work a little cleaner
    x = _smart_pad(x, npad)
    old_len = len(x)
    shorter = new_len < old_len
    if not cuda_dict['use_cuda']:
        N = int(min(new_len, old_len))
        sl_1 = slice((N + 1) // 2)
        y_fft = np.zeros(new_len, np.complex128)
        x_fft = fft(x).ravel() * W
        y_fft[sl_1] = x_fft[sl_1]
        sl_2 = slice(-(N - 1) // 2, None)
        y_fft[sl_2] = x_fft[sl_2]
        y = np.real(ifft(y_fft, overwrite_x=True)).ravel()
    else:
        cuda_dict['x'].set(
            np.concatenate((x, np.zeros(max(new_len - old_len, 0),
                                        x.dtype))))
        # do the fourier-domain operations, results put in second param
        cudafft.fft(cuda_dict['x'], cuda_dict['x_fft'],
                    cuda_dict['fft_plan'])
        cuda_multiply_inplace_c128(W, cuda_dict['x_fft'])
        # This is not straightforward, but because x_fft and y_fft share
        # the same data (and only one half of the full DFT is stored), we
        # don't have to transfer the slice like we do in scipy. All we
        # need to worry about is the Nyquist component, either halving it
        # or taking just the real component...
        use_len = new_len if shorter else old_len
        func = cuda_real_c128 if shorter else cuda_halve_c128
        if use_len % 2 == 0:
            nyq = int((use_len - (use_len % 2)) // 2)
            func(cuda_dict['x_fft'], slice=slice(nyq, nyq + 1))
        cudafft.ifft(cuda_dict['x_fft'], cuda_dict['x'],
                     cuda_dict['ifft_plan'], scale=False)
        y = cuda_dict['x'].get()[:new_len if shorter else None]

    # now let's trim it back to the correct size (if there was padding)
    if to_remove > 0:
        keep = np.ones((new_len), dtype='bool')
        keep[:to_remove] = False
        keep[-to_remove:] = False
        y = np.compress(keep, y)
    return y
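# NOTE: the CPU branch above implements standard Fourier-domain
# resampling: keep the lowest min(new_len, old_len) frequency bins and
# inverse-transform at the new length. A stripped-down NumPy version of
# just that core (without the filter W, the padding, or the Nyquist
# adjustment of the CUDA branch):
import numpy as np

def fft_resample_core(x, new_len):
    N = min(new_len, len(x))
    x_fft = np.fft.fft(x)
    y_fft = np.zeros(new_len, np.complex128)
    y_fft[:(N + 1) // 2] = x_fft[:(N + 1) // 2]    # DC + positive freqs
    y_fft[-(N - 1) // 2:] = x_fft[-(N - 1) // 2:]  # negative freqs
    return np.real(np.fft.ifft(y_fft))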
def cuda_gridvis(csrh_sun, csrh_satellite, settings, plan, chan):
    """
    Grid the visibilities parallelized by pixel.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
        by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()
    # f = pyfits.open(settings['vfile'])

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180. / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    ## grab data
    Data = np.ndarray(shape=(44, 44, 16), dtype=complex)
    UVW = np.ndarray(shape=(780, 1), dtype='float64')
    Data, UVW = visibility(csrh_sun, csrh_satellite, chan)
    print "UVW*****\n", UVW

    # determine the file type (uvfits or fitsidi)
    h_uu = np.ndarray(shape=(780), dtype='float64')
    h_vv = np.ndarray(shape=(780), dtype='float64')
    h_rere = np.ndarray(shape=(780), dtype='float32')
    h_imim = np.ndarray(shape=(780), dtype='float32')

    freq = 1702500000.
    light_speed = 299792458.  # Speed of light

    ## quickly figure out what data is not flagged
    # np.float32(f[7].header['CRVAL3'])
    # good = np.where(f[0].data.data[:,0,0,0,0,0,0] != 0)
    # h_u = np.float32(freq*f[0].data.par('uu')[good])
    # h_v = np.float32(freq*f[0].data.par('vv')[good])
    blen = 0
    for antenna1 in range(0, 39):
        for antenna2 in range(antenna1 + 1, 40):
            h_rere[blen] = Data[antenna1][antenna2][chan].real
            h_imim[blen] = Data[antenna1][antenna2][chan].imag
            h_uu[blen] = freq * UVW[blen][0]
            h_vv[blen] = freq * UVW[blen][1]
            blen += 1
    print "h_u", h_uu
    # h_u = np.float32(h_u.ravel())
    # h_v = np.float32(h_v.ravel())
    gcount = np.int32(np.size(h_uu))
    # gcount = len(gcount.ravel())
    # h_re = np.float32(h_re.ravel())
    # h_im = np.float32(h_im.ravel())
    # freq = 3.45E11  # np.float32(f[0].header['CRVAL4'])

    blen = 0
    bl_order = np.ndarray(shape=(780, 2), dtype=int)
    good = []
    for border1 in range(0, 39):
        for border2 in range(border1 + 1, 40):
            bl_order[blen][0] = border1
            bl_order[blen][1] = border2
            blen = blen + 1

    blen = 0
    h_u = []
    h_v = []
    h_re = []
    h_im = []
    Flag_Ant = [0, 4, 8, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22, 23,
                24, 25, 26, 28, 29, 37, 38, 39]
    for blen in range(0, 780):
        if (bl_order[blen][0] not in Flag_Ant) and (bl_order[blen][1] not in Flag_Ant):
            good.append(blen)
            h_u.append(h_uu[blen])
            h_v.append(h_vv[blen])
            h_re.append(h_rere[blen])
            h_im.append(h_imim[blen])
    # print "Good:", good

    gcount = np.int32(np.size(h_u))
    ## assume data is unpolarized
    # print chan
    print 'GCOUNT', gcount
    # print "H_U", h_u
    # print "H_V", h_v
    # print h_re
    # print h_im

    # h_ : host, d_ : device
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(np.array(h_uu, dtype='float32'))
    d_v = gpu.to_gpu(np.array(h_vv, dtype='float32'))
    d_re = gpu.to_gpu(np.array(h_rere, dtype='float32'))
    d_im = gpu.to_gpu(np.array(h_imim, dtype='float32'))
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

    ## define kernel parameters
    if imsize == 1024:
        blocksize2D = (8, 16, 1)
        gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                      np.int(np.ceil(1. * nx / blocksize2D[1])))
        blocksizeF2D = (16, 16, 1)
        gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                       np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
        blocksize1D = (256, 1, 1)
    else:
        blocksize2D = (16, 32, 1)
        gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                      np.int(np.ceil(1. * nx / blocksize2D[1])))
        blocksizeF2D = (32, 32, 1)
        gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                       np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
        blocksize1D = (512, 1, 1)
    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #  - shared memory solution? I tried...
    #  - better coalesced memory access? I tried...
    #  - reorganizing and indexing UV data beforehand?
    #    (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #  - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du,
                       gcount, umax, vmax,
                       block=blocksize2D, grid=gridsize2D)

    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
ii = 0
tmpimg = numpy.zeros((n, m, k), dtype=numpy.float32)
ln = sq + 5
mags = mag[indexp].sum()
del indexp
s = 3
N2 = int(N * 0.7)
N3 = int(N * 0.7)
gpu_data.set(sobject.astype(numpy.complex64))
pycuda.driver.memcpy_dtod(gpu_last.gpudata, gpu_data.gpudata,
                          gpu_data.nbytes)
gpu_intensity.set(mag)
gpu_mask.set(sobm)
# print real_space.nbytes
for i in range(N):
    t0 = time()
    # one iteration: project onto the measured Fourier magnitudes,
    # transform back, then apply the real-space support constraint
    cu_fft.fft(gpu_data, gpu_data, plan)
    constrains_fourier(gpu_data, gpu_intensity)
    cu_fft.ifft(gpu_data, gpu_data, plan, True)
    constrains_real(gpu_data, gpu_last, gpu_mask, beta)
    pycuda.driver.memcpy_dtod(gpu_last.gpudata, gpu_data.gpudata,
                              gpu_data.nbytes)
    t1 = time()
    ctx.synchronize()
    t2 = time()
    print("With CUDA, the full loop took %.3fs but after sync %.3fs"
          % (t1 - t0, t2 - t0))
del tmpimg
print "it took", time() - time0, N / (time() - time0)
print "smallest error", serr, "number", nerr
def cuda_gridvis(settings, plan):
    """
    Grid the visibilities parallelized by pixel.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
        by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180 / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    ## grab data
    f = pyfits.open(settings['vfile'])

    ## quickly figure out what data is not flagged
    freq = np.float32(f[0].header['CRVAL4'])
    good = np.where(f[0].data.data[:, 0, 0, 0, 0, 0, 0] != 0)
    h_u = np.float32(freq * f[0].data.par('uu')[good])
    h_v = np.float32(freq * f[0].data.par('vv')[good])
    gcount = np.int32(np.size(h_u))
    ## assume data is unpolarized
    h_re = np.float32(0.5 * (f[0].data.data[good, 0, 0, 0, 0, 0, 0] +
                             f[0].data.data[good, 0, 0, 0, 0, 1, 0]))
    h_im = np.float32(0.5 * (f[0].data.data[good, 0, 0, 0, 0, 0, 1] +
                             f[0].data.data[good, 0, 0, 0, 0, 1, 1]))

    ## make GPU arrays
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(h_u)
    d_v = gpu.to_gpu(h_v)
    d_re = gpu.to_gpu(h_re)
    d_im = gpu.to_gpu(h_im)
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

    ## define kernel parameters
    blocksize2D = (8, 16, 1)
    gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                  np.int(np.ceil(1. * nx / blocksize2D[1])))
    blocksizeF2D = (16, 16, 1)
    gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                   np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
    blocksize1D = (256, 1, 1)
    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #  - shared memory solution? I tried...
    #  - better coalesced memory access? I tried...
    #  - reorganizing and indexing UV data beforehand?
    #    (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #  - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du,
                       gcount, umax, vmax,
                       block=blocksize2D, grid=gridsize2D)

    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
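# NOTE: every cuda_gridvis() variant above receives an already-built
# cuFFT plan and applies it twice to the full nx x nx complex grid (once
# for the beam, once for the map), so the caller presumably constructs
# something like this (imsize value assumed):
import numpy as np
import scikits.cuda.fft as fft

imsize = 1024                  # matches settings['imsize']
nx = np.int(2 * imsize)
plan = fft.Plan((nx, nx), np.complex64, np.complex64)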
### Initial Computation

# Compute image OTF
size_2D = [N, M]
fx = np.int32([[1, -1]])
fy = np.int32([[1], [-1]])
otfFx = L0_helpers.psf2otf(fx, size_2D)
otfFy = L0_helpers.psf2otf(fy, size_2D)

# Compute MTF
otfFx_d = gpu.to_gpu(otfFx)
otfFy_d = gpu.to_gpu(otfFy)
mtf_kernel(MTF_d, otfFx_d, otfFy_d, Nx, Ny,
           block=blocksize, grid=gridsize)

# Compute Fourier transform of original image
cu_fft.fft(FFTiR_d, FIR_d, plan)
cu_fft.fft(FFTiG_d, FIG_d, plan)
cu_fft.fft(FFTiB_d, FIB_d, plan)

### Iteration settings
beta_max = 1e5
beta = 2 * _lambda
iteration = 0

# Done initializing
init_time = time.time()

### Iterate until desired convergence in similarity
while beta < beta_max:
    if verbose:
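# NOTE: L0_helpers.psf2otf() is not shown. The standard construction pads
# the point-spread function to the image size and circularly shifts it so
# its center lands at index (0, 0) before transforming, which makes the
# OTF correspond to zero-phase filtering. A plausible NumPy equivalent:
import numpy as np

def psf2otf(psf, shape):
    # pad the PSF to the full image size...
    otf = np.zeros(shape)
    otf[:psf.shape[0], :psf.shape[1]] = psf
    # ...then circularly shift so the kernel center sits at (0, 0)
    for axis, s in enumerate(psf.shape):
        otf = np.roll(otf, -(s // 2), axis=axis)
    return np.fft.fft2(otf)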