Example #1
0
def fft_multiply_repeated(h_fft, x, cuda_dict=dict(use_cuda=False)):
    """Filter ``x`` by multiplying its FFT with ``h_fft`` (optionally on CUDA)

    Parameters
    ----------
    h_fft : 1-d array or gpuarray
        Frequency-domain filter that is multiplied with the FFT of ``x``.
    x : 1-d array
        The signal to filter.
    cuda_dict : dict
        Dictionary constructed using setup_cuda_multiply_repeated(). Only
        the ``"use_cuda"`` entry is read on the CPU path; the CUDA path
        also reads ``"x"``, ``"x_fft"``, ``"fft_plan"`` and ``"ifft_plan"``.

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    if cuda_dict["use_cuda"]:
        # Upload the signal as float64 into the pre-allocated device buffer.
        cuda_dict["x"].set(x.astype(np.float64))
        # Forward transform: the spectrum lands in the second argument.
        cudafft.fft(cuda_dict["x"], cuda_dict["x_fft"], cuda_dict["fft_plan"])
        # Apply the filter to the spectrum with the custom in-place kernel.
        # (A host-side alternative would be
        #  cuda_seg_fft.set(cuda_seg_fft.get() * h_fft).)
        cuda_multiply_inplace_c128(h_fft, cuda_dict["x_fft"])
        # Inverse transform back into the real buffer; the trailing False is
        # presumably the "scale" flag of the CUDA ifft wrapper.
        cudafft.ifft(cuda_dict["x_fft"], cuda_dict["x"], cuda_dict["ifft_plan"], False)
        # Download the result and give it the dtype of the input.
        return np.array(cuda_dict["x"].get(), dtype=x.dtype, subok=True, copy=False)

    # CPU path: multiply in the Fourier domain and keep the real part.
    spectrum = h_fft * fft(x)
    filtered = ifft(spectrum, overwrite_x=True)
    return np.real(filtered).ravel()
Example #2
0
        def thunk():
            """Run a batched real-to-complex FFT of ``inputs[0][0]`` on the GPU.

            The result is written into the storage cell ``outputs[0]``. The
            output buffer and the FFT plan (kept in the enclosing ``plan`` /
            ``plan_input_shape`` cells) are reused between calls as long as
            the input shape does not change.
            """
            input_shape = inputs[0][0].shape

            # The DFT of real input is symmetric, so only n // 2 + 1
            # coefficients are stored along the last axis; an extra
            # trailing axis of length 2 holds the real/imag parts.
            half_spectrum = input_shape[-1] // 2 + 1
            output_shape = tuple(input_shape[:-1]) + (half_spectrum, 2)

            out_cell = outputs[0]

            # Keep the previous allocation when it already has the right size.
            reusable = (out_cell[0] is not None
                        and out_cell[0].shape == output_shape)
            if not reusable:
                out_cell[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # Original author's note: the dtype of the output gpuarray is not
            # changed to complex64 because scikits.cuda.fft treats the buffer
            # as complex64 regardless.
            output_pycuda = to_gpuarray(out_cell[0])

            # Build a new plan only on first use or when the shape changed.
            plan_is_stale = (plan[0] is None
                             or plan_input_shape[0] != input_shape)
            if plan_is_stale:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:], np.float32, np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
Example #3
0
        def thunk():
            """Run a batched complex-to-complex FFT of ``inputs[0][0]`` on the GPU.

            The result is written into the storage cell ``outputs[0]`` (same
            shape as the input) and the node's output is marked as computed
            in ``compute_map``. The output buffer and the FFT plan are reused
            between calls while the input shape stays the same.
            """
            input_shape = inputs[0][0].shape
            # Complex-to-complex: the output has exactly the input's shape.
            output_shape = input_shape

            out_cell = outputs[0]

            # Keep the previous allocation when it already has the right size.
            reusable = (out_cell[0] is not None
                        and out_cell[0].shape == output_shape)
            if not reusable:
                out_cell[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # Original author's note: the dtype of the output gpuarray is not
            # changed to complex64 because scikits.cuda.fft treats the buffer
            # as complex64 regardless.
            output_pycuda = to_gpuarray(out_cell[0])

            # Build a new plan only on first use or when the shape changed.
            # The plan covers the axes between the first (batch) axis and the
            # last axis (presumably the real/imag pair -- confirm).
            plan_is_stale = (plan[0] is None
                             or plan_input_shape[0] != input_shape)
            if plan_is_stale:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:-1], np.complex64, np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
            compute_map[node.outputs[0]][0] = True
Example #4
0
def gpu_r2c_fft(in1, is_gpuarray=False, store_on_gpu=False):
    """
    This function makes use of the scikits implementation of the FFT for GPUs to take the real to complex FFT.

    INPUTS:
    in1             (no default):       The array on which the FFT is to be performed.
    is_gpuarray     (default=False):    Boolean specifier for whether or not input is on the gpu.
    store_on_gpu    (default=False):    Boolean specifier for whether the result is to be left on the gpu or not.

    OUTPUTS:
    gpu_out1                            The gpu array containing the result.
    OR
    gpu_out1.get()                      The result from the gpu array.
    """

    if is_gpuarray:
        gpu_in1 = in1
    else:
        # Host input is converted to float32 before the upload.
        gpu_in1 = gpuarray.to_gpu_async(in1.astype(np.float32))

    # A real-to-complex transform keeps only n // 2 + 1 coefficients along the
    # last axis; all leading axes keep their length. Integer arithmetic is
    # used so the shape never passes through a float.
    in_shape = tuple(int(dim) for dim in in1.shape)
    output_shape = in_shape[:-1] + (in_shape[-1] // 2 + 1,)

    gpu_out1 = gpuarray.empty(output_shape, np.complex64)
    gpu_plan = Plan(gpu_in1.shape, np.float32, np.complex64)
    fft(gpu_in1, gpu_out1, gpu_plan)

    if store_on_gpu:
        return gpu_out1
    else:
        return gpu_out1.get()
Example #5
0
        def thunk():
            """Run a batched complex-to-complex FFT of ``inputs[0][0]`` on the GPU.

            The result is written into the storage cell ``outputs[0]`` (same
            shape as the input). The output buffer and the FFT plan are
            reused between calls while the input shape stays the same.
            """
            input_shape = inputs[0][0].shape

            # Complex-to-complex: the output has the input's shape.
            output_shape = tuple(input_shape)

            out_cell = outputs[0]

            # Keep the previous allocation when it already has the right size.
            reusable = (out_cell[0] is not None
                        and out_cell[0].shape == output_shape)
            if not reusable:
                out_cell[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # Original author's note: the dtype of the output gpuarray is not
            # changed to complex64 because scikits.cuda.fft treats the buffer
            # as complex64 regardless.
            output_pycuda = to_gpuarray(out_cell[0])

            # Build a new plan only on first use or when the shape changed.
            plan_is_stale = (plan[0] is None
                             or plan_input_shape[0] != input_shape)
            if plan_is_stale:
                plan_input_shape[0] = input_shape
                # The transform shape excludes the batch and complex dims.
                core_shape = input_shape[1:-1]
                plan[0] = fft.Plan(shape=core_shape,
                                   in_dtype=np.complex64,
                                   out_dtype=np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
Example #6
0
 def test_fft_float64_to_complex128(self):
     """GPU float64 -> complex128 FFT matches the first N // 2 + 1 bins of np.fft.fft."""
     x = np.asarray(np.random.rand(self.N), np.float64)
     xf = np.fft.fft(x)
     x_gpu = gpuarray.to_gpu(x)
     # Floor division keeps the length an integer under true division too.
     n_out = self.N // 2 + 1
     xf_gpu = gpuarray.empty(n_out, np.complex128)
     plan = fft.Plan(x.shape, np.float64, np.complex128)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf[0:n_out], xf_gpu.get(), atol=atol_float64)
Example #7
0
 def test_batch_fft_float64_to_complex128_1d(self):
     """Batched 1-d GPU float64 -> complex128 FFT matches np.fft.rfft along axis 1."""
     x = np.asarray(np.random.rand(self.B, self.N), np.float64)
     xf = np.fft.rfft(x, axis=1)
     x_gpu = gpuarray.to_gpu(x)
     # Floor division keeps the shape entry an integer under true division too.
     xf_gpu = gpuarray.empty((self.B, self.N // 2 + 1), np.complex128)
     plan = fft.Plan(x.shape[1], np.float64, np.complex128, batch=self.B)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
Example #8
0
 def test_batch_fft_float32_to_complex64_1d(self):
     """Batched 1-d GPU float32 -> complex64 FFT matches np.fft.rfft along axis 1."""
     x = np.asarray(np.random.rand(self.B, self.N), np.float32)
     xf = np.fft.rfft(x, axis=1)
     x_gpu = gpuarray.to_gpu(x)
     # Floor division keeps the shape entry an integer under true division too.
     xf_gpu = gpuarray.empty((self.B, self.N // 2 + 1), np.complex64)
     plan = fft.Plan(x.shape[1], np.float32, np.complex64, batch=self.B)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
Example #9
0
 def test_fft_float32_to_complex64_2d(self):
     """2-d GPU float32 -> complex64 FFT matches np.fft.rfftn."""
     x = np.asarray(np.random.rand(self.N, self.M), np.float32)
     xf = np.fft.rfftn(x)
     x_gpu = gpuarray.to_gpu(x)
     # Floor division keeps the shape entry an integer under true division too.
     xf_gpu = gpuarray.empty((self.N, self.M // 2 + 1), np.complex64)
     plan = fft.Plan(x.shape, np.float32, np.complex64)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
Example #10
0
 def test_fft_float32_to_complex64_2d(self):
     """2-d GPU float32 -> complex64 FFT matches np.fft.rfftn."""
     x = np.asarray(np.random.rand(self.N, self.M), np.float32)
     xf = np.fft.rfftn(x)
     x_gpu = gpuarray.to_gpu(x)
     # Floor division keeps the shape entry an integer under true division too.
     xf_gpu = gpuarray.empty((self.N, self.M // 2 + 1), np.complex64)
     plan = fft.Plan(x.shape, np.float32, np.complex64)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
Example #11
0
 def test_fft_float64_to_complex128_1d(self):
     """1-d GPU float64 -> complex128 FFT matches np.fft.rfftn."""
     host_in = np.asarray(np.random.rand(self.N), np.float64)
     expected = np.fft.rfftn(host_in)
     dev_in = gpuarray.to_gpu(host_in)
     # A real-to-complex transform yields N // 2 + 1 coefficients.
     n_coeffs = self.N // 2 + 1
     dev_out = gpuarray.empty(n_coeffs, np.complex128)
     r2c_plan = fft.Plan(host_in.shape, np.float64, np.complex128)
     fft.fft(dev_in, dev_out, r2c_plan)
     result = dev_out.get()
     assert np.allclose(expected, result, atol=atol_float64)
Example #12
0
 def test_batch_fft_float64_to_complex128_2d(self):
     """Batched 2-d GPU float64 -> complex128 FFT matches np.fft.rfftn over axes (1, 2)."""
     x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float64)
     xf = np.fft.rfftn(x, axes=(1, 2))
     x_gpu = gpuarray.to_gpu(x)
     # Floor division keeps the shape entry an integer under true division too.
     xf_gpu = gpuarray.empty((self.B, self.N, self.M // 2 + 1), np.complex128)
     plan = fft.Plan([self.N, self.M], np.float64, np.complex128, batch=self.B)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
Example #13
0
 def rfft2(self, i, o=None, cache=True):
     """Batched 2-d real-to-complex FFT over the last two axes of ``i``.

     All leading axes of ``i`` are treated as one batch. When ``o`` is None
     an output array of shape ``leading + (rows, cols // 2 + 1)`` and type
     ``self.ctype`` is allocated through ``self.context.empty``. The plan is
     obtained from ``self.get_plan`` (``cache`` is forwarded to it). The
     transform is run with ``scale=False`` and the output array is returned.
     """
     shape = i.shape[:-2]
     rshape = i.shape[-2:]
     # Floor division keeps the size an integer under true division too.
     cshape = (rshape[0], rshape[1] // 2 + 1)
     # np.int was only an alias of the builtin int and has been removed
     # from NumPy, so the builtin is used directly.
     batch = np.prod(shape, dtype=int)
     plan = self.get_plan(cache, rshape, self.rtype, self.ctype, batch)
     if o is None:
         o = self.context.empty(shape + cshape, self.ctype)
     cu_fft.fft(i, o, plan, scale=False)
     return o
Example #14
0
 def rfft2(self, i, o=None, cache=True):
     """Batched 2-d real-to-complex FFT over the last two axes of ``i``.

     All leading axes of ``i`` are treated as one batch. When ``o`` is None
     an output array of shape ``leading + (rows, cols // 2 + 1)`` and type
     ``self.ctype`` is allocated through ``self.context.empty``. The plan is
     obtained from ``self.get_plan`` (``cache`` is forwarded to it). The
     transform is run with ``scale=False`` and the output array is returned.
     """
     shape = i.shape[:-2]
     rshape = i.shape[-2:]
     # Floor division keeps the size an integer under true division too.
     cshape = (rshape[0], rshape[1] // 2 + 1)
     # np.int was only an alias of the builtin int and has been removed
     # from NumPy, so the builtin is used directly.
     batch = np.prod(shape, dtype=int)
     plan = self.get_plan(cache, rshape, self.rtype, self.ctype, batch)
     if o is None:
         o = self.context.empty(shape + cshape, self.ctype)
     cu_fft.fft(i, o, plan, scale=False)
     return o
Example #15
0
 def test_batch_fft_float64_to_complex128_2d(self):
     """Batched 2-d GPU float64 -> complex128 FFT matches np.fft.rfftn over axes (1, 2)."""
     x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float64)
     xf = np.fft.rfftn(x, axes=(1, 2))
     x_gpu = gpuarray.to_gpu(x)
     # Floor division keeps the shape entry an integer under true division too.
     xf_gpu = gpuarray.empty((self.B, self.N, self.M // 2 + 1),
                             np.complex128)
     plan = fft.Plan([self.N, self.M],
                     np.float64,
                     np.complex128,
                     batch=self.B)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
Example #16
0
 def test_multiple_streams(self):
     """Two float32 -> complex64 FFTs planned on separate streams both match np.fft.fft."""
     x = np.asarray(np.random.rand(self.N), np.float32)
     xf = np.fft.fft(x)
     y = np.asarray(np.random.rand(self.N), np.float32)
     yf = np.fft.fft(y)
     x_gpu = gpuarray.to_gpu(x)
     y_gpu = gpuarray.to_gpu(y)
     # Floor division keeps the length (and the slice bound below) an
     # integer under true division too.
     n_out = self.N // 2 + 1
     xf_gpu = gpuarray.empty(n_out, np.complex64)
     yf_gpu = gpuarray.empty(n_out, np.complex64)
     stream0 = drv.Stream()
     stream1 = drv.Stream()
     plan1 = fft.Plan(x.shape, np.float32, np.complex64, stream=stream0)
     plan2 = fft.Plan(y.shape, np.float32, np.complex64, stream=stream1)
     fft.fft(x_gpu, xf_gpu, plan1)
     fft.fft(y_gpu, yf_gpu, plan2)
     assert np.allclose(xf[0:n_out], xf_gpu.get(), atol=atol_float32)
     assert np.allclose(yf[0:n_out], yf_gpu.get(), atol=atol_float32)
Example #17
0
 def test_multiple_streams(self):
     """Two float32 -> complex64 FFTs planned on separate streams both match np.fft.rfftn."""
     x = np.asarray(np.random.rand(self.N), np.float32)
     xf = np.fft.rfftn(x)
     y = np.asarray(np.random.rand(self.N), np.float32)
     yf = np.fft.rfftn(y)
     x_gpu = gpuarray.to_gpu(x)
     y_gpu = gpuarray.to_gpu(y)
     # Floor division keeps the length an integer under true division too.
     n_out = self.N // 2 + 1
     xf_gpu = gpuarray.empty(n_out, np.complex64)
     yf_gpu = gpuarray.empty(n_out, np.complex64)
     stream0 = drv.Stream()
     stream1 = drv.Stream()
     plan1 = fft.Plan(x.shape, np.float32, np.complex64, stream=stream0)
     plan2 = fft.Plan(y.shape, np.float32, np.complex64, stream=stream1)
     fft.fft(x_gpu, xf_gpu, plan1)
     fft.fft(y_gpu, yf_gpu, plan2)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
     assert np.allclose(yf, yf_gpu.get(), atol=atol_float32)
Example #18
0
    def convol(self, data1, data2):
        """Combine ``data1`` and ``data2`` in Fourier space on the GPU.

        Both inputs are uploaded as complex128 into the per-shape device
        buffers stored on the class, transformed in place, combined with
        ``self.multconj`` (presumably ``data1 *= conj(data2)``, as the
        commented-out line suggests) and transformed back with the scale
        flag set. The real part of the result is returned as a host array.

        The CUDA context pushed for the computation is always popped again,
        even if one of the GPU calls raises.
        """
        self.init()
        self.ctx.push()
        try:
            plan = self.__class__.plans[self.shape]
            data1_gpu = self.__class__.data1_gpus[self.shape]
            data2_gpu = self.__class__.data2_gpus[self.shape]
            data1_gpu.set(data1.astype(numpy.complex128))
            cu_fft.fft(data1_gpu, data1_gpu, plan)
            data2_gpu.set(data2.astype(numpy.complex128))
            cu_fft.fft(data2_gpu, data2_gpu, plan)
            # data1_gpu *= data2_gpu.conj()
            self.multconj(data1_gpu, data2_gpu)
            cu_fft.ifft(data1_gpu, data1_gpu, plan, True)
            # self.ctx.synchronize()
            res = data1_gpu.get().real
        finally:
            # Never leave the pushed context on the stack.
            self.ctx.pop()
        return res
Example #19
0
 def correlate(self, data1, data2):
     """Combine ``data1`` and ``data2`` in Fourier space on the GPU.

     While holding the class-level semaphore ``sem``, both inputs are
     uploaded as complex128 into the per-shape device buffers stored on the
     class, transformed in place, combined with ``self.multconj``
     (presumably ``data1 *= conj(data2)``, as the commented-out line
     suggests) and transformed back with the scale flag set. The real part
     of the result is returned as a host array.

     The CUDA context pushed for the computation is always popped again,
     even if one of the GPU calls raises.
     """
     self.init()
     with self.__class__.sem:
         self.ctx.push()
         try:
             plan = self.__class__.plans[self.shape]
             data1_gpu = self.__class__.data1_gpus[self.shape]
             data2_gpu = self.__class__.data2_gpus[self.shape]
             data1_gpu.set(data1.astype(numpy.complex128))
             cu_fft.fft(data1_gpu, data1_gpu, plan)
             data2_gpu.set(data2.astype(numpy.complex128))
             cu_fft.fft(data2_gpu, data2_gpu, plan)
             #            data1_gpu *= data2_gpu.conj()
             self.multconj(data1_gpu, data2_gpu)
             cu_fft.ifft(data1_gpu, data1_gpu, plan, True)
             #            self.ctx.synchronize()
             res = data1_gpu.get().real
         finally:
             # Never leave the pushed context on the stack.
             self.ctx.pop()
     return res
Example #20
0
def cufft(data, shape=None, inverse=False):
    """Run an in-place complex64 FFT of ``data`` on the GPU and return it.

    data    : host array to transform; it is converted to complex64.
    shape   : if truthy, ``data`` is first passed through ``pad2(data, shape)``.
    inverse : if true ``cu_fft.ifft`` is used, otherwise ``cu_fft.fft``.

    Plans are cached per array shape in the module-level ``CUFFT_PLANS``
    dict. Returns the transformed data as a host array. ``cu_fft.ifft`` is
    called without a scale argument, so the inverse is presumably left
    unnormalised -- confirm against the callers.
    """
    if shape:
        data = pad2(data, shape)

    # Reuse the plan for this shape when one has already been created.
    plan = CUFFT_PLANS.get(data.shape)
    if plan is None:
        plan = cu_fft.Plan(data.shape, np.complex64, np.complex64)
        CUFFT_PLANS[data.shape] = plan

    # np.cast was removed from NumPy; asarray(...).astype(...) performs the
    # same conversion (always producing a new complex64 array).
    gpu_data = gpuarray.to_gpu(np.asarray(data).astype(np.complex64))
    if inverse:
        cu_fft.ifft(gpu_data, gpu_data, plan)
    else:
        cu_fft.fft(gpu_data, gpu_data, plan)

    return gpu_data.get()
Example #21
0
def fft(invec, outvec, prec, itype, otype):
    """Forward FFT of ``invec`` into ``outvec`` using a plan from ``_get_fwd_plan``.

    The plan is looked up from the dtypes of both vectors and the length of
    ``invec``; the transform operates on the ``.data`` attribute of each
    vector. ``prec``, ``itype`` and ``otype`` are accepted but not used here.
    """
    in_dtype = invec.dtype
    out_dtype = outvec.dtype
    length = len(invec)
    forward_plan = _get_fwd_plan(in_dtype, out_dtype, length)
    cu_fft.fft(invec.data, outvec.data, forward_plan)
Example #22
0
def cuda_gridvis(sub_array, f, settings, plan, chan):
    """
    Grid the visibilities parallelized by pixel.

    Builds the dirty beam (PSF) and the dirty image for one channel of a
    ``.uvfits`` visibility file by running the gridding, weighting, FFT,
    correction and trimming kernels on the GPU.

    Parameters:
      - sub_array: 1 selects 40 antennas, any other value selects 60. The
          count only sizes the placeholder host arrays allocated below.
      - f: indexable object whose ``f[0].data`` provides ``.data`` and
          ``.par('uu')`` / ``.par('vv')`` (presumably an opened FITS HDU
          list -- confirm against the caller).
      - settings: dict read for 'vfile', 'briggs', 'imsize' and 'cell'.
      - plan: FFT plan passed unchanged to ``fft.fft``.
      - chan: channel index used when slicing the visibility data.

    Returns:
      ``(dpsf, d_fim)``: the dirty PSF copied to the host and the dirty
      image left on the GPU.

    NOTE(review): everything from reading the data onwards sits inside the
    ``.uvfits`` check, so for any other file name the function falls off the
    end and returns None.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
          by Thompson, Moran, & Swenson
      - Daniel Brigg's PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()
    if sub_array==1:
        Antennas = 40
    else:
        Antennas = 60

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    # the working grid is twice the image size; noff is the offset of the
    # imsize x imsize region inside it
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180. / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    # determine the file type (uvfits or fitsidi)
    # NOTE: these placeholders (one row per antenna pair) are replaced by the
    # arrays read from the file in the branch below.
    h_u = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float64')
    h_v = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float64')
    h_re = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float32')
    h_im = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float32')

    #Get Visibility Data and values of UVW
    if settings['vfile'].find('.uvfits') != -1:
        # NOTE(review): freq is assigned here and again further down, but it
        # is not read anywhere in this function.
        freq = 3.45E11 #np.float32(f[0].header['CRVAL4'])
        light_speed = 299792458.
        # keep only the rows whose value for this channel is non-zero
        # (presumably the unflagged data -- confirm)
        good = np.where(f[0].data.data[:, 0, 0, chan, 0, 0] != 0)

        h_u = np.float32(light_speed * f[0].data.par('uu')[good])
        print "h_u", h_u.shape
        h_v = np.float32(light_speed * f[0].data.par('vv')[good])
        gcount = np.int32(np.size(h_u))
        ## assume data is unpolarized
        h_re = np.float32(f[0].data.data[good, 0, 0, chan, 0, 0])
        h_im = np.float32(f[0].data.data[good, 0, 0, chan, 0, 1])

        freq = 1702500000.
        light_speed = 299792458.  # Speed of light


        ## assume data is unpolarized
        #print chan
        print 'GCOUNT', gcount

        # h_ : host,  d_ : device
        # NOTE(review): h_grd and h_cnt are allocated but not used below.
        h_grd = np.zeros((nx, nx), dtype=np.complex64)
        h_cnt = np.zeros((nx, nx), dtype=np.int32)
        d_u = gpu.to_gpu(np.array(h_u,dtype='float32'))
        d_v = gpu.to_gpu(np.array(h_v,dtype='float32'))
        d_re = gpu.to_gpu(np.array(h_re,dtype='float32'))
        d_im = gpu.to_gpu(np.array(h_im,dtype='float32'))
        d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
        d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
        d_ngrd = gpu.zeros_like(d_grd)
        d_bm = gpu.zeros_like(d_grd)
        d_nbm = gpu.zeros_like(d_grd)
        d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

        ## define kernel parameters
        # smaller thread blocks are used for the 1024-pixel image size
        if imsize == 1024:
            blocksize2D = (8, 16, 1)
            gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1])))
            blocksizeF2D = (16, 16, 1)
            gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
            blocksize1D = (256, 1, 1)
        else:
            blocksize2D = (16, 32, 1)
            gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1])))
            blocksizeF2D = (32, 32, 1)
            gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
            blocksize1D = (512, 1, 1)

        # NOTE(review): gridsize1D is computed but not passed to any kernel below.
        gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

        # ------------------------
        # make gridding kernels
        # ------------------------
        ## make spheroidal convolution kernel (don't mess with these!)
        width = 6.
        ngcf = 24.
        h_cgf = gcf(ngcf, width)
        ## make grid correction
        h_corr = corrfun(nx, width)
        # copy the convolution kernel into the module's global 'cgf'
        d_cgf = module.get_global('cgf')[0]
        d_corr = gpu.to_gpu(h_corr)
        cu.memcpy_htod(d_cgf, h_cgf)

        # ------------------------
        # grid it up
        # ------------------------
        # largest |u| and |v|, expressed in units of the grid spacing du
        d_umax = gpu.max(cumath.fabs(d_u))
        d_vmax = gpu.max(cumath.fabs(d_v))
        umax = np.int32(np.ceil(d_umax.get() / du))
        vmax = np.int32(np.ceil(d_vmax.get() / du))

        ## grid ($$)
        #  This should be improvable via:
        #    - shared memory solution? I tried...
        #    - better coalesced memory access? I tried...
        #    - reorganzing and indexing UV data beforehand?
        #       (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
        #    - storing V(u,v) in texture memory?

        # Each pixel in the uv plane goes through the data and check to see whether the pixel is included in the convolution.
        # This kernel also calculates the point spread function and the local sampling
        # from the data (for applying the weights later).
        gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax, \
                           block=blocksize2D, grid=gridsize2D)

        ## apply weights
        wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
        hfac = np.int32(1)
        dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
        shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
        ## normalize

        wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
        ## Reflect grid about v axis
        hfac = np.int32(-1)
        dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
        ## Shift both
        shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

        # ------------------------
        # Make the beam
        # ------------------------
        ## Transform to image plane
        fft.fft(d_nbm, d_bm, plan)
        ## Shift
        shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
        ## Correct for C
        corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
        # Trim
        trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
        ## Normalize
        # bmax becomes the reciprocal of the beam maximum; it is reused below
        # to normalize the map
        d_bmax = gpu.max(d_fim)
        bmax = d_bmax.get()
        bmax = np.float32(1. / bmax)
        nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
        ## Pull onto CPU
        # the beam is copied to the host here because d_fim is reused for the map
        dpsf = d_fim.get()

        # ------------------------
        # Make the map
        # ------------------------
        ## Transform to image plane
        fft.fft(d_ngrd, d_grd, plan)
        ## Shift
        shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
        ## Correct for C
        corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
        ## Trim
        trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
        ## Normalize (Jy/beam)
        nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

        ## Finish timers
        t_end = time.time()
        t_full = t_end - t_start
        print "Gridding execution time %0.5f" % t_full + ' s'
        print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

        ## Return dirty psf (CPU) and dirty image (GPU)
        return dpsf, d_fim
Example #23
0
def cuda_gridvis(settings, plan):
    """
    Grid the visibilities parallelized by pixel.

    Opens the visibility file named by ``settings['vfile']``, reads the
    u/v coordinates and the real/imaginary visibilities, and builds the
    dirty beam (PSF) and the dirty image by running the gridding,
    weighting, FFT, correction and trimming kernels on the GPU.

    Parameters:
      - settings: dict read for 'vfile', 'briggs', 'imsize' and 'cell'.
          The file name must contain '.fitsidi' or '.uvfits'.
      - plan: FFT plan passed unchanged to ``fft.fft``.

    Returns:
      ``(dpsf, d_fim)``: the dirty PSF copied to the host and the dirty
      image left on the GPU.

    NOTE(review): if the file name matches neither extension, ``h_u`` is
    never assigned and the ``print h_u`` below raises NameError.
    NOTE(review): the file opened with ``pyfits.open`` is not closed in
    this function.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
          by Thompson, Moran, & Swenson
      - Daniel Brigg's PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    # the working grid is twice the image size; noff is the offset of the
    # imsize x imsize region inside it
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180 / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))
    ## grab data
    f = pyfits.open(settings['vfile'])

    # determine the file type (uvfits or fitsidi)

    if settings['vfile'].find('.fitsidi') != -1:

        ## quickly figure out what data is not flagged
        # NOTE(review): freq is assigned but not read anywhere in this function.
        freq = 3.45E11 #np.float32(f[7].header['CRVAL3']) 299792458vvvv
        #good  = np.where(f[0].data.data[:,0,0,0,0,0,0] != 0)

        #h_u   = np.float32(freq*f[0].data.par('uu')[good])
        #h_v   = np.float32(freq*f[0].data.par('vv')[good])
        light_speed = 299792458.         # Speed of light

        # 780 rows are hard-coded here (presumably the number of baselines
        # for 40 antennas, 40*39/2 -- confirm). h_u and h_v are replaced
        # right below; h_re and h_im are filled row by row in the loop.
        h_u = np.ndarray(shape=(780, 1),dtype='float64')
        h_v = np.ndarray(shape=(780, 1),dtype='float64')
        h_re = np.ndarray(shape=(780, 1),dtype='float32')
        h_im = np.ndarray(shape=(780, 1),dtype='float32')

        h_u = np.float64(light_speed * f[0].data[:].UU)
        h_v = np.float64(light_speed * f[0].data[:].VV)

        for bl in range(0, 780):

            #gcount += np.int32(np.size(h_u[bl]))
            ## assume data is unpolarized
            #h_re   = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,0]+f[0].data.data[good,0,0,0,0,1,0]))
            #h_im   = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,1]+f[0].data.data[good,0,0,0,0,1,1]))
            h_re[bl] = np.float32(f[0].data[:].data[bl][0][0][0][0][0][0])
            h_im[bl] = np.float32(f[0].data[:].data[bl][0][0][0][0][0][1])
            ## make GPU arrays

        # flatten everything to 1-d float32 before the upload
        h_u = np.float32(h_u.ravel())
        h_v = np.float32(h_v.ravel())
        gcount = np.int32(np.size(h_u))
        #gcount = len(gcount.ravel())
        h_re = np.float32(h_re.ravel())
        h_im = np.float32(h_im.ravel())
        print len(h_re),len(h_im)
    elif settings['vfile'].find('.uvfits') != -1:
        # NOTE(review): freq is assigned but not read anywhere in this function.
        freq = 3.45E11 #np.float32(f[0].header['CRVAL4'])
        light_speed = 299792458.
        # keep only the rows whose first data value is non-zero
        # (presumably the unflagged data -- confirm)
        good = np.where(f[0].data.data[:, 0, 0, 0, 0, 0] != 0)
        h_u = np.float32(light_speed * f[0].data.par('uu')[good])
        h_v = np.float32(light_speed * f[0].data.par('vv')[good])
        gcount = np.int32(np.size(h_u))
        ## assume data is unpolarized
        h_re = np.float32(f[0].data.data[good, 0, 0, 0, 0, 0])
        h_im = np.float32(f[0].data.data[good, 0, 0, 0, 0, 1])

    print h_u

    # h_ : host,  d_ : device
    # NOTE(review): h_grd and h_cnt are allocated but not used below.
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(h_u)
    d_v = gpu.to_gpu(h_v)
    d_re = gpu.to_gpu(h_re)
    d_im = gpu.to_gpu(h_im)
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)
    ## define kernel parameters
    blocksize2D = (8, 16, 1)
    gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1])))
    blocksizeF2D = (16, 16, 1)
    gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
    blocksize1D = (256, 1, 1)
    # NOTE(review): gridsize1D is computed but not passed to any kernel below.
    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    # copy the convolution kernel into the module's global 'cgf'
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    # largest |u| and |v|, expressed in units of the grid spacing du
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #    - shared memory solution? I tried...
    #    - better coalesced memory access? I tried...
    #    - reorganzing and indexing UV data beforehand?
    #       (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #    - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax, \
                       block=blocksize2D, grid=gridsize2D)
    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    # Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    # bmax becomes the reciprocal of the beam maximum; it is reused below
    # to normalize the map
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    # the beam is copied to the host here because d_fim is reused for the map
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
Example #24
0
def sample_defrost_gpu(lat, func, gamma, m2_eff):
    """Calculates a sample of random values in the lattice

    lat = Lattice (the lattice size n is read from lat.n)
    func = Cuda kernel, called to fill the real-space buffer
    gamma = -0.25 or +0.25
    m2_eff = effective mass

    The 1-d kernel is transformed on the host with FFTW (fftw3) and the
    3-d lattice transforms are done with CuFFT (scikits.cuda.fft).
    Returns a host array of type lat.prec_real scaled by 1/lat.VL.
    """
    import scikits.cuda.fft as fft
    import fftw3

    # Various constants:
    mpl = lat.mpl
    n = lat.n
    nn = lat.nn
    os = 16
    nos = n * pow(os, 2)
    dk = lat.dk
    dx = lat.dx
    dkos = dk / (2. * os)
    dxos = dx / os
    kcut = nn * dk / 2.0
    norm = 0.5 / (math.sqrt(2 * pi * dk**3.) * mpl) * (dkos / dxos)

    # The FFTW plan is bound to `ker` (in-place transform), so the array is
    # filled element by element after the plan has been created.
    ker = np.empty(nos, dtype=lat.prec_real)
    fft1 = fftw3.Plan(ker, ker, direction='forward', flags=['measure'],
                      realtypes=['realodd 10'])

    for k in range(nos):
        kk = (k + 0.5) * dkos
        ker[k] = kk * (kk**2. + m2_eff)**gamma * math.exp(-(kk / kcut)**2.)
    fft1.execute()
    fftw3.destroy_plan(fft1)

    for k in range(nos):
        ker[k] = norm * ker[k] / (k + 1)

    # Floor division keeps the shape an integer under true division too.
    Fk_gpu = gpuarray.zeros((n // 2 + 1, n, n), dtype=lat.prec_complex)

    ker_gpu = gpuarray.to_gpu(ker)
    tmp_gpu = gpuarray.zeros((n, n, n), dtype=lat.prec_real)

    # plan: real -> complex (forward), plan2: complex -> real (inverse)
    plan = fft.Plan(tmp_gpu.shape, lat.prec_real, lat.prec_complex)
    plan2 = fft.Plan(tmp_gpu.shape, lat.prec_complex, lat.prec_real)

    func(tmp_gpu, ker_gpu, np.uint32(nn), np.float64(os),
         np.uint32(lat.dimx), np.uint32(lat.dimy), np.uint32(lat.dimz),
         block=lat.cuda_block_1, grid=lat.cuda_grid)

    fft.fft(tmp_gpu, Fk_gpu, plan)

    if lat.test == True:
        # Fixed seed so the random factors are reproducible in test mode.
        print('Testing mode on! Set testQ to False to disable this.\n')
        np.random.seed(1)

    # Complex Gaussian random factors, generated on the host.
    rr1 = (np.random.normal(size=Fk_gpu.shape) +
           np.random.normal(size=Fk_gpu.shape) * 1j)

    # Multiply the spectrum by the random factors on the host, then upload
    # the product again for the inverse transform.
    Fk = Fk_gpu.get()
    Fk *= rr1
    Fk_gpu = gpuarray.to_gpu(Fk)

    fft.ifft(Fk_gpu, tmp_gpu, plan2)
    res = (tmp_gpu.get()).astype(lat.prec_real)

    res *= 1. / lat.VL

    return res
Example #25
0
def sample_defrost_gpu(lat, func, gamma, m2_eff):
    """Calculates a sample of random values in the lattice

    lat = Lattice (the lattice size n is read from lat.n)
    func = Cuda kernel, called to fill the real-space buffer
    gamma = -0.25 or +0.25
    m2_eff = effective mass

    The 1-d kernel is transformed on the host with FFTW (fftw3) and the
    3-d lattice transforms are done with CuFFT (scikits.cuda.fft).
    Returns a host array of type lat.prec_real scaled by 1/lat.VL.
    """
    import scikits.cuda.fft as fft
    import fftw3

    # Various constants:
    mpl = lat.mpl
    n = lat.n
    nn = lat.nn
    os = 16
    nos = n * pow(os, 2)
    dk = lat.dk
    dx = lat.dx
    dkos = dk / (2. * os)
    dxos = dx / os
    kcut = nn * dk / 2.0
    norm = 0.5 / (math.sqrt(2 * pi * dk**3.) * mpl) * (dkos / dxos)

    # The FFTW plan is bound to `ker` (in-place transform), so the array is
    # filled element by element after the plan has been created.
    ker = np.empty(nos, dtype=lat.prec_real)
    fft1 = fftw3.Plan(ker,
                      ker,
                      direction='forward',
                      flags=['measure'],
                      realtypes=['realodd 10'])

    for k in range(nos):
        kk = (k + 0.5) * dkos
        ker[k] = kk * (kk**2. + m2_eff)**gamma * math.exp(-(kk / kcut)**2.)
    fft1.execute()
    fftw3.destroy_plan(fft1)

    for k in range(nos):
        ker[k] = norm * ker[k] / (k + 1)

    # Floor division keeps the shape an integer under true division too.
    Fk_gpu = gpuarray.zeros((n // 2 + 1, n, n), dtype=lat.prec_complex)

    ker_gpu = gpuarray.to_gpu(ker)
    tmp_gpu = gpuarray.zeros((n, n, n), dtype=lat.prec_real)

    # plan: real -> complex (forward), plan2: complex -> real (inverse)
    plan = fft.Plan(tmp_gpu.shape, lat.prec_real, lat.prec_complex)
    plan2 = fft.Plan(tmp_gpu.shape, lat.prec_complex, lat.prec_real)

    func(tmp_gpu,
         ker_gpu,
         np.uint32(nn),
         np.float64(os),
         np.uint32(lat.dimx),
         np.uint32(lat.dimy),
         np.uint32(lat.dimz),
         block=lat.cuda_block_1,
         grid=lat.cuda_grid)

    fft.fft(tmp_gpu, Fk_gpu, plan)

    if lat.test == True:
        # Fixed seed so the random factors are reproducible in test mode.
        print('Testing mode on! Set testQ to False to disable this.\n')
        np.random.seed(1)

    # Complex Gaussian random factors, generated on the host.
    rr1 = (np.random.normal(size=Fk_gpu.shape) +
           np.random.normal(size=Fk_gpu.shape) * 1j)

    # Multiply the spectrum by the random factors on the host, then upload
    # the product again for the inverse transform.
    Fk = Fk_gpu.get()
    Fk *= rr1
    Fk_gpu = gpuarray.to_gpu(Fk)

    fft.ifft(Fk_gpu, tmp_gpu, plan2)
    res = (tmp_gpu.get()).astype(lat.prec_real)

    res *= 1. / lat.VL

    return res
Example #26
0
import numpy as np

import scikits.cuda.fft as cu_fft

# Round trip on the GPU: real-to-complex FFT followed by the inverse, compared
# against the NumPy reference computed on the host.
# Single-argument print(...) calls behave identically under Python 2 and are
# valid Python 3.
print('Testing fft/ifft..')
N = 4096 * 16
batch_size = 16

x = np.asarray(np.random.rand(batch_size, N), np.float32)
xf = np.fft.fft(x)
y = np.real(np.fft.ifft(xf))

x_gpu = gpuarray.to_gpu(x)
# Floor division keeps the shape entry an integer under true division as well
# (N / 2 + 1 would be a float on Python 3).
xf_gpu = gpuarray.empty((batch_size, N // 2 + 1), np.complex64)
plan_forward = cu_fft.Plan(N, np.float32, np.complex64, batch_size)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(N, np.complex64, np.float32, batch_size)
# The trailing True is the scale flag -- presumably it normalises the inverse
# so that it is comparable with np.fft.ifft; confirm against the library docs.
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)

# Two spaces after the colon reproduce the output of the original
# two-argument print statement.
print('Success status:  %s' % np.allclose(y, y_gpu.get(), atol=1e-6))

# In-place complex-to-complex transform: input and output are the same buffer.
print('Testing in-place fft..')
x = np.asarray(np.random.rand(batch_size, N) +
               1j * np.random.rand(batch_size, N), np.complex64)
x_gpu = gpuarray.to_gpu(x)

plan = cu_fft.Plan(N, np.complex64, np.complex64, batch_size)
cu_fft.fft(x_gpu, x_gpu, plan)
Example #27
0
def fft_resample(x,
                 W,
                 new_len,
                 npad,
                 to_remove,
                 cuda_dict=None):
    """Do FFT resampling with a filter function (possibly using CUDA)

    Parameters
    ----------
    x : 1-d array
        The array to resample.
    W : 1-d array or gpuarray
        The filtering function to apply.
    new_len : int
        The size of the output array (before removing padding).
    npad : int
        Amount of padding to apply before resampling.
    to_remove : int
        Number of samples to remove after resampling.
    cuda_dict : dict | None
        Dictionary constructed using setup_cuda_multiply_repeated().
        ``None`` (the default) is treated as ``dict(use_cuda=False)``,
        i.e. the CPU code path is taken.

    Returns
    -------
    y : 1-d array
        Resampled (and filtered) version of x.
    """
    if cuda_dict is None:
        # build the default per call instead of sharing one dict object
        # between all calls through the signature
        cuda_dict = dict(use_cuda=False)

    # add some padding at beginning and end to make this work a little cleaner
    x = _smart_pad(x, npad)
    old_len = len(x)
    shorter = new_len < old_len
    if not cuda_dict['use_cuda']:
        N = int(min(new_len, old_len))
        # bins copied from the start of the spectrum (DC and positive
        # frequencies) and from its end (negative frequencies, including the
        # Nyquist bin when N is even)
        n_pos = (N + 1) // 2
        n_neg = N // 2
        y_fft = np.zeros(new_len, np.complex128)
        x_fft = fft(x).ravel() * W
        y_fft[:n_pos] = x_fft[:n_pos]
        # guard the N == 1 case: a slice starting at -0 would select the
        # whole array instead of nothing
        if n_neg > 0:
            y_fft[-n_neg:] = x_fft[-n_neg:]
        y = np.real(ifft(y_fft, overwrite_x=True)).ravel()
    else:
        cuda_dict['x'].set(
            np.concatenate((x, np.zeros(max(new_len - old_len, 0), x.dtype))))
        # do the fourier-domain operations, results put in second param
        cudafft.fft(cuda_dict['x'], cuda_dict['x_fft'], cuda_dict['fft_plan'])
        cuda_multiply_inplace_c128(W, cuda_dict['x_fft'])
        # This is not straightforward, but because x_fft and y_fft share
        # the same data (and only one half of the full DFT is stored), we
        # don't have to transfer the slice like we do in scipy. All we
        # need to worry about is the Nyquist component, either halving it
        # or taking just the real component...
        use_len = new_len if shorter else old_len
        func = cuda_real_c128 if shorter else cuda_halve_c128
        if use_len % 2 == 0:
            nyq = int(use_len // 2)
            func(cuda_dict['x_fft'], slice=slice(nyq, nyq + 1))
        cudafft.ifft(cuda_dict['x_fft'],
                     cuda_dict['x'],
                     cuda_dict['ifft_plan'],
                     scale=False)
        y = cuda_dict['x'].get()[:new_len if shorter else None]

    # now let's trim it back to the correct size (if there was padding)
    if to_remove > 0:
        keep = np.ones((new_len), dtype='bool')
        keep[:to_remove] = False
        keep[-to_remove:] = False
        y = np.compress(keep, y)

    return y
Example #28
0
import numpy as np

import scikits.cuda.fft as cu_fft

# GPU round trip (real FFT, then inverse) checked against NumPy on the host.
# print(...) with one argument prints the same text on Python 2 and compiles
# on Python 3.
print('Testing fft/ifft..')
N = 4096 * 16
batch_size = 16

x = np.asarray(np.random.rand(batch_size, N), np.float32)
xf = np.fft.fft(x)
y = np.real(np.fft.ifft(xf))

x_gpu = gpuarray.to_gpu(x)
# N // 2 + 1 stays an int regardless of the division semantics in effect
xf_gpu = gpuarray.empty((batch_size, N // 2 + 1), np.complex64)
plan_forward = cu_fft.Plan(N, np.float32, np.complex64, batch_size)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(N, np.complex64, np.float32, batch_size)
# NOTE(review): the last argument is the scale flag; it presumably makes the
# result directly comparable with np.fft.ifft -- confirm in the library docs.
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)

# The double space matches what the original two-argument print emitted.
print('Success status:  %s' % np.allclose(y, y_gpu.get(), atol=1e-6))

# Complex-to-complex transform written back into its own input buffer.
print('Testing in-place fft..')
x = np.asarray(np.random.rand(batch_size, N) +
               1j * np.random.rand(batch_size, N), np.complex64)
x_gpu = gpuarray.to_gpu(x)

plan = cu_fft.Plan(N, np.complex64, np.complex64, batch_size)
cu_fft.fft(x_gpu, x_gpu, plan)
Example #29
0
def cuda_gridvis(csrh_sun, csrh_satellite, settings, plan, chan):
    """
    Grid the visibilities parallelized by pixel.

    Parameters
    ----------
    csrh_sun, csrh_satellite
        Forwarded unchanged to ``visibility()``.
    settings : dict
        Imaging settings; the keys 'briggs', 'imsize' and 'cell' are read.
    plan
        FFT plan handed to ``fft.fft`` for both the beam and the map.
    chan : int
        Channel index; forwarded to ``visibility()`` and used to pick the
        visibility of every antenna pair.

    Returns
    -------
    dpsf : array
        Normalized dirty beam, copied to the host.
    d_fim : GPU array
        Dirty image; left on the device.

    Notes
    -----
    Only baselines whose two antennas are both absent from ``Flag_Ant`` are
    uploaded to the GPU, so the uploaded arrays have exactly ``gcount``
    entries.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
          by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print("Gridding the visibilities")
    t_start = time.time()

    # unpack parameters
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180. / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))
    freq = 1702500000.

    ## grab data
    Data, UVW = visibility(csrh_sun, csrh_satellite, chan)
    # no separator after the newline: matches the Python 2 print statement
    print("UVW*****\n%s" % (UVW,))

    # Baselines are the antenna pairs (a1 < a2) of a 40-antenna array,
    # enumerated row by row; UVW is indexed in that same order.
    n_ant = 40
    bl_order = [(a1, a2)
                for a1 in range(n_ant - 1)
                for a2 in range(a1 + 1, n_ant)]
    n_bl = len(bl_order)

    h_uu = np.empty(n_bl, dtype='float64')
    h_vv = np.empty(n_bl, dtype='float64')
    h_rere = np.empty(n_bl, dtype='float32')
    h_imim = np.empty(n_bl, dtype='float32')
    for blen, (antenna1, antenna2) in enumerate(bl_order):
        vis = Data[antenna1][antenna2][chan]
        h_rere[blen] = vis.real
        h_imim[blen] = vis.imag
        h_uu[blen] = freq * UVW[blen][0]
        h_vv[blen] = freq * UVW[blen][1]

    print("h_u %s" % (h_uu,))

    ## keep only baselines where neither antenna is flagged
    Flag_Ant = [
        0, 4, 8, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26,
        28, 29, 37, 38, 39
    ]
    flagged = frozenset(Flag_Ant)
    good = np.array([blen for blen, (a1, a2) in enumerate(bl_order)
                     if a1 not in flagged and a2 not in flagged],
                    dtype=np.intp)

    gcount = np.int32(good.size)
    ## assume data is unpolarized
    print('GCOUNT %s' % gcount)

    # h_ : host,  d_ : device
    # Upload the *filtered* baselines: the kernel reads gcount entries, so the
    # device arrays must contain exactly the unflagged data.
    d_u = gpu.to_gpu(np.array(h_uu[good], dtype='float32'))
    d_v = gpu.to_gpu(np.array(h_vv[good], dtype='float32'))
    d_re = gpu.to_gpu(np.array(h_rere[good], dtype='float32'))
    d_im = gpu.to_gpu(np.array(h_imim[good], dtype='float32'))
    # builtin int replaces the removed np.int alias (same type)
    d_cnt = gpu.zeros((int(nx), int(nx)), np.int32)
    d_grd = gpu.zeros((int(nx), int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((int(imsize), int(imsize)), np.float32)

    ## define kernel parameters
    if imsize == 1024:
        blocksize2D = (8, 16, 1)
        blocksizeF2D = (16, 16, 1)
    else:
        blocksize2D = (16, 32, 1)
        blocksizeF2D = (32, 32, 1)
    gridsize2D = (int(np.ceil(1. * nx / blocksize2D[0])),
                  int(np.ceil(1. * nx / blocksize2D[1])))
    gridsizeF2D = (int(np.ceil(1. * imsize / blocksizeF2D[0])),
                   int(np.ceil(1. * imsize / blocksizeF2D[1])))

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #    - shared memory solution? I tried...
    #    - better coalesced memory access? I tried...
    #    - reorganizing and indexing UV data beforehand?
    #       (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #    - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im,
                       nx, du, gcount, umax, vmax,
                       block=blocksize2D, grid=gridsize2D)

    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)

    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx,
                   block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize,
                  block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU (d_fim is reused for the map below)
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize,
                  block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print("Gridding execution time %0.5f" % t_full + ' s')
    print("\t%0.5f" % (t_full / gcount) + ' s per visibility')

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
Example #30
0
# Script fragment: initialise the GPU buffers, then run N iterations of a
# forward FFT / Fourier-space constraint / inverse FFT / real-space constraint
# cycle, timing every iteration.  Names such as n, m, k, sq, mag, indexp, N,
# sobject, sobm, beta, plan, ctx, time0, serr, nerr and the gpu_* buffers are
# not defined in this span and must come from earlier in the script.
ii = 0
# tmpimg is allocated here and deleted after the loop; nothing in this span
# reads or writes it in between.
tmpimg = numpy.zeros((n, m, k), dtype=numpy.float32)

ln = sq + 5
# Sum of mag over the selected indices; indexp is dropped right afterwards.
mags = mag[indexp].sum()
del indexp
s = 3
# N2 and N3 are both 70% of N (truncated to int); not used in this span.
N2 = int(N * 0.7)
N3 = int(N * 0.7)

# Upload the starting object as complex64 and keep a device-side copy of it in
# gpu_last (memcpy_dtod arguments are: destination, source, byte count).
gpu_data.set(sobject.astype(numpy.complex64))
pycuda.driver.memcpy_dtod(gpu_last.gpudata, gpu_data.gpudata, gpu_data.nbytes)
gpu_intensity.set(mag)
gpu_mask.set(sobm)
#print real_space.nbytes
for i in range(N):
    t0 = time()
    # In-place forward transform, constraint in Fourier space, in-place
    # inverse transform (last argument True: scale flag), constraint in real
    # space against the previous iterate and the mask.
    cu_fft.fft(gpu_data, gpu_data, plan)
    constrains_fourier(gpu_data, gpu_intensity)
    cu_fft.ifft(gpu_data, gpu_data, plan, True)
    constrains_real(gpu_data, gpu_last, gpu_mask, beta)
    # Remember the current iterate for the next pass.
    pycuda.driver.memcpy_dtod(gpu_last.gpudata, gpu_data.gpudata, gpu_data.nbytes)
    # t1 is taken before the context is synchronized and t2 after, so the two
    # reported durations differ by the time spent waiting in synchronize().
    t1 = time()
    ctx.synchronize()
    t2 = time()
    print("With CUDA, the full loop took %.3fs but after sync %.3fs" % (t1 - t0, t2 - t0))

del tmpimg
# Elapsed time since time0 and iterations per second; time() is called twice,
# so the two figures use slightly different end times.
print "it took", time() - time0, N / (time() - time0)
print "smallest error", serr, "number", nerr
Example #31
0
def cuda_gridvis(settings,plan):
    """
    Grid the visibilities parallelized by pixel.

    Parameters
    ----------
    settings : dict
        Imaging settings; the keys 'vfile' (path of the FITS visibility
        file), 'briggs', 'imsize' and 'cell' are read.
    plan
        FFT plan handed to ``fft.fft`` for both the beam and the map.

    Returns
    -------
    dpsf : array
        Normalized dirty beam, copied to the host.
    d_fim : GPU array
        Dirty image; left on the device.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
          by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print("Gridding the visibilities")
    t_start = time.time()

    # unpack parameters
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180 / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    ## grab data; every host array below is a fresh float32 array, so the
    ## file can be closed as soon as they have been built
    f = pyfits.open(settings['vfile'])
    try:
        ## quickly figure out what data is not flagged
        freq = np.float32(f[0].header['CRVAL4'])
        good = np.where(f[0].data.data[:,0,0,0,0,0,0] != 0)
        h_u = np.float32(freq * f[0].data.par('uu')[good])
        h_v = np.float32(freq * f[0].data.par('vv')[good])
        gcount = np.int32(np.size(h_u))
        ## assume data is unpolarized
        h_re = np.float32(0.5 * (f[0].data.data[good,0,0,0,0,0,0] +
                                 f[0].data.data[good,0,0,0,0,1,0]))
        h_im = np.float32(0.5 * (f[0].data.data[good,0,0,0,0,0,1] +
                                 f[0].data.data[good,0,0,0,0,1,1]))
    finally:
        f.close()

    ## make GPU arrays
    d_u = gpu.to_gpu(h_u)
    d_v = gpu.to_gpu(h_v)
    d_re = gpu.to_gpu(h_re)
    d_im = gpu.to_gpu(h_im)
    # builtin int replaces the removed np.int alias (same type)
    d_cnt = gpu.zeros((int(nx), int(nx)), np.int32)
    d_grd = gpu.zeros((int(nx), int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((int(imsize), int(imsize)), np.float32)

    ## define kernel parameters
    blocksize2D = (8, 16, 1)
    gridsize2D = (int(np.ceil(1. * nx / blocksize2D[0])),
                  int(np.ceil(1. * nx / blocksize2D[1])))
    blocksizeF2D = (16, 16, 1)
    gridsizeF2D = (int(np.ceil(1. * imsize / blocksizeF2D[0])),
                   int(np.ceil(1. * imsize / blocksizeF2D[1])))

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #    - shared memory solution? I tried...
    #    - better coalesced memory access? I tried...
    #    - reorganizing and indexing UV data beforehand?
    #       (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #    - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im,
                       nx, du, gcount, umax, vmax,
                       block=blocksize2D, grid=gridsize2D)
    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize,
                  block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU (d_fim is reused for the map below)
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize,
                  block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print("Gridding execution time %0.5f" % t_full + ' s')
    print("\t%0.5f" % (t_full / gcount) + ' s per visibility')

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
Example #32
0
def fft(invec, outvec, prec, itype, otype):
    """Run a forward FFT of ``invec`` into ``outvec`` through ``cu_fft``.

    The plan is obtained from ``_get_fwd_plan`` using the input dtype, the
    output dtype and ``len(invec)``; the transform is then executed on the
    ``.data`` attributes of both vectors.  ``prec``, ``itype`` and ``otype``
    are accepted but not used by this function.
    """
    transform_len = len(invec)
    fwd_plan = _get_fwd_plan(invec.dtype, outvec.dtype, transform_len)
    cu_fft.fft(invec.data, outvec.data, fwd_plan)
Example #33
0
def fft_resample(x, W, new_len, npad, to_remove, cuda_dict=None):
    """Do FFT resampling with a filter function (possibly using CUDA)

    Parameters
    ----------
    x : 1-d array
        The array to resample.
    W : 1-d array or gpuarray
        The filtering function to apply.
    new_len : int
        The size of the output array (before removing padding).
    npad : int
        Amount of padding to apply before resampling.
    to_remove : int
        Number of samples to remove after resampling.
    cuda_dict : dict | None
        Dictionary constructed using setup_cuda_multiply_repeated().
        If None (default), ``dict(use_cuda=False)`` is used, which selects
        the CPU code path.

    Returns
    -------
    y : 1-d array
        Resampled (and filtered) version of x.
    """
    if cuda_dict is None:
        # a fresh dict per call; avoids a mutable object in the signature
        cuda_dict = dict(use_cuda=False)

    # add some padding at beginning and end to make this work a little cleaner
    x = _smart_pad(x, npad)
    old_len = len(x)
    shorter = new_len < old_len
    if not cuda_dict["use_cuda"]:
        N = int(min(new_len, old_len))
        # leading bins: DC plus positive frequencies; trailing bins: negative
        # frequencies (with the Nyquist bin when N is even)
        n_lead = (N + 1) // 2
        n_trail = N // 2
        y_fft = np.zeros(new_len, np.complex128)
        x_fft = fft(x).ravel() * W
        y_fft[:n_lead] = x_fft[:n_lead]
        # when N == 1 there are no trailing bins; slicing from -0 would
        # otherwise address the entire array
        if n_trail > 0:
            y_fft[-n_trail:] = x_fft[-n_trail:]
        y = np.real(ifft(y_fft, overwrite_x=True)).ravel()
    else:
        cuda_dict["x"].set(np.concatenate((x, np.zeros(max(new_len - old_len, 0), x.dtype))))
        # do the fourier-domain operations, results put in second param
        cudafft.fft(cuda_dict["x"], cuda_dict["x_fft"], cuda_dict["fft_plan"])
        cuda_multiply_inplace_c128(W, cuda_dict["x_fft"])
        # This is not straightforward, but because x_fft and y_fft share
        # the same data (and only one half of the full DFT is stored), we
        # don't have to transfer the slice like we do in scipy. All we
        # need to worry about is the Nyquist component, either halving it
        # or taking just the real component...
        use_len = new_len if shorter else old_len
        func = cuda_real_c128 if shorter else cuda_halve_c128
        if use_len % 2 == 0:
            nyq = int(use_len // 2)
            func(cuda_dict["x_fft"], slice=slice(nyq, nyq + 1))
        cudafft.ifft(cuda_dict["x_fft"], cuda_dict["x"], cuda_dict["ifft_plan"], scale=False)
        y = cuda_dict["x"].get()[: new_len if shorter else None]

    # now let's trim it back to the correct size (if there was padding)
    if to_remove > 0:
        keep = np.ones((new_len), dtype="bool")
        keep[:to_remove] = False
        keep[-to_remove:] = False
        y = np.compress(keep, y)

    return y
Example #34
0
    ### Initial Computation

    # Compute image OTF
    size_2D = [N, M]
    fx = np.int32([[1, -1]])
    fy = np.int32([[1], [-1]])
    otfFx = L0_helpers.psf2otf(fx, size_2D)
    otfFy = L0_helpers.psf2otf(fy, size_2D)

    # Compute MTF
    otfFx_d = gpu.to_gpu(otfFx)
    otfFy_d = gpu.to_gpu(otfFy)
    mtf_kernel(MTF_d, otfFx_d, otfFy_d, Nx, Ny, block=blocksize, grid=gridsize)

    # Compute Fourier transform of original image
    cu_fft.fft(FFTiR_d, FIR_d, plan)
    cu_fft.fft(FFTiG_d, FIG_d, plan)
    cu_fft.fft(FFTiB_d, FIB_d, plan)

    ### Iteration settings
    beta_max = 1e5
    beta = 2 * _lambda
    iteration = 0

    # Done initializing
    init_time = time.time()

    ### Iterate until desired convergence in similarity
    while beta < beta_max:

        if verbose: