Example #1
0
        def thunk():
            """Run a batched real-to-complex FFT on the node's first input.

            Reads ``inputs[0][0]``, stores the result in ``outputs[0][0]``
            (allocating it when it is missing or has the wrong shape) and
            keeps the FFT plan in ``plan[0]`` for as long as the input shape
            stays the same.
            """
            operand = inputs[0][0]
            input_shape = operand.shape

            # The DFT of real input is symmetric, so only the first
            # n // 2 + 1 coefficients of the last axis are stored; a trailing
            # axis of length 2 carries the real/imag parts.
            output_shape = (tuple(input_shape[:-1]) +
                            (input_shape[-1] // 2 + 1, 2))

            out_cell = outputs[0]

            # Reuse the previous output buffer whenever its shape still fits.
            if out_cell[0] is None or out_cell[0].shape != output_shape:
                out_cell[0] = CudaNdarray.zeros(output_shape)

            src_gpu = to_gpuarray(operand)
            # The output stays a float array with the extra length-2 axis;
            # per the original author's note, scikits.cuda.fft treats it as
            # complex64 regardless of its declared type.
            dst_gpu = to_gpuarray(out_cell[0])

            # Build a new plan only when none exists or the shape changed.
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:], np.float32, np.complex64,
                                   batch=input_shape[0])

            fft.fft(src_gpu, dst_gpu, plan[0])
Example #2
0
def init_cuda(ignore_config=False):
    """Initialize CUDA functionality

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    the first CUDA setup is performed. If this variable is not
    set, this function can be manually executed.

    Parameters
    ----------
    ignore_config : bool
        If True, skip the MNE_USE_CUDA config check and attempt the
        initialization unconditionally.

    Notes
    -----
    On success the module-level ``_cuda_capable`` flag is set to True and
    the three element-wise kernels are stored in module globals. Every
    failure path logs a message and returns with ``_cuda_capable`` False.
    """
    global _cuda_capable, _multiply_inplace_c128, _halve_c128, _real_c128
    if _cuda_capable:
        return
    if not ignore_config and (get_config('MNE_USE_CUDA', 'false').lower() !=
                              'true'):
        logger.info('CUDA not enabled in config, skipping initialization')
        return
    # Triage possible errors for informative messaging
    _cuda_capable = False
    try:
        from pycuda import gpuarray, driver  # noqa
        from pycuda.elementwise import ElementwiseKernel
    except ImportError:
        logger.warning('module pycuda not found, CUDA not enabled')
        return
    try:
        # Initialize CUDA; happens with importing autoinit
        import pycuda.autoinit  # noqa
    except ImportError:
        logger.warning('pycuda.autoinit could not be imported, likely '
                       'a hardware error, CUDA not enabled')
        return
    # Make sure scikits.cuda is installed
    try:
        from scikits.cuda import fft as cudafft
    except ImportError:
        logger.warning('module scikits.cuda not found, CUDA not enabled')
        return

    # let's construct our own CUDA multiply in-place function
    _multiply_inplace_c128 = ElementwiseKernel(
        'pycuda::complex<double> *a, pycuda::complex<double> *b',
        'b[i] *= a[i]', 'multiply_inplace')
    _halve_c128 = ElementwiseKernel('pycuda::complex<double> *a',
                                    'a[i] /= 2.0', 'halve_value')
    _real_c128 = ElementwiseKernel('pycuda::complex<double> *a',
                                   'a[i] = real(a[i])', 'real_value')

    # Make sure we can use 64-bit FFTs
    try:
        cudafft.Plan(16, np.float64, np.complex128)  # will get auto-GC'ed
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit are not mistaken for a missing device capability.
        logger.warning('Device does not support 64-bit FFTs, '
                       'CUDA not enabled')
        return
    _cuda_capable = True
    # Figure out limit for CUDA FFT calculations
    logger.info('Enabling CUDA with %s available memory' % get_cuda_memory())
Example #3
0
        def thunk():
            """Run a batched complex-to-complex forward FFT on ``inputs[0][0]``.

            The output buffer in ``outputs[0][0]`` has the same shape as the
            input and is only reallocated when absent or wrongly shaped. The
            plan cached in ``plan[0]`` is rebuilt whenever the input shape
            differs from the one it was created for.
            """
            operand = inputs[0][0]
            input_shape = operand.shape

            # Input and output share one shape.
            output_shape = tuple(input_shape)

            out_cell = outputs[0]

            # Keep the existing allocation when it already has the right size.
            if out_cell[0] is None or out_cell[0].shape != output_shape:
                out_cell[0] = CudaNdarray.zeros(output_shape)

            src_gpu = to_gpuarray(operand)
            # No dtype change is applied to the output view: per the original
            # author's note, scikits.cuda.fft treats the array as complex64
            # either way.
            dst_gpu = to_gpuarray(out_cell[0])

            # Create the plan lazily, and again after any shape change.
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                # The transform shape leaves out the leading batch axis and
                # the trailing real/imag axis.
                plan[0] = fft.Plan(shape=input_shape[1:-1],
                                   in_dtype=np.complex64,
                                   out_dtype=np.complex64,
                                   batch=input_shape[0])

            fft.fft(src_gpu, dst_gpu, plan[0])
Example #4
0
 def init(self):
     """Lazily set up the shared CUDA context and the per-shape resources.

     On first use the pycuda autoinit context is stored on the class as
     ``ctx``. For every new ``self.shape`` the method then creates, on the
     class, the ``multconj`` element-wise kernel (once), two complex128
     work arrays (``data1_gpus`` / ``data2_gpus``) and an FFT plan
     (``plans``). Both steps are guarded by ``initsem`` and re-check their
     condition after acquiring it.

     The context pushed for the allocations is always popped again, even
     when an allocation or plan creation raises.
     """
     cls = self.__class__
     if self.ctx is None:
         with cls.initsem:
             if self.ctx is None:
                 import pycuda
                 if "autoinit" not in dir(pycuda):
                     import pycuda.autoinit
                 cls.ctx = pycuda.autoinit.context
     if self.shape not in self.plans:
         with cls.initsem:
             if self.shape not in self.plans:
                 self.ctx.push()
                 try:
                     if not cls.multconj:
                         # Import here so the name is bound even when the
                         # context branch above was skipped in this call.
                         import pycuda.elementwise
                         cls.multconj = pycuda.elementwise.ElementwiseKernel(
                             "pycuda::complex<double> *a, pycuda::complex<double> *b",
                             "a[i]*=conj(b[i])")
                     if self.shape not in cls.data1_gpus:
                         cls.data1_gpus[self.shape] = gpuarray.empty(
                             self.shape, numpy.complex128)
                     if self.shape not in cls.data2_gpus:
                         cls.data2_gpus[self.shape] = gpuarray.empty(
                             self.shape, numpy.complex128)
                     if self.shape not in cls.plans:
                         cls.plans[self.shape] = cu_fft.Plan(
                             self.shape, numpy.complex128, numpy.complex128)
                     self.ctx.synchronize()
                 finally:
                     # Never leave the context pushed on the stack.
                     self.ctx.pop()
Example #5
0
        def thunk():
            """Run a batched complex forward FFT and flag the output as computed.

            Transforms ``inputs[0][0]`` into ``outputs[0][0]`` (same shape),
            reusing both the output buffer and the plan in ``plan[0]`` while
            the input shape is unchanged, then marks ``node.outputs[0]`` as
            computed in ``compute_map``.
            """
            operand = inputs[0][0]
            input_shape = operand.shape
            output_shape = input_shape

            out_cell = outputs[0]

            # Allocate a fresh buffer only if the old one is unusable.
            if out_cell[0] is None or out_cell[0].shape != output_shape:
                out_cell[0] = CudaNdarray.zeros(output_shape)

            src_gpu = to_gpuarray(operand)
            # The output view is left as-is: per the original author's note,
            # scikits.cuda.fft treats it as complex64 whatever its type.
            dst_gpu = to_gpuarray(out_cell[0])

            # A plan is tied to one input shape; rebuild it on any change.
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:-1],
                                   np.complex64,
                                   np.complex64,
                                   batch=input_shape[0])

            fft.fft(src_gpu, dst_gpu, plan[0])
            compute_map[node.outputs[0]][0] = True
Example #6
0
        def thunk():
            """Run a batched complex-to-real inverse FFT on ``inputs[0][0]``.

            The input carries a trailing length-2 real/imag axis and only the
            non-redundant half of the spectrum; the output in
            ``outputs[0][0]`` drops that axis and restores the full signal
            length. The plan in ``plan[0]`` is rebuilt when the input shape
            changes.
            """
            operand = inputs[0][0]
            input_shape = operand.shape

            # Drop the real/imag axis, then expand the last remaining axis
            # from the half-spectrum length back to the full signal length.
            spectrum_shape = input_shape[:-1]
            output_shape = (tuple(spectrum_shape[:-1]) +
                            ((spectrum_shape[-1] - 1) * 2,))

            out_cell = outputs[0]

            # Reuse the previous output buffer whenever its shape still fits.
            if out_cell[0] is None or out_cell[0].shape != output_shape:
                out_cell[0] = CudaNdarray.zeros(output_shape)

            # The float32 input with its extra axis is interpreted by
            # scikits.cuda as a complex64 array.
            src_gpu = to_gpuarray(operand)
            dst_gpu = to_gpuarray(out_cell[0])

            # The plan is described by the (real) output shape but cached
            # against the input shape.
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:], np.complex64, np.float32,
                                   batch=output_shape[0])

            fft.ifft(src_gpu, dst_gpu, plan[0])
Example #7
0
        def thunk():
            """Run a batched complex inverse FFT and flag the output as computed.

            Transforms ``inputs[0][0]`` into ``outputs[0][0]`` (same shape),
            reusing the output buffer and the plan in ``plan[0]`` while the
            input shape is unchanged, then marks ``node.outputs[0]`` as
            computed in ``compute_map``.
            """
            operand = inputs[0][0]
            input_shape = operand.shape
            output_shape = input_shape

            out_cell = outputs[0]

            # Allocate a fresh buffer only if the old one is unusable.
            if out_cell[0] is None or out_cell[0].shape != output_shape:
                out_cell[0] = CudaNdarray.zeros(output_shape)

            # The float32 input with its extra axis is interpreted by
            # scikits.cuda as a complex64 array.
            src_gpu = to_gpuarray(operand)
            dst_gpu = to_gpuarray(out_cell[0])

            # A plan is tied to one input shape; rebuild it on any change.
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:-1],
                                   np.complex64,
                                   np.complex64,
                                   batch=output_shape[0])

            fft.ifft(src_gpu, dst_gpu, plan[0])
            compute_map[node.outputs[0]][0] = True
Example #8
0
 def get_plan(self, cache, *args):
     """Return the FFT plan for ``args``, creating it on a cache miss.

     ``args`` are passed unchanged to ``cu_fft.Plan`` and also serve as the
     key into ``self.plan_cache``. A newly created plan is stored in the
     cache only when ``cache`` is truthy.
     """
     if args in self.plan_cache:
         return self.plan_cache[args]
     new_plan = cu_fft.Plan(*args)
     if cache:
         self.plan_cache[args] = new_plan
     return new_plan
Example #9
0
 def test_batch_fft_float64_to_complex128_1d(self):
     """Batched 1-D float64 -> complex128 FFT must match ``np.fft.rfft``."""
     x = np.asarray(np.random.rand(self.B, self.N), np.float64)
     xf = np.fft.rfft(x, axis=1)
     x_gpu = gpuarray.to_gpu(x)
     # Floor division keeps the shape an integer under Python 3 too.
     xf_gpu = gpuarray.empty((self.B, self.N // 2 + 1), np.complex128)
     plan = fft.Plan(x.shape[1], np.float64, np.complex128, batch=self.B)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
Example #10
0
 def test_fft_float64_to_complex128_2d(self):
     """2-D float64 -> complex128 FFT must match ``np.fft.rfftn``."""
     x = np.asarray(np.random.rand(self.N, self.M), np.float64)
     xf = np.fft.rfftn(x)
     x_gpu = gpuarray.to_gpu(x)
     # Floor division keeps the shape an integer under Python 3 too.
     xf_gpu = gpuarray.empty((self.N, self.M // 2 + 1), np.complex128)
     plan = fft.Plan(x.shape, np.float64, np.complex128)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
Example #11
0
 def test_fft_float32_to_complex64_1d(self):
     """1-D float32 -> complex64 FFT must match ``np.fft.rfftn``."""
     x = np.asarray(np.random.rand(self.N), np.float32)
     xf = np.fft.rfftn(x)
     x_gpu = gpuarray.to_gpu(x)
     # Floor division keeps the length an integer under Python 3 too.
     xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
     plan = fft.Plan(x.shape, np.float32, np.complex64)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
Example #12
0
 def test_multiple_streams(self):
     """Two FFTs whose plans use separate streams must both match NumPy."""
     x = np.asarray(np.random.rand(self.N), np.float32)
     xf = np.fft.rfftn(x)
     y = np.asarray(np.random.rand(self.N), np.float32)
     yf = np.fft.rfftn(y)
     x_gpu = gpuarray.to_gpu(x)
     y_gpu = gpuarray.to_gpu(y)
     # Floor division keeps the length an integer under Python 3 too.
     half_len = self.N // 2 + 1
     xf_gpu = gpuarray.empty(half_len, np.complex64)
     yf_gpu = gpuarray.empty(half_len, np.complex64)
     stream0 = drv.Stream()
     stream1 = drv.Stream()
     plan1 = fft.Plan(x.shape, np.float32, np.complex64, stream=stream0)
     plan2 = fft.Plan(y.shape, np.float32, np.complex64, stream=stream1)
     fft.fft(x_gpu, xf_gpu, plan1)
     fft.fft(y_gpu, yf_gpu, plan2)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
     assert np.allclose(yf, yf_gpu.get(), atol=atol_float32)
Example #13
0
 def test_ifft_complex128_to_float64(self):
     """complex128 -> float64 inverse FFT must recover the real signal."""
     x = np.asarray(np.random.rand(self.N), np.float64)
     xf = np.asarray(np.fft.fft(x), np.complex128)
     # Only the first N // 2 + 1 coefficients are uploaded; floor division
     # keeps the slice bound an integer under Python 3 too.
     xf_gpu = gpuarray.to_gpu(xf[0:self.N // 2 + 1])
     x_gpu = gpuarray.empty(self.N, np.float64)
     plan = fft.Plan(x.shape, np.complex128, np.float64)
     fft.ifft(xf_gpu, x_gpu, plan, True)
     assert np.allclose(x, x_gpu.get(), atol=atol_float64)
Example #14
0
 def test_ifft_complex64_to_float32_1d(self):
     """1-D complex64 -> float32 inverse FFT must recover the real signal."""
     expected = np.asarray(np.random.rand(self.N), np.float32)
     spectrum = np.asarray(np.fft.rfftn(expected), np.complex64)

     spectrum_gpu = gpuarray.to_gpu(spectrum)
     result_gpu = gpuarray.empty(self.N, np.float32)

     c2r_plan = fft.Plan(expected.shape, np.complex64, np.float32)
     fft.ifft(spectrum_gpu, result_gpu, c2r_plan, True)

     assert np.allclose(expected, result_gpu.get(), atol=atol_float32)
Example #15
0
def _get_fwd_plan(itype, otype, inlen):
    """Return the forward plan for ``(itype, otype, inlen)``.

    Plans are looked up in ``_forward_plans``; on a miss a 1-D plan of
    length ``inlen`` is created, stored under that key and returned.
    """
    key = (itype, otype, inlen)
    try:
        return _forward_plans[key]
    except KeyError:
        pass

    new_plan = cu_fft.Plan((inlen, ), itype, otype)
    _forward_plans[key] = new_plan
    return new_plan
Example #16
0
def _get_inv_plan(itype, otype, outlen):
    """Return the inverse plan for ``(itype, otype, outlen)``.

    Plans are looked up in ``_reverse_plans``; on a miss a 1-D plan of
    length ``outlen`` is created, stored under that key and returned.
    """
    key = (itype, otype, outlen)
    try:
        return _reverse_plans[key]
    except KeyError:
        pass

    new_plan = cu_fft.Plan((outlen, ), itype, otype)
    _reverse_plans[key] = new_plan
    return new_plan
Example #17
0
 def test_batch_fft_float32_to_complex64_2d(self):
     """Batched 2-D float32 -> complex64 FFT must match ``np.fft.rfftn``."""
     x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float32)
     xf = np.fft.rfftn(x, axes=(1, 2))
     x_gpu = gpuarray.to_gpu(x)
     # Floor division keeps the shape an integer under Python 3 too.
     xf_gpu = gpuarray.empty((self.B, self.N, self.M // 2 + 1), np.complex64)
     plan = fft.Plan([self.N, self.M],
                     np.float32,
                     np.complex64,
                     batch=self.B)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
Example #18
0
    def test_batch_ifft_complex128_to_float64_1d(self):
        """Batched 1-D complex128 -> float64 inverse FFT must recover the input."""
        expected = np.asarray(np.random.rand(self.B, self.N), np.float64)
        spectrum = np.asarray(np.fft.rfft(expected, axis=1), np.complex128)

        # The spectrum is forced into C-contiguous layout before upload,
        # as gpuarray.to_gpu is handed a C-ordered array here.
        spectrum_gpu = gpuarray.to_gpu(np.ascontiguousarray(spectrum))
        result_gpu = gpuarray.empty((self.B, self.N), np.float64)

        c2r_plan = fft.Plan(expected.shape[1], np.complex128, np.float64,
                            batch=self.B)
        fft.ifft(spectrum_gpu, result_gpu, c2r_plan, True)

        assert np.allclose(expected, result_gpu.get(), atol=atol_float64)
Example #19
0
    def test_ifft_complex64_to_float32_2d(self):
        """2-D complex64 -> float32 inverse FFT must recover the input."""
        expected = np.asarray(np.random.rand(self.N, self.M), np.float32)
        spectrum = np.asarray(np.fft.rfftn(expected), np.complex64)

        # Per the original note, rfftn's result has to be reformatted as a
        # C-ordered array before it is passed to gpuarray.to_gpu.
        spectrum_gpu = gpuarray.to_gpu(np.ascontiguousarray(spectrum))
        result_gpu = gpuarray.empty((self.N, self.M), np.float32)

        c2r_plan = fft.Plan(expected.shape, np.complex64, np.float32)
        fft.ifft(spectrum_gpu, result_gpu, c2r_plan, True)

        assert np.allclose(expected, result_gpu.get(), atol=atol_float32)
Example #20
0
    # Start time
    start_time = time.time()

    # Validate image format
    N, M, D = image.shape
    assert D == 3, "Error: input must be 3-channel RGB image"
    print "Processing %d x %d RGB image" % (M, N)

    ### Compile and initialize CUDA kernels and FFT plans
    mtf_kernel = cuda_compile(mtf_kernel_source, "mtf_kernel")
    hv_kernel = cuda_compile(hv_kernel_source, "hv_kernel")
    Sa_kernel = cuda_compile(Sa_kernel_source, "Sa_kernel")
    Sb_kernel = cuda_compile(Sb_kernel_source, "Sb_kernel")
    d_kernel = cuda_compile(d_kernel_source, "d_kernel")
    merge_r_kernel = cuda_compile(merge_r_kernel_source, "merge_r_kernel")
    plan = cu_fft.Plan((N, M), np.complex64, np.complex64)

    ### CUDA kernel settings
    Nx, Ny = np.int32(M), np.int32(N)
    x_tpb = 32
    y_tpb = 16
    x_blocks = int(np.ceil(Nx * 1.0 / x_tpb))
    y_blocks = int(np.ceil(Ny * 1.0 / y_tpb))
    blocksize = (x_tpb, y_tpb, 1)
    gridsize = (x_blocks, y_blocks)

    # Initialize S with I and normalize RGB values
    S = np.float32(image) / 256

    ### Allocate memory on GPU
Example #21
0
    settings = dict([])
    # setup filename
    settings['vfile'] = example
    settings['imsize'] = np.int32(ISIZE)    # number of image pixels
    # 1 degree viewfield, 1*3.1415926535/180*3600 =
    settings['cell'] = np.float32(3600. / ISIZE)    # pixel size in arcseconds (rad ? degree?)
    settings['briggs'] = np.float32(1e7)    # weight parameter

    ## make cuFFT plan #improvable#
    imsize = settings['imsize']

    # nx - 2 imsize, it means 2048 when imsize=1024
    nx = np.int32(2 * imsize)

    # create fft plan nx*nx
    plan = fft.Plan((np.int(nx), np.int(nx)), np.complex64, np.complex64)

    ## Create the PSF & dirty image
    #   dpsf - PSF, gpu_im ( dirty image)
    #   dpsf is computed by CPU, gpu_im is in the GPU
    dpsf, gpu_im = cuda_gridvis(settings, plan)

    gpu_dpsf = gpu.to_gpu(dpsf)
    if PLOTME:
        dirty = np.roll(np.fliplr(gpu_im.get()), 1, axis=1)

    ## Clean the PSF
    cpsf = serial_clean_beam(dpsf, imsize / 50.)
    gpu_cpsf = gpu.to_gpu(cpsf)

    if PLOTME:
Example #22
0
def resample_sdbe_to_r2dbe_zpfft(Xs):
    """
    Resample SWARM spectrum product in time-domain at R2DBE rate using
    zero-padding and a radix-2 iFFT algorithm.

    Each row of Xs is zero-padded on the device, inverse transformed to an
    oversampled time series, downsampled by a fixed stride and copied back
    into the host output. The offset of the first sample used in each
    window is carried over from the previous window.

    Arguments:
    ----------
    Xs -- MxN numpy array in which the zeroth dimension is increasing
    snapshot index, and the first dimension is the positive frequency
    half of the spectrum.

    Returns:
    --------
    xs -- The time-domain signal sampled at the R2DBE rate.
    next_start_vec -- Start indices for each FFT window.

    """

    # timestep sizes for SWARM and R2DBE rates
    dt_s = 1.0 / SWARM_RATE
    dt_r = 1.0 / R2DBE_RATE

    # we need to oversample by factor 64 and then undersample by factor 39
    simple_r = 64  # 4096
    simple_s = 39  # 2496
    fft_window_oversample = 2 * SWARM_CHANNELS * simple_r  # 2* due to real FFT
    # length of the zero-padded half spectrum; floor division keeps this an
    # integer under Python 3 so it can be used as an array size
    padded_spectrum_len = fft_window_oversample // 2 + 1

    # oversample timestep size
    dt_f = dt_s / simple_r

    # the timespan of one SWARM FFT window
    T_s = dt_s * SWARM_SAMPLES_PER_WINDOW

    # time vector over one SWARM FFT window at the oversampled step size
    t_f = arange(0, T_s, dt_f)

    # offset in oversampled time series that corresponds to one dt_r step
    # from the last R2DBE rate sample in the previous window
    next_start = 0

    # total number of time series samples
    N_x = int(ceil(Xs.shape[0] * SWARM_SAMPLES_PER_WINDOW * dt_s / dt_r))
    # and initialize the output
    xs = zeros(N_x, dtype=float32)
    next_start_vec = zeros(Xs.shape[0])
    # index in output where samples from next window are stored
    start_output = 0

    # cuFFT plan for complex to real DFT
    plan = cu_fft.Plan(fft_window_oversample, complex64, float32)

    # padding kernel
    fill_padded = mod.get_function("fill_padded")

    # downsampling kernel
    downsample = mod.get_function("downsample")

    # FFT scaling kernel
    scale = ElementwiseKernel(
        "float *a", "a[i] = {0} * a[i]".format(1. / fft_window_oversample),
        "scale")

    # max size of resampled chunk from a single window
    xs_chunk_size_max = int32(ceil((1. * fft_window_oversample) / simple_s))

    # create memory on device for cuFFT
    xf_d = gpuarray.empty(fft_window_oversample, dtype=float32)
    xp_d = gpuarray.zeros(padded_spectrum_len, dtype=complex64)
    y_d = gpuarray.empty(xs_chunk_size_max, dtype=float32)

    for ii in range(Xs.shape[0]):

        # move window to device
        x_d = gpuarray.to_gpu(Xs[ii, :])

        # threads per block
        # number of blocks (keep the array as zeros to save time)
        TPB = 1024
        nB = int(ceil(1. * Xs.shape[1] / TPB))
        # pad with zeros to oversample by 64
        fill_padded(int32(1), xp_d, int32(padded_spectrum_len),
                    x_d, int32(Xs.shape[1]),
                    block=(TPB, 1, 1), grid=(nB, 1))

        # iFFT
        cu_fft.ifft(xp_d, xf_d, plan, scale=False)

        xs_chunk_size = int32(
            ceil((1. * fft_window_oversample - next_start) / simple_s))
        # threads per block
        TPB = 64
        # number of blocks
        nB = int(ceil(1. * xs_chunk_size / TPB))
        ## undersample by 39 to correct rate, and start at the correct
        ## offset in this window
        downsample(xf_d, int32(fft_window_oversample),
                   y_d, xs_chunk_size,
                   int32(next_start), int32(simple_s),
                   block=(TPB, 1, 1), grid=(nB, 1))

        # rescale from ifft using ElementwiseKernel
        scale(y_d)

        # pull data back onto host
        xs_chunk = y_d.get()

        # fill output numpy array
        stop_output = start_output + xs_chunk_size
        xs[start_output:stop_output] = xs_chunk[:xs_chunk_size]
        # update the starting index in the output array
        start_output = stop_output

        # mark the time of the last used sample relative to the start
        # of this window (same stride as the downsample kernel)
        time_window_start_to_last_used_sample = t_f[next_start::simple_s][-1]
        # calculate the remaining time in this window
        time_remaining_in_window = T_s - time_window_start_to_last_used_sample
        # convert to the equivalent number of oversample timesteps; cast to
        # int because the result becomes a slice start on the next pass
        num_dt_f_steps_short = int(round(time_remaining_in_window / dt_f))
        next_start_vec[ii] = next_start
        if num_dt_f_steps_short == 0:
            next_start = 0
        else:
            next_start = simple_s - num_dt_f_steps_short
    return xs, next_start_vec
Example #23
0
def resample_sdbe_to_r2dbe_fft_interp(Xs, interp_kind="nearest"):
    """
    Resample SWARM spectrum product in time-domain at R2DBE rate using
    iFFT and then interpolation in the time-domain.

    Arguments:
    ----------
    Xs -- MxN numpy array in which the zeroth dimension is increasing
    snapshot index, and the first dimension is the positive frequency
    half of the spectrum.
    interp_kind -- Kind of interpolation, either 'nearest' or 'linear'.

    Returns:
    --------
    xs -- The time-domain signal sampled at the R2DBE rate.

    Raises:
    -------
    ValueError -- If interp_kind is not one of the supported kinds.
    """
    # Reject unknown kinds before any device work; otherwise no kernel
    # would run and the uninitialised output buffer would be returned.
    if interp_kind not in ('nearest', 'linear'):
        raise ValueError(
            "interp_kind must be 'nearest' or 'linear', got %r"
            % (interp_kind,))

    # timestep sizes for SWARM and R2DBE rates
    dt_s = 1.0 / SWARM_RATE
    dt_r = 1.0 / R2DBE_RATE

    # cuFFT plan for complex to real DFT
    plan = cu_fft.Plan(SWARM_SAMPLES_PER_WINDOW, complex64, float32,
                       Xs.shape[0])

    # load complex spectrum to device
    x_d = gpuarray.to_gpu(Xs)
    xp_d = gpuarray.empty((Xs.shape[0], Xs.shape[1] + 1), dtype=complex64)

    # pad nyquist with zeros
    block = (32, 32, 1)
    grid = (int(ceil(1. * (Xs.shape[1] + 1) / block[1])),
            int(ceil(1. * Xs.shape[0] / block[0])))
    fill_padded = mod.get_function("fill_padded")
    fill_padded(int32(Xs.shape[0]), xp_d, int32(Xs.shape[1] + 1),
                x_d, int32(Xs.shape[1]),
                block=block, grid=grid)

    # allocate memory for time series
    xf_d = gpuarray.empty((Xs.shape[0], SWARM_SAMPLES_PER_WINDOW), float32)

    # calculate time series, include scaling
    cu_fft.ifft(xp_d, xf_d, plan, scale=True)

    # and interpolate
    xs_size = int(floor(
        Xs.shape[0] * SWARM_SAMPLES_PER_WINDOW * dt_s / dt_r)) - 1
    TPB = 64  # threads per block
    nB = int(ceil(1. * xs_size / TPB))  # number of blocks
    xs_d = gpuarray.empty(xs_size, float32)  # decimated time-series
    if interp_kind == 'nearest':
        # the kernel is looked up under the same name as the kind
        nearest_interp = mod.get_function(interp_kind)
        # call kernel
        nearest_interp(xf_d,
                       xs_d,
                       int32(xs_size),
                       float64(dt_r / dt_s),
                       block=(TPB, 1, 1),
                       grid=(nB, 1))
    else:  # 'linear'
        # compile kernel
        linear_interp = mod.get_function("copy_texture_kernel")
        # get texture reference
        a_texref = mod.get_texref("a_tex")
        a_texref.set_filter_mode(drv.filter_mode.LINEAR)  # linear
        # move time series to texture reference
        # following http://lists.tiker.net/pipermail/pycuda/2009-November/001916.html
        descr = drv.ArrayDescriptor()
        descr.format = drv.array_format.FLOAT
        descr.height = Xs.shape[0]
        descr.width = SWARM_SAMPLES_PER_WINDOW
        descr.num_channels = 1
        # row pitch in bytes: 4 bytes per float32 sample
        a_texref.set_address_2d(xf_d.gpudata, descr,
                                SWARM_SAMPLES_PER_WINDOW * 4)
        # set up linear interpolation over texture
        linear_interp(xs_d, int32(xs_size), float64(dt_r / dt_s),
                      int32(SWARM_SAMPLES_PER_WINDOW),
                      texrefs=[a_texref], block=(TPB, 1, 1), grid=(nB, 1))

    return xs_d.get()
Example #24
0
def setup_cuda_fft_multiply_repeated(n_jobs, h_fft):
    """Set up repeated CUDA FFT multiplication with a given filter

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT multiplication.
    h_fft : array
        The filtering function that will be used repeatedly.
        If n_jobs='cuda', this function will be shortened (since CUDA
        assumes FFTs of real signals are half the length of the signal)
        and turned into a gpuarray.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following CUDA-related variables:
            use_cuda : bool
                Whether CUDA should be used.
            fft_plan : instance of FFTPlan
                FFT plan to use in calculating the FFT.
            ifft_plan : instance of FFTPlan
                FFT plan to use in calculating the IFFT.
            x_fft : instance of gpuarray
                Empty allocated GPU space for storing the result of the
                frequency-domain multiplication.
            x : instance of gpuarray
                Empty allocated GPU space for the data to filter.
    h_fft : array | instance of gpuarray
        This will either be a gpuarray (if CUDA enabled) or np.ndarray.
        If CUDA is enabled, h_fft will be modified appropriately for use
        with filter.fft_multiply(). If the CUDA setup fails part-way, the
        original h_fft is returned untouched.

    Notes
    -----
    This function is designed to be used with fft_multiply_repeated().
    """
    cuda_dict = dict(use_cuda=False,
                     fft_plan=None,
                     ifft_plan=None,
                     x_fft=None,
                     x=None)
    n_fft = len(h_fft)
    cuda_fft_len = int((n_fft - (n_fft % 2)) // 2 + 1)
    if n_jobs == 'cuda':
        n_jobs = 1
        if cuda_capable:
            # set up all arrays necessary for CUDA
            # try setting up for float64
            try:
                # do the IFFT normalization now so we don't have to later;
                # np.complex128 is the type the legacy 'complex_' alias
                # referred to
                h_fft_gpu = gpuarray.to_gpu(
                    h_fft[:cuda_fft_len].astype(np.complex128) / n_fft)
                gpu_resources = dict(
                    use_cuda=True,
                    fft_plan=cudafft.Plan(n_fft, np.float64, np.complex128),
                    ifft_plan=cudafft.Plan(n_fft, np.complex128, np.float64),
                    x_fft=gpuarray.empty(cuda_fft_len, np.complex128),
                    x=gpuarray.empty(int(n_fft), np.float64))
            except Exception:
                logger.info('CUDA not used, could not instantiate memory '
                            '(arrays may be too large), falling back to '
                            'n_jobs=1')
            else:
                # Commit only once everything succeeded, so a failure never
                # leaves a truncated GPU h_fft paired with use_cuda=False.
                h_fft = h_fft_gpu
                cuda_dict.update(gpu_resources)
                logger.info('Using CUDA for FFT FIR filtering')
        else:
            logger.info('CUDA not used, CUDA has not been initialized, '
                        'falling back to n_jobs=1')
    return n_jobs, cuda_dict, h_fft
Example #25
0
def init_cuda():
    """Initialize CUDA functionality

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    importing mne. If this variable is not set, this function can
    be manually executed.

    Notes
    -----
    On success the module-level ``cuda_capable`` flag is set to True and
    the three element-wise kernels are stored in module globals. Every
    failure path logs a warning and returns with ``cuda_capable`` False.
    """
    global cuda_capable
    global cuda_multiply_inplace_c128
    global cuda_halve_c128
    global cuda_real_c128
    if cuda_capable is True:
        logger.info('CUDA previously enabled, currently %s available memory' %
                    sizeof_fmt(mem_get_info()[0]))
        return
    # Triage possible errors for informative messaging
    cuda_capable = False
    try:
        import pycuda.gpuarray
        import pycuda.driver
    except ImportError:
        logger.warning('module pycuda not found, CUDA not enabled')
        return
    try:
        # Initialize CUDA; happens with importing autoinit
        import pycuda.autoinit  # noqa, analysis:ignore
    except ImportError:
        logger.warning('pycuda.autoinit could not be imported, likely '
                       'a hardware error, CUDA not enabled')
        return
    # Make sure scikits.cuda is installed
    try:
        from scikits.cuda import fft as cudafft
    except ImportError:
        logger.warning('module scikits.cuda not found, CUDA not enabled')
        return

    # Make our multiply inplace kernel
    from pycuda.elementwise import ElementwiseKernel

    # let's construct our own CUDA multiply in-place function
    cuda_multiply_inplace_c128 = ElementwiseKernel(
        'pycuda::complex<double> *a, pycuda::complex<double> *b',
        'b[i] *= a[i]', 'multiply_inplace')
    cuda_halve_c128 = ElementwiseKernel('pycuda::complex<double> *a',
                                        'a[i] /= 2.0', 'halve_value')
    cuda_real_c128 = ElementwiseKernel('pycuda::complex<double> *a',
                                       'a[i] = real(a[i])', 'real_value')

    # Make sure we can use 64-bit FFTs
    try:
        cudafft.Plan(16, np.float64, np.complex128)  # will get auto-GC'ed
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit are not mistaken for a missing device capability.
        logger.warning('Device does not support 64-bit FFTs, '
                       'CUDA not enabled')
        return
    cuda_capable = True
    # Figure out limit for CUDA FFT calculations
    logger.info('Enabling CUDA with %s available memory' %
                sizeof_fmt(mem_get_info()[0]))
Example #26
0
def setup_cuda_fft_resample(n_jobs, W, new_len):
    """Set up CUDA FFT resampling

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT resampling.
    W : array
        The filtering function to be used during resampling.
        If n_jobs='cuda' and the CUDA setup succeeds, it is shortened
        (since CUDA assumes FFTs of real signals are half the length of
        the signal), divided by ``new_len`` and turned into a gpuarray.
    new_len : int
        The size of the array following resampling.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following CUDA-related variables:
            use_cuda : bool
                Whether CUDA should be used.
            fft_plan : instance of FFTPlan
                FFT plan to use in calculating the FFT.
            ifft_plan : instance of FFTPlan
                FFT plan to use in calculating the IFFT.
            x_fft : instance of gpuarray
                Zero-filled GPU space for storing the result of the
                frequency-domain multiplication.
            x : instance of gpuarray
                Empty allocated GPU space for the data to resample.
            y_fft, y : None
                Placeholders; this function never populates them.
        Every entry except ``use_cuda`` is None when CUDA is not used.
    W : array | instance of gpuarray
        This will either be a gpuarray (if CUDA enabled) or the input
        ``W``, untouched. If CUDA is enabled, W will be modified
        appropriately for use with filter.fft_multiply().

    Notes
    -----
    This function is designed to be used with fft_resample().
    """
    cuda_dict = dict(use_cuda=False,
                     fft_plan=None,
                     ifft_plan=None,
                     x_fft=None,
                     x=None,
                     y_fft=None,
                     y=None)
    if n_jobs != 'cuda':
        return n_jobs, cuda_dict, W

    n_jobs = 1
    if not cuda_capable:
        logger.info('CUDA not used, CUDA has not been initialized, '
                    'falling back to n_jobs=1')
        return n_jobs, cuda_dict, W

    n_fft_x, n_fft_y = len(W), new_len
    # length of the one-sided spectrum of a real signal of length n
    cuda_fft_len_x = int((n_fft_x - (n_fft_x % 2)) // 2 + 1)
    cuda_fft_len_y = int((n_fft_y - (n_fft_y % 2)) // 2 + 1)

    # try setting up for float64
    try:
        # do the IFFT normalization now so we don't have to later
        W_gpu = gpuarray.to_gpu(
            W[:cuda_fft_len_x].astype(np.complex128) / n_fft_y)
        gpu_state = dict(
            use_cuda=True,
            fft_plan=cudafft.Plan(n_fft_x, np.float64, np.complex128),
            ifft_plan=cudafft.Plan(n_fft_y, np.complex128, np.float64),
            x_fft=gpuarray.zeros(max(cuda_fft_len_x, cuda_fft_len_y),
                                 np.complex128),
            x=gpuarray.empty(max(int(n_fft_x), int(n_fft_y)),
                             np.float64))
    except Exception:
        # Deliberate best-effort: any allocation/plan failure means we
        # silently fall back to the CPU path with the caller's W intact.
        logger.info('CUDA not used, could not instantiate memory '
                    '(arrays may be too large), falling back to '
                    'n_jobs=1')
    else:
        # Only commit once *everything* succeeded, so that W and
        # cuda_dict['use_cuda'] can never disagree.
        W = W_gpu
        cuda_dict.update(gpu_state)
        logger.info('Using CUDA for FFT resampling')
    return n_jobs, cuda_dict, W
Example #27
0
import pycuda.gpuarray as gpuarray
import numpy as np

import scikits.cuda.fft as cu_fft

# Demo: round-trip a batch of real signals through the GPU FFT and compare
# against NumPy, then run an in-place complex FFT.
# Written so that it behaves the same on Python 2 and Python 3
# (single-argument print calls, explicit floor division).
print('Testing fft/ifft..')
N = 4096 * 16
batch_size = 16

# Reference result computed on the host with NumPy.
x = np.asarray(np.random.rand(batch_size, N), np.float32)
xf = np.fft.fft(x)
y = np.real(np.fft.ifft(xf))

x_gpu = gpuarray.to_gpu(x)
# A real-to-complex transform stores only N // 2 + 1 coefficients per row;
# floor division keeps the shape an integer on Python 3 as well.
xf_gpu = gpuarray.empty((batch_size, N // 2 + 1), np.complex64)
plan_forward = cu_fft.Plan(N, np.float32, np.complex64, batch_size)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(N, np.complex64, np.float32, batch_size)
# The trailing True is the `scale` argument of scikits.cuda's ifft.
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)

# Two spaces after the colon reproduce the output of the original
# two-argument print statement.
print('Success status:  %s' % np.allclose(y, y_gpu.get(), atol=1e-6))

print('Testing in-place fft..')
x = np.asarray(np.random.rand(batch_size, N) +
               1j * np.random.rand(batch_size, N), np.complex64)
x_gpu = gpuarray.to_gpu(x)

# Same array as input and output: the transform overwrites x_gpu.
plan = cu_fft.Plan(N, np.complex64, np.complex64, batch_size)
cu_fft.fft(x_gpu, x_gpu, plan)
Example #28
0
def _try_enable_cuda():
    """Run every step needed for CUDA filtering and report success.

    Builds the three module-level element-wise kernels as a side effect.

    Returns
    -------
    ok : bool
        True if pycuda, pycuda.autoinit and scikits.cuda could be imported
        and a 64-bit FFT plan could be created. False otherwise, after a
        warning describing the failing step has been logged.

    Raises
    ------
    RuntimeError
        If pycuda is importable but its ElementwiseKernel objects cannot
        be constructed.
    """
    global cuda_multiply_inplace_complex128
    global cuda_halve_value_complex128
    global cuda_real_value_complex128

    # Triage possible errors for informative messaging
    try:
        import pycuda.gpuarray  # noqa
        import pycuda.driver  # noqa
    except ImportError:
        logger.warning('module pycuda not found, CUDA not enabled')
        return False

    try:
        # Initialize CUDA; happens with importing autoinit
        import pycuda.autoinit  # noqa
    except ImportError:
        logger.warning('pycuda.autoinit could not be imported, likely '
                       'a hardware error, CUDA not enabled')
        return False

    # Make our own in-place element-wise kernels
    try:
        from pycuda.elementwise import ElementwiseKernel
        dtype = 'pycuda::complex<double>'
        cuda_multiply_inplace_complex128 = \
            ElementwiseKernel(dtype + ' *a, ' + dtype + ' *b',
                              'b[i] *= a[i]', 'multiply_inplace')
        cuda_halve_value_complex128 = \
            ElementwiseKernel(dtype + ' *a', 'a[i] /= 2.0',
                              'halve_value')
        cuda_real_value_complex128 = \
            ElementwiseKernel(dtype + ' *a', 'a[i] = real(a[i])',
                              'real_value')
    except Exception:
        # This should never happen
        raise RuntimeError('pycuda ElementwiseKernel could not be '
                           'constructed, please report this issue '
                           'to mne-python developers with your '
                           'system information and pycuda version')

    # Make sure scikits.cuda is installed
    try:
        from scikits.cuda import fft as cudafft
    except ImportError:
        logger.warning('module scikits.cuda not found, CUDA not '
                       'enabled')
        return False

    # Make sure we can use 64-bit FFTs; the throw-away plan is not kept
    try:
        cudafft.Plan(16, np.float64, np.complex128)
    except Exception:
        logger.warning('Device does not support 64-bit FFTs, '
                       'CUDA not enabled')
        return False
    return True


def init_cuda():
    """Initialize CUDA functionality

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    importing mne. If this variable is not set, this function can
    be manually executed.

    The outcome is stored in the module-level ``cuda_capable`` flag, and
    ``requires_cuda`` is rebuilt as a skip decorator reflecting that flag.
    If CUDA was already enabled, only the available memory is logged.

    Raises
    ------
    RuntimeError
        If the pycuda element-wise kernels cannot be constructed; in that
        case ``cuda_capable`` is left False and ``requires_cuda`` is not
        updated.
    """
    global cuda_capable
    global requires_cuda
    if cuda_capable is True:
        logger.info('CUDA previously enabled, currently %s available memory' %
                    sizeof_fmt(mem_get_info()[0]))
        return
    # Stay disabled if the setup below raises.
    cuda_capable = False
    cuda_capable = _try_enable_cuda()
    if cuda_capable:
        logger.info('Enabling CUDA with %s available memory' %
                    sizeof_fmt(mem_get_info()[0]))
    requires_cuda = np.testing.dec.skipif(not cuda_capable,
                                          'CUDA not initialized')
Example #29
0
def sample_defrost_gpu(lat, func, gamma, m2_eff):
    """Calculate a sample of random values on the lattice.

    A one-dimensional kernel is tabulated on the host and transformed
    with FFTW, then handed to the CUDA kernel ``func``, which fills an
    (n, n, n) real array on the GPU. That array is forward transformed
    with cuFFT (through scikits.cuda), multiplied on the host by complex
    Gaussian noise, inverse transformed, and finally scaled by
    ``1 / lat.VL``.

    Parameters
    ----------
    lat : Lattice
        Lattice description. The attributes read here are ``mpl``, ``n``
        (lattice size per dimension), ``nn``, ``dk``, ``dx``,
        ``prec_real``, ``prec_complex``, ``dimx``, ``dimy``, ``dimz``,
        ``cuda_block_1``, ``cuda_grid``, ``test`` and ``VL``.
    func : callable
        CUDA kernel, called with the output array, the kernel table and
        the size arguments plus ``block=`` and ``grid=`` keywords.
    gamma : float
        Exponent applied to ``kk**2 + m2_eff`` (-0.25 or +0.25).
    m2_eff : float
        Effective mass term added to ``kk**2``.

    Returns
    -------
    res : ndarray
        Array of shape (n, n, n) and dtype ``lat.prec_real``.

    Notes
    -----
    When ``lat.test`` is true the NumPy random generator is re-seeded with
    1, so the result is reproducible.
    """
    import scikits.cuda.fft as fft
    import fftw3

    # Various constants
    mpl = lat.mpl
    n = lat.n
    nn = lat.nn
    oversample = 16
    nos = n * oversample ** 2
    dk = lat.dk
    dx = lat.dx
    dkos = dk / (2. * oversample)
    dxos = dx / oversample
    kcut = nn * dk / 2.0
    norm = 0.5 / (math.sqrt(2 * pi * dk**3.) * mpl) * (dkos / dxos)

    ker = np.empty(nos, dtype=lat.prec_real)
    # The in-place plan is created before `ker` is filled; presumably
    # because planning with 'measure' may overwrite the array -- TODO
    # confirm against the fftw3 bindings.
    fft1 = fftw3.Plan(ker,
                      ker,
                      direction='forward',
                      flags=['measure'],
                      realtypes=['realodd 10'])

    # range() instead of xrange() so the loops also run on Python 3
    for k in range(nos):
        kk = (k + 0.5) * dkos
        ker[k] = kk * (kk**2. + m2_eff)**gamma * math.exp(-(kk / kcut)**2.)
    fft1.execute()
    fftw3.destroy_plan(fft1)

    for k in range(nos):
        ker[k] = norm * ker[k] / (k + 1)

    # Floor division keeps the half-spectrum length an integer on Python 3.
    Fk_gpu = gpuarray.zeros((n // 2 + 1, n, n), dtype=lat.prec_complex)

    ker_gpu = gpuarray.to_gpu(ker)
    tmp_gpu = gpuarray.zeros((n, n, n), dtype=lat.prec_real)

    plan = fft.Plan(tmp_gpu.shape, lat.prec_real, lat.prec_complex)
    plan2 = fft.Plan(tmp_gpu.shape, lat.prec_complex, lat.prec_real)

    func(tmp_gpu,
         ker_gpu,
         np.uint32(nn),
         np.float64(oversample),
         np.uint32(lat.dimx),
         np.uint32(lat.dimy),
         np.uint32(lat.dimz),
         block=lat.cuda_block_1,
         grid=lat.cuda_grid)

    fft.fft(tmp_gpu, Fk_gpu, plan)

    if lat.test:
        print('Testing mode on! Set testQ to False to disable this.\n')
        np.random.seed(1)

    # Complex Gaussian noise, one draw per Fourier coefficient.
    rr1 = (np.random.normal(size=Fk_gpu.shape) +
           np.random.normal(size=Fk_gpu.shape) * 1j)

    # The multiplication happens on the host, then goes back to the GPU.
    Fk = Fk_gpu.get()
    Fk *= rr1
    Fk_gpu = gpuarray.to_gpu(Fk)

    fft.ifft(Fk_gpu, tmp_gpu, plan2)
    res = (tmp_gpu.get()).astype(lat.prec_real)

    res *= 1. / lat.VL

    return res
Example #30
0
def _save_and_close_figure(outdir, filename):
    """Save the current pyplot figure as `filename` under `outdir`, then close it.

    ``outdir=None`` saves relative to the current working directory. One
    trailing '/' is stripped from `outdir` and the directory is created
    if it does not exist yet.
    """
    if outdir is None:
        target = filename
    else:
        directory = outdir[:-1] if outdir[-1:] == '/' else outdir
        if not os.path.exists(directory):
            os.makedirs(directory)
        target = directory + '/' + filename
    plt.savefig(target)
    plt.close()


def main(infile, outdir, ISIZE, PLOT_ME):
    """Grid one channel of a visibility file, CLEAN it on the GPU and
    optionally save diagnostic plots.

    Parameters
    ----------
    infile : str
        Path of the FITS file opened with ``pyfits.open``; its base name
        (without extension) prefixes the image file names.
    outdir : str | None
        Directory receiving the PNG files (created if missing). None
        saves into the current working directory.
    ISIZE : int
        Number of image pixels; must be at least 256.
    PLOT_ME : bool
        If true, save the clean-beam, dirty and cleaned images as PNGs.

    Raises
    ------
    ValueError
        If ``ISIZE`` is below 256, for which no clean-beam scale is
        defined.
    """
    # Load settings for each example
    settings = {
        'vfile': infile,                        # input file name
        'imsize': np.int32(ISIZE),              # number of image pixels
        'cell': np.float32(3600. / ISIZE),      # pixel size
        'briggs': np.float32(1e7),              # weight parameter
    }
    imsize = settings['imsize']

    # Scale passed to serial_clean_beam is imsize / divisor. Picked up
    # front so an unsupported size fails with a clear message instead of an
    # unbound `cpsf` after the gridding has already run.
    if imsize >= 1024:
        beam_divisor = 50.
    elif imsize >= 512:
        beam_divisor = 25.
    elif imsize >= 256:
        beam_divisor = 12.
    else:
        raise ValueError('ISIZE must be at least 256, got %d' % imsize)

    # The FFT plan works on a grid twice the image size per axis
    # (2048 when imsize=1024). Plain int(): np.int no longer exists in NumPy.
    nx = np.int32(2 * imsize)
    plan = fft.Plan((int(nx), int(nx)), np.complex64, np.complex64)

    # NOTE(review): the file handle is never closed, as in the original.
    f = pyfits.open(settings['vfile'])
    # NOTE(review): read but unused; only channel 4 is processed below.
    channel = f[0].data.data.shape[3]

    for chan in range(4, 5):

        # dpsf (PSF) comes back as a host array, gpu_im (dirty image)
        # lives on the GPU.
        dpsf, gpu_im = cuda_gridvis(f, settings, plan, chan)

        gpu_dpsf = gpu.to_gpu(dpsf)
        if PLOT_ME:
            dirty = np.roll(np.fliplr(gpu_im.get()), 1, axis=1)

        # Clean the PSF
        cpsf = serial_clean_beam(dpsf, imsize / beam_divisor)

        gpu_cpsf = gpu.to_gpu(cpsf)

        if PLOT_ME:
            print("Plotting dirty and cleaned beam")
            fig, axs = plt.subplots()
            plt.subplots_adjust(wspace=0)
            axs.imshow(dpsf, vmin=np.percentile(dpsf, 0),
                       vmax=np.percentile(dpsf, 99), cmap=cm.gray)
            _save_and_close_figure(outdir, 'test_cleanbeam_%d.png' % chan)

        # Run CLEAN
        gpu_dirty, gpu_pmodel, gpu_clean = cuda_hogbom(
            gpu_im, gpu_dpsf, gpu_cpsf, thresh=0.2, gain=0.1)

        if PLOT_ME:
            prefix, ext = os.path.splitext(os.path.basename(infile))
            # Shared colour range: 1st to 99th percentile of the dirty image.
            vra = [np.percentile(dirty, 1), np.percentile(dirty, 99)]

            print("Plotting dirty image and dirty image after iterative "
                  "source removal")
            fig, axs = plt.subplots()
            plt.subplots_adjust(wspace=0)
            axs.imshow(dirty, vmin=vra[0], vmax=vra[1], cmap=cm.jet,
                       origin='lower')
            axs.set_title('Original dirty image')
            _save_and_close_figure(outdir,
                                   prefix + '_dirty_final_%d.png' % chan)

            print("Plotting dirty image and final clean image")
            fig, axs = plt.subplots(figsize=(6.1, 6))
            plt.subplots_adjust(wspace=0)
            clean = np.roll(np.fliplr(gpu_clean.get()), 1, axis=1)
            axs.imshow(clean, vmin=vra[0], vmax=vra[1], cmap=cm.hot,
                       origin='lower')
            axs.set_title('Final cleaned image')
            _save_and_close_figure(outdir,
                                   prefix + '_clean_final_%d.png' % chan)