def scenario_contiguous_channels_wpadding(batch, tic, toc):
    '''
    # Scenario: batched IFFT of 39 snapshots
    # 39 x 16385 complex64 --> 39 x 32768 float32
    # padding complex input so channel dimension has 16400 elements.
    '''
    n = array([2 * BENG_CHANNELS_], int32)
    beng_channels_padded = 16400
    # create batched FFT plan configuration
    inembed = array([beng_channels_padded], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    idist = int32(beng_channels_padded)  # distance (in elements) between consecutive input snapshots
    plan = cufft.cufftPlanMany(int32(1), n.ctypes.data,
                               inembed.ctypes.data, int32(1), idist,
                               onembed.ctypes.data, int32(1), int32(2 * BENG_CHANNELS_),
                               cufft.CUFFT_C2R, int32(batch))
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch * beng_channels_padded)   # complex64
    gpu_out = cuda.mem_alloc(4 * batch * 2 * BENG_CHANNELS_)    # float32
    cpu_in = standard_normal(batch * beng_channels_padded) + \
        1j * standard_normal(batch * beng_channels_padded)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty(batch * 2 * BENG_CHANNELS_, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    cpu_out.resize((batch, 2 * BENG_CHANNELS_))
    # execute on CPU
    cpu = irfft(cpu_in.reshape((batch, beng_channels_padded))[:, :BENG_CHANNELS], axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nContiguous Channel w/ Padding Scenario:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results: ', 'pass' if allclose(cpu, cpu_out / (2 * BENG_CHANNELS_)) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
def compute_Ds(self, Mx, My):
    """
    Parameters
    ----------
    Mx : integer
        Order in the x dimension
    My : integer
        Order in the y dimension

    Returns
    -------
    The dirichlet coefficients of all gabor filters with order Mx, My
    in the format of PitchArray with shape (num_neurons, 2*Mx+1, 2*My+1)
    """
    import scikits.cuda.cufft as cufft

    d_Ds = parray.empty((self.num_neurons, 2*My+1, 2*Mx+1), self.dtype)
    ONE_TIME_FILTER = min(
        1024**3 / (self.Px * self.Py * d_Ds.dtype.itemsize) / 2,
        self.num_neurons)

    n = np.asarray((self.Py, self.Px), np.int32)
    if self.dtype == np.complex128:
        plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0, None, 1, 0,
                                   cufft.CUFFT_Z2Z, ONE_TIME_FILTER)
        fftfunc = cufft.cufftExecZ2Z
    else:
        plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0, None, 1, 0,
                                   cufft.CUFFT_C2C, ONE_TIME_FILTER)
        fftfunc = cufft.cufftExecC2C

    fft2Dsfun = get_fft2Ds_kernel(dtype=self.dtype)

    for i in range(0, self.num_neurons, ONE_TIME_FILTER):
        N_filters = min(ONE_TIME_FILTER, self.num_neurons - i)
        self.generate_visual_receptive_fields(startbias=i, N_filters=N_filters)

        if N_filters < ONE_TIME_FILTER:
            cufft.cufftDestroy(plan)
            if self.dtype == np.complex128:
                plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0,
                                           None, 1, 0, cufft.CUFFT_Z2Z, N_filters)
            else:
                plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0,
                                           None, 1, 0, cufft.CUFFT_C2C, N_filters)

        # be careful with the side-by-side constraint
        fftfunc(plan, int(self.filters.gpudata), int(self.filters.gpudata),
                cufft.CUFFT_FORWARD)

        launch_kernel(fft2Dsfun, (256, 1, 1), (Mx*2+1, My*2+1),
                      [[d_Ds, i * d_Ds.ld], self.filters, Mx, My,
                       self.Px, self.Py, N_filters, d_Ds.ld,
                       self.dx*self.dy])

    cufft.cufftDestroy(plan)
    return d_Ds
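# For reference, the batched in-place 2-D transform pattern used above can be
# exercised on its own. A minimal sketch (illustrative sizes, assuming PyCUDA
# and scikits.cuda are installed) of the same cufftPlanMany / cufftExecC2C /
# cufftDestroy sequence on a plain GPUArray:
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import scikits.cuda.cufft as cufft

batch, Py, Px = 8, 32, 32
x = (np.random.randn(batch, Py, Px) +
     1j * np.random.randn(batch, Py, Px)).astype(np.complex64)
x_gpu = gpuarray.to_gpu(x)

# rank-2 plan over the last two axes, default (contiguous) layout, batched
n = np.asarray((Py, Px), np.int32)
plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0, None, 1, 0,
                           cufft.CUFFT_C2C, batch)
# forward transform, in place
cufft.cufftExecC2C(plan, int(x_gpu.gpudata), int(x_gpu.gpudata),
                   cufft.CUFFT_FORWARD)
cufft.cufftDestroy(plan)

# numpy's fft2 transforms the last two axes by default
print 'max |residual|:', np.abs(x_gpu.get() - np.fft.fft2(x)).max()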
def scenario_contiguous_channels(batch, tic, toc):
    '''
    # Scenario: batched IFFT of batch snapshots
    # batch x 16385 complex64 --> batch x 32768 float32
    # no padding.
    '''
    n = array([2 * BENG_CHANNELS_], int32)
    seed(12740)
    # create batched FFT plan configuration
    inembed = array([BENG_CHANNELS], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, BENG_CHANNELS,
                               onembed.ctypes.data, 1, 2 * BENG_CHANNELS_,
                               cufft.CUFFT_C2R, batch)
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch * BENG_CHANNELS)        # complex64
    gpu_out = cuda.mem_alloc(4 * batch * 2 * BENG_CHANNELS_)  # float32
    cpu_in = standard_normal(batch * BENG_CHANNELS) + \
        1j * standard_normal(batch * BENG_CHANNELS)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty(batch * 2 * BENG_CHANNELS_, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    cpu_out.resize((batch, 2 * BENG_CHANNELS_))
    # execute on CPU
    cpu = irfft(cpu_in.reshape((batch, BENG_CHANNELS)), axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nContiguous Channel Scenario:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results: ', 'pass' if allclose(cpu, cpu_out / (2 * BENG_CHANNELS_)) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
def scenario_contiguous_channels_oversampled64(batch, tic, toc):
    '''
    Scenario: batched IFFT of 2*2**14*64 channels
    '''
    fft_window_oversample = 64 * 2 * 2**14
    n = array([fft_window_oversample], int32)
    # create batched FFT plan configuration
    inembed = array([fft_window_oversample / 2 + 1], int32)
    onembed = array([fft_window_oversample], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, fft_window_oversample / 2 + 1,
                               onembed.ctypes.data, 1, fft_window_oversample,
                               cufft.CUFFT_C2R, batch)
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch * (fft_window_oversample / 2 + 1))  # complex64
    gpu_out = cuda.mem_alloc(4 * batch * fft_window_oversample)           # float32
    data_shape = (batch, fft_window_oversample / 2 + 1)
    cpu_in = standard_normal(data_shape) + 1j * standard_normal(data_shape)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty((batch, fft_window_oversample), float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    # execute on CPU
    cpu = irfft(cpu_in, axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nOversampling by x64 Scenario with batches:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results: ', 'pass' if allclose(cpu, cpu_out / fft_window_oversample) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
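# --- Module-level setup assumed by the scenario functions above (a sketch) ---
# The channel counts follow the 16385-channel snapshots quoted in the
# docstrings; the SWARM and R2DBE sample rates are assumptions inferred from
# the 4096/2496 ratio that appears in later comments (only the ratio matters
# for these routines). The imports cover the names used as globals throughout
# this section.
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import scikits.cuda.cufft as cufft
import numpy as np
from math import ceil, floor
from numpy import (array, int32, float32, float64, complex64, empty, zeros,
                   hstack, arange, unique, median, allclose)
from numpy.fft import rfft, irfft
from numpy.random import standard_normal, seed
from scipy.interpolate import interp1d

BENG_CHANNELS_ = 2**14              # 16384 channels (power-of-two part)
BENG_CHANNELS = BENG_CHANNELS_ + 1  # 16385 channels including DC and Nyquist
SWARM_RATE = 2496e6                 # assumed value; see note above
R2DBE_RATE = 4096e6                 # assumed value; see note above

if __name__ == '__main__':
    tic = cuda.Event()
    toc = cuda.Event()
    scenario_contiguous_channels(39, tic, toc)
    scenario_contiguous_channels_wpadding(39, tic, toc)
    # smaller batch here: each oversampled window is 64x larger
    scenario_contiguous_channels_oversampled64(8, tic, toc)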
def cleanup(self):
    # destroy plans
    cufft.cufftDestroy(self.__plan_A)
    cufft.cufftDestroy(self.__plan_B)
    cufft.cufftDestroy(self.__plan_C)
    #
    self.__gpu_quantized_0.free()
    self.__gpu_quantized_1.free()
    self.__gpu_time_series_0.free()
    self.__gpu_time_series_1.free()
def scenario_contiguous_batched39_resample(num_snapshots, tic, toc):
    '''
    # Scenario: Fourier resample of num_snapshots
    # A iFFT:                [num_snapshots, 16385] complex64 -->
    # B FFT:                 [39, num_snapshots/39 * 32768] float32 -->
    # C iFFT + zero-padding: [39, num_snapshots/39 * 32768*4096/2496/2 + 1] complex64 -->
    #                        [39, num_snapshots * 32768 * 4096 / 2496] float32
    #
    # C(B(A(gpu_1))) = C(B(gpu_2)) = C(gpu_1) = gpu_2
    # num_snapshots is a multiple of 39.
    # A executed using batch = num_snapshots
    # B & C executed using batch = num_snapshots / 39
    '''
    print '\nContiguous channel Fourier resampling scenario in batches of 39:'
    assert num_snapshots % 39 == 0, 'error: num_snapshots must be integer multiple of 39'

    # construct arrays
    batch = num_snapshots / 39
    print 'batch: %d' % batch
    gpu_1 = cuda.mem_alloc(int(8 * batch *
                               (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1)))
    gpu_2 = cuda.mem_alloc(int(4 * batch *
                               (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE)))
    cpu_in = standard_normal(num_snapshots * BENG_CHANNELS) + \
        1j * standard_normal(num_snapshots * BENG_CHANNELS)
    cpu_in = cpu_in.astype(complex64)

    # create FFT plans
    n_A = array([2 * BENG_CHANNELS_], int32)
    inembed_A = array([BENG_CHANNELS], int32)
    onembed_A = array([2 * BENG_CHANNELS_], int32)
    plan_A = cufft.cufftPlanMany(1, n_A.ctypes.data,
                                 inembed_A.ctypes.data, 1, BENG_CHANNELS,
                                 onembed_A.ctypes.data, 1, 2 * BENG_CHANNELS_,
                                 cufft.CUFFT_C2R, num_snapshots)

    n_B = array([39 * 2 * BENG_CHANNELS_], int32)
    inembed_B = array([39 * 2 * BENG_CHANNELS_], int32)
    onembed_B = array([int(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1)], int32)
    plan_B = cufft.cufftPlanMany(1, n_B.ctypes.data,
                                 inembed_B.ctypes.data, 1, 39 * 2 * BENG_CHANNELS_,
                                 onembed_B.ctypes.data, 1,
                                 int32(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1),
                                 cufft.CUFFT_R2C, batch)

    n_C = array([39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE], int32)
    inembed_C = array([39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1], int32)
    onembed_C = array([39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE], int32)
    plan_C = cufft.cufftPlanMany(1, n_C.ctypes.data,
                                 inembed_C.ctypes.data, 1,
                                 int32(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1),
                                 onembed_C.ctypes.data, 1,
                                 int32(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE),
                                 cufft.CUFFT_C2R, batch)

    # zero out gpu_1
    kernel_module = SourceModule(kernel_source)
    zero_out = kernel_module.get_function('zero_out')
    # sanity check:
    zero_out(gpu_1,
             int32(batch * (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1)),
             block=(1024, 1, 1),
             grid=(int(ceil(batch * (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1) / 1024.)), 1))
    cpu_out = empty((batch * (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1)), complex64)
    cuda.memcpy_dtoh(cpu_out, gpu_1)
    assert len(unique(cpu_out)) == 1, 'problem with zero_out'

    # move data to device
    cuda.memcpy_htod(gpu_1, cpu_in)

    tic.record()
    # Turn SWARM snapshots into timeseries
    cufft.cufftExecC2R(plan_A, int(gpu_1), int(gpu_2))
    # zero out gpu_1
    zero_out(gpu_1,
             int32(batch * (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1)),
             block=(1024, 1, 1),
             grid=(int(ceil(batch * (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1) / 1024.)), 1))
    # Turn concatenated SWARM time series into single spectrum (already zero-padded)
    cufft.cufftExecR2C(plan_B, int(gpu_2), int(gpu_1))
    # Turn padded SWARM spectrum into time series with R2DBE sampling rate
    cufft.cufftExecC2R(plan_C, int(gpu_1), int(gpu_2))
    toc.record()
    toc.synchronize()

    # check on CPU
    cpu_A = irfft(cpu_in.reshape(num_snapshots, BENG_CHANNELS), axis=-1).astype(float32)
    cpu_B = rfft(cpu_A.reshape(batch, 39 * 2 * BENG_CHANNELS_), axis=-1).astype(complex64)
    cpu_C = irfft(hstack([cpu_B,
                          zeros((batch,
                                 (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1) -
                                 (39 * 2 * BENG_CHANNELS_ / 2 + 1)), complex64)]),
                  axis=-1)
    cpu_out = empty(num_snapshots * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_2)

    print 'test results: ', 'pass' if allclose(cpu_C.flatten(),
                                               cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)) else 'fail'
    print 'max residual: ', max(abs(cpu_C.flatten() - cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)))
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), ' x real (both SB)'

    # destroy plans
    cufft.cufftDestroy(plan_A)
    cufft.cufftDestroy(plan_B)
    cufft.cufftDestroy(plan_C)
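# kernel_source is not reproduced in this section. The 'zero_out' kernel it
# provides is launched above with a complex64 buffer pointer and an element
# count; a minimal stand-in consistent with that call signature (an
# assumption, not the original source) would be:
from pycuda.compiler import SourceModule

kernel_source_sketch = """
__global__ void zero_out(float2 *a, int n)
{
    // one thread per complex64 element
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        a[i].x = 0.0f;
        a[i].y = 0.0f;
    }
}
"""
zero_out_sketch = SourceModule(kernel_source_sketch).get_function('zero_out')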
def fft_interp(gpu_1, gpu_2, num_snapshots, interp_kind='nearest', cpu_check=True):
    '''
    Batched FFT to time series and then interpolation to resample.
    No filter applied yet...
    '''
    tic = cuda.Event()
    toc = cuda.Event()

    batch_size = num_snapshots
    print 'batch size: %d' % batch_size

    # create batched FFT plan configuration
    n = array([2 * BENG_CHANNELS_], int32)
    inembed = array([BENG_CHANNELS], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, BENG_CHANNELS,
                               onembed.ctypes.data, 1, 2 * BENG_CHANNELS_,
                               cufft.CUFFT_C2R, batch_size)

    # fetch kernel that resamples
    kernel_module = SourceModule(kernel_source)
    interp_1d = kernel_module.get_function(interp_kind)

    # execute plan
    cufft.cufftExecC2R(plan, int(gpu_1), int(gpu_2))

    # interpolate
    tic.record()
    xs_size = int(floor(batch_size * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE)) - 1
    TPB = 512                            # threads per block
    nB = int(ceil(1. * xs_size / TPB))   # number of blocks
    # the 'nearest' and 'linear' kernels share the same argument list
    interp_1d(gpu_2, gpu_1, int32(xs_size),
              float64(SWARM_RATE / R2DBE_RATE), float32(1. / (2 * BENG_CHANNELS_)),
              block=(TPB, 1, 1), grid=(nB, 1))
    toc.record()
    toc.synchronize()

    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), ' x real (both SB)'

    # destroy plan
    cufft.cufftDestroy(plan)

    # check on CPU (cpu_in, the input spectra, is expected at module scope)
    if cpu_check:
        # timestep sizes for SWARM and R2DBE rates
        dt_s = 1.0 / SWARM_RATE
        dt_r = 1.0 / R2DBE_RATE
        # the timespan of one SWARM FFT window
        T_s = dt_s * 2 * BENG_CHANNELS_
        # the timespan of all SWARM data
        T_s_all = T_s * batch_size
        # get time-domain signal
        xs_swarm_rate = irfft(cpu_in, n=2 * BENG_CHANNELS_, axis=1).flatten()
        # and calculate sample points
        t_swarm_rate = arange(0, T_s_all, dt_s)
        print t_swarm_rate[0], t_swarm_rate[-1]
        # calculate resample points (subtract one dt_s from end to avoid extrapolation)
        t_r2dbe_rate = arange(0, T_s_all - dt_s, dt_r)
        # and interpolate
        x_interp = interp1d(t_swarm_rate, xs_swarm_rate, kind=interp_kind)
        cpu_A = x_interp(t_r2dbe_rate)
        cpu_out = np.empty_like(cpu_A, dtype=float32)
        cuda.memcpy_dtoh(cpu_out, gpu_1)
        print 'median residual: ', median(abs(cpu_A - cpu_out))
        if interp_kind == 'nearest':
            cpu_A[::32] = 0
            cpu_out[::32] = 0
        print 'test results: ', 'pass' if allclose(cpu_A, cpu_out) else 'fail'
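# fft_interp fetches its resampling kernel from kernel_source by name
# ('nearest' or 'linear'), which is likewise not shown in this section. The
# sketch below matches the argument list used above (input time series, output
# buffer, number of output samples, input samples per output sample, amplitude
# scale); it illustrates the interpolation step and is not the original kernel
# code.
interp_kernels_sketch = """
__global__ void nearest(const float *in, float *out, int n, double ratio, float scale)
{
    // out[i] = scale * in[round(i * ratio)]
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = scale * in[__double2int_rn(i * ratio)];
}

__global__ void linear(const float *in, float *out, int n, double ratio, float scale)
{
    // linear interpolation between the two bracketing input samples
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        double x = i * ratio;
        int i0 = (int)x;
        float w = (float)(x - i0);
        out[i] = scale * ((1.0f - w) * in[i0] + w * in[i0 + 1]);
    }
}
"""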
toc.synchronize()
time_gpu = tic.time_till(toc)

print 'DEBUG::loading resampled time series cpu_r2dbe'
gpumeminfo(cuda)
cpu_r2dbe = np.empty(num_r2dbe_samples / 2, dtype=float32)
cuda.memcpy_dtoh(cpu_r2dbe, gpu_r2dbe)

print ''
print 'time resampled:', 13.128e-3 * BENG_SNAPSHOTS * (BENG_BUFFER_IN_COUNTS - 1), ' ms'
print 'Transfer size was %d bytes' % cpu_vdif_buf.nbytes
print 'GPU time:', time_gpu, ' ms'

# destroy plans
cufft.cufftDestroy(plan_A)
cufft.cufftDestroy(plan_interp_A)
cufft.cufftDestroy(plan_interp_B)

# free memory
#gpu_r2dbe.free()

if DEBUG:
    import matplotlib.pyplot as plt

    # Now read R2DBE data covering roughly the same time window as the SWARM
    # data. Start at an offset of zero (i.e. from the first VDIF packet) to
    # keep things simple.
    N_r_vdif_frames = int(np.ceil(read_sdbe_vdif.SWARM_TRANSPOSE_SIZE *
                                  (BENG_BUFFER_IN_COUNTS - 1) *
                                  read_sdbe_vdif.R2DBE_RATE /
                                  read_sdbe_vdif.SWARM_RATE))
    vdif_frames_offset = 0
    rel_path_to_in = '/home/shared/sdbe_preprocessed/'
    d = sdbe_preprocess.get_diagnostics_from_file(scan_filename_base,
                                                  rel_path=rel_path_to_in)
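# gpumeminfo(cuda) is called above but not defined in this section; a
# plausible (hypothetical) helper that reports device memory usage:
def gpumeminfo(driver):
    free_bytes, total_bytes = driver.mem_get_info()
    print 'Memory usage:', 1. - 1. * free_bytes / total_bytes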
def fft_batched(gpu_1, gpu_2, num_snapshots, snapshots_per_batch=39, cpu_check=True):
    '''
    gpu_1: pointer to an M x 16385 array on the GPU device where the zeroth
           dimension is increasing snapshot index and the first dimension is
           the positive-frequency half of the spectrum. This array will be
           destroyed. Must have byte size:
           int(8*batch_size*(snapshots_per_batch*BENG_CHANNELS_*R2DBE_RATE/SWARM_RATE+1))
    gpu_2: pointer to result
    snapshots_per_batch: number of snapshots grouped for resampling (% 39 == 0)
    '''
    tic = cuda.Event()
    toc = cuda.Event()

    batch_size = num_snapshots / snapshots_per_batch
    print 'batch size: %d' % batch_size

    # create FFT plans
    n_A = array([2 * BENG_CHANNELS_], int32)
    inembed_A = array([BENG_CHANNELS], int32)
    onembed_A = array([2 * BENG_CHANNELS_], int32)
    plan_A = cufft.cufftPlanMany(1, n_A.ctypes.data,
                                 inembed_A.ctypes.data, 1, BENG_CHANNELS,
                                 onembed_A.ctypes.data, 1, 2 * BENG_CHANNELS_,
                                 cufft.CUFFT_C2R, num_snapshots)

    n_B = array([snapshots_per_batch * 2 * BENG_CHANNELS_], int32)
    inembed_B = array([snapshots_per_batch * 2 * BENG_CHANNELS_], int32)
    onembed_B = array([int(snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1)], int32)
    plan_B = cufft.cufftPlanMany(1, n_B.ctypes.data,
                                 inembed_B.ctypes.data, 1, snapshots_per_batch * 2 * BENG_CHANNELS_,
                                 onembed_B.ctypes.data, 1,
                                 int32(snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1),
                                 cufft.CUFFT_R2C, batch_size)

    n_C = array([snapshots_per_batch * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE], int32)
    inembed_C = array([snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1], int32)
    onembed_C = array([snapshots_per_batch * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE], int32)
    plan_C = cufft.cufftPlanMany(1, n_C.ctypes.data,
                                 inembed_C.ctypes.data, 1,
                                 int32(snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1),
                                 onembed_C.ctypes.data, 1,
                                 int32(snapshots_per_batch * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE),
                                 cufft.CUFFT_C2R, batch_size)

    # fetch kernel that zeroes out an array
    kernel_module = SourceModule(kernel_source)
    zero_out = kernel_module.get_function('zero_out')

    tic.record()
    # Turn SWARM snapshots into timeseries
    cufft.cufftExecC2R(plan_A, int(gpu_1), int(gpu_2))
    # zero out gpu_1
    zero_out(gpu_1,
             int32(batch_size * (snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1)),
             block=(1024, 1, 1),
             grid=(int(ceil(batch_size * (snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1) / 1024.)), 1))
    # Turn concatenated SWARM time series into single spectrum (already zero-padded)
    cufft.cufftExecR2C(plan_B, int(gpu_2), int(gpu_1))
    # Turn padded SWARM spectrum into time series with R2DBE sampling rate
    cufft.cufftExecC2R(plan_C, int(gpu_1), int(gpu_2))
    toc.record()
    toc.synchronize()

    # check on CPU (cpu_in, the input spectra, is expected at module scope)
    if cpu_check:
        cpu_A = irfft(cpu_in.reshape(num_snapshots, BENG_CHANNELS), axis=-1).astype(float32)
        cpu_B = rfft(cpu_A.reshape(batch_size, snapshots_per_batch * 2 * BENG_CHANNELS_),
                     axis=-1).astype(complex64)
        cpu_C = irfft(hstack([cpu_B,
                              zeros((batch_size,
                                     (snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1) -
                                     (snapshots_per_batch * BENG_CHANNELS_ + 1)), complex64)]),
                      axis=-1)
        cpu_out = empty(num_snapshots * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE, float32)
        cuda.memcpy_dtoh(cpu_out, gpu_2)
        print 'test results: ', 'pass' if allclose(cpu_C.flatten(),
                                                   cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)) else 'fail'
        print 'max residual: ', max(abs(cpu_C.flatten() - cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)))

    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), ' x real (both SB)'

    # destroy plans
    cufft.cufftDestroy(plan_A)
    cufft.cufftDestroy(plan_B)
    cufft.cufftDestroy(plan_C)
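# A minimal, hypothetical driver for fft_batched. It assumes the module-level
# setup sketched earlier (imports, BENG_CHANNELS_, R2DBE_RATE, SWARM_RATE) and
# a kernel_source string providing the zero_out kernel; the buffer sizes follow
# the byte-size requirement stated in the docstring above.
num_snapshots = 4 * 39
batch_size = num_snapshots / 39

# gpu_1 starts as the input spectra and is later overwritten by the padded
# R2DBE-rate spectrum, so allocate it for the larger of the two
gpu_1 = cuda.mem_alloc(int(8 * batch_size *
                           (39 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1)))
# gpu_2 receives the resampled real-valued time series
gpu_2 = cuda.mem_alloc(int(4 * batch_size *
                           (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE)))

# random positive-frequency spectra; fft_batched's CPU check reads this
# module-level cpu_in
cpu_in = (standard_normal((num_snapshots, BENG_CHANNELS)) +
          1j * standard_normal((num_snapshots, BENG_CHANNELS))).astype(complex64)
cuda.memcpy_htod(gpu_1, cpu_in)

fft_batched(gpu_1, gpu_2, num_snapshots, snapshots_per_batch=39, cpu_check=True)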
def compute_dirich_space_fft(self, d_Ds, Mx, My, Px, Py, Sx, Sy, Wx, Wy):
    import scikits.cuda.cufft as cufft

    dx = Sx / Px
    dy = Sy / Py
    Px1 = int(np.round(self.Sx / dx))
    Py1 = int(np.round(self.Sy / dy))

    if self.dtype == np.complex128:
        typef = np.dtype(np.float64)
    else:
        typef = np.dtype(np.float32)

    d_dirich = parray.empty((self.num_neurons, Py, Px), typef)

    freemem, totalmem = cuda.mem_get_info()
    ONE_TIME_FILTER = int(min(freemem / (Px1 * Py1 * d_Ds.dtype.itemsize) / 4,
                              self.num_neurons))

    n = np.asarray((Py1, Px1), np.int32)
    if self.dtype == np.complex128:
        plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0, None, 1, 0,
                                   cufft.CUFFT_Z2Z, ONE_TIME_FILTER)
        fftfunc = cufft.cufftExecZ2Z
    else:
        plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0, None, 1, 0,
                                   cufft.CUFFT_C2C, ONE_TIME_FILTER)
        fftfunc = cufft.cufftExecC2C

    Ds2fftfun = get_Ds2fft_kernel(self.dtype)
    d_filter_complex = parray.empty((ONE_TIME_FILTER, Px1 * Py1), self.dtype)
    filter2recfun = get_filter2rec_kernel(self.dtype)

    for i in range(0, self.num_neurons, ONE_TIME_FILTER):
        N_filters = min(ONE_TIME_FILTER, self.num_neurons - i)
        d_filter_complex.fill(0)

        launch_kernel(Ds2fftfun, (256, 1, 1), (Mx*2+1, My*2+1),
                      [[d_Ds, i * d_Ds.ld], d_Ds.ld,
                       d_filter_complex, d_filter_complex.ld,
                       Mx, My, Px1, Py1, N_filters])

        if N_filters < ONE_TIME_FILTER:
            cufft.cufftDestroy(plan)
            if self.dtype == np.complex128:
                plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0,
                                           None, 1, 0, cufft.CUFFT_Z2Z, N_filters)
            else:
                plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0,
                                           None, 1, 0, cufft.CUFFT_C2C, N_filters)

        # be careful with the side-by-side constraint
        fftfunc(plan, int(d_filter_complex.gpudata), int(d_filter_complex.gpudata),
                cufft.CUFFT_INVERSE)

        BLOCK_SIZE = 16
        launch_kernel(filter2recfun, (BLOCK_SIZE, BLOCK_SIZE, 1),
                      (((Px - 1) / BLOCK_SIZE + 1) * ((Py - 1) / BLOCK_SIZE + 1), N_filters),
                      [[d_dirich, i * d_dirich.ld], d_dirich.ld,
                       d_filter_complex, d_filter_complex.ld,
                       N_filters, Px, Py, Px1, Py1])

    cufft.cufftDestroy(plan)
    return d_dirich