def scenario_contiguous_channels_wpadding(batch, tic, toc):
    '''
    # Scenario: batched IFFT of 39 snapshots
    # 39 x 16385 complex64 --> 39 x 32768 float32
    # padding complex input so channel dimension has 16400 elements.
    '''
    n = array([2 * BENG_CHANNELS_], int32)
    beng_channels_padded = 16400
    # create batched FFT plan configuration
    inembed = array([beng_channels_padded], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    idist = int32(beng_channels_padded)  # distance (in elements) between consecutive input snapshots
    plan = cufft.cufftPlanMany(int32(1), n.ctypes.data,
                               inembed.ctypes.data, int32(1), idist,
                               onembed.ctypes.data, int32(1), int32(2 * BENG_CHANNELS_),
                               cufft.CUFFT_C2R, int32(batch))
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch * beng_channels_padded)   # complex64
    gpu_out = cuda.mem_alloc(4 * batch * 2 * BENG_CHANNELS_)    # float32
    cpu_in = standard_normal(batch * beng_channels_padded) + \
        1j * standard_normal(batch * beng_channels_padded)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty(batch * 2 * BENG_CHANNELS_, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    cpu_out.resize((batch, 2 * BENG_CHANNELS_))
    # execute on CPU
    cpu = irfft(cpu_in.reshape((batch, beng_channels_padded))[:, :BENG_CHANNELS], axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nContiguous Channel w/ Padding Scenario:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results: ', 'pass' if allclose(cpu, cpu_out / (2 * BENG_CHANNELS_)) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
def compute_Ds(self, Mx, My):
    """
    Parameters
    ----------
    Mx : integer
        Order in the x dimension
    My : integer
        Order in the y dimension

    Returns
    -------
    The dirichlet coefficients of all gabor filters with order Mx, My
    in the format of PitchArray with shape (num_neurons, 2*Mx+1, 2*My+1)
    """
    import scikits.cuda.cufft as cufft

    d_Ds = parray.empty((self.num_neurons, 2*My+1, 2*Mx+1), self.dtype)
    ONE_TIME_FILTER = min(
        1024**3 / (self.Px * self.Py * d_Ds.dtype.itemsize) / 2,
        self.num_neurons)

    n = np.asarray((self.Py, self.Px), np.int32)
    if self.dtype == np.complex128:
        plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0, None, 1, 0,
                                   cufft.CUFFT_Z2Z, ONE_TIME_FILTER)
        fftfunc = cufft.cufftExecZ2Z
    else:
        plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0, None, 1, 0,
                                   cufft.CUFFT_C2C, ONE_TIME_FILTER)
        fftfunc = cufft.cufftExecC2C

    fft2Dsfun = get_fft2Ds_kernel(dtype=self.dtype)

    for i in range(0, self.num_neurons, ONE_TIME_FILTER):
        N_filters = min(ONE_TIME_FILTER, self.num_neurons - i)
        self.generate_visual_receptive_fields(startbias=i, N_filters=N_filters)

        if N_filters < ONE_TIME_FILTER:
            cufft.cufftDestroy(plan)
            if self.dtype == np.complex128:
                plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0,
                                           None, 1, 0, cufft.CUFFT_Z2Z, N_filters)
            else:
                plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0,
                                           None, 1, 0, cufft.CUFFT_C2C, N_filters)

        # be careful with the side-by-side constraint
        fftfunc(plan, int(self.filters.gpudata), int(self.filters.gpudata),
                cufft.CUFFT_FORWARD)

        launch_kernel(fft2Dsfun, (256, 1, 1), (Mx*2+1, My*2+1),
                      [[d_Ds, i * d_Ds.ld], self.filters, Mx, My,
                       self.Px, self.Py, N_filters, d_Ds.ld,
                       self.dx*self.dy])

    cufft.cufftDestroy(plan)
    return d_Ds
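# For reference, the batched in-place 2-D transform pattern used above can be
# exercised on its own. A minimal sketch (illustrative sizes, assuming PyCUDA
# and scikits.cuda are installed) of the same cufftPlanMany / cufftExecC2C /
# cufftDestroy sequence on a plain GPUArray:
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import scikits.cuda.cufft as cufft

batch, Py, Px = 8, 32, 32
x = (np.random.randn(batch, Py, Px) +
     1j * np.random.randn(batch, Py, Px)).astype(np.complex64)
x_gpu = gpuarray.to_gpu(x)

# rank-2 plan over the last two axes, default (contiguous) layout, batched
n = np.asarray((Py, Px), np.int32)
plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0, None, 1, 0,
                           cufft.CUFFT_C2C, batch)
# forward transform, in place
cufft.cufftExecC2C(plan, int(x_gpu.gpudata), int(x_gpu.gpudata),
                   cufft.CUFFT_FORWARD)
cufft.cufftDestroy(plan)

# numpy's fft2 transforms the last two axes by default
print 'max |residual|:', np.abs(x_gpu.get() - np.fft.fft2(x)).max()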
def scenario_contiguous_channels(batch, tic, toc):
    '''
    # Scenario: batched IFFT of batch snapshots
    # batch x 16385 complex64 --> batch x 32768 float32
    # no padding.
    '''
    n = array([2 * BENG_CHANNELS_], int32)
    seed(12740)
    # create batched FFT plan configuration
    inembed = array([BENG_CHANNELS], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, BENG_CHANNELS,
                               onembed.ctypes.data, 1, 2 * BENG_CHANNELS_,
                               cufft.CUFFT_C2R, batch)
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch * BENG_CHANNELS)        # complex64
    gpu_out = cuda.mem_alloc(4 * batch * 2 * BENG_CHANNELS_)  # float32
    cpu_in = standard_normal(batch * BENG_CHANNELS) + \
        1j * standard_normal(batch * BENG_CHANNELS)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty(batch * 2 * BENG_CHANNELS_, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    cpu_out.resize((batch, 2 * BENG_CHANNELS_))
    # execute on CPU
    cpu = irfft(cpu_in.reshape((batch, BENG_CHANNELS)), axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nContiguous Channel Scenario:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results: ', 'pass' if allclose(cpu, cpu_out / (2 * BENG_CHANNELS_)) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
def scenario_contiguous_channels_oversampled64(batch, tic, toc):
    '''
    Scenario: batched IFFT of 2*2**14*64 channels
    '''
    fft_window_oversample = 64 * 2 * 2**14
    n = array([fft_window_oversample], int32)
    # create batched FFT plan configuration
    inembed = array([fft_window_oversample / 2 + 1], int32)
    onembed = array([fft_window_oversample], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, fft_window_oversample / 2 + 1,
                               onembed.ctypes.data, 1, fft_window_oversample,
                               cufft.CUFFT_C2R, batch)
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch * (fft_window_oversample / 2 + 1))  # complex64
    gpu_out = cuda.mem_alloc(4 * batch * fft_window_oversample)           # float32
    data_shape = (batch, fft_window_oversample / 2 + 1)
    cpu_in = standard_normal(data_shape) + 1j * standard_normal(data_shape)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty((batch, fft_window_oversample), float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    # execute on CPU
    cpu = irfft(cpu_in, axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nOversampling by x64 Scenario with batches:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results: ', 'pass' if allclose(cpu, cpu_out / fft_window_oversample) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
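# --- Module-level setup assumed by the scenario functions above (a sketch) ---
# The channel counts follow the 16385-channel snapshots quoted in the
# docstrings; the SWARM and R2DBE sample rates are assumptions inferred from
# the 4096/2496 ratio that appears in later comments (only the ratio matters
# for these routines). The imports cover the names used as globals throughout
# this section.
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import scikits.cuda.cufft as cufft
import numpy as np
from math import ceil, floor
from numpy import (array, int32, float32, float64, complex64, empty, zeros,
                   hstack, arange, unique, median, allclose)
from numpy.fft import rfft, irfft
from numpy.random import standard_normal, seed
from scipy.interpolate import interp1d

BENG_CHANNELS_ = 2**14              # 16384 channels (power-of-two part)
BENG_CHANNELS = BENG_CHANNELS_ + 1  # 16385 channels including DC and Nyquist
SWARM_RATE = 2496e6                 # assumed value; see note above
R2DBE_RATE = 4096e6                 # assumed value; see note above

if __name__ == '__main__':
    tic = cuda.Event()
    toc = cuda.Event()
    scenario_contiguous_channels(39, tic, toc)
    scenario_contiguous_channels_wpadding(39, tic, toc)
    # smaller batch here: each oversampled window is 64x larger
    scenario_contiguous_channels_oversampled64(8, tic, toc)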
def cleanup(self):
    # destroy plans
    cufft.cufftDestroy(self.__plan_A)
    cufft.cufftDestroy(self.__plan_B)
    cufft.cufftDestroy(self.__plan_C)
    #
    self.__gpu_quantized_0.free()
    self.__gpu_quantized_1.free()
    self.__gpu_time_series_0.free()
    self.__gpu_time_series_1.free()
def scenario_contiguous_batched39_resample(num_snapshots, tic, toc):
    '''
    # Scenario: Fourier resample of num_snapshots
    # A iFFT:                [num_snapshots, 16385] complex64 -->
    # B FFT:                 [39, num_snapshots/39 * 32768] float32 -->
    # C iFFT + zero-padding: [39, num_snapshots/39 * 32768*4096/2496/2 + 1] complex64 -->
    #                        [39, num_snapshots * 32768 * 4096 / 2496] float32
    #
    # C(B(A(gpu_1))) = C(B(gpu_2)) = C(gpu_1) = gpu_2
    # num_snapshots is a multiple of 39.
    # A executed using batch = num_snapshots
    # B & C executed using batch = num_snapshots / 39
    '''
    print '\nContiguous channel Fourier resampling scenario in batches of 39:'
    assert num_snapshots % 39 == 0, 'error: num_snapshots must be integer multiple of 39'

    # construct arrays
    batch = num_snapshots / 39
    print 'batch: %d' % batch
    gpu_1 = cuda.mem_alloc(int(8 * batch *
                               (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1)))
    gpu_2 = cuda.mem_alloc(int(4 * batch *
                               (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE)))
    cpu_in = standard_normal(num_snapshots * BENG_CHANNELS) + \
        1j * standard_normal(num_snapshots * BENG_CHANNELS)
    cpu_in = cpu_in.astype(complex64)

    # create FFT plans
    n_A = array([2 * BENG_CHANNELS_], int32)
    inembed_A = array([BENG_CHANNELS], int32)
    onembed_A = array([2 * BENG_CHANNELS_], int32)
    plan_A = cufft.cufftPlanMany(1, n_A.ctypes.data,
                                 inembed_A.ctypes.data, 1, BENG_CHANNELS,
                                 onembed_A.ctypes.data, 1, 2 * BENG_CHANNELS_,
                                 cufft.CUFFT_C2R, num_snapshots)

    n_B = array([39 * 2 * BENG_CHANNELS_], int32)
    inembed_B = array([39 * 2 * BENG_CHANNELS_], int32)
    onembed_B = array([int(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1)], int32)
    plan_B = cufft.cufftPlanMany(1, n_B.ctypes.data,
                                 inembed_B.ctypes.data, 1, 39 * 2 * BENG_CHANNELS_,
                                 onembed_B.ctypes.data, 1,
                                 int32(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1),
                                 cufft.CUFFT_R2C, batch)

    n_C = array([39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE], int32)
    inembed_C = array([39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1], int32)
    onembed_C = array([39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE], int32)
    plan_C = cufft.cufftPlanMany(1, n_C.ctypes.data,
                                 inembed_C.ctypes.data, 1,
                                 int32(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1),
                                 onembed_C.ctypes.data, 1,
                                 int32(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE),
                                 cufft.CUFFT_C2R, batch)

    # zero out gpu_1
    kernel_module = SourceModule(kernel_source)
    zero_out = kernel_module.get_function('zero_out')
    # sanity check:
    zero_out(gpu_1,
             int32(batch * (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1)),
             block=(1024, 1, 1),
             grid=(int(ceil(batch * (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1) / 1024.)), 1))
    cpu_out = empty((batch * (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1)), complex64)
    cuda.memcpy_dtoh(cpu_out, gpu_1)
    assert len(unique(cpu_out)) == 1, 'problem with zero_out'

    # move data to device
    cuda.memcpy_htod(gpu_1, cpu_in)

    tic.record()
    # Turn SWARM snapshots into timeseries
    cufft.cufftExecC2R(plan_A, int(gpu_1), int(gpu_2))
    # zero out gpu_1
    zero_out(gpu_1,
             int32(batch * (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1)),
             block=(1024, 1, 1),
             grid=(int(ceil(batch * (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1) / 1024.)), 1))
    # Turn concatenated SWARM time series into single spectrum (already zero-padded)
    cufft.cufftExecR2C(plan_B, int(gpu_2), int(gpu_1))
    # Turn padded SWARM spectrum into time series with R2DBE sampling rate
    cufft.cufftExecC2R(plan_C, int(gpu_1), int(gpu_2))
    toc.record()
    toc.synchronize()

    # check on CPU
    cpu_A = irfft(cpu_in.reshape(num_snapshots, BENG_CHANNELS), axis=-1).astype(float32)
    cpu_B = rfft(cpu_A.reshape(batch, 39 * 2 * BENG_CHANNELS_), axis=-1).astype(complex64)
    cpu_C = irfft(hstack([cpu_B,
                          zeros((batch,
                                 (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE / 2 + 1) -
                                 (39 * 2 * BENG_CHANNELS_ / 2 + 1)), complex64)]),
                  axis=-1)
    cpu_out = empty(num_snapshots * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_2)

    print 'test results: ', 'pass' if allclose(cpu_C.flatten(),
                                               cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)) else 'fail'
    print 'max residual: ', max(abs(cpu_C.flatten() - cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)))
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), ' x real (both SB)'

    # destroy plans
    cufft.cufftDestroy(plan_A)
    cufft.cufftDestroy(plan_B)
    cufft.cufftDestroy(plan_C)
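# kernel_source is not reproduced in this section. The 'zero_out' kernel it
# provides is launched above with a complex64 buffer pointer and an element
# count; a minimal stand-in consistent with that call signature (an
# assumption, not the original source) would be:
from pycuda.compiler import SourceModule

kernel_source_sketch = """
__global__ void zero_out(float2 *a, int n)
{
    // one thread per complex64 element
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        a[i].x = 0.0f;
        a[i].y = 0.0f;
    }
}
"""
zero_out_sketch = SourceModule(kernel_source_sketch).get_function('zero_out')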
def fft_interp(gpu_1, gpu_2, num_snapshots, interp_kind='nearest', cpu_check=True):
    '''
    Batched FFT to time series and then interpolation to resample.
    No filter applied yet...
    '''
    tic = cuda.Event()
    toc = cuda.Event()

    batch_size = num_snapshots
    print 'batch size: %d' % batch_size

    # create batched FFT plan configuration
    n = array([2 * BENG_CHANNELS_], int32)
    inembed = array([BENG_CHANNELS], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, BENG_CHANNELS,
                               onembed.ctypes.data, 1, 2 * BENG_CHANNELS_,
                               cufft.CUFFT_C2R, batch_size)

    # fetch kernel that resamples
    kernel_module = SourceModule(kernel_source)
    interp_1d = kernel_module.get_function(interp_kind)

    # execute plan
    cufft.cufftExecC2R(plan, int(gpu_1), int(gpu_2))

    # interpolate
    tic.record()
    xs_size = int(floor(batch_size * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE)) - 1
    TPB = 512                            # threads per block
    nB = int(ceil(1. * xs_size / TPB))   # number of blocks
    # the 'nearest' and 'linear' kernels share the same argument list
    interp_1d(gpu_2, gpu_1, int32(xs_size),
              float64(SWARM_RATE / R2DBE_RATE), float32(1. / (2 * BENG_CHANNELS_)),
              block=(TPB, 1, 1), grid=(nB, 1))
    toc.record()
    toc.synchronize()

    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), ' x real (both SB)'

    # destroy plan
    cufft.cufftDestroy(plan)

    # check on CPU (cpu_in, the input spectra, is expected at module scope)
    if cpu_check:
        # timestep sizes for SWARM and R2DBE rates
        dt_s = 1.0 / SWARM_RATE
        dt_r = 1.0 / R2DBE_RATE
        # the timespan of one SWARM FFT window
        T_s = dt_s * 2 * BENG_CHANNELS_
        # the timespan of all SWARM data
        T_s_all = T_s * batch_size
        # get time-domain signal
        xs_swarm_rate = irfft(cpu_in, n=2 * BENG_CHANNELS_, axis=1).flatten()
        # and calculate sample points
        t_swarm_rate = arange(0, T_s_all, dt_s)
        print t_swarm_rate[0], t_swarm_rate[-1]
        # calculate resample points (subtract one dt_s from end to avoid extrapolation)
        t_r2dbe_rate = arange(0, T_s_all - dt_s, dt_r)
        # and interpolate
        x_interp = interp1d(t_swarm_rate, xs_swarm_rate, kind=interp_kind)
        cpu_A = x_interp(t_r2dbe_rate)
        cpu_out = np.empty_like(cpu_A, dtype=float32)
        cuda.memcpy_dtoh(cpu_out, gpu_1)
        print 'median residual: ', median(abs(cpu_A - cpu_out))
        if interp_kind == 'nearest':
            cpu_A[::32] = 0
            cpu_out[::32] = 0
        print 'test results: ', 'pass' if allclose(cpu_A, cpu_out) else 'fail'
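# fft_interp fetches its resampling kernel from kernel_source by name
# ('nearest' or 'linear'), which is likewise not shown in this section. The
# sketch below matches the argument list used above (input time series, output
# buffer, number of output samples, input samples per output sample, amplitude
# scale); it illustrates the interpolation step and is not the original kernel
# code.
interp_kernels_sketch = """
__global__ void nearest(const float *in, float *out, int n, double ratio, float scale)
{
    // out[i] = scale * in[round(i * ratio)]
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = scale * in[__double2int_rn(i * ratio)];
}

__global__ void linear(const float *in, float *out, int n, double ratio, float scale)
{
    // linear interpolation between the two bracketing input samples
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        double x = i * ratio;
        int i0 = (int)x;
        float w = (float)(x - i0);
        out[i] = scale * ((1.0f - w) * in[i0] + w * in[i0 + 1]);
    }
}
"""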
toc.synchronize()
time_gpu = tic.time_till(toc)

print 'DEBUG::loading resampled time series cpu_r2dbe'
gpumeminfo(cuda)
cpu_r2dbe = np.empty(num_r2dbe_samples / 2, dtype=float32)
cuda.memcpy_dtoh(cpu_r2dbe, gpu_r2dbe)

print ''
print 'time resampled:', 13.128e-3 * BENG_SNAPSHOTS * (BENG_BUFFER_IN_COUNTS - 1), ' ms'
print 'Transfer size was %d bytes' % cpu_vdif_buf.nbytes
print 'GPU time:', time_gpu, ' ms'

# destroy plans
cufft.cufftDestroy(plan_A)
cufft.cufftDestroy(plan_interp_A)
cufft.cufftDestroy(plan_interp_B)

# free memory
#gpu_r2dbe.free()

if DEBUG:
    import matplotlib.pyplot as plt

    # Now read R2DBE data covering roughly the same time window as the SWARM
    # data. Start at an offset of zero (i.e. from the first VDIF packet) to
    # keep things simple.
    N_r_vdif_frames = int(np.ceil(read_sdbe_vdif.SWARM_TRANSPOSE_SIZE *
                                  (BENG_BUFFER_IN_COUNTS - 1) *
                                  read_sdbe_vdif.R2DBE_RATE /
                                  read_sdbe_vdif.SWARM_RATE))
    vdif_frames_offset = 0
    rel_path_to_in = '/home/shared/sdbe_preprocessed/'
    d = sdbe_preprocess.get_diagnostics_from_file(scan_filename_base,
                                                  rel_path=rel_path_to_in)
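# gpumeminfo(cuda) is called above but not defined in this section; a
# plausible (hypothetical) helper that reports device memory usage:
def gpumeminfo(driver):
    free_bytes, total_bytes = driver.mem_get_info()
    print 'Memory usage:', 1. - 1. * free_bytes / total_bytes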
def fft_batched(gpu_1, gpu_2, num_snapshots, snapshots_per_batch=39, cpu_check=True):
    '''
    gpu_1: pointer to an M x 16385 array on the GPU device where the zeroth
           dimension is increasing snapshot index and the first dimension is
           the positive-frequency half of the spectrum. This array will be
           destroyed. Must have byte size:
           int(8*batch_size*(snapshots_per_batch*BENG_CHANNELS_*R2DBE_RATE/SWARM_RATE+1))
    gpu_2: pointer to result
    snapshots_per_batch: number of snapshots grouped for resampling (% 39 == 0)
    '''
    tic = cuda.Event()
    toc = cuda.Event()

    batch_size = num_snapshots / snapshots_per_batch
    print 'batch size: %d' % batch_size

    # create FFT plans
    n_A = array([2 * BENG_CHANNELS_], int32)
    inembed_A = array([BENG_CHANNELS], int32)
    onembed_A = array([2 * BENG_CHANNELS_], int32)
    plan_A = cufft.cufftPlanMany(1, n_A.ctypes.data,
                                 inembed_A.ctypes.data, 1, BENG_CHANNELS,
                                 onembed_A.ctypes.data, 1, 2 * BENG_CHANNELS_,
                                 cufft.CUFFT_C2R, num_snapshots)

    n_B = array([snapshots_per_batch * 2 * BENG_CHANNELS_], int32)
    inembed_B = array([snapshots_per_batch * 2 * BENG_CHANNELS_], int32)
    onembed_B = array([int(snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1)], int32)
    plan_B = cufft.cufftPlanMany(1, n_B.ctypes.data,
                                 inembed_B.ctypes.data, 1, snapshots_per_batch * 2 * BENG_CHANNELS_,
                                 onembed_B.ctypes.data, 1,
                                 int32(snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1),
                                 cufft.CUFFT_R2C, batch_size)

    n_C = array([snapshots_per_batch * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE], int32)
    inembed_C = array([snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1], int32)
    onembed_C = array([snapshots_per_batch * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE], int32)
    plan_C = cufft.cufftPlanMany(1, n_C.ctypes.data,
                                 inembed_C.ctypes.data, 1,
                                 int32(snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1),
                                 onembed_C.ctypes.data, 1,
                                 int32(snapshots_per_batch * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE),
                                 cufft.CUFFT_C2R, batch_size)

    # fetch kernel that zeroes out an array
    kernel_module = SourceModule(kernel_source)
    zero_out = kernel_module.get_function('zero_out')

    tic.record()
    # Turn SWARM snapshots into timeseries
    cufft.cufftExecC2R(plan_A, int(gpu_1), int(gpu_2))
    # zero out gpu_1
    zero_out(gpu_1,
             int32(batch_size * (snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1)),
             block=(1024, 1, 1),
             grid=(int(ceil(batch_size * (snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1) / 1024.)), 1))
    # Turn concatenated SWARM time series into single spectrum (already zero-padded)
    cufft.cufftExecR2C(plan_B, int(gpu_2), int(gpu_1))
    # Turn padded SWARM spectrum into time series with R2DBE sampling rate
    cufft.cufftExecC2R(plan_C, int(gpu_1), int(gpu_2))
    toc.record()
    toc.synchronize()

    # check on CPU (cpu_in, the input spectra, is expected at module scope)
    if cpu_check:
        cpu_A = irfft(cpu_in.reshape(num_snapshots, BENG_CHANNELS), axis=-1).astype(float32)
        cpu_B = rfft(cpu_A.reshape(batch_size, snapshots_per_batch * 2 * BENG_CHANNELS_),
                     axis=-1).astype(complex64)
        cpu_C = irfft(hstack([cpu_B,
                              zeros((batch_size,
                                     (snapshots_per_batch * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1) -
                                     (snapshots_per_batch * BENG_CHANNELS_ + 1)), complex64)]),
                      axis=-1)
        cpu_out = empty(num_snapshots * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE, float32)
        cuda.memcpy_dtoh(cpu_out, gpu_2)
        print 'test results: ', 'pass' if allclose(cpu_C.flatten(),
                                                   cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)) else 'fail'
        print 'max residual: ', max(abs(cpu_C.flatten() - cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)))

    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), ' x real (both SB)'

    # destroy plans
    cufft.cufftDestroy(plan_A)
    cufft.cufftDestroy(plan_B)
    cufft.cufftDestroy(plan_C)
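# A minimal, hypothetical driver for fft_batched. It assumes the module-level
# setup sketched earlier (imports, BENG_CHANNELS_, R2DBE_RATE, SWARM_RATE) and
# a kernel_source string providing the zero_out kernel; the buffer sizes follow
# the byte-size requirement stated in the docstring above.
num_snapshots = 4 * 39
batch_size = num_snapshots / 39

# gpu_1 starts as the input spectra and is later overwritten by the padded
# R2DBE-rate spectrum, so allocate it for the larger of the two
gpu_1 = cuda.mem_alloc(int(8 * batch_size *
                           (39 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1)))
# gpu_2 receives the resampled real-valued time series
gpu_2 = cuda.mem_alloc(int(4 * batch_size *
                           (39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE)))

# random positive-frequency spectra; fft_batched's CPU check reads this
# module-level cpu_in
cpu_in = (standard_normal((num_snapshots, BENG_CHANNELS)) +
          1j * standard_normal((num_snapshots, BENG_CHANNELS))).astype(complex64)
cuda.memcpy_htod(gpu_1, cpu_in)

fft_batched(gpu_1, gpu_2, num_snapshots, snapshots_per_batch=39, cpu_check=True)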
def compute_dirich_space_fft(self, d_Ds, Mx, My, Px, Py, Sx, Sy, Wx, Wy):
    import scikits.cuda.cufft as cufft

    dx = Sx / Px
    dy = Sy / Py
    Px1 = int(np.round(self.Sx / dx))
    Py1 = int(np.round(self.Sy / dy))

    if self.dtype == np.complex128:
        typef = np.dtype(np.float64)
    else:
        typef = np.dtype(np.float32)

    d_dirich = parray.empty((self.num_neurons, Py, Px), typef)

    freemem, totalmem = cuda.mem_get_info()
    ONE_TIME_FILTER = int(min(freemem / (Px1 * Py1 * d_Ds.dtype.itemsize) / 4,
                              self.num_neurons))

    n = np.asarray((Py1, Px1), np.int32)
    if self.dtype == np.complex128:
        plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0, None, 1, 0,
                                   cufft.CUFFT_Z2Z, ONE_TIME_FILTER)
        fftfunc = cufft.cufftExecZ2Z
    else:
        plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0, None, 1, 0,
                                   cufft.CUFFT_C2C, ONE_TIME_FILTER)
        fftfunc = cufft.cufftExecC2C

    Ds2fftfun = get_Ds2fft_kernel(self.dtype)
    d_filter_complex = parray.empty((ONE_TIME_FILTER, Px1 * Py1), self.dtype)
    filter2recfun = get_filter2rec_kernel(self.dtype)

    for i in range(0, self.num_neurons, ONE_TIME_FILTER):
        N_filters = min(ONE_TIME_FILTER, self.num_neurons - i)
        d_filter_complex.fill(0)

        launch_kernel(Ds2fftfun, (256, 1, 1), (Mx*2+1, My*2+1),
                      [[d_Ds, i * d_Ds.ld], d_Ds.ld,
                       d_filter_complex, d_filter_complex.ld,
                       Mx, My, Px1, Py1, N_filters])

        if N_filters < ONE_TIME_FILTER:
            cufft.cufftDestroy(plan)
            if self.dtype == np.complex128:
                plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0,
                                           None, 1, 0, cufft.CUFFT_Z2Z, N_filters)
            else:
                plan = cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0,
                                           None, 1, 0, cufft.CUFFT_C2C, N_filters)

        # be careful with the side-by-side constraint
        fftfunc(plan, int(d_filter_complex.gpudata), int(d_filter_complex.gpudata),
                cufft.CUFFT_INVERSE)

        BLOCK_SIZE = 16
        launch_kernel(filter2recfun, (BLOCK_SIZE, BLOCK_SIZE, 1),
                      (((Px - 1) / BLOCK_SIZE + 1) * ((Py - 1) / BLOCK_SIZE + 1), N_filters),
                      [[d_dirich, i * d_dirich.ld], d_dirich.ld,
                       d_filter_complex, d_filter_complex.ld,
                       N_filters, Px, Py, Px1, Py1])

    cufft.cufftDestroy(plan)
    return d_dirich