def ALL_IN_MIMO_SpeedTest(x, l_b, mu, ovsmpl, R2=2):
    """Run the block-wise MIMO filter update loop and time every stage.

    Host buffers are allocated, the frequency-domain taps are initialised
    from a single centre tap on the diagonal, three batched FFT plans are
    created on one CUDA stream and all buffers are moved to the device.
    The loop then walks over blocks ``1 .. NBlocks - 1``; each kernel launch
    and FFT call is wrapped in ``timer()`` and the minimum duration seen over
    all blocks is kept per stage.

    Parameters
    ----------
    x : 2-D array
        Input samples indexed as ``x[channel, sample]``. Its dtype is used
        for every buffer and FFT plan.
    l_b : int
        Block length. FFTs are planned with length ``2 * l_b`` and most
        kernels are launched with ``2 * l_b`` threads per block.
    mu : scalar
        Forwarded unchanged to the ``UpdateH`` kernel
        (presumably the adaptation step size -- confirm against the kernel).
    ovsmpl : int
        Oversampling factor; the symbol count is
        ``int(x.shape[1] / ovsmpl)``.
    R2 : scalar, optional
        Value that fills the length-``l_b`` vector handed to ``CalcE_SUM``.

    Returns
    -------
    numpy.ndarray
        Nine best-case durations, in this order: ``t_comp``,
        ``t_fft_inv_1``, ``t_Calc_E``, ``t_FFT_EE``, ``t_compwXI``,
        ``t_IFFT_ES``, ``t_OVLP``, ``t_FFT_SS``, ``t_UpdateH``.
        ``t_Reduce`` is measured but is not part of the returned array;
        the layout is kept as-is so existing callers keep working.

    Notes
    -----
    NOTE(review): no explicit synchronisation is issued around the timed
    calls, so the figures are wall-clock durations of the host-side calls.
    Confirm that this is the quantity the benchmark is meant to report.
    """
    # Preparations
    C_Type = x.dtype
    NMux = x.shape[0]
    NSyms = int(x.shape[1] / ovsmpl)
    NBlocks = int(NSyms / l_b - 1)
    fft_len = l_b * 2
    stream = cuda.stream()
    x_o = np.zeros((NMux, NSyms), dtype=C_Type)

    # Setup filter variables
    I = np.ones(l_b, dtype=C_Type) * R2
    h = np.zeros((NMux, NMux, fft_len), dtype=C_Type)
    H = np.zeros((NMux, NMux, ovsmpl, fft_len), dtype=C_Type)
    # Builtin int: the np.int alias was removed in NumPy 1.24.
    CTap = int(l_b / 2)
    for OUT in range(NMux):
        h[OUT, OUT, CTap] = 1 / ovsmpl + 0j
    for IN in range(NMux):
        for OUT in range(NMux):
            # The spectrum does not depend on the oversampling index, so it
            # is computed once and broadcast over that axis.
            H[IN, OUT, :, :] = np.fft.fft(h[IN, OUT, :])
    H = H.reshape(-1)

    # Intermediate values
    X_o = np.zeros(NMux * NMux * fft_len, dtype=C_Type)
    E = np.zeros(fft_len * NMux * NMux * ovsmpl, dtype=C_Type)
    s = np.zeros(fft_len * NMux * NMux * ovsmpl, dtype=C_Type)

    # FFT planning (all plans share the stream created above)
    DBlock_IN_S_FFT = GFFT.FFTPlan(
        (fft_len,), C_Type, C_Type, NMux * ovsmpl * NBlocks, stream=stream)
    DBlock_IN_OUT_FFT = GFFT.FFTPlan(
        (fft_len,), C_Type, C_Type, NMux * NMux, stream=stream)
    DBlock_IN_OUT_S_FFT = GFFT.FFTPlan(
        (fft_len,), C_Type, C_Type, NMux * NMux * ovsmpl, stream=stream)

    # Transmit to GPU
    I = cuda.to_device(I, stream=stream)
    s = cuda.to_device(s, stream=stream)
    H = cuda.to_device(H, stream=stream)
    X_o = cuda.to_device(X_o, stream=stream)
    E = cuda.to_device(E, stream=stream)
    x_o = cuda.to_device(x_o, stream=stream)
    x = cuda.to_device(x, stream=stream)

    # Start: gather every block of the input once and transform all of them
    # with a single batched FFT before entering the loop.
    ovsmpl_OvlapSamps = NBlocks * fft_len
    mux_OvlapSamps = ovsmpl_OvlapSamps * ovsmpl
    x_i = cuda.to_device(np.zeros(NMux * mux_OvlapSamps, dtype=C_Type))
    GetAllBlocks[(NBlocks, NMux, ovsmpl), fft_len](
        x, x_i, l_b, NSyms, ovsmpl, ovsmpl_OvlapSamps, mux_OvlapSamps)
    DBlock_IN_S_FFT.forward(x_i, x_i)

    # Best-time accumulators. 90 is the initial upper bound, so a stage that
    # never runs (fewer than two blocks) reports 90.
    t_comp = 90
    t_fft_inv_1 = 90
    t_Reduce = 90
    t_Calc_E = 90
    t_FFT_EE = 90
    t_compwXI = 90
    t_IFFT_ES = 90
    t_OVLP = 90
    t_FFT_SS = 90
    t_UpdateH = 90

    for BLOCK in range(1, NBlocks):
        # Compensation
        start = timer()
        Compensate_ALLIN[(NMux, NMux), fft_len](
            x_i, X_o, H, l_b, ovsmpl, ovsmpl_OvlapSamps, mux_OvlapSamps,
            BLOCK, NMux)
        t_comp = min(t_comp, timer() - start)

        start = timer()
        DBlock_IN_OUT_FFT.inverse(X_o, X_o)
        t_fft_inv_1 = min(t_fft_inv_1, timer() - start)

        start = timer()
        Reduce[(NMux, NMux), fft_len](X_o, x_o, NMux, l_b, BLOCK)
        t_Reduce = min(t_Reduce, timer() - start)

        # Tap updates
        start = timer()
        CalcE_SUM[(NMux, NMux, ovsmpl), fft_len](
            x_o, X_o, E, NMux, l_b, BLOCK, I, ovsmpl)
        t_Calc_E = min(t_Calc_E, timer() - start)

        start = timer()
        DBlock_IN_OUT_S_FFT.forward(E, E)
        t_FFT_EE = min(t_FFT_EE, timer() - start)

        start = timer()
        CompensateWithInput_ALL_IN[(NMux, NMux, ovsmpl), fft_len](
            E, x_i, BLOCK, l_b, NMux, ovsmpl, ovsmpl_OvlapSamps,
            mux_OvlapSamps)
        t_compwXI = min(t_compwXI, timer() - start)

        start = timer()
        DBlock_IN_OUT_S_FFT.inverse(E, s)
        t_IFFT_ES = min(t_IFFT_ES, timer() - start)

        start = timer()
        # This kernel is launched with l_b threads, not 2 * l_b.
        OverlapS[(NMux, NMux, ovsmpl), l_b](s, l_b, NMux, ovsmpl)
        t_OVLP = min(t_OVLP, timer() - start)

        start = timer()
        DBlock_IN_OUT_S_FFT.forward(s, s)
        t_FFT_SS = min(t_FFT_SS, timer() - start)

        start = timer()
        UpdateH[(NMux, NMux, ovsmpl), fft_len](H, s, mu, l_b, NMux, ovsmpl)
        t_UpdateH = min(t_UpdateH, timer() - start)

    # t_Reduce is intentionally left out to preserve the historical
    # nine-element return layout.
    return np.array([t_comp, t_fft_inv_1, t_Calc_E, t_FFT_EE, t_compwXI,
                     t_IFFT_ES, t_OVLP, t_FFT_SS, t_UpdateH])
def __init__(self, Nr, Nz, use_cuda=False, nthreads=None):
    """
    Initialize an FFT object

    Depending on what is available, the transform along the z axis is set
    up with cuFFT (GPU), with MKL, or with FFTW (CPU).

    Parameters
    ----------
    Nr: int
        Number of grid points along the r axis (axis -1)

    Nz: int
        Number of grid points along the z axis (axis 0)

    use_cuda: bool, optional
        Whether to perform the Fourier transform on the GPU.
        If cuda is requested but not installed, a notice is printed and
        the transform is set up on the CPU instead.

    nthreads : int, optional
        Number of threads for the FFTW transform.
        If None, the default number of threads of numba is used
        (environment variable NUMBA_NUM_THREADS)
    """
    # Check whether to use cuda. Truthiness is tested (rather than identity
    # with True) so that this guard agrees with the branch below for any
    # truthy value of use_cuda.
    self.use_cuda = use_cuda
    if self.use_cuda and not cuda_installed:
        self.use_cuda = False
        print('** Cuda not available for Fourier transform.')
        print('** Performing the Fourier transform on the CPU.')

    # Check whether to use MKL
    self.use_mkl = mkl_installed

    if self.use_cuda:
        # Initialize the object for calculation on the GPU
        # Initialize the dimension of the grid and blocks
        self.dim_grid, self.dim_block = cuda_tpb_bpg_2d(Nz, Nr)

        # Initialize 1d buffer for cufft
        self.buffer1d_in = cuda.device_array(
            (Nz * Nr, ), dtype=np.complex128)
        self.buffer1d_out = cuda.device_array(
            (Nz * Nr, ), dtype=np.complex128)

        # Initialize the cuda libraries object:
        # a batch of Nr one-dimensional transforms of length Nz
        self.fft = cufft.FFTPlan(shape=(Nz, ), itype=np.complex128,
                                 otype=np.complex128, batch=Nr)
        self.blas = cublas.Blas()

        # For normalization of the iFFT
        self.inv_Nz = 1. / Nz

    elif self.use_mkl:
        # CPU, MKL FFT: initialize the MKL plan with a dummy array
        spect_buffer = np.zeros((Nz, Nr), dtype=np.complex128)
        self.mklfft = MKLFFT(spect_buffer)

    else:
        # CPU, FFTW
        # Determine number of threads
        if nthreads is None:
            # Get the default number of threads for numba
            nthreads = numba.config.NUMBA_NUM_THREADS

        # Initialize the FFT plans with dummy arrays
        interp_buffer = np.zeros((Nz, Nr), dtype=np.complex128)
        spect_buffer = np.zeros((Nz, Nr), dtype=np.complex128)
        self.fft = pyfftw.FFTW(interp_buffer, spect_buffer, axes=(0, ),
                               direction='FFTW_FORWARD', threads=nthreads)
        self.ifft = pyfftw.FFTW(spect_buffer, interp_buffer, axes=(0, ),
                                direction='FFTW_BACKWARD', threads=nthreads)
def ALL_IN_MIMO(x, l_b, mu, ovsmpl, R2=2):
    """Run the block-wise MIMO filter update loop on the GPU.

    Host buffers are allocated, the frequency-domain taps are initialised
    from a single centre tap on the diagonal, three batched FFT plans are
    created and all buffers are moved to the device. Every block of the
    input is gathered and transformed once up front; the loop then walks
    over blocks ``1 .. NBlocks - 1``, applying the current taps and updating
    them.

    Parameters
    ----------
    x : 2-D array
        Input samples indexed as ``x[channel, sample]``. Its dtype is used
        for every buffer and FFT plan.
    l_b : int
        Block length. FFTs are planned with length ``2 * l_b`` and most
        kernels are launched with ``2 * l_b`` threads per block.
    mu : scalar
        Forwarded unchanged to the ``UpdateH`` kernel
        (presumably the adaptation step size -- confirm against the kernel).
    ovsmpl : int
        Oversampling factor; the symbol count is
        ``int(x.shape[1] / ovsmpl)``.
    R2 : scalar, optional
        Value that fills the length-``l_b`` vector handed to ``CalcE_SUM``.

    Returns
    -------
    numpy.ndarray
        The ``(channels, symbols)`` output buffer copied back from the
        device.

    Notes
    -----
    NOTE(review): ``ALL_IN_MIMO_SpeedTest`` launches a ``Reduce`` kernel
    between the inverse FFT and ``CalcE_SUM``; this function does not. The
    output buffer is only ever handed to ``CalcE_SUM`` here -- confirm that
    kernel is what fills it.
    """
    # Preparations
    C_Type = x.dtype
    NMux = x.shape[0]
    NSyms = int(x.shape[1] / ovsmpl)
    NBlocks = int(NSyms / l_b - 1)
    fft_len = l_b * 2
    x_o = np.zeros((NMux, NSyms), dtype=C_Type)

    # Setup filter variables
    I = np.ones(l_b, dtype=C_Type) * R2
    h = np.zeros((NMux, NMux, fft_len), dtype=C_Type)
    H = np.zeros((NMux, NMux, ovsmpl, fft_len), dtype=C_Type)
    # Builtin int: the np.int alias was removed in NumPy 1.24.
    CTap = int(l_b / 2)
    for OUT in range(NMux):
        h[OUT, OUT, CTap] = 1 / ovsmpl + 0j
    for IN in range(NMux):
        for OUT in range(NMux):
            # The spectrum does not depend on the oversampling index, so it
            # is computed once and broadcast over that axis.
            H[IN, OUT, :, :] = np.fft.fft(h[IN, OUT, :])
    H = H.reshape(-1)

    # Intermediate values
    X_o = np.zeros(NMux * NMux * fft_len, dtype=C_Type)
    E = np.zeros(fft_len * NMux * NMux * ovsmpl, dtype=C_Type)
    s = np.zeros(fft_len * NMux * NMux * ovsmpl, dtype=C_Type)

    # FFT planning
    DBlock_IN_S_FFT = GFFT.FFTPlan(
        (fft_len,), C_Type, C_Type, NMux * ovsmpl * NBlocks)
    DBlock_IN_OUT_FFT = GFFT.FFTPlan(
        (fft_len,), C_Type, C_Type, NMux * NMux)
    DBlock_IN_OUT_S_FFT = GFFT.FFTPlan(
        (fft_len,), C_Type, C_Type, NMux * NMux * ovsmpl)

    # Transmit to GPU
    I = cuda.to_device(I)
    s = cuda.to_device(s)
    H = cuda.to_device(H)
    X_o = cuda.to_device(X_o)
    E = cuda.to_device(E)
    x_o = cuda.to_device(x_o)
    x = cuda.to_device(x)

    # Start
    ovsmpl_OvlapSamps = NBlocks * fft_len
    mux_OvlapSamps = ovsmpl_OvlapSamps * ovsmpl
    # Keep the gathered blocks on the device, as ALL_IN_MIMO_SpeedTest does,
    # so they are not transferred implicitly on every call that takes them.
    x_i = cuda.to_device(np.zeros(NMux * mux_OvlapSamps, dtype=C_Type))
    GetAllBlocks[(NBlocks, NMux, ovsmpl), fft_len](
        x, x_i, l_b, NSyms, ovsmpl, ovsmpl_OvlapSamps, mux_OvlapSamps)
    DBlock_IN_S_FFT.forward(x_i, x_i)

    for BLOCK in range(1, NBlocks):
        # Compensation
        Compensate_ALLIN[(NMux, NMux), fft_len](
            x_i, X_o, H, l_b, ovsmpl, ovsmpl_OvlapSamps, mux_OvlapSamps,
            BLOCK, NMux)
        DBlock_IN_OUT_FFT.inverse(X_o, X_o)

        # Tap updates
        CalcE_SUM[(NMux, NMux, ovsmpl), fft_len](
            x_o, X_o, E, NMux, l_b, BLOCK, I, ovsmpl)
        DBlock_IN_OUT_S_FFT.forward(E, E)
        CompensateWithInput_ALL_IN[(NMux, NMux, ovsmpl), fft_len](
            E, x_i, BLOCK, l_b, NMux, ovsmpl, ovsmpl_OvlapSamps,
            mux_OvlapSamps)
        DBlock_IN_OUT_S_FFT.inverse(E, s)
        # This kernel is launched with l_b threads, not 2 * l_b.
        OverlapS[(NMux, NMux, ovsmpl), l_b](s, l_b, NMux, ovsmpl)
        DBlock_IN_OUT_S_FFT.forward(s, s)
        UpdateH[(NMux, NMux, ovsmpl), fft_len](H, s, mu, l_b, NMux, ovsmpl)

    return x_o.copy_to_host()
import pyculib.fft as fft
import time
import numba

# Micro-benchmark: repeatedly run a forward pyculib FFT on a small device
# array and print the elapsed wall-clock time.
# `np` and `cuda` are used below but are not imported in this span.

# Benchmark parameters. sizediv, nmodes, ovsmpl and ovconj are not referenced
# by the statements in this span.
lb = 64  # FFT length
sizediv = 8
nmodes = 1
ovsmpl = 1
ovconj = 1
data = np.ones((lb), dtype=np.complex128)  # input: lb complex ones
nblocks = 10000  # number of FFT calls that are timed

# No trailing comma, so this is the plain integer lb rather than a 1-tuple;
# the reshape below is therefore to a 1-D array of length lb.
shp_pyc = (lb)
# Length-lb complex128 plan; the last positional argument is 1
# (presumably the batch count -- confirm against FFTPlan's signature).
plan = fft.FFTPlan((lb, ), np.complex128, np.complex128, 1)
arr_pyculib = cuda.to_device(data.reshape(shp_pyc))
arr_out_pyculib = cuda.device_array_like(arr_pyculib)

start = time.time()
for i_block in range(int(nblocks)):
    plan.forward(arr_pyculib, arr_out_pyculib)
# NOTE(review): the original indentation was lost; this print is assumed to
# sit after the loop, inside the timed region -- confirm.
print(arr_out_pyculib[0])
end = time.time()
print("Time pyculib = " + str(end - start))

# Device input/output buffers for a second FFT implementation; nothing in
# this span uses them yet.
arr_Martin = cuda.to_device(data)
arr_out_martin = cuda.device_array_like(arr_Martin)