def damp_EB_open_boundary(self, interp):
    """
    Damp the fields E and B in the damp cells, at the left and right
    ends of the *global* simulation box.

    Nothing is done when `self.n_damp` is 0. The left end is damped only
    when `self.left_proc` is None, and the right end only when
    `self.right_proc` is None. The field arrays are modified in place.

    Parameter:
    -----------
    interp: list of InterpolationGrid objects (one per azimuthal mode)
        Objects that contain the fields to be damped.
    """
    # No damp cells: leave the fields untouched
    if self.n_damp == 0:
        return

    # Number of cells (first array axis) covered by the damping profile
    nd = self.n_guard + self.n_damp + self.n_inject
    field_names = ('Er', 'Et', 'Ez', 'Br', 'Bt', 'Bz')

    # Left end of the box
    if self.left_proc is None:
        if interp[0].use_cuda:
            # GPU: one kernel launch per azimuthal mode
            dim_grid, dim_block = cuda_tpb_bpg_2d(nd, interp[0].Nr)
            for grid in interp:
                cuda_damp_EB_left[dim_grid, dim_block](
                    grid.Er, grid.Et, grid.Ez,
                    grid.Br, grid.Bt, grid.Bz,
                    self.d_left_damp, nd)
        else:
            # CPU: scale the first nd rows of every field component
            left_profile = self.left_damp[:, np.newaxis]
            for grid in interp:
                for name in field_names:
                    getattr(grid, name)[:nd, :] *= left_profile

    # Right end of the box
    if self.right_proc is None:
        if interp[0].use_cuda:
            # GPU: one kernel launch per azimuthal mode
            dim_grid, dim_block = cuda_tpb_bpg_2d(nd, interp[0].Nr)
            for grid in interp:
                cuda_damp_EB_right[dim_grid, dim_block](
                    grid.Er, grid.Et, grid.Ez,
                    grid.Br, grid.Bt, grid.Bz,
                    self.d_right_damp, nd)
        else:
            # CPU: scale the last nd rows; the profile is applied
            # in reversed order with respect to `self.right_damp`
            right_profile = self.right_damp[::-1, np.newaxis]
            for grid in interp:
                for name in field_names:
                    getattr(grid, name)[-nd:, :] *= right_profile
def damp_pml_EB(self, interp):
    """
    Damp the fields E and B in the PML cells.

    On the CPU, the last `self.n_pml` cells along the second array axis
    are modified in place: the theta PML components (`Et_pml`, `Bt_pml`)
    are removed from the regular theta fields, damped by
    `self.damp_array`, and added back; `Ez` and `Bz` are damped directly.

    Parameters
    ----------
    interp: list of InterpolationGrid objects (one per azimuthal mode)
        Objects that contain the fields to be damped.
    """
    if interp[0].use_cuda:
        # GPU: one kernel launch per azimuthal mode
        dim_grid, dim_block = cuda_tpb_bpg_2d(interp[0].Nz, self.n_pml)
        for grid in interp:
            cuda_damp_pml_EB[dim_grid, dim_block](
                grid.Et, grid.Et_pml, grid.Ez,
                grid.Bt, grid.Bt_pml, grid.Bz,
                self.d_damp_array, self.n_pml)
        return

    # CPU: index selecting the PML cells (equivalent to [:, -n_pml:])
    n_pml = self.n_pml
    pml_cells = (slice(None), slice(-n_pml, None))
    for grid in interp:
        damping = self.damp_array[np.newaxis, :]
        # Theta components: only the split PML part is damped
        for regular, split in ((grid.Et, grid.Et_pml),
                               (grid.Bt, grid.Bt_pml)):
            # Subtract the PML part from the regular field
            regular[pml_cells] -= split[pml_cells]
            # Damp the PML part
            split[pml_cells] *= damping
            # Add the damped PML part back to the regular field
            regular[pml_cells] += split[pml_cells]
        # z components: damped as a whole
        for name in ('Bz', 'Ez'):
            getattr(grid, name)[pml_cells] *= damping
def shift_spect_grid(self, grid, n_move,
                     shift_rho=True, shift_currents=True):
    """
    Shift the spectral fields by n_move cells (with respect to the
    spatial grid).

    The shift is performed on the GPU when `grid.use_cuda` is true, and
    on the CPU otherwise.
    (Typically n_move is positive, and the fields are shifted backwards)

    Parameters
    ----------
    grid: an SpectralGrid corresponding to one given azimuthal mode
        Contains the values of the fields in spectral space,
        and is modified by this function.

    n_move: int
        The number of cells by which the grid should be shifted

    shift_rho: bool, optional
        Whether to also shift the charge density (`grid.rho_prev`)
        Default: True

    shift_currents: bool, optional
        Whether to also shift the currents (`grid.Jp`, `grid.Jm`,
        `grid.Jz`)
        Default: True
        NOTE(review): earlier documentation stated a default of False
        (currents being recomputed at each PIC cycle); the signature
        default is True - confirm which one callers rely on.
    """
    # Names of the spectral arrays to shift, in the order of application
    names = ['Ep', 'Em', 'Ez', 'Bp', 'Bm', 'Bz']
    if shift_rho:
        names.append('rho_prev')
    if shift_currents:
        names.extend(['Jp', 'Jm', 'Jz'])

    if grid.use_cuda:
        shift = grid.d_field_shift
        # Launch configuration matching the shape of the spectral arrays
        dim_grid, dim_block = cuda_tpb_bpg_2d(grid.Ep.shape[0],
                                              grid.Ep.shape[1])
        for name in names:
            shift_spect_array_gpu[dim_grid, dim_block](
                getattr(grid, name), shift, n_move)
    else:
        shift = grid.field_shift
        for name in names:
            shift_spect_array_cpu(getattr(grid, name), shift, n_move)
def erase(self, fieldtype):
    """
    Set the field `fieldtype` to zero on the interpolation grid,
    for the first `self.Nm` azimuthal modes.

    Parameter
    ---------
    fieldtype : string
        A string which represents the kind of field to be erased
        (either 'E', 'B', 'J', 'rho')

    Raises
    ------
    ValueError
        If `fieldtype` is not one of the strings above.
    """
    # Map the field type onto the names of the arrays to erase
    components = None
    for key, names in (('rho', ('rho',)),
                       ('J', ('Jr', 'Jt', 'Jz')),
                       ('E', ('Er', 'Et', 'Ez')),
                       ('B', ('Br', 'Bt', 'Bz'))):
        if fieldtype == key:
            components = names
            break

    if self.use_cuda:
        # Obtain the cuda grid
        dim_grid, dim_block = cuda_tpb_bpg_2d(self.Nz, self.Nr)
        if components is None:
            raise ValueError('Invalid string for fieldtype: %s' % fieldtype)
        # Scalar kernel for rho, vector kernel for 3-component fields
        if len(components) == 1:
            kernel = cuda_erase_scalar
        else:
            kernel = cuda_erase_vector
        for m in range(self.Nm):
            mode_grid = self.interp[m]
            arrays = tuple(getattr(mode_grid, name) for name in components)
            kernel[dim_grid, dim_block](*arrays)
    else:
        if components is None:
            raise ValueError('Invalid string for fieldtype: %s' % fieldtype)
        # Zero the arrays in place on the CPU
        for m in range(self.Nm):
            mode_grid = self.interp[m]
            for name in components:
                getattr(mode_grid, name)[:, :] = 0.
def filter(self, fieldtype):
    """
    Filter the field `fieldtype`, using the filter arrays along z and r.

    Parameter
    ---------
    fieldtype : string
        A string which represents the kind of field to be filtered
        (either 'E', 'B', 'J', 'rho_prev', 'rho_next', 'rho_next_z'
        or 'rho_next_xy')

    Raises
    ------
    ValueError
        If `fieldtype` is not one of the strings above.
    """
    on_gpu = self.use_cuda
    if on_gpu:
        # Obtain the cuda grid
        dim_grid, dim_block = cuda_tpb_bpg_2d(self.Nz, self.Nr)

    # Resolve the names of the spectral arrays to filter
    components = None
    for key, names in (('J', ('Jp', 'Jm', 'Jz')),
                       ('E', ('Ep', 'Em', 'Ez')),
                       ('B', ('Bp', 'Bm', 'Bz'))):
        if fieldtype == key:
            components = names
            break
    if components is None and fieldtype in ['rho_prev', 'rho_next',
                                            'rho_next_z', 'rho_next_xy']:
        # Scalar fields are stored under an attribute of the same name
        components = (fieldtype,)
    if components is None:
        raise ValueError('Invalid string for fieldtype: %s'%fieldtype)

    arrays = tuple(getattr(self, name) for name in components)
    is_vector = (len(arrays) == 3)

    if on_gpu:
        # Filter fields on the GPU
        kernel = cuda_filter_vector if is_vector else cuda_filter_scalar
        args = arrays + (self.Nz, self.Nr,
                         self.d_filter_array_z, self.d_filter_array_r)
        kernel[dim_grid, dim_block](*args)
    else:
        # Filter fields on the CPU
        kernel = numba_filter_vector if is_vector else numba_filter_scalar
        args = arrays + (self.Nz, self.Nr,
                         self.filter_array_z, self.filter_array_r)
        kernel(*args)
def push_rho(self):
    """
    Copy the values of `rho_next` into `rho_prev`, then reset
    `rho_next` to zero. Both arrays are modified in place.
    """
    if not self.use_cuda:
        # CPU: in-place copy followed by in-place reset
        self.rho_prev[:, :] = self.rho_next[:, :]
        self.rho_next[:, :] = 0.
        return

    # GPU: a single kernel performs the copy and the reset
    dim_grid, dim_block = cuda_tpb_bpg_2d(self.Nz, self.Nr)
    cuda_push_rho[dim_grid, dim_block](
        self.rho_prev, self.rho_next, self.Nz, self.Nr)
def divide_by_volume(self, fieldtype):
    """
    Divide the field `fieldtype` in each cell by the cell volume,
    on the interpolation grid.

    This is typically done for rho and J, after the charge and
    current deposition. On the CPU, each field attribute is rebound to
    a newly allocated array (the previous array object is not modified).

    Parameter
    ---------
    fieldtype :
        A string which represents the kind of field to be divided by
        the volume (either 'rho' or 'J')

    Raises
    ------
    ValueError
        If `fieldtype` is neither 'rho' nor 'J'.
    """
    # Names of the arrays concerned, or None for an unknown field type
    if fieldtype == 'rho':
        components = ('rho',)
    elif fieldtype == 'J':
        components = ('Jr', 'Jt', 'Jz')
    else:
        components = None

    if self.use_cuda:
        # Perform division on the GPU
        dim_grid, dim_block = cuda_tpb_bpg_2d(self.Nz, self.Nr)
        if components is None:
            raise ValueError('Invalid string for fieldtype: %s' % fieldtype)
        if len(components) == 1:
            kernel = cuda_divide_scalar_by_volume
        else:
            kernel = cuda_divide_vector_by_volume
        for m in range(self.Nm):
            mode_grid = self.interp[m]
            args = tuple(getattr(mode_grid, name) for name in components)
            args += (mode_grid.d_invvol,)
            kernel[dim_grid, dim_block](*args)
    else:
        # Perform division on the CPU
        if components is None:
            raise ValueError('Invalid string for fieldtype: %s' % fieldtype)
        for m in range(self.Nm):
            mode_grid = self.interp[m]
            for name in components:
                # The inverse volume is broadcast along the first axis
                scaled = getattr(mode_grid, name) \
                    * mode_grid.invvol[np.newaxis, :]
                setattr(mode_grid, name, scaled)
def __init__(self, Nz, Nr, m, rmax, use_cuda=False):
    """
    Initializes the dht and fft attributes, which contain auxiliary
    matrices allowing to transform the fields quickly

    Parameters
    ----------
    Nz, Nr : int
        Number of points along z and r respectively

    m : int
        Index of the mode (needed for the Hankel transform)

    rmax : float
        The size of the simulation box along r.

    use_cuda : bool, optional
        Whether to use the GPU. If True is passed while `cuda_installed`
        is False, the object silently falls back to the CPU.
    """
    # Check whether to use the GPU (fall back to CPU if unavailable)
    if (use_cuda is True) and (cuda_installed is False):
        use_cuda = False
    self.use_cuda = use_cuda

    if self.use_cuda:
        # Initialize the dimension of the grid and blocks
        self.dim_grid, self.dim_block = cuda_tpb_bpg_2d(Nz, Nr, 1, 32)

    # Initialize the DHT (local implementation, see hankel.py)
    # for the orders m, m+1 and m-1
    self.dht0 = DHT(m, m, Nr, Nz, rmax, use_cuda=self.use_cuda)
    self.dhtp = DHT(m + 1, m, Nr, Nz, rmax, use_cuda=self.use_cuda)
    self.dhtm = DHT(m - 1, m, Nr, Nz, rmax, use_cuda=self.use_cuda)

    # Initialize the FFT
    self.fft = FFT(Nr, Nz, use_cuda=self.use_cuda)

    # Initialize the spectral buffers, on the device or on the host
    buffer_shape = (Nz, Nr)
    if self.use_cuda:
        allocate = cuda.device_array
    else:
        allocate = np.zeros
    self.spect_buffer_r = allocate(buffer_shape, dtype=np.complex128)
    self.spect_buffer_t = allocate(buffer_shape, dtype=np.complex128)

    # Different names for same object (for economy of memory)
    self.spect_buffer_p = self.spect_buffer_r
    self.spect_buffer_m = self.spect_buffer_t
def correct_currents(self, dt, ps):
    """
    Correct the currents so that they satisfy the
    charge conservation equation

    Parameters
    ----------
    dt : float
        Timestep of the simulation

    ps : object
        Object providing `V` and, when `V` is not None, the coefficients
        `j_corr_coef`, `T_eb`, `T_cc` (or their `d_`-prefixed
        counterparts on the GPU). When `ps.V` is None the standard PSATD
        correction is applied, otherwise the Galilean/comoving one.
    """
    # Precalculate useful coefficient
    inv_dt = 1. / dt

    comoving = ps.V is not None
    # Arguments shared by all the correction kernels
    spectral_fields = (self.rho_prev, self.rho_next,
                       self.Jp, self.Jm, self.Jz)
    trailing_args = (inv_dt, self.Nz, self.Nr)

    if self.use_cuda:
        # Obtain the cuda grid
        dim_grid, dim_block = cuda_tpb_bpg_2d(self.Nz, self.Nr)
        # Correct the currents on the GPU
        args = spectral_fields + (self.d_kz, self.d_kr, self.d_inv_k2)
        if comoving:
            # With Galilean/comoving algorithm
            args += (ps.d_j_corr_coef, ps.d_T_eb, ps.d_T_cc)
            kernel = cuda_correct_currents_comoving
        else:
            # With standard PSATD algorithm
            kernel = cuda_correct_currents_standard
        kernel[dim_grid, dim_block](*(args + trailing_args))
    else:
        # Correct the currents on the CPU
        args = spectral_fields + (self.kz, self.kr, self.inv_k2)
        if comoving:
            # With Galilean/comoving algorithm
            args += (ps.j_corr_coef, ps.T_eb, ps.T_cc)
            kernel = numba_correct_currents_comoving
        else:
            # With standard PSATD algorithm
            kernel = numba_correct_currents_standard
        kernel(*(args + trailing_args))
def handle_scal_buffer(self, grid, method, exchange_type, use_cuda,
                       before_sending=False, after_receiving=False,
                       gpudirect=False):
    """
    Scalar field buffer handling

    1) Copies data from the field grid to the MPI sending buffers
    -- or --
    2) Replaces or adds MPI receiving buffers to the field grid

    For method 'replace':

    Either copy the inner part of the domain to the sending buffer
    for a scalar field, or replace the guard cells of the domain by the
    receiving buffer for a scalar field.

    For method 'add':

    Either copy the inner part and the guard region of the domain to the
    sending buffer for a scalar field, or add the receiving buffer for
    the scalar field to the guard cells and the inner region of the
    domain.

    Depending on whether the field data is initially on the CPU
    or on the GPU, this function will do the appropriate exchange
    with the device.

    If neither `before_sending` nor `after_receiving` is set, no data is
    moved. If both are set, only the `before_sending` step is performed.

    Parameters
    ----------
    grid: list of 2darrays
        (One element per azimuthal mode)
        The 2d arrays represent the fields on the interpolation grid

    method: str
        Can either be 'replace' or 'add' depending on the type
        of field exchange that is needed

    exchange_type: hashable
        Key used to select the buffers in the dictionaries
        `self.send_l`, `self.send_r`, `self.recv_l`, `self.recv_r`
        (and their `d_`-prefixed device counterparts)

    use_cuda: bool
        Whether the simulation runs on GPUs. If True,
        the buffers are copied to the GPU arrays after the MPI exchange.

    before_sending: bool
        Whether to copy the inner part of the domain to the sending buffer

    after_receiving: bool
        Whether to copy the receiving buffer to the guard cells

    gpudirect: bool
        - if `gpudirect` is True:
          Uses the CUDA GPUDirect feature on clusters
          that have a working CUDA-aware MPI implementation.
        - if `gpudirect` is False: (default)
          Standard MPI communication is performed when using CUDA
          for computation. This involves a manual GPU to CPU memory
          copy before exchanging information between MPI domains.

    Raises
    ------
    ValueError
        If `method` is neither 'replace' nor 'add'.
    """
    # Define region that is copied to or from the buffer
    # depending on the method used.
    if method == 'replace':
        nz_start = self.n_guard
    elif method == 'add':
        nz_start = 0
    else:
        # Fail early instead of hitting unbound local variables (or
        # silently skipping the exchange) further down
        raise ValueError('Invalid string for method: %s' % method)
    nz_end = 2 * self.n_guard
    width = nz_end - nz_start

    # Whether or not to send to the left or right neighbor
    copy_left = (self.left_proc is not None)
    copy_right = (self.right_proc is not None)
    Nz = grid[0].shape[0]

    # When using the GPU
    if use_cuda:
        # Calculate the number of blocks and threads per block
        dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d(width, self.Nr)

        if before_sending:
            d_send_l = self.d_send_l[exchange_type]
            d_send_r = self.d_send_r[exchange_type]
            # Copy the inner regions of the domain to the buffers
            for m in range(self.Nm):
                copy_scal_to_gpu_buffer[dim_grid_2d, dim_block_2d](
                    d_send_l, d_send_r, grid[m], m,
                    copy_left, copy_right, nz_start, nz_end)
            # If GPUDirect with CUDA-aware MPI is not used,
            # copy the GPU buffers to the sending CPU buffers
            if not gpudirect:
                if copy_left:
                    d_send_l.copy_to_host(self.send_l[exchange_type])
                if copy_right:
                    d_send_r.copy_to_host(self.send_r[exchange_type])

        elif after_receiving:
            d_recv_l = self.d_recv_l[exchange_type]
            d_recv_r = self.d_recv_r[exchange_type]
            # If GPUDirect with CUDA-aware MPI is not used,
            # copy the CPU receiving buffers to the GPU buffers
            if not gpudirect:
                if copy_left:
                    d_recv_l.copy_to_device(self.recv_l[exchange_type])
                if copy_right:
                    d_recv_r.copy_to_device(self.recv_r[exchange_type])
            # Replace the guard cells by the buffers, or add the buffers
            # to the domain
            if method == 'replace':
                kernel = replace_scal_from_gpu_buffer
            else:
                kernel = add_scal_from_gpu_buffer
            for m in range(self.Nm):
                kernel[dim_grid_2d, dim_block_2d](
                    d_recv_l, d_recv_r, grid[m], m,
                    copy_left, copy_right, nz_start, nz_end)
        return

    # Without GPU
    if before_sending:
        send_l = self.send_l[exchange_type]
        send_r = self.send_r[exchange_type]
        # Copy the inner regions of the domain to the buffer
        for m in range(self.Nm):
            if copy_left:
                send_l[m, :, :] = grid[m][nz_start:nz_end, :]
            if copy_right:
                send_r[m, :, :] = grid[m][Nz - nz_end:Nz - nz_start, :]

    elif after_receiving:
        recv_l = self.recv_l[exchange_type]
        recv_r = self.recv_r[exchange_type]
        accumulate = (method == 'add')
        for m in range(self.Nm):
            if copy_left:
                if accumulate:
                    grid[m][:width, :] += recv_l[m, :, :]
                else:
                    grid[m][:width, :] = recv_l[m, :, :]
            if copy_right:
                # `Nz - width:` rather than `-width:`, so that a width
                # of 0 selects no cell instead of the whole array
                if accumulate:
                    grid[m][Nz - width:, :] += recv_r[m, :, :]
                else:
                    grid[m][Nz - width:, :] = recv_r[m, :, :]
def push_eb_with(self, ps, use_true_rho=False):
    """
    Push the fields over one timestep, using the psatd coefficients.

    The standard PSATD kernels are used when `ps.V` is None, and the
    Galilean/comoving kernels otherwise. When `self.use_pml` is true,
    the PML split components are pushed before the regular fields.

    Parameters
    ----------
    ps : PsatdCoeffs object
        psatd object corresponding to the same m mode

    use_true_rho : bool, optional
        Whether to use the rho projected on the grid.
        If set to False, this will use div(E) and div(J)
        to evaluate rho and its time evolution.
        In the case use_true_rho==False, the rho projected
        on the grid is used only to correct the currents, and
        the simulation can be run without the neutralizing ions.
    """
    # Check that psatd object passed as argument is the right one
    # (i.e. corresponds to the right mode)
    assert (self.m == ps.m)

    comoving = ps.V is not None

    # Select the kernels and the (device or host) coefficient arrays
    if self.use_cuda:
        # Obtain the cuda grid
        dim_grid, dim_block = cuda_tpb_bpg_2d(self.Nz, self.Nr, 1, 16)
        prefix = 'd_'
        if self.use_pml:
            if comoving:
                pml_kernel = cuda_push_eb_pml_comoving[dim_grid, dim_block]
            else:
                pml_kernel = cuda_push_eb_pml_standard[dim_grid, dim_block]
        if comoving:
            push_kernel = cuda_push_eb_comoving[dim_grid, dim_block]
        else:
            push_kernel = cuda_push_eb_standard[dim_grid, dim_block]
    else:
        prefix = ''
        if self.use_pml:
            if comoving:
                pml_kernel = numba_push_eb_pml_comoving
            else:
                pml_kernel = numba_push_eb_pml_standard
        if comoving:
            push_kernel = numba_push_eb_comoving
        else:
            push_kernel = numba_push_eb_standard

    def coef(name):
        # Coefficient array of `ps`, on the device when running on GPU
        return getattr(ps, prefix + name)

    wavenumbers = (getattr(self, prefix + 'kr'), getattr(self, prefix + 'kz'))
    propagation = (coef('C'), coef('S_w'))
    grid_size = (self.Nz, self.Nr)

    if self.use_pml:
        # Push the PML split component
        pml_args = (self.Ep_pml, self.Em_pml, self.Bp_pml, self.Bm_pml,
                    self.Ez, self.Bz) + propagation
        if comoving:
            pml_args += (coef('T_eb'),)
        pml_args += wavenumbers + grid_size
        pml_kernel(*pml_args)

    # Push the regular fields
    push_args = (self.Ep, self.Em, self.Ez, self.Bp, self.Bm, self.Bz,
                 self.Jp, self.Jm, self.Jz, self.rho_prev, self.rho_next,
                 coef('rho_prev_coef'), coef('rho_next_coef'),
                 coef('j_coef')) + propagation
    if comoving:
        push_args += (coef('T_eb'), coef('T_cc'), coef('T_rho'))
    push_args += wavenumbers + (ps.dt,)
    if comoving:
        push_args += (ps.V,)
    push_args += (use_true_rho,) + grid_size
    push_kernel(*push_args)
def __init__(self, Nr, Nz, use_cuda=False, nthreads=None):
    """
    Initialize an FFT object

    Depending on the available libraries, the transform is set up with
    cuFFT (GPU), MKL or FFTW (CPU).

    Parameters
    ----------
    Nr: int
       Number of grid points along the r axis (axis -1)

    Nz: int
       Number of grid points along the z axis (axis 0)

    use_cuda: bool, optional
       Whether to perform the Fourier transform on the GPU. If True is
       passed while `cuda_installed` is False, a message is printed and
       the CPU is used instead.

    nthreads : int, optional
        Number of threads for the FFTW transform.
        If None, the default number of threads of numba is used
        (environment variable NUMBA_NUM_THREADS)
    """
    # Check whether to use cuda
    self.use_cuda = use_cuda
    if (self.use_cuda is True) and (cuda_installed is False):
        self.use_cuda = False
        print('** Cuda not available for Fourier transform.')
        print('** Performing the Fourier transform on the CPU.')

    # Check whether to use MKL
    self.use_mkl = mkl_installed

    if self.use_cuda:
        # --- Calculation on the GPU ---
        # Optimal number of CUDA threads per block for the
        # copy 1d/2d kernels (determined empirically)
        if cuda_gpu_model == "V100":
            tpb_z, tpb_r = 8, 32
        else:
            tpb_z, tpb_r = 2, 16
        # Initialize the dimension of the grid and blocks
        self.dim_grid, self.dim_block = \
            cuda_tpb_bpg_2d(Nz, Nr, tpb_z, tpb_r)

        # Initialize 1d buffers for cufft
        flat_shape = (Nz * Nr, )
        self.buffer1d_in = cupy.empty(flat_shape, dtype=np.complex128)
        self.buffer1d_out = cupy.empty(flat_shape, dtype=np.complex128)
        # Initialize the CUDA FFT plan object
        self.fft = cufft.Plan1d(Nz, cufft.CUFFT_Z2Z, Nr)
        # For normalization of the iFFT
        self.inv_Nz = 1. / Nz
        return

    # --- Calculation on the CPU ---
    grid_shape = (Nz, Nr)
    if self.use_mkl:
        # MKL FFT: initialize the plan with a dummy array
        spect_buffer = np.zeros(grid_shape, dtype=np.complex128)
        self.mklfft = MKLFFT(spect_buffer)
        return

    # FFTW: determine the number of threads
    if nthreads is None:
        # Get the default number of threads for numba
        nthreads = numba.config.NUMBA_NUM_THREADS
    # Initialize the FFT plans with dummy arrays
    interp_buffer = np.zeros(grid_shape, dtype=np.complex128)
    spect_buffer = np.zeros(grid_shape, dtype=np.complex128)
    self.fft = pyfftw.FFTW(interp_buffer, spect_buffer,
                           axes=(0, ), direction='FFTW_FORWARD',
                           threads=nthreads)
    self.ifft = pyfftw.FFTW(spect_buffer, interp_buffer,
                            axes=(0, ), direction='FFTW_BACKWARD',
                            threads=nthreads)
def __init__(self, p, m, Nr, Nz, rmax, use_cuda=False):
    """
    Calculate the r (position) and nu (frequency) grid
    on which the transform will operate.

    Also store auxiliary data needed for the transform: the matrices
    `invM` and `M`, and the real buffer arrays used for the matrix
    products.

    Parameters:
    ------------
    p: int
    Order of the Hankel transform

    m: int
    The azimuthal mode for which the Hankel transform is calculated

    Nr, Nz: int
    Number of points in the r direction and z direction

    rmax: float
    Edge of the box in which the Hankel transform is taken
    (The function is assumed to be zero at that point.)

    use_cuda: bool, optional
    Whether to use the GPU for the Hankel transform. If cuda is
    requested but not installed, a message is printed and the CPU is
    used instead.

    Raises
    ------
    ValueError
    If m is not one of p-1, p, p+1.
    """
    # Register whether to use the GPU (fall back to CPU if unavailable)
    self.use_cuda = use_cuda
    if (self.use_cuda == True) and (cuda_installed == False):
        self.use_cuda = False
        print('** Cuda not available for Hankel transform.')
        print('** Performing the Hankel transform on the CPU.')

    # Check that m has a valid value
    if m not in (p-1, p, p+1):
        raise ValueError('m must be either p-1, p or p+1')

    # Register values of the arguments
    self.p = p
    self.m = m
    self.Nr = Nr
    self.rmax = rmax
    self.Nz = Nz

    # Calculate the zeros of the Bessel function
    if m == 0:
        alphas = jn_zeros(m, Nr)
    else:
        # In this case, 0 is a zero of the Bessel function of order m.
        # It turns out that it is needed to reconstruct the signal
        # for p=0.
        alphas = np.hstack((np.array([0.]), jn_zeros(m, Nr-1)))

    # Calculate the spectral grid
    self.nu = 1./(2*np.pi*rmax) * alphas

    # Calculate the spatial grid (Uniform grid with an half-cell offset)
    self.r = (rmax*1./Nr) * ( np.arange(Nr) + 0.5 )

    # Calculate and store the inverse matrix invM
    # (imposed by the constraints on the DHT of Bessel modes)
    # NB: When compared with the FBPIC article, all the matrices here
    # are calculated in transposed form. This is done so as to use the
    # `dot` and `gemm` functions, in the `transform` method.
    bessel_order = p+1 if p == m else p
    norm = np.pi * rmax**2 * jn( bessel_order, alphas)**2
    kernel = jn( p, 2*np.pi* self.r[np.newaxis,:]*self.nu[:,np.newaxis] )

    self.invM = np.empty((Nr, Nr))
    if m == 0:
        self.invM[:, :] = kernel[:, :] / norm[:, np.newaxis]
    else:
        self.invM[1:, :] = kernel[1:, :] / norm[1:, np.newaxis]
        # In this case, the functions are represented by Bessel functions
        # *and* an additional mode (below) which satisfies the same
        # algebraic relations for curl/div/grad as the regular Bessel
        # modes, with the value kperp=0.
        # The normalization of this mode is arbitrary, and is chosen
        # so that the condition number of invM is close to 1
        if p == m-1:
            self.invM[0, :] = self.r**(m-1) * 1./( np.pi * rmax**(m+1) )
        else:
            self.invM[0, :] = 0.

    # Calculate the matrix M by inverting invM
    if m != 0 and p != m-1:
        # The first row of invM is zero here: use the pseudo-inverse of
        # the remaining rows, and set the first column of M to zero
        self.M = np.empty((Nr, Nr))
        self.M[:, 1:] = np.linalg.pinv( self.invM[1:,:] )
        self.M[:, 0] = 0.
    else:
        self.M = np.linalg.inv( self.invM )

    # Copy the matrices to the GPU if needed
    if self.use_cuda:
        self.d_M = cupy.asarray( self.M )
        self.d_invM = cupy.asarray( self.invM )

    # Initialize buffer arrays to store the complex Nz x Nr grid
    # as a real 2Nz x Nr grid, before performing the matrix product
    # (This is because a matrix product of reals is faster than a matrix
    # product of complexs, and the real-complex conversion is negligible.)
    real_shape = (2*Nz, Nr)
    if self.use_cuda:
        # Initialize real buffer arrays on the GPU
        host_zeros = np.zeros(real_shape, dtype=np.float64)
        self.d_in = cupy.asarray( host_zeros )
        self.d_out = cupy.asarray( host_zeros )
        # Initialize cuBLAS
        self.blas = device.get_cublas_handle()
        # Set optimal number of CUDA threads per block
        # for copy 2d real/complex (determined empirically)
        if cuda_gpu_model == "V100":
            copy_tpb = (8, 32)
        else:
            copy_tpb = (2, 16)
        # Initialize the threads per block and block per grid
        self.dim_grid, self.dim_block = cuda_tpb_bpg_2d(Nz, Nr, *copy_tpb)
    else:
        # Initialize real buffer arrays on the CPU
        self.array_in = np.zeros(real_shape, dtype=np.float64)
        self.array_out = np.zeros(real_shape, dtype=np.float64)
def filter(self, fieldtype):
    """
    Filter the field `fieldtype`, by multiplying it with the filter
    array.

    On the CPU, each field attribute is rebound to a newly allocated
    array (the previous array object is not modified).

    Parameter
    ---------
    fieldtype : string
        A string which represents the kind of field to be filtered
        (either 'E', 'B', 'J', 'rho_next' or 'rho_prev')

    Raises
    ------
    ValueError
        If `fieldtype` is not one of the strings above.
    """
    # Resolve the names of the spectral arrays to filter
    components = None
    for key, names in (('rho_prev', ('rho_prev',)),
                       ('rho_next', ('rho_next',)),
                       ('J', ('Jp', 'Jm', 'Jz')),
                       ('E', ('Ep', 'Em', 'Ez')),
                       ('B', ('Bp', 'Bm', 'Bz'))):
        if fieldtype == key:
            components = names
            break

    if self.use_cuda:
        # Obtain the cuda grid
        dim_grid, dim_block = cuda_tpb_bpg_2d(self.Nz, self.Nr)
        if components is None:
            raise ValueError('Invalid string for fieldtype: %s' % fieldtype)
        # Filter fields on the GPU
        if len(components) == 1:
            kernel = cuda_filter_scalar
        else:
            kernel = cuda_filter_vector
        args = tuple(getattr(self, name) for name in components)
        args += (self.d_filter_array, self.Nz, self.Nr)
        kernel[dim_grid, dim_block](*args)
    else:
        if components is None:
            raise ValueError('Invalid string for fieldtype: %s' % fieldtype)
        # Filter fields on the CPU
        for name in components:
            setattr(self, name, getattr(self, name) * self.filter_array)
def handle_scal_buffer(self, grid, method, use_cuda,
                       before_sending=False, after_receiving=False):
    """
    Scalar field buffer handling

    1) Copies data from the field grid to the MPI sending buffers
    -- or --
    2) Replaces or adds MPI receiving buffers to the field grid

    For method 'replace':

    Either copy the inner part of the domain to the sending buffer
    for a scalar field, or replace the guard cells of the domain by the
    receiving buffer for a scalar field.

    For method 'add':

    Either copy the inner part and the guard region of the domain to the
    sending buffer for a scalar field, or add the receiving buffer for
    the scalar field to the guard cells and the inner region of the
    domain.

    Depending on whether the field data is initially on the CPU
    or on the GPU, this function will do the appropriate exchange
    with the device.

    If neither `before_sending` nor `after_receiving` is set, no data is
    moved. If both are set, only the `before_sending` step is performed.

    Parameters
    ----------
    grid: list of 2darrays
        (One element per azimuthal mode)
        The 2d arrays represent the fields on the interpolation grid

    method: str
        Can either be 'replace' or 'add' depending on the type
        of field exchange that is needed

    use_cuda: bool
        Whether the simulation runs on GPUs. If True,
        the buffers are copied to the GPU arrays after the MPI exchange.

    before_sending: bool
        Whether to copy the inner part of the domain to the sending buffer

    after_receiving: bool
        Whether to copy the receiving buffer to the guard cells

    Raises
    ------
    ValueError
        If `method` is neither 'replace' nor 'add'.
    """
    # Define region that is copied to or from the buffer
    # depending on the method used.
    if method == 'replace':
        nz_start = self.n_guard
    elif method == 'add':
        nz_start = 0
    else:
        # Fail early instead of hitting unbound local variables (or
        # silently skipping the exchange) further down
        raise ValueError('Invalid string for method: %s' % method)
    replace = (method == 'replace')
    nz_end = 2 * self.n_guard
    width = nz_end - nz_start

    # Whether or not to exchange with the left or right neighbor
    copy_left = (self.left_proc is not None)
    copy_right = (self.right_proc is not None)
    Nz = grid[0].shape[0]

    # When using the GPU
    if use_cuda:
        # Calculate the number of blocks and threads per block
        dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d(width, self.Nr)

        if before_sending:
            # Select the device and host sending buffers
            if replace:
                d_buffer_l = self.d_scal_rep_buffer_l
                d_buffer_r = self.d_scal_rep_buffer_r
            else:
                d_buffer_l = self.d_scal_add_buffer_l
                d_buffer_r = self.d_scal_add_buffer_r
            # Copy the selected region of the domain to the GPU buffers
            for m in range(self.Nm):
                copy_scal_to_gpu_buffer[dim_grid_2d, dim_block_2d](
                    d_buffer_l, d_buffer_r, grid[m], m,
                    copy_left, copy_right, nz_start, nz_end)
            # Copy the GPU buffers to the sending CPU buffers
            if copy_left:
                d_buffer_l.copy_to_host(
                    self.scal_rep_send_l if replace
                    else self.scal_add_send_l)
            if copy_right:
                d_buffer_r.copy_to_host(
                    self.scal_rep_send_r if replace
                    else self.scal_add_send_r)

        elif after_receiving:
            # Select the device buffers and the kernel
            if replace:
                d_buffer_l = self.d_scal_rep_buffer_l
                d_buffer_r = self.d_scal_rep_buffer_r
                kernel = replace_scal_from_gpu_buffer
            else:
                d_buffer_l = self.d_scal_add_buffer_l
                d_buffer_r = self.d_scal_add_buffer_r
                kernel = add_scal_from_gpu_buffer
            # Copy the CPU receiving buffers to the GPU buffers
            if copy_left:
                d_buffer_l.copy_to_device(
                    self.scal_rep_recv_l if replace
                    else self.scal_add_recv_l)
            if copy_right:
                d_buffer_r.copy_to_device(
                    self.scal_rep_recv_r if replace
                    else self.scal_add_recv_r)
            # Replace the guard cells by the buffers, or add the buffers
            # to the domain
            for m in range(self.Nm):
                kernel[dim_grid_2d, dim_block_2d](
                    d_buffer_l, d_buffer_r, grid[m], m,
                    copy_left, copy_right, nz_start, nz_end)
        return

    # Without GPU
    if before_sending:
        # Copy the selected region of the domain to the sending buffers
        if copy_left:
            send_l = self.scal_rep_send_l if replace \
                else self.scal_add_send_l
            for m in range(self.Nm):
                send_l[m, :, :] = grid[m][nz_start:nz_end, :]
        if copy_right:
            send_r = self.scal_rep_send_r if replace \
                else self.scal_add_send_r
            for m in range(self.Nm):
                send_r[m, :, :] = grid[m][Nz - nz_end:Nz - nz_start, :]

    elif after_receiving:
        if copy_left:
            recv_l = self.scal_rep_recv_l if replace \
                else self.scal_add_recv_l
            for m in range(self.Nm):
                if replace:
                    grid[m][:width, :] = recv_l[m, :, :]
                else:
                    grid[m][:width, :] += recv_l[m, :, :]
        if copy_right:
            recv_r = self.scal_rep_recv_r if replace \
                else self.scal_add_recv_r
            for m in range(self.Nm):
                # `Nz - width:` rather than `-width:`, so that a width
                # of 0 selects no cell instead of the whole array
                if replace:
                    grid[m][Nz - width:, :] = recv_r[m, :, :]
                else:
                    grid[m][Nz - width:, :] += recv_r[m, :, :]
def __init__(self, Nr, Nz, use_cuda=False, nthreads=None):
    """
    Initialize an FFT object

    Parameters
    ----------
    Nr: int
        Number of grid points along the r axis (axis -1)

    Nz: int
        Number of grid points along the z axis (axis 0)

    use_cuda: bool, optional
        Whether to perform the Fourier transform on the GPU.
        If cuda is requested but not available, a notice is printed
        and the transform is set up on the CPU instead.

    nthreads : int, optional
        Number of threads for the FFTW transform.
        If None, the default number of threads of numba is used
        (environment variable NUMBA_NUM_THREADS)
    """
    # Check whether to use cuda
    # A truthiness test is used (rather than `is True` / `is False`) so
    # that truthy non-bool flags, e.g. 1 or numpy.bool_, also trigger the
    # CPU fallback instead of reaching the GPU branch without cuda.
    self.use_cuda = use_cuda
    if self.use_cuda and not cuda_installed:
        self.use_cuda = False
        print('** Cuda not available for Fourier transform.')
        print('** Performing the Fourier transform on the CPU.')

    # Check whether to use MKL
    self.use_mkl = mkl_installed

    # Initialize the object for calculation on the GPU
    if self.use_cuda:
        # Initialize the dimension of the grid and blocks
        self.dim_grid, self.dim_block = cuda_tpb_bpg_2d(Nz, Nr)

        # Initialize 1d buffers for cufft (flattened Nz*Nr complex arrays)
        self.buffer1d_in = cuda.device_array(
            (Nz * Nr, ), dtype=np.complex128)
        self.buffer1d_out = cuda.device_array(
            (Nz * Nr, ), dtype=np.complex128)

        # Initialize the cuda libraries object
        # (Nr batched 1d transforms of length Nz)
        self.fft = cufft.FFTPlan(shape=(Nz, ), itype=np.complex128,
                                 otype=np.complex128, batch=Nr)
        self.blas = cublas.Blas()

        # For normalization of the iFFT
        self.inv_Nz = 1. / Nz

    # Initialize the object for calculation on the CPU
    else:

        # For MKL FFT
        if self.use_mkl:
            # Initialize the MKL plan with dummy array
            spect_buffer = np.zeros((Nz, Nr), dtype=np.complex128)
            self.mklfft = MKLFFT(spect_buffer)

        # For FFTW
        else:
            # Determine number of threads
            if nthreads is None:
                # Get the default number of threads for numba
                nthreads = numba.config.NUMBA_NUM_THREADS

            # Initialize the FFT plans with dummy arrays
            # (forward and backward transforms along axis 0, i.e. z)
            interp_buffer = np.zeros((Nz, Nr), dtype=np.complex128)
            spect_buffer = np.zeros((Nz, Nr), dtype=np.complex128)
            self.fft = pyfftw.FFTW(interp_buffer, spect_buffer,
                                   axes=(0, ), direction='FFTW_FORWARD',
                                   threads=nthreads)
            self.ifft = pyfftw.FFTW(spect_buffer, interp_buffer,
                                    axes=(0, ), direction='FFTW_BACKWARD',
                                    threads=nthreads)
def correct_currents(self, dt, ps, current_correction):
    """
    Correct the currents so that they satisfy the
    charge conservation equation

    The correction kernel is selected from three switches: CPU or GPU
    (`self.use_cuda`), standard or Galilean/comoving PSATD (`ps.V`),
    and the requested correction method. Any other value of
    `current_correction` results in no kernel being called.

    Parameters
    ----------
    dt: float
        Timestep of the simulation

    ps: a PSATDCoefs object
        Contains coefficients that are used in the current correction
        (`ps.V is None` selects the standard PSATD kernels)

    current_correction: string
        The type of current correction performed
        ('curl-free' or 'cross-deposition')
    """
    # Inverse of the timestep, handed to every correction kernel
    inv_dt = 1. / dt

    # ---- CPU branch -----------------------------------------------------
    if not self.use_cuda:
        # Standard PSATD has no Galilean/comoving velocity
        standard_psatd = ps.V is None

        if current_correction == 'curl-free':
            if standard_psatd:
                numba_correct_currents_curlfree_standard(
                    self.rho_prev, self.rho_next,
                    self.Jp, self.Jm, self.Jz,
                    self.kz, self.kr, self.inv_k2,
                    inv_dt, self.Nz, self.Nr)
            else:
                numba_correct_currents_curlfree_comoving(
                    self.rho_prev, self.rho_next,
                    self.Jp, self.Jm, self.Jz,
                    self.kz, self.kr, self.inv_k2,
                    ps.j_corr_coef, ps.T_eb, ps.T_cc,
                    inv_dt, self.Nz, self.Nr)

        elif current_correction == 'cross-deposition':
            if standard_psatd:
                numba_correct_currents_crossdeposition_standard(
                    self.rho_prev, self.rho_next,
                    self.rho_next_z, self.rho_next_xy,
                    self.Jp, self.Jm, self.Jz,
                    self.kz, self.kr,
                    inv_dt, self.Nz, self.Nr)
            else:
                numba_correct_currents_crossdeposition_comoving(
                    self.rho_prev, self.rho_next,
                    self.rho_next_z, self.rho_next_xy,
                    self.Jp, self.Jm, self.Jz,
                    self.kz, self.kr,
                    ps.j_corr_coef, ps.T_eb, ps.T_cc,
                    inv_dt, self.Nz, self.Nr)
        return

    # ---- GPU branch -----------------------------------------------------
    # Launch configuration for the 2d (z, r) kernels
    dim_grid, dim_block = cuda_tpb_bpg_2d(self.Nz, self.Nr, 1, 16)
    standard_psatd = ps.V is None

    if current_correction == 'curl-free':
        if standard_psatd:
            kernel = cuda_correct_currents_curlfree_standard[
                dim_grid, dim_block]
            kernel(self.rho_prev, self.rho_next,
                   self.Jp, self.Jm, self.Jz,
                   self.d_kz, self.d_kr, self.d_inv_k2,
                   inv_dt, self.Nz, self.Nr)
        else:
            kernel = cuda_correct_currents_curlfree_comoving[
                dim_grid, dim_block]
            kernel(self.rho_prev, self.rho_next,
                   self.Jp, self.Jm, self.Jz,
                   self.d_kz, self.d_kr, self.d_inv_k2,
                   ps.d_j_corr_coef, ps.d_T_eb, ps.d_T_cc,
                   inv_dt, self.Nz, self.Nr)

    elif current_correction == 'cross-deposition':
        if standard_psatd:
            kernel = cuda_correct_currents_crossdeposition_standard[
                dim_grid, dim_block]
            kernel(self.rho_prev, self.rho_next,
                   self.rho_next_z, self.rho_next_xy,
                   self.Jp, self.Jm, self.Jz,
                   self.d_kz, self.d_kr,
                   inv_dt, self.Nz, self.Nr)
        else:
            kernel = cuda_correct_currents_crossdeposition_comoving[
                dim_grid, dim_block]
            kernel(self.rho_prev, self.rho_next,
                   self.rho_next_z, self.rho_next_xy,
                   self.Jp, self.Jm, self.Jz,
                   self.d_kz, self.d_kr,
                   ps.d_j_corr_coef, ps.d_T_eb, ps.d_T_cc,
                   inv_dt, self.Nz, self.Nr)