def copy_rho_buffer(self, iz_min, grid):
    """
    Add the small-size array rho_buffer into the full-size array rho

    Parameters
    ----------
    iz_min: int
        The z index in the full-size array, that corresponds
        to index 0 in the small-size array
        (i.e. position at which to add the small-size array
        into the full-size one)

    grid: a list of InterpolationGrid objects
        Contains the full-size array rho
    """
    Nm = len(grid)
    if type(grid[0].rho) is np.ndarray:
        # The large-size array rho is on the CPU
        for m in range(Nm):
            grid[m].rho[iz_min:iz_min+2] += self.rho_buffer[m]
    else:
        # The large-size array rho is on the GPU
        # Copy the small-size buffer to the GPU
        cuda.to_device(self.rho_buffer, to=self.d_rho_buffer)
        # On the GPU: add the small-size buffers to the large-size array
        dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d(grid[0].Nr, TPB=64)
        for m in range(Nm):
            add_rho_to_gpu_array[dim_grid_1d, dim_block_1d](
                iz_min, self.d_rho_buffer, grid[m].rho, m)
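# For reference: a minimal sketch of what a kernel like
# `add_rho_to_gpu_array` could look like. This is illustrative only
# (it assumes a Numba-CUDA kernel with one thread per radial cell and
# a buffer of shape (Nm, 2, Nr)); the actual FBPIC kernel may differ.
from numba import cuda

@cuda.jit
def add_rho_to_gpu_array_sketch(iz_min, rho_buffer, rho, m):
    ir = cuda.grid(1)
    if ir < rho.shape[1]:
        # Add the two longitudinal cells of the buffer, for mode m
        rho[iz_min, ir] += rho_buffer[m, 0, ir]
        rho[iz_min + 1, ir] += rho_buffer[m, 1, ir]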
def __init__( self, n_pml, cdt_over_dr ):
    """
    Initialize a damping object.

    Parameters
    ----------
    n_pml: int
        Number of PML cells in the radial direction

    cdt_over_dr: float
        Ratio of timestep to radial cell size
        (needed for calculation of damping coefficients)
    """
    # Register the number of PML cells
    self.n_pml = n_pml
    # Create the damping arrays
    self.damp_array = generate_pml_damp_array( n_pml, cdt_over_dr )
    # Transfer the damping array to the GPU
    if cuda_installed:
        self.d_damp_array = cuda.to_device( self.damp_array )
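# `generate_pml_damp_array` is defined elsewhere and not shown here.
# As a purely hypothetical illustration (not FBPIC's actual profile),
# a radial PML damping array is typically a smooth ramp whose strength
# per step scales with cdt_over_dr:
import numpy as np

def generate_pml_damp_array_sketch(n_pml, cdt_over_dr):
    # Sine-squared ramp over the PML cells (assumed profile)
    x = np.arange(n_pml) / n_pml
    return np.exp( -4. * cdt_over_dr * np.sin(0.5*np.pi*x)**2 )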
def send_particles_to_gpu( self ):
    """
    Copy the particles to the GPU.
    Particle arrays of self now point to the GPU arrays.
    """
    if self.use_cuda:
        # Send positions, velocities, inverse gamma and weights
        # to the GPU (CUDA)
        self.x = cuda.to_device(self.x)
        self.y = cuda.to_device(self.y)
        self.z = cuda.to_device(self.z)
        self.ux = cuda.to_device(self.ux)
        self.uy = cuda.to_device(self.uy)
        self.uz = cuda.to_device(self.uz)
        self.inv_gamma = cuda.to_device(self.inv_gamma)
        self.w = cuda.to_device(self.w)
        # Copy arrays on the GPU for the field
        # gathering and the particle push
        self.Ex = cuda.to_device(self.Ex)
        self.Ey = cuda.to_device(self.Ey)
        self.Ez = cuda.to_device(self.Ez)
        self.Bx = cuda.to_device(self.Bx)
        self.By = cuda.to_device(self.By)
        self.Bz = cuda.to_device(self.Bz)
        # Copy arrays on the GPU for the sorting
        self.cell_idx = cuda.to_device(self.cell_idx)
        self.sorted_idx = cuda.to_device(self.sorted_idx)
        self.prefix_sum = cuda.to_device(self.prefix_sum)
        self.sorting_buffer = cuda.to_device(self.sorting_buffer)
        if self.n_integer_quantities > 0:
            self.int_sorting_buffer = cuda.to_device(self.int_sorting_buffer)
        # Copy particle tracker data
        if self.tracker is not None:
            self.tracker.send_to_gpu()
        # Copy the ionization data
        if self.ionizer is not None:
            self.ionizer.send_to_gpu()
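# A quick standalone illustration (not FBPIC code) of the device/host
# round trip used above: `cuda.to_device` returns a device array whose
# `copy_to_host` method brings the data back as a NumPy array, e.g. in
# a corresponding receive-particles-from-GPU step.
import numpy as np
from numba import cuda

x = np.linspace(0., 1., 8)
d_x = cuda.to_device(x)       # attribute would now point to GPU memory
x_back = d_x.copy_to_host()   # copy back to the CPU
assert np.allclose(x, x_back)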
def add_buffers_gpu( species, float_recv_left, float_recv_right,
                     uint_recv_left, uint_recv_right):
    """
    Add the particles stored in recv_left and recv_right
    to the existing particles in species.

    Parameters
    ----------
    species: a Particles object
        Contains the particles that stayed on the present processor

    float_recv_left, float_recv_right, uint_recv_left, uint_recv_right:
        arrays of shape (n_float, Nptcl) and (n_int, Nptcl),
        where Nptcl is the number of particles received from the left
        and right procs respectively, and where n_float and n_int are
        the number of float and integer quantities respectively.
        These arrays are always on the CPU (since they were used for MPI)
    """
    # Get the new number of particles
    old_Ntot = species.Ntot
    n_left = float_recv_left.shape[1]
    n_right = float_recv_right.shape[1]
    new_Ntot = old_Ntot + n_left + n_right

    # Get the threads per block and the blocks per grid
    n_left_grid, n_left_block = cuda_tpb_bpg_1d( n_left )
    n_right_grid, n_right_block = cuda_tpb_bpg_1d( n_right )
    n_old_grid, n_old_block = cuda_tpb_bpg_1d( old_Ntot )

    # Iterate over particle attributes
    # Build list of float attributes to copy
    attr_list = [ (species,'x'), (species,'y'), (species,'z'),
                  (species,'ux'), (species,'uy'), (species,'uz'),
                  (species,'inv_gamma'), (species,'w') ]
    if species.ionizer is not None:
        attr_list += [ (species.ionizer, 'w_times_level') ]
    # Loop through the float quantities
    for i_attr in range( len(attr_list) ):
        # Copy the proper buffers to the GPU
        left_buffer = cuda.to_device( float_recv_left[i_attr] )
        right_buffer = cuda.to_device( float_recv_right[i_attr] )
        # Initialize the new particle array
        particle_array = cuda.device_array( (new_Ntot,), dtype=np.float64)
        # Merge the arrays on the GPU
        stay_buffer = getattr( attr_list[i_attr][0], attr_list[i_attr][1])
        if n_left != 0:
            copy_particles[n_left_grid, n_left_block](
                n_left, left_buffer, 0, particle_array, 0 )
        if old_Ntot != 0:
            copy_particles[n_old_grid, n_old_block](
                old_Ntot, stay_buffer, 0, particle_array, n_left )
        if n_right != 0:
            copy_particles[n_right_grid, n_right_block](
                n_right, right_buffer, 0, particle_array, n_left+old_Ntot )
        # Assign the merged array to the corresponding particle attribute
        setattr(attr_list[i_attr][0], attr_list[i_attr][1], particle_array)

    # Build list of integer quantities to copy
    attr_list = []
    if species.tracker is not None:
        attr_list.append( (species.tracker,'id') )
    if species.ionizer is not None:
        attr_list.append( (species.ionizer,'ionization_level') )
    # Loop through the integer quantities
    for i_attr in range( len(attr_list) ):
        # Copy the proper buffers to the GPU
        left_buffer = cuda.to_device( uint_recv_left[i_attr] )
        right_buffer = cuda.to_device( uint_recv_right[i_attr] )
        # Initialize the new particle array
        particle_array = cuda.device_array( (new_Ntot,), dtype=np.uint64)
        # Merge the arrays on the GPU
        stay_buffer = getattr( attr_list[i_attr][0], attr_list[i_attr][1])
        if n_left != 0:
            copy_particles[n_left_grid, n_left_block](
                n_left, left_buffer, 0, particle_array, 0 )
        if old_Ntot != 0:
            copy_particles[n_old_grid, n_old_block](
                old_Ntot, stay_buffer, 0, particle_array, n_left )
        if n_right != 0:
            copy_particles[n_right_grid, n_right_block](
                n_right, right_buffer, 0, particle_array, n_left+old_Ntot )
        # Assign the merged array to the corresponding particle attribute
        setattr(attr_list[i_attr][0], attr_list[i_attr][1], particle_array)

    # Adapt the total number of particles
    species.Ntot = new_Ntot
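# For reference: a minimal sketch of what a kernel like `copy_particles`
# could look like, matching the call sites above (N, source array,
# source offset, destination array, destination offset). Illustrative
# only; the actual FBPIC kernel may differ.
from numba import cuda

@cuda.jit
def copy_particles_sketch(N, source, source_start, dest, dest_start):
    # One thread per copied particle
    i = cuda.grid(1)
    if i < N:
        dest[dest_start + i] = source[source_start + i]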
def __init__(self, n_guard, Nr, Nm, left_proc, right_proc):
    """
    Initialize the guard cell buffers for the fields.
    These buffers are used in order to group the MPI exchanges.

    Parameters
    ----------
    n_guard: int
        Number of guard cells

    Nr: int
        Number of points in the radial direction

    Nm: int
        Number of azimuthal modes

    left_proc, right_proc: int or None
        Rank of the procs to the left and to the right
        (None for open boundary)
    """
    # Register parameters
    self.Nr = Nr
    self.Nm = Nm
    self.n_guard = n_guard
    self.left_proc = left_proc
    self.right_proc = right_proc
    # Shortcut
    ng = self.n_guard

    # Allocate buffer arrays that are sent via MPI to exchange
    # the fields between domains (either replacing or adding fields)
    # Buffers are allocated for the left and right side of the domain
    if not cuda_installed:
        # Allocate buffers on the CPU
        # - Replacing vector field buffers
        self.vec_rep_send_l = np.empty((3 * Nm, ng, Nr), dtype=np.complex128)
        self.vec_rep_send_r = np.empty((3 * Nm, ng, Nr), dtype=np.complex128)
        self.vec_rep_recv_l = np.empty((3 * Nm, ng, Nr), dtype=np.complex128)
        self.vec_rep_recv_r = np.empty((3 * Nm, ng, Nr), dtype=np.complex128)
        # - Adding vector field buffers
        self.vec_add_send_l = np.empty((3 * Nm, 2 * ng, Nr), dtype=np.complex128)
        self.vec_add_send_r = np.empty((3 * Nm, 2 * ng, Nr), dtype=np.complex128)
        self.vec_add_recv_l = np.empty((3 * Nm, 2 * ng, Nr), dtype=np.complex128)
        self.vec_add_recv_r = np.empty((3 * Nm, 2 * ng, Nr), dtype=np.complex128)
        # - Replacing scalar field buffers
        self.scal_rep_send_l = np.empty((Nm, ng, Nr), dtype=np.complex128)
        self.scal_rep_send_r = np.empty((Nm, ng, Nr), dtype=np.complex128)
        self.scal_rep_recv_l = np.empty((Nm, ng, Nr), dtype=np.complex128)
        self.scal_rep_recv_r = np.empty((Nm, ng, Nr), dtype=np.complex128)
        # - Adding scalar field buffers
        self.scal_add_send_l = np.empty((Nm, 2 * ng, Nr), dtype=np.complex128)
        self.scal_add_send_r = np.empty((Nm, 2 * ng, Nr), dtype=np.complex128)
        self.scal_add_recv_l = np.empty((Nm, 2 * ng, Nr), dtype=np.complex128)
        self.scal_add_recv_r = np.empty((Nm, 2 * ng, Nr), dtype=np.complex128)
    else:
        # Allocate buffers on the CPU and GPU
        # Use cuda.pinned_array so that the CPU arrays are page-locked
        # (cannot be swapped out to disk, and the GPU can access them
        # via DMA)
        pin_ary = cuda.pinned_array
        # - Replacing vector field buffers
        self.vec_rep_send_l = pin_ary((3 * Nm, ng, Nr), dtype=np.complex128)
        self.vec_rep_send_r = pin_ary((3 * Nm, ng, Nr), dtype=np.complex128)
        self.vec_rep_recv_l = pin_ary((3 * Nm, ng, Nr), dtype=np.complex128)
        self.vec_rep_recv_r = pin_ary((3 * Nm, ng, Nr), dtype=np.complex128)
        self.d_vec_rep_buffer_l = cuda.to_device(self.vec_rep_send_l)
        self.d_vec_rep_buffer_r = cuda.to_device(self.vec_rep_send_r)
        # - Adding vector field buffers
        self.vec_add_send_l = pin_ary((3 * Nm, 2 * ng, Nr), dtype=np.complex128)
        self.vec_add_send_r = pin_ary((3 * Nm, 2 * ng, Nr), dtype=np.complex128)
        self.vec_add_recv_l = pin_ary((3 * Nm, 2 * ng, Nr), dtype=np.complex128)
        self.vec_add_recv_r = pin_ary((3 * Nm, 2 * ng, Nr), dtype=np.complex128)
        self.d_vec_add_buffer_l = cuda.to_device(self.vec_add_send_l)
        self.d_vec_add_buffer_r = cuda.to_device(self.vec_add_send_r)
        # - Replacing scalar field buffers
        self.scal_rep_send_l = pin_ary((Nm, ng, Nr), dtype=np.complex128)
        self.scal_rep_send_r = pin_ary((Nm, ng, Nr), dtype=np.complex128)
        self.scal_rep_recv_l = pin_ary((Nm, ng, Nr), dtype=np.complex128)
        self.scal_rep_recv_r = pin_ary((Nm, ng, Nr), dtype=np.complex128)
        self.d_scal_rep_buffer_l = cuda.to_device(self.scal_rep_send_l)
        self.d_scal_rep_buffer_r = cuda.to_device(self.scal_rep_send_r)
        # - Adding scalar field buffers
        self.scal_add_send_l = pin_ary((Nm, 2 * ng, Nr), dtype=np.complex128)
        self.scal_add_send_r = pin_ary((Nm, 2 * ng, Nr), dtype=np.complex128)
        self.scal_add_recv_l = pin_ary((Nm, 2 * ng, Nr), dtype=np.complex128)
        self.scal_add_recv_r = pin_ary((Nm, 2 * ng, Nr), dtype=np.complex128)
        self.d_scal_add_buffer_l = cuda.to_device(self.scal_add_send_l)
        self.d_scal_add_buffer_r = cuda.to_device(self.scal_add_send_r)
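# A self-contained illustration (not FBPIC code) of why pinned memory
# is used above: page-locked host arrays enable asynchronous, DMA-based
# transfers that can overlap with kernel execution via CUDA streams.
import numpy as np
from numba import cuda

stream = cuda.stream()
host_buf = cuda.pinned_array((16, 32), dtype=np.complex128)
host_buf[:] = 0.
d_buf = cuda.to_device(host_buf, stream=stream)  # asynchronous H2D copy
d_buf.copy_to_host(host_buf, stream=stream)      # asynchronous D2H copy
stream.synchronize()   # wait until both transfers have completed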
def __init__( self, Nz, zmin, zmax, Nr, rmax, Nm, dt,
              v_comoving, use_galilean, boundaries, n_order,
              n_guard=None, n_damp=64, n_inject=None,
              exchange_period=None, use_all_mpi_ranks=True):
    """
    Initializes a communicator object.

    Parameters
    ----------
    Nz, Nr: int
        The number of cells in the global physical domain
        (i.e. without damp cells and guard cells)

    zmin, zmax, rmax: float
        The position of the edges of the global physical domain
        in z and r (i.e. without damp cells and guard cells)

    Nm: int
        The total number of modes

    dt: float
        The timestep of the simulation

    v_comoving: float or None, optional
        If this variable is None, the standard PSATD is used (default).
        Otherwise, the current is assumed to be "comoving",
        i.e. constant with respect to (z - v_comoving * t).
        This can be done in two ways: either by
        - Using a PSATD scheme that takes this hypothesis into account
        - Solving the PSATD scheme in a Galilean frame

    use_galilean: bool, optional
        Determines which one of the two above schemes is used.
        When use_galilean is true, the whole grid moves
        with a speed v_comoving

    boundaries: str
        Indicates how to exchange the fields at the left and right
        boundaries of the global simulation box.
        (Either 'periodic' or 'open')

    n_order: int
        The order of the stencil for the z derivatives.
        Use -1 for infinite order, otherwise use a positive, even
        number. In this case, the stencil extends up to approx.
        2*n_order cells on each side. (A finite-order stencil is
        required to have a localized field push, which allows
        simulations to be run in parallel on multiple MPI ranks.)

    n_guard: int, optional
        Number of guard cells to use at the left and right of a domain,
        when performing parallel (MPI) computation or when using open
        boundaries. Defaults to None, in which case the required number
        of guard cells for n_order is calculated automatically
        (approx. 2*n_order). When no MPI is used and the boundaries are
        open with an infinite-order stencil, n_guard defaults to 64,
        if not set otherwise.

    n_damp: int, optional
        Number of damping guard cells at the left and right of a
        simulation box, if a moving window is attached. The guard
        region at these areas (left/right of the moving window) is
        extended by n_damp cells, in order to smoothly damp the fields
        so that they do not wrap around. (Defaults to 64)

    n_inject: int, optional
        Number of injection cells (at the left and right) of a
        simulation box, for a simulation with open boundaries. The
        damping region needs to be additionally extended by n_inject
        cells at the outer edges, to have a region with zero fields
        where new particles can be injected. For symmetry reasons,
        those cells are added at both sides of the simulation box,
        although particles are typically injected only at the right
        side of the box.
        (Defaults to None, in which case it is set to n_guard/2
        automatically)

    exchange_period: int, optional
        Number of iterations between two particle exchanges.
        If set to None, the maximum exchange period is calculated
        automatically: within exchange_period timesteps, the particles
        should never be able to travel more than
        (n_guard/2 - particle_shape order) cells.
        (Setting exchange_period to small values can substantially
        affect the performance.)

    use_all_mpi_ranks: bool, optional
        - if `use_all_mpi_ranks` is True (default):
          All the MPI ranks will contribute to the same simulation,
          using domain decomposition to share the work.
        - if `use_all_mpi_ranks` is False:
          Each MPI rank will run an independent simulation.
          This can be useful when running parameter scans.
    """
    # Initialize global number of cells and modes
    self.Nr = Nr
    self.Nm = Nm
    self._Nz_global_domain = Nz
    self._zmin_global_domain = zmin
    # Get the distance dz between the cells
    # (longitudinal spacing of the grid)
    self.dz = (zmax - zmin)/self._Nz_global_domain

    # MPI Setup
    self.use_all_mpi_ranks = use_all_mpi_ranks
    if self.use_all_mpi_ranks and mpi_installed:
        self.mpi_comm = comm
        self.rank = self.mpi_comm.rank
        self.size = self.mpi_comm.size
    else:
        self.mpi_comm = None
        self.rank = 0
        self.size = 1

    # Get the rank of the left and the right domain
    self.left_proc = self.rank-1
    self.right_proc = self.rank+1
    # Correct these initial values by taking into account boundaries
    self.boundaries = boundaries
    if self.boundaries == 'periodic':
        # Periodic boundary conditions for the domains
        if self.rank == 0:
            self.left_proc = (self.size-1)
        if self.rank == self.size-1:
            self.right_proc = 0
    elif self.boundaries == 'open':
        # None means that the boundary is open
        if self.rank == 0:
            self.left_proc = None
        if self.rank == self.size-1:
            self.right_proc = None
    else:
        raise ValueError('Unrecognized boundaries: %s' %self.boundaries)

    # Initialize number of guard cells:
    # automatically calculate the required guard cells
    # for the given order (n_order)
    if n_guard is None:
        if n_order == -1:
            # Set n_guard to a fixed value of 64 in case of
            # open boundaries and an infinite-order stencil
            # (if not defined otherwise by the user)
            self.n_guard = 64
            # Raise an error if the user tries to use parallel MPI
            # computation with an infinite-order stencil.
            # This would give wrong results.
            if self.size != 1:
                raise ValueError(
                  'When running FBPIC with MPI decomposition, you need to \n'
                  'set the argument `n_order` of the `Simulation` object \n'
                  'to a positive value (e.g. n_order=32).')
        else:
            # Automatic calculation of the guard region size,
            # depending on the stencil order (n_order)
            stencil = get_stencil_reach( self._Nz_global_domain, self.dz,
                            c*dt, n_order, v_comoving, use_galilean )
            # approx 2*n_order (+1 because the moving window
            # shifts the grid by one cell during the PIC loop
            # and therefore, the guard region needs to be larger
            # by one cell)
            self.n_guard = stencil + 1
    else:
        # Otherwise: set the user-defined guard region size
        self.n_guard = n_guard
    # For a single proc and periodic boundaries, no need for guard cells
    if boundaries == 'periodic' and self.size == 1:
        self.n_guard = 0

    # Register damping cells
    self.n_damp = n_damp
    # For periodic boundaries, no need for damping cells
    if boundaries == 'periodic':
        self.n_damp = 0
        self.n_inject = 0
    else:
        # Register additional injection cells that are part of the
        # damping region and of size n_guard/2.
        if n_inject is None:
            self.n_inject = int(self.n_guard/2)
        else:
            # User-defined injection cells. Choose carefully.
            self.n_inject = n_inject

    # Initialize the period of the particle exchange and moving window
    if exchange_period is None:
        # Maximum number of cells a particle can travel in one timestep.
        # A safety factor of 2 is needed if there is a moving window
        # attached to the simulation, or in case a Galilean frame is used.
        cells_per_step = 2.*c*dt/self.dz
        # Maximum number of timesteps before a particle can reach the
        # end of half of the guard region, including the maximum number
        # of cells (+/-3) it can affect with a "cubic" particle shape
        # factor. (Particles are only allowed to reside in half of the
        # guard region, as this is the stencil reach of the current
        # correction.)
        self.exchange_period = int(((self.n_guard/2)-3)/cells_per_step)
        # Set exchange_period to 1 in the case of a single proc
        # and periodic boundary conditions.
        if self.size == 1 and boundaries == 'periodic':
            self.exchange_period = 1
        # Check that the calculated exchange_period is acceptable for
        # the given simulation parameters (i.e. that the guard region
        # is large enough).
        if self.exchange_period < 1:
            raise ValueError(
                'Guard region size is too small for the chosen timestep. '
                'In one timestep, a particle can travel more than '
                'n_guard/2 cells.')
    else:
        # User-defined exchange_period. Choose carefully.
        self.exchange_period = exchange_period

    # Initialize the moving window to None (see the method
    # set_moving_window in main.py to initialize a proper moving window)
    self.moving_win = None

    # Initialize a buffer handler object, for MPI communications
    if self.size > 1:
        self.mpi_buffers = BufferHandler( self.n_guard, Nr, Nm,
                                self.left_proc, self.right_proc )

    # Create damping arrays for the damping cells at the left
    # and right of the box in the case of "open" boundaries.
    if (self.n_damp+self.n_inject) > 0:
        if self.left_proc is None:
            # Create the damping array for the left proc
            self.left_damp = self.generate_damp_array(
                self.n_guard, self.n_damp, self.n_inject )
            if cuda_installed:
                self.d_left_damp = cuda.to_device( self.left_damp )
        if self.right_proc is None:
            # Create the damping array for the right proc
            self.right_damp = self.generate_damp_array(
                self.n_guard, self.n_damp, self.n_inject )
            if cuda_installed:
                self.d_right_damp = cuda.to_device( self.right_damp )
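# A worked example of the automatic exchange_period calculation above
# (illustrative numbers, not from an actual run): with n_guard = 64 and
# c*dt = dz (a typical choice for the longitudinal resolution):
#   cells_per_step  = 2.*c*dt/dz = 2.0
#   exchange_period = int((64/2 - 3)/2.0) = int(14.5) = 14
# i.e. particles are exchanged every 14 iterations, which guarantees
# that they cannot outrun half the guard region (minus the +/-3 cells
# of the cubic shape factor) between two exchanges.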
def __init__(self, n_guard, Nr, Nm, left_proc, right_proc):
    """
    Initialize the guard cell buffers for the fields.
    These buffers are used in order to group the MPI exchanges.

    Parameters
    ----------
    n_guard: int
        Number of guard cells

    Nr: int
        Number of points in the radial direction

    Nm: int
        Number of azimuthal modes

    left_proc, right_proc: int or None
        Rank of the procs to the left and to the right
        (None for open boundary)
    """
    # Register parameters
    self.Nr = Nr
    self.Nm = Nm
    self.n_guard = n_guard
    self.left_proc = left_proc
    self.right_proc = right_proc
    # Shortcut
    ng = self.n_guard

    # Allocate buffer arrays that are sent via MPI to exchange
    # the fields between domains (either replacing or adding fields)
    # Buffers are allocated for the left and right side of the domain

    # Allocate buffers on the CPU
    if cuda_installed:
        # Use cuda.pinned_array so that the CPU arrays are page-locked
        # (cannot be swapped out to disk, and the GPU can access them
        # via DMA)
        alloc_cpu = cuda.pinned_array
    else:
        # Use regular numpy arrays
        alloc_cpu = np.empty
    # Allocate buffers of different sizes, for the different exchange types
    self.send_l = {
        'E:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
        'B:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
        'J:add': alloc_cpu((3 * Nm, 2 * ng, Nr), dtype=np.complex128),
        'rho:add': alloc_cpu((Nm, 2 * ng, Nr), dtype=np.complex128) }
    self.send_r = {
        'E:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
        'B:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
        'J:add': alloc_cpu((3 * Nm, 2 * ng, Nr), dtype=np.complex128),
        'rho:add': alloc_cpu((Nm, 2 * ng, Nr), dtype=np.complex128) }
    self.recv_l = {
        'E:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
        'B:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
        'J:add': alloc_cpu((3 * Nm, 2 * ng, Nr), dtype=np.complex128),
        'rho:add': alloc_cpu((Nm, 2 * ng, Nr), dtype=np.complex128) }
    self.recv_r = {
        'E:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
        'B:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
        'J:add': alloc_cpu((3 * Nm, 2 * ng, Nr), dtype=np.complex128),
        'rho:add': alloc_cpu((Nm, 2 * ng, Nr), dtype=np.complex128) }

    # Allocate buffers on the GPU, for the different exchange types
    if cuda_installed:
        self.d_send_l = { key: cuda.to_device(value)
                          for key, value in self.send_l.items() }
        self.d_send_r = { key: cuda.to_device(value)
                          for key, value in self.send_r.items() }
        self.d_recv_l = { key: cuda.to_device(value)
                          for key, value in self.recv_l.items() }
        self.d_recv_r = { key: cuda.to_device(value)
                          for key, value in self.recv_r.items() }
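# A hypothetical sketch (method and call names assumed, not FBPIC's
# actual API) of how these per-type buffers could be exchanged with
# mpi4py: the dict keys identify the field and whether the received
# data replaces, or is added to, the local guard cells.
from mpi4py import MPI

def exchange_fields_sketch(self, mpi_comm):
    requests = []
    for key in self.send_l:
        if self.left_proc is not None:
            requests.append( mpi_comm.Isend(self.send_l[key], dest=self.left_proc) )
            requests.append( mpi_comm.Irecv(self.recv_l[key], source=self.left_proc) )
        if self.right_proc is not None:
            requests.append( mpi_comm.Isend(self.send_r[key], dest=self.right_proc) )
            requests.append( mpi_comm.Irecv(self.recv_r[key], source=self.right_proc) )
    # Wait for all non-blocking exchanges to complete
    MPI.Request.Waitall( requests )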
def __init__(self, p, m, Nr, Nz, rmax, use_cuda=False):
    """
    Calculate the r (position) and nu (frequency) grid
    on which the transform will operate.

    Also store auxiliary data needed for the transform.

    Parameters
    ----------
    p: int
        Order of the Hankel transform

    m: int
        The azimuthal mode for which the Hankel transform is calculated

    Nr, Nz: int
        Number of points in the r direction and z direction

    rmax: float
        Edge of the box in which the Hankel transform is taken
        (The function is assumed to be zero at that point.)

    use_cuda: bool, optional
        Whether to use the GPU for the Hankel transform
    """
    # Register whether to use the GPU.
    # If yes, initialize the corresponding cuda object
    self.use_cuda = use_cuda
    if self.use_cuda and not cuda_installed:
        self.use_cuda = False
        print('** Cuda not available for Hankel transform.')
        print('** Performing the Hankel transform on the CPU.')

    # Check that m has a valid value
    if m not in [p-1, p, p+1]:
        raise ValueError('m must be either p-1, p or p+1')

    # Register values of the arguments
    self.p = p
    self.m = m
    self.Nr = Nr
    self.rmax = rmax
    self.Nz = Nz

    # Calculate the zeros of the Bessel function
    if m != 0:
        # In this case, 0 is a zero of the Bessel function of order m.
        # It turns out that it is needed to reconstruct the signal for p=0.
        alphas = np.hstack((np.array([0.]), jn_zeros(m, Nr - 1)))
    else:
        alphas = jn_zeros(m, Nr)

    # Calculate the spectral grid
    self.nu = 1. / (2 * np.pi * rmax) * alphas

    # Calculate the spatial grid (a uniform grid with a half-cell offset)
    self.r = (rmax * 1. / Nr) * (np.arange(Nr) + 0.5)

    # Calculate and store the inverse matrix invM
    # (imposed by the constraints on the DHT of Bessel modes)
    # NB: When compared with the FBPIC article, all the matrices here
    # are calculated in transposed form. This is done so as to use the
    # `dot` and `gemm` functions, in the `transform` method.
    self.invM = np.empty((Nr, Nr))
    if p == m:
        p_denom = p + 1
    else:
        p_denom = p
    denom = np.pi * rmax**2 * jn(p_denom, alphas)**2
    num = jn(p, 2 * np.pi * self.r[np.newaxis, :] * self.nu[:, np.newaxis])
    # Get the inverse matrix
    if m != 0:
        self.invM[1:, :] = num[1:, :] / denom[1:, np.newaxis]
        # In this case, the functions are represented by Bessel functions
        # *and* an additional mode (below), which satisfies the same
        # algebraic relations for curl/div/grad as the regular Bessel
        # modes, with the value kperp=0.
        # The normalization of this mode is arbitrary, and is chosen
        # so that the condition number of invM is close to 1
        if p == m - 1:
            self.invM[0, :] = self.r**(m - 1) * 1. / (np.pi * rmax**(m + 1))
        else:
            self.invM[0, :] = 0.
    else:
        self.invM[:, :] = num[:, :] / denom[:, np.newaxis]

    # Calculate the matrix M by inverting invM
    self.M = np.empty((Nr, Nr))
    if m != 0 and p != m - 1:
        self.M[:, 1:] = np.linalg.pinv(self.invM[1:, :])
        self.M[:, 0] = 0.
    else:
        self.M = np.linalg.inv(self.invM)

    # Copy the matrices to the GPU if needed
    if self.use_cuda:
        self.d_M = cuda.to_device(self.M)
        self.d_invM = cuda.to_device(self.invM)

    # Initialize buffer arrays to store the complex Nz x Nr grid
    # as a real 2Nz x Nr grid, before performing the matrix product.
    # (This is because a matrix product of reals is faster than a
    # matrix product of complex numbers, and the real/complex
    # conversion is negligible.)
    if not self.use_cuda:
        # Initialize real buffer arrays on the CPU
        zero_array = np.zeros((2 * Nz, Nr), dtype=np.float64)
        self.array_in = zero_array.copy()
        self.array_out = zero_array.copy()
    else:
        # Initialize real buffer arrays on the GPU
        zero_array = np.zeros((2 * Nz, Nr), dtype=np.float64)
        self.d_in = cuda.to_device(zero_array)
        self.d_out = cuda.to_device(zero_array)
        # Initialize cuBLAS
        self.blas = device.get_cublas_handle()
        # Set the optimal number of CUDA threads per block
        # for the 2D real/complex copy (determined empirically)
        copy_tpb = (8, 32) if cuda_gpu_model == "V100" else (2, 16)
        # Initialize the threads per block and blocks per grid
        self.dim_grid, self.dim_block = cuda_tpb_bpg_2d(Nz, Nr, *copy_tpb)
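# A self-contained sanity check (illustrative, reusing the same math as
# above for the m = 0, p = 0 case): the forward and inverse Hankel
# matrices compose to the identity, so a radial profile is recovered
# after a forward + inverse transform.
import numpy as np
from scipy.special import jn, jn_zeros

Nr, rmax = 64, 1.0
alphas = jn_zeros(0, Nr)
nu = alphas / (2 * np.pi * rmax)
r = (rmax / Nr) * (np.arange(Nr) + 0.5)
# For p == m == 0, the denominator uses the Bessel order p+1 = 1
denom = np.pi * rmax**2 * jn(1, alphas)**2
invM = jn(0, 2 * np.pi * r[np.newaxis, :] * nu[:, np.newaxis]) \
       / denom[:, np.newaxis]
M = np.linalg.inv(invM)
# Round-trip a smooth test profile; the matrices are in transposed
# form, hence the right-multiplication (as in the `transform` method)
f = np.exp( -(r / (0.2 * rmax))**2 )[np.newaxis, :]
f_spect = f.dot(M)           # forward transform
f_back = f_spect.dot(invM)   # inverse transform
assert np.allclose(f, f_back)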