def deposit_virtual_particles_gpu(self, q, fieldtype, grid):
    """
    Deposit the virtual particles on the grid with the unsorted GPU
    kernels, one azimuthal mode after the other

    Parameters
    ----------
    q: scalar
        Factor applied to the excursions `excursion_x`, `excursion_y`
        (and to `vx`, `vy` when depositing J); it is also handed to the
        deposition kernels
        (presumably the charge of the virtual particles -- TODO confirm)

    fieldtype: string
        'rho' deposits the charge density, 'J' the current density.
        Nothing is deposited for any other value.

    grid: a list of grid objects (one per azimuthal mode)
        Provide the grid geometry and the arrays `rho` or `Jr`, `Jt`, `Jz`
        that are passed to the kernels
    """
    # Particle positions: baseline position plus scaled excursion
    pos_x = self.d_baseline_x + q * self.excursion_x
    pos_y = self.d_baseline_y + q * self.excursion_y

    if fieldtype == 'rho':
        # Charge density, mode by mode
        for mode, mode_grid in enumerate(grid):
            n_blocks, n_threads = cuda_tpb_bpg_1d(self.Ntot)
            deposit_rho_gpu_unsorted[n_blocks, n_threads](
                pos_x, pos_y, self.d_baseline_z, self.w, q,
                mode_grid.invdz, mode_grid.zmin, mode_grid.Nz,
                mode_grid.invdr, mode_grid.rmin, mode_grid.Nr,
                mode_grid.rho, mode,
                mode_grid.d_ruyten_linear_coef)

    elif fieldtype == 'J':
        # Transverse particle velocities, scaled like the excursions
        vel_x = q * self.vx
        vel_y = q * self.vy
        # Current density, mode by mode
        for mode, mode_grid in enumerate(grid):
            n_blocks, n_threads = cuda_tpb_bpg_1d(self.Ntot)
            deposit_J_gpu_unsorted[n_blocks, n_threads](
                pos_x, pos_y, self.d_baseline_z, self.w, q,
                vel_x, vel_y, self.d_vz,
                mode_grid.invdz, mode_grid.zmin, mode_grid.Nz,
                mode_grid.invdr, mode_grid.rmin, mode_grid.Nr,
                mode_grid.Jr, mode_grid.Jt, mode_grid.Jz, mode,
                mode_grid.d_ruyten_linear_coef)
def reallocate_and_copy_old( species, use_cuda, old_Ntot, new_Ntot ):
    """
    Copy the particle quantities of `species` from arrays of size `old_Ntot`
    into arrays of size `new_Ntot`. Set these arrays as attributes of
    `species`.

    (The first `old_Ntot` elements of the new arrays are copied from the old
    arrays ; the last elements are left empty and expected to be filled later.)

    When `use_cuda` is True, this function also reallocates the sorting
    buffers for GPU, with a size `new_Ntot`

    Parameters
    ----------
    species: an fbpic Particles object

    use_cuda: bool
        If True, the auxiliary sorting arrays are (re)allocated on the GPU.
        The particle arrays themselves are reallocated where the existing
        data lives: this is detected from the type of `species.w`.

    old_Ntot, new_Ntot: int
        Size of the old and new arrays (with old_Ntot < new_Ntot)
    """
    # Check if the data is on the GPU
    data_on_gpu = (type(species.w) is not np.ndarray)

    # On GPU, use one thread per particle
    if data_on_gpu:
        ptcl_grid_1d, ptcl_block_1d = cuda_tpb_bpg_1d( old_Ntot )

    def grow( old_array, dtype ):
        # Allocate an array of size `new_Ntot` on the same side (CPU/GPU)
        # as the existing data and copy the first `old_Ntot` elements
        new_array = allocate_empty( new_Ntot, data_on_gpu, dtype=dtype )
        if data_on_gpu:
            copy_particle_data_cuda[ ptcl_grid_1d, ptcl_block_1d ](
                old_Ntot, old_array, new_array )
        else:
            copy_particle_data_numba( old_Ntot, old_array, new_array )
        return new_array

    # Iterate over particle attributes and copy the old particles
    for attr in ['x', 'y', 'z', 'ux', 'uy', 'uz', 'w', 'inv_gamma',
                 'Ex', 'Ey', 'Ez', 'Bx', 'By', 'Bz']:
        setattr( species, attr, grow( getattr(species, attr), np.float64 ) )

    # Copy the tracking id, if needed
    # (allocated according to `data_on_gpu`, not `use_cuda`, so that the
    # new array always matches the copy routine that fills it)
    if species.tracker is not None:
        species.tracker.id = grow( species.tracker.id, np.uint64 )

    # Allocate the auxiliary arrays for GPU
    if use_cuda:
        species.cell_idx = cuda.device_array((new_Ntot,), dtype=np.int32)
        species.sorted_idx = cuda.device_array((new_Ntot,), dtype=np.uint32)
        species.sorting_buffer = cuda.device_array((new_Ntot,),
                                                   dtype=np.float64)
        if species.n_integer_quantities > 0:
            species.int_sorting_buffer = \
                cuda.device_array( (new_Ntot,), dtype=np.uint64 )

    # Modify the total number of particles
    species.Ntot = new_Ntot
def copy_rho_buffer(self, iz_min, grid):
    """
    Accumulate the small-size array `rho_buffer` into the full-size
    arrays `rho` of the grid, for every azimuthal mode

    Parameters
    ----------
    iz_min: int
        The z index in the full-size array, that corresponds to index 0
        in the small-size array (i.e. position at which to add the
        small-size array into the full-size one)

    grid: a list of InterpolationGrid objects
        Contains the full-size array rho
    """
    if type(grid[0].rho) is not np.ndarray:
        # The full-size array lives on the GPU:
        # first send the small-size buffer to the device ...
        cuda.to_device(self.rho_buffer, to=self.d_rho_buffer)
        # ... then add it to the full-size array, mode by mode
        n_blocks, n_threads = cuda_tpb_bpg_1d(grid[0].Nr, TPB=64)
        for mode, mode_grid in enumerate(grid):
            add_rho_to_gpu_array[n_blocks, n_threads](
                iz_min, self.d_rho_buffer, mode_grid.rho, mode)
        return

    # The full-size array lives on the CPU:
    # the buffer is added to the two z slices starting at `iz_min`
    z_window = slice(iz_min, iz_min + 2)
    for mode, mode_grid in enumerate(grid):
        mode_grid.rho[z_window] += self.rho_buffer[mode]
def push_x( self, dt, x_push=1., y_push=1., z_push=1. ) :
    """
    Move the particles over `dt`, using their current momenta (ux, uy, uz).

    Parameters:
    -----------
    dt: float, seconds
        The timestep that should be used for the push
        (This can typically be half of the simulation timestep)

    x_push, y_push, z_push: float, dimensionless
        Multiplying coefficient for the momenta in x, y and z
        e.g. if x_push=1., the particles are pushed forward in x
             if x_push=-1., the particles are pushed backward in x
    """
    if not self.use_cuda:
        # CPU version
        push_x_numba( self.x, self.y, self.z,
                      self.ux, self.uy, self.uz,
                      self.inv_gamma, self.Ntot,
                      dt, x_push, y_push, z_push )
        return

    # GPU (CUDA) version: one launch over the `Ntot` particles
    n_blocks, n_threads = cuda_tpb_bpg_1d( self.Ntot )
    push_x_gpu[n_blocks, n_threads](
        self.x, self.y, self.z,
        self.ux, self.uy, self.uz,
        self.inv_gamma, dt, x_push, y_push, z_push )
    # Moving the particles invalidates the previous sorting
    self.sorted = False
def sort_particles(self, fld):
    """
    Sort the particles, in four steps:
        1. Get the field cell index of each particle
        2. Sort the field cell indices
        3. Perform the parallel prefix sum
        4. Rearrange the particle arrays

    Parameter
    ----------
    fld : a Field object
        Contains the list of InterpolationGrid objects with
        the field values as well as the prefix sum.
    """
    # Launch configurations: one over the particles,
    # one over the entries of the prefix sum
    ptcl_blocks, ptcl_threads = cuda_tpb_bpg_1d(self.Ntot)
    cell_blocks, cell_threads = \
        cuda_tpb_bpg_1d( self.prefix_sum.shape[0] )

    # The grid geometry is read from the first interpolation grid
    grid0 = fld.interp[0]

    # Step 1: cell index of each particle
    # (defined by iz_lower and ir_lower)
    get_cell_idx_per_particle[ptcl_blocks, ptcl_threads](
        self.cell_idx, self.sorted_idx,
        self.x, self.y, self.z,
        grid0.invdz, grid0.zmin, grid0.Nz,
        grid0.invdr, grid0.rmin, grid0.Nr)

    # Step 2: sort the cell index array and modify `sorted_idx`
    # accordingly. The value of `sorted_idx` corresponds to the index
    # of the sorted particle in the other particle arrays.
    sort_particles_per_cell(self.cell_idx, self.sorted_idx)

    # Step 3: reset the old prefix sum, then rebuild it with the
    # inclusive parallel prefix sum
    self.prefix_sum_shift = 0
    prefill_prefix_sum[cell_blocks, cell_threads](
        self.cell_idx, self.prefix_sum, self.Ntot)
    incl_prefix_sum[ptcl_blocks, ptcl_threads](
        self.cell_idx, self.prefix_sum)

    # Step 4: reorder the particle arrays
    self.rearrange_particle_arrays()
def apply_expression(self, ptcl, t):
    """
    Apply the external field function to the particles

    This function is called at each timestep, after field gathering
    in the step function.

    Parameters
    ----------
    ptcl: a list of Particles objects
        The particles on which the external fields will be applied

    t: float (seconds)
        The time in the simulation
    """
    for species in ptcl:

        # If a species was specified at initialization,
        # apply the field only on this species
        if (self.species is not None) and (species is not self.species):
            continue

        # Skip species without macroparticles. This must only skip the
        # current species: the remaining species of the list still have
        # to receive the external field.
        if species.Ntot <= 0:
            continue

        # Loop over the different fields involved
        for (fieldtype, amplitude) in self.fieldtypes_and_amplitudes:
            field = getattr(species, fieldtype)

            if type(field) is np.ndarray:
                # The field array is a numpy array: call the CPU
                # function, which updates `field` through `out`
                self.cpu_func(field,
                              species.x, species.y, species.z,
                              t, amplitude, self.length_scale,
                              out=field)
            else:
                # Otherwise call the GPU kernel, with a launch
                # configuration sized on the number of particles
                dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d(species.Ntot)
                self.gpu_func[dim_grid_1d, dim_block_1d](
                    field, species.x, species.y, species.z,
                    t, amplitude, self.length_scale)
def rearrange_particle_arrays(self):
    """
    Reorder the particle data arrays according to the sorted index array
    `sorted_idx`.

    Each array is rewritten, in sorted order, into a spare buffer; the
    buffer then becomes the particle array and the former particle array
    becomes the spare buffer for the next quantity.
    """
    # One launch configuration over the particles, shared by all kernels
    n_blocks, n_threads = cuda_tpb_bpg_1d(self.Ntot)

    # (owner, attribute name) pairs of the float quantities
    float_quantities = [ (self, name) for name in
        ('x', 'y', 'z', 'ux', 'uy', 'uz', 'w', 'inv_gamma') ]
    if self.keep_fields_sorted:
        float_quantities += [ (self, name) for name in
            ('Ex', 'Ey', 'Ez', 'Bx', 'By', 'Bz') ]
    if self.ionizer is not None:
        float_quantities += [ (self.ionizer, 'w_times_level') ]

    for owner, name in float_quantities:
        unsorted_array = getattr(owner, name)
        # Write the data, in sorted order, into the float buffer
        write_sorting_buffer[n_blocks, n_threads](
            self.sorted_idx, unsorted_array, self.sorting_buffer)
        # Swap: the buffer becomes the particle array,
        # and the old particle array becomes the buffer
        setattr(owner, name, self.sorting_buffer)
        self.sorting_buffer = unsorted_array

    # (owner, attribute name) pairs of the integer quantities
    int_quantities = []
    if self.tracker is not None:
        int_quantities += [ (self.tracker, 'id') ]
    if self.ionizer is not None:
        int_quantities += [ (self.ionizer, 'ionization_level') ]

    for owner, name in int_quantities:
        unsorted_array = getattr(owner, name)
        # Same procedure, with the integer buffer
        write_sorting_buffer[n_blocks, n_threads](
            self.sorted_idx, unsorted_array, self.int_sorting_buffer)
        setattr(owner, name, self.int_sorting_buffer)
        self.int_sorting_buffer = unsorted_array
def generate_new_ids_gpu( self, i_start, i_end ):
    """
    Fill the array `id` in place with new ids, from index `i_start`
    (included) to index `i_end` (excluded), and advance
    `next_attributed_id` accordingly

    Parameters
    ----------
    i_start, i_end: int
        The indices between which new id should be generated
    """
    # Number of ids to generate
    n_new = i_end - i_start
    n_blocks, n_threads = cuda_tpb_bpg_1d( n_new )

    # The kernel writes into self.id, between i_start and i_end
    generate_ids_gpu[ n_blocks, n_threads ](
        self.id, i_start, i_end, self.next_attributed_id, self.id_step )

    # Move the id counter past the `n_new` ids (spaced by `id_step`)
    id_increment = n_new*self.id_step
    self.next_attributed_id = self.next_attributed_id + id_increment
def push_p( self ) :
    """
    Advance the particles' momenta over one timestep, using the Vay pusher
    Reference : Vay, Physics of Plasmas 15, 056701 (2008)

    The momenta (ux, uy, uz) are assumed to be initially one half-timestep
    *behind* the positions (x, y, z); the push brings them one
    half-timestep *ahead* of the positions.
    """
    # Neutral particles (e.g. photons) are not pushed
    if self.q == 0:
        return

    # Ionizable species can have a charge that depends on the
    # macroparticle, and hence require a different function
    is_ionizable = self.ionizer is not None

    if self.use_cuda:
        # GPU (CUDA) version: one launch over the `Ntot` particles
        n_blocks, n_threads = cuda_tpb_bpg_1d( self.Ntot )
        if is_ionizable:
            push_p_ioniz_gpu[n_blocks, n_threads](
                self.ux, self.uy, self.uz, self.inv_gamma,
                self.Ex, self.Ey, self.Ez,
                self.Bx, self.By, self.Bz,
                self.m, self.Ntot, self.dt,
                self.ionizer.ionization_level )
        else:
            push_p_gpu[n_blocks, n_threads](
                self.ux, self.uy, self.uz, self.inv_gamma,
                self.Ex, self.Ey, self.Ez,
                self.Bx, self.By, self.Bz,
                self.q, self.m, self.Ntot, self.dt )
        return

    # CPU version
    if is_ionizable:
        push_p_ioniz_numba(
            self.ux, self.uy, self.uz, self.inv_gamma,
            self.Ex, self.Ey, self.Ez,
            self.Bx, self.By, self.Bz,
            self.m, self.Ntot, self.dt,
            self.ionizer.ionization_level )
    else:
        push_p_numba(
            self.ux, self.uy, self.uz, self.inv_gamma,
            self.Ex, self.Ey, self.Ez,
            self.Bx, self.By, self.Bz,
            self.q, self.m, self.Ntot, self.dt )
def shift_particles_periodic_subdomain( species, zmin, zmax ):
    """
    Assuming the local subdomain is periodic:
    Shift the particle positions by an integer number of box lengths,
    so that particles that are outside are brought back inside the
    physical domain

    Parameters:
    -----------
    species: an fbpic.Species object
        Contains the particle data
    zmin, zmax: floats
        Positions of the edges of the periodic box
    """
    if not species.use_cuda:
        # Perform the shift on the CPU
        shift_particles_periodic_numba( species.z, zmin, zmax )
        return

    # Perform the shift on the GPU (launch sized on `species.Ntot`)
    n_blocks, n_threads = cuda_tpb_bpg_1d( species.Ntot )
    shift_particles_periodic_cuda[ n_blocks, n_threads ](
        species.z, zmin, zmax )
def halfpush_x( self ) :
    """
    Advance the particles' positions over one half-timestep

    The positions (x, y, z) are assumed to be initially either
    one half-timestep *behind* the momenta (ux, uy, uz), or at the
    same timestep as the momenta.
    """
    # NOTE(review): the full `self.dt` is handed to the push routines;
    # presumably they apply the half-step factor themselves -- confirm.
    if not self.use_cuda:
        # CPU version
        push_x_numba( self.x, self.y, self.z,
                      self.ux, self.uy, self.uz,
                      self.inv_gamma, self.Ntot, self.dt )
        return

    # GPU (CUDA) version: one launch over the `Ntot` particles
    n_blocks, n_threads = cuda_tpb_bpg_1d( self.Ntot )
    push_x_gpu[n_blocks, n_threads](
        self.x, self.y, self.z,
        self.ux, self.uy, self.uz,
        self.inv_gamma, self.dt )
    # Moving the particles invalidates the previous sorting
    self.sorted = False
def copy_J_buffer(self, iz_min, grid):
    """
    Accumulate the small-size arrays Jr_buffer, Jt_buffer, Jz_buffer
    into the full-size arrays Jr, Jt, Jz of the grid, for every mode

    Parameters
    ----------
    iz_min: int
        The z index in the full-size array, that corresponds to index 0
        in the small-size array (i.e. position at which to add the
        small-size array into the full-size one)

    grid: a list of InterpolationGrid objects
        Contains the full-size array Jr, Jt, Jz
    """
    if type(grid[0].Jr) is not np.ndarray:
        # The full-size arrays live on the GPU:
        # first send the small-size buffers to the device ...
        self.d_Jr_buffer.set(self.Jr_buffer)
        self.d_Jt_buffer.set(self.Jt_buffer)
        self.d_Jz_buffer.set(self.Jz_buffer)
        # ... then add them to the full-size arrays, mode by mode
        n_blocks, n_threads = cuda_tpb_bpg_1d(grid[0].Nr, TPB=64)
        for mode, mode_grid in enumerate(grid):
            add_J_to_gpu_array[n_blocks, n_threads](
                iz_min,
                self.d_Jr_buffer, self.d_Jt_buffer, self.d_Jz_buffer,
                mode_grid.Jr, mode_grid.Jt, mode_grid.Jz, mode)
        return

    # The full-size arrays live on the CPU:
    # the buffers are added to the two z slices starting at `iz_min`
    z_window = slice(iz_min, iz_min + 2)
    for mode, mode_grid in enumerate(grid):
        mode_grid.Jr[z_window] += self.Jr_buffer[mode]
        mode_grid.Jt[z_window] += self.Jt_buffer[mode]
        mode_grid.Jz[z_window] += self.Jz_buffer[mode]
def handle_ionization(self, ion):
    """
    Handle ionization, either on CPU or GPU

    - For each ion macroparticle, decide whether it is going to
      be further ionized during this timestep, based on the ADK rate.
    - Add the electrons created from ionization to the `target_species`

    Parameters:
    -----------
    ion: an fbpic.Particles object
        The ionizable species, from which new electrons are created.
    """
    # Nothing to do if there are no ions
    n_ions = ion.Ntot
    if n_ions == 0:
        return

    # The particles are processed in batches
    # (of typically 10, 20 particles)
    N_batch = int(n_ions / self.batch_size) + 1

    use_cuda = self.use_cuda

    # Number of levels that should be distinguished
    if self.store_electrons_per_level:
        n_levels = self.level_max - self.level_start
    else:
        n_levels = 1

    # Temporary arrays (on CPU or GPU, depending on `use_cuda`)
    ionized_from = allocate_empty(n_ions, use_cuda, dtype=np.int16)
    n_ionized = allocate_empty((n_levels, N_batch), use_cuda,
                               dtype=np.int64)
    # One random number per ion
    if use_cuda:
        random_draw = cupy.random.rand(n_ions, dtype=cupy.float32)
    else:
        random_draw = np.random.rand(n_ions)

    # Ion fields, shared by the argument lists below
    ion_fields = (ion.Ex, ion.Ey, ion.Ez, ion.Bx, ion.By, ion.Bz)

    # Determine the ions that are ionized, and count them in each batch
    # (one thread per batch on GPU; parallel loop over batches on CPU)
    # Both implementations take the same arguments.
    ionize_args = (
        N_batch, self.batch_size, n_ions,
        self.level_start, self.level_max, n_levels,
        n_ionized, ionized_from, self.ionization_level, random_draw,
        self.adk_prefactor, self.adk_power, self.adk_exp_prefactor,
        ion.ux, ion.uy, ion.uz ) + ion_fields + (
        ion.w, self.w_times_level )
    if use_cuda:
        batch_grid_1d, batch_block_1d = cuda_tpb_bpg_1d(N_batch)
        ionize_ions_cuda[batch_grid_1d, batch_block_1d](*ionize_args)
    else:
        ionize_ions_numba(*ionize_args)

    # Count the total number of new electrons (operation always performed
    # on the CPU, as this is typically difficult on the GPU)
    if use_cuda:
        n_ionized = n_ionized.copy_to_host()
    cumulative_n_ionized = perform_cumsum_2d(n_ionized)
    # If no new particle was created, skip the rest of this function
    if np.all(cumulative_n_ionized[:, -1] == 0):
        return
    # The copy kernels need the cumulated numbers where they run:
    # send them to the GPU if needed (the CPU copy is kept, since it
    # is used below to size the new electron arrays)
    if use_cuda:
        cumulative_for_copy = cuda.to_device(cumulative_n_ionized)
    else:
        cumulative_for_copy = cumulative_n_ionized

    # Ion quantities that are copied onto the new electrons
    ion_quantities = (
        ion.x, ion.y, ion.z, ion.inv_gamma,
        ion.ux, ion.uy, ion.uz, ion.w ) + ion_fields

    # Loop over the electron species associated to each level
    # (when store_electrons_per_level is False, there is a single species)
    assert len(self.target_species) == n_levels
    for i_level, elec in enumerate(self.target_species):
        # Reallocate the electron species (on CPU or GPU depending on
        # `use_cuda`) to accommodate the electrons produced by
        # ionization, and copy the old electrons to the new arrays
        old_Ntot = elec.Ntot
        new_Ntot = old_Ntot + cumulative_n_ionized[i_level, -1]
        reallocate_and_copy_old(elec, use_cuda, old_Ntot, new_Ntot)

        # Argument list of the copy routines (the electron arrays must
        # be read *after* the reallocation above)
        copy_args = (
            N_batch, self.batch_size, old_Ntot, n_ions,
            cumulative_for_copy, ionized_from,
            i_level, self.store_electrons_per_level,
            elec.x, elec.y, elec.z, elec.inv_gamma,
            elec.ux, elec.uy, elec.uz, elec.w,
            elec.Ex, elec.Ey, elec.Ez,
            elec.Bx, elec.By, elec.Bz ) + ion_quantities

        # Create the new electrons from ionization (one thread per batch)
        if use_cuda:
            copy_ionized_electrons_cuda[batch_grid_1d, batch_block_1d](
                *copy_args)
            # Mark the new electrons as unsorted
            elec.sorted = False
        else:
            copy_ionized_electrons_numba(*copy_args)

        # If the electrons are tracked, generate new ids
        # (on CPU or GPU depending on `use_cuda`)
        generate_new_ids(elec, old_Ntot, new_Ntot)
def deposit( self, fld, fieldtype ) :
    """
    Deposit the charge or the current of the particles onto the grid

    The particle positions (and momenta, when depositing J) are expected
    to be at the same timestep as the field that is deposited.

    Parameter
    ----------
    fld : a Field object
         Contains the list of InterpolationGrid objects with
         the field values as well as the prefix sum.

    fieldtype : string
         Indicates which field to deposit
         Either 'J' or 'rho'
    """
    # Neutral particles (e.g. photons) do not deposit anything
    if self.q == 0:
        return

    # Shortcuts and safe-guards
    grid = fld.interp
    assert fieldtype in ['rho', 'J']
    assert self.particle_shape in ['linear', 'cubic']
    shape = self.particle_shape
    deposit_rho = (fieldtype == 'rho')
    deposit_J = (fieldtype == 'J')

    # When running on GPU: first sort the arrays of particles
    if self.use_cuda and not self.sorted:
        self.sort_particles(fld=fld)
        # The particles are now sorted and rearranged
        self.sorted = True

    # For ionizable atoms: set the effective weight to the weight
    # times the ionization level (on GPU, this needs to be done *after*
    # sorting, otherwise `weight` is not equal to the corresponding array)
    if self.ionizer is None:
        weight = self.w
    else:
        weight = self.ionizer.w_times_level

    # Per-particle arguments (read *after* sorting, for the same reason);
    # the momenta are only needed for the current
    ptcl_args = (self.x, self.y, self.z, weight, self.q)
    if deposit_J:
        ptcl_args += (self.ux, self.uy, self.uz, self.inv_gamma)

    def geometry(one_grid):
        # Grid geometry arguments, in the order expected by the kernels
        return ( one_grid.invdz, one_grid.zmin, one_grid.Nz,
                 one_grid.invdr, one_grid.rmin, one_grid.Nr )

    # GPU (CUDA) version
    if self.use_cuda:
        # Launch configuration, sized on the prefix sum
        dim_grid_2d_flat, dim_block_2d_flat = \
            cuda_tpb_bpg_1d( self.prefix_sum.shape[0], TPB=64 )
        Nm = len( grid )
        sorting_args = (self.cell_idx, self.prefix_sum)

        # Select the kernels: one handles two modes in a single launch,
        # the other one handles a single mode per launch
        if deposit_rho:
            if shape == 'linear':
                two_mode_kernel = deposit_rho_gpu_linear
                one_mode_kernel = deposit_rho_gpu_linear_one_mode
            elif shape == 'cubic':
                two_mode_kernel = deposit_rho_gpu_cubic
                one_mode_kernel = deposit_rho_gpu_cubic_one_mode
            else:
                return
        elif deposit_J:
            if shape == 'linear':
                two_mode_kernel = deposit_J_gpu_linear
                one_mode_kernel = deposit_J_gpu_linear_one_mode
            elif shape == 'cubic':
                two_mode_kernel = deposit_J_gpu_cubic
                one_mode_kernel = deposit_J_gpu_cubic_one_mode
            else:
                return
        else:
            return

        if Nm == 2:
            # Arrays of both modes, interleaved component by component
            if deposit_rho:
                target_args = (grid[0].rho, grid[1].rho)
            else:
                target_args = (grid[0].Jr, grid[1].Jr,
                               grid[0].Jt, grid[1].Jt,
                               grid[0].Jz, grid[1].Jz)
            two_mode_kernel[dim_grid_2d_flat, dim_block_2d_flat](
                *(ptcl_args + geometry(grid[0])
                  + target_args + sorting_args))
        else:
            for m in range(Nm):
                # Arrays of mode m, followed by the mode number
                if deposit_rho:
                    target_args = (grid[m].rho, m)
                else:
                    target_args = (grid[m].Jr, grid[m].Jt, grid[m].Jz, m)
                one_mode_kernel[dim_grid_2d_flat, dim_block_2d_flat](
                    *(ptcl_args + geometry(grid[m])
                      + target_args + sorting_args))

    # CPU version
    else:
        # Divide particles in chunks (each chunk is handled by a different
        # thread) and register the indices that bound each chunks
        ptcl_chunk_indices = get_chunk_indices(self.Ntot, nthreads)
        threading_args = (fld.Nm, nthreads, ptcl_chunk_indices)

        # The threaded functions receive the global arrays of `fld`
        if deposit_rho:
            if shape == 'linear':
                deposit_func = deposit_rho_numba_linear
            elif shape == 'cubic':
                deposit_func = deposit_rho_numba_cubic
            else:
                return
            target_args = (fld.rho_global,)
        elif deposit_J:
            if shape == 'linear':
                deposit_func = deposit_J_numba_linear
            elif shape == 'cubic':
                deposit_func = deposit_J_numba_cubic
            else:
                return
            target_args = (fld.Jr_global, fld.Jt_global, fld.Jz_global)
        else:
            return

        deposit_func(
            *(ptcl_args + geometry(grid[0])
              + target_args + threading_args))
def gather( self, grid ) :
    """
    Gather the fields onto the macroparticles

    This assumes that the particle positions are currently at
    the same timestep as the field that is to be gathered.

    Parameter
    ----------
    grid : a list of InterpolationGrid objects
         (one InterpolationGrid object per azimuthal mode)
         Contains the field values on the interpolation grid

    Raises
    ------
    ValueError
        If `self.particle_shape` is neither 'linear' nor 'cubic'
        (not raised for neutral particles, which are skipped beforehand)
    """
    # Skip gathering for neutral particles (e.g. photons)
    if self.q == 0:
        return

    # Number of modes
    Nm = len(grid)

    # Validate the particle shape once, for both the CPU and GPU paths
    # (adjacent string literals keep the message free of the stray
    # whitespace that a backslash continuation inside a string inserts)
    shape = self.particle_shape
    if shape not in ('linear', 'cubic'):
        raise ValueError(
            "`particle_shape` should be either 'linear' or 'cubic' "
            "but is `%s`" % shape)

    # Particle arrays shared by all the gathering routines
    ptcl_positions = (self.x, self.y, self.z)
    ptcl_fields = (self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz)
    # Arguments of the routines that erase the particle fields
    erase_args = ptcl_fields + (self.Ntot,)

    def two_mode_args():
        # Arguments of the routines optimized for exactly 2 modes
        g0, g1 = grid[0], grid[1]
        return ptcl_positions + (
            g0.invdz, g0.zmin, g0.Nz,
            g0.invdr, g0.rmin, g0.Nr,
            g0.Er, g0.Et, g0.Ez,
            g1.Er, g1.Et, g1.Ez,
            g0.Br, g0.Bt, g0.Bz,
            g1.Br, g1.Bt, g1.Bz ) + ptcl_fields

    def one_mode_args(m):
        # Arguments of the generic routines, for the single mode `m`
        gm = grid[m]
        return ptcl_positions + (
            gm.invdz, gm.zmin, gm.Nz,
            gm.invdr, gm.rmin, gm.Nr,
            gm.Er, gm.Et, gm.Ez,
            gm.Br, gm.Bt, gm.Bz, m ) + ptcl_fields

    # GPU (CUDA) version
    if self.use_cuda:
        # Get the threads per block and the blocks per grid
        dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot, TPB=64 )
        # Select the kernels for the gathering of E and B fields
        if shape == 'linear':
            two_mode_kernel = gather_field_gpu_linear
            one_mode_kernel = gather_field_gpu_linear_one_mode
        else:
            two_mode_kernel = gather_field_gpu_cubic
            one_mode_kernel = gather_field_gpu_cubic_one_mode

        if Nm == 2:
            # Optimized version for 2 modes
            two_mode_kernel[dim_grid_1d, dim_block_1d](*two_mode_args())
        else:
            # Generic version for arbitrary number of modes:
            # erase the particle fields, then process the modes one by one
            erase_eb_cuda[dim_grid_1d, dim_block_1d](*erase_args)
            for m in range(Nm):
                one_mode_kernel[dim_grid_1d, dim_block_1d](
                    *one_mode_args(m))

    # CPU version
    else:
        if shape == 'linear':
            two_mode_func = gather_field_numba_linear
            one_mode_func = gather_field_numba_linear_one_mode
            threading_args = ()
        else:
            two_mode_func = gather_field_numba_cubic
            one_mode_func = gather_field_numba_cubic_one_mode
            # Divide particles into chunks (each chunk is handled by a
            # different thread) and return the indices that bound chunks;
            # only the cubic functions take these extra arguments
            ptcl_chunk_indices = get_chunk_indices(self.Ntot, nthreads)
            threading_args = (nthreads, ptcl_chunk_indices)

        if Nm == 2:
            # Optimized version for 2 modes
            two_mode_func(*(two_mode_args() + threading_args))
        else:
            # Generic version for arbitrary number of modes
            erase_eb_numba(*erase_args)
            for m in range(Nm):
                one_mode_func(*(one_mode_args(m) + threading_args))
def push_p( self, t ) :
    """
    Advance the particles' momenta over one timestep, using the Vay pusher
    Reference : Vay, Physics of Plasmas 15, 056701 (2008)

    The momenta (ux, uy, uz) are assumed to be initially one half-timestep
    *behind* the positions (x, y, z); the push brings them one
    half-timestep *ahead* of the positions.

    Parameters
    ----------
    t: float
        The current simulation time
        (Useful for particles that are ballistic before a given plane)
    """
    # Neutral particles (e.g. photons) are not pushed
    if self.q == 0:
        return

    # For particles that are ballistic before a plane,
    # get the current position of the plane
    z_plane = None
    if isinstance( self.injector, BallisticBeforePlane ):
        z_plane = self.injector.get_current_plane_position( t )
        if self.ionizer is not None:
            raise NotImplementedError('Ballistic injection before a plane '
                'is not implemented for ionizable particles.')

    # Arguments shared by all the pushers
    momenta_and_fields = (
        self.ux, self.uy, self.uz, self.inv_gamma,
        self.Ex, self.Ey, self.Ez,
        self.Bx, self.By, self.Bz )

    # Select the variant of the pusher, and its arguments
    if self.ionizer is not None:
        # Ionizable species can have a charge that depends on the
        # macroparticle, and hence require a different function
        variant = 'ioniz'
        pusher_args = momenta_and_fields + (
            self.m, self.Ntot, self.dt, self.ionizer.ionization_level )
    elif z_plane is not None:
        # Particles that are ballistic before a plane also
        # require a different pusher
        variant = 'after_plane'
        pusher_args = (self.z, z_plane) + momenta_and_fields + (
            self.q, self.m, self.Ntot, self.dt )
    else:
        # Standard pusher
        variant = 'standard'
        pusher_args = momenta_and_fields + (
            self.q, self.m, self.Ntot, self.dt )

    if self.use_cuda:
        # GPU (CUDA) version: one launch over the `Ntot` particles
        n_blocks, n_threads = cuda_tpb_bpg_1d( self.Ntot )
        if variant == 'ioniz':
            push_p_ioniz_gpu[n_blocks, n_threads](*pusher_args)
        elif variant == 'after_plane':
            push_p_after_plane_gpu[n_blocks, n_threads](*pusher_args)
        else:
            push_p_gpu[n_blocks, n_threads](*pusher_args)
    else:
        # CPU version
        if variant == 'ioniz':
            push_p_ioniz_numba(*pusher_args)
        elif variant == 'after_plane':
            push_p_after_plane_numba(*pusher_args)
        else:
            push_p_numba(*pusher_args)
def add_buffers_gpu( species, float_recv_left, float_recv_right,
                     uint_recv_left, uint_recv_right):
    """
    Merge the particles received from the neighboring processors
    (recv_left and recv_right) with the particles already in `species`.

    For each particle quantity, a new device array is allocated, filled
    with the left-received, existing and right-received particles (in
    that order) and set as the new attribute. `species.Ntot` is updated.

    Parameters
    ----------
    species: a Particles object
        Contain the particles that stayed on the present processors

    float_recv_left, float_recv_right, uint_recv_left, uint_recv_right:
        arrays of shape (n_float,Nptcl) and (n_int,Nptcl)
        where Nptcl is the number of particles that are received from the
        left proc and right proc respectively, and where n_float and n_int
        are the number of float and integer quantities respectively
        These arrays are always on the CPU (since they were used for MPI)
    """
    # Number of particles in each of the three groups
    n_stay = species.Ntot
    n_left = float_recv_left.shape[1]
    n_right = float_recv_right.shape[1]
    new_Ntot = n_left + n_stay + n_right if False else n_stay + n_left + n_right

    # Launch configuration for each group
    left_grid, left_block = cuda_tpb_bpg_1d( n_left )
    right_grid, right_block = cuda_tpb_bpg_1d( n_right )
    stay_grid, stay_block = cuda_tpb_bpg_1d( n_stay )

    # Float quantities, as (owner object, attribute name) pairs
    # (row i of the float receive buffers is used for the i-th pair)
    float_attrs = [ (species, name) for name in
        ('x', 'y', 'z', 'ux', 'uy', 'uz', 'inv_gamma', 'w') ]
    if species.ionizer is not None:
        float_attrs.append( (species.ionizer, 'w_times_level') )

    # Integer quantities, as (owner object, attribute name) pairs
    uint_attrs = []
    if species.tracker is not None:
        uint_attrs.append( (species.tracker, 'id') )
    if species.ionizer is not None:
        uint_attrs.append( (species.ionizer, 'ionization_level') )

    # Float and integer quantities are merged in the same way,
    # and differ only by their receive buffers and dtype
    groups = (
        (float_attrs, float_recv_left, float_recv_right, np.float64),
        (uint_attrs, uint_recv_left, uint_recv_right, np.uint64) )

    for attrs, recv_left, recv_right, dtype in groups:
        for i_attr, (owner, name) in enumerate( attrs ):
            # Send the received rows to the GPU
            left_buffer = cuda.to_device( recv_left[i_attr] )
            right_buffer = cuda.to_device( recv_right[i_attr] )
            # Array that will hold all the particles
            merged_array = cuda.device_array( (new_Ntot,), dtype=dtype )
            # Particles that were already on this processor
            stay_buffer = getattr( owner, name )
            # Fill the new array in the order: left, stay, right
            # (the last argument is presumably the offset in the
            # destination array - see `copy_particles`)
            if n_left != 0:
                copy_particles[left_grid, left_block](
                    n_left, left_buffer, 0, merged_array, 0 )
            if n_stay != 0:
                copy_particles[stay_grid, stay_block](
                    n_stay, stay_buffer, 0, merged_array, n_left )
            if n_right != 0:
                copy_particles[right_grid, right_block](
                    n_right, right_buffer, 0, merged_array, n_left+n_stay )
            # Replace the old particle array by the merged one
            setattr( owner, name, merged_array )

    # Adapt the total number of particles
    species.Ntot = new_Ntot
def remove_particles_gpu(species, fld, n_guard, left_proc, right_proc):
    """
    Remove the particles that are outside of the physical domain (i.e.
    in the guard cells). Store them in sending buffers, which are returned.

    The particle arrays of `species` (and of its tracker / ionizer, if any)
    are replaced by new device arrays that contain only the particles that
    stay, and `species.Ntot` is updated accordingly.

    Parameters
    ----------
    species: a Particles object
        Contains the data of this species

    fld: a Fields object
        Contains information about the dimension of the grid,
        and the prefix sum (when using the GPU)

    n_guard: int
        Number of guard cells

    left_proc, right_proc: int or None
        Indicate whether there is a left or right processor or if the
        boundary is open (None).

    Returns
    -------
    float_send_left, float_send_right, uint_send_left, uint_send_right:
        arrays of shape (n_float,Nptcl) and (n_int,Nptcl)
        where Nptcl is the number of particles that are sent to the left
        proc and right proc respectively, and where n_float and n_int
        are the number of float and integer quantities respectively
        (Nptcl is 0 on a side that has no neighboring processor)
    """
    # Check if particles are sorted
    # (The particles are usually expected to be sorted from the previous
    # iteration at this point - except at the first iteration of `step`.)
    if not species.sorted:
        species.sort_particles(fld = fld)
        species.sorted = True

    # Get the particle indices between which to remove the particles
    # (Take into account the fact that the moving window may have
    # shifted the grid since the particles were last sorted: prefix_sum_shift)
    prefix_sum = species.prefix_sum
    Nz = fld.Nz
    Nr = fld.Nr
    # Find the z index of the first cell for which particles are kept
    iz_min = max( n_guard + species.prefix_sum_shift, 0 )
    # Find the z index of the first cell for which particles are removed again
    iz_max = min( Nz - n_guard + species.prefix_sum_shift + 1, Nz )
    # Find the corresponding indices in the particle array
    # Reminder: prefix_sum[i] is the cumulative sum of the number of particles
    # in cells 0 to i (where cell i is included)
    if iz_min*(Nr+1) - 1 >= 0:
        i_min = prefix_sum.getitem( iz_min*(Nr+1) - 1 )
    else:
        i_min = 0
    i_max = prefix_sum.getitem( iz_max*(Nr+1) - 1 )

    # Total number of particles in each particle group
    N_send_l = i_min
    new_Ntot = i_max - i_min
    N_send_r = species.Ntot - i_max

    # Allocate the sending buffers on the CPU
    # (empty along the particle axis when there is no neighbor on that side)
    n_float = species.n_float_quantities
    n_int = species.n_integer_quantities
    n_host_l = N_send_l if left_proc is not None else 0
    n_host_r = N_send_r if right_proc is not None else 0
    float_send_left = np.empty((n_float, n_host_l), dtype=np.float64)
    uint_send_left = np.empty((n_int, n_host_l), dtype=np.uint64)
    float_send_right = np.empty((n_float, n_host_r), dtype=np.float64)
    uint_send_right = np.empty((n_int, n_host_r), dtype=np.uint64)

    # Get the threads per block and the blocks per grid
    dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( species.Ntot )

    # Float quantities, as (owner object, attribute name) pairs
    float_attrs = [ (species,'x'), (species,'y'), (species,'z'),
                    (species,'ux'), (species,'uy'), (species,'uz'),
                    (species,'inv_gamma'), (species,'w') ]
    if species.ionizer is not None:
        float_attrs.append( (species.ionizer,'w_times_level') )

    # Integer quantities, as (owner object, attribute name) pairs
    int_attrs = []
    if species.tracker is not None:
        int_attrs.append( (species.tracker,'id') )
    if species.ionizer is not None:
        int_attrs.append( (species.ionizer,'ionization_level') )

    # Float and integer quantities are split in the same way ; they differ
    # only by the number of quantities, the dtype and the sending buffers
    groups = (
        (float_attrs, n_float, np.float64, float_send_left, float_send_right),
        (int_attrs, n_int, np.uint64, uint_send_left, uint_send_right) )

    for attrs, n_quantities, dtype, send_left, send_right in groups:
        for i_attr in range(n_quantities):
            owner, name = attrs[i_attr]
            # Initialize 3 buffer arrays on the GPU (need to be initialized
            # inside the loop, as `copy_to_host` invalidates these arrays)
            left_buffer = cuda.device_array((N_send_l,), dtype=dtype)
            right_buffer = cuda.device_array((N_send_r,), dtype=dtype)
            stay_buffer = cuda.device_array((new_Ntot,), dtype=dtype)
            # Check that the buffers are still on GPU
            # (safeguard against automatic memory management)
            # All three buffers are checked, including `stay_buffer`
            assert type(left_buffer) is not np.ndarray
            assert type(right_buffer) is not np.ndarray
            assert type(stay_buffer) is not np.ndarray
            # Split the particle array into the 3 buffers on the GPU
            particle_array = getattr( owner, name )
            split_particles_to_buffers[dim_grid_1d, dim_block_1d](
                particle_array, left_buffer, stay_buffer, right_buffer,
                i_min, i_max )
            # The particles that stay become the new particle array
            setattr( owner, name, stay_buffer )
            # Fill the sending buffers (only needed if there is a neighbor)
            if left_proc is not None:
                left_buffer.copy_to_host( send_left[i_attr] )
            if right_proc is not None:
                right_buffer.copy_to_host( send_right[i_attr] )

    # Change the new total number of particles
    species.Ntot = new_Ntot

    # Return the sending buffers
    return float_send_left, float_send_right, uint_send_left, uint_send_right
def handle_scattering(self, elec, t):
    """
    Handle Compton scattering, either on CPU or GPU

    - For each electron, decide whether it is going to produce a new
      photon, based on the integrated Klein-Nishina formula
    - Add the photons created from Compton scattering to `target_species`

    Parameters:
    -----------
    elec: an fbpic.Particles object
        The electrons species, from which new photons will be created

    t: float
        The simulation time
    """
    # Short-cut for use_cuda
    use_cuda = self.use_cuda

    # The particles are processed in batches (of typically 10, 20 particles)
    N_batch = int(elec.Ntot / self.batch_size) + 1

    # Temporary arrays (on CPU or GPU, depending on `use_cuda`)
    nscatter_per_batch = allocate_empty(N_batch, use_cuda, dtype=np.int64)
    nscatter_per_elec = allocate_empty(elec.Ntot, use_cuda, dtype=np.int64)
    photon_n = allocate_empty(elec.Ntot, use_cuda, dtype=np.float64)

    # On GPU, prepare the states of the random number generator
    if use_cuda:
        seed = np.random.randint(256)
        random_states = create_xoroshiro128p_states(N_batch, seed)

    # Step 1: for each electron, calculate the local density of photons
    # *in the frame of the simulation*
    density_args = (
        photon_n, elec.Ntot, elec.x, elec.y, elec.z, c * t,
        self.photon_n_lab_peak, self.inv_laser_waist2,
        self.inv_laser_ctau2, self.laser_initial_z0,
        self.gamma_boost, self.beta_boost )
    if use_cuda:
        elec_grid_1d, elec_block_1d = cuda_tpb_bpg_1d(elec.Ntot)
        get_photon_density_gaussian_cuda[elec_grid_1d, elec_block_1d](
            *density_args )
    else:
        get_photon_density_gaussian_numba( *density_args )

    # Step 2: determine the electrons that scatter, and count them in each
    # batch (one thread per batch on GPU; parallel loop over batches on CPU)
    counting_args = (
        N_batch, self.batch_size, elec.Ntot,
        nscatter_per_elec, nscatter_per_batch )
    rate_args = (
        elec.dt, elec.ux, elec.uy, elec.uz, elec.inv_gamma,
        self.ratio_w_electron_photon, photon_n, self.photon_p,
        self.photon_beta_x, self.photon_beta_y, self.photon_beta_z )
    if use_cuda:
        batch_grid_1d, batch_block_1d = cuda_tpb_bpg_1d(N_batch)
        # The GPU kernel additionally takes the random states
        determine_scatterings_cuda[batch_grid_1d, batch_block_1d](
            *(counting_args + (random_states,) + rate_args) )
    else:
        determine_scatterings_numba( *(counting_args + rate_args) )

    # Step 3: count the total number of new photons
    cumul_nscatter_per_batch = perform_cumsum(nscatter_per_batch, use_cuda)
    N_created = int(cumul_nscatter_per_batch[-1])
    # Nothing more to do if no photon was created
    if N_created == 0:
        return

    # Step 4: reallocate the photon species (on CPU or GPU depending on
    # `use_cuda`), to accommodate the photons produced by Compton
    # scattering, and copy the old photons to the new arrays
    photons = self.target_species
    old_Ntot = photons.Ntot
    new_Ntot = old_Ntot + N_created
    reallocate_and_copy_old(photons, use_cuda, old_Ntot, new_Ntot)

    # Step 5: create the new photons from Compton scattering (with a
    # random scattering angle) and add recoil momentum to the electrons
    # (The photon arrays are read *after* the reallocation above)
    bookkeeping_args = (
        N_batch, self.batch_size, old_Ntot, elec.Ntot,
        cumul_nscatter_per_batch, nscatter_per_elec )
    particle_args = (
        self.photon_p, self.photon_px, self.photon_py, self.photon_pz,
        photons.x, photons.y, photons.z, photons.inv_gamma,
        photons.ux, photons.uy, photons.uz, photons.w,
        elec.x, elec.y, elec.z, elec.inv_gamma,
        elec.ux, elec.uy, elec.uz, elec.w,
        self.inv_ratio_w_elec_photon )
    if use_cuda:
        scatter_photons_electrons_cuda[batch_grid_1d, batch_block_1d](
            *(bookkeeping_args + (random_states,) + particle_args) )
        # The new photons were appended without sorting
        photons.sorted = False
    else:
        scatter_photons_electrons_numba(
            *(bookkeeping_args + particle_args) )

    # If the photons are tracked, generate new ids
    # (on CPU or GPU depending on `use_cuda`)
    generate_new_ids(photons, old_Ntot, new_Ntot)
def extract_slice_from_gpu(pref_sum_curr, N_area, species):
    """
    Extract the particles which have an index between pref_sum_curr
    and pref_sum_curr + N_area, and return them in a dictionary.

    Parameters
    ----------
    pref_sum_curr: int
        The starting index needed for the extraction process

    N_area: int
        The number of particles to extract.

    species: an fbpic Species object
        The species from which to extract data

    Returns
    -------
    particle_data : A dictionary of 1D float arrays (that are on the CPU)
        A dictionary that contains the particle data of
        the simulation (with normalized weights), including optional
        integer arrays (e.g. "id", "charge")
    """
    # Launch configuration: sized by the number of extracted particles
    dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d(N_area)

    # - General particle quantities: one row per quantity
    quantity_names = ('x', 'y', 'z', 'ux', 'uy', 'uz', 'w', 'inv_gamma')
    part_data = cupy.empty((8, N_area), dtype=np.float64)
    extract_particles_from_gpu[dim_grid_1d, dim_block_1d](
        pref_sum_curr, species.x, species.y, species.z,
        species.ux, species.uy, species.uz,
        species.w, species.inv_gamma, part_data)

    # - Optional particle arrays, as (dictionary key, device array, dtype)
    optional_sources = []
    if species.tracker is not None:
        optional_sources.append( ('id', species.tracker.id, np.uint64) )
    if species.ionizer is not None:
        optional_sources.append(
            ('charge', species.ionizer.ionization_level, np.uint64) )
        # For ionizable species, `w_times_level` replaces the weight `w`
        optional_sources.append(
            ('w', species.ionizer.w_times_level, np.float64) )

    # Run all the extraction kernels before copying anything to the host
    selected_arrays = []
    for key, source_array, dtype in optional_sources:
        selected = cupy.empty((N_area, ), dtype=dtype)
        extract_array_from_gpu[dim_grid_1d, dim_block_1d](
            pref_sum_curr, source_array, selected)
        selected_arrays.append( (key, selected) )

    # Copy GPU arrays to the host
    host_data = part_data.get()
    particle_data = dict( zip(quantity_names, host_data) )
    for key, selected in selected_arrays:
        particle_data[key] = selected.get()

    # Return the data as dictionary
    return particle_data
def handle_ionization(self, ion):
    """
    Handle ionization, either on CPU or GPU

    - For each ion macroparticle, decide whether it is going to
      be further ionized during this timestep, based on the ADK rate.
    - Add the electrons created from ionization to the `target_species`

    Parameters:
    -----------
    ion: an fbpic.Particles object
        The ionizable species, from which new electrons are created.
    """
    # Short-cut for use_cuda
    use_cuda = self.use_cuda

    # The particles are processed in batches (of typically 10, 20 particles)
    N_batch = int(ion.Ntot / self.batch_size) + 1

    # Temporary arrays (on CPU or GPU, depending on `use_cuda`)
    is_ionized = allocate_empty(ion.Ntot, use_cuda, dtype=np.int16)
    n_ionized = allocate_empty(N_batch, use_cuda, dtype=np.int64)

    # Draw one random number per ion
    if use_cuda:
        random_draw = allocate_empty(ion.Ntot, use_cuda, dtype=np.float32)
        self.prng.uniform(random_draw)
    else:
        random_draw = np.random.rand(ion.Ntot)

    # Determine the ions that are ionized, and count them in each batch
    # (one thread per batch on GPU; parallel loop over batches on CPU)
    # The CPU and GPU functions take the same arguments
    ionize_args = (
        N_batch, self.batch_size, ion.Ntot, self.level_max,
        n_ionized, is_ionized, self.ionization_level, random_draw,
        self.adk_prefactor, self.adk_power, self.adk_exp_prefactor,
        ion.ux, ion.uy, ion.uz,
        ion.Ex, ion.Ey, ion.Ez,
        ion.Bx, ion.By, ion.Bz,
        ion.w, self.w_times_level )
    if use_cuda:
        batch_grid_1d, batch_block_1d = cuda_tpb_bpg_1d(N_batch)
        ionize_ions_cuda[batch_grid_1d, batch_block_1d]( *ionize_args )
    else:
        ionize_ions_numba( *ionize_args )

    # Count the total number of new electrons (operation always performed
    # on the CPU, as this is typically difficult on the GPU)
    if use_cuda:
        n_ionized = n_ionized.copy_to_host()
    cumulative_n_ionized = perform_cumsum(n_ionized)
    n_created = cumulative_n_ionized[-1]
    # Nothing more to do if no electron was created
    if n_created == 0:
        return

    # Reallocate electron species (on CPU or GPU depending on `use_cuda`),
    # to accommodate the electrons produced by ionization,
    # and copy the old electrons to the new arrays
    elec = self.target_species
    old_Ntot = elec.Ntot
    new_Ntot = old_Ntot + n_created
    reallocate_and_copy_old(elec, use_cuda, old_Ntot, new_Ntot)

    # On GPU, the cumulative sum has to be sent back to the device
    if use_cuda:
        cumulative_n_ionized = cuda.to_device(cumulative_n_ionized)

    # Arguments of the copy function: the quantities are passed in the
    # same order for the electrons and for the ions
    # (The electron arrays are read *after* the reallocation above)
    quantities = ('x', 'y', 'z', 'inv_gamma', 'ux', 'uy', 'uz', 'w',
                  'Ex', 'Ey', 'Ez', 'Bx', 'By', 'Bz')
    copy_args = (
        N_batch, self.batch_size, old_Ntot, ion.Ntot,
        cumulative_n_ionized, is_ionized )
    copy_args += tuple( getattr(elec, name) for name in quantities )
    copy_args += tuple( getattr(ion, name) for name in quantities )

    # Create the new electrons from ionization (one thread per batch)
    if use_cuda:
        copy_ionized_electrons_cuda[batch_grid_1d, batch_block_1d](
            *copy_args )
        # Mark the new electrons as unsorted
        elec.sorted = False
    else:
        copy_ionized_electrons_numba( *copy_args )

    # If the electrons are tracked, generate new ids
    # (on CPU or GPU depending on `use_cuda`)
    generate_new_ids(elec, old_Ntot, new_Ntot)
def extract_slice(self, fld, comm, z_boost, zmin_boost, slice_array):
    """
    Fills `slice_array` with a slice of the fields at z_boost
    (the fields returned are still in the boosted frame ;
    for performance, the Lorentz transform of the fields values
    is performed only when flushing to disk)

    Parameters
    ----------
    fld: a Fields object
        The object from which to extract the fields

    comm: a BoundaryCommunicator object, or None
        Contains information about the guard cells in particular
        (When None, no guard cells are added to the slice index)

    z_boost: float (meters)
        Position of the slice in the boosted frame

    zmin_boost: float (meters)
        Position of the left end of physical part of the local subdomain
        (i.e. excludes guard cells)

    slice_array: either a numpy array or a cuda device array
        An array of reals that packs together the slices of the
        different fields. It is filled by the CPU routine when
        `fld.use_cuda` is False, and by the CUDA kernel otherwise.
        The first index of this array corresponds to the field type
        (10 different field types), and the correspondence
        between the field type and integer index is given by field_to_index
        The shape of this array is (10, 2*Nm-1, Nr_output)
    """
    # Position of the slice, in units of the (staggered) grid cells
    cell_size = fld.interp[0].dz
    z_gridunits = (z_boost - zmin_boost - 0.5 * cell_size) / cell_size
    # Index of the cell to the left of the slice,
    # and corresponding interpolation shape factor
    iz = int(z_gridunits)
    Sz = iz + 1 - z_gridunits

    # Shift the index by the cells that precede the physical domain
    if comm is not None:
        iz += comm.n_guard
        if comm.left_proc is None:
            iz += comm.nz_damp + comm.n_inject

    # CPU case: extract a slice of the fields *in the boosted frame*
    # at z_boost, using interpolation, and store it in slice_array
    if fld.use_cuda is False:
        self.extract_slice_cpu(fld, iz, Sz, slice_array)
        return

    # GPU case: fill slice_array with one kernel call per azimuthal mode
    dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d(self.Nr_output)
    mode_grids = fld.interp
    for m in range(fld.Nm):
        grid_m = mode_grids[m]
        extract_slice_cuda[dim_grid_1d, dim_block_1d](
            self.Nr_output, iz, Sz, slice_array,
            grid_m.Er, grid_m.Et, grid_m.Ez,
            grid_m.Br, grid_m.Bt, grid_m.Bz,
            grid_m.Jr, grid_m.Jt, grid_m.Jz, grid_m.rho, m)