Example 1
    def copy_rho_buffer(self, iz_min, grid):
        """
        Add the small-size array rho_buffer into the full-size array rho

        Parameters
        ----------
        iz_min: int
            The z index in the full-size array that corresponds to index 0
            in the small-size array (i.e. the position at which to add the
            small-size array into the full-size one)

        grid: a list of InterpolationGrid objects
            Contains the full-size array rho
        """
        Nm = len(grid)
        if type(grid[0].rho) is np.ndarray:
            # The large-size array rho is on the CPU
            for m in range(Nm):
                grid[m].rho[iz_min:iz_min + 2] += self.rho_buffer[m]
        else:
            # The large-size array rho is on the GPU
            # Copy the small-size buffer to the GPU
            cuda.to_device(self.rho_buffer, to=self.d_rho_buffer)
            # On the GPU: add the small-size buffers to the large-size array
            dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d(grid[0].Nr, TPB=64)
            for m in range(Nm):
                add_rho_to_gpu_array[dim_grid_1d,
                                     dim_block_1d](iz_min, self.d_rho_buffer,
                                                   grid[m].rho, m)
Example 2
    def __init__( self, n_pml, cdt_over_dr ):
        """
        Initialize a damping object.

        Parameters
        ----------
        n_pml: int
            Number of PML cells in the radial direction

        cdt_over_dr: float
            Ratio of timestep to radial cell size
            (needed for calculation of damping coefficients)
        """
        # Register the number of PML cells
        self.n_pml = n_pml

        # Create the damping arrays
        self.damp_array = generate_pml_damp_array( n_pml, cdt_over_dr )

        # Transfer the damping array to the GPU
        if cuda_installed:
            self.d_damp_array = cuda.to_device( self.damp_array )
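
generate_pml_damp_array is not shown in this excerpt. Below is a hedged
sketch of what such a helper could look like (a smooth ramp over the PML
cells, scaled by cdt_over_dr); the coefficients of the actual function
may differ:

    import numpy as np

    def generate_pml_damp_array( n_pml, cdt_over_dr ):
        # Normalized depth into the PML: 0 at the inner edge, 1 at the outer
        depth = (np.arange(n_pml) + 0.5) / n_pml
        # Quadratic ramp of the damping coefficient, scaled by c*dt/dr
        return cdt_over_dr * depth**2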
Example 3
    def send_particles_to_gpu( self ):
        """
        Copy the particles to the GPU.
        Particle arrays of self now point to the GPU arrays.
        """
        if self.use_cuda:
            # Send positions, velocities, inverse gamma and weights
            # to the GPU (CUDA)
            self.x = cuda.to_device(self.x)
            self.y = cuda.to_device(self.y)
            self.z = cuda.to_device(self.z)
            self.ux = cuda.to_device(self.ux)
            self.uy = cuda.to_device(self.uy)
            self.uz = cuda.to_device(self.uz)
            self.inv_gamma = cuda.to_device(self.inv_gamma)
            self.w = cuda.to_device(self.w)

            # Copy arrays to the GPU for the field
            # gathering and the particle push
            self.Ex = cuda.to_device(self.Ex)
            self.Ey = cuda.to_device(self.Ey)
            self.Ez = cuda.to_device(self.Ez)
            self.Bx = cuda.to_device(self.Bx)
            self.By = cuda.to_device(self.By)
            self.Bz = cuda.to_device(self.Bz)

            # Copy arrays to the GPU for the sorting
            self.cell_idx = cuda.to_device(self.cell_idx)
            self.sorted_idx = cuda.to_device(self.sorted_idx)
            self.prefix_sum = cuda.to_device(self.prefix_sum)
            self.sorting_buffer = cuda.to_device(self.sorting_buffer)
            if self.n_integer_quantities > 0:
                self.int_sorting_buffer = cuda.to_device(self.int_sorting_buffer)

            # Copy particle tracker data
            if self.tracker is not None:
                self.tracker.send_to_gpu()
            # Copy the ionization data
            if self.ionizer is not None:
                self.ionizer.send_to_gpu()
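
After this method runs, each attribute references a numba DeviceNDArray
rather than a NumPy array, so host code must copy data back explicitly.
A hedged usage sketch, assuming a species object created with
use_cuda=True and a symmetric receive_particles_from_gpu method:

    species.send_particles_to_gpu()
    # ... launch GPU kernels that read/write species.x, species.ux, etc. ...
    x_on_cpu = species.x.copy_to_host()    # explicit device-to-host copy
    species.receive_particles_from_gpu()   # restore CPU-resident arrays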
Example 4
def add_buffers_gpu( species, float_recv_left, float_recv_right,
                     uint_recv_left, uint_recv_right ):
    """
    Add the particles stored in recv_left and recv_right
    to the existing particles in species.

    Parameters
    ----------
    species: a Particles object
        Contains the particles that stayed on the present processor

    float_recv_left, float_recv_right, uint_recv_left, uint_recv_right:
        arrays of shape (n_float, Nptcl) and (n_int, Nptcl), where Nptcl
        is the number of particles received from the left proc and the
        right proc respectively, and where n_float and n_int are the
        number of float and integer quantities respectively.
        These arrays are always on the CPU (since they were used for MPI)
    """
    # Get the new number of particles
    old_Ntot = species.Ntot
    n_left = float_recv_left.shape[1]
    n_right = float_recv_right.shape[1]
    new_Ntot = old_Ntot + n_left + n_right

    # Get the threads per block and the blocks per grid
    n_left_grid, n_left_block = cuda_tpb_bpg_1d( n_left )
    n_right_grid, n_right_block = cuda_tpb_bpg_1d( n_right )
    n_old_grid, n_old_block = cuda_tpb_bpg_1d( old_Ntot )

    # Iterate over particle attributes
    # Build list of float attributes to copy
    attr_list = [ (species,'x'), (species,'y'), (species,'z'),
                  (species,'ux'), (species,'uy'), (species,'uz'),
                  (species,'inv_gamma'), (species,'w') ]
    if species.ionizer is not None:
        attr_list += [ (species.ionizer, 'w_times_level') ]
    # Loop through the float quantities
    for i_attr in range( len(attr_list) ):
        # Copy the proper buffers to the GPU
        left_buffer = cuda.to_device( float_recv_left[i_attr] )
        right_buffer = cuda.to_device( float_recv_right[i_attr] )
        # Initialize the new particle array
        particle_array = cuda.device_array( (new_Ntot,), dtype=np.float64)
        # Merge the arrays on the GPU
        stay_buffer = getattr( attr_list[i_attr][0], attr_list[i_attr][1])
        if n_left != 0:
            copy_particles[n_left_grid, n_left_block](
                n_left, left_buffer, 0, particle_array, 0 )
        if old_Ntot != 0:
            copy_particles[n_old_grid, n_old_block](
                old_Ntot, stay_buffer, 0, particle_array, n_left )
        if n_right != 0:
            copy_particles[n_right_grid, n_right_block](
                n_right, right_buffer, 0, particle_array, n_left+old_Ntot )
        # Assign the merged array to the corresponding particle attribute
        setattr(attr_list[i_attr][0], attr_list[i_attr][1], particle_array)

    # Build list of integer quantities to copy
    attr_list = []
    if species.tracker is not None:
        attr_list.append( (species.tracker,'id') )
    if species.ionizer is not None:
        attr_list.append( (species.ionizer,'ionization_level') )
    # Loop through the integer quantities
    for i_attr in range( len(attr_list) ):
        # Copy the proper buffers to the GPU
        left_buffer = cuda.to_device( uint_recv_left[i_attr] )
        right_buffer = cuda.to_device( uint_recv_right[i_attr] )
        # Initialize the new particle array
        particle_array = cuda.device_array( (new_Ntot,), dtype=np.uint64)
        # Merge the arrays on the GPU
        stay_buffer = getattr( attr_list[i_attr][0], attr_list[i_attr][1])
        if n_left != 0:
            copy_particles[n_left_grid, n_left_block](
                n_left, left_buffer, 0, particle_array, 0 )
        if old_Ntot != 0:
            copy_particles[n_old_grid, n_old_block](
                old_Ntot, stay_buffer, 0, particle_array, n_left )
        if n_right != 0:
            copy_particles[n_right_grid, n_right_block](
                n_right, right_buffer, 0, particle_array, n_left+old_Ntot )
        # Assign the merged array to the corresponding particle attribute
        setattr(attr_list[i_attr][0], attr_list[i_attr][1], particle_array)

    # Adapt the total number of particles
    species.Ntot = new_Ntot
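
The copy_particles kernel is not part of this excerpt. A hedged sketch
that is consistent with how it is invoked above (copy N elements from
source, starting at source_start, into dest, starting at dest_start);
the actual kernel may differ:

    from numba import cuda

    @cuda.jit
    def copy_particles( N, source, source_start, dest, dest_start ):
        # One thread per copied element
        i = cuda.grid(1)
        if i < N:
            dest[dest_start + i] = source[source_start + i]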
Example 5
    def __init__(self, n_guard, Nr, Nm, left_proc, right_proc):
        """
        Initialize the guard cell buffers for the fields.
        These buffers are used in order to group the MPI exchanges.

        Parameters
        ----------
        n_guard: int
           Number of guard cells

        Nr: int
           Number of points in the radial direction

        Nm: int
           Number of azimuthal modes

        left_proc, right_proc: int or None
           Rank of the proc to the left and to the right respectively
           (None for an open boundary)
        """
        # Register parameters
        self.Nr = Nr
        self.Nm = Nm
        self.n_guard = n_guard
        self.left_proc = left_proc
        self.right_proc = right_proc
        # Shortcut
        ng = self.n_guard
        # Allocate buffer arrays that are sent via MPI to exchange
        # the fields between domains (either replacing or adding fields).
        # Buffers are allocated for the left and right side of the domain.
        if not cuda_installed:
            # Allocate buffers on the CPU
            # - Replacing vector field buffers
            self.vec_rep_send_l = np.empty((3 * Nm, ng, Nr),
                                           dtype=np.complex128)
            self.vec_rep_send_r = np.empty((3 * Nm, ng, Nr),
                                           dtype=np.complex128)
            self.vec_rep_recv_l = np.empty((3 * Nm, ng, Nr),
                                           dtype=np.complex128)
            self.vec_rep_recv_r = np.empty((3 * Nm, ng, Nr),
                                           dtype=np.complex128)
            # - Adding vector field buffers
            self.vec_add_send_l = np.empty((3 * Nm, 2 * ng, Nr),
                                           dtype=np.complex128)
            self.vec_add_send_r = np.empty((3 * Nm, 2 * ng, Nr),
                                           dtype=np.complex128)
            self.vec_add_recv_l = np.empty((3 * Nm, 2 * ng, Nr),
                                           dtype=np.complex128)
            self.vec_add_recv_r = np.empty((3 * Nm, 2 * ng, Nr),
                                           dtype=np.complex128)
            # - Replacing scalar field buffers
            self.scal_rep_send_l = np.empty((Nm, ng, Nr), dtype=np.complex128)
            self.scal_rep_send_r = np.empty((Nm, ng, Nr), dtype=np.complex128)
            self.scal_rep_recv_l = np.empty((Nm, ng, Nr), dtype=np.complex128)
            self.scal_rep_recv_r = np.empty((Nm, ng, Nr), dtype=np.complex128)
            # - Adding scalar field buffers
            self.scal_add_send_l = np.empty((Nm, 2 * ng, Nr),
                                            dtype=np.complex128)
            self.scal_add_send_r = np.empty((Nm, 2 * ng, Nr),
                                            dtype=np.complex128)
            self.scal_add_recv_l = np.empty((Nm, 2 * ng, Nr),
                                            dtype=np.complex128)
            self.scal_add_recv_r = np.empty((Nm, 2 * ng, Nr),
                                            dtype=np.complex128)
        else:
            # Allocate buffers on the CPU and GPU
            # Use cuda.pinned_array so that the CPU array is page-locked
            # (it cannot be swapped out to disk, and the GPU can access
            # it via DMA)
            pin_ary = cuda.pinned_array
            # - Replacing vector field buffers
            self.vec_rep_send_l = pin_ary((3 * Nm, ng, Nr),
                                          dtype=np.complex128)
            self.vec_rep_send_r = pin_ary((3 * Nm, ng, Nr),
                                          dtype=np.complex128)
            self.vec_rep_recv_l = pin_ary((3 * Nm, ng, Nr),
                                          dtype=np.complex128)
            self.vec_rep_recv_r = pin_ary((3 * Nm, ng, Nr),
                                          dtype=np.complex128)
            self.d_vec_rep_buffer_l = cuda.to_device(self.vec_rep_send_l)
            self.d_vec_rep_buffer_r = cuda.to_device(self.vec_rep_send_r)
            # - Adding vector field buffers
            self.vec_add_send_l = pin_ary((3 * Nm, 2 * ng, Nr),
                                          dtype=np.complex128)
            self.vec_add_send_r = pin_ary((3 * Nm, 2 * ng, Nr),
                                          dtype=np.complex128)
            self.vec_add_recv_l = pin_ary((3 * Nm, 2 * ng, Nr),
                                          dtype=np.complex128)
            self.vec_add_recv_r = pin_ary((3 * Nm, 2 * ng, Nr),
                                          dtype=np.complex128)
            self.d_vec_add_buffer_l = cuda.to_device(self.vec_add_send_l)
            self.d_vec_add_buffer_r = cuda.to_device(self.vec_add_send_r)
            # - Replacing scalar field buffers
            self.scal_rep_send_l = pin_ary((Nm, ng, Nr), dtype=np.complex128)
            self.scal_rep_send_r = pin_ary((Nm, ng, Nr), dtype=np.complex128)
            self.scal_rep_recv_l = pin_ary((Nm, ng, Nr), dtype=np.complex128)
            self.scal_rep_recv_r = pin_ary((Nm, ng, Nr), dtype=np.complex128)
            self.d_scal_rep_buffer_l = cuda.to_device(self.scal_rep_send_l)
            self.d_scal_rep_buffer_r = cuda.to_device(self.scal_rep_send_r)
            # - Adding scalar field buffers
            self.scal_add_send_l = pin_ary((Nm, 2 * ng, Nr),
                                           dtype=np.complex128)
            self.scal_add_send_r = pin_ary((Nm, 2 * ng, Nr),
                                           dtype=np.complex128)
            self.scal_add_recv_l = pin_ary((Nm, 2 * ng, Nr),
                                           dtype=np.complex128)
            self.scal_add_recv_r = pin_ary((Nm, 2 * ng, Nr),
                                           dtype=np.complex128)
            self.d_scal_add_buffer_l = cuda.to_device(self.scal_add_send_l)
            self.d_scal_add_buffer_r = cuda.to_device(self.scal_add_send_r)
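
Transfers between pinned host memory and the device can go through DMA,
which is what makes these grouped exchanges cheap. A minimal standalone
sketch (hypothetical shape; the numba.cuda calls are the real API):

    import numpy as np
    from numba import cuda

    host_buf = cuda.pinned_array( (3, 64, 128), dtype=np.complex128 )
    host_buf[...] = 0.
    dev_buf = cuda.to_device( host_buf )   # host-to-device copy
    dev_buf.copy_to_host( host_buf )       # device-to-host, into pinned memory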
Example 6
    def __init__( self, Nz, zmin, zmax, Nr, rmax, Nm, dt, v_comoving,
            use_galilean, boundaries, n_order, n_guard=None, n_damp=64,
            n_inject=None, exchange_period=None, use_all_mpi_ranks=True):
        """
        Initializes a communicator object.

        Parameters
        ----------
        Nz, Nr: int
            The number of cells in the global physical domain
            (i.e. without damp cells and guard cells)

        zmin, zmax, rmax: float
            The position of the edges of the global physical domain in z and r
            (i.e. without damp cells and guard cells)

        Nm: int
            The total number of modes

        dt: float
            The timestep of the simulation

        v_comoving: float or None, optional
            If this variable is None, the standard PSATD is used (default).
            Otherwise, the current is assumed to be "comoving",
            i.e. constant with respect to (z - v_comoving * t).
            This can be done in two ways: either by
            - Using a PSATD scheme that takes this hypothesis into account
            - Solving the PSATD scheme in a Galilean frame

        use_galilean: bool, optional
            Determines which of the two above schemes is used.
            When use_galilean is True, the whole grid moves
            with the speed v_comoving.

        boundaries: str
            Indicates how to exchange the fields at the left and right
            boundaries of the global simulation box.
            (Either 'periodic' or 'open')

        n_order: int
           The order of the stencil for the z derivatives.
           Use -1 for infinite order, otherwise use a positive, even
           number. In this case, the stencil extends up to approx.
           2*n_order cells on each side. (A finite-order stencil
           is required to have a localized field push, which allows
           simulations to be run in parallel on multiple MPI ranks.)

        n_guard: int, optional
            Number of guard cells to use at the left and right of
            a domain, when performing parallel (MPI) computation
            or when using open boundaries. Defaults to None, in which
            case the required number of guard cells for the given
            n_order is calculated automatically (approx. 2*n_order).
            When no MPI decomposition is used and the boundaries are
            open with an infinite-order stencil, n_guard defaults
            to 64 unless set otherwise.

        n_damp : int, optional
            Number of damping guard cells at the left and right of a
            simulation box, if a moving window is attached. The guard
            regions in these areas (left/right of the moving window)
            are extended by n_damp cells, in order to smoothly damp
            the fields so that they do not wrap around.
            (Defaults to 64)

        n_inject: int, optional
            Number of injection cells (at the left and right) of a simulation
            box, for a simulation with open boundaries. The damping region
            needs to be additionally extended by n_inject cells at the
            outer edges to have a region with zero fields where new particles
            can be injected. For symmetry reasons those cells are added at
            both sides of the simulation box, although particles are typically
            injected only at the right side of the box.
            (Defaults to None and is set to n_guard/2 automatically)

        exchange_period: int, optional
            Number of iterations between particle exchanges.
            If set to None, the maximum exchange period is calculated
            automatically: within exchange_period timesteps, the
            particles should never be able to travel more than
            (n_guard/2 - particle_shape order) cells. (Setting
            exchange_period to small values can substantially degrade
            performance.)

        use_all_mpi_ranks: bool, optional
            - if `use_all_mpi_ranks` is True (default):
              All the MPI ranks will contribute to the same simulation,
              using domain-decomposition to share the work.
            - if `use_all_mpi_ranks` is False:
              Each MPI rank will run an independent simulation.
              This can be useful when running parameter scans.
        """
        # Initialize global number of cells and modes
        self.Nr = Nr
        self.Nm = Nm
        self._Nz_global_domain = Nz
        self._zmin_global_domain = zmin
        # Get the distance dz between the cells
        # (longitudinal spacing of the grid)
        self.dz = (zmax - zmin)/self._Nz_global_domain

        # MPI Setup
        self.use_all_mpi_ranks = use_all_mpi_ranks
        if self.use_all_mpi_ranks and mpi_installed:
            self.mpi_comm = comm
            self.rank = self.mpi_comm.rank
            self.size = self.mpi_comm.size
        else:
            self.mpi_comm = None
            self.rank = 0
            self.size = 1
        # Get the rank of the left and the right domain
        self.left_proc = self.rank-1
        self.right_proc = self.rank+1
        # Correct these initial values by taking into account boundaries
        self.boundaries = boundaries
        if self.boundaries == 'periodic':
            # Periodic boundary conditions for the domains
            if self.rank == 0:
                self.left_proc = (self.size-1)
            if self.rank == self.size-1:
                self.right_proc = 0
        elif self.boundaries == 'open':
            # None means that the boundary is open
            if self.rank == 0:
                self.left_proc = None
            if self.rank == self.size-1:
                self.right_proc = None
        else:
            raise ValueError('Unrecognized boundaries: %s' %self.boundaries)

        # Initialize number of guard cells
        # Automatically calculate required guard cells
        # for given order (n_order)
        if n_guard is None:
            if n_order == -1:
                # Set n_guard to fixed value of 64 in case of
                # open boundaries and infinite order stencil
                # (if not defined otherwise by user)
                self.n_guard = 64
                # Raise error if user tries to use parallel MPI computation
                # with an infinite order stencil. This would give wrong results
                if self.size != 1:
                    raise ValueError(
                    'When running FBPIC with MPI decomposition, you need to \n'
                    'set the argument `n_order` of the `Simulation` object \n'
                    'to a positive value (e.g. n_order=32).')
            else:
                # Automatic calculation of the guard region size,
                # depending on the stencil order (n_order)
                stencil = get_stencil_reach(
                        self._Nz_global_domain, self.dz, c*dt, n_order,
                        v_comoving, use_galilean )
                # approx 2*n_order (+1 because the moving window
                # shifts the grid by one cell during the PIC loop
                # and therefore, the guard region needs to be larger
                # by one cell)
                self.n_guard = stencil + 1
        else:
            # Otherwise: Set user defined guard region size
            self.n_guard = n_guard
        # For single proc and periodic boundaries, no need for guard cells
        if boundaries=='periodic' and self.size==1:
            self.n_guard = 0

        # Register damping cells
        self.n_damp = n_damp
        # For periodic boundaries, no need for damping cells
        if boundaries=='periodic':
            self.n_damp = 0
            self.n_inject = 0
        else:
            # Register additional injection cells that are part of the
            # damping region and of size n_guard/2.
            if n_inject is None:
                self.n_inject = int(self.n_guard/2)
            else:
                # User-defined injection cells. Choose carefully.
                self.n_inject = n_inject

        # Initialize the period of the particle exchange and moving window
        if exchange_period is None:
            # Maximum number of cells a particle can travel in one timestep
            # Safety factor of 2 needed if there is a moving window attached
            # to the simulation or in case a galilean frame is used.
            cells_per_step = 2.*c*dt/self.dz
            # Maximum number of timesteps before a particle can reach the end
            # of the half of guard region including the maximum number of cells
            # (+/-3) it can affect with a "cubic" particle shape_factor.
            # (Particles are only allowed to reside in half of the guard
            # region as this is the stencil reach of the current correction)
            self.exchange_period = int(((self.n_guard/2)-3)/cells_per_step)
            # Set exchange_period to 1 in the case of single-proc
            # and periodic boundary conditions.
            if self.size == 1 and boundaries == 'periodic':
                self.exchange_period = 1
            # Check that calculated exchange_period is acceptable for given
            # simulation parameters (check that guard region is large enough).
            if self.exchange_period < 1:
                raise ValueError(
                    'Guard region size is too small for the chosen timestep. '
                    'In one timestep, a particle can travel across more than '
                    'half of the guard region.')
        else:
            # User-defined exchange_period. Choose carefully.
            self.exchange_period = exchange_period

        # Initialize the moving window to None (See the method
        # set_moving_window in main.py to initialize a proper moving window)
        self.moving_win = None

        # Initialize a buffer handler object, for MPI communications
        if self.size > 1:
            self.mpi_buffers = BufferHandler( self.n_guard, Nr, Nm,
                                      self.left_proc, self.right_proc )

        # Create damping arrays for the damping cells at the left
        # and right of the box in the case of "open" boundaries.
        if (self.n_damp+self.n_inject) > 0:
            if self.left_proc is None:
                # Create the damping array for the left open boundary
                self.left_damp = self.generate_damp_array(
                    self.n_guard, self.n_damp, self.n_inject )
                if cuda_installed:
                    self.d_left_damp = cuda.to_device( self.left_damp )
            if self.right_proc is None:
                # Create the damping array for the right open boundary
                self.right_damp = self.generate_damp_array(
                    self.n_guard, self.n_damp, self.n_inject )
                if cuda_installed:
                    self.d_right_damp = cuda.to_device( self.right_damp )
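
As a worked instance of the automatic exchange_period formula above,
with hypothetical numbers (dt = dz/c, a cubic particle shape, n_guard = 64):

    c_dt_over_dz = 1.0                  # hypothetical choice: dt = dz/c
    n_guard = 64
    cells_per_step = 2. * c_dt_over_dz                           # = 2.0
    exchange_period = int(((n_guard / 2) - 3) / cells_per_step)
    # int((32 - 3) / 2.0) = int(14.5) = 14:
    # particles are exchanged at most every 14 iterations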
Example 7
    def __init__(self, n_guard, Nr, Nm, left_proc, right_proc):
        """
        Initialize the guard cell buffers for the fields.
        These buffers are used in order to group the MPI exchanges.

        Parameters
        ----------
        n_guard: int
           Number of guard cells

        Nr: int
           Number of points in the radial direction

        Nm: int
           Number of azimuthal modes

        left_proc, right_proc: int or None
           Rank of the proc to the left and to the right respectively
           (None for an open boundary)
        """
        # Register parameters
        self.Nr = Nr
        self.Nm = Nm
        self.n_guard = n_guard
        self.left_proc = left_proc
        self.right_proc = right_proc
        # Shortcut
        ng = self.n_guard

        # Allocate buffer arrays that are sent via MPI to exchange
        # the fields between domains (either replacing or adding fields).
        # Buffers are allocated for the left and right side of the domain.

        # Allocate buffers on the CPU
        if cuda_installed:
            # Use cuda.pinned_array so that the CPU array is page-locked
            # (it cannot be swapped out to disk, and the GPU can access
            # it via DMA)
            alloc_cpu = cuda.pinned_array
        else:
            # Use regular numpy arrays
            alloc_cpu = np.empty
        # Allocate buffers of different size, for the different exchange types
        self.send_l = {
            'E:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
            'B:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
            'J:add': alloc_cpu((3 * Nm, 2 * ng, Nr), dtype=np.complex128),
            'rho:add': alloc_cpu((Nm, 2 * ng, Nr), dtype=np.complex128)
        }
        self.send_r = {
            'E:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
            'B:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
            'J:add': alloc_cpu((3 * Nm, 2 * ng, Nr), dtype=np.complex128),
            'rho:add': alloc_cpu((Nm, 2 * ng, Nr), dtype=np.complex128)
        }
        self.recv_l = {
            'E:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
            'B:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
            'J:add': alloc_cpu((3 * Nm, 2 * ng, Nr), dtype=np.complex128),
            'rho:add': alloc_cpu((Nm, 2 * ng, Nr), dtype=np.complex128)
        }
        self.recv_r = {
            'E:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
            'B:replace': alloc_cpu((3 * Nm, ng, Nr), dtype=np.complex128),
            'J:add': alloc_cpu((3 * Nm, 2 * ng, Nr), dtype=np.complex128),
            'rho:add': alloc_cpu((Nm, 2 * ng, Nr), dtype=np.complex128)
        }

        # Allocate buffers on the GPU, for the different exchange types
        if cuda_installed:
            self.d_send_l = { key: cuda.to_device(value)
                              for key, value in self.send_l.items() }
            self.d_send_r = { key: cuda.to_device(value)
                              for key, value in self.send_r.items() }
            self.d_recv_l = { key: cuda.to_device(value)
                              for key, value in self.recv_l.items() }
            self.d_recv_r = { key: cuda.to_device(value)
                              for key, value in self.recv_r.items() }
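
A hedged sketch of how one of these buffer pairs could be used during an
exchange. The packing kernel, the mpi_comm object, the buffers name and
the blocking Sendrecv pattern are assumptions of this sketch, not part
of the excerpt:

    key = 'E:replace'
    # On the GPU: pack the guard-cell fields into d_send_l[key] (kernel not shown)
    # Copy the packed device buffer into the pinned CPU buffer
    buffers.d_send_l[key].copy_to_host( buffers.send_l[key] )
    # Exchange with the neighboring ranks via MPI (blocking, for illustration)
    mpi_comm.Sendrecv( buffers.send_l[key], dest=buffers.left_proc,
                       recvbuf=buffers.recv_r[key], source=buffers.right_proc )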
Example 8
    def __init__(self, p, m, Nr, Nz, rmax, use_cuda=False):
        """
        Calculate the r (position) and nu (frequency) grid
        on which the transform will operate.

        Also store auxiliary data needed for the transform.

        Parameters
        ----------
        p: int
            Order of the Hankel transform

        m: int
            The azimuthal mode for which the Hankel transform is calculated

        Nr, Nz: int
            Number of points in the r direction and z direction

        rmax: float
            Edge of the box in which the Hankel transform is taken
            (The function is assumed to be zero at that point.)

        use_cuda: bool, optional
            Whether to use the GPU for the Hankel transform
        """
        # Register whether to use the GPU.
        # If yes, initialize the corresponding cuda object
        self.use_cuda = use_cuda
        if self.use_cuda and not cuda_installed:
            self.use_cuda = False
            print('** Cuda not available for Hankel transform.')
            print('** Performing the Hankel transform on the CPU.')

        # Check that m has a valid value
        if m not in [p - 1, p, p + 1]:
            raise ValueError('m must be either p-1, p or p+1')

        # Register values of the arguments
        self.p = p
        self.m = m
        self.Nr = Nr
        self.rmax = rmax
        self.Nz = Nz

        # Calculate the zeros of the Bessel function
        if m != 0:
            # In this case, 0 is a zero of the Bessel function of order m.
            # It turns out that it is needed to reconstruct the signal for p=0.
            alphas = np.hstack((np.array([0.]), jn_zeros(m, Nr - 1)))
        else:
            alphas = jn_zeros(m, Nr)

        # Calculate the spectral grid
        self.nu = 1. / (2 * np.pi * rmax) * alphas

        # Calculate the spatial grid (uniform grid with a half-cell offset)
        self.r = (rmax * 1. / Nr) * (np.arange(Nr) + 0.5)

        # Calculate and store the inverse matrix invM
        # (imposed by the constraints on the DHT of Bessel modes)
        # NB: When compared with the FBPIC article, all the matrices here
        # are calculated in transposed form. This is done so as to use the
        # `dot` and `gemm` functions, in the `transform` method.
        self.invM = np.empty((Nr, Nr))
        if p == m:
            p_denom = p + 1
        else:
            p_denom = p
        denom = np.pi * rmax**2 * jn(p_denom, alphas)**2
        num = jn(p, 2 * np.pi * self.r[np.newaxis, :] * self.nu[:, np.newaxis])
        # Get the inverse matrix
        if m != 0:
            self.invM[1:, :] = num[1:, :] / denom[1:, np.newaxis]
            # In this case, the functions are represented by Bessel functions
            # *and* an additional mode (below) which satisfies the same
            # algebraic relations for curl/div/grad as the regular Bessel modes,
            # with the value kperp=0.
            # The normalization of this mode is arbitrary, and is chosen
            # so that the condition number of invM is close to 1
            if p == m - 1:
                self.invM[0, :] = self.r**(m - 1) * 1. / (np.pi *
                                                          rmax**(m + 1))
            else:
                self.invM[0, :] = 0.
        else:
            self.invM[:, :] = num[:, :] / denom[:, np.newaxis]

        # Calculate the matrix M by inverting invM
        self.M = np.empty((Nr, Nr))
        if m != 0 and p != m - 1:
            self.M[:, 1:] = np.linalg.pinv(self.invM[1:, :])
            self.M[:, 0] = 0.
        else:
            self.M = np.linalg.inv(self.invM)

        # Copy the matrices to the GPU if needed
        if self.use_cuda:
            self.d_M = cuda.to_device(self.M)
            self.d_invM = cuda.to_device(self.invM)

        # Initialize buffer arrays to store the complex Nz x Nr grid
        # as a real 2Nz x Nr grid, before performing the matrix product
        # (This is because a matrix product of reals is faster than a
        # matrix product of complex numbers, and the real-complex
        # conversion is negligible.)
        if not self.use_cuda:
            # Initialize real buffer arrays on the CPU
            zero_array = np.zeros((2 * Nz, Nr), dtype=np.float64)
            self.array_in = zero_array.copy()
            self.array_out = zero_array.copy()
        else:
            # Initialize real buffer arrays on the GPU
            zero_array = np.zeros((2 * Nz, Nr), dtype=np.float64)
            self.d_in = cuda.to_device(zero_array)
            self.d_out = cuda.to_device(zero_array)
            # Initialize cuBLAS
            self.blas = device.get_cublas_handle()
            # Set optimal number of CUDA threads per block
            # for copy 2d real/complex (determined empirically)
            copy_tpb = (8, 32) if cuda_gpu_model == "V100" else (2, 16)
            # Initialize the threads per block and block per grid
            self.dim_grid, self.dim_block = cuda_tpb_bpg_2d(Nz, Nr, *copy_tpb)
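
A hedged sanity check for the matrices built above; the class name DHT
and the argument values are assumptions of this sketch (the class name
is not shown in the excerpt). For m = 0, M is computed as the exact
inverse of invM, so their product should be close to the identity:

    import numpy as np

    # Hypothetical class name and arguments, for illustration only
    dht = DHT( p=0, m=0, Nr=64, Nz=100, rmax=1.e-4, use_cuda=False )
    assert np.allclose( np.dot(dht.M, dht.invM), np.eye(64) )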