def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                 blocksize, buffer_size=None, timer=nulltimer):
        BlacsLayouts.__init__(self, gd, bd, block_comm, dtype,
                              mcpus, ncpus, blocksize, timer)
        self.buffer_size = buffer_size
        nbands = bd.nbands
        self.mynbands = mynbands = bd.mynbands
        self.blocksize = blocksize

        # 1D layout - columns
        self.columngrid = BlacsGrid(self.column_comm, 1, bd.comm.size)
        self.Nndescriptor = self.columngrid.new_descriptor(nbands, nbands,
                                                           nbands, mynbands)

        # 2D layout
        self.nndescriptor = self.blockgrid.new_descriptor(nbands, nbands,
                                                          blocksize, blocksize)

        # 1D layout - rows
        self.rowgrid = BlacsGrid(self.column_comm, bd.comm.size, 1)
        self.nNdescriptor = self.rowgrid.new_descriptor(nbands, nbands,
                                                        mynbands, nbands)

        # Only redistribute filled out half for Hermitian matrices
        self.Nn2nn = Redistributor(self.block_comm, self.Nndescriptor,
                                   self.nndescriptor)
        #self.Nn2nn = Redistributor(self.block_comm, self.Nndescriptor,
        #                           self.nndescriptor, 'L') #XXX faster but...

        # Resulting matrix will be used in dgemm which is symmetry obvlious
        self.nn2nN = Redistributor(self.block_comm, self.nndescriptor,
                                   self.nNdescriptor)
 def distribute_to_columns(self, rho_mm, srcdescriptor):
     redistributor = Redistributor(self.block_comm, # XXX
                                   srcdescriptor,
                                   self.mM_unique_descriptor)
     rho_mM = redistributor.redistribute(rho_mm)
     if self.gd.rank != 0:
         rho_mM = self.mMdescriptor.zeros(dtype=rho_mm.dtype)
     self.gd.comm.broadcast(rho_mM, 0)
     return rho_mM
    def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                 blocksize, nao, timer=nulltimer):
        BlacsLayouts.__init__(self, gd, bd, block_comm, dtype,
                              mcpus, ncpus, blocksize, timer)
        nbands = bd.nbands
        self.blocksize = blocksize
        self.mynbands = mynbands = bd.mynbands
        
        self.orbital_comm = self.bd.comm
        self.naoblocksize = naoblocksize = -((-nao) // self.orbital_comm.size)
        self.nao = nao

        # Range of basis functions for BLACS distribution of matrices:
        self.Mmax = nao
        self.Mstart = bd.comm.rank * naoblocksize
        self.Mstop = min(self.Mstart + naoblocksize, self.Mmax)
        self.mynao = self.Mstop - self.Mstart

        # Column layout for one matrix per band rank:
        self.columngrid = BlacsGrid(bd.comm, bd.comm.size, 1)
        self.mMdescriptor = self.columngrid.new_descriptor(nao, nao,
                                                           naoblocksize, nao)
        self.nMdescriptor = self.columngrid.new_descriptor(nbands, nao,
                                                           mynbands, nao)

        #parallelprint(world, (mynao, self.mMdescriptor.shape))

        # Column layout for one matrix in total (only on grid masters):
        self.single_column_grid = BlacsGrid(self.column_comm, bd.comm.size, 1)
        self.mM_unique_descriptor = self.single_column_grid.new_descriptor( \
            nao, nao, naoblocksize, nao)

        # nM_unique_descriptor is meant to hold the coefficients after
        # diagonalization.  BLACS requires it to be nao-by-nao, but
        # we only fill meaningful data into the first nbands columns.
        #
        # The array will then be trimmed and broadcast across
        # the grid descriptor's communicator.
        self.nM_unique_descriptor = self.single_column_grid.new_descriptor( \
            nbands, nao, mynbands, nao)

        # Fully blocked grid for diagonalization with many CPUs:
        self.mmdescriptor = self.blockgrid.new_descriptor(nao, nao, blocksize,
                                                          blocksize)

        #self.nMdescriptor = nMdescriptor
        self.mM2mm = Redistributor(self.block_comm, self.mM_unique_descriptor,
                                   self.mmdescriptor)
        self.mm2nM = Redistributor(self.block_comm, self.mmdescriptor,
                                   self.nM_unique_descriptor)
    def calculate_blocked_density_matrix(self, f_n, C_nM):
        nbands = self.bd.nbands
        mynbands = self.bd.mynbands
        nao = self.nao
        dtype = C_nM.dtype
        
        self.nMdescriptor.checkassert(C_nM)
        if self.gd.rank == 0:
            Cf_nM = (C_nM * f_n[:, None]).conj()
        else:
            C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
            Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

        r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                          self.mmdescriptor)

        Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(Cf_nM, Cf_mm, nbands, nao)
        del Cf_nM
        
        C_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(C_nM, C_mm, nbands, nao)
        # no use to delete C_nM as it's in the input...

        rho_mm = self.mmdescriptor.zeros(dtype=dtype)
        
        pblas_simple_gemm(self.mmdescriptor,
                          self.mmdescriptor,
                          self.mmdescriptor,
                          Cf_mm, C_mm, rho_mm, transa='T')
        return rho_mm
    def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        """Calculate density matrix from occupations and coefficients.

        Presently this function performs the usual scalapack 3-step trick:
        redistribute-numbercrunching-backdistribute.
        
        
        Notes on future performance improvement.
        
        As per the current framework, C_nM exists as copies on each
        domain, i.e. this is not parallel over domains.  We'd like to
        correct this and have an efficient distribution using e.g. the
        block communicator.

        The diagonalization routine and other parts of the code should
        however be changed to accommodate the following scheme:
        
        Keep coefficients in C_mm form after the diagonalization.
        rho_mm can then be directly calculated from C_mm without
        redistribution, after which we only need to redistribute
        rho_mm across domains.
        
        """
        dtype = C_nM.dtype
        rho_mm = self.calculate_blocked_density_matrix(f_n, C_nM)
        rback = Redistributor(self.block_comm, self.mmdescriptor,
                              self.mM_unique_descriptor)
        rho1_mM = self.mM_unique_descriptor.zeros(dtype=dtype)
        rback.redistribute(rho_mm, rho1_mM)
        del rho_mm

        if rho_mM is None:
            if self.gd.rank == 0:
                rho_mM = rho1_mM
            else:
                rho_mM = self.mMdescriptor.zeros(dtype=dtype)

        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM
class BlacsOrbitalLayouts(BlacsLayouts):
    """ScaLAPACK Dense Linear Algebra.

    This class is instantiated in LCAO.  Not for casual use, at least for now.
    
    Requires two distributors and three descriptors for initialization
    as well as grid descriptors and band descriptors. Distributors are
    for cols2blocks (1D -> 2D BLACS grid) and blocks2cols (2D -> 1D
    BLACS grid). ScaLAPACK operations must occur on 2D BLACS grid for
    performance and scalability.

    _general_diagonalize is "hard-coded" for LCAO.
    Expects both Hamiltonian and Overlap matrix to be on the 2D BLACS grid.
    This is done early on to save memory.
    """
    # XXX rewrite this docstring a bit!

    # This class 'describes' all the LCAO Blacs-related layouts
    def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                 blocksize, nao, timer=nulltimer):
        BlacsLayouts.__init__(self, gd, bd, block_comm, dtype,
                              mcpus, ncpus, blocksize, timer)
        nbands = bd.nbands
        self.blocksize = blocksize
        self.mynbands = mynbands = bd.mynbands
        
        self.orbital_comm = self.bd.comm
        self.naoblocksize = naoblocksize = -((-nao) // self.orbital_comm.size)
        self.nao = nao

        # Range of basis functions for BLACS distribution of matrices:
        self.Mmax = nao
        self.Mstart = bd.comm.rank * naoblocksize
        self.Mstop = min(self.Mstart + naoblocksize, self.Mmax)
        self.mynao = self.Mstop - self.Mstart

        # Column layout for one matrix per band rank:
        self.columngrid = BlacsGrid(bd.comm, bd.comm.size, 1)
        self.mMdescriptor = self.columngrid.new_descriptor(nao, nao,
                                                           naoblocksize, nao)
        self.nMdescriptor = self.columngrid.new_descriptor(nbands, nao,
                                                           mynbands, nao)

        #parallelprint(world, (mynao, self.mMdescriptor.shape))

        # Column layout for one matrix in total (only on grid masters):
        self.single_column_grid = BlacsGrid(self.column_comm, bd.comm.size, 1)
        self.mM_unique_descriptor = self.single_column_grid.new_descriptor( \
            nao, nao, naoblocksize, nao)

        # nM_unique_descriptor is meant to hold the coefficients after
        # diagonalization.  BLACS requires it to be nao-by-nao, but
        # we only fill meaningful data into the first nbands columns.
        #
        # The array will then be trimmed and broadcast across
        # the grid descriptor's communicator.
        self.nM_unique_descriptor = self.single_column_grid.new_descriptor( \
            nbands, nao, mynbands, nao)

        # Fully blocked grid for diagonalization with many CPUs:
        self.mmdescriptor = self.blockgrid.new_descriptor(nao, nao, blocksize,
                                                          blocksize)

        #self.nMdescriptor = nMdescriptor
        self.mM2mm = Redistributor(self.block_comm, self.mM_unique_descriptor,
                                   self.mmdescriptor)
        self.mm2nM = Redistributor(self.block_comm, self.mmdescriptor,
                                   self.nM_unique_descriptor)

    def diagonalize(self, H_mm, C_nM, eps_n, S_mm):
        # C_nM needs to be simultaneously compatible with:
        # 1. outdescriptor
        # 2. broadcast with gd.comm
        # We will does this with a dummy buffer C2_nM
        indescriptor = self.mM2mm.srcdescriptor  # cols2blocks
        outdescriptor = self.mm2nM.dstdescriptor  # blocks2cols
        blockdescriptor = self.mM2mm.dstdescriptor  # cols2blocks

        dtype = S_mm.dtype
        eps_M = np.empty(C_nM.shape[-1])  # empty helps us debug
        subM, subN = outdescriptor.gshape
        
        C_mm = blockdescriptor.zeros(dtype=dtype)
        self.timer.start('General diagonalize')
        # general_diagonalize_ex may have a buffer overflow, so
        # we no longer use it
        #blockdescriptor.general_diagonalize_ex(H_mm, S_mm.copy(), C_mm, eps_M,
        #                                       UL='L', iu=self.bd.nbands)
        blockdescriptor.general_diagonalize_dc(H_mm, S_mm.copy(), C_mm, eps_M,
                                               UL='L')
        self.timer.stop('General diagonalize')
 
        # Make C_nM compatible with the redistributor
        self.timer.start('Redistribute coefs')
        if outdescriptor:
            C2_nM = C_nM
        else:
            C2_nM = outdescriptor.empty(dtype=dtype)
        assert outdescriptor.check(C2_nM)
        self.mm2nM.redistribute(C_mm, C2_nM, subM, subN)  # blocks2cols
        self.timer.stop('Redistribute coefs')

        self.timer.start('Send coefs to domains')
        # eps_M is already on block_comm.rank = 0
        # easier to broadcast eps_M to all and
        # get the correct slice afterward.
        self.block_comm.broadcast(eps_M, 0)
        eps_n[:] = eps_M[self.bd.get_slice()]
        self.gd.comm.broadcast(C_nM, 0)
        self.timer.stop('Send coefs to domains')

    def distribute_overlap_matrix(self, S_qmM, root=0,
                                  add_hermitian_conjugate=False):
        # Some MPI implementations need a lot of memory to do large
        # reductions.  To avoid trouble, we do comm.sum on smaller blocks
        # of S (this code is also safe for arrays smaller than blocksize)
        Sflat_x = S_qmM.ravel()
        blocksize = 2**23 // Sflat_x.itemsize  # 8 MiB
        nblocks = -(-len(Sflat_x) // blocksize)
        Mstart = 0
        for i in range(nblocks):
            self.gd.comm.sum(Sflat_x[Mstart:Mstart + blocksize], root=root)
            Mstart += blocksize
        assert Mstart + blocksize >= len(Sflat_x)

        xshape = S_qmM.shape[:-2]
        nm, nM = S_qmM.shape[-2:]
        S_qmM = S_qmM.reshape(-1, nm, nM)
        
        blockdesc = self.mmdescriptor
        coldesc = self.mM_unique_descriptor
        S_qmm = blockdesc.zeros(len(S_qmM), S_qmM.dtype)

        if not coldesc:  # XXX ugly way to sort out inactive ranks
            S_qmM = coldesc.zeros(len(S_qmM), S_qmM.dtype)
        
        self.timer.start('Distribute overlap matrix')
        for S_mM, S_mm in zip(S_qmM, S_qmm):
            self.mM2mm.redistribute(S_mM, S_mm)
            if add_hermitian_conjugate:
                if blockdesc.active:
                    pblas_tran(1.0, S_mm.copy(), 1.0, S_mm,
                               blockdesc, blockdesc)
                
        self.timer.stop('Distribute overlap matrix')
        return S_qmm.reshape(xshape + blockdesc.shape)

    def get_overlap_matrix_shape(self):
        return self.mmdescriptor.shape

    def calculate_blocked_density_matrix(self, f_n, C_nM):
        nbands = self.bd.nbands
        mynbands = self.bd.mynbands
        nao = self.nao
        dtype = C_nM.dtype
        
        self.nMdescriptor.checkassert(C_nM)
        if self.gd.rank == 0:
            Cf_nM = (C_nM * f_n[:, None]).conj()
        else:
            C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
            Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

        r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                          self.mmdescriptor)

        Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(Cf_nM, Cf_mm, nbands, nao)
        del Cf_nM
        
        C_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(C_nM, C_mm, nbands, nao)
        # no use to delete C_nM as it's in the input...

        rho_mm = self.mmdescriptor.zeros(dtype=dtype)
        
        pblas_simple_gemm(self.mmdescriptor,
                          self.mmdescriptor,
                          self.mmdescriptor,
                          Cf_mm, C_mm, rho_mm, transa='T')
        return rho_mm

    def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        """Calculate density matrix from occupations and coefficients.

        Presently this function performs the usual scalapack 3-step trick:
        redistribute-numbercrunching-backdistribute.
        
        
        Notes on future performance improvement.
        
        As per the current framework, C_nM exists as copies on each
        domain, i.e. this is not parallel over domains.  We'd like to
        correct this and have an efficient distribution using e.g. the
        block communicator.

        The diagonalization routine and other parts of the code should
        however be changed to accommodate the following scheme:
        
        Keep coefficients in C_mm form after the diagonalization.
        rho_mm can then be directly calculated from C_mm without
        redistribution, after which we only need to redistribute
        rho_mm across domains.
        
        """
        dtype = C_nM.dtype
        rho_mm = self.calculate_blocked_density_matrix(f_n, C_nM)
        rback = Redistributor(self.block_comm, self.mmdescriptor,
                              self.mM_unique_descriptor)
        rho1_mM = self.mM_unique_descriptor.zeros(dtype=dtype)
        rback.redistribute(rho_mm, rho1_mM)
        del rho_mm

        if rho_mM is None:
            if self.gd.rank == 0:
                rho_mM = rho1_mM
            else:
                rho_mM = self.mMdescriptor.zeros(dtype=dtype)

        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def distribute_to_columns(self, rho_mm, srcdescriptor):
        redistributor = Redistributor(self.block_comm, # XXX
                                      srcdescriptor,
                                      self.mM_unique_descriptor)
        rho_mM = redistributor.redistribute(rho_mm)
        if self.gd.rank != 0:
            rho_mM = self.mMdescriptor.zeros(dtype=rho_mm.dtype)
        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def oldcalculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        # This version is parallel over the band descriptor only.
        # This is inefficient, but let's keep it for a while in case
        # there's trouble with the more efficient version
        nbands = self.bd.nbands
        mynbands = self.bd.mynbands
        nao = self.nao
        
        if rho_mM is None:
            rho_mM = self.mMdescriptor.zeros(dtype=C_nM.dtype)
        
        Cf_nM = (C_nM * f_n[:, None]).conj()
        pblas_simple_gemm(self.nMdescriptor, self.nMdescriptor,
                          self.mMdescriptor, Cf_nM, C_nM, rho_mM, transa='T')
        return rho_mM

    def get_transposed_density_matrix(self, f_n, C_nM, rho_mM=None):
        return self.calculate_density_matrix(f_n, C_nM, rho_mM).conj()

    def get_description(self):
        (title, template) = BlacsLayouts.get_description(self)
        bg = self.blockgrid
        desc = self.mmdescriptor
        s = template % (bg.nprow, bg.npcol, desc.mb, desc.nb)
        return ' '.join([title, s])