Example #1
    def calculate_blocked_density_matrix(self, f_n, C_nM):
        nbands = self.bd.nbands
        nao = self.nao
        dtype = C_nM.dtype

        self.nMdescriptor.checkassert(C_nM)
        if self.gd.rank == 0:
            Cf_nM = (C_nM * f_n[:, None]).conj()
        else:
            C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
            Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

        r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                          self.mmdescriptor)

        Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(Cf_nM, Cf_mm, nbands, nao)
        del Cf_nM

        C_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(C_nM, C_mm, nbands, nao)
        # no use to delete C_nM as it's in the input...

        rho_mm = self.mmdescriptor.zeros(dtype=dtype)

        pblas_simple_gemm(self.mmdescriptor,
                          self.mmdescriptor,
                          self.mmdescriptor,
                          Cf_mm,
                          C_mm,
                          rho_mm,
                          transa='T')
        return rho_mm
Example #2
    def scalapack_diagonalize(self, H_sS):

        mb = 32
        N = self.nS
        
        g1 = BlacsGrid(world, size, 1)
        g2 = BlacsGrid(world, size // 2, 2)
        nndesc1 = g1.new_descriptor(N, N, self.nS_local, N)
        nndesc2 = g2.new_descriptor(N, N, mb, mb)
        
        A_ss = nndesc2.empty(dtype=H_sS.dtype)
        redistributor = Redistributor(world, nndesc1, nndesc2)
        redistributor.redistribute(H_sS, A_ss)
        
        # diagonalize
        v_ss = nndesc2.zeros(dtype=A_ss.dtype)
        w_S = np.zeros(N, dtype=float)
        nndesc2.diagonalize_dc(A_ss, v_ss, w_S, 'L')
        
        # distribute the eigenvectors to master
        v_sS = np.zeros_like(H_sS)
        redistributor = Redistributor(world, nndesc2, nndesc1)
        redistributor.redistribute(v_ss, v_sS)

#        v2_SS = np.zeros((self.nS, self.nS), dtype=complex)
#        world.all_gather(v_sS, v2_SS)
        
        return w_S, v_sS.conj()
Example #3
    def scalapack_diagonalize(self, H_sS):

        mb = 32
        N = self.nS

        g1 = BlacsGrid(world, size, 1)
        g2 = BlacsGrid(world, size // 2, 2)
        nndesc1 = g1.new_descriptor(N, N, self.nS_local, N)
        nndesc2 = g2.new_descriptor(N, N, mb, mb)

        A_ss = nndesc2.empty(dtype=H_sS.dtype)
        redistributor = Redistributor(world, nndesc1, nndesc2)
        redistributor.redistribute(H_sS, A_ss)

        # diagonalize
        v_ss = nndesc2.zeros(dtype=A_ss.dtype)
        w_S = np.zeros(N, dtype=float)
        nndesc2.diagonalize_dc(A_ss, v_ss, w_S, 'L')

        # distribute the eigenvectors to master
        v_sS = np.zeros_like(H_sS)
        redistributor = Redistributor(world, nndesc2, nndesc1)
        redistributor.redistribute(v_ss, v_sS)

        #        v2_SS = np.zeros((self.nS, self.nS), dtype=complex)
        #        world.all_gather(v_sS, v2_SS)

        return w_S, v_sS.conj()
Example #4
    def diagonalize(self):
        """The t and T represent local and global
        eigenstate indices, respectively."""

        print('Diagonalizing Hamiltonian', file=self.fd)

        # A non-Hermitian matrix can only be diagonalized with numpy.linalg.eig
        if not self.td:
            print('  Using numpy.linalg.eig...', file=self.fd)
            print('  Eliminated %s pair orbitals' % len(self.excludef_S),
                  file=self.fd)

            self.H_SS = self.collect_A_SS(self.H_sS)
            self.w_T = np.zeros(self.nS - len(self.excludef_S), complex)
            if world.rank == 0:
                self.H_SS = np.delete(self.H_SS, self.excludef_S, axis=0)
                self.H_SS = np.delete(self.H_SS, self.excludef_S, axis=1)
                self.w_T, self.v_ST = np.linalg.eig(self.H_SS)
            world.broadcast(self.w_T, 0)
            self.df_S = np.delete(self.df_S, self.excludef_S)
            self.rhoG0_S = np.delete(self.rhoG0_S, self.excludef_S)
        # Here the eigenvectors are returned as complex conjugated rows
        else:
            if world.size == 1:
                print('  Using lapack...', file=self.fd)
                from gpaw.utilities.lapack import diagonalize
                self.w_T = np.zeros(self.nS)
                diagonalize(self.H_sS, self.w_T)
                self.v_St = self.H_sS.conj().T
            else:
                print('  Using scalapack...', file=self.fd)
                nS = self.nS
                ns = -(-self.kd.nbzkpts // world.size) * (self.nv * self.nc *
                                                          self.spins *
                                                          (self.spinors + 1)**2)
                grid = BlacsGrid(world, world.size, 1)
                desc = grid.new_descriptor(nS, nS, ns, nS)

                desc2 = grid.new_descriptor(nS, nS, 2, 2)
                H_tmp = desc2.zeros(dtype=complex)
                r = Redistributor(world, desc, desc2)
                r.redistribute(self.H_sS, H_tmp)

                self.w_T = np.empty(nS)
                v_tmp = desc2.empty(dtype=complex)
                desc2.diagonalize_dc(H_tmp, v_tmp, self.w_T)

                r = Redistributor(grid.comm, desc2, desc)
                self.v_St = desc.zeros(dtype=complex)
                r.redistribute(v_tmp, self.v_St)
                self.v_St = self.v_St.conj().T

        if self.write_v and self.td:
            # Cannot use par_save without td
            self.par_save('v_TS.ulm', 'v_TS', self.v_St.T)

        return
Example #5
    def redistribute_H(self, H_sS):

        g1 = BlacsGrid(world, size, 1)
        g2 = BlacsGrid(world, 1, size)
        N = self.nS
        nndesc1 = g1.new_descriptor(N, N, self.nS_local, N)
        nndesc2 = g2.new_descriptor(N, N, N, self.nS_local)
        
        H_Ss = nndesc2.empty(dtype=H_sS.dtype)
        redistributor = Redistributor(world, nndesc1, nndesc2)
        redistributor.redistribute(H_sS, H_Ss)

        return H_Ss
Example #6
    def redistribute_H(self, H_sS):

        g1 = BlacsGrid(world, size, 1)
        g2 = BlacsGrid(world, 1, size)
        N = self.nS
        nndesc1 = g1.new_descriptor(N, N, self.nS_local, N)
        nndesc2 = g2.new_descriptor(N, N, N, self.nS_local)

        H_Ss = nndesc2.empty(dtype=H_sS.dtype)
        redistributor = Redistributor(world, nndesc1, nndesc2)
        redistributor.redistribute(H_sS, H_Ss)

        return H_Ss
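
A minimal standalone sketch of the same row-block to column-block switch, with the class attributes (self.nS, self.nS_local) replaced by plain variables. It assumes a ScaLAPACK-enabled GPAW build and that world and size are the gpaw.mpi objects used above; run it under MPI.

import numpy as np
from gpaw.mpi import world, size
from gpaw.blacs import BlacsGrid, Redistributor

N = 16                                  # global matrix size (toy value)
nlocal = -(-N // size)                  # ceil(N / size) rows or columns per rank

g1 = BlacsGrid(world, size, 1)          # ranks stacked along matrix rows
g2 = BlacsGrid(world, 1, size)          # ranks stacked along matrix columns
desc_rows = g1.new_descriptor(N, N, nlocal, N)
desc_cols = g2.new_descriptor(N, N, N, nlocal)

H_sS = desc_rows.zeros(dtype=complex)   # my block of rows of the full matrix
H_sS[:] = world.rank

H_Ss = desc_cols.empty(dtype=complex)   # receives my block of columns
Redistributor(world, desc_rows, desc_cols).redistribute(H_sS, H_Ss)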
Example #7
def parallel_eigh(matrixfile, blacsgrid=(4, 2), blocksize=64):
    """Diagonalize matrix in parallel"""
    assert np.prod(blacsgrid) == world.size
    grid = BlacsGrid(world, *blacsgrid)

    if world.rank == MASTER:
        H_MM = np.load(matrixfile)
        assert H_MM.ndim == 2
        assert H_MM.shape[0] == H_MM.shape[1]
        NM = len(H_MM)
    else:
        NM = 0
    NM = world.sum(NM) # Distribute matrix shape to all nodes

    # descriptor for the individual blocks
    block_desc = grid.new_descriptor(NM, NM, blocksize, blocksize)

    # descriptor for global array on MASTER
    local_desc = grid.new_descriptor(NM, NM, NM, NM)

    # Make some dummy array on all the slaves
    if world.rank != MASTER:
        H_MM = local_desc.zeros()
    assert local_desc.check(H_MM)

    # The local version of the matrix
    H_mm = block_desc.empty()

    # Distribute global array to smaller blocks
    redistributor = Redistributor(world, local_desc, block_desc)
    redistributor.redistribute(H_MM, H_mm)

    # Allocate arrays for eigenvalues and -vectors
    eps_M = np.empty(NM)
    C_mm = block_desc.empty()
    block_desc.diagonalize_ex(H_mm, C_mm, eps_M)

    # Collect eigenvectors on MASTER
    C_MM = local_desc.empty()
    redistributor2 = Redistributor(world, block_desc, local_desc)
    redistributor2.redistribute(C_mm, C_MM)

    # Return eigenvalues and -vectors on Master
    if world.rank == MASTER:
        return eps_M, C_MM
    else:
        return None, None
Example #8
def parallel_eigh(matrixfile, blacsgrid=(4, 2), blocksize=64):
    """Diagonalize matrix in parallel"""
    assert np.prod(blacsgrid) == world.size
    grid = BlacsGrid(world, *blacsgrid)

    if world.rank == MASTER:
        H_MM = np.load(matrixfile)
        assert H_MM.ndim == 2
        assert H_MM.shape[0] == H_MM.shape[1]
        NM = len(H_MM)
    else:
        NM = 0
    NM = world.sum(NM) # Distribute matrix shape to all nodes

    # descriptor for the individual blocks
    block_desc = grid.new_descriptor(NM, NM, blocksize, blocksize)

    # descriptor for global array on MASTER
    local_desc = grid.new_descriptor(NM, NM, NM, NM)

    # Make some dummy array on all the slaves
    if world.rank != MASTER:
        H_MM = local_desc.zeros()
    assert local_desc.check(H_MM)

    # The local version of the matrix
    H_mm = block_desc.empty()

    # Distribute global array to smaller blocks
    redistributor = Redistributor(world, local_desc, block_desc)
    redistributor.redistribute(H_MM, H_mm)

    # Allocate arrays for eigenvalues and -vectors
    eps_M = np.empty(NM)
    C_mm = block_desc.empty()
    block_desc.diagonalize_ex(H_mm, C_mm, eps_M)

    # Collect eigenvectors on MASTER
    C_MM = local_desc.empty()
    redistributor2 = Redistributor(world, block_desc, local_desc)
    redistributor2.redistribute(C_mm, C_MM)

    # Return eigenvalues and -vectors on Master
    if world.rank == MASTER:
        return eps_M, C_MM
    else:
        return None, None
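
A hedged driver sketch for parallel_eigh() above, assuming a shared filesystem, that world comes from gpaw.mpi, and that the script runs on 8 ranks so the default (4, 2) BLACS grid matches world.size; the file name is only an illustration.

import numpy as np
from gpaw.mpi import world

MASTER = 0  # rank 0, matching the MASTER used by parallel_eigh() (assumption)

if world.rank == MASTER:
    H = np.random.rand(512, 512)
    H = 0.5 * (H + H.T)              # symmetrize so the matrix is Hermitian
    np.save('H_MM.npy', H)           # illustrative file name
world.barrier()                      # make sure the file exists before use

# run on 8 MPI ranks, e.g.:  mpirun -np 8 python driver.py
eps_M, C_MM = parallel_eigh('H_MM.npy', blacsgrid=(4, 2), blocksize=64)
if world.rank == MASTER:
    print('lowest eigenvalue:', eps_M[0])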
Example #9
def distribute_MM(wfs, a_MM):
    ksl = wfs.ksl
    if not ksl.using_blacs:
        return a_MM

    dtype = a_MM.dtype
    ksl_comm = ksl.block_comm
    NM = ksl.nao
    grid = BlacsGrid(ksl_comm, 1, 1)
    MM_descriptor = grid.new_descriptor(NM, NM, NM, NM)
    MM2mm = Redistributor(ksl_comm, MM_descriptor, ksl.mmdescriptor)
    if ksl_comm.rank != 0:
        a_MM = MM_descriptor.empty(dtype=dtype)

    a_mm = ksl.mmdescriptor.empty(dtype=dtype)
    MM2mm.redistribute(a_MM, a_mm)
    return a_mm
Example #10
    def redistribute(self, in_wGG, out_x=None):
        """Redistribute array.
        
        Switch between two kinds of parallel distributions:
            
        1) parallel over G-vectors (second dimension of in_wGG)
        2) parallel over frequency (first dimension of in_wGG)

        Returns new array using the memory in the 1-d array out_x.
        """
        
        comm = self.blockcomm
        
        if comm.size == 1:
            return in_wGG
            
        nw = len(self.omega_w)
        nG = in_wGG.shape[2]
        mynw = (nw + comm.size - 1) // comm.size
        mynG = (nG + comm.size - 1) // comm.size
        
        bg1 = BlacsGrid(comm, comm.size, 1)
        bg2 = BlacsGrid(comm, 1, comm.size)
        md1 = BlacsDescriptor(bg1, nw, nG**2, mynw, nG**2)
        md2 = BlacsDescriptor(bg2, nw, nG**2, nw, mynG * nG)
        
        if len(in_wGG) == nw:
            mdin = md2
            mdout = md1
        else:
            mdin = md1
            mdout = md2
            
        r = Redistributor(comm, mdin, mdout)
        
        outshape = (mdout.shape[0], mdout.shape[1] // nG, nG)
        if out_x is None:
            out_wGG = np.empty(outshape, complex)
        else:
            out_wGG = out_x[:np.prod(outshape)].reshape(outshape)

        r.redistribute(in_wGG.reshape(mdin.shape),
                       out_wGG.reshape(mdout.shape))
        
        return out_wGG
Example #11
    def redistribute(self, in_wGG, out_x=None):
        """Redistribute array.
        
        Switch between two kinds of parallel distributions:
            
        1) parallel over G-vectors (second dimension of in_wGG)
        2) parallel over frequency (first dimension of in_wGG)

        Returns new array using the memory in the 1-d array out_x.
        """

        comm = self.blockcomm

        if comm.size == 1:
            return in_wGG

        nw = len(self.omega_w)
        nG = in_wGG.shape[2]
        mynw = (nw + comm.size - 1) // comm.size
        mynG = (nG + comm.size - 1) // comm.size

        bg1 = BlacsGrid(comm, comm.size, 1)
        bg2 = BlacsGrid(comm, 1, comm.size)
        md1 = BlacsDescriptor(bg1, nw, nG**2, mynw, nG**2)
        md2 = BlacsDescriptor(bg2, nw, nG**2, nw, mynG * nG)

        if len(in_wGG) == nw:
            mdin = md2
            mdout = md1
        else:
            mdin = md1
            mdout = md2

        r = Redistributor(comm, mdin, mdout)

        outshape = (mdout.shape[0], mdout.shape[1] // nG, nG)
        if out_x is None:
            out_wGG = np.empty(outshape, complex)
        else:
            out_wGG = out_x[:np.prod(outshape)].reshape(outshape)

        r.redistribute(in_wGG.reshape(mdin.shape),
                       out_wGG.reshape(mdout.shape))

        return out_wGG
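
The same frequency/G-vector switch as a hedged standalone sketch, with self.blockcomm and self.omega_w replaced by plain variables; it assumes a ScaLAPACK-enabled GPAW build and uses only the gpaw.blacs calls already appearing in the method above.

import numpy as np
from gpaw.mpi import world
from gpaw.blacs import BlacsGrid, BlacsDescriptor, Redistributor

comm = world
nw, nG = 8, 6                              # toy sizes: frequencies, G-vectors
mynw = (nw + comm.size - 1) // comm.size
mynG = (nG + comm.size - 1) // comm.size

bg1 = BlacsGrid(comm, comm.size, 1)        # frequencies distributed over ranks
bg2 = BlacsGrid(comm, 1, comm.size)        # G-vector pairs distributed instead
md1 = BlacsDescriptor(bg1, nw, nG**2, mynw, nG**2)
md2 = BlacsDescriptor(bg2, nw, nG**2, nw, mynG * nG)

# Start from the frequency-distributed layout: my share of the w axis.
in_wGG = np.zeros((md1.shape[0], nG, nG), complex)
in_wGG[:] = comm.rank

# Switch to the G-distributed layout.
out_wGG = np.empty((md2.shape[0], md2.shape[1] // nG, nG), complex)
Redistributor(comm, md1, md2).redistribute(in_wGG.reshape(md1.shape),
                                           out_wGG.reshape(md2.shape))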
Example #12
def scal_diagonalize(A, nodes='master'):
    # Diagonalize matrix A (size N*N) with scalapack
    # Usage: eps, B = scal_diagonalize(A)
    # eps and B are the eigenvalues and eigenvectors
    # nodes = 'master': eigenvectors only available on master node
    # nodes = 'all': eigenvectors broadcast to all nodes

    # make sure A is N*N, and hermitian
    N = A.shape[0]
    assert A.shape[0] == A.shape[1]
    for i in range(N):
        for j in range(i, N):
            assert A[i, j] == A[j, i].conj()

    # create blacs descriptor
    mb = 64
    g = BlacsGrid(world, 2, size // 2)
    nndesc1 = g.new_descriptor(N, N, N, N)
    nndesc2 = g.new_descriptor(N, N, mb, mb)

    # distribute A to blacs grid A_
    if rank != 0:
        A = nndesc1.zeros(dtype=A.dtype)
    A_ = nndesc2.empty(dtype=A.dtype)
    redistributor = Redistributor(world, nndesc1, nndesc2)
    redistributor.redistribute(A, A_)

    # diagonalize
    B_ = nndesc2.zeros(dtype=A.dtype)
    eps = np.zeros(N, dtype=A.dtype)
    nndesc2.diagonalize_dc(A_, B_, eps, 'L')

    # distribute the eigenvectors to master
    B = np.zeros_like(A)
    redistributor = Redistributor(world, nndesc2, nndesc1)
    redistributor.redistribute(B_, B)

    if nodes == 'master':
        return eps, B
    elif nodes == 'all':
        if rank != 0:
            B = np.zeros((N, N))
        world.broadcast(B, 0)
        return eps, B
Example #13
def scal_diagonalize(A, nodes='master'):
    # Diagonalize matrix A (size N*N) with scalapack
    # Usage: eps, B = scal_diagonalize(A)
    # eps and B are the eigenvalues and eigenvectors
    # nodes = 'master': eigenvectors only available on master node
    # nodes = 'all': eigenvectors broadcast to all nodes

    # make sure A is N*N, and hermitian
    N = A.shape[0]
    assert A.shape[0] == A.shape[1]
    for i in range(N):
        for j in range(i, N):
            assert A[i, j] == A[j, i].conj()

    # create blacs descriptor
    mb = 64
    g = BlacsGrid(world, 2, size // 2)
    nndesc1 = g.new_descriptor(N, N, N, N)
    nndesc2 = g.new_descriptor(N, N, mb, mb)

    # distribute A to blacs grid A_
    if rank != 0:
        A = nndesc1.zeros(dtype=A.dtype)
    A_ = nndesc2.empty(dtype=A.dtype)
    redistributor = Redistributor(world, nndesc1, nndesc2)
    redistributor.redistribute(A, A_)

    # diagonalize
    B_ = nndesc2.zeros(dtype=A.dtype)
    eps = np.zeros(N, dtype=A.dtype)
    nndesc2.diagonalize_dc(A_, B_, eps, 'L')

    # distribute the eigenvectors to master
    B = np.zeros_like(A)
    redistributor = Redistributor(world, nndesc2, nndesc1)
    redistributor.redistribute(B_, B)

    if nodes == 'master':
        return eps, B
    elif nodes == 'all':
        if rank != 0:
            B = np.zeros((N, N))
        world.broadcast(B, 0)
        return eps, B
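
A hedged usage sketch for scal_diagonalize(), assuming rank and size are the gpaw.mpi objects used above and that size is even (the routine builds a 2 x size//2 BLACS grid); every rank constructs the same Hermitian matrix so the Hermiticity check passes everywhere.

import numpy as np
from gpaw.mpi import rank, size

assert size % 2 == 0          # the 2 x size//2 grid above needs an even rank count

N = 256
np.random.seed(42)            # identical matrix on every rank
A = np.random.rand(N, N)
A = 0.5 * (A + A.T)           # real symmetric, hence Hermitian

eps, B = scal_diagonalize(A, nodes='all')   # eigenvectors broadcast to all ranks
if rank == 0:
    print('eigenvalue range:', eps.min(), eps.max())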
Example #14
    def distribute_to_columns(self, rho_mm, srcdescriptor):
        redistributor = Redistributor(
            self.block_comm,  # XXX
            srcdescriptor,
            self.mM_unique_descriptor)
        rho_mM = redistributor.redistribute(rho_mm)
        if self.gd.rank != 0:
            rho_mM = self.mMdescriptor.zeros(dtype=rho_mm.dtype)
        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM
Example #15
def test(comm, M, N, mcpus, ncpus, mb, nb):
    grid0 = BlacsGrid(comm, 1, 1)
    desc0 = grid0.new_descriptor(M, N, M, N, 0, 0)
    A_mn = desc0.zeros(dtype=float)
    A_mn[:] = comm.size + 1

    grid1 = BlacsGrid(comm, mcpus, ncpus)
    desc1 = grid1.new_descriptor(M, N, mb, nb, 0, 0)  # ???
    B_mn = desc1.zeros(dtype=float)
    B_mn[:] = comm.rank

    if comm.rank == 0:
        msg = 'Slices of global matrix indices by rank'
        print(msg)
        print('-' * len(msg))

    for rank in range(comm.size):
        comm.barrier()
        if rank == comm.rank:
            print('Rank %d:' % rank)
            last_Mstart = -1
            for Mstart, Mstop, Nstart, Nstop, block in desc1.my_blocks(B_mn):
                if Mstart > last_Mstart and last_Mstart >= 0:
                    print()
                print('[%3d:%3d, %3d:%3d]' % (Mstart, Mstop, Nstart, Nstop),
                      end=' ')
                last_Mstart = Mstart
                assert (block == comm.rank).all()
                #print block
                #print
            print()
            print()
        comm.barrier()

    redistributor = Redistributor(comm, desc1, desc0)
    redistributor.redistribute(B_mn, A_mn)

    if comm.rank == 0:
        msg = 'Rank where each element of the global matrix is stored'
        print(msg)
        print('-' * len(msg))
        print(A_mn)
Example #16
def test(comm, M, N, mcpus, ncpus, mb, nb):
    grid0 = BlacsGrid(comm, 1, 1)
    desc0 = grid0.new_descriptor(M, N, M, N, 0, 0)
    A_mn = desc0.zeros(dtype=float)
    A_mn[:] = comm.size + 1

    grid1 = BlacsGrid(comm, mcpus, ncpus)
    desc1 = grid1.new_descriptor(M, N, mb, nb, 0, 0) # ???
    B_mn = desc1.zeros(dtype=float)
    B_mn[:] = comm.rank

    if comm.rank == 0:
        msg = 'Slices of global matrix indices by rank'
        print msg
        print '-' * len(msg)

    for rank in range(comm.size):
        comm.barrier()
        if rank == comm.rank:
            print 'Rank %d:' % rank
            last_Mstart = -1
            for Mstart, Mstop, Nstart, Nstop, block in desc1.my_blocks(B_mn):
                if Mstart > last_Mstart and last_Mstart >= 0:
                    print
                print '[%3d:%3d, %3d:%3d]' % (Mstart, Mstop, Nstart, Nstop),
                last_Mstart = Mstart
                assert (block == comm.rank).all()
                #print block
                #print
            print
            print
        comm.barrier()
    
    redistributor = Redistributor(comm, desc1, desc0)
    redistributor.redistribute(B_mn, A_mn)

    if comm.rank == 0:
        msg = 'Rank where each element of the global matrix is stored'
        print msg
        print '-' * len(msg)
        print A_mn
Example #17
    def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        """Calculate density matrix from occupations and coefficients.

        Presently this function performs the usual scalapack 3-step trick:
        redistribute-numbercrunching-backdistribute.
        
        
        Notes on future performance improvement.
        
        As per the current framework, C_nM exists as copies on each
        domain, i.e. this is not parallel over domains.  We'd like to
        correct this and have an efficient distribution using e.g. the
        block communicator.

        The diagonalization routine and other parts of the code should
        however be changed to accommodate the following scheme:
        
        Keep coefficients in C_mm form after the diagonalization.
        rho_mm can then be directly calculated from C_mm without
        redistribution, after which we only need to redistribute
        rho_mm across domains.
        
        """
        dtype = C_nM.dtype
        rho_mm = self.calculate_blocked_density_matrix(f_n, C_nM)
        rback = Redistributor(self.block_comm, self.mmdescriptor,
                              self.mM_unique_descriptor)
        rho1_mM = self.mM_unique_descriptor.zeros(dtype=dtype)
        rback.redistribute(rho_mm, rho1_mM)
        del rho_mm

        if rho_mM is None:
            if self.gd.rank == 0:
                rho_mM = rho1_mM
            else:
                rho_mM = self.mMdescriptor.zeros(dtype=dtype)

        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM
Example #18
def collect_wuMM(wfs, a_wuMM, w, s, k):
    # This function is based on
    # gpaw/wavefunctions/base.py: WaveFunctions.collect_auxiliary()

    dtype = a_wuMM[0][0].dtype

    ksl = wfs.ksl
    NM = ksl.nao
    kpt_rank, u = wfs.kd.get_rank_and_index(s, k)

    ksl_comm = ksl.block_comm

    if wfs.kd.comm.rank == kpt_rank:
        a_MM = a_wuMM[w][u]

        # Collect within blacs grid
        if ksl.using_blacs:
            a_mm = a_MM
            grid = BlacsGrid(ksl_comm, 1, 1)
            MM_descriptor = grid.new_descriptor(NM, NM, NM, NM)
            mm2MM = Redistributor(ksl_comm, ksl.mmdescriptor, MM_descriptor)

            a_MM = MM_descriptor.empty(dtype=dtype)
            mm2MM.redistribute(a_mm, a_MM)

        # KSL master sends a_MM to the global master
        if ksl_comm.rank == 0:
            if kpt_rank == 0:
                assert wfs.world.rank == 0
                # I have it already
                return a_MM
            else:
                wfs.kd.comm.send(a_MM, 0, 2017)
                return None
    elif ksl_comm.rank == 0 and kpt_rank != 0:
        assert wfs.world.rank == 0
        a_MM = np.empty((NM, NM), dtype=dtype)
        wfs.kd.comm.receive(a_MM, kpt_rank, 2017)
        return a_MM
Example #19
    def distribute_frequencies(self, chi0_wGG):
        """Distribute frequencies to all cores."""
        
        world = self.world
        comm = self.blockcomm
        
        if world.size == 1:
            return chi0_wGG
            
        nw = len(self.omega_w)
        nG = chi0_wGG.shape[2]
        mynw = (nw + world.size - 1) // world.size
        mynG = (nG + comm.size - 1) // comm.size
  
        wa = min(world.rank * mynw, nw)
        wb = min(wa + mynw, nw)

        if self.blockcomm.size == 1:
            return chi0_wGG[wa:wb].copy()

        if self.kncomm.rank == 0:
            bg1 = BlacsGrid(comm, 1, comm.size)
            in_wGG = chi0_wGG.reshape((nw, -1))
        else:
            bg1 = DryRunBlacsGrid(mpi.serial_comm, 1, 1)
            in_wGG = np.zeros((0, 0), complex)
        md1 = BlacsDescriptor(bg1, nw, nG**2, nw, mynG * nG)
        
        bg2 = BlacsGrid(world, world.size, 1)
        md2 = BlacsDescriptor(bg2, nw, nG**2, mynw, nG**2)
        
        r = Redistributor(world, md1, md2)
        shape = (wb - wa, nG, nG)
        out_wGG = np.empty(shape, complex)
        r.redistribute(in_wGG, out_wGG.reshape((wb - wa, nG**2)))
        
        return out_wGG
Example #20
    def distribute_frequencies(self, chi0_wGG):
        """Distribute frequencies to all cores."""

        world = self.world
        comm = self.blockcomm

        if world.size == 1:
            return chi0_wGG

        nw = len(self.omega_w)
        nG = chi0_wGG.shape[2]
        mynw = (nw + world.size - 1) // world.size
        mynG = (nG + comm.size - 1) // comm.size

        wa = min(world.rank * mynw, nw)
        wb = min(wa + mynw, nw)

        if self.blockcomm.size == 1:
            return chi0_wGG[wa:wb].copy()

        if self.kncomm.rank == 0:
            bg1 = BlacsGrid(comm, 1, comm.size)
            in_wGG = chi0_wGG.reshape((nw, -1))
        else:
            bg1 = DryRunBlacsGrid(mpi.serial_comm, 1, 1)
            in_wGG = np.zeros((0, 0), complex)
        md1 = BlacsDescriptor(bg1, nw, nG**2, nw, mynG * nG)

        bg2 = BlacsGrid(world, world.size, 1)
        md2 = BlacsDescriptor(bg2, nw, nG**2, mynw, nG**2)

        r = Redistributor(world, md1, md2)
        shape = (wb - wa, nG, nG)
        out_wGG = np.empty(shape, complex)
        r.redistribute(in_wGG, out_wGG.reshape((wb - wa, nG**2)))

        return out_wGG
Example #21
class LrTDDFTLayouts:
    """BLACS layout for distributed Omega matrix in linear response
       time-dependent DFT calculations"""
    def __init__(self, sl_lrtddft, nkq, dd_comm, eh_comm):
        mcpus, ncpus, blocksize = tuple(sl_lrtddft)
        self.world = eh_comm.parent
        self.dd_comm = dd_comm
        if self.world is None:
            self.world = self.dd_comm

        # All the ranks within the domain communicator contain the Omega matrix;
        # construct the new communicator only on the domain masters.
        eh_ranks = np.arange(eh_comm.size) * dd_comm.size
        self.eh_comm2 = self.world.new_communicator(eh_ranks)

        self.eh_grid = BlacsGrid(self.eh_comm2, eh_comm.size, 1)
        self.eh_descr = self.eh_grid.new_descriptor(nkq, nkq, 1, nkq)
        self.diag_grid = BlacsGrid(self.world, mcpus, ncpus)
        self.diag_descr = self.diag_grid.new_descriptor(
            nkq, nkq, blocksize, blocksize)

        self.redistributor_in = Redistributor(self.world, self.eh_descr,
                                              self.diag_descr)
        self.redistributor_out = Redistributor(self.world, self.diag_descr,
                                               self.eh_descr)
        """
        # -----------------------------------------------------------------
        # for SCALAPACK we need TRANSPOSED MATRIX (and vector)
        # -----------------------------------------------------------------
        # M = rows, N = cols
        M = nkq*4; N = nkq*4; mb = nkq*4; nb = 4; Nrhs = 1
        # Matrix, mp=1, np=eh_comm.size
        self.eh_grid2a = BlacsGrid(self.eh_comm2, eh_comm.size, 1)
        # Vector, mp=eh_comm.size, np=1
        self.eh_grid2b = BlacsGrid(self.eh_comm2, 1, eh_comm.size)
        self.eh_descr2a = self.eh_grid2a.new_descriptor(N,    M,  nb, mb)
        self.eh_descr2b = self.eh_grid2b.new_descriptor(Nrhs, N,   1, nb)

        self.solve_descr2a =self.diag_grid.new_descriptor(N, M,
                                                          blocksize, blocksize)
        self.solve_descr2b =self.diag_grid.new_descriptor(Nrhs, N,
                                                          1, blocksize)

        self.redist_solve_in_2a = Redistributor(self.world,
                                                self.eh_descr2a,
                                                self.solve_descr2a)
        self.redist_solve_in_2b = Redistributor(self.world,
                                                self.eh_descr2b,
                                                self.solve_descr2b)

        self.redist_solve_out_2a = Redistributor(self.world,
                                                 self.solve_descr2a,
                                                 self.eh_descr2a)
        self.redist_solve_out_2b = Redistributor(self.world,
                                                 self.solve_descr2b,
                                                 self.eh_descr2b)
        """

        # -----------------------------------------------------------------
        # for SCALAPACK we need TRANSPOSED MATRIX (and vector)
        # -----------------------------------------------------------------
        # M = rows, N = cols
        M = nkq * 4
        N = nkq * 4
        mb = 4
        nb = 4
        Nrhs = 1
        # Matrix, mp=1, np=eh_comm.size
        self.eh_grid2a = BlacsGrid(self.world, dd_comm.size, eh_comm.size)
        # Vector, mp=eh_comm.size, np=1
        self.eh_grid2b = BlacsGrid(self.world, 1, dd_comm.size * eh_comm.size)
        self.eh_descr2a = self.eh_grid2a.new_descriptor(N, M, nb, mb)
        self.eh_descr2b = self.eh_grid2b.new_descriptor(Nrhs, N, Nrhs, nb)
        self.solve_descr2a = self.diag_grid.new_descriptor(
            N, M, blocksize, blocksize)
        self.solve_descr2b = self.diag_grid.new_descriptor(
            Nrhs, N, Nrhs, blocksize)

        self.redist_solve_in_2a = Redistributor(self.world, self.eh_descr2a,
                                                self.solve_descr2a)
        self.redist_solve_in_2b = Redistributor(self.world, self.eh_descr2b,
                                                self.solve_descr2b)

        self.redist_solve_out_2a = Redistributor(self.world,
                                                 self.solve_descr2a,
                                                 self.eh_descr2a)
        self.redist_solve_out_2b = Redistributor(self.world,
                                                 self.solve_descr2b,
                                                 self.eh_descr2b)

    def solve(self, A, b):
        #if 0:
        #    print 'edescr2a', rank, self.eh_descr2a.asarray()
        #    print 'edescr2b', rank, self.eh_descr2b.asarray()
        #
        #    sys.stdout.flush()
        #    self.world.barrier()
        #
        #    print 'sdescr2a', rank, self.solve_descr2a.asarray()
        #    print 'sdescr2b', rank, self.solve_descr2b.asarray()
        #
        #    sys.stdout.flush()
        #    self.world.barrier()
        #
        #    print 'A ', rank, A.shape
        #    if b is not None:
        #        print 'b ', rank, b.shape
        #
        #    sys.stdout.flush()
        #    self.world.barrier()

        A_nn = self.solve_descr2a.empty(dtype=float)
        if self.eh_descr2a.blacsgrid.is_active():
            A_Nn = A
        else:
            A_Nn = np.empty((0, 0), dtype=float)
        self.redist_solve_in_2a.redistribute(A_Nn, A_nn)

        b_n = self.solve_descr2b.empty(dtype=float)
        if self.eh_descr2b.blacsgrid.is_active():
            b_N = b.reshape(1, len(b))
        else:
            b_N = np.empty((A_Nn.shape[0], 0), dtype=float)
        self.redist_solve_in_2b.redistribute(b_N, b_n)

        #if 0:
        #    print 'A_Nn ', rank, A_Nn.shape
        #    print 'b_N  ', rank, b_N.shape
        #    sys.stdout.flush()
        #    self.world.barrier()
        #    print 'A_nn ', rank, A_nn.shape
        #    print 'b_n  ', rank, b_n.shape
        #    sys.stdout.flush()
        #    self.world.barrier()
        #
        #
        #    print 'b_N  ', rank, b_N
        #    sys.stdout.flush()
        #    self.world.barrier()
        #    print 'b_n ', rank, b_n
        #    sys.stdout.flush()
        #    self.world.barrier()
        #
        #    print 'A_Nn  ', rank, A_Nn
        #    sys.stdout.flush()
        #    self.world.barrier()
        #    print 'A_nn ', rank, A_nn
        #    sys.stdout.flush()
        #    self.world.barrier()

        info = 0
        if self.solve_descr2a.blacsgrid.is_active():
            _gpaw.scalapack_solve(A_nn, self.solve_descr2a.asarray(), b_n,
                                  self.solve_descr2b.asarray())
            if info != 0:
                raise RuntimeError('scalapack_solve error: %d' % info)
        self.redist_solve_out_2b.redistribute(b_n, b_N)

        if self.eh_descr2b.blacsgrid.is_active():
            b_N = b_N.flatten()
        else:
            b_N = b

        #self.dd_comm.broadcast(b_N, 0)

        b[:] = b_N

    def diagonalize(self, Om, eps_n):

        O_nn = self.diag_descr.empty(dtype=float)
        if self.eh_descr.blacsgrid.is_active():
            O_nN = Om
        else:
            O_nN = np.empty((0, 0), dtype=float)

        self.redistributor_in.redistribute(O_nN, O_nn)
        self.diag_descr.diagonalize_dc(O_nn.copy(), O_nn, eps_n, 'L')
        self.redistributor_out.redistribute(O_nn, O_nN)
        self.world.broadcast(eps_n, 0)
        # Broadcast eigenvectors within domains
        if not self.eh_descr.blacsgrid.is_active():
            O_nN = Om
        self.dd_comm.broadcast(O_nN, 0)
Example #22
    def diagonalize_full_hamiltonian(self, ham, atoms, occupations, txt,
                                     nbands=None,
                                     scalapack=None):

        if nbands is None:
            nbands = self.pd.ngmin

        assert nbands <= self.pd.ngmin

        self.bd = bd = BandDescriptor(nbands, self.bd.comm)

        if scalapack:
            nprow, npcol, b = scalapack
            bg = BlacsGrid(bd.comm, bd.comm.size, 1)
            bg2 = BlacsGrid(bd.comm, nprow, npcol)
        else:
            nprow = npcol = 1

        assert bd.comm.size == nprow * npcol

        self.pt.set_positions(atoms.get_scaled_positions())
        self.kpt_u[0].P_ani = None
        self.allocate_arrays_for_projections(self.pt.my_atom_indices)

        myslice = bd.get_slice()

        for kpt in self.kpt_u:
            npw = len(self.pd.Q_qG[kpt.q])
            if scalapack:
                mynpw = -(-npw // bd.comm.size)
                md = BlacsDescriptor(bg, npw, npw, mynpw, npw)
                md2 = BlacsDescriptor(bg2, npw, npw, b, b)
            else:
                md = md2 = MatrixDescriptor(npw, npw)

            H_GG, S_GG = self.hs(ham, kpt.q, kpt.s, md)

            if scalapack:
                r = Redistributor(bd.comm, md, md2)
                H_GG = r.redistribute(H_GG)
                S_GG = r.redistribute(S_GG)

            psit_nG = md2.empty(dtype=complex)
            eps_n = np.empty(npw)
            md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n)
            del H_GG, S_GG

            kpt.eps_n = eps_n[myslice].copy()

            if scalapack:
                md3 = BlacsDescriptor(bg, npw, npw, bd.mynbands, npw)
                r = Redistributor(bd.comm, md2, md3)
                psit_nG = r.redistribute(psit_nG)

            kpt.psit_nG = psit_nG[:bd.mynbands].copy()
            del psit_nG

            self.pt.integrate(kpt.psit_nG, kpt.P_ani, kpt.q)

            #f_n = np.zeros_like(kpt.eps_n)
            #f_n[:len(kpt.f_n)] = kpt.f_n
            kpt.f_n = None

        occupations.calculate(self)
Example #23
    def diagonalize_full_hamiltonian(self, ham, atoms, occupations, log,
                                     nbands=None, ecut=None, scalapack=None,
                                     expert=False):

        if self.dtype != complex:
            raise ValueError('Your wavefunctions are not complex as '
                             'required by the PW diagonalization routine.\n'
                             'Please supply GPAW(..., dtype=complex, ...) '
                             'as an argument to the calculator to enforce '
                             'complex wavefunctions.')

        if nbands is None and ecut is None:
            nbands = self.pd.ngmin // self.bd.comm.size * self.bd.comm.size
        elif nbands is None:
            ecut /= units.Hartree
            vol = abs(np.linalg.det(self.gd.cell_cv))
            nbands = int(vol * ecut**1.5 * 2**0.5 / 3 / pi**2)
        else:
            assert nbands <= self.pd.ngmin

        if expert:
            iu = nbands
        else:
            iu = None

        self.bd = bd = BandDescriptor(nbands, self.bd.comm)

        log('Diagonalizing full Hamiltonian ({0} lowest bands)'.format(nbands))
        log('Matrix size (min, max): {0}, {1}'.format(self.pd.ngmin,
                                                      self.pd.ngmax))
        mem = 3 * self.pd.ngmax**2 * 16 / bd.comm.size / 1024**2
        log('Approximate memory usage per core: {0:.3f} MB'.format(mem))
        if bd.comm.size > 1:
            if isinstance(scalapack, (list, tuple)):
                nprow, npcol, b = scalapack
            else:
                nprow = int(round(bd.comm.size**0.5))
                while bd.comm.size % nprow != 0:
                    nprow -= 1
                npcol = bd.comm.size // nprow
                b = 64
            log('ScaLapack grid: {0}x{1},'.format(nprow, npcol),
                'block-size:', b)
            bg = BlacsGrid(bd.comm, bd.comm.size, 1)
            bg2 = BlacsGrid(bd.comm, nprow, npcol)
            scalapack = True
        else:
            nprow = npcol = 1
            scalapack = False

        self.set_positions(atoms.get_scaled_positions())
        self.kpt_u[0].P_ani = None
        self.allocate_arrays_for_projections(self.pt.my_atom_indices)

        myslice = bd.get_slice()

        pb = ProgressBar(log.fd)
        nkpt = len(self.kpt_u)

        for u, kpt in enumerate(self.kpt_u):
            pb.update(u / nkpt)
            npw = len(self.pd.Q_qG[kpt.q])
            if scalapack:
                mynpw = -(-npw // bd.comm.size)
                md = BlacsDescriptor(bg, npw, npw, mynpw, npw)
                md2 = BlacsDescriptor(bg2, npw, npw, b, b)
            else:
                md = md2 = MatrixDescriptor(npw, npw)

            with self.timer('Build H and S'):
                H_GG, S_GG = self.hs(ham, kpt.q, kpt.s, md)

            if scalapack:
                r = Redistributor(bd.comm, md, md2)
                H_GG = r.redistribute(H_GG)
                S_GG = r.redistribute(S_GG)

            psit_nG = md2.empty(dtype=complex)
            eps_n = np.empty(npw)

            with self.timer('Diagonalize'):
                if not scalapack:
                    md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n,
                                               iu=iu)
                else:
                    md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n)
            del H_GG, S_GG

            kpt.eps_n = eps_n[myslice].copy()

            if scalapack:
                md3 = BlacsDescriptor(bg, npw, npw, bd.mynbands, npw)
                r = Redistributor(bd.comm, md2, md3)
                psit_nG = r.redistribute(psit_nG)

            kpt.psit_nG = psit_nG[:bd.mynbands].copy()
            del psit_nG

            with self.timer('Projections'):
                self.pt.integrate(kpt.psit_nG, kpt.P_ani, kpt.q)

            kpt.f_n = None

        pb.finish()

        occupations.calculate(self)

        return nbands
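
In user scripts this routine is usually reached through the plane-wave calculator rather than by calling the wavefunction object directly. The snippet below is a hedged sketch of that workflow; the restart file name is illustrative and the exact keywords forwarded by the calculator wrapper depend on the GPAW version.

from gpaw import GPAW

# Restart from a converged plane-wave ground state (illustrative file name).
calc = GPAW('groundstate.gpw', txt='fulldiag.txt')

# Diagonalize the full Hamiltonian to obtain many unoccupied bands,
# e.g. for response-function or GW post-processing.
calc.diagonalize_full_hamiltonian(nbands=200)

calc.write('fulldiag.gpw', mode='all')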
Example #24
class BlacsOrbitalLayouts(BlacsLayouts):
    """ScaLAPACK Dense Linear Algebra.

    This class is instantiated in LCAO.  Not for casual use, at least for now.
    
    Requires two distributors and three descriptors for initialization
    as well as grid descriptors and band descriptors. Distributors are
    for cols2blocks (1D -> 2D BLACS grid) and blocks2cols (2D -> 1D
    BLACS grid). ScaLAPACK operations must occur on 2D BLACS grid for
    performance and scalability.

    _general_diagonalize is "hard-coded" for LCAO.
    Expects both Hamiltonian and Overlap matrix to be on the 2D BLACS grid.
    This is done early on to save memory.
    """

    # XXX rewrite this docstring a bit!

    # This class 'describes' all the LCAO Blacs-related layouts
    def __init__(self,
                 gd,
                 bd,
                 block_comm,
                 dtype,
                 mcpus,
                 ncpus,
                 blocksize,
                 nao,
                 timer=nulltimer):
        BlacsLayouts.__init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                              blocksize, timer)
        nbands = bd.nbands
        self.blocksize = blocksize
        self.mynbands = mynbands = bd.mynbands

        self.orbital_comm = self.bd.comm
        self.naoblocksize = naoblocksize = -((-nao) // self.orbital_comm.size)
        self.nao = nao

        # Range of basis functions for BLACS distribution of matrices:
        self.Mmax = nao
        self.Mstart = bd.comm.rank * naoblocksize
        self.Mstop = min(self.Mstart + naoblocksize, self.Mmax)
        self.mynao = self.Mstop - self.Mstart

        # Column layout for one matrix per band rank:
        self.columngrid = BlacsGrid(bd.comm, bd.comm.size, 1)
        self.mMdescriptor = self.columngrid.new_descriptor(
            nao, nao, naoblocksize, nao)
        self.nMdescriptor = self.columngrid.new_descriptor(
            nbands, nao, mynbands, nao)

        #parallelprint(world, (mynao, self.mMdescriptor.shape))

        # Column layout for one matrix in total (only on grid masters):
        self.single_column_grid = BlacsGrid(self.column_comm, bd.comm.size, 1)
        self.mM_unique_descriptor = self.single_column_grid.new_descriptor( \
            nao, nao, naoblocksize, nao)

        # nM_unique_descriptor is meant to hold the coefficients after
        # diagonalization.  BLACS requires it to be nao-by-nao, but
        # we only fill meaningful data into the first nbands columns.
        #
        # The array will then be trimmed and broadcast across
        # the grid descriptor's communicator.
        self.nM_unique_descriptor = self.single_column_grid.new_descriptor( \
            nbands, nao, mynbands, nao)

        # Fully blocked grid for diagonalization with many CPUs:
        self.mmdescriptor = self.blockgrid.new_descriptor(
            nao, nao, blocksize, blocksize)

        #self.nMdescriptor = nMdescriptor
        self.mM2mm = Redistributor(self.block_comm, self.mM_unique_descriptor,
                                   self.mmdescriptor)
        self.mm2nM = Redistributor(self.block_comm, self.mmdescriptor,
                                   self.nM_unique_descriptor)

    def diagonalize(self, H_mm, C_nM, eps_n, S_mm):
        # C_nM needs to be simultaneously compatible with:
        # 1. outdescriptor
        # 2. broadcast with gd.comm
        # We do this with a dummy buffer C2_nM
        outdescriptor = self.mm2nM.dstdescriptor  # blocks2cols
        blockdescriptor = self.mM2mm.dstdescriptor  # cols2blocks

        dtype = S_mm.dtype
        eps_M = np.empty(C_nM.shape[-1])  # empty helps us debug
        subM, subN = outdescriptor.gshape

        C_mm = blockdescriptor.zeros(dtype=dtype)
        self.timer.start('General diagonalize')
        # general_diagonalize_ex may have a buffer overflow, so
        # we no longer use it
        #blockdescriptor.general_diagonalize_ex(H_mm, S_mm.copy(), C_mm, eps_M,
        #                                       UL='L', iu=self.bd.nbands)
        blockdescriptor.general_diagonalize_dc(H_mm,
                                               S_mm.copy(),
                                               C_mm,
                                               eps_M,
                                               UL='L')
        self.timer.stop('General diagonalize')

        # Make C_nM compatible with the redistributor
        self.timer.start('Redistribute coefs')
        if outdescriptor:
            C2_nM = C_nM
        else:
            C2_nM = outdescriptor.empty(dtype=dtype)
        assert outdescriptor.check(C2_nM)
        self.mm2nM.redistribute(C_mm, C2_nM, subM, subN)  # blocks2cols
        self.timer.stop('Redistribute coefs')

        self.timer.start('Send coefs to domains')
        # eps_M is already on block_comm.rank = 0
        # easier to broadcast eps_M to all and
        # get the correct slice afterward.
        self.block_comm.broadcast(eps_M, 0)
        eps_n[:] = eps_M[self.bd.get_slice()]
        self.gd.comm.broadcast(C_nM, 0)
        self.timer.stop('Send coefs to domains')

    def distribute_overlap_matrix(self,
                                  S_qmM,
                                  root=0,
                                  add_hermitian_conjugate=False):
        # Some MPI implementations need a lot of memory to do large
        # reductions.  To avoid trouble, we do comm.sum on smaller blocks
        # of S (this code is also safe for arrays smaller than blocksize)
        Sflat_x = S_qmM.ravel()
        blocksize = 2**23 // Sflat_x.itemsize  # 8 MiB
        nblocks = -(-len(Sflat_x) // blocksize)
        Mstart = 0
        self.timer.start('blocked summation')
        for i in range(nblocks):
            self.gd.comm.sum(Sflat_x[Mstart:Mstart + blocksize], root=root)
            Mstart += blocksize
        assert Mstart + blocksize >= len(Sflat_x)
        self.timer.stop('blocked summation')

        xshape = S_qmM.shape[:-2]
        nm, nM = S_qmM.shape[-2:]
        S_qmM = S_qmM.reshape(-1, nm, nM)

        blockdesc = self.mmdescriptor
        coldesc = self.mM_unique_descriptor
        S_qmm = blockdesc.zeros(len(S_qmM), S_qmM.dtype)

        if not coldesc:  # XXX ugly way to sort out inactive ranks
            S_qmM = coldesc.zeros(len(S_qmM), S_qmM.dtype)

        self.timer.start('Scalapack redistribute')
        for S_mM, S_mm in zip(S_qmM, S_qmm):
            self.mM2mm.redistribute(S_mM, S_mm)
            if add_hermitian_conjugate:
                if blockdesc.active:
                    pblas_tran(1.0, S_mm.copy(), 1.0, S_mm, blockdesc,
                               blockdesc)

        self.timer.stop('Scalapack redistribute')
        return S_qmm.reshape(xshape + blockdesc.shape)

    def get_overlap_matrix_shape(self):
        return self.mmdescriptor.shape

    def calculate_blocked_density_matrix(self, f_n, C_nM):
        nbands = self.bd.nbands
        nao = self.nao
        dtype = C_nM.dtype

        self.nMdescriptor.checkassert(C_nM)
        if self.gd.rank == 0:
            Cf_nM = (C_nM * f_n[:, None]).conj()
        else:
            C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
            Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

        r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                          self.mmdescriptor)

        Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(Cf_nM, Cf_mm, nbands, nao)
        del Cf_nM

        C_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(C_nM, C_mm, nbands, nao)
        # no use to delete C_nM as it's in the input...

        rho_mm = self.mmdescriptor.zeros(dtype=dtype)

        pblas_simple_gemm(self.mmdescriptor,
                          self.mmdescriptor,
                          self.mmdescriptor,
                          Cf_mm,
                          C_mm,
                          rho_mm,
                          transa='T')
        return rho_mm

    def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        """Calculate density matrix from occupations and coefficients.

        Presently this function performs the usual scalapack 3-step trick:
        redistribute-numbercrunching-backdistribute.
        
        
        Notes on future performance improvement.
        
        As per the current framework, C_nM exists as copies on each
        domain, i.e. this is not parallel over domains.  We'd like to
        correct this and have an efficient distribution using e.g. the
        block communicator.

        The diagonalization routine and other parts of the code should
        however be changed to accommodate the following scheme:
        
        Keep coefficients in C_mm form after the diagonalization.
        rho_mm can then be directly calculated from C_mm without
        redistribution, after which we only need to redistribute
        rho_mm across domains.
        
        """
        dtype = C_nM.dtype
        rho_mm = self.calculate_blocked_density_matrix(f_n, C_nM)
        rback = Redistributor(self.block_comm, self.mmdescriptor,
                              self.mM_unique_descriptor)
        rho1_mM = self.mM_unique_descriptor.zeros(dtype=dtype)
        rback.redistribute(rho_mm, rho1_mM)
        del rho_mm

        if rho_mM is None:
            if self.gd.rank == 0:
                rho_mM = rho1_mM
            else:
                rho_mM = self.mMdescriptor.zeros(dtype=dtype)

        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def distribute_to_columns(self, rho_mm, srcdescriptor):
        redistributor = Redistributor(
            self.block_comm,  # XXX
            srcdescriptor,
            self.mM_unique_descriptor)
        rho_mM = redistributor.redistribute(rho_mm)
        if self.gd.rank != 0:
            rho_mM = self.mMdescriptor.zeros(dtype=rho_mm.dtype)
        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def oldcalculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        # This version is parallel over the band descriptor only.
        # This is inefficient, but let's keep it for a while in case
        # there's trouble with the more efficient version
        if rho_mM is None:
            rho_mM = self.mMdescriptor.zeros(dtype=C_nM.dtype)

        Cf_nM = (C_nM * f_n[:, None]).conj()
        pblas_simple_gemm(self.nMdescriptor,
                          self.nMdescriptor,
                          self.mMdescriptor,
                          Cf_nM,
                          C_nM,
                          rho_mM,
                          transa='T')
        return rho_mM

    def get_transposed_density_matrix(self, f_n, C_nM, rho_mM=None):
        return self.calculate_density_matrix(f_n, C_nM, rho_mM).conj()

    def get_description(self):
        (title, template) = BlacsLayouts.get_description(self)
        bg = self.blockgrid
        desc = self.mmdescriptor
        s = template % (bg.nprow, bg.npcol, desc.mb, desc.nb)
        return ' '.join([title, s])
Example #25
grid = BlacsGrid(world, 2, world.size // 2)

desc = grid.new_descriptor(12, 8, 2, 3)

a = desc.zeros()
a[:] = world.rank

subdesc = grid.new_descriptor(7, 7, 2, 2)
b = subdesc.zeros()

r = Redistributor(grid.comm, desc, subdesc, uplo="G")

ia = 3
ja = 2
ib = 1
jb = 1
M = 4
N = 5

r.redistribute(a, b, M, N, ia, ja, ib, jb)

a0 = desc.collect_on_master(a)
b0 = subdesc.collect_on_master(b)
if world.rank == 0:
    print(a0)
    print(b0)
    xa = a0[ia : ia + M, ja : ja + N]
    xb = b0[ib : ib + M, jb : jb + N]
    assert (xa == xb).all()
Example #26
    def calculate_rkernel(self):

        gd = self.gd
        ng_c = gd.N_c
        cell_cv = gd.cell_cv
        icell_cv = 2 * np.pi * np.linalg.inv(cell_cv)
        vol = np.linalg.det(cell_cv)

        ns = self.calc.wfs.nspins
        n_g = self.n_g   # density on rough grid

        fx_g = ns * self.get_fxc_g(n_g)   # local exchange kernel
        qc_g = (-4 * np.pi * ns / fx_g)**0.5   # cutoff functional
        flocal_g = qc_g**3 * fx_g / (6 * np.pi**2)   # ren. x-kernel for r=r'
        Vlocal_g = 2 * qc_g / np.pi   # ren. Hartree kernel for r=r'

        ng = np.prod(ng_c)   # number of grid points
        r_vg = gd.get_grid_point_coordinates()
        rx_g = r_vg[0].flatten()
        ry_g = r_vg[1].flatten()
        rz_g = r_vg[2].flatten()

        prnt('    %d grid points and %d plane waves at the Gamma point' %
             (ng, self.pd.ngmax), file=self.fd)

        # Unit cells
        R_Rv = []
        weight_R = []
        nR_v = self.unit_cells
        nR = np.prod(nR_v)
        for i in range(-nR_v[0] + 1, nR_v[0]):
            for j in range(-nR_v[1] + 1, nR_v[1]):
                for h in range(-nR_v[2] + 1, nR_v[2]):
                    R_Rv.append(i * cell_cv[0] +
                                j * cell_cv[1] +
                                h * cell_cv[2])
                    weight_R.append((nR_v[0] - abs(i)) *
                                    (nR_v[1] - abs(j)) *
                                    (nR_v[2] - abs(h)) / float(nR))
        if nR > 1:
            # with more than one unit cell only the exchange kernel is
            # calculated on the grid. The bare Coulomb kernel is added
            # in PW basis and Vlocal_g only the exchange part
            dv = self.calc.density.gd.dv
            gc = (3 * dv / 4 / np.pi)**(1 / 3.)
            Vlocal_g -= 2 * np.pi * gc**2 / dv
            prnt('    Lattice point sampling: ' +
                 '(%s x %s x %s)^2 ' % (nR_v[0], nR_v[1], nR_v[2]) +
                 ' Reduced to %s lattice points' % len(R_Rv), file=self.fd)

        l_g_size = -(-ng // mpi.world.size)
        l_g_range = range(mpi.world.rank * l_g_size,
                          min((mpi.world.rank+1) * l_g_size, ng))

        fhxc_qsGr = {}
        for iq in range(len(self.ibzq_qc)):
            fhxc_qsGr[iq] = np.zeros((ns, len(self.pd.G2_qG[iq]),
                                      len(l_g_range)), dtype=complex)

        inv_error = np.seterr()
        np.seterr(invalid='ignore')
        np.seterr(divide='ignore')

        t0 = time()
        # Loop over Lattice points
        for i, R_v in enumerate(R_Rv):
            # Loop over r'. f_rr and V_rr are functions of r (dim. as r_vg[0])
            if i == 1:
                prnt('      Finished 1 cell in %s seconds' % int(time() - t0) +
                     ' - estimated %s seconds left' %
                     int((len(R_Rv) - 1) * (time() - t0)), 
                     file=self.fd)
                self.fd.flush()
            if len(R_Rv) > 5:
                if (i + 1) % (len(R_Rv) // 5 + 1) == 0:
                    prnt('      Finished %s cells in %s seconds'
                         % (i, int(time() - t0))
                         + ' - estimated %s seconds left'
                         % int((len(R_Rv) - i) * (time() - t0) / i), 
                         file=self.fd)
                    self.fd.flush()
            for g in l_g_range:
                rx = rx_g[g] + R_v[0]
                ry = ry_g[g] + R_v[1]
                rz = rz_g[g] + R_v[2]

                # |r-r'-R_i|
                rr = ((r_vg[0] - rx)**2 +
                      (r_vg[1] - ry)**2 +
                      (r_vg[2] - rz)**2)**0.5

                n_av = (n_g + n_g.flatten()[g]) / 2.
                fx_g = ns * self.get_fxc_g(n_av, index=g)
                qc_g = (-4 * np.pi * ns / fx_g)**0.5
                x = qc_g * rr
                osc_x = np.sin(x) - x*np.cos(x)
                f_rr = fx_g * osc_x / (2 * np.pi**2 * rr**3)
                if nR > 1:   # include only exchange part of the kernel here
                    V_rr = (sici(x)[0] * 2 / np.pi - 1) / rr
                else:        # include the full kernel (also hartree part)
                    V_rr = (sici(x)[0] * 2 / np.pi) / rr

                # Terms with r = r'
                if (np.abs(R_v) < 0.001).all():
                    tmp_flat = f_rr.flatten()
                    tmp_flat[g] = flocal_g.flatten()[g]
                    f_rr = tmp_flat.reshape(ng_c)
                    tmp_flat = V_rr.flatten()
                    tmp_flat[g] = Vlocal_g.flatten()[g]
                    V_rr = tmp_flat.reshape(ng_c)
                    del tmp_flat

                f_rr[np.where(n_av < self.density_cut)] = 0.0
                V_rr[np.where(n_av < self.density_cut)] = 0.0

                f_rr *= weight_R[i]
                V_rr *= weight_R[i]

                # r-r'-R_i
                r_r = np.array([r_vg[0] - rx, r_vg[1] - ry, r_vg[2] - rz])

                # Fourier transform of r
                for iq, q in enumerate(self.ibzq_qc):
                    q_v = np.dot(q, icell_cv)
                    e_q = np.exp(-1j * gemmdot(q_v, r_r, beta=0.0))
                    f_q = self.pd.fft((f_rr + V_rr) * e_q, iq) * vol / ng
                    fhxc_qsGr[iq][0, :, g - l_g_range[0]] += f_q
                    if ns == 2:
                        f_q = self.pd.fft(V_rr * e_q, iq) * vol / ng
                        fhxc_qsGr[iq][1, :, g - l_g_range[0]] += f_q

        mpi.world.barrier()

        np.seterr(**inv_error)

        for iq, q in enumerate(self.ibzq_qc):
            npw = len(self.pd.G2_qG[iq])
            fhxc_sGsG = np.zeros((ns * npw, ns * npw), complex)
            l_pw_size = -(-npw // mpi.world.size)  # parallelize over PW below
            l_pw_range = range(mpi.world.rank * l_pw_size,
                               min((mpi.world.rank + 1) * l_pw_size, npw))

            if mpi.world.size > 1:
                # redistribute grid and plane waves in fhxc_qsGr[iq]
                bg1 = BlacsGrid(mpi.world, 1, mpi.world.size)
                bg2 = BlacsGrid(mpi.world, mpi.world.size, 1)
                bd1 = bg1.new_descriptor(npw, ng, npw, -(-ng // mpi.world.size))
                bd2 = bg2.new_descriptor(npw, ng, -(-npw // mpi.world.size), ng)

                fhxc_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)
                if ns == 2:
                    Koff_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)

                r = Redistributor(bg1.comm, bd1, bd2)
                r.redistribute(fhxc_qsGr[iq][0], fhxc_Glr, npw, ng)
                if ns == 2:
                    r.redistribute(fhxc_qsGr[iq][1], Koff_Glr, npw, ng)
            else:
                fhxc_Glr = fhxc_qsGr[iq][0]
                if ns == 2:
                    Koff_Glr = fhxc_qsGr[iq][1]

            # Fourier transform of r'
            for iG in range(len(l_pw_range)):
                f_g = fhxc_Glr[iG].reshape(ng_c)
                f_G = self.pd.fft(f_g.conj(), iq) * vol / ng
                fhxc_sGsG[l_pw_range[0] + iG, :npw] = f_G.conj()
                if ns == 2:
                    v_g = Koff_Glr[iG].reshape(ng_c)
                    v_G = self.pd.fft(v_g.conj(), iq) * vol / ng
                    fhxc_sGsG[npw + l_pw_range[0] + iG, :npw] = v_G.conj()

            if ns == 2:  # f_00 = f_11 and f_01 = f_10
                fhxc_sGsG[:npw, npw:] = fhxc_sGsG[npw:, :npw]
                fhxc_sGsG[npw:, npw:] = fhxc_sGsG[:npw, :npw]

            mpi.world.sum(fhxc_sGsG)
            fhxc_sGsG /= vol

            if mpi.rank == 0:
                w = Writer('fhxc_%s_%s_%s_%s.gpw' %
                           (self.tag, self.xc, self.ecut, iq))
                w.dimension('sG', ns * npw)
                w.add('fhxc_sGsG', ('sG', 'sG'), dtype=complex)
                if nR > 1:  # add Hartree kernel evaluated in PW basis
                    Gq2_G = self.pd.G2_qG[iq]
                    if (q == 0).all():
                        Gq2_G[0] = 1.
                    vq_G = 4 * np.pi / Gq2_G
                    fhxc_sGsG += np.tile(np.eye(npw) * vq_G, (ns, ns))
                w.fill(fhxc_sGsG)
                w.close()
            mpi.world.barrier()
        prnt(file=self.fd)
Example #27
class ECNPropagator(LCAOPropagator):

    def __init__(self):
        LCAOPropagator.__init__(self)

    def initialize(self, paw, hamiltonian=None):
        LCAOPropagator.initialize(self, paw)
        if hamiltonian is not None:
            self.hamiltonian = hamiltonian

        ksl = self.wfs.ksl
        self.blacs = ksl.using_blacs
        if self.blacs:
            from gpaw.blacs import Redistributor
            self.log('BLACS Parallelization')

            # Parallel grid descriptors
            grid = ksl.blockgrid
            assert grid.nprow * grid.npcol == ksl.block_comm.size
            self.mm_block_descriptor = ksl.mmdescriptor
            self.Cnm_block_descriptor = grid.new_descriptor(ksl.bd.nbands,
                                                            ksl.nao,
                                                            ksl.blocksize,
                                                            ksl.blocksize)
            self.CnM_unique_descriptor = ksl.nM_unique_descriptor

            # Redistributors
            self.Cnm2nM = Redistributor(ksl.block_comm,
                                        self.Cnm_block_descriptor,
                                        self.CnM_unique_descriptor)
            self.CnM2nm = Redistributor(ksl.block_comm,
                                        self.CnM_unique_descriptor,
                                        self.Cnm_block_descriptor)

            if debug:
                nao = ksl.nao
                self.MM_descriptor = grid.new_descriptor(nao, nao, nao, nao)
                self.mm2MM = Redistributor(ksl.block_comm,
                                           self.mm_block_descriptor,
                                           self.MM_descriptor)
                self.MM2mm = Redistributor(ksl.block_comm,
                                           self.MM_descriptor,
                                           self.mm_block_descriptor)

            for kpt in self.wfs.kpt_u:
                scalapack_zero(self.mm_block_descriptor, kpt.S_MM, 'U')
                scalapack_zero(self.mm_block_descriptor, kpt.T_MM, 'U')

    def kick(self, hamiltonian, time):
        # Propagate
        get_matrix = self.wfs.eigensolver.calculate_hamiltonian_matrix
        for kpt in self.wfs.kpt_u:
            Vkick_MM = get_matrix(hamiltonian, self.wfs, kpt,
                                  add_kinetic=False, root=-1)
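            # Apply the kick potential with 10 short Crank-Nicolson
            # substeps of dt = 0.1 each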
            for i in range(10):
                self.propagate_wfs(kpt.C_nM, kpt.C_nM, kpt.S_MM, Vkick_MM, 0.1)

        # Update Hamiltonian (and density)
        self.hamiltonian.update()

    def propagate(self, time, time_step):
        for kpt in self.wfs.kpt_u:
            H_MM = self.hamiltonian.get_hamiltonian_matrix(kpt)
            self.propagate_wfs(kpt.C_nM, kpt.C_nM, kpt.S_MM, H_MM, time_step)
        self.hamiltonian.update()
        return time + time_step

    def propagate_wfs(self, sourceC_nM, targetC_nM, S_MM, H_MM, dt):
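        """Propagate LCAO coefficients one Crank-Nicolson step.

        Solves (S - 0.5j*dt*H) C(t+dt) = (S + 0.5j*dt*H) C(t), either on
        the distributed BLACS layout with ScaLAPACK or serially with a
        dense linear solve.
        """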
        self.timer.start('Linear solve')

        if self.blacs:
            # XXX, Preallocate
            target_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex)
            temp_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex)
            temp_block_mm = self.mm_block_descriptor.empty(dtype=complex)
            if self.density.gd.comm.rank != 0:
                # XXX Fake blacs nbands, nao, nbands, nao grid because some
                # weird asserts
                # (these are 0,x or x,0 arrays)
                sourceC_nM = self.CnM_unique_descriptor.zeros(dtype=complex)

            # 1. target = (S+0.5j*H*dt) * source
            # Wave functions to target
            self.CnM2nm.redistribute(sourceC_nM, temp_blockC_nm)

            # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
            # Remove upper triangle
            scalapack_zero(self.mm_block_descriptor, H_MM, 'U')
            # Lower triangular matrix:
            temp_block_mm[:] = S_MM - (0.5j * dt) * H_MM
            scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U')
            # Now it's a strictly lower triangular matrix
            # Add transpose of H
            pblas_tran(-0.5j * dt, H_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor, self.mm_block_descriptor)
            # Add transpose of S
            pblas_tran(1.0, S_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor, self.mm_block_descriptor)

            pblas_simple_gemm(self.Cnm_block_descriptor,
                              self.mm_block_descriptor,
                              self.Cnm_block_descriptor,
                              temp_blockC_nm,
                              temp_block_mm,
                              target_blockC_nm)
            # 2. target = (S-0.5j*H*dt)^-1 * target
            # temp_block_mm[:] = S_MM + (0.5j*dt) * H_MM
            # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
            # Lower triangular matrix:
            temp_block_mm[:] = S_MM + (0.5j * dt) * H_MM
            # Now it's a strictly lower triangular matrix:
            scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U')
            # Add transpose of H:
            pblas_tran(+0.5j * dt, H_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor, self.mm_block_descriptor)
            # Add transpose of S
            pblas_tran(1.0, S_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor, self.mm_block_descriptor)

            scalapack_solve(self.mm_block_descriptor,
                            self.Cnm_block_descriptor,
                            temp_block_mm,
                            target_blockC_nm)

            if self.density.gd.comm.rank != 0:  # XXX is this correct?
                # XXX Fake blacs nbands, nao, nbands, nao grid because some
                # weird asserts
                # (these are 0,x or x,0 arrays)
                target = self.CnM_unique_descriptor.zeros(dtype=complex)
            else:
                target = targetC_nM
            self.Cnm2nM.redistribute(target_blockC_nm, target)
            self.density.gd.comm.broadcast(targetC_nM, 0)  # Is this required?
        else:
            # Note: The full equation is conjugated (therefore -+, not +-)
            targetC_nM[:] = \
                solve(S_MM - 0.5j * H_MM * dt,
                      np.dot(S_MM + 0.5j * H_MM * dt,
                             sourceC_nM.T.conjugate())).T.conjugate()

        self.timer.stop('Linear solve')

    def blacs_mm_to_global(self, H_mm):
        if not debug:
            raise RuntimeError('Use debug mode')
        # Someone could verify that this works and remove the error.
        raise NotImplementedError('Method untested and thus unreliable')
        target = self.MM_descriptor.empty(dtype=complex)
        self.mm2MM.redistribute(H_mm, target)
        self.wfs.world.barrier()
        return target

    def blacs_nm_to_global(self, C_nm):
        # Someone could verify that this works and remove the error.
        raise NotImplementedError('Method untested and thus unreliable')
        target = self.CnM_unique_descriptor.empty(dtype=complex)
        self.Cnm2nM.redistribute(C_nm, target)
        self.wfs.world.barrier()
        return target

    def todict(self):
        return {'name': 'ecn'}
Example #28
class LrTDDFPTSolveLayout:
    """BLACS layouts for distributed TD-DFPT"""
    def __init__(self, sl_lrtddft, nrows, lr_comms):
        self.mprocs, self.nprocs, self.block_size = tuple(sl_lrtddft)

        self.lr_comms = lr_comms

        # for SCALAPACK we need TRANSPOSED MATRIX (and vector)
        #
        # -----------------------------------------------------------------
        # matrix

        # original grid, ie, how matrix is stored
        self.orig_matrix_grid = BlacsGrid(self.lr_comms.parent_comm,
                                          self.lr_comms.dd_comm.size,
                                          self.lr_comms.eh_comm.size)

        # solve grid
        self.solve_matrix_grid = BlacsGrid(self.lr_comms.parent_comm,
                                           self.mprocs, self.nprocs)

        # M = rows, N = cols
        M = nrows * 4
        N = nrows * 4
        mb = 4
        nb = 4
        self.orig_matrix_descr = self.orig_matrix_grid.new_descriptor(
            N, M, nb, mb)

        bs = self.block_size
        self.solve_matrix_descr = self.solve_matrix_grid.new_descriptor(
            N, M, bs, bs)

        self.matrix_in_redist = Redistributor(self.lr_comms.parent_comm,
                                              self.orig_matrix_descr,
                                              self.solve_matrix_descr)

        # -----------------------------------------------------------------
        # vector

        # original grid, ie, how vector is stored
        self.orig_vector_grid = BlacsGrid(
            self.lr_comms.parent_comm, 1,
            (self.lr_comms.dd_comm.size * self.lr_comms.eh_comm.size))

        # solve grid
        #self.solve_vector_grid = BlacsGrid(self.lr_comms.parent_comm, self.mprocs, self.nprocs)

        # M = rows, N = cols
        M = nrows * 4
        Nrhs = 1
        mb = 4
        nb = 1
        self.orig_vector_descr = self.orig_vector_grid.new_descriptor(
            Nrhs, M, nb, mb)

        bs = self.block_size
        self.solve_vector_descr = self.solve_matrix_grid.new_descriptor(
            Nrhs, M, 1, bs)

        self.vector_in_redist = Redistributor(self.lr_comms.parent_comm,
                                              self.orig_vector_descr,
                                              self.solve_vector_descr)

        self.vector_out_redist = Redistributor(self.lr_comms.parent_comm,
                                               self.solve_vector_descr,
                                               self.orig_vector_descr)

    def solve(self, A_orig, b_orig):
        """Solve TD-DFPT equation using Scalapack.
        """

        A_solve = self.solve_matrix_descr.empty(dtype=float)
        if not self.orig_matrix_descr.blacsgrid.is_active():
            A_orig = np.empty((0, 0), dtype=float)

        self.matrix_in_redist.redistribute(A_orig, A_solve)

        b_solve = self.solve_vector_descr.empty(dtype=float)
        if not self.orig_vector_descr.blacsgrid.is_active():
            b_orig = np.empty((0, 0), dtype=float)

        self.vector_in_redist.redistribute(b_orig, b_solve)

        #if False:
        #    np.set_printoptions(precision=5, suppress=True)
        #    for i in range(self.lr_comms.parent_comm.size):
        #        if ( self.lr_comms.parent_comm.rank == i ):
        #            print 'rank ', i
        #            print A_orig
        #            print A_solve
        #            print
        #            print b_orig
        #            print b_solve
        #            print
        #            print
        #            print self.solve_matrix_descr.asarray()
        #            print self.solve_vector_descr.asarray()
        #            print
        #            print '---'
        #            print
        #        self.lr_comms.parent_comm.barrier()

        info = 0
        if self.solve_matrix_descr.blacsgrid.is_active():
            _gpaw.scalapack_solve(A_solve, self.solve_matrix_descr.asarray(),
                                  b_solve, self.solve_vector_descr.asarray())
            if info != 0:
                raise RuntimeError('scalapack_solve error: %d' % info)

        self.vector_out_redist.redistribute(b_solve, b_orig)

        #if False:
        #    for i in range(self.lr_comms.parent_comm.size):
        #        if ( self.lr_comms.parent_comm.rank == i ):
        #            print 'rank ', i
        #            print A_orig
        #            print A_solve
        #            print
        #            print b_orig
        #            print b_solve
        #            print
        #            print
        #        self.lr_comms.parent_comm.barrier()

        return b_orig
Example #29
    def diagonalize_full_hamiltonian(self, ham, atoms, occupations, txt,
                                     nbands=None, scalapack=None, expert=False):
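        """Diagonalize the full plane-wave Hamiltonian for all k-points.

        H and S are built in a distributed layout, diagonalized with
        (Sca)LAPACK, and the lowest nbands eigenstates are stored on the
        k-point objects, distributed over the band communicator.
        """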
        assert self.dtype == complex

        if nbands is None:
            nbands = self.pd.ngmin // self.bd.comm.size * self.bd.comm.size
        else:
            assert nbands <= self.pd.ngmin

        if expert:
            iu = nbands
        else:
            iu = None

        self.bd = bd = BandDescriptor(nbands, self.bd.comm)

        p = functools.partial(print, file=txt)
        p('Diagonalizing full Hamiltonian ({0} lowest bands)'.format(nbands))
        p('Matrix size (min, max): {0}, {1}'.format(self.pd.ngmin,
                                                    self.pd.ngmax))
        mem = 3 * self.pd.ngmax**2 * 16 / bd.comm.size / 1024**2
        p('Approximate memory usage per core: {0:.3f} MB'.format(mem))
        if bd.comm.size > 1:
            if isinstance(scalapack, (list, tuple)):
                nprow, npcol, b = scalapack
            else:
                nprow = int(round(bd.comm.size**0.5))
                while bd.comm.size % nprow != 0:
                    nprow -= 1
                npcol = bd.comm.size // nprow
                b = 64
            p('ScaLapack grid: {0}x{1},'.format(nprow, npcol),
              'block-size:', b)
            bg = BlacsGrid(bd.comm, bd.comm.size, 1)
            bg2 = BlacsGrid(bd.comm, nprow, npcol)
            scalapack = True
        else:
            nprow = npcol = 1
            scalapack = False

        self.pt.set_positions(atoms.get_scaled_positions())
        self.kpt_u[0].P_ani = None
        self.allocate_arrays_for_projections(self.pt.my_atom_indices)

        myslice = bd.get_slice()

        pb = ProgressBar(txt)
        nkpt = len(self.kpt_u)
        
        for u, kpt in enumerate(self.kpt_u):
            pb.update(u / nkpt)
            npw = len(self.pd.Q_qG[kpt.q])
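            # With ScaLAPACK, md is a row-blocked layout (mynpw rows per
            # rank) used to build H and S, while md2 is the b x b
            # block-cyclic layout used for the diagonalization.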
            if scalapack:
                mynpw = -(-npw // bd.comm.size)
                md = BlacsDescriptor(bg, npw, npw, mynpw, npw)
                md2 = BlacsDescriptor(bg2, npw, npw, b, b)
            else:
                md = md2 = MatrixDescriptor(npw, npw)

            with self.timer('Build H and S'):
                H_GG, S_GG = self.hs(ham, kpt.q, kpt.s, md)

            if scalapack:
                r = Redistributor(bd.comm, md, md2)
                H_GG = r.redistribute(H_GG)
                S_GG = r.redistribute(S_GG)

            psit_nG = md2.empty(dtype=complex)
            eps_n = np.empty(npw)

            with self.timer('Diagonalize'):
                if not scalapack:
                    md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n, 
                                               iu=iu)
                else:
                    md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n)
            del H_GG, S_GG

            kpt.eps_n = eps_n[myslice].copy()

            if scalapack:
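                # Redistribute the eigenvectors to a band-blocked layout
                # so each rank keeps its own mynbands wave functions.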
                md3 = BlacsDescriptor(bg, npw, npw, bd.mynbands, npw)
                r = Redistributor(bd.comm, md2, md3)
                psit_nG = r.redistribute(psit_nG)

            kpt.psit_nG = psit_nG[:bd.mynbands].copy()
            del psit_nG

            with self.timer('Projections'):
                self.pt.integrate(kpt.psit_nG, kpt.P_ani, kpt.q)

            kpt.f_n = None

        pb.finish()
        
        occupations.calculate(self)
Example #30
def main(N=73, seed=42, mprocs=2, nprocs=2, dtype=float):
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)
    
    if (dtype==complex):
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    glob = grid.new_descriptor(N, N, N, N)
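    # A single N x N block means the full matrix is owned by rank 0;
    # all other ranks hold zero-size local arrays.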

    # print globA.asarray()
    # Populate matrices local to master:
    H0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    S0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    C0 = glob.empty(dtype=dtype)
    if rank == 0:
        # Complex case must have real numbers on the diagonal.
        # We make a simple complex Hermitian matrix below.
        H0 = H0 + epsilon * (0.1 * np.tri(N, N, k=-N // nprocs) +
                             0.3 * np.tri(N, N, k=-1))
        S0 = S0 + epsilon * (0.2 * np.tri(N, N, k=-N // nprocs) +
                             0.4 * np.tri(N, N, k=-1))
        # Make matrices symmetric
        rk(1.0, H0.copy(), 0.0, H0)
        rk(1.0, S0.copy(), 0.0, S0)
        # Overlap matrix must be semi-positive definite
        S0 = S0 + 50.0*np.eye(N, N, 0)
        # Hamiltonian is usually diagonally dominant
        H0 = H0 + 75.0*np.eye(N, N, 0)
        C0 = S0.copy()

    # Local result matrices
    W0 = np.empty((N),dtype=float)
    W0_g = np.empty((N),dtype=float)

    # Calculate eigenvalues
    if rank == 0:
        diagonalize(H0.copy(), W0)
        general_diagonalize(H0.copy(), W0_g, S0.copy())
        inverse_cholesky(C0) # result returned in lower triangle
        # tri2full(C0) # symmetrize
        
    assert glob.check(H0) and glob.check(S0) and glob.check(C0)

    # Create distributed descriptors with various block sizes:
    dist = grid.new_descriptor(N, N, 8, 8)

    # Distributed matrices:
    # We can use empty here, but end up with garbage on
    # the other half of the triangle when we redistribute.
    # This is fine because ScaLAPACK does not care.

    H = dist.empty(dtype=dtype)
    S = dist.empty(dtype=dtype)
    Z = dist.empty(dtype=dtype)
    C = dist.empty(dtype=dtype)

    # Eigenvalues are non-BLACS matrices
    W = np.empty((N), dtype=float)
    W_dc = np.empty((N), dtype=float)
    W_mr3 = np.empty((N), dtype=float)
    W_g = np.empty((N), dtype=float)
    W_g_dc = np.empty((N), dtype=float)
    W_g_mr3 = np.empty((N), dtype=float)

    Glob2dist = Redistributor(world, glob, dist)
    Glob2dist.redistribute(H0, H, uplo='L')
    Glob2dist.redistribute(S0, S, uplo='L')
    Glob2dist.redistribute(S0, C, uplo='L') # C0 was previously overwritten

    # we don't test the expert drivers anymore since there
    # might be a buffer overflow error
    ## scalapack_diagonalize_ex(dist, H.copy(), Z, W, 'L')
    scalapack_diagonalize_dc(dist, H.copy(), Z, W_dc, 'L')
    ## scalapack_diagonalize_mr3(dist, H.copy(), Z, W_mr3, 'L')
    ## scalapack_general_diagonalize_ex(dist, H.copy(), S.copy(), Z, W_g, 'L')
    scalapack_general_diagonalize_dc(dist, H.copy(), S.copy(), Z, W_g_dc, 'L')
    ## scalapack_general_diagonalize_mr3(dist, H.copy(), S.copy(), Z, W_g_mr3, 'L')
    scalapack_inverse_cholesky(dist, C, 'L')

    # Undo redistribute
    C_test = glob.empty(dtype=dtype)
    Dist2glob = Redistributor(world, dist, glob)
    Dist2glob.redistribute(C, C_test)

    if rank == 0:
        ## diag_ex_err = abs(W - W0).max()
        diag_dc_err = abs(W_dc - W0).max()
        ## diag_mr3_err = abs(W_mr3 - W0).max()
        ## general_diag_ex_err = abs(W_g - W0_g).max()
        general_diag_dc_err = abs(W_g_dc - W0_g).max()
        ## general_diag_mr3_err = abs(W_g_mr3 - W0_g).max()
        inverse_chol_err = abs(C_test-C0).max()
        ## print 'diagonalize ex err', diag_ex_err
        print('diagonalize dc err', diag_dc_err)
        ## print 'diagonalize mr3 err', diag_mr3_err
        ## print 'general diagonalize ex err', general_diag_ex_err
        print('general diagonalize dc err', general_diag_dc_err)
        ## print 'general diagonalize mr3 err', general_diag_mr3_err
        print('inverse chol err', inverse_chol_err)
    else:
        ## diag_ex_err = 0.0
        diag_dc_err = 0.0
        ## diag_mr3_err = 0.0
        ## general_diag_ex_err = 0.0
        general_diag_dc_err = 0.0
        ## general_diag_mr3_err = 0.0
        inverse_chol_err = 0.0

    # We don't like exceptions on only one cpu
    ## diag_ex_err = world.sum(diag_ex_err)
    diag_dc_err = world.sum(diag_dc_err)
    ## diag_mr3_err = world.sum(diag_mr3_err)
    ## general_diag_ex_err = world.sum(general_diag_ex_err)
    general_diag_dc_err = world.sum(general_diag_dc_err)
    ## general_diag_mr3_err = world.sum(general_diag_mr3_err) 
    inverse_chol_err = world.sum(inverse_chol_err)
    ## assert diag_ex_err < tol
    assert diag_dc_err < tol
    ## assert diag_mr3_err < tol
    ## assert general_diag_ex_err < tol
    assert general_diag_dc_err < tol
    ## assert general_diag_mr3_err < tol
    assert inverse_chol_err < tol
Example #31
    def calculate_rkernel(self):

        gd = self.gd
        ng_c = gd.N_c
        cell_cv = gd.cell_cv
        icell_cv = 2 * np.pi * np.linalg.inv(cell_cv)
        vol = np.linalg.det(cell_cv)

        ns = self.calc.wfs.nspins
        n_g = self.n_g  # density on rough grid

        fx_g = ns * self.get_fxc_g(n_g)  # local exchange kernel
        qc_g = (-4 * np.pi * ns / fx_g)**0.5  # cutoff functional
        flocal_g = qc_g**3 * fx_g / (6 * np.pi**2)  # ren. x-kernel for r=r'
        Vlocal_g = 2 * qc_g / np.pi  # ren. Hartree kernel for r=r'
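        # flocal_g and Vlocal_g replace the diverging 1/|r-r'| expressions
        # at the r = r' grid point in the loop below.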

        ng = np.prod(ng_c)  # number of grid points
        r_vg = gd.get_grid_point_coordinates()
        rx_g = r_vg[0].flatten()
        ry_g = r_vg[1].flatten()
        rz_g = r_vg[2].flatten()

        prnt('    %d grid points and %d plane waves at the Gamma point' %
             (ng, self.pd.ngmax),
             file=self.fd)

        # Unit cells
        R_Rv = []
        weight_R = []
        nR_v = self.unit_cells
        nR = np.prod(nR_v)
        for i in range(-nR_v[0] + 1, nR_v[0]):
            for j in range(-nR_v[1] + 1, nR_v[1]):
                for h in range(-nR_v[2] + 1, nR_v[2]):
                    R_Rv.append(i * cell_cv[0] + j * cell_cv[1] +
                                h * cell_cv[2])
                    weight_R.append((nR_v[0] - abs(i)) * (nR_v[1] - abs(j)) *
                                    (nR_v[2] - abs(h)) / float(nR))
        if nR > 1:
            # with more than one unit cell only the exchange kernel is
            # calculated on the grid. The bare Coulomb kernel is added
            # in PW basis and Vlocal_g only the exchange part
            dv = self.calc.density.gd.dv
            gc = (3 * dv / 4 / np.pi)**(1 / 3.)
            Vlocal_g -= 2 * np.pi * gc**2 / dv
            prnt('    Lattice point sampling: ' + '(%s x %s x %s)^2 ' %
                 (nR_v[0], nR_v[1], nR_v[2]) +
                 ' Reduced to %s lattice points' % len(R_Rv),
                 file=self.fd)

        l_g_size = -(-ng // mpi.world.size)
        l_g_range = range(mpi.world.rank * l_g_size,
                          min((mpi.world.rank + 1) * l_g_size, ng))

        fhxc_qsGr = {}
        for iq in range(len(self.ibzq_qc)):
            fhxc_qsGr[iq] = np.zeros(
                (ns, len(self.pd.G2_qG[iq]), len(l_g_range)), dtype=complex)

        inv_error = np.seterr()
        np.seterr(invalid='ignore')
        np.seterr(divide='ignore')

        t0 = time()
        # Loop over Lattice points
        for i, R_v in enumerate(R_Rv):
            # Loop over r'. f_rr and V_rr are functions of r (dim. as r_vg[0])
            if i == 1:
                prnt('      Finished 1 cell in %s seconds' % int(time() - t0) +
                     ' - estimated %s seconds left' % int(
                         (len(R_Rv) - 1) * (time() - t0)),
                     file=self.fd)
                self.fd.flush()
            if len(R_Rv) > 5:
                if (i + 1) % (len(R_Rv) // 5 + 1) == 0:
                    prnt('      Finished %s cells in %s seconds' %
                         (i, int(time() - t0)) +
                         ' - estimated %s seconds left' % int(
                             (len(R_Rv) - i) * (time() - t0) / i),
                         file=self.fd)
                    self.fd.flush()
            for g in l_g_range:
                rx = rx_g[g] + R_v[0]
                ry = ry_g[g] + R_v[1]
                rz = rz_g[g] + R_v[2]

                # |r-r'-R_i|
                rr = ((r_vg[0] - rx)**2 + (r_vg[1] - ry)**2 +
                      (r_vg[2] - rz)**2)**0.5

                n_av = (n_g + n_g.flatten()[g]) / 2.
                fx_g = ns * self.get_fxc_g(n_av, index=g)
                qc_g = (-4 * np.pi * ns / fx_g)**0.5
                x = qc_g * rr
                osc_x = np.sin(x) - x * np.cos(x)
                f_rr = fx_g * osc_x / (2 * np.pi**2 * rr**3)
                if nR > 1:  # include only exchange part of the kernel here
                    V_rr = (sici(x)[0] * 2 / np.pi - 1) / rr
                else:  # include the full kernel (also hartree part)
                    V_rr = (sici(x)[0] * 2 / np.pi) / rr

                # Terms with r = r'
                if (np.abs(R_v) < 0.001).all():
                    tmp_flat = f_rr.flatten()
                    tmp_flat[g] = flocal_g.flatten()[g]
                    f_rr = tmp_flat.reshape(ng_c)
                    tmp_flat = V_rr.flatten()
                    tmp_flat[g] = Vlocal_g.flatten()[g]
                    V_rr = tmp_flat.reshape(ng_c)
                    del tmp_flat

                f_rr[np.where(n_av < self.density_cut)] = 0.0
                V_rr[np.where(n_av < self.density_cut)] = 0.0

                f_rr *= weight_R[i]
                V_rr *= weight_R[i]

                # r-r'-R_i
                r_r = np.array([r_vg[0] - rx, r_vg[1] - ry, r_vg[2] - rz])

                # Fourier transform of r
                for iq, q in enumerate(self.ibzq_qc):
                    q_v = np.dot(q, icell_cv)
                    e_q = np.exp(-1j * gemmdot(q_v, r_r, beta=0.0))
                    f_q = self.pd.fft((f_rr + V_rr) * e_q, iq) * vol / ng
                    fhxc_qsGr[iq][0, :, g - l_g_range[0]] += f_q
                    if ns == 2:
                        f_q = self.pd.fft(V_rr * e_q, iq) * vol / ng
                        fhxc_qsGr[iq][1, :, g - l_g_range[0]] += f_q

        mpi.world.barrier()

        np.seterr(**inv_error)

        for iq, q in enumerate(self.ibzq_qc):
            npw = len(self.pd.G2_qG[iq])
            fhxc_sGsG = np.zeros((ns * npw, ns * npw), complex)
            l_pw_size = -(-npw // mpi.world.size)  # parallelize over PW below
            l_pw_range = range(mpi.world.rank * l_pw_size,
                               min((mpi.world.rank + 1) * l_pw_size, npw))

            if mpi.world.size > 1:
                # redistribute grid and plane waves in fhxc_qsGr[iq]
                bg1 = BlacsGrid(mpi.world, 1, mpi.world.size)
                bg2 = BlacsGrid(mpi.world, mpi.world.size, 1)
                bd1 = bg1.new_descriptor(npw, ng, npw, -(-ng // mpi.world.size))
                bd2 = bg2.new_descriptor(npw, ng, -(-npw // mpi.world.size), ng)

                fhxc_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)
                if ns == 2:
                    Koff_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)

                r = Redistributor(bg1.comm, bd1, bd2)
                r.redistribute(fhxc_qsGr[iq][0], fhxc_Glr, npw, ng)
                if ns == 2:
                    r.redistribute(fhxc_qsGr[iq][1], Koff_Glr, npw, ng)
            else:
                fhxc_Glr = fhxc_qsGr[iq][0]
                if ns == 2:
                    Koff_Glr = fhxc_qsGr[iq][1]

            # Fourier transform of r'
            for iG in range(len(l_pw_range)):
                f_g = fhxc_Glr[iG].reshape(ng_c)
                f_G = self.pd.fft(f_g.conj(), iq) * vol / ng
                fhxc_sGsG[l_pw_range[0] + iG, :npw] = f_G.conj()
                if ns == 2:
                    v_g = Koff_Glr[iG].reshape(ng_c)
                    v_G = self.pd.fft(v_g.conj(), iq) * vol / ng
                    fhxc_sGsG[npw + l_pw_range[0] + iG, :npw] = v_G.conj()

            if ns == 2:  # f_00 = f_11 and f_01 = f_10
                fhxc_sGsG[:npw, npw:] = fhxc_sGsG[npw:, :npw]
                fhxc_sGsG[npw:, npw:] = fhxc_sGsG[:npw, :npw]

            mpi.world.sum(fhxc_sGsG)
            fhxc_sGsG /= vol

            if mpi.rank == 0:
                w = Writer('fhxc_%s_%s_%s_%s.gpw' %
                           (self.tag, self.xc, self.ecut, iq))
                w.dimension('sG', ns * npw)
                w.add('fhxc_sGsG', ('sG', 'sG'), dtype=complex)
                if nR > 1:  # add Hartree kernel evaluated in PW basis
                    Gq2_G = self.pd.G2_qG[iq]
                    if (q == 0).all():
                        Gq2_G[0] = 1.
                    vq_G = 4 * np.pi / Gq2_G
                    fhxc_sGsG += np.tile(np.eye(npw) * vq_G, (ns, ns))
                w.fill(fhxc_sGsG)
                w.close()
            mpi.world.barrier()
        prnt(file=self.fd)
Example #32
    def calculate_blocked_density_matrix(self, f_n, C_nM):
        nbands = self.bd.nbands
        nao = self.nao
        dtype = C_nM.dtype

        self.nMdescriptor.checkassert(C_nM)
        if self.gd.rank == 0:
            Cf_nM = (C_nM * f_n[:, None])
        else:
            C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
            Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

        r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                          self.mmdescriptor)

        Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(Cf_nM, Cf_mm, nbands, nao)
        del Cf_nM

        C_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(C_nM, C_mm, nbands, nao)
        # no use to delete C_nM as it's in the input...

        rho_mm = self.mmdescriptor.zeros(dtype=dtype)
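        # rho_mm = Cf_mm^dagger * C_mm, i.e. rho_MM' = sum_n f_n C_nM^* C_nM'
        # (the conjugate transpose is taken via transa='C')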

        if 1:  # if self.libelpa is None:
            pblas_simple_gemm(self.mmdescriptor,
                              self.mmdescriptor,
                              self.mmdescriptor,
                              Cf_mm,
                              C_mm,
                              rho_mm,
                              transa='C')
        else:
            # elpa_hermitian_multiply was not faster than the ordinary
            # multiplication in the test.  The way we have things distributed,
            # we need to transpose things at the moment.
            #
            # Rather than enabling this, we should store the coefficients
            # in an appropriate 2D block cyclic format (c_nm) and not the
            # current C_nM format.  This makes it possible to avoid
            # redistributing the coefficients at all.  But we don't have time
            # to implement this at the moment.
            mul = self.libelpa.hermitian_multiply
            desc = self.mmdescriptor
            from gpaw.utilities.scalapack import pblas_tran

            def T(array):
                tmp = array.copy()
                pblas_tran(alpha=1.0,
                           a_MN=tmp,
                           beta=0.0,
                           c_NM=array,
                           desca=desc,
                           descc=desc)

            T(C_mm)
            T(Cf_mm)
            mul(C_mm, Cf_mm, rho_mm, desc, desc, desc, uplo_a='X', uplo_c='X')

        return rho_mm
Example #33
def main(N=72, seed=42, mprocs=2, nprocs=2, dtype=float):
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if (dtype == complex):
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    glob = grid.new_descriptor(N, N, N, N)

    # print globA.asarray()
    # Populate matrices local to master:
    H0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    S0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    C0 = glob.empty(dtype=dtype)
    if rank == 0:
        # Complex case must have real numbers on the diagonal.
        # We make a simple complex Hermitian matrix below.
        H0 = H0 + epsilon * (0.1 * np.tri(N, N, k=-N // nprocs) +
                             0.3 * np.tri(N, N, k=-1))
        S0 = S0 + epsilon * (0.2 * np.tri(N, N, k=-N // nprocs) +
                             0.4 * np.tri(N, N, k=-1))
        # Make matrices symmetric
        rk(1.0, H0.copy(), 0.0, H0)
        rk(1.0, S0.copy(), 0.0, S0)
        # Overlap matrix must be semi-positive definite
        S0 = S0 + 50.0 * np.eye(N, N, 0)
        # Hamiltonian is usually diagonally dominant
        H0 = H0 + 75.0 * np.eye(N, N, 0)
        C0 = S0.copy()
        S0_inv = S0.copy()

    # Local result matrices
    W0 = np.empty((N), dtype=float)
    W0_g = np.empty((N), dtype=float)

    # Calculate eigenvalues / other serial results
    if rank == 0:
        diagonalize(H0.copy(), W0)
        general_diagonalize(H0.copy(), W0_g, S0.copy())
        inverse_cholesky(C0)  # result returned in lower triangle
        tri2full(S0_inv, 'L')
        S0_inv = inv(S0_inv)
        # tri2full(C0) # symmetrize

    assert glob.check(H0) and glob.check(S0) and glob.check(C0)

    # Create distributed descriptors with various block sizes:
    dist = grid.new_descriptor(N, N, 8, 8)

    # Distributed matrices:
    # We can use empty here, but end up with garbage on
    # the other half of the triangle when we redistribute.
    # This is fine because ScaLAPACK does not care.

    H = dist.empty(dtype=dtype)
    S = dist.empty(dtype=dtype)
    Sinv = dist.empty(dtype=dtype)
    Z = dist.empty(dtype=dtype)
    C = dist.empty(dtype=dtype)

    # Eigenvalues are non-BLACS matrices
    W = np.empty((N), dtype=float)
    W_dc = np.empty((N), dtype=float)
    W_mr3 = np.empty((N), dtype=float)
    W_g = np.empty((N), dtype=float)
    W_g_dc = np.empty((N), dtype=float)
    W_g_mr3 = np.empty((N), dtype=float)

    Glob2dist = Redistributor(world, glob, dist)
    Glob2dist.redistribute(H0, H, uplo='L')
    Glob2dist.redistribute(S0, S, uplo='L')
    Glob2dist.redistribute(S0, C, uplo='L')  # C0 was previously overwritten
    Glob2dist.redistribute(S0, Sinv, uplo='L')

    # we don't test the expert drivers anymore since there
    # might be a buffer overflow error
    ## scalapack_diagonalize_ex(dist, H.copy(), Z, W, 'L')
    scalapack_diagonalize_dc(dist, H.copy(), Z, W_dc, 'L')
    ## scalapack_diagonalize_mr3(dist, H.copy(), Z, W_mr3, 'L')
    ## scalapack_general_diagonalize_ex(dist, H.copy(), S.copy(), Z, W_g, 'L')
    scalapack_general_diagonalize_dc(dist, H.copy(), S.copy(), Z, W_g_dc, 'L')
    ## scalapack_general_diagonalize_mr3(dist, H.copy(), S.copy(), Z, W_g_mr3, 'L')

    scalapack_inverse_cholesky(dist, C, 'L')

    if dtype == complex:  # Only supported for complex for now
        scalapack_inverse(dist, Sinv, 'L')
    # Undo redistribute
    C_test = glob.empty(dtype=dtype)
    Sinv_test = glob.empty(dtype=dtype)
    Dist2glob = Redistributor(world, dist, glob)
    Dist2glob.redistribute(C, C_test)
    Dist2glob.redistribute(Sinv, Sinv_test)

    if rank == 0:
        ## diag_ex_err = abs(W - W0).max()
        diag_dc_err = abs(W_dc - W0).max()
        ## diag_mr3_err = abs(W_mr3 - W0).max()
        ## general_diag_ex_err = abs(W_g - W0_g).max()
        general_diag_dc_err = abs(W_g_dc - W0_g).max()
        ## general_diag_mr3_err = abs(W_g_mr3 - W0_g).max()
        inverse_chol_err = abs(C_test - C0).max()

        tri2full(Sinv_test, 'L')
        inverse_err = abs(Sinv_test - S0_inv).max()
        ## print 'diagonalize ex err', diag_ex_err
        print('diagonalize dc err', diag_dc_err)
        ## print 'diagonalize mr3 err', diag_mr3_err
        ## print 'general diagonalize ex err', general_diag_ex_err
        print('general diagonalize dc err', general_diag_dc_err)
        ## print 'general diagonalize mr3 err', general_diag_mr3_err
        print('inverse chol err', inverse_chol_err)
        if dtype == complex:
            print('inverse err', inverse_err)
    else:
        ## diag_ex_err = 0.0
        diag_dc_err = 0.0
        ## diag_mr3_err = 0.0
        ## general_diag_ex_err = 0.0
        general_diag_dc_err = 0.0
        ## general_diag_mr3_err = 0.0
        inverse_chol_err = 0.0
        inverse_err = 0.0

    # We don't like exceptions on only one cpu
    ## diag_ex_err = world.sum(diag_ex_err)
    diag_dc_err = world.sum(diag_dc_err)
    ## diag_mr3_err = world.sum(diag_mr3_err)
    ## general_diag_ex_err = world.sum(general_diag_ex_err)
    general_diag_dc_err = world.sum(general_diag_dc_err)
    ## general_diag_mr3_err = world.sum(general_diag_mr3_err)
    inverse_chol_err = world.sum(inverse_chol_err)
    inverse_err = world.sum(inverse_err)
    ## assert diag_ex_err < tol
    assert diag_dc_err < tol
    ## assert diag_mr3_err < tol
    ## assert general_diag_ex_err < tol
    assert general_diag_dc_err < tol
    ## assert general_diag_mr3_err < tol
    assert inverse_chol_err < tol
    if dtype == complex:
        assert inverse_err < tol
Example #34
class LCAOTDDFT(GPAW):
    def __init__(self, filename=None, propagator_debug=False,
                 propagator='cn', fxc=None, **kwargs):
        self.time = 0.0
        self.niter = 0
        self.kick_strength = [0.0, 0.0, 0.0]
        GPAW.__init__(self, filename, **kwargs)
        self.propagator_debug = propagator_debug
        self.tddft_initialized = False
        self.fxc = fxc
        self.propagator = propagator

        # Restarting from a file
        if filename is not None:
            self.initialize()
            self.set_positions()

    def propagate_wfs(self, sourceC_nm, targetC_nm, S_MM, H_MM, dt):
        if self.propagator == 'cn':
            return self.linear_propagator(sourceC_nm, targetC_nm, S_MM, H_MM, dt)
        raise NotImplementedError

    def linear_propagator(self, sourceC_nM, targetC_nM, S_MM, H_MM, dt):
        self.timer.start('Linear solve')
        # XXX Debugging stuff. Remove
        if self.propagator_debug:
            if self.blacs:
                globalH_MM = self.blacs_mm_to_global(H_MM)
                globalS_MM = self.blacs_mm_to_global(S_MM)
                if world.rank == 0:
                    tri2full(globalS_MM, 'L')
                    tri2full(globalH_MM, 'L')
                    U_MM = dot(inv(globalS_MM-0.5j*globalH_MM*dt), globalS_MM+0.5j*globalH_MM*dt)
                    debugC_nM = dot(sourceC_nM, U_MM.T.conjugate())
                    #print 'PASS PROPAGATOR'
                    #debugC_nM = sourceC_nM.copy()
            else:
                if world.rank == 0:
                    U_MM = dot(inv(S_MM-0.5j*H_MM*dt), S_MM+0.5j*H_MM*dt)
                    debugC_nM = dot(sourceC_nM, U_MM.T.conjugate())
                #print 'PASS PROPAGATOR'
                #debugC_nM = sourceC_nM.copy()

        if self.blacs:
            target_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex) # XXX, Preallocate
            temp_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex) # XXX, Preallocate
            temp_block_mm = self.mm_block_descriptor.empty(dtype=complex)
            if self.density.gd.comm.rank != 0:
                # XXX Fake blacs nbands, nao, nbands, nao grid because some weird asserts
                # (these are 0,x or x,0 arrays)
                sourceC_nM = self.CnM_unique_descriptor.zeros(dtype=complex)

            # 1. target = (S+0.5j*H*dt) * source
            # Wave functions to target
            self.CnM2nm.redistribute(sourceC_nM, temp_blockC_nm)

            # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
            scalapack_zero(self.mm_block_descriptor, H_MM, 'U') # Remove upper triangle
            temp_block_mm[:] = S_MM - (0.5j*dt) * H_MM  # Lower triangular matrix
            scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U')
            # Now it's a strictly lower triangular matrix
            pblas_tran(-0.5j*dt, H_MM, 1.0, temp_block_mm, self.mm_block_descriptor, self.mm_block_descriptor) # Add transpose of H
            pblas_tran(1.0, S_MM, 1.0, temp_block_mm, self.mm_block_descriptor, self.mm_block_descriptor) # Add transpose of S

            pblas_simple_gemm(self.Cnm_block_descriptor,
                              self.mm_block_descriptor,
                              self.Cnm_block_descriptor,
                              temp_blockC_nm,
                              temp_block_mm,
                              target_blockC_nm)
            # 2. target = (S-0.5j*H*dt)^-1 * target
            #temp_block_mm[:] = S_MM + (0.5j*dt) * H_MM
            # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
            temp_block_mm[:] = S_MM + (0.5j*dt) * H_MM  # Lower triangular matrix
            scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U') # Now it's a strictly lower triangular matrix
            pblas_tran(+0.5j*dt, H_MM, 1.0, temp_block_mm, self.mm_block_descriptor, self.mm_block_descriptor) # Add transpose of H
            pblas_tran(1.0, S_MM, 1.0, temp_block_mm, self.mm_block_descriptor, self.mm_block_descriptor) # Add transpose of S

            scalapack_solve(self.mm_block_descriptor, 
                            self.Cnm_block_descriptor, 
                            temp_block_mm,
                            target_blockC_nm)

            if self.density.gd.comm.rank != 0: # XXX is this correct?
                # XXX Fake blacs nbands, nao, nbands, nao grid because some weird asserts
                # (these are 0,x or x,0 arrays)
                target = self.CnM_unique_descriptor.zeros(dtype=complex)
            else:
                target = targetC_nM
            self.Cnm2nM.redistribute(target_blockC_nm, target)
            self.density.gd.comm.broadcast(targetC_nM, 0) # Is this required?
        else:
            # Note: The full equation is conjugated (therefore -+, not +-)
            targetC_nM[:] = solve(S_MM-0.5j*H_MM*dt, np.dot(S_MM+0.5j*H_MM*dt, sourceC_nM.T.conjugate())).T.conjugate()
        
        # XXX Debugging stuff. Remove
        if self.propagator_debug:
            if world.rank == 0:
                verify(targetC_nM, debugC_nM,
                       'Linear solver propagator vs. reference')

        self.timer.stop('Linear solve')

    def taylor_propagator(self, sourceC_nM, targetC_nM, S_MM, H_MM, dt):
        self.timer.start('Taylor propagator')
        # XXX Debugging stuff. Remove
        if self.propagator_debug:
            if self.blacs:
                globalH_MM = self.blacs_mm_to_global(H_MM)
                globalS_MM = self.blacs_mm_to_global(S_MM) 
                if world.rank == 0:
                    tri2full(globalS_MM, 'L')
                    tri2full(globalH_MM, 'L')
                    U_MM = dot(inv(globalS_MM-0.5j*globalH_MM*dt), globalS_MM+0.5j*globalH_MM*dt)
                    debugC_nM = dot(sourceC_nM, U_MM.T.conjugate())
                    #print 'PASS PROPAGATOR'
                    #debugC_nM = sourceC_nM.copy()
            else:
                if world.rank == 0:
                    U_MM = dot(inv(S_MM - 0.5j * H_MM * dt),
                               S_MM + 0.5j * H_MM * dt)
                    debugC_nM = dot(sourceC_nM, U_MM.T.conjugate())
                #print 'PASS PROPAGATOR'
                #debugC_nM = sourceC_nM.copy()

        if self.blacs:
            target_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex) # XXX, Preallocate
            if self.density.gd.comm.rank != 0: 
                # XXX Fake blacs nbands, nao, nbands, nao grid because some weird asserts
                # (these are 0,x or x,0 arrays)
                sourceC_nM = self.CnM_unique_descriptor.zeros(dtype=complex)

            # Zeroth order taylor to target
            self.CnM2nm.redistribute(sourceC_nM, target_blockC_nm) 

            # XXX, preallocate, optimize use of temporal arrays
            temp_blockC_nm = target_blockC_nm.copy()
            temp2_blockC_nm = target_blockC_nm.copy()

            order = 4
            assert self.wfs.kd.comm.size == 1
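            # Fourth-order Taylor expansion of the propagator: each pass
            # multiplies the previous term by -1j*dt/(n+1)*H and by the
            # inverse overlap, then adds it to the target coefficients.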
            for n in range(order):
                # Multiply with hamiltonian
                pblas_simple_hemm(self.mm_block_descriptor, 
                                  self.Cnm_block_descriptor, 
                                  self.Cnm_block_descriptor, 
                                  H_MM, 
                                  temp_blockC_nm, 
                                  temp2_blockC_nm, side='R') 
                # XXX: replace with not simple gemm
                temp2_blockC_nm *= -1j*dt/(n+1) 
                # Multiply with inverse overlap
                pblas_simple_hemm(self.mm_block_descriptor, 
                                  self.Cnm_block_descriptor,
                                  self.Cnm_block_descriptor, 
                                  self.wfs.kpt_u[0].invS_MM, # XXX
                                  temp2_blockC_nm, 
                                  temp_blockC_nm, side='R')
                target_blockC_nm += temp_blockC_nm
            if self.density.gd.comm.rank != 0: # Todo: Change to gd.rank
                # XXX Fake blacs nbands, nao, nbands, nao grid because some weird asserts
                # (these are 0,x or x,0 arrays)
                target = self.CnM_unique_descriptor.zeros(dtype=complex)
            else:
                target = targetC_nM
            self.Cnm2nM.redistribute(target_blockC_nm, target)

            self.density.gd.comm.broadcast(targetC_nM, 0)
        else:
            assert self.wfs.kd.comm.size == 1
            if self.density.gd.comm.rank == 0:
                targetC_nM[:] = sourceC_nM[:]
                tempC_nM = sourceC_nM.copy()
                order = 4
                for n in range(order):
                    tempC_nM[:] = np.dot(self.wfs.kpt_u[0].invS, np.dot(H_MM, 1j*dt/(n+1)*tempC_nM.T.conjugate())).T.conjugate()
                    targetC_nM += tempC_nM
            self.density.gd.comm.broadcast(targetC_nM, 0)
                
        if self.propagator_debug:
            if world.rank == 0:
                verify(targetC_nM, debugC_nM,
                       'Linear solver propagator vs. reference')

        self.timer.stop('Taylor propagator')

    def kick(self, strength):
        self.tddft_init()
        self.timer.start('Kick')
        self.kick_strength = strength

        # magnitude
        magnitude = np.sqrt(strength[0]*strength[0] 
                             + strength[1]*strength[1] 
                             + strength[2]*strength[2])

        # normalize
        direction = strength / magnitude

        self.text('Applying absorption kick')
        self.text('Magnitude: %.8f ' % magnitude)
        self.text('Direction: %.4f %.4f %.4f' % tuple(direction))

        # Create hamiltonian object for absorption kick
        kick_hamiltonian = KickHamiltonian(self, ConstantElectricField(magnitude, direction=direction))
        for k, kpt in enumerate(self.wfs.kpt_u):
            Vkick_MM = self.wfs.eigensolver.calculate_hamiltonian_matrix(kick_hamiltonian, self.wfs, kpt, add_kinetic=False, root=-1)
            for i in range(10):
                self.propagate_wfs(kpt.C_nM, kpt.C_nM, kpt.S_MM, Vkick_MM, 0.1)
        self.timer.stop('Kick')

    def blacs_mm_to_global(self, H_mm):
        target = self.MM_descriptor.empty(dtype=complex)
        self.mm2MM.redistribute(H_mm, target)
        world.barrier()
        return target

    def blacs_nm_to_global(self, C_nm):
        target = self.CnM_unique_descriptor.empty(dtype=complex)
        self.Cnm2nM.redistribute(C_nm, target)
        world.barrier()
        return target

    def tddft_init(self):
        if not self.tddft_initialized:
            if world.rank == 0:
                print('Initializing real time LCAO TD-DFT calculation.')
                print('XXX Warning: Array use not optimal for memory.')
                print('XXX Taylor propagator probably doesn\'t work')
                print('XXX ...and no arrays are listed in memory estimate yet.')
            self.blacs = self.wfs.ksl.using_blacs
            if self.blacs:
                self.ksl = ksl = self.wfs.ksl    
                nao = ksl.nao
                nbands = ksl.bd.nbands
                mynbands = ksl.bd.mynbands
                blocksize = ksl.blocksize

                from gpaw.blacs import Redistributor
                if world.rank == 0:
                    print('BLACS Parallelization')

                # Parallel grid descriptors
                self.MM_descriptor = ksl.blockgrid.new_descriptor(nao, nao, nao, nao) # FOR DEBUG
                self.mm_block_descriptor = ksl.blockgrid.new_descriptor(nao, nao, blocksize, blocksize)
                self.Cnm_block_descriptor = ksl.blockgrid.new_descriptor(nbands, nao, blocksize, blocksize)
                #self.CnM_descriptor = ksl.blockgrid.new_descriptor(nbands, nao, mynbands, nao)
                self.mM_column_descriptor = ksl.single_column_grid.new_descriptor(nao, nao, ksl.naoblocksize, nao)
                self.CnM_unique_descriptor = ksl.single_column_grid.new_descriptor(nbands, nao, mynbands, nao)

                # Redistributors
                self.mm2MM = Redistributor(ksl.block_comm,
                                           self.mm_block_descriptor,
                                           self.MM_descriptor) # XXX FOR DEBUG
                self.MM2mm = Redistributor(ksl.block_comm,
                                           self.MM_descriptor,
                                           self.mm_block_descriptor) # XXX FOR DEBUG
                self.Cnm2nM = Redistributor(ksl.block_comm,
                                            self.Cnm_block_descriptor,
                                            self.CnM_unique_descriptor) 
                self.CnM2nm = Redistributor(ksl.block_comm,
                                            self.CnM_unique_descriptor,
                                            self.Cnm_block_descriptor) 
                self.mM2mm =  Redistributor(ksl.block_comm,
                                            self.mM_column_descriptor,
                                            self.mm_block_descriptor)

                for kpt in self.wfs.kpt_u:
                    scalapack_zero(self.mm_block_descriptor, kpt.S_MM,'U')
                    scalapack_zero(self.mm_block_descriptor, kpt.T_MM,'U')

                # XXX to propagator class
                if self.propagator == 'taylor' and self.blacs:  
                    # cholS_mm = self.mm_block_descriptor.empty(dtype=complex)
                    for kpt in self.wfs.kpt_u:
                        kpt.invS_MM = kpt.S_MM.copy()
                        scalapack_inverse(self.mm_block_descriptor,
                                          kpt.invS_MM, 'L')
                    if self.propagator_debug:
                        if world.rank == 0:
                            print('XXX Doing serial inversion of overlap matrix.')
                        self.timer.start('Invert overlap (serial)')
                        invS2_MM = self.MM_descriptor.empty(dtype=complex)
                        for kpt in self.wfs.kpt_u:
                            #kpt.S_MM[:] = 128.0*(2**world.rank)
                            self.mm2MM.redistribute(self.wfs.S_qMM[kpt.q], invS2_MM)
                            world.barrier()
                            if world.rank == 0:
                                tri2full(invS2_MM,'L')
                                invS2_MM[:] = inv(invS2_MM.copy())
                                self.invS2_MM = invS2_MM
                            kpt.invS2_MM = ksl.mmdescriptor.empty(dtype=complex)
                            self.MM2mm.redistribute(invS2_MM, kpt.invS2_MM)
                            verify(kpt.invS_MM, kpt.invS2_MM, 'overlap par. vs. serial', 'L')
                        self.timer.stop('Invert overlap (serial)')
                        if world.rank == 0:
                            print('XXX Overlap inverted.')
                if self.propagator == 'taylor' and not self.blacs:
                    tmp = inv(self.wfs.kpt_u[0].S_MM)
                    self.wfs.kpt_u[0].invS = tmp

            # Reset the density mixer
            self.density.mixer = DummyMixer()    
            self.tddft_initialized = True
            for k, kpt in enumerate(self.wfs.kpt_u):
                kpt.C2_nM = kpt.C_nM.copy()
                #kpt.firstC_nM = kpt.C_nM.copy()

    def update_projectors(self):
        self.timer.start('LCAO update projectors') 
        # Loop over all k-points
        for k, kpt in enumerate(self.wfs.kpt_u):
            for a, P_ni in kpt.P_ani.items():
                print('Update projector: Rank:', world.rank, 'a', a)
                P_ni.fill(117)
                gemm(1.0, kpt.P_aMi[a], kpt.C_nM, 0.0, P_ni, 'n')
        self.timer.stop('LCAO update projectors') 

    def save_wfs(self):
        for k, kpt in enumerate(self.wfs.kpt_u):
            kpt.C2_nM[:] = kpt.C_nM

    def update_hamiltonian(self):
        self.update_projectors()
        self.density.update(self.wfs)
        self.hamiltonian.update(self.density)

    def propagate(self, time_step=10, iterations=2000, out='lcao.dm',
                  dump_interval=50):
        assert self.wfs.dtype == complex
        time_step *= attosec_to_autime
        self.time_step = time_step
        self.dump_interval = dump_interval
        maxiter = self.niter + iterations

        if self.time < self.time_step:
            self.dm_file = paropen(out,'w') # XXXX
            # Bug: will fail if world != self.wfs.world.  -askhl
            header = '# Kick = [%22.12le, %22.12le, %22.12le]\n' \
                   % (self.kick_strength[0], self.kick_strength[1], \
                      self.kick_strength[2])
            header += '# %15s %15s %22s %22s %22s\n' \
                   % ('time', 'norm', 'dmx', 'dmy', 'dmz')
            self.dm_file.write(header)
            self.dm_file.flush()
            self.text('About to do %d propagation steps.' % iterations)
        else:
            self.dm_file = paropen(out,'a') # XXXX
            self.text('About to continue from iteration %d and do %d propagation steps' % (self.niter, maxiter)) 
        self.tddft_init()

        dm0 = None # Initial dipole moment
        self.timer.start('Propagate')
        while self.niter < maxiter:
            dm = self.density.finegd.calculate_dipole_moment(self.density.rhot_g)
            if dm0 is None:
                dm0 = dm
            norm = self.density.finegd.integrate(self.density.rhot_g)
            line = '%20.8lf %20.8le %22.12le %22.12le %22.12le' % (self.time, norm, dm[0], dm[1], dm[2])
            T = localtime()
            if world.rank == 0:
                print(line, file=self.dm_file)

            if world.rank == 0 and self.niter%10==0:
                print('iter: %3d  %02d:%02d:%02d %11.2f   %9.1f %12.8f' % (self.niter,
                                                                           T[3], T[4], T[5],
                                                                           self.time * autime_to_attosec,
                                                                           log(abs(norm)+1e-16)/log(10),
                                                                           np.sqrt(dm[0]**2+dm[1]**2+dm[2]**2)))
                self.dm_file.flush()

            # ---------------------------------------------------------------------------- 
            # Predictor step
            # ----------------------------------------------------------------------------
            # 1. Calculate H(t)
            self.save_wfs() # kpt.C2_nM = kpt.C_nM
            # 2. H_MM(t) = <M|H(t)|M'>
            #    Solve Psi(t+dt) from (S_MM - 0.5j*H_MM(t)*dt) Psi(t+dt) = (S_MM + 0.5j*H_MM(t)*dt) Psi(t)

            for k, kpt in enumerate(self.wfs.kpt_u):
                if self.fxc is not None:
                    if self.time == 0.0:
                        kpt.deltaXC_H_MM = self.wfs.eigensolver.calculate_hamiltonian_matrix(\
                            self.hamiltonian, self.wfs, kpt, root=-1)
                        self.hamiltonian.xc = XC(self.fxc)
                        self.update_hamiltonian()
                        assert len(self.wfs.kpt_u) == 1
                        kpt.deltaXC_H_MM -= self.wfs.eigensolver.calculate_hamiltonian_matrix(\
                            self.hamiltonian, self.wfs, kpt, root=-1)

            self.update_hamiltonian()

            for k, kpt in enumerate(self.wfs.kpt_u):
                kpt.H0_MM = self.wfs.eigensolver.calculate_hamiltonian_matrix(
                    self.hamiltonian, self.wfs, kpt, root=-1)
                if self.fxc is not None:
                    kpt.H0_MM += kpt.deltaXC_H_MM
                self.propagate_wfs(kpt.C_nM, kpt.C_nM, kpt.S_MM, kpt.H0_MM, self.time_step)
            # ----------------------------------------------------------------------------
            # Propagator step
            # ----------------------------------------------------------------------------
            # 1. Calculate H(t+dt)
            self.update_hamiltonian()
            # 2. Estimate H(t+0.5*dt) ~ 0.5 * (H(t) + H(t+dt))
            for k, kpt in enumerate(self.wfs.kpt_u):
                kpt.H0_MM *= 0.5
                # Accumulate the average directly in H0_MM to save one extra H_MM of memory.
                if self.fxc is not None:
                    kpt.H0_MM += 0.5 * (self.wfs.eigensolver.calculate_hamiltonian_matrix(
                        self.hamiltonian, self.wfs, kpt, root=-1) +
                        kpt.deltaXC_H_MM)
                else:
                    kpt.H0_MM += 0.5 * self.wfs.eigensolver.calculate_hamiltonian_matrix(
                        self.hamiltonian, self.wfs, kpt, root=-1)

                # 3. Solve Psi(t+dt) from (S_MM - 0.5j*H_MM(t+0.5*dt)*dt) Psi(t+dt) = (S_MM + 0.5j*H_MM(t+0.5*dt)*dt) Psi(t)
                self.propagate_wfs(kpt.C2_nM, kpt.C_nM, kpt.S_MM, kpt.H0_MM, self.time_step)

            self.niter += 1
            self.time += self.time_step
            
            # Call registered callback functions
            self.call_observers(self.niter)

        self.call_observers(self.niter, final=True)
        self.dm_file.close()
        self.timer.stop('Propagate')
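The comment block in the predictor step spells out the Crank-Nicolson equation that
propagate_wfs is expected to solve: (S_MM - 0.5j*H_MM*dt) Psi(t+dt) = (S_MM + 0.5j*H_MM*dt) Psi(t).
Below is a minimal serial sketch of that single step in plain NumPy; the helper name
crank_nicolson_step is hypothetical, and GPAW's own propagate_wfs performs the same solve
with parallel (BLACS/ScaLAPACK) linear algebra rather than numpy.linalg.

import numpy as np

def crank_nicolson_step(C_nM, S_MM, H_MM, dt):
    """Hypothetical serial sketch of one Crank-Nicolson LCAO step.

    Solves (S - 0.5j*dt*H) C(t+dt) = (S + 0.5j*dt*H) C(t) for the
    coefficient vectors; each band (row of C_nM) is propagated
    independently, so the whole coefficient matrix fits in one solve.
    """
    A_MM = S_MM - 0.5j * dt * H_MM   # left-hand operator
    B_MM = S_MM + 0.5j * dt * H_MM   # right-hand operator
    # C_nM stores one state per row, so the solve acts on its transpose.
    return np.linalg.solve(A_MM, B_MM @ C_nM.T).T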
Beispiel #35
0
class LrDiagonalizeLayout:
    """BLACS layout for distributed Omega matrix in linear response
       time-dependet DFT calculations"""
    def __init__(self, sl_lrtddft, nrows, lr_comms):
        self.mprocs, self.nprocs, self.block_size = tuple(sl_lrtddft)

        self.lr_comms = lr_comms

        # original grid, i.e., how the matrix is stored
        self.matrix_grid = BlacsGrid(self.lr_comms.parent_comm,
                                     self.lr_comms.dd_comm.size,
                                     self.lr_comms.eh_comm.size)

        # diagonalization grid
        self.diag_grid = BlacsGrid(self.lr_comms.parent_comm, self.mprocs,
                                   self.nprocs)

        # -----------------------------------------------------------------
        # For ScaLAPACK we need the TRANSPOSED matrix (and vector).
        #
        # M = rows, N = cols
        M = nrows
        N = nrows
        mb = 1
        nb = 1
        self.matrix_descr = self.matrix_grid.new_descriptor(N, M, nb, mb)

        bs = self.block_size
        self.diag_descr = self.diag_grid.new_descriptor(N, M, bs, bs)

        self.diag_in_redist = Redistributor(self.lr_comms.parent_comm,
                                            self.matrix_descr, self.diag_descr)

        self.diag_out_redist = Redistributor(self.lr_comms.parent_comm,
                                             self.diag_descr,
                                             self.matrix_descr)

    def diagonalize(self, eigenvectors, eigenvalues):
        """Diagonalize symmetric distributed Casida matrix using Scalapack.
        Parameters:

        eigenvectors
          distributed Casida matrix on input, distributed eigenvectors on output

        eigenvalues
          zero array on input, eigenvalues on output
        """
        O_diag = self.diag_descr.empty(dtype=float)
        if self.matrix_descr.blacsgrid.is_active():
            O_orig = eigenvectors
        else:
            # Ranks outside the storage grid contribute an empty local block.
            O_orig = np.empty((0, 0), dtype=float)

        self.diag_in_redist.redistribute(O_orig, O_diag)

        self.diag_descr.diagonalize_dc(O_diag.copy(), O_diag, eigenvalues, 'L')

        self.diag_out_redist.redistribute(O_diag, O_orig)

        self.lr_comms.parent_comm.broadcast(eigenvalues, 0)
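As noted in the docstring above, the distributed diagonalization reduces, on a single
process, to an ordinary symmetric eigendecomposition. The snippet below is only a serial
NumPy analogue of diagonalize, meant to illustrate what diagonalize_dc computes for the
distributed Omega matrix; the in-place conventions and the row layout of the eigenvectors
are assumptions chosen for illustration, not the actual BLACS code path.

import numpy as np

def diagonalize_serial(O_matrix, eigenvalues):
    """Hypothetical serial stand-in for LrDiagonalizeLayout.diagonalize.

    Fills eigenvalues in place and overwrites O_matrix with the
    eigenvectors of the symmetric Casida/Omega matrix.
    """
    w, v = np.linalg.eigh(O_matrix)  # symmetric eigenproblem
    eigenvalues[:] = w
    # Eigenvectors stored as rows here purely for illustration; the
    # distributed routine follows ScaLAPACK's transposed storage convention.
    O_matrix[:] = v.T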
Beispiel #36
0
# Imports assumed for a standalone run (GPAW's parallel linear-algebra helpers).
from gpaw.mpi import world
from gpaw.blacs import BlacsGrid, Redistributor

grid = BlacsGrid(world, 2, world.size // 2)

desc = grid.new_descriptor(12, 8, 2, 3)

a = desc.zeros()
a[:] = world.rank

subdesc = grid.new_descriptor(7, 7, 2, 2)
b = subdesc.zeros()

r = Redistributor(grid.comm, desc, subdesc, uplo='G')

ia = 3
ja = 2
ib = 1
jb = 1
M = 4
N = 5

r.redistribute(a, b, M, N, ia, ja, ib, jb)

a0 = desc.collect_on_master(a)
b0 = subdesc.collect_on_master(b)
if world.rank == 0:
    print(a0)
    print(b0)
    xa = a0[ia:ia + M, ja:ja + N]
    xb = b0[ib:ib + M, jb:jb + N]
    assert (xa == xb).all()
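In serial terms, the redistribute call with explicit offsets copies an M x N submatrix of
a starting at (ia, ja) into b starting at (ib, jb), which is exactly what the final
assertion checks on the collected arrays. A plain NumPy picture of the same copy (array
shapes and offsets taken from the test above) looks like this:

import numpy as np

# Serial stand-in for the distributed submatrix copy above.
a_serial = np.arange(12 * 8, dtype=float).reshape(12, 8)  # plays the role of a0
b_serial = np.zeros((7, 7))                               # plays the role of b0

M, N = 4, 5
ia, ja, ib, jb = 3, 2, 1, 1
b_serial[ib:ib + M, jb:jb + N] = a_serial[ia:ia + M, ja:ja + N]

assert (b_serial[ib:ib + M, jb:jb + N] == a_serial[ia:ia + M, ja:ja + N]).all()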