def calculate_blocked_density_matrix(self, f_n, C_nM):
    """Compute the density matrix on the 2D block-cyclic BLACS layout.

    f_n: occupation numbers.  C_nM: coefficient matrix (meaningful on
    gd.rank == 0; other ranks supply dummy arrays).  Returns rho_mm
    distributed according to self.mmdescriptor.
    """
    nbands = self.bd.nbands
    nao = self.nao
    dtype = C_nM.dtype

    self.nMdescriptor.checkassert(C_nM)
    if self.gd.rank == 0:
        # Occupation-weighted, conjugated coefficients on the masters.
        Cf_nM = (C_nM * f_n[:, None]).conj()
    else:
        # Off-master ranks participate in the redistribution with
        # dummy source arrays.
        C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
        Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

    r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                      self.mmdescriptor)

    Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
    r.redistribute(Cf_nM, Cf_mm, nbands, nao)
    del Cf_nM

    C_mm = self.mmdescriptor.zeros(dtype=dtype)
    r.redistribute(C_nM, C_mm, nbands, nao)
    # no use to delete C_nM as it's in the input...

    rho_mm = self.mmdescriptor.zeros(dtype=dtype)
    # rho = (C f)^T . C as a distributed PBLAS gemm on the 2D layout.
    pblas_simple_gemm(self.mmdescriptor, self.mmdescriptor,
                      self.mmdescriptor, Cf_mm, C_mm, rho_mm, transa='T')
    return rho_mm
def scalapack_diagonalize(self, H_sS):
    """Diagonalize the distributed matrix H_sS with ScaLAPACK.

    H_sS is distributed in row stripes (nS_local rows per rank).  It is
    redistributed to a 2D block-cyclic layout, diagonalized with the
    divide-and-conquer driver, and the eigenvectors are redistributed
    back.  Returns (w_S, v_sS.conj()) -- eigenvalues and conjugated
    eigenvectors.
    """
    mb = 32  # ScaLAPACK block size for the 2D layout
    N = self.nS

    # 1D grid matching the row-striped input layout:
    g1 = BlacsGrid(world, size, 1)
    # 2D grid used for the actual diagonalization:
    g2 = BlacsGrid(world, size // 2, 2)
    nndesc1 = g1.new_descriptor(N, N, self.nS_local, N)
    nndesc2 = g2.new_descriptor(N, N, mb, mb)

    A_ss = nndesc2.empty(dtype=H_sS.dtype)
    redistributor = Redistributor(world, nndesc1, nndesc2)
    redistributor.redistribute(H_sS, A_ss)

    # diagonalize
    v_ss = nndesc2.zeros(dtype=A_ss.dtype)
    w_S = np.zeros(N, dtype=float)
    nndesc2.diagonalize_dc(A_ss, v_ss, w_S, 'L')

    # distribute the eigenvectors to master
    v_sS = np.zeros_like(H_sS)
    redistributor = Redistributor(world, nndesc2, nndesc1)
    redistributor.redistribute(v_ss, v_sS)

    # v2_SS = np.zeros((self.nS, self.nS), dtype=complex)
    # world.all_gather(v_sS, v2_SS)

    return w_S, v_sS.conj()
def scalapack_diagonalize(self, H_sS):
    """Solve the distributed eigenproblem for H_sS via ScaLAPACK.

    The row-striped input is moved to a 2D block-cyclic layout,
    diagonalized with the divide-and-conquer driver, and the
    eigenvectors are moved back to the striped layout.
    """
    blocksize = 32
    nS = self.nS

    # Source (striped) and target (block-cyclic) grids and descriptors.
    stripe_grid = BlacsGrid(world, size, 1)
    block_grid = BlacsGrid(world, size // 2, 2)
    stripe_desc = stripe_grid.new_descriptor(nS, nS, self.nS_local, nS)
    block_desc = block_grid.new_descriptor(nS, nS, blocksize, blocksize)

    # Scatter the Hamiltonian onto the 2D layout.
    A_ss = block_desc.empty(dtype=H_sS.dtype)
    Redistributor(world, stripe_desc, block_desc).redistribute(H_sS, A_ss)

    # Diagonalize (lower triangle).
    v_ss = block_desc.zeros(dtype=A_ss.dtype)
    w_S = np.zeros(nS, dtype=float)
    block_desc.diagonalize_dc(A_ss, v_ss, w_S, 'L')

    # Bring the eigenvectors back to the striped layout.
    v_sS = np.zeros_like(H_sS)
    Redistributor(world, block_desc, stripe_desc).redistribute(v_ss, v_sS)

    return w_S, v_sS.conj()
def diagonalize(self):
    # Diagonalize the BSE Hamiltonian, choosing the solver based on the
    # Tamm-Dancoff flag (self.td) and the number of ranks.
    print('Diagonalizing Hamiltonian', file=self.fd)
    """The t and T represent local and global eigenstates indices respectively """

    # Non-Hermitian matrix can only use linalg.eig
    if not self.td:
        print(' Using numpy.linalg.eig...', file=self.fd)
        print(' Eliminated %s pair orbitals' % len(self.excludef_S),
              file=self.fd)

        self.H_SS = self.collect_A_SS(self.H_sS)
        self.w_T = np.zeros(self.nS - len(self.excludef_S), complex)
        if world.rank == 0:
            # Drop the excluded pair orbitals before diagonalizing.
            self.H_SS = np.delete(self.H_SS, self.excludef_S, axis=0)
            self.H_SS = np.delete(self.H_SS, self.excludef_S, axis=1)
            self.w_T, self.v_ST = np.linalg.eig(self.H_SS)
        world.broadcast(self.w_T, 0)
        self.df_S = np.delete(self.df_S, self.excludef_S)
        self.rhoG0_S = np.delete(self.rhoG0_S, self.excludef_S)
        # Here the eigenvectors are returned as complex conjugated rows
    else:
        if world.size == 1:
            print(' Using lapack...', file=self.fd)
            from gpaw.utilities.lapack import diagonalize
            self.w_T = np.zeros(self.nS)
            diagonalize(self.H_sS, self.w_T)
            self.v_St = self.H_sS.conj().T
        else:
            print(' Using scalapack...', file=self.fd)
            nS = self.nS
            # Rows per rank (ceil division over k-points times the
            # transition-space blow-up factors).
            ns = -(-self.kd.nbzkpts // world.size) * (self.nv * self.nc *
                                                      self.spins *
                                                      (self.spinors + 1)**2)
            grid = BlacsGrid(world, world.size, 1)
            desc = grid.new_descriptor(nS, nS, ns, nS)
            desc2 = grid.new_descriptor(nS, nS, 2, 2)

            H_tmp = desc2.zeros(dtype=complex)
            r = Redistributor(world, desc, desc2)
            r.redistribute(self.H_sS, H_tmp)

            self.w_T = np.empty(nS)
            v_tmp = desc2.empty(dtype=complex)
            desc2.diagonalize_dc(H_tmp, v_tmp, self.w_T)

            # Redistribute eigenvectors back to the row-striped layout.
            r = Redistributor(grid.comm, desc2, desc)
            self.v_St = desc.zeros(dtype=complex)
            r.redistribute(v_tmp, self.v_St)
            self.v_St = self.v_St.conj().T

    if self.write_v and self.td:
        # Cannot use par_save without td
        self.par_save('v_TS.ulm', 'v_TS', self.v_St.T)

    return
def redistribute_H(self, H_sS):
    """Repartition H from row stripes (H_sS) into column stripes (H_Ss)."""
    nS = self.nS
    row_grid = BlacsGrid(world, size, 1)
    col_grid = BlacsGrid(world, 1, size)
    row_desc = row_grid.new_descriptor(nS, nS, self.nS_local, nS)
    col_desc = col_grid.new_descriptor(nS, nS, nS, self.nS_local)

    H_Ss = col_desc.empty(dtype=H_sS.dtype)
    Redistributor(world, row_desc, col_desc).redistribute(H_sS, H_Ss)
    return H_Ss
def parallel_eigh(matrixfile, blacsgrid=(4, 2), blocksize=64):
    """Diagonalize matrix in parallel"""
    assert np.prod(blacsgrid) == world.size
    bgrid = BlacsGrid(world, *blacsgrid)

    # Only MASTER loads the matrix; every rank learns its size by
    # summing the (single non-zero) contribution over the communicator.
    if world.rank == MASTER:
        H_MM = np.load(matrixfile)
        assert H_MM.ndim == 2
        assert H_MM.shape[0] == H_MM.shape[1]
        order = len(H_MM)
    else:
        order = 0
    order = world.sum(order)

    # Descriptors: block-cyclic pieces vs. the whole matrix on MASTER.
    blocked = bgrid.new_descriptor(order, order, blocksize, blocksize)
    onmaster = bgrid.new_descriptor(order, order, order, order)

    # Slaves need a dummy array compatible with the master descriptor.
    if world.rank != MASTER:
        H_MM = onmaster.zeros()
    assert onmaster.check(H_MM)

    # Scatter the global matrix into block-cyclic form.
    H_mm = blocked.empty()
    Redistributor(world, onmaster, blocked).redistribute(H_MM, H_mm)

    # Distributed diagonalization.
    eps_M = np.empty(order)
    C_mm = blocked.empty()
    blocked.diagonalize_ex(H_mm, C_mm, eps_M)

    # Gather the eigenvectors back on MASTER.
    C_MM = onmaster.empty()
    Redistributor(world, blocked, onmaster).redistribute(C_mm, C_MM)

    if world.rank == MASTER:
        return eps_M, C_MM
    return None, None
def distribute_MM(wfs, a_MM):
    """Scatter a replicated NxN matrix onto the KSL BLACS block layout."""
    ksl = wfs.ksl
    if not ksl.using_blacs:
        # Serial layout -- nothing to distribute.
        return a_MM

    dtype = a_MM.dtype
    comm = ksl.block_comm
    nao = ksl.nao

    # Descriptor holding the whole matrix on the 1x1 grid only.
    master_grid = BlacsGrid(comm, 1, 1)
    master_desc = master_grid.new_descriptor(nao, nao, nao, nao)
    scatter = Redistributor(comm, master_desc, ksl.mmdescriptor)

    if comm.rank != 0:
        # Dummy source array on the other ranks.
        a_MM = master_desc.empty(dtype=dtype)
    a_mm = ksl.mmdescriptor.empty(dtype=dtype)
    scatter.redistribute(a_MM, a_mm)
    return a_mm
def redistribute(self, in_wGG, out_x=None):
    """Redistribute array.

    Switch between two kinds of parallel distributions:

    1) parallel over G-vectors (second dimension of in_wGG)
    2) parallel over frequency (first dimension of in_wGG)

    Returns new array using the memory in the 1-d array out_x.
    """
    comm = self.blockcomm

    if comm.size == 1:
        # Nothing is distributed; the input is already complete.
        return in_wGG

    nw = len(self.omega_w)
    nG = in_wGG.shape[2]
    # Per-rank chunk sizes (ceil division).
    mynw = (nw + comm.size - 1) // comm.size
    mynG = (nG + comm.size - 1) // comm.size

    bg1 = BlacsGrid(comm, comm.size, 1)
    bg2 = BlacsGrid(comm, 1, comm.size)
    md1 = BlacsDescriptor(bg1, nw, nG**2, mynw, nG**2)
    md2 = BlacsDescriptor(bg2, nw, nG**2, nw, mynG * nG)

    # Direction is inferred from the current first-axis length:
    # a full frequency axis means we are G-distributed.
    if len(in_wGG) == nw:
        mdin = md2
        mdout = md1
    else:
        mdin = md1
        mdout = md2

    r = Redistributor(comm, mdin, mdout)

    outshape = (mdout.shape[0], mdout.shape[1] // nG, nG)
    if out_x is None:
        out_wGG = np.empty(outshape, complex)
    else:
        # np.prod: np.product was deprecated and removed in NumPy 2.0.
        out_wGG = out_x[:np.prod(outshape)].reshape(outshape)

    r.redistribute(in_wGG.reshape(mdin.shape),
                   out_wGG.reshape(mdout.shape))

    return out_wGG
def scal_diagonalize(A, nodes='master'):
    # Diagonalize matrix A (size N*N) with scalapack
    # Usage: eps, B = scal_diagonalize(A)
    # eps and B are the eigenvalues and eigenvectors
    # nodes = 'master': eigenvectors only available on master node
    # nodes = 'all': eigenvectors broadcast to all nodes

    # make sure A is N*N, and hermitian
    N = A.shape[0]
    assert A.shape[0] == A.shape[1]
    for i in range(N):
        for j in range(i, N):
            assert A[i, j] == A[j, i].conj()

    # create blacs descriptor
    mb = 64
    g = BlacsGrid(world, 2, size // 2)
    nndesc1 = g.new_descriptor(N, N, N, N)  # whole matrix in one block
    nndesc2 = g.new_descriptor(N, N, mb, mb)  # block-cyclic layout

    # distribute A to blacs grid A_
    if rank != 0:
        # Dummy source array off the master.
        A = nndesc1.zeros(dtype=A.dtype)
    A_ = nndesc2.empty(dtype=A.dtype)
    redistributor = Redistributor(world, nndesc1, nndesc2)
    redistributor.redistribute(A, A_)

    # diagonalize
    B_ = nndesc2.zeros(dtype=A.dtype)
    eps = np.zeros(N, dtype=A.dtype)
    nndesc2.diagonalize_dc(A_, B_, eps, 'L')

    # distribute the eigenvectors to master
    B = np.zeros_like(A)
    redistributor = Redistributor(world, nndesc2, nndesc1)
    redistributor.redistribute(B_, B)

    if nodes == 'master':
        return eps, B
    elif nodes == 'all':
        if rank != 0:
            B = np.zeros((N, N))
        world.broadcast(B, 0)
        return eps, B
def scal_diagonalize(A, nodes='master'):
    """Diagonalize the hermitian N*N matrix A with ScaLAPACK.

    nodes='master': eigenvectors available on the master rank only.
    nodes='all': eigenvectors broadcast to every rank.
    Returns (eps, B) -- eigenvalues and eigenvectors.
    """
    N = A.shape[0]
    assert A.shape[0] == A.shape[1]
    # Check hermiticity element by element.
    for row in range(N):
        for col in range(row, N):
            assert A[row, col] == A[col, row].conj()

    # BLACS descriptors: whole matrix in one block vs. block-cyclic.
    blocksize = 64
    grid = BlacsGrid(world, 2, size // 2)
    full_desc = grid.new_descriptor(N, N, N, N)
    cyclic_desc = grid.new_descriptor(N, N, blocksize, blocksize)

    # Scatter A from the master into the block-cyclic layout.
    if rank != 0:
        A = full_desc.zeros(dtype=A.dtype)
    A_ = cyclic_desc.empty(dtype=A.dtype)
    Redistributor(world, full_desc, cyclic_desc).redistribute(A, A_)

    # Divide-and-conquer eigensolver on the lower triangle.
    B_ = cyclic_desc.zeros(dtype=A.dtype)
    eps = np.zeros(N, dtype=A.dtype)
    cyclic_desc.diagonalize_dc(A_, B_, eps, 'L')

    # Gather the eigenvectors back on the master.
    B = np.zeros_like(A)
    Redistributor(world, cyclic_desc, full_desc).redistribute(B_, B)

    if nodes == 'master':
        return eps, B
    elif nodes == 'all':
        if rank != 0:
            B = np.zeros((N, N))
        world.broadcast(B, 0)
        return eps, B
def distribute_to_columns(self, rho_mm, srcdescriptor):
    """Redistribute rho_mm from srcdescriptor to the column layout.

    The result lands on the grid masters (mM_unique_descriptor) and is
    then broadcast over the domain communicator.
    """
    redistributor = Redistributor(
        self.block_comm,  # XXX
        srcdescriptor,
        self.mM_unique_descriptor)
    rho_mM = redistributor.redistribute(rho_mm)
    if self.gd.rank != 0:
        # Non-masters allocate a receive buffer for the broadcast.
        rho_mM = self.mMdescriptor.zeros(dtype=rho_mm.dtype)
    self.gd.comm.broadcast(rho_mM, 0)
    return rho_mM
def test(comm, M, N, mcpus, ncpus, mb, nb):
    """Print how an M-by-N matrix is block-cyclically distributed.

    Builds a 1x1 (master-only) descriptor and an mcpus-by-ncpus grid
    with mb-by-nb blocks, fills each local block with the owning rank's
    id, then collects on the master to show which rank stores each
    global element.
    """
    grid0 = BlacsGrid(comm, 1, 1)
    desc0 = grid0.new_descriptor(M, N, M, N, 0, 0)
    A_mn = desc0.zeros(dtype=float)
    A_mn[:] = comm.size + 1  # sentinel value: never a valid rank id

    grid1 = BlacsGrid(comm, mcpus, ncpus)
    desc1 = grid1.new_descriptor(M, N, mb, nb, 0, 0)  # ???
    B_mn = desc1.zeros(dtype=float)
    B_mn[:] = comm.rank

    if comm.rank == 0:
        msg = 'Slices of global matrix indices by rank'
        print(msg)
        print('-' * len(msg))

    # Ranks take turns printing their local blocks, in rank order.
    for rank in range(comm.size):
        comm.barrier()
        if rank == comm.rank:
            print('Rank %d:' % rank)
            last_Mstart = -1
            for Mstart, Mstop, Nstart, Nstop, block in desc1.my_blocks(B_mn):
                if Mstart > last_Mstart and last_Mstart >= 0:
                    print()
                print('[%3d:%3d, %3d:%3d]' % (Mstart, Mstop, Nstart, Nstop),
                      end=' ')
                last_Mstart = Mstart
                assert (block == comm.rank).all()
            #print block
            #print
            print()
            print()
        comm.barrier()

    redistributor = Redistributor(comm, desc1, desc0)
    redistributor.redistribute(B_mn, A_mn)
    if comm.rank == 0:
        msg = 'Rank where each element of the global matrix is stored'
        print(msg)
        print('-' * len(msg))
        print(A_mn)
def test(comm, M, N, mcpus, ncpus, mb, nb):
    """Print how an M-by-N matrix is block-cyclically distributed.

    Same demo as the sibling version above, ported from Python 2 print
    statements (a syntax error on Python 3) to the print() function.
    """
    grid0 = BlacsGrid(comm, 1, 1)
    desc0 = grid0.new_descriptor(M, N, M, N, 0, 0)
    A_mn = desc0.zeros(dtype=float)
    A_mn[:] = comm.size + 1  # sentinel value: never a valid rank id

    grid1 = BlacsGrid(comm, mcpus, ncpus)
    desc1 = grid1.new_descriptor(M, N, mb, nb, 0, 0)  # ???
    B_mn = desc1.zeros(dtype=float)
    B_mn[:] = comm.rank

    if comm.rank == 0:
        msg = 'Slices of global matrix indices by rank'
        print(msg)
        print('-' * len(msg))

    # Ranks take turns printing their local blocks, in rank order.
    for rank in range(comm.size):
        comm.barrier()
        if rank == comm.rank:
            print('Rank %d:' % rank)
            last_Mstart = -1
            for Mstart, Mstop, Nstart, Nstop, block in desc1.my_blocks(B_mn):
                if Mstart > last_Mstart and last_Mstart >= 0:
                    print()
                # end=' ' replaces the Python 2 trailing-comma print.
                print('[%3d:%3d, %3d:%3d]' % (Mstart, Mstop, Nstart, Nstop),
                      end=' ')
                last_Mstart = Mstart
                assert (block == comm.rank).all()
            print()
            print()
        comm.barrier()

    redistributor = Redistributor(comm, desc1, desc0)
    redistributor.redistribute(B_mn, A_mn)
    if comm.rank == 0:
        msg = 'Rank where each element of the global matrix is stored'
        print(msg)
        print('-' * len(msg))
        print(A_mn)
def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
    """Calculate density matrix from occupations and coefficients.

    Presently this function performs the usual scalapack 3-step trick:
    redistribute-numbercrunching-backdistribute.


    Notes on future performance improvement.

    As per the current framework, C_nM exists as copies on each
    domain, i.e. this is not parallel over domains.  We'd like to
    correct this and have an efficient distribution using e.g. the
    block communicator.

    The diagonalization routine and other parts of the code should
    however be changed to accommodate the following scheme:

    Keep coefficients in C_mm form after the diagonalization.
    rho_mm can then be directly calculated from C_mm without
    redistribution, after which we only need to redistribute
    rho_mm across domains.

    """
    dtype = C_nM.dtype
    # Number-crunching on the blocked layout...
    rho_mm = self.calculate_blocked_density_matrix(f_n, C_nM)
    # ...then back-distribute to the column layout on the grid masters.
    rback = Redistributor(self.block_comm, self.mmdescriptor,
                          self.mM_unique_descriptor)
    rho1_mM = self.mM_unique_descriptor.zeros(dtype=dtype)
    rback.redistribute(rho_mm, rho1_mM)
    del rho_mm

    if rho_mM is None:
        if self.gd.rank == 0:
            rho_mM = rho1_mM
        else:
            # Receive buffer for the broadcast below.
            rho_mM = self.mMdescriptor.zeros(dtype=dtype)

    self.gd.comm.broadcast(rho_mM, 0)
    return rho_mM
def collect_wuMM(wfs, a_wuMM, w, s, k):
    """Collect the (w, u) matrix for spin s and k-point k on the master.

    Returns the full NM-by-NM array on the global master rank and None
    on all other ranks.
    """
    # This function is based on
    # gpaw/wavefunctions/base.py: WaveFunctions.collect_auxiliary()

    dtype = a_wuMM[0][0].dtype

    ksl = wfs.ksl
    NM = ksl.nao

    kpt_rank, u = wfs.kd.get_rank_and_index(s, k)
    ksl_comm = ksl.block_comm

    if wfs.kd.comm.rank == kpt_rank:
        a_MM = a_wuMM[w][u]

        # Collect within blacs grid
        if ksl.using_blacs:
            a_mm = a_MM
            grid = BlacsGrid(ksl_comm, 1, 1)
            MM_descriptor = grid.new_descriptor(NM, NM, NM, NM)
            mm2MM = Redistributor(ksl_comm, ksl.mmdescriptor, MM_descriptor)

            a_MM = MM_descriptor.empty(dtype=dtype)
            mm2MM.redistribute(a_mm, a_MM)

        # KSL master send a_MM to the global master
        if ksl_comm.rank == 0:
            if kpt_rank == 0:
                assert wfs.world.rank == 0
                # I have it already
                return a_MM
            else:
                # 2017 is an arbitrary message tag matched by the
                # receive below.
                wfs.kd.comm.send(a_MM, 0, 2017)
                return None
    elif ksl_comm.rank == 0 and kpt_rank != 0:
        assert wfs.world.rank == 0
        a_MM = np.empty((NM, NM), dtype=dtype)
        wfs.kd.comm.receive(a_MM, kpt_rank, 2017)
        return a_MM
def distribute_frequencies(self, chi0_wGG):
    """Distribute frequencies to all cores."""
    world = self.world
    comm = self.blockcomm

    if world.size == 1:
        return chi0_wGG

    nw = len(self.omega_w)
    nG = chi0_wGG.shape[2]
    # Per-rank chunk sizes (ceil division).
    mynw = (nw + world.size - 1) // world.size
    mynG = (nG + comm.size - 1) // comm.size

    # This rank's frequency window [wa, wb).
    wa = min(world.rank * mynw, nw)
    wb = min(wa + mynw, nw)

    if self.blockcomm.size == 1:
        return chi0_wGG[wa:wb].copy()

    if self.kncomm.rank == 0:
        bg1 = BlacsGrid(comm, 1, comm.size)
        in_wGG = chi0_wGG.reshape((nw, -1))
    else:
        # Ranks without data take part with an empty dummy grid/array.
        bg1 = DryRunBlacsGrid(mpi.serial_comm, 1, 1)
        in_wGG = np.zeros((0, 0), complex)
    md1 = BlacsDescriptor(bg1, nw, nG**2, nw, mynG * nG)

    bg2 = BlacsGrid(world, world.size, 1)
    md2 = BlacsDescriptor(bg2, nw, nG**2, mynw, nG**2)

    r = Redistributor(world, md1, md2)

    shape = (wb - wa, nG, nG)
    out_wGG = np.empty(shape, complex)
    r.redistribute(in_wGG, out_wGG.reshape((wb - wa, nG**2)))

    return out_wGG
class LrTDDFTLayouts:
    """BLACS layout for distributed Omega matrix in linear response
       time-dependent DFT calculations"""

    def __init__(self, sl_lrtddft, nkq, dd_comm, eh_comm):
        mcpus, ncpus, blocksize = tuple(sl_lrtddft)
        self.world = eh_comm.parent
        self.dd_comm = dd_comm
        if self.world is None:
            self.world = self.dd_comm

        # All the ranks within domain communicator contain the omega matrix
        # construct new communicator only on domain masters
        eh_ranks = np.arange(eh_comm.size) * dd_comm.size
        self.eh_comm2 = self.world.new_communicator(eh_ranks)

        self.eh_grid = BlacsGrid(self.eh_comm2, eh_comm.size, 1)
        self.eh_descr = self.eh_grid.new_descriptor(nkq, nkq, 1, nkq)
        self.diag_grid = BlacsGrid(self.world, mcpus, ncpus)
        self.diag_descr = self.diag_grid.new_descriptor(nkq, nkq,
                                                        blocksize,
                                                        blocksize)
        self.redistributor_in = Redistributor(self.world,
                                              self.eh_descr,
                                              self.diag_descr)
        self.redistributor_out = Redistributor(self.world,
                                               self.diag_descr,
                                               self.eh_descr)
        # NOTE(review): the triple-quoted block below is an old,
        # commented-out variant kept as a string literal -- not a
        # docstring and never executed.
        """
        # -----------------------------------------------------------------
        # for SCALAPACK we need TRANSPOSED MATRIX (and vector)
        # -----------------------------------------------------------------
        # M = rows, N = cols
        M = nkq*4; N = nkq*4; mb = nkq*4; nb = 4; Nrhs = 1
        # Matrix, mp=1, np=eh_comm.size
        self.eh_grid2a = BlacsGrid(self.eh_comm2, eh_comm.size, 1)
        # Vector, mp=eh_comm.size, np=1
        self.eh_grid2b = BlacsGrid(self.eh_comm2, 1, eh_comm.size)
        self.eh_descr2a = self.eh_grid2a.new_descriptor(N, M, nb, mb)
        self.eh_descr2b = self.eh_grid2b.new_descriptor(Nrhs, N, 1, nb)
        self.solve_descr2a =self.diag_grid.new_descriptor(N, M,
                                                          blocksize,
                                                          blocksize)
        self.solve_descr2b =self.diag_grid.new_descriptor(Nrhs, N,
                                                          1, blocksize)
        self.redist_solve_in_2a = Redistributor(self.world,
                                                self.eh_descr2a,
                                                self.solve_descr2a)
        self.redist_solve_in_2b = Redistributor(self.world,
                                                self.eh_descr2b,
                                                self.solve_descr2b)
        self.redist_solve_out_2a = Redistributor(self.world,
                                                 self.solve_descr2a,
                                                 self.eh_descr2a)
        self.redist_solve_out_2b = Redistributor(self.world,
                                                 self.solve_descr2b,
                                                 self.eh_descr2b)
        """
        # -----------------------------------------------------------------
        # for SCALAPACK we need TRANSPOSED MATRIX (and vector)
        # -----------------------------------------------------------------
        # M = rows, N = cols
        M = nkq * 4
        N = nkq * 4
        mb = 4
        nb = 4
        Nrhs = 1
        # Matrix, mp=1, np=eh_comm.size
        self.eh_grid2a = BlacsGrid(self.world, dd_comm.size, eh_comm.size)
        # Vector, mp=eh_comm.size, np=1
        self.eh_grid2b = BlacsGrid(self.world, 1,
                                   dd_comm.size * eh_comm.size)
        self.eh_descr2a = self.eh_grid2a.new_descriptor(N, M, nb, mb)
        self.eh_descr2b = self.eh_grid2b.new_descriptor(Nrhs, N, Nrhs, nb)
        self.solve_descr2a = self.diag_grid.new_descriptor(
            N, M, blocksize, blocksize)
        self.solve_descr2b = self.diag_grid.new_descriptor(
            Nrhs, N, Nrhs, blocksize)
        self.redist_solve_in_2a = Redistributor(self.world,
                                                self.eh_descr2a,
                                                self.solve_descr2a)
        self.redist_solve_in_2b = Redistributor(self.world,
                                                self.eh_descr2b,
                                                self.solve_descr2b)
        self.redist_solve_out_2a = Redistributor(self.world,
                                                 self.solve_descr2a,
                                                 self.eh_descr2a)
        self.redist_solve_out_2b = Redistributor(self.world,
                                                 self.solve_descr2b,
                                                 self.eh_descr2b)

    def solve(self, A, b):
        # Solve the (transposed) linear system A x = b with ScaLAPACK:
        # scatter A and b to the solver layout, solve, gather b back.
        A_nn = self.solve_descr2a.empty(dtype=float)
        if self.eh_descr2a.blacsgrid.is_active():
            A_Nn = A
        else:
            # Inactive ranks contribute empty dummy arrays.
            A_Nn = np.empty((0, 0), dtype=float)
        self.redist_solve_in_2a.redistribute(A_Nn, A_nn)

        b_n = self.solve_descr2b.empty(dtype=float)
        if self.eh_descr2b.blacsgrid.is_active():
            # ScaLAPACK wants the right-hand side as a row vector.
            b_N = b.reshape(1, len(b))
        else:
            b_N = np.empty((A_Nn.shape[0], 0), dtype=float)
        self.redist_solve_in_2b.redistribute(b_N, b_n)

        info = 0
        if self.solve_descr2a.blacsgrid.is_active():
            # NOTE(review): scalapack_solve's status is not captured
            # here, so info stays 0 and the check below never fires --
            # confirm whether _gpaw.scalapack_solve returns a status.
            _gpaw.scalapack_solve(A_nn, self.solve_descr2a.asarray(),
                                  b_n, self.solve_descr2b.asarray())
        if info != 0:
            raise RuntimeError('scalapack_solve error: %d' % info)

        # Gather the solution back to the e-h layout and write it into b.
        self.redist_solve_out_2b.redistribute(b_n, b_N)
        if self.eh_descr2b.blacsgrid.is_active():
            b_N = b_N.flatten()
        else:
            b_N = b
        #self.dd_comm.broadcast(b_N, 0)
        b[:] = b_N

    def diagonalize(self, Om, eps_n):
        # Diagonalize the Omega matrix: scatter to the diagonalization
        # layout, run the divide-and-conquer solver, gather back and
        # broadcast results.
        O_nn = self.diag_descr.empty(dtype=float)
        if self.eh_descr.blacsgrid.is_active():
            O_nN = Om
        else:
            O_nN = np.empty((0, 0), dtype=float)
        self.redistributor_in.redistribute(O_nN, O_nn)
        # Input is copied so O_nn can receive the eigenvectors in place.
        self.diag_descr.diagonalize_dc(O_nn.copy(), O_nn, eps_n, 'L')
        self.redistributor_out.redistribute(O_nn, O_nN)
        self.world.broadcast(eps_n, 0)
        # Broadcast eigenvectors within domains
        if not self.eh_descr.blacsgrid.is_active():
            O_nN = Om
        self.dd_comm.broadcast(O_nN, 0)
def diagonalize_full_hamiltonian(self, ham, atoms, occupations, txt,
                                 nbands=None, scalapack=None):
    """Diagonalize the full PW Hamiltonian for every k-point.

    scalapack, if given, is a (nprow, npcol, blocksize) tuple selecting
    a distributed ScaLAPACK diagonalization; otherwise a serial
    MatrixDescriptor is used.
    """
    if nbands is None:
        nbands = self.pd.ngmin
    assert nbands <= self.pd.ngmin

    self.bd = bd = BandDescriptor(nbands, self.bd.comm)

    if scalapack:
        nprow, npcol, b = scalapack
        bg = BlacsGrid(bd.comm, bd.comm.size, 1)
        bg2 = BlacsGrid(bd.comm, nprow, npcol)
    else:
        nprow = npcol = 1

    assert bd.comm.size == nprow * npcol

    self.pt.set_positions(atoms.get_scaled_positions())
    self.kpt_u[0].P_ani = None
    self.allocate_arrays_for_projections(self.pt.my_atom_indices)

    myslice = bd.get_slice()

    for kpt in self.kpt_u:
        npw = len(self.pd.Q_qG[kpt.q])
        if scalapack:
            mynpw = -(-npw // bd.comm.size)  # ceil division
            md = BlacsDescriptor(bg, npw, npw, mynpw, npw)
            md2 = BlacsDescriptor(bg2, npw, npw, b, b)
        else:
            md = md2 = MatrixDescriptor(npw, npw)

        H_GG, S_GG = self.hs(ham, kpt.q, kpt.s, md)

        if scalapack:
            # Move H and S to the 2D block-cyclic layout.
            r = Redistributor(bd.comm, md, md2)
            H_GG = r.redistribute(H_GG)
            S_GG = r.redistribute(S_GG)

        psit_nG = md2.empty(dtype=complex)
        eps_n = np.empty(npw)
        md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n)
        del H_GG, S_GG

        kpt.eps_n = eps_n[myslice].copy()

        if scalapack:
            # Redistribute eigenvectors to row stripes so each rank can
            # slice out its own bands.
            md3 = BlacsDescriptor(bg, npw, npw, bd.mynbands, npw)
            r = Redistributor(bd.comm, md2, md3)
            psit_nG = r.redistribute(psit_nG)

        kpt.psit_nG = psit_nG[:bd.mynbands].copy()
        del psit_nG

        self.pt.integrate(kpt.psit_nG, kpt.P_ani, kpt.q)
        #f_n = np.zeros_like(kpt.eps_n)
        #f_n[:len(kpt.f_n)] = kpt.f_n
        kpt.f_n = None

    occupations.calculate(self)
def diagonalize_full_hamiltonian(self, ham, atoms, occupations, log,
                                 nbands=None, ecut=None, scalapack=None,
                                 expert=False):
    """Diagonalize the full PW Hamiltonian for every k-point.

    nbands/ecut select how many bands to compute; scalapack may be a
    (nprow, npcol, blocksize) tuple (or any truthy value to auto-pick a
    grid) when the band communicator has more than one rank; expert
    limits the serial solver to the lowest nbands eigenpairs.
    Returns the number of bands actually computed.
    """
    if self.dtype != complex:
        raise ValueError('Your wavefunctions are not complex as '
                         'required by the PW diagonalization routine.\n'
                         'Please supply GPAW(..., dtype=complex, ...) '
                         'as an argument to the calculator to enforce '
                         'complex wavefunctions.')

    if nbands is None and ecut is None:
        nbands = self.pd.ngmin // self.bd.comm.size * self.bd.comm.size
    elif nbands is None:
        # Estimate the band count from the plane-wave cutoff.
        ecut /= units.Hartree
        vol = abs(np.linalg.det(self.gd.cell_cv))
        nbands = int(vol * ecut**1.5 * 2**0.5 / 3 / pi**2)
    else:
        assert nbands <= self.pd.ngmin

    if expert:
        iu = nbands  # upper eigenpair index for the serial solver
    else:
        iu = None

    self.bd = bd = BandDescriptor(nbands, self.bd.comm)

    log('Diagonalizing full Hamiltonian ({0} lowest bands)'.format(nbands))
    log('Matrix size (min, max): {0}, {1}'.format(self.pd.ngmin,
                                                  self.pd.ngmax))
    mem = 3 * self.pd.ngmax**2 * 16 / bd.comm.size / 1024**2
    log('Approximate memory usage per core: {0:.3f} MB'.format(mem))
    if bd.comm.size > 1:
        if isinstance(scalapack, (list, tuple)):
            nprow, npcol, b = scalapack
        else:
            # Pick the most square grid dividing the communicator size.
            nprow = int(round(bd.comm.size**0.5))
            while bd.comm.size % nprow != 0:
                nprow -= 1
            npcol = bd.comm.size // nprow
            b = 64
        log('ScaLapack grid: {0}x{1},'.format(nprow, npcol),
            'block-size:', b)
        bg = BlacsGrid(bd.comm, bd.comm.size, 1)
        bg2 = BlacsGrid(bd.comm, nprow, npcol)
        scalapack = True
    else:
        nprow = npcol = 1
        scalapack = False

    self.set_positions(atoms.get_scaled_positions())
    self.kpt_u[0].P_ani = None
    self.allocate_arrays_for_projections(self.pt.my_atom_indices)

    myslice = bd.get_slice()

    pb = ProgressBar(log.fd)
    nkpt = len(self.kpt_u)

    for u, kpt in enumerate(self.kpt_u):
        pb.update(u / nkpt)
        npw = len(self.pd.Q_qG[kpt.q])
        if scalapack:
            mynpw = -(-npw // bd.comm.size)  # ceil division
            md = BlacsDescriptor(bg, npw, npw, mynpw, npw)
            md2 = BlacsDescriptor(bg2, npw, npw, b, b)
        else:
            md = md2 = MatrixDescriptor(npw, npw)

        with self.timer('Build H and S'):
            H_GG, S_GG = self.hs(ham, kpt.q, kpt.s, md)

        if scalapack:
            # Move H and S to the 2D block-cyclic layout.
            r = Redistributor(bd.comm, md, md2)
            H_GG = r.redistribute(H_GG)
            S_GG = r.redistribute(S_GG)

        psit_nG = md2.empty(dtype=complex)
        eps_n = np.empty(npw)

        with self.timer('Diagonalize'):
            if not scalapack:
                md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n,
                                           iu=iu)
            else:
                md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n)
        del H_GG, S_GG

        kpt.eps_n = eps_n[myslice].copy()

        if scalapack:
            # Redistribute eigenvectors to row stripes so each rank can
            # slice out its own bands.
            md3 = BlacsDescriptor(bg, npw, npw, bd.mynbands, npw)
            r = Redistributor(bd.comm, md2, md3)
            psit_nG = r.redistribute(psit_nG)

        kpt.psit_nG = psit_nG[:bd.mynbands].copy()
        del psit_nG

        with self.timer('Projections'):
            self.pt.integrate(kpt.psit_nG, kpt.P_ani, kpt.q)

        kpt.f_n = None

    pb.finish()

    occupations.calculate(self)

    return nbands
class BlacsOrbitalLayouts(BlacsLayouts):
    """ScaLAPACK Dense Linear Algebra.

    This class is instantiated in LCAO.  Not for casual use, at least for now.

    Requires two distributors and three descriptors for initialization
    as well as grid descriptors and band descriptors. Distributors are
    for cols2blocks (1D -> 2D BLACS grid) and blocks2cols (2D -> 1D
    BLACS grid). ScaLAPACK operations must occur on 2D BLACS grid for
    performance and scalability.

    _general_diagonalize is "hard-coded" for LCAO.
    Expects both Hamiltonian and Overlap matrix to be on the 2D BLACS grid.
    This is done early on to save memory.
    """
    # XXX rewrite this docstring a bit!

    # This class 'describes' all the LCAO Blacs-related layouts
    def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus, blocksize,
                 nao, timer=nulltimer):
        BlacsLayouts.__init__(self, gd, bd, block_comm, dtype, mcpus,
                              ncpus, blocksize, timer)
        nbands = bd.nbands
        self.blocksize = blocksize
        self.mynbands = mynbands = bd.mynbands
        self.orbital_comm = self.bd.comm
        # Basis functions per band rank (ceil division).
        self.naoblocksize = naoblocksize = -((-nao) // self.orbital_comm.size)
        self.nao = nao

        # Range of basis functions for BLACS distribution of matrices:
        self.Mmax = nao
        self.Mstart = bd.comm.rank * naoblocksize
        self.Mstop = min(self.Mstart + naoblocksize, self.Mmax)
        self.mynao = self.Mstop - self.Mstart

        # Column layout for one matrix per band rank:
        self.columngrid = BlacsGrid(bd.comm, bd.comm.size, 1)
        self.mMdescriptor = self.columngrid.new_descriptor(nao, nao,
                                                           naoblocksize,
                                                           nao)
        self.nMdescriptor = self.columngrid.new_descriptor(nbands, nao,
                                                           mynbands, nao)
        #parallelprint(world, (mynao, self.mMdescriptor.shape))

        # Column layout for one matrix in total (only on grid masters):
        self.single_column_grid = BlacsGrid(self.column_comm,
                                            bd.comm.size, 1)
        self.mM_unique_descriptor = self.single_column_grid.new_descriptor(
            nao, nao, naoblocksize, nao)

        # nM_unique_descriptor is meant to hold the coefficients after
        # diagonalization.  BLACS requires it to be nao-by-nao, but
        # we only fill meaningful data into the first nbands columns.
        #
        # The array will then be trimmed and broadcast across
        # the grid descriptor's communicator.
        self.nM_unique_descriptor = self.single_column_grid.new_descriptor(
            nbands, nao, mynbands, nao)

        # Fully blocked grid for diagonalization with many CPUs:
        self.mmdescriptor = self.blockgrid.new_descriptor(nao, nao,
                                                          blocksize,
                                                          blocksize)

        #self.nMdescriptor = nMdescriptor
        self.mM2mm = Redistributor(self.block_comm,
                                   self.mM_unique_descriptor,
                                   self.mmdescriptor)
        self.mm2nM = Redistributor(self.block_comm,
                                   self.mmdescriptor,
                                   self.nM_unique_descriptor)

    def diagonalize(self, H_mm, C_nM, eps_n, S_mm):
        # Generalized eigenproblem H C = eps S C on the 2D BLACS grid;
        # results are written into C_nM and eps_n.
        # C_nM needs to be simultaneously compatible with:
        # 1. outdescriptor
        # 2. broadcast with gd.comm
        # We will does this with a dummy buffer C2_nM
        outdescriptor = self.mm2nM.dstdescriptor  # blocks2cols
        blockdescriptor = self.mM2mm.dstdescriptor  # cols2blocks

        dtype = S_mm.dtype
        eps_M = np.empty(C_nM.shape[-1])  # empty helps us debug
        subM, subN = outdescriptor.gshape

        C_mm = blockdescriptor.zeros(dtype=dtype)
        self.timer.start('General diagonalize')
        # general_diagonalize_ex may have a buffer overflow, so
        # we no longer use it
        #blockdescriptor.general_diagonalize_ex(H_mm, S_mm.copy(), C_mm,
        #                                       eps_M, UL='L',
        #                                       iu=self.bd.nbands)
        blockdescriptor.general_diagonalize_dc(H_mm, S_mm.copy(), C_mm,
                                               eps_M, UL='L')
        self.timer.stop('General diagonalize')

        # Make C_nM compatible with the redistributor
        self.timer.start('Redistribute coefs')
        if outdescriptor:
            C2_nM = C_nM
        else:
            C2_nM = outdescriptor.empty(dtype=dtype)
        assert outdescriptor.check(C2_nM)
        self.mm2nM.redistribute(C_mm, C2_nM, subM, subN)  # blocks2cols
        self.timer.stop('Redistribute coefs')

        self.timer.start('Send coefs to domains')
        # eps_M is already on block_comm.rank = 0
        # easier to broadcast eps_M to all and
        # get the correct slice afterward.
        self.block_comm.broadcast(eps_M, 0)
        eps_n[:] = eps_M[self.bd.get_slice()]
        self.gd.comm.broadcast(C_nM, 0)
        self.timer.stop('Send coefs to domains')

    def distribute_overlap_matrix(self, S_qmM, root=0,
                                  add_hermitian_conjugate=False):
        """Sum S over domains and redistribute to the 2D block layout."""
        # Some MPI implementations need a lot of memory to do large
        # reductions.  To avoid trouble, we do comm.sum on smaller blocks
        # of S (this code is also safe for arrays smaller than blocksize)
        Sflat_x = S_qmM.ravel()
        blocksize = 2**23 // Sflat_x.itemsize  # 8 MiB
        nblocks = -(-len(Sflat_x) // blocksize)
        Mstart = 0
        self.timer.start('blocked summation')
        for i in range(nblocks):
            self.gd.comm.sum(Sflat_x[Mstart:Mstart + blocksize], root=root)
            Mstart += blocksize
        assert Mstart + blocksize >= len(Sflat_x)
        self.timer.stop('blocked summation')

        xshape = S_qmM.shape[:-2]
        nm, nM = S_qmM.shape[-2:]
        S_qmM = S_qmM.reshape(-1, nm, nM)

        blockdesc = self.mmdescriptor
        coldesc = self.mM_unique_descriptor
        S_qmm = blockdesc.zeros(len(S_qmM), S_qmM.dtype)

        if not coldesc:  # XXX ugly way to sort out inactive ranks
            S_qmM = coldesc.zeros(len(S_qmM), S_qmM.dtype)

        self.timer.start('Scalapack redistribute')
        for S_mM, S_mm in zip(S_qmM, S_qmm):
            self.mM2mm.redistribute(S_mM, S_mm)
            if add_hermitian_conjugate:
                if blockdesc.active:
                    pblas_tran(1.0, S_mm.copy(), 1.0, S_mm,
                               blockdesc, blockdesc)
        self.timer.stop('Scalapack redistribute')

        return S_qmm.reshape(xshape + blockdesc.shape)

    def get_overlap_matrix_shape(self):
        # Local shape of the block-cyclic overlap matrix.
        return self.mmdescriptor.shape

    def calculate_blocked_density_matrix(self, f_n, C_nM):
        """Compute the density matrix on the 2D block-cyclic layout."""
        nbands = self.bd.nbands
        nao = self.nao
        dtype = C_nM.dtype

        self.nMdescriptor.checkassert(C_nM)
        if self.gd.rank == 0:
            # Occupation-weighted, conjugated coefficients.
            Cf_nM = (C_nM * f_n[:, None]).conj()
        else:
            # Off-master ranks supply dummy source arrays.
            C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
            Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

        r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                          self.mmdescriptor)

        Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(Cf_nM, Cf_mm, nbands, nao)
        del Cf_nM

        C_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(C_nM, C_mm, nbands, nao)
        # no use to delete C_nM as it's in the input...

        rho_mm = self.mmdescriptor.zeros(dtype=dtype)
        # rho = (C f)^T . C as a distributed PBLAS gemm.
        pblas_simple_gemm(self.mmdescriptor, self.mmdescriptor,
                          self.mmdescriptor, Cf_mm, C_mm, rho_mm,
                          transa='T')
        return rho_mm

    def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        """Calculate density matrix from occupations and coefficients.

        Presently this function performs the usual scalapack 3-step trick:
        redistribute-numbercrunching-backdistribute.


        Notes on future performance improvement.

        As per the current framework, C_nM exists as copies on each
        domain, i.e. this is not parallel over domains.  We'd like to
        correct this and have an efficient distribution using e.g. the
        block communicator.

        The diagonalization routine and other parts of the code should
        however be changed to accommodate the following scheme:

        Keep coefficients in C_mm form after the diagonalization.
        rho_mm can then be directly calculated from C_mm without
        redistribution, after which we only need to redistribute
        rho_mm across domains.

        """
        dtype = C_nM.dtype
        rho_mm = self.calculate_blocked_density_matrix(f_n, C_nM)
        rback = Redistributor(self.block_comm, self.mmdescriptor,
                              self.mM_unique_descriptor)
        rho1_mM = self.mM_unique_descriptor.zeros(dtype=dtype)
        rback.redistribute(rho_mm, rho1_mM)
        del rho_mm

        if rho_mM is None:
            if self.gd.rank == 0:
                rho_mM = rho1_mM
            else:
                # Receive buffer for the broadcast below.
                rho_mM = self.mMdescriptor.zeros(dtype=dtype)

        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def distribute_to_columns(self, rho_mm, srcdescriptor):
        """Redistribute rho_mm to the column layout and broadcast it."""
        redistributor = Redistributor(
            self.block_comm,  # XXX
            srcdescriptor,
            self.mM_unique_descriptor)
        rho_mM = redistributor.redistribute(rho_mm)
        if self.gd.rank != 0:
            rho_mM = self.mMdescriptor.zeros(dtype=rho_mm.dtype)
        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def oldcalculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        # This version is parallel over the band descriptor only.
        # This is inefficient, but let's keep it for a while in case
        # there's trouble with the more efficient version
        if rho_mM is None:
            rho_mM = self.mMdescriptor.zeros(dtype=C_nM.dtype)

        Cf_nM = (C_nM * f_n[:, None]).conj()
        pblas_simple_gemm(self.nMdescriptor, self.nMdescriptor,
                          self.mMdescriptor, Cf_nM, C_nM, rho_mM,
                          transa='T')
        return rho_mM

    def get_transposed_density_matrix(self, f_n, C_nM, rho_mM=None):
        # Conjugate of the density matrix (layout is symmetric).
        return self.calculate_density_matrix(f_n, C_nM, rho_mM).conj()

    def get_description(self):
        (title, template) = BlacsLayouts.get_description(self)
        bg = self.blockgrid
        desc = self.mmdescriptor
        s = template % (bg.nprow, bg.npcol, desc.mb, desc.nb)
        return ' '.join([title, s])
# Exercise Redistributor on a submatrix copy between two differently
# blocked descriptors and verify the copied window on the master.
grid = BlacsGrid(world, 2, world.size // 2)

desc = grid.new_descriptor(12, 8, 2, 3)
a = desc.zeros()
a[:] = world.rank

subdesc = grid.new_descriptor(7, 7, 2, 2)
b = subdesc.zeros()

r = Redistributor(grid.comm, desc, subdesc, uplo="G")

# Copy the M-by-N window a[ia:ia+M, ja:ja+N] -> b[ib:ib+M, jb:jb+N].
ia = 3
ja = 2
ib = 1
jb = 1
M = 4
N = 5
r.redistribute(a, b, M, N, ia, ja, ib, jb)

a0 = desc.collect_on_master(a)
b0 = subdesc.collect_on_master(b)

if world.rank == 0:
    # Python 3 print() calls; the original used Python 2 print
    # statements, which are a syntax error on Python 3.
    print(a0)
    print(b0)
    xa = a0[ia:ia + M, ja:ja + N]
    xb = b0[ib:ib + M, jb:jb + N]
    assert (xa == xb).all()
    def calculate_rkernel(self):
        """Compute the real-space rALDA-type Hartree-exchange kernel.

        Builds f_rr / V_rr on the real-space grid (summed over lattice
        points with weights), Fourier transforms over r and r' for each
        q-point, and writes fhxc_sGsG to a .gpw file per q.
        Parallelized over grid points, then over plane waves.
        """
        gd = self.gd
        ng_c = gd.N_c
        cell_cv = gd.cell_cv
        icell_cv = 2 * np.pi * np.linalg.inv(cell_cv)
        vol = np.linalg.det(cell_cv)

        ns = self.calc.wfs.nspins
        n_g = self.n_g  # density on rough grid

        fx_g = ns * self.get_fxc_g(n_g)  # local exchange kernel
        qc_g = (-4 * np.pi * ns / fx_g)**0.5  # cutoff functional
        flocal_g = qc_g**3 * fx_g / (6 * np.pi**2)  # ren. x-kernel for r=r'
        Vlocal_g = 2 * qc_g / np.pi  # ren. Hartree kernel for r=r'

        ng = np.prod(ng_c)  # number of grid points
        r_vg = gd.get_grid_point_coordinates()
        rx_g = r_vg[0].flatten()
        ry_g = r_vg[1].flatten()
        rz_g = r_vg[2].flatten()

        prnt(' %d grid points and %d plane waves at the Gamma point' %
             (ng, self.pd.ngmax), file=self.fd)

        # Unit cells
        R_Rv = []
        weight_R = []
        nR_v = self.unit_cells
        nR = np.prod(nR_v)
        for i in range(-nR_v[0] + 1, nR_v[0]):
            for j in range(-nR_v[1] + 1, nR_v[1]):
                for h in range(-nR_v[2] + 1, nR_v[2]):
                    R_Rv.append(i * cell_cv[0] +
                                j * cell_cv[1] +
                                h * cell_cv[2])
                    weight_R.append((nR_v[0] - abs(i)) *
                                    (nR_v[1] - abs(j)) *
                                    (nR_v[2] - abs(h)) / float(nR))
        if nR > 1:
            # with more than one unit cell only the exchange kernel is
            # calculated on the grid. The bare Coulomb kernel is added
            # in PW basis and Vlocal_g only the exchange part
            dv = self.calc.density.gd.dv
            gc = (3 * dv / 4 / np.pi)**(1 / 3.)
            Vlocal_g -= 2 * np.pi * gc**2 / dv
            prnt(' Lattice point sampling: ' +
                 '(%s x %s x %s)^2 ' % (nR_v[0], nR_v[1], nR_v[2]) +
                 ' Reduced to %s lattice points' % len(R_Rv), file=self.fd)

        # Distribute grid points over ranks (ceil division for the size)
        l_g_size = -(-ng // mpi.world.size)
        l_g_range = range(mpi.world.rank * l_g_size,
                          min((mpi.world.rank + 1) * l_g_size, ng))

        fhxc_qsGr = {}
        for iq in range(len(self.ibzq_qc)):
            fhxc_qsGr[iq] = np.zeros((ns, len(self.pd.G2_qG[iq]),
                                      len(l_g_range)), dtype=complex)

        # Silence 1/0 and 0/0 warnings inside the loop; restored afterwards
        inv_error = np.seterr()
        np.seterr(invalid='ignore')
        np.seterr(divide='ignore')

        t0 = time()
        # Loop over Lattice points
        for i, R_v in enumerate(R_Rv):
            # Loop over r'. f_rr and V_rr are functions of r (dim. as r_vg[0])
            if i == 1:
                prnt(' Finished 1 cell in %s seconds' % int(time() - t0) +
                     ' - estimated %s seconds left' %
                     int((len(R_Rv) - 1) * (time() - t0)),
                     file=self.fd)
                self.fd.flush()
            if len(R_Rv) > 5:
                if (i + 1) % (len(R_Rv) / 5 + 1) == 0:
                    prnt(' Finished %s cells in %s seconds' %
                         (i, int(time() - t0)) +
                         ' - estimated %s seconds left' %
                         int((len(R_Rv) - i) * (time() - t0) / i),
                         file=self.fd)
                    self.fd.flush()
            for g in l_g_range:
                rx = rx_g[g] + R_v[0]
                ry = ry_g[g] + R_v[1]
                rz = rz_g[g] + R_v[2]

                # |r-r'-R_i|
                rr = ((r_vg[0] - rx)**2 +
                      (r_vg[1] - ry)**2 +
                      (r_vg[2] - rz)**2)**0.5

                n_av = (n_g + n_g.flatten()[g]) / 2.
                fx_g = ns * self.get_fxc_g(n_av, index=g)
                qc_g = (-4 * np.pi * ns / fx_g)**0.5
                x = qc_g * rr
                osc_x = np.sin(x) - x * np.cos(x)
                f_rr = fx_g * osc_x / (2 * np.pi**2 * rr**3)
                if nR > 1:  # include only exchange part of the kernel here
                    V_rr = (sici(x)[0] * 2 / np.pi - 1) / rr
                else:  # include the full kernel (also hartree part)
                    V_rr = (sici(x)[0] * 2 / np.pi) / rr

                # Terms with r = r'
                if (np.abs(R_v) < 0.001).all():
                    tmp_flat = f_rr.flatten()
                    tmp_flat[g] = flocal_g.flatten()[g]
                    f_rr = tmp_flat.reshape(ng_c)
                    tmp_flat = V_rr.flatten()
                    tmp_flat[g] = Vlocal_g.flatten()[g]
                    V_rr = tmp_flat.reshape(ng_c)
                    del tmp_flat

                f_rr[np.where(n_av < self.density_cut)] = 0.0
                V_rr[np.where(n_av < self.density_cut)] = 0.0

                f_rr *= weight_R[i]
                V_rr *= weight_R[i]

                # r-r'-R_i
                r_r = np.array([r_vg[0] - rx, r_vg[1] - ry, r_vg[2] - rz])

                # Fourier transform of r
                for iq, q in enumerate(self.ibzq_qc):
                    q_v = np.dot(q, icell_cv)
                    e_q = np.exp(-1j * gemmdot(q_v, r_r, beta=0.0))
                    f_q = self.pd.fft((f_rr + V_rr) * e_q, iq) * vol / ng
                    fhxc_qsGr[iq][0, :, g - l_g_range[0]] += f_q
                    if ns == 2:
                        f_q = self.pd.fft(V_rr * e_q, iq) * vol / ng
                        fhxc_qsGr[iq][1, :, g - l_g_range[0]] += f_q

        mpi.world.barrier()

        np.seterr(**inv_error)

        for iq, q in enumerate(self.ibzq_qc):
            npw = len(self.pd.G2_qG[iq])
            fhxc_sGsG = np.zeros((ns * npw, ns * npw), complex)
            l_pw_size = -(-npw // mpi.world.size)  # parallelize over PW below
            l_pw_range = range(mpi.world.rank * l_pw_size,
                               min((mpi.world.rank + 1) * l_pw_size, npw))

            if mpi.world.size > 1:
                # redistribute grid and plane waves in fhxc_qsGr[iq]
                bg1 = BlacsGrid(mpi.world, 1, mpi.world.size)
                bg2 = BlacsGrid(mpi.world, mpi.world.size, 1)
                # NOTE(review): these use '/' not '//' — under Python 3
                # true division this yields a float block size; the l_g_size
                # above uses '//'.  Confirm and unify.
                bd1 = bg1.new_descriptor(npw, ng, npw,
                                         -(-ng / mpi.world.size))
                bd2 = bg2.new_descriptor(npw, ng,
                                         -(-npw / mpi.world.size), ng)

                fhxc_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)
                if ns == 2:
                    Koff_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)

                r = Redistributor(bg1.comm, bd1, bd2)
                r.redistribute(fhxc_qsGr[iq][0], fhxc_Glr, npw, ng)
                if ns == 2:
                    r.redistribute(fhxc_qsGr[iq][1], Koff_Glr, npw, ng)
            else:
                fhxc_Glr = fhxc_qsGr[iq][0]
                if ns == 2:
                    Koff_Glr = fhxc_qsGr[iq][1]

            # Fourier transform of r'
            for iG in range(len(l_pw_range)):
                f_g = fhxc_Glr[iG].reshape(ng_c)
                f_G = self.pd.fft(f_g.conj(), iq) * vol / ng
                fhxc_sGsG[l_pw_range[0] + iG, :npw] = f_G.conj()

                if ns == 2:
                    v_g = Koff_Glr[iG].reshape(ng_c)
                    v_G = self.pd.fft(v_g.conj(), iq) * vol / ng
                    fhxc_sGsG[npw + l_pw_range[0] + iG, :npw] = v_G.conj()

            if ns == 2:  # f_00 = f_11 and f_01 = f_10
                fhxc_sGsG[:npw, npw:] = fhxc_sGsG[npw:, :npw]
                fhxc_sGsG[npw:, npw:] = fhxc_sGsG[:npw, :npw]

            mpi.world.sum(fhxc_sGsG)
            fhxc_sGsG /= vol

            if mpi.rank == 0:
                w = Writer('fhxc_%s_%s_%s_%s.gpw' %
                           (self.tag, self.xc, self.ecut, iq))
                w.dimension('sG', ns * npw)
                w.add('fhxc_sGsG', ('sG', 'sG'), dtype=complex)
                if nR > 1:  # add Hartree kernel evaluated in PW basis
                    Gq2_G = self.pd.G2_qG[iq]
                    if (q == 0).all():
                        Gq2_G[0] = 1.
                    vq_G = 4 * np.pi / Gq2_G
                    fhxc_sGsG += np.tile(np.eye(npw) * vq_G, (ns, ns))
                w.fill(fhxc_sGsG)
                w.close()
            mpi.world.barrier()
        prnt(file=self.fd)
class ECNPropagator(LCAOPropagator):
    """Explicit Crank-Nicolson propagator for LCAO-TDDFT.

    Propagates C_nM by solving
    (S - 0.5j*H*dt) C(t+dt) = (S + 0.5j*H*dt) C(t),
    either serially or with ScaLAPACK (BLACS) when ksl.using_blacs.
    """

    def __init__(self):
        LCAOPropagator.__init__(self)

    def initialize(self, paw, hamiltonian=None):
        LCAOPropagator.initialize(self, paw)
        if hamiltonian is not None:
            self.hamiltonian = hamiltonian

        ksl = self.wfs.ksl
        self.blacs = ksl.using_blacs
        if self.blacs:
            from gpaw.blacs import Redistributor
            self.log('BLACS Parallelization')

            # Parallel grid descriptors
            grid = ksl.blockgrid
            assert grid.nprow * grid.npcol == ksl.block_comm.size
            self.mm_block_descriptor = ksl.mmdescriptor
            self.Cnm_block_descriptor = grid.new_descriptor(ksl.bd.nbands,
                                                            ksl.nao,
                                                            ksl.blocksize,
                                                            ksl.blocksize)
            self.CnM_unique_descriptor = ksl.nM_unique_descriptor

            # Redistributors between block-cyclic (nm) and unique (nM) forms
            self.Cnm2nM = Redistributor(ksl.block_comm,
                                        self.Cnm_block_descriptor,
                                        self.CnM_unique_descriptor)
            self.CnM2nm = Redistributor(ksl.block_comm,
                                        self.CnM_unique_descriptor,
                                        self.Cnm_block_descriptor)

            if debug:
                nao = ksl.nao
                self.MM_descriptor = grid.new_descriptor(nao, nao, nao, nao)
                self.mm2MM = Redistributor(ksl.block_comm,
                                           self.mm_block_descriptor,
                                           self.MM_descriptor)
                self.MM2mm = Redistributor(ksl.block_comm,
                                           self.MM_descriptor,
                                           self.mm_block_descriptor)

            # Keep only the lower triangle of S and T; upper parts are
            # reconstructed via pblas_tran in propagate_wfs.
            for kpt in self.wfs.kpt_u:
                scalapack_zero(self.mm_block_descriptor, kpt.S_MM, 'U')
                scalapack_zero(self.mm_block_descriptor, kpt.T_MM, 'U')

    def kick(self, hamiltonian, time):
        # Propagate: apply the kick potential in 10 small CN substeps.
        get_matrix = self.wfs.eigensolver.calculate_hamiltonian_matrix
        for kpt in self.wfs.kpt_u:
            Vkick_MM = get_matrix(hamiltonian, self.wfs, kpt,
                                  add_kinetic=False, root=-1)
            for i in range(10):
                self.propagate_wfs(kpt.C_nM, kpt.C_nM, kpt.S_MM, Vkick_MM,
                                   0.1)

        # Update Hamiltonian (and density)
        self.hamiltonian.update()

    def propagate(self, time, time_step):
        # One Crank-Nicolson step for every k-point, then refresh H/density.
        for kpt in self.wfs.kpt_u:
            H_MM = self.hamiltonian.get_hamiltonian_matrix(kpt)
            self.propagate_wfs(kpt.C_nM, kpt.C_nM, kpt.S_MM, H_MM, time_step)
        self.hamiltonian.update()
        return time + time_step

    def propagate_wfs(self, sourceC_nM, targetC_nM, S_MM, H_MM, dt):
        """Apply one CN step: target = (S-0.5j*H*dt)^-1 (S+0.5j*H*dt) source."""
        self.timer.start('Linear solve')

        if self.blacs:
            # XXX, Preallocate
            target_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex)
            temp_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex)
            temp_block_mm = self.mm_block_descriptor.empty(dtype=complex)
            if self.density.gd.comm.rank != 0:
                # XXX Fake blacs nbands, nao, nbands, nao grid because some
                # weird asserts
                # (these are 0,x or x,0 arrays)
                sourceC_nM = self.CnM_unique_descriptor.zeros(dtype=complex)

            # 1. target = (S+0.5j*H*dt) * source
            # Wave functions to target
            self.CnM2nm.redistribute(sourceC_nM, temp_blockC_nm)

            # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
            # Remove upper diagonal
            scalapack_zero(self.mm_block_descriptor, H_MM, 'U')
            # Lower diagonal matrix:
            temp_block_mm[:] = S_MM - (0.5j * dt) * H_MM
            scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U')
            # Note it's strictly lower diagonal matrix
            # Add transpose of H
            pblas_tran(-0.5j * dt, H_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor, self.mm_block_descriptor)
            # Add transpose of S
            pblas_tran(1.0, S_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor, self.mm_block_descriptor)

            pblas_simple_gemm(self.Cnm_block_descriptor,
                              self.mm_block_descriptor,
                              self.Cnm_block_descriptor,
                              temp_blockC_nm,
                              temp_block_mm,
                              target_blockC_nm)

            # 2. target = (S-0.5j*H*dt)^-1 * target
            # temp_block_mm[:] = S_MM + (0.5j*dt) * H_MM
            # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
            # Lower diagonal matrix:
            temp_block_mm[:] = S_MM + (0.5j * dt) * H_MM
            # Note it's strictly lower diagonal matrix:
            scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U')
            # Add transpose of H:
            pblas_tran(+0.5j * dt, H_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor, self.mm_block_descriptor)
            # Add transpose of S
            pblas_tran(1.0, S_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor, self.mm_block_descriptor)

            scalapack_solve(self.mm_block_descriptor,
                            self.Cnm_block_descriptor,
                            temp_block_mm,
                            target_blockC_nm)

            if self.density.gd.comm.rank != 0:  # XXX is this correct?
                # XXX Fake blacs nbands, nao, nbands, nao grid because some
                # weird asserts
                # (these are 0,x or x,0 arrays)
                target = self.CnM_unique_descriptor.zeros(dtype=complex)
            else:
                target = targetC_nM
            self.Cnm2nM.redistribute(target_blockC_nm, target)

            self.density.gd.comm.broadcast(targetC_nM, 0)  # Is this required?
        else:
            # Note: The full equation is conjugated (therefore -+, not +-)
            targetC_nM[:] = \
                solve(S_MM - 0.5j * H_MM * dt,
                      np.dot(S_MM + 0.5j * H_MM * dt,
                             sourceC_nM.T.conjugate())).T.conjugate()

        self.timer.stop('Linear solve')

    def blacs_mm_to_global(self, H_mm):
        # Collect a distributed mm matrix to a full replicated MM matrix.
        if not debug:
            raise RuntimeError('Use debug mode')
        # Someone could verify that this works and remove the error.
        raise NotImplementedError('Method untested and thus unreliable')
        target = self.MM_descriptor.empty(dtype=complex)
        self.mm2MM.redistribute(H_mm, target)
        self.wfs.world.barrier()
        return target

    def blacs_nm_to_global(self, C_nm):
        # Collect distributed coefficients to the unique nM layout.
        # Someone could verify that this works and remove the error.
        raise NotImplementedError('Method untested and thus unreliable')
        target = self.CnM_unique_descriptor.empty(dtype=complex)
        self.Cnm2nM.redistribute(C_nm, target)
        self.wfs.world.barrier()
        return target

    def todict(self):
        return {'name': 'ecn'}
class LrTDDFPTSolveLayout:
    """BLACS layouts for distributed TD-DFPT.

    Sets up BLACS grids/descriptors for the (transposed) TD-DFPT matrix
    and right-hand-side vector, plus the redistributors between the
    storage layout and the solve layout.
    """

    def __init__(self, sl_lrtddft, nrows, lr_comms):
        # sl_lrtddft: (mprocs, nprocs, block_size) for the solve grid
        self.mprocs, self.nprocs, self.block_size = tuple(sl_lrtddft)

        self.lr_comms = lr_comms

        # for SCALAPACK we need TRANSPOSED MATRIX (and vector)
        #
        # -----------------------------------------------------------------
        # matrix
        # original grid, ie, how matrix is stored
        self.orig_matrix_grid = BlacsGrid(self.lr_comms.parent_comm,
                                          self.lr_comms.dd_comm.size,
                                          self.lr_comms.eh_comm.size)

        # solve grid
        self.solve_matrix_grid = BlacsGrid(self.lr_comms.parent_comm,
                                           self.mprocs, self.nprocs)

        # M = rows, N = cols (matrix is stored transposed: descriptor N, M)
        M = nrows * 4
        N = nrows * 4
        mb = 4
        nb = 4
        self.orig_matrix_descr = self.orig_matrix_grid.new_descriptor(
            N, M, nb, mb)

        bs = self.block_size
        self.solve_matrix_descr = self.solve_matrix_grid.new_descriptor(
            N, M, bs, bs)

        self.matrix_in_redist = Redistributor(self.lr_comms.parent_comm,
                                              self.orig_matrix_descr,
                                              self.solve_matrix_descr)

        # -----------------------------------------------------------------
        # vector
        # original grid, ie, how vector is stored
        self.orig_vector_grid = BlacsGrid(
            self.lr_comms.parent_comm, 1,
            (self.lr_comms.dd_comm.size * self.lr_comms.eh_comm.size))

        # M = rows, N = cols (single right-hand side, stored transposed)
        M = nrows * 4
        Nrhs = 1
        mb = 4
        nb = 1
        self.orig_vector_descr = self.orig_vector_grid.new_descriptor(
            Nrhs, M, nb, mb)

        bs = self.block_size
        self.solve_vector_descr = self.solve_matrix_grid.new_descriptor(
            Nrhs, M, 1, bs)

        self.vector_in_redist = Redistributor(self.lr_comms.parent_comm,
                                              self.orig_vector_descr,
                                              self.solve_vector_descr)
        self.vector_out_redist = Redistributor(self.lr_comms.parent_comm,
                                               self.solve_vector_descr,
                                               self.orig_vector_descr)

    def solve(self, A_orig, b_orig):
        """Solve TD-DFPT equation using Scalapack.

        Redistributes A and b to the solve layout, calls the ScaLAPACK
        solver, and redistributes the solution back into b_orig, which is
        returned.  Raises RuntimeError if the solver reports an error.
        """
        # Redistribute the matrix to the solve layout; ranks outside the
        # storage grid contribute an empty array.
        A_solve = self.solve_matrix_descr.empty(dtype=float)
        if not self.orig_matrix_descr.blacsgrid.is_active():
            A_orig = np.empty((0, 0), dtype=float)
        self.matrix_in_redist.redistribute(A_orig, A_solve)

        # Same for the right-hand-side vector.
        b_solve = self.solve_vector_descr.empty(dtype=float)
        if not self.orig_vector_descr.blacsgrid.is_active():
            b_orig = np.empty((0, 0), dtype=float)
        self.vector_in_redist.redistribute(b_orig, b_solve)

        info = 0
        if self.solve_matrix_descr.blacsgrid.is_active():
            # Bug fix: capture the solver's return value.  Previously the
            # result was discarded, so info stayed 0 and the error check
            # below was dead code.
            info = _gpaw.scalapack_solve(A_solve,
                                         self.solve_matrix_descr.asarray(),
                                         b_solve,
                                         self.solve_vector_descr.asarray())
        if info != 0:
            raise RuntimeError('scalapack_solve error: %d' % info)

        # Solution back to the original (storage) layout, in place in b_orig.
        self.vector_out_redist.redistribute(b_solve, b_orig)

        return b_orig
    def diagonalize_full_hamiltonian(self, ham, atoms, occupations, txt,
                                     nbands=None, scalapack=None,
                                     expert=False):
        """Exact diagonalization of the plane-wave Hamiltonian per k-point.

        Builds H and S in the plane-wave basis, diagonalizes (optionally
        with ScaLAPACK), stores the lowest nbands eigenstates on each
        kpt, recomputes projections and occupations.

        scalapack: None, or (nprow, npcol, blocksize) tuple to force a grid.
        expert: restrict the serial diagonalization to the lowest nbands.
        """
        assert self.dtype == complex

        if nbands is None:
            # Round down to a multiple of the band communicator size
            nbands = self.pd.ngmin // self.bd.comm.size * self.bd.comm.size
        else:
            assert nbands <= self.pd.ngmin

        if expert:
            iu = nbands
        else:
            iu = None

        self.bd = bd = BandDescriptor(nbands, self.bd.comm)

        p = functools.partial(print, file=txt)
        p('Diagonalizing full Hamiltonian ({0} lowest bands)'.format(nbands))
        p('Matrix size (min, max): {0}, {1}'.format(self.pd.ngmin,
                                                    self.pd.ngmax))
        # 3 complex matrices of size ngmax^2, 16 bytes per element
        mem = 3 * self.pd.ngmax**2 * 16 / bd.comm.size / 1024**2
        p('Approximate memory usage per core: {0:.3f} MB'.format(mem))
        if bd.comm.size > 1:
            if isinstance(scalapack, (list, tuple)):
                nprow, npcol, b = scalapack
            else:
                # Nearly-square process grid dividing the communicator size
                nprow = int(round(bd.comm.size**0.5))
                while bd.comm.size % nprow != 0:
                    nprow -= 1
                npcol = bd.comm.size // nprow
                b = 64
            p('ScaLapack grid: {0}x{1},'.format(nprow, npcol),
              'block-size:', b)
            bg = BlacsGrid(bd.comm, bd.comm.size, 1)
            bg2 = BlacsGrid(bd.comm, nprow, npcol)
            scalapack = True
        else:
            nprow = npcol = 1
            scalapack = False

        self.pt.set_positions(atoms.get_scaled_positions())
        self.kpt_u[0].P_ani = None
        self.allocate_arrays_for_projections(self.pt.my_atom_indices)

        myslice = bd.get_slice()

        pb = ProgressBar(txt)
        nkpt = len(self.kpt_u)

        for u, kpt in enumerate(self.kpt_u):
            pb.update(u / nkpt)
            npw = len(self.pd.Q_qG[kpt.q])
            if scalapack:
                mynpw = -(-npw // bd.comm.size)  # ceil division
                md = BlacsDescriptor(bg, npw, npw, mynpw, npw)
                md2 = BlacsDescriptor(bg2, npw, npw, b, b)
            else:
                md = md2 = MatrixDescriptor(npw, npw)

            with self.timer('Build H and S'):
                H_GG, S_GG = self.hs(ham, kpt.q, kpt.s, md)

            if scalapack:
                # Move from row-striped to 2D block-cyclic layout
                r = Redistributor(bd.comm, md, md2)
                H_GG = r.redistribute(H_GG)
                S_GG = r.redistribute(S_GG)

            psit_nG = md2.empty(dtype=complex)
            eps_n = np.empty(npw)

            with self.timer('Diagonalize'):
                if not scalapack:
                    md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n,
                                               iu=iu)
                else:
                    md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n)
            del H_GG, S_GG

            kpt.eps_n = eps_n[myslice].copy()

            if scalapack:
                # Back to band-striped layout so each rank holds its bands
                md3 = BlacsDescriptor(bg, npw, npw, bd.mynbands, npw)
                r = Redistributor(bd.comm, md2, md3)
                psit_nG = r.redistribute(psit_nG)

            kpt.psit_nG = psit_nG[:bd.mynbands].copy()
            del psit_nG

            with self.timer('Projections'):
                self.pt.integrate(kpt.psit_nG, kpt.P_ani, kpt.q)

            kpt.f_n = None

        pb.finish()

        occupations.calculate(self)
def main(N=73, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Compare ScaLAPACK diagonalization/Cholesky against serial LAPACK.

    Builds random (Hermitian for complex) H and S on the master, computes
    reference results serially, redistributes to a block-cyclic layout,
    runs the ScaLAPACK drivers and asserts the errors are below tol.
    (Python 2 script: uses print statements.)
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if (dtype == complex):
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    glob = grid.new_descriptor(N, N, N, N)

    # print globA.asarray()
    # Populate matrices local to master:
    H0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    S0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    C0 = glob.empty(dtype=dtype)
    if rank == 0:
        # Complex case must have real numbers on the diagonal.
        # We make a simple complex Hermitian matrix below.
        H0 = H0 + epsilon * (0.1 * np.tri(N, N, k=-N // nprocs) +
                             0.3 * np.tri(N, N, k=-1))
        S0 = S0 + epsilon * (0.2 * np.tri(N, N, k=-N // nprocs) +
                             0.4 * np.tri(N, N, k=-1))
        # Make matrices symmetric
        rk(1.0, H0.copy(), 0.0, H0)
        rk(1.0, S0.copy(), 0.0, S0)

        # Overlap matrix must be semi-positive definite
        S0 = S0 + 50.0 * np.eye(N, N, 0)
        # Hamiltonian is usually diagonally dominant
        H0 = H0 + 75.0 * np.eye(N, N, 0)
        C0 = S0.copy()

    # Local result matrices
    W0 = np.empty((N), dtype=float)
    W0_g = np.empty((N), dtype=float)

    # Calculate eigenvalues
    if rank == 0:
        diagonalize(H0.copy(), W0)
        general_diagonalize(H0.copy(), W0_g, S0.copy())
        inverse_cholesky(C0)  # result returned in lower triangle
        # tri2full(C0)  # symmetrize

    assert glob.check(H0) and glob.check(S0) and glob.check(C0)

    # Create distributed destriptors with various block sizes:
    dist = grid.new_descriptor(N, N, 8, 8)

    # Distributed matrices:
    # We can use empty here, but end up with garbage on
    # on the other half of the triangle when we redistribute.
    # This is fine because ScaLAPACK does not care.
    H = dist.empty(dtype=dtype)
    S = dist.empty(dtype=dtype)
    Z = dist.empty(dtype=dtype)
    C = dist.empty(dtype=dtype)

    # Eigenvalues are non-BLACS matrices
    W = np.empty((N), dtype=float)
    W_dc = np.empty((N), dtype=float)
    W_mr3 = np.empty((N), dtype=float)
    W_g = np.empty((N), dtype=float)
    W_g_dc = np.empty((N), dtype=float)
    W_g_mr3 = np.empty((N), dtype=float)

    Glob2dist = Redistributor(world, glob, dist)
    Glob2dist.redistribute(H0, H, uplo='L')
    Glob2dist.redistribute(S0, S, uplo='L')
    Glob2dist.redistribute(S0, C, uplo='L')  # C0 was previously overwritten

    # we don't test the expert drivers anymore since there
    # might be a buffer overflow error
    ## scalapack_diagonalize_ex(dist, H.copy(), Z, W, 'L')
    scalapack_diagonalize_dc(dist, H.copy(), Z, W_dc, 'L')
    ## scalapack_diagonalize_mr3(dist, H.copy(), Z, W_mr3, 'L')
    ## scalapack_general_diagonalize_ex(dist, H.copy(), S.copy(), Z,
    ##                                  W_g, 'L')
    scalapack_general_diagonalize_dc(dist, H.copy(), S.copy(), Z,
                                     W_g_dc, 'L')
    ## scalapack_general_diagonalize_mr3(dist, H.copy(), S.copy(), Z,
    ##                                   W_g_mr3, 'L')

    scalapack_inverse_cholesky(dist, C, 'L')

    # Undo redistribute
    C_test = glob.empty(dtype=dtype)
    Dist2glob = Redistributor(world, dist, glob)
    Dist2glob.redistribute(C, C_test)

    if rank == 0:
        ## diag_ex_err = abs(W - W0).max()
        diag_dc_err = abs(W_dc - W0).max()
        ## diag_mr3_err = abs(W_mr3 - W0).max()
        ## general_diag_ex_err = abs(W_g - W0_g).max()
        general_diag_dc_err = abs(W_g_dc - W0_g).max()
        ## general_diag_mr3_err = abs(W_g_mr3 - W0_g).max()
        inverse_chol_err = abs(C_test - C0).max()
        ## print 'diagonalize ex err', diag_ex_err
        print 'diagonalize dc err', diag_dc_err
        ## print 'diagonalize mr3 err', diag_mr3_err
        ## print 'general diagonalize ex err', general_diag_ex_err
        print 'general diagonalize dc err', general_diag_dc_err
        ## print 'general diagonalize mr3 err', general_diag_mr3_err
        print 'inverse chol err', inverse_chol_err
    else:
        ## diag_ex_err = 0.0
        diag_dc_err = 0.0
        ## diag_mr3_err = 0.0
        ## general_diag_ex_err = 0.0
        general_diag_dc_err = 0.0
        ## general_diag_mr3_err = 0.0
        inverse_chol_err = 0.0

    # We don't like exceptions on only one cpu
    ## diag_ex_err = world.sum(diag_ex_err)
    diag_dc_err = world.sum(diag_dc_err)
    ## diag_mr3_err = world.sum(diag_mr3_err)
    ## general_diag_ex_err = world.sum(general_diag_ex_err)
    general_diag_dc_err = world.sum(general_diag_dc_err)
    ## general_diag_mr3_err = world.sum(general_diag_mr3_err)
    inverse_chol_err = world.sum(inverse_chol_err)

    ## assert diag_ex_err < tol
    assert diag_dc_err < tol
    ## assert diag_mr3_err < tol
    ## assert general_diag_ex_err < tol
    assert general_diag_dc_err < tol
    ## assert general_diag_mr3_err < tol
    assert inverse_chol_err < tol
    def calculate_rkernel(self):
        """Compute the real-space rALDA-type Hartree-exchange kernel.

        Builds f_rr / V_rr on the real-space grid (summed over lattice
        points with weights), Fourier transforms over r and r' for each
        q-point, and writes fhxc_sGsG to a .gpw file per q.
        Parallelized over grid points, then over plane waves.
        """
        gd = self.gd
        ng_c = gd.N_c
        cell_cv = gd.cell_cv
        icell_cv = 2 * np.pi * np.linalg.inv(cell_cv)
        vol = np.linalg.det(cell_cv)

        ns = self.calc.wfs.nspins
        n_g = self.n_g  # density on rough grid

        fx_g = ns * self.get_fxc_g(n_g)  # local exchange kernel
        qc_g = (-4 * np.pi * ns / fx_g)**0.5  # cutoff functional
        flocal_g = qc_g**3 * fx_g / (6 * np.pi**2)  # ren. x-kernel for r=r'
        Vlocal_g = 2 * qc_g / np.pi  # ren. Hartree kernel for r=r'

        ng = np.prod(ng_c)  # number of grid points
        r_vg = gd.get_grid_point_coordinates()
        rx_g = r_vg[0].flatten()
        ry_g = r_vg[1].flatten()
        rz_g = r_vg[2].flatten()

        prnt(' %d grid points and %d plane waves at the Gamma point' %
             (ng, self.pd.ngmax), file=self.fd)

        # Unit cells
        R_Rv = []
        weight_R = []
        nR_v = self.unit_cells
        nR = np.prod(nR_v)
        for i in range(-nR_v[0] + 1, nR_v[0]):
            for j in range(-nR_v[1] + 1, nR_v[1]):
                for h in range(-nR_v[2] + 1, nR_v[2]):
                    R_Rv.append(i * cell_cv[0] +
                                j * cell_cv[1] +
                                h * cell_cv[2])
                    weight_R.append((nR_v[0] - abs(i)) *
                                    (nR_v[1] - abs(j)) *
                                    (nR_v[2] - abs(h)) / float(nR))
        if nR > 1:
            # with more than one unit cell only the exchange kernel is
            # calculated on the grid. The bare Coulomb kernel is added
            # in PW basis and Vlocal_g only the exchange part
            dv = self.calc.density.gd.dv
            gc = (3 * dv / 4 / np.pi)**(1 / 3.)
            Vlocal_g -= 2 * np.pi * gc**2 / dv
            prnt(' Lattice point sampling: ' +
                 '(%s x %s x %s)^2 ' % (nR_v[0], nR_v[1], nR_v[2]) +
                 ' Reduced to %s lattice points' % len(R_Rv), file=self.fd)

        # Distribute grid points over ranks (ceil division for the size)
        l_g_size = -(-ng // mpi.world.size)
        l_g_range = range(mpi.world.rank * l_g_size,
                          min((mpi.world.rank + 1) * l_g_size, ng))

        fhxc_qsGr = {}
        for iq in range(len(self.ibzq_qc)):
            fhxc_qsGr[iq] = np.zeros(
                (ns, len(self.pd.G2_qG[iq]), len(l_g_range)), dtype=complex)

        # Silence 1/0 and 0/0 warnings inside the loop; restored afterwards
        inv_error = np.seterr()
        np.seterr(invalid='ignore')
        np.seterr(divide='ignore')

        t0 = time()
        # Loop over Lattice points
        for i, R_v in enumerate(R_Rv):
            # Loop over r'. f_rr and V_rr are functions of r (dim. as r_vg[0])
            if i == 1:
                prnt(' Finished 1 cell in %s seconds' % int(time() - t0) +
                     ' - estimated %s seconds left' % int(
                         (len(R_Rv) - 1) * (time() - t0)), file=self.fd)
                self.fd.flush()
            if len(R_Rv) > 5:
                if (i + 1) % (len(R_Rv) / 5 + 1) == 0:
                    prnt(' Finished %s cells in %s seconds'
                         % (i, int(time() - t0)) +
                         ' - estimated %s seconds left' % int(
                             (len(R_Rv) - i) * (time() - t0) / i),
                         file=self.fd)
                    self.fd.flush()
            for g in l_g_range:
                rx = rx_g[g] + R_v[0]
                ry = ry_g[g] + R_v[1]
                rz = rz_g[g] + R_v[2]

                # |r-r'-R_i|
                rr = ((r_vg[0] - rx)**2 +
                      (r_vg[1] - ry)**2 +
                      (r_vg[2] - rz)**2)**0.5

                n_av = (n_g + n_g.flatten()[g]) / 2.
                fx_g = ns * self.get_fxc_g(n_av, index=g)
                qc_g = (-4 * np.pi * ns / fx_g)**0.5
                x = qc_g * rr
                osc_x = np.sin(x) - x * np.cos(x)
                f_rr = fx_g * osc_x / (2 * np.pi**2 * rr**3)
                if nR > 1:  # include only exchange part of the kernel here
                    V_rr = (sici(x)[0] * 2 / np.pi - 1) / rr
                else:  # include the full kernel (also hartree part)
                    V_rr = (sici(x)[0] * 2 / np.pi) / rr

                # Terms with r = r'
                if (np.abs(R_v) < 0.001).all():
                    tmp_flat = f_rr.flatten()
                    tmp_flat[g] = flocal_g.flatten()[g]
                    f_rr = tmp_flat.reshape(ng_c)
                    tmp_flat = V_rr.flatten()
                    tmp_flat[g] = Vlocal_g.flatten()[g]
                    V_rr = tmp_flat.reshape(ng_c)
                    del tmp_flat

                f_rr[np.where(n_av < self.density_cut)] = 0.0
                V_rr[np.where(n_av < self.density_cut)] = 0.0

                f_rr *= weight_R[i]
                V_rr *= weight_R[i]

                # r-r'-R_i
                r_r = np.array([r_vg[0] - rx, r_vg[1] - ry, r_vg[2] - rz])

                # Fourier transform of r
                for iq, q in enumerate(self.ibzq_qc):
                    q_v = np.dot(q, icell_cv)
                    e_q = np.exp(-1j * gemmdot(q_v, r_r, beta=0.0))
                    f_q = self.pd.fft((f_rr + V_rr) * e_q, iq) * vol / ng
                    fhxc_qsGr[iq][0, :, g - l_g_range[0]] += f_q
                    if ns == 2:
                        f_q = self.pd.fft(V_rr * e_q, iq) * vol / ng
                        fhxc_qsGr[iq][1, :, g - l_g_range[0]] += f_q

        mpi.world.barrier()

        np.seterr(**inv_error)

        for iq, q in enumerate(self.ibzq_qc):
            npw = len(self.pd.G2_qG[iq])
            fhxc_sGsG = np.zeros((ns * npw, ns * npw), complex)
            l_pw_size = -(-npw // mpi.world.size)  # parallelize over PW below
            l_pw_range = range(mpi.world.rank * l_pw_size,
                               min((mpi.world.rank + 1) * l_pw_size, npw))

            if mpi.world.size > 1:
                # redistribute grid and plane waves in fhxc_qsGr[iq]
                bg1 = BlacsGrid(mpi.world, 1, mpi.world.size)
                bg2 = BlacsGrid(mpi.world, mpi.world.size, 1)
                # NOTE(review): these use '/' not '//' — under Python 3
                # true division this yields a float block size; the l_g_size
                # above uses '//'.  Confirm and unify.
                bd1 = bg1.new_descriptor(npw, ng, npw,
                                         -(-ng / mpi.world.size))
                bd2 = bg2.new_descriptor(npw, ng,
                                         -(-npw / mpi.world.size), ng)

                fhxc_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)
                if ns == 2:
                    Koff_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)

                r = Redistributor(bg1.comm, bd1, bd2)
                r.redistribute(fhxc_qsGr[iq][0], fhxc_Glr, npw, ng)
                if ns == 2:
                    r.redistribute(fhxc_qsGr[iq][1], Koff_Glr, npw, ng)
            else:
                fhxc_Glr = fhxc_qsGr[iq][0]
                if ns == 2:
                    Koff_Glr = fhxc_qsGr[iq][1]

            # Fourier transform of r'
            for iG in range(len(l_pw_range)):
                f_g = fhxc_Glr[iG].reshape(ng_c)
                f_G = self.pd.fft(f_g.conj(), iq) * vol / ng
                fhxc_sGsG[l_pw_range[0] + iG, :npw] = f_G.conj()

                if ns == 2:
                    v_g = Koff_Glr[iG].reshape(ng_c)
                    v_G = self.pd.fft(v_g.conj(), iq) * vol / ng
                    fhxc_sGsG[npw + l_pw_range[0] + iG, :npw] = v_G.conj()

            if ns == 2:  # f_00 = f_11 and f_01 = f_10
                fhxc_sGsG[:npw, npw:] = fhxc_sGsG[npw:, :npw]
                fhxc_sGsG[npw:, npw:] = fhxc_sGsG[:npw, :npw]

            mpi.world.sum(fhxc_sGsG)
            fhxc_sGsG /= vol

            if mpi.rank == 0:
                w = Writer('fhxc_%s_%s_%s_%s.gpw' %
                           (self.tag, self.xc, self.ecut, iq))
                w.dimension('sG', ns * npw)
                w.add('fhxc_sGsG', ('sG', 'sG'), dtype=complex)
                if nR > 1:  # add Hartree kernel evaluated in PW basis
                    Gq2_G = self.pd.G2_qG[iq]
                    if (q == 0).all():
                        Gq2_G[0] = 1.
                    vq_G = 4 * np.pi / Gq2_G
                    fhxc_sGsG += np.tile(np.eye(npw) * vq_G, (ns, ns))
                w.fill(fhxc_sGsG)
                w.close()
            mpi.world.barrier()
        prnt(file=self.fd)
def calculate_blocked_density_matrix(self, f_n, C_nM): nbands = self.bd.nbands nao = self.nao dtype = C_nM.dtype self.nMdescriptor.checkassert(C_nM) if self.gd.rank == 0: Cf_nM = (C_nM * f_n[:, None]) else: C_nM = self.nM_unique_descriptor.zeros(dtype=dtype) Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype) r = Redistributor(self.block_comm, self.nM_unique_descriptor, self.mmdescriptor) Cf_mm = self.mmdescriptor.zeros(dtype=dtype) r.redistribute(Cf_nM, Cf_mm, nbands, nao) del Cf_nM C_mm = self.mmdescriptor.zeros(dtype=dtype) r.redistribute(C_nM, C_mm, nbands, nao) # no use to delete C_nM as it's in the input... rho_mm = self.mmdescriptor.zeros(dtype=dtype) if 1: # if self.libelpa is None: pblas_simple_gemm(self.mmdescriptor, self.mmdescriptor, self.mmdescriptor, Cf_mm, C_mm, rho_mm, transa='C') else: # elpa_hermitian_multiply was not faster than the ordinary # multiplication in the test. The way we have things distributed, # we need to transpose things at the moment. # # Rather than enabling this, we should store the coefficients # in an appropriate 2D block cyclic format (c_nm) and not the # current C_nM format. This makes it possible to avoid # redistributing the coefficients at all. But we don't have time # to implement this at the moment. mul = self.libelpa.hermitian_multiply desc = self.mmdescriptor from gpaw.utilities.scalapack import pblas_tran def T(array): tmp = array.copy() pblas_tran(alpha=1.0, a_MN=tmp, beta=0.0, c_NM=array, desca=desc, descc=desc) T(C_mm) T(Cf_mm) mul(C_mm, Cf_mm, rho_mm, desc, desc, desc, uplo_a='X', uplo_c='X') return rho_mm
def main(N=72, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Compare ScaLAPACK drivers (incl. matrix inverse) with serial LAPACK.

    Like the plain diagonalization test, but additionally checks
    scalapack_inverse (complex only) against a serial inverse.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if (dtype == complex):
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    glob = grid.new_descriptor(N, N, N, N)

    # print globA.asarray()
    # Populate matrices local to master:
    H0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    S0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    C0 = glob.empty(dtype=dtype)
    if rank == 0:
        # Complex case must have real numbers on the diagonal.
        # We make a simple complex Hermitian matrix below.
        H0 = H0 + epsilon * (0.1 * np.tri(N, N, k=-N // nprocs) +
                             0.3 * np.tri(N, N, k=-1))
        S0 = S0 + epsilon * (0.2 * np.tri(N, N, k=-N // nprocs) +
                             0.4 * np.tri(N, N, k=-1))
        # Make matrices symmetric
        rk(1.0, H0.copy(), 0.0, H0)
        rk(1.0, S0.copy(), 0.0, S0)

        # Overlap matrix must be semi-positive definite
        S0 = S0 + 50.0 * np.eye(N, N, 0)
        # Hamiltonian is usually diagonally dominant
        H0 = H0 + 75.0 * np.eye(N, N, 0)

        C0 = S0.copy()
        S0_inv = S0.copy()

    # Local result matrices
    W0 = np.empty((N), dtype=float)
    W0_g = np.empty((N), dtype=float)

    # Calculate eigenvalues / other serial results
    if rank == 0:
        diagonalize(H0.copy(), W0)
        general_diagonalize(H0.copy(), W0_g, S0.copy())
        inverse_cholesky(C0)  # result returned in lower triangle
        tri2full(S0_inv, 'L')
        S0_inv = inv(S0_inv)
        # tri2full(C0)  # symmetrize

    assert glob.check(H0) and glob.check(S0) and glob.check(C0)

    # Create distributed destriptors with various block sizes:
    dist = grid.new_descriptor(N, N, 8, 8)

    # Distributed matrices:
    # We can use empty here, but end up with garbage on
    # on the other half of the triangle when we redistribute.
    # This is fine because ScaLAPACK does not care.
    H = dist.empty(dtype=dtype)
    S = dist.empty(dtype=dtype)
    Sinv = dist.empty(dtype=dtype)
    Z = dist.empty(dtype=dtype)
    C = dist.empty(dtype=dtype)
    # NOTE(review): duplicate allocation — Sinv was already created above.
    Sinv = dist.empty(dtype=dtype)

    # Eigenvalues are non-BLACS matrices
    W = np.empty((N), dtype=float)
    W_dc = np.empty((N), dtype=float)
    W_mr3 = np.empty((N), dtype=float)
    W_g = np.empty((N), dtype=float)
    W_g_dc = np.empty((N), dtype=float)
    W_g_mr3 = np.empty((N), dtype=float)

    Glob2dist = Redistributor(world, glob, dist)
    Glob2dist.redistribute(H0, H, uplo='L')
    Glob2dist.redistribute(S0, S, uplo='L')
    Glob2dist.redistribute(S0, C, uplo='L')  # C0 was previously overwritten
    Glob2dist.redistribute(S0, Sinv, uplo='L')

    # we don't test the expert drivers anymore since there
    # might be a buffer overflow error
    ## scalapack_diagonalize_ex(dist, H.copy(), Z, W, 'L')
    scalapack_diagonalize_dc(dist, H.copy(), Z, W_dc, 'L')
    ## scalapack_diagonalize_mr3(dist, H.copy(), Z, W_mr3, 'L')
    ## scalapack_general_diagonalize_ex(dist, H.copy(), S.copy(), Z,
    ##                                  W_g, 'L')
    scalapack_general_diagonalize_dc(dist, H.copy(), S.copy(), Z,
                                     W_g_dc, 'L')
    ## scalapack_general_diagonalize_mr3(dist, H.copy(), S.copy(), Z,
    ##                                   W_g_mr3, 'L')

    scalapack_inverse_cholesky(dist, C, 'L')

    if dtype == complex:  # Only supported for complex for now
        scalapack_inverse(dist, Sinv, 'L')

    # Undo redistribute
    C_test = glob.empty(dtype=dtype)
    Sinv_test = glob.empty(dtype=dtype)
    Dist2glob = Redistributor(world, dist, glob)
    Dist2glob.redistribute(C, C_test)
    Dist2glob.redistribute(Sinv, Sinv_test)

    if rank == 0:
        ## diag_ex_err = abs(W - W0).max()
        diag_dc_err = abs(W_dc - W0).max()
        ## diag_mr3_err = abs(W_mr3 - W0).max()
        ## general_diag_ex_err = abs(W_g - W0_g).max()
        general_diag_dc_err = abs(W_g_dc - W0_g).max()
        ## general_diag_mr3_err = abs(W_g_mr3 - W0_g).max()
        inverse_chol_err = abs(C_test - C0).max()

        tri2full(Sinv_test, 'L')
        inverse_err = abs(Sinv_test - S0_inv).max()
        ## print 'diagonalize ex err', diag_ex_err
        print('diagonalize dc err', diag_dc_err)
        ## print 'diagonalize mr3 err', diag_mr3_err
        ## print 'general diagonalize ex err', general_diag_ex_err
        print('general diagonalize dc err', general_diag_dc_err)
        ## print 'general diagonalize mr3 err', general_diag_mr3_err
        print('inverse chol err', inverse_chol_err)
        if dtype == complex:
            print('inverse err', inverse_err)
    else:
        ## diag_ex_err = 0.0
        diag_dc_err = 0.0
        ## diag_mr3_err = 0.0
        ## general_diag_ex_err = 0.0
        general_diag_dc_err = 0.0
        ## general_diag_mr3_err = 0.0
        inverse_chol_err = 0.0
        inverse_err = 0.0

    # We don't like exceptions on only one cpu
    ## diag_ex_err = world.sum(diag_ex_err)
    diag_dc_err = world.sum(diag_dc_err)
    ## diag_mr3_err = world.sum(diag_mr3_err)
    ## general_diag_ex_err = world.sum(general_diag_ex_err)
    general_diag_dc_err = world.sum(general_diag_dc_err)
    ## general_diag_mr3_err = world.sum(general_diag_mr3_err)
    inverse_chol_err = world.sum(inverse_chol_err)
    inverse_err = world.sum(inverse_err)

    ## assert diag_ex_err < tol
    assert diag_dc_err < tol
    ## assert diag_mr3_err < tol
    ## assert general_diag_ex_err < tol
    assert general_diag_dc_err < tol
    ## assert general_diag_mr3_err < tol
    assert inverse_chol_err < tol
    if dtype == complex:
        assert inverse_err < tol
class LCAOTDDFT(GPAW):
    """Real-time time-propagation TDDFT calculator in the LCAO basis.

    Extends GPAW with time propagation of the LCAO coefficients C_nM,
    using either a Crank-Nicolson-type linear solve ('cn', default) or
    a Taylor-expansion propagator ('taylor', prepared in tddft_init).
    Dipole-moment data is logged to a file during propagation.
    """

    def __init__(self, filename=None, propagator_debug=False,
                 propagator='cn', fxc=None, **kwargs):
        """Create the calculator.

        filename: restart file; when given, initialize() and
            set_positions() are called immediately.
        propagator_debug: if True, each step is checked against a dense
            serial reference propagator (slow, debugging only).
        propagator: 'cn' (Crank-Nicolson linear solver) or 'taylor'.
        fxc: XC functional name used to build a frozen delta-XC
            correction to the Hamiltonian (see propagate()).
        Remaining keyword arguments are passed to GPAW.__init__.
        """
        self.time = 0.0
        self.niter = 0
        self.kick_strength = [0.0, 0.0, 0.0]
        GPAW.__init__(self, filename, **kwargs)
        self.propagator_debug = propagator_debug
        self.tddft_initialized = False
        self.fxc = fxc
        self.propagator = propagator
        # Restarting from a file
        if filename is not None:
            self.initialize()
            self.set_positions()

    def propagate_wfs(self, sourceC_nm, targetC_nm, S_MM, H_MM, dt):
        """Propagate sourceC_nm by dt into targetC_nm.

        Dispatches on the propagator chosen in __init__; only 'cn' is
        handled here.
        """
        if self.propagator == 'cn':
            return self.linear_propagator(sourceC_nm, targetC_nm,
                                          S_MM, H_MM, dt)
        raise NotImplementedError

    def linear_propagator(self, sourceC_nM, targetC_nM, S_MM, H_MM, dt):
        """Crank-Nicolson step:
        solve (S - 0.5j*H*dt) C(t+dt) = (S + 0.5j*H*dt) C(t).

        In the BLACS branch S_MM/H_MM are assumed to hold only the
        lower triangle (upper part is rebuilt via pblas_tran).
        sourceC_nM and targetC_nM may alias the same array.
        NOTE(review): the sign convention in the two branches differs
        because the full equation is conjugated in the serial branch
        (see comment there) -- confirm before changing signs.
        """
        self.timer.start('Linear solve')

        # XXX Debugging stuff. Remove
        # Build a dense serial reference propagator U_MM on rank 0 so
        # the distributed result can be verified at the end.
        if self.propagator_debug:
            if self.blacs:
                globalH_MM = self.blacs_mm_to_global(H_MM)
                globalS_MM = self.blacs_mm_to_global(S_MM)
                if world.rank == 0:
                    tri2full(globalS_MM, 'L')
                    tri2full(globalH_MM, 'L')
                    U_MM = dot(inv(globalS_MM - 0.5j * globalH_MM * dt),
                               globalS_MM + 0.5j * globalH_MM * dt)
                    debugC_nM = dot(sourceC_nM, U_MM.T.conjugate())
                    # print 'PASS PROPAGATOR'
                    # debugC_nM = sourceC_nM.copy()
            else:
                if world.rank == 0:
                    U_MM = dot(inv(S_MM - 0.5j * H_MM * dt),
                               S_MM + 0.5j * H_MM * dt)
                    debugC_nM = dot(sourceC_nM, U_MM.T.conjugate())
                    # print 'PASS PROPAGATOR'
                    # debugC_nM = sourceC_nM.copy()

        if self.blacs:
            target_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex)  # XXX, Preallocate
            temp_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex)  # XXX, Preallocate
            temp_block_mm = self.mm_block_descriptor.empty(dtype=complex)
            if self.density.gd.comm.rank != 0:
                # XXX Fake nbands x nao blacs grid because of some weird
                # asserts (these are 0,x or x,0 arrays on non-root ranks)
                sourceC_nM = self.CnM_unique_descriptor.zeros(dtype=complex)

            # 1. target = (S+0.5j*H*dt) * source
            # Wave functions to target
            self.CnM2nm.redistribute(sourceC_nM, temp_blockC_nm)

            # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
            scalapack_zero(self.mm_block_descriptor, H_MM, 'U')  # Remove upper diagonal
            temp_block_mm[:] = S_MM - (0.5j * dt) * H_MM  # Lower diagonal matrix
            scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U')
            # Now it's a strictly lower diagonal matrix
            pblas_tran(-0.5j * dt, H_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor,
                       self.mm_block_descriptor)  # Add transpose of H
            pblas_tran(1.0, S_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor,
                       self.mm_block_descriptor)  # Add transpose of S

            pblas_simple_gemm(self.Cnm_block_descriptor,
                              self.mm_block_descriptor,
                              self.Cnm_block_descriptor,
                              temp_blockC_nm,
                              temp_block_mm,
                              target_blockC_nm)

            # 2. target = (S-0.5j*H*dt)^-1 * target
            # temp_block_mm[:] = S_MM + (0.5j*dt) * H_MM
            # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
            temp_block_mm[:] = S_MM + (0.5j * dt) * H_MM  # Lower diagonal matrix
            scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U')
            # Now it's a strictly lower diagonal matrix
            pblas_tran(+0.5j * dt, H_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor,
                       self.mm_block_descriptor)  # Add transpose of H
            pblas_tran(1.0, S_MM, 1.0, temp_block_mm,
                       self.mm_block_descriptor,
                       self.mm_block_descriptor)  # Add transpose of S

            scalapack_solve(self.mm_block_descriptor,
                            self.Cnm_block_descriptor,
                            temp_block_mm,
                            target_blockC_nm)

            if self.density.gd.comm.rank != 0:  # XXX is this correct?
                # XXX Fake nbands x nao blacs grid because of some weird
                # asserts (these are 0,x or x,0 arrays on non-root ranks)
                target = self.CnM_unique_descriptor.zeros(dtype=complex)
            else:
                target = targetC_nM
            self.Cnm2nM.redistribute(target_blockC_nm, target)
            self.density.gd.comm.broadcast(targetC_nM, 0)  # Is this required?
        else:
            # Note: The full equation is conjugated (therefore -+, not +-)
            targetC_nM[:] = \
                solve(S_MM - 0.5j * H_MM * dt,
                      np.dot(S_MM + 0.5j * H_MM * dt,
                             sourceC_nM.T.conjugate())).T.conjugate()

        # XXX Debugging stuff. Remove
        if self.propagator_debug:
            if world.rank == 0:
                verify(targetC_nM, debugC_nM,
                       'Linear solver propagator vs. reference')

        self.timer.stop('Linear solve')

    def taylor_propagator(self, sourceC_nM, targetC_nM, S_MM, H_MM, dt):
        """Propagate with a 4th-order Taylor expansion.

        Requires the inverse overlap (kpt.invS_MM / kpt.invS) prepared
        in tddft_init for the 'taylor' propagator.
        """
        self.timer.start('Taylor propagator')

        # XXX Debugging stuff. Remove
        # Dense serial reference propagator on rank 0 for verification.
        if self.propagator_debug:
            if self.blacs:
                globalH_MM = self.blacs_mm_to_global(H_MM)
                globalS_MM = self.blacs_mm_to_global(S_MM)
                if world.rank == 0:
                    tri2full(globalS_MM, 'L')
                    tri2full(globalH_MM, 'L')
                    U_MM = dot(inv(globalS_MM - 0.5j * globalH_MM * dt),
                               globalS_MM + 0.5j * globalH_MM * dt)
                    debugC_nM = dot(sourceC_nM, U_MM.T.conjugate())
                    # print 'PASS PROPAGATOR'
                    # debugC_nM = sourceC_nM.copy()
            else:
                if world.rank == 0:
                    U_MM = dot(inv(S_MM - 0.5j * H_MM * dt),
                               S_MM + 0.5j * H_MM * dt)
                    debugC_nM = dot(sourceC_nM, U_MM.T.conjugate())
                    # print 'PASS PROPAGATOR'
                    # debugC_nM = sourceC_nM.copy()

        if self.blacs:
            target_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex)  # XXX, Preallocate
            if self.density.gd.comm.rank != 0:
                # XXX Fake nbands x nao blacs grid because of some weird
                # asserts (these are 0,x or x,0 arrays on non-root ranks)
                sourceC_nM = self.CnM_unique_descriptor.zeros(dtype=complex)

            # Zeroth order taylor to target
            self.CnM2nm.redistribute(sourceC_nM, target_blockC_nm)

            # XXX, preallocate, optimize use of temporal arrays
            temp_blockC_nm = target_blockC_nm.copy()
            temp2_blockC_nm = target_blockC_nm.copy()

            order = 4
            assert self.wfs.kd.comm.size == 1
            for n in range(order):
                # Multiply with hamiltonian
                pblas_simple_hemm(self.mm_block_descriptor,
                                  self.Cnm_block_descriptor,
                                  self.Cnm_block_descriptor,
                                  H_MM,
                                  temp_blockC_nm,
                                  temp2_blockC_nm, side='R')
                # XXX: replace with not simple gemm
                temp2_blockC_nm *= -1j * dt / (n + 1)
                # Multiply with inverse overlap
                pblas_simple_hemm(self.mm_block_descriptor,
                                  self.Cnm_block_descriptor,
                                  self.Cnm_block_descriptor,
                                  self.wfs.kpt_u[0].invS_MM,  # XXX
                                  temp2_blockC_nm,
                                  temp_blockC_nm, side='R')
                target_blockC_nm += temp_blockC_nm

            if self.density.gd.comm.rank != 0:  # Todo: Change to gd.rank
                # XXX Fake nbands x nao blacs grid because of some weird
                # asserts (these are 0,x or x,0 arrays on non-root ranks)
                target = self.CnM_unique_descriptor.zeros(dtype=complex)
            else:
                target = targetC_nM
            self.Cnm2nM.redistribute(target_blockC_nm, target)
            self.density.gd.comm.broadcast(targetC_nM, 0)
        else:
            assert self.wfs.kd.comm.size == 1
            if self.density.gd.comm.rank == 0:
                targetC_nM[:] = sourceC_nM[:]
                tempC_nM = sourceC_nM.copy()
                order = 4
                for n in range(order):
                    tempC_nM[:] = \
                        np.dot(self.wfs.kpt_u[0].invS,
                               np.dot(H_MM,
                                      1j * dt / (n + 1) *
                                      tempC_nM.T.conjugate())).T.conjugate()
                    targetC_nM += tempC_nM
            self.density.gd.comm.broadcast(targetC_nM, 0)

        # XXX Debugging stuff. Remove
        if self.propagator_debug:
            if world.rank == 0:
                verify(targetC_nM, debugC_nM,
                       'Linear solver propagator vs. reference')

        self.timer.stop('Taylor propagator')

    def kick(self, strength):
        """Apply an instantaneous kick with the given strength vector.

        The kick is realized by propagating with the kick Hamiltonian
        in 10 substeps of dt=0.1.
        """
        self.tddft_init()
        self.timer.start('Kick')
        self.kick_strength = strength

        # magnitude
        magnitude = np.sqrt(strength[0] * strength[0] +
                            strength[1] * strength[1] +
                            strength[2] * strength[2])
        # normalize
        # NOTE(review): assumes strength supports elementwise division
        # (i.e. an ndarray, not a plain list) -- confirm callers.
        direction = strength / magnitude

        self.text('Applying absorbtion kick')
        self.text('Magnitude: %.8f ' % magnitude)
        self.text('Direction: %.4f %.4f %.4f' % tuple(direction))

        # Create hamiltonian object for absorbtion kick
        kick_hamiltonian = KickHamiltonian(
            self, ConstantElectricField(magnitude, direction=direction))
        for k, kpt in enumerate(self.wfs.kpt_u):
            Vkick_MM = self.wfs.eigensolver.calculate_hamiltonian_matrix(
                kick_hamiltonian, self.wfs, kpt, add_kinetic=False, root=-1)
            # Apply the kick as 10 small propagation steps
            for i in range(10):
                self.propagate_wfs(kpt.C_nM, kpt.C_nM, kpt.S_MM,
                                   Vkick_MM, 0.1)
        self.timer.stop('Kick')

    def blacs_mm_to_global(self, H_mm):
        """Redistribute a block-cyclic M x M matrix to the single-block
        MM_descriptor layout (debug helper)."""
        target = self.MM_descriptor.empty(dtype=complex)
        self.mm2MM.redistribute(H_mm, target)
        world.barrier()
        return target

    def blacs_nm_to_global(self, C_nm):
        """Redistribute block-cyclic coefficient rows to the
        CnM_unique_descriptor layout (debug helper)."""
        target = self.CnM_unique_descriptor.empty(dtype=complex)
        self.Cnm2nM.redistribute(C_nm, target)
        world.barrier()
        return target

    def tddft_init(self):
        """One-time setup of BLACS descriptors, redistributors and
        propagator-specific data (inverse overlap for 'taylor').

        Safe to call repeatedly; only the first call does the work.
        """
        if not self.tddft_initialized:
            if world.rank == 0:
                print('Initializing real time LCAO TD-DFT calculation.')
                print('XXX Warning: Array use not optimal for memory.')
                print('XXX Taylor propagator probably doesn\'t work')
                print('XXX ...and no arrays are listed in memory estimate yet.')
            self.blacs = self.wfs.ksl.using_blacs
            if self.blacs:
                self.ksl = ksl = self.wfs.ksl
                nao = ksl.nao
                nbands = ksl.bd.nbands
                mynbands = ksl.bd.mynbands
                blocksize = ksl.blocksize

                from gpaw.blacs import Redistributor
                if world.rank == 0:
                    print('BLACS Parallelization')

                # Parallel grid descriptors
                self.MM_descriptor = ksl.blockgrid.new_descriptor(
                    nao, nao, nao, nao)  # FOR DEBUG
                self.mm_block_descriptor = ksl.blockgrid.new_descriptor(
                    nao, nao, blocksize, blocksize)
                self.Cnm_block_descriptor = ksl.blockgrid.new_descriptor(
                    nbands, nao, blocksize, blocksize)
                # self.CnM_descriptor = ksl.blockgrid.new_descriptor(
                #     nbands, nao, mynbands, nao)
                self.mM_column_descriptor = \
                    ksl.single_column_grid.new_descriptor(
                        nao, nao, ksl.naoblocksize, nao)
                self.CnM_unique_descriptor = \
                    ksl.single_column_grid.new_descriptor(
                        nbands, nao, mynbands, nao)

                # Redistributors
                self.mm2MM = Redistributor(ksl.block_comm,
                                           self.mm_block_descriptor,
                                           self.MM_descriptor)  # XXX FOR DEBUG
                self.MM2mm = Redistributor(ksl.block_comm,
                                           self.MM_descriptor,
                                           self.mm_block_descriptor)  # XXX FOR DEBUG
                self.Cnm2nM = Redistributor(ksl.block_comm,
                                            self.Cnm_block_descriptor,
                                            self.CnM_unique_descriptor)
                self.CnM2nm = Redistributor(ksl.block_comm,
                                            self.CnM_unique_descriptor,
                                            self.Cnm_block_descriptor)
                self.mM2mm = Redistributor(ksl.block_comm,
                                           self.mM_column_descriptor,
                                           self.mm_block_descriptor)

                # Keep only the lower triangles of S and T
                for kpt in self.wfs.kpt_u:
                    scalapack_zero(self.mm_block_descriptor, kpt.S_MM, 'U')
                    scalapack_zero(self.mm_block_descriptor, kpt.T_MM, 'U')

            # XXX to propagator class
            if self.propagator == 'taylor' and self.blacs:
                # cholS_mm = self.mm_block_descriptor.empty(dtype=complex)
                # Parallel inversion of the overlap matrix
                for kpt in self.wfs.kpt_u:
                    kpt.invS_MM = kpt.S_MM.copy()
                    scalapack_inverse(self.mm_block_descriptor,
                                      kpt.invS_MM, 'L')
                if self.propagator_debug:
                    if world.rank == 0:
                        print('XXX Doing serial inversion of overlap matrix.')
                    self.timer.start('Invert overlap (serial)')
                    invS2_MM = self.MM_descriptor.empty(dtype=complex)
                    for kpt in self.wfs.kpt_u:
                        # kpt.S_MM[:] = 128.0*(2**world.rank)
                        self.mm2MM.redistribute(self.wfs.S_qMM[kpt.q],
                                                invS2_MM)
                        world.barrier()
                        if world.rank == 0:
                            tri2full(invS2_MM, 'L')
                            invS2_MM[:] = inv(invS2_MM.copy())
                            self.invS2_MM = invS2_MM
                        kpt.invS2_MM = ksl.mmdescriptor.empty(dtype=complex)
                        self.MM2mm.redistribute(invS2_MM, kpt.invS2_MM)
                        verify(kpt.invS_MM, kpt.invS2_MM,
                               'overlap par. vs. serial', 'L')
                    self.timer.stop('Invert overlap (serial)')
                    if world.rank == 0:
                        print('XXX Overlap inverted.')
            if self.propagator == 'taylor' and not self.blacs:
                tmp = inv(self.wfs.kpt_u[0].S_MM)
                self.wfs.kpt_u[0].invS = tmp

            # Reset the density mixer
            self.density.mixer = DummyMixer()
            self.tddft_initialized = True
            for k, kpt in enumerate(self.wfs.kpt_u):
                kpt.C2_nM = kpt.C_nM.copy()
                # kpt.firstC_nM = kpt.C_nM.copy()

    def update_projectors(self):
        """Recompute the projections P_ani from C_nM and P_aMi."""
        self.timer.start('LCAO update projectors')
        # Loop over all k-points
        for k, kpt in enumerate(self.wfs.kpt_u):
            for a, P_ni in kpt.P_ani.items():
                print('Update projector: Rank:', world.rank, 'a', a)
                # Poison value so stale entries are obvious when debugging
                P_ni.fill(117)
                gemm(1.0, kpt.P_aMi[a], kpt.C_nM, 0.0, P_ni, 'n')
        self.timer.stop('LCAO update projectors')

    def save_wfs(self):
        """Store a copy of the current coefficients in kpt.C2_nM."""
        for k, kpt in enumerate(self.wfs.kpt_u):
            kpt.C2_nM[:] = kpt.C_nM

    def update_hamiltonian(self):
        """Recompute projections, density and Hamiltonian from C_nM."""
        self.update_projectors()
        self.density.update(self.wfs)
        self.hamiltonian.update(self.density)

    def propagate(self, time_step=10, iterations=2000, out='lcao.dm',
                  dump_interval=50):
        """Run predictor-corrector time propagation.

        time_step: step length in attoseconds.
        iterations: number of steps to take (added to self.niter).
        out: file name for the dipole-moment log.
        dump_interval: stored on self; not otherwise used here.
        """
        assert self.wfs.dtype == complex
        time_step *= attosec_to_autime  # attoseconds -> atomic units
        self.time_step = time_step
        self.dump_interval = dump_interval
        maxiter = self.niter + iterations

        # Open the dipole-moment file: a fresh run writes a header,
        # a continuation appends.
        if self.time < self.time_step:
            self.dm_file = paropen(out, 'w')
            # XXXX
            # Bug: will fail if world != self.wfs.world. -askhl
            header = '# Kick = [%22.12le, %22.12le, %22.12le]\n' \
                % (self.kick_strength[0], self.kick_strength[1],
                   self.kick_strength[2])
            header += '# %15s %15s %22s %22s %22s\n' \
                % ('time', 'norm', 'dmx', 'dmy', 'dmz')
            self.dm_file.write(header)
            self.dm_file.flush()
            self.text('About to do %d propagation steps.' % iterations)
        else:
            self.dm_file = paropen(out, 'a')  # XXXX
            self.text('About to continue from iteration %d and do %d propagation steps' % (self.niter, maxiter))

        self.tddft_init()

        dm0 = None  # Initial dipole moment
        self.timer.start('Propagate')
        while self.niter < maxiter:
            # Log norm and dipole moment of the current density
            dm = self.density.finegd.calculate_dipole_moment(
                self.density.rhot_g)
            if dm0 is None:
                dm0 = dm
            norm = self.density.finegd.integrate(self.density.rhot_g)
            line = '%20.8lf %20.8le %22.12le %22.12le %22.12le' \
                % (self.time, norm, dm[0], dm[1], dm[2])
            T = localtime()
            if world.rank == 0:
                print(line, file=self.dm_file)
            if world.rank == 0 and self.niter % 10 == 0:
                print('iter: %3d %02d:%02d:%02d %11.2f %9.1f %12.8f'
                      % (self.niter, T[3], T[4], T[5],
                         self.time * autime_to_attosec,
                         log(abs(norm) + 1e-16) / log(10),
                         np.sqrt(dm[0]**2 + dm[1]**2 + dm[2]**2)))
                self.dm_file.flush()

            # ----------------------------------------------------------
            # Predictor step
            # ----------------------------------------------------------
            # 1. Calculate H(t)
            self.save_wfs()  # kpt.C2_nM = kpt.C_nM
            # 2. H_MM(t) = <M|H(t)|H>
            #    Solve Psi(t+dt) from
            #    (S_MM - 0.5j*H_MM(t)*dt) Psi(t+dt) =
            #        (S_MM + 0.5j*H_MM(t)*dt) Psi(t)
            for k, kpt in enumerate(self.wfs.kpt_u):
                if self.fxc is not None:
                    # On the very first step, store the difference
                    # between the current XC Hamiltonian and the fxc
                    # one as a frozen correction (deltaXC_H_MM).
                    if self.time == 0.0:
                        kpt.deltaXC_H_MM = \
                            self.wfs.eigensolver.calculate_hamiltonian_matrix(
                                self.hamiltonian, self.wfs, kpt, root=-1)
                        self.hamiltonian.xc = XC(self.fxc)
                        self.update_hamiltonian()
                        assert len(self.wfs.kpt_u) == 1
                        kpt.deltaXC_H_MM -= \
                            self.wfs.eigensolver.calculate_hamiltonian_matrix(
                                self.hamiltonian, self.wfs, kpt, root=-1)
            self.update_hamiltonian()
            for k, kpt in enumerate(self.wfs.kpt_u):
                kpt.H0_MM = \
                    self.wfs.eigensolver.calculate_hamiltonian_matrix(
                        self.hamiltonian, self.wfs, kpt, root=-1)
                if self.fxc is not None:
                    kpt.H0_MM += kpt.deltaXC_H_MM
                self.propagate_wfs(kpt.C_nM, kpt.C_nM, kpt.S_MM, kpt.H0_MM,
                                   self.time_step)

            # ----------------------------------------------------------
            # Propagator step
            # ----------------------------------------------------------
            # 1. Calculate H(t+dt)
            self.update_hamiltonian()
            # 2. Estimate H(t+0.5*dt) ~ H(t) + H(t+dT)
            for k, kpt in enumerate(self.wfs.kpt_u):
                kpt.H0_MM *= 0.5
                if self.fxc is not None:
                    # Store this to H0_MM and maybe save one extra H_MM
                    # of memory?
                    kpt.H0_MM += 0.5 * (
                        self.wfs.eigensolver.calculate_hamiltonian_matrix(
                            self.hamiltonian, self.wfs, kpt, root=-1) +
                        kpt.deltaXC_H_MM)
                else:
                    # Store this to H0_MM and maybe save one extra H_MM
                    # of memory?
                    kpt.H0_MM += 0.5 * \
                        self.wfs.eigensolver.calculate_hamiltonian_matrix(
                            self.hamiltonian, self.wfs, kpt, root=-1)
                # 3. Solve Psi(t+dt) from
                # (S_MM - 0.5j*H_MM(t+0.5*dt)*dt) Psi(t+dt) =
                #     (S_MM + 0.5j*H_MM(t+0.5*dt)*dt) Psi(t)
                self.propagate_wfs(kpt.C2_nM, kpt.C_nM, kpt.S_MM, kpt.H0_MM,
                                   self.time_step)
            self.niter += 1
            self.time += self.time_step

            # Call registered callback functions
            self.call_observers(self.niter)

        self.call_observers(self.niter, final=True)
        self.dm_file.close()
        self.timer.stop('Propagate')
class LrDiagonalizeLayout:
    """BLACS layout for the distributed Omega matrix in linear-response
    time-dependent DFT calculations.

    Maintains two BLACS grids: the grid the Casida (Omega) matrix is
    stored on, and a separate grid used only while diagonalizing, plus
    redistributors to move the matrix between the two layouts.
    """

    def __init__(self, sl_lrtddft, nrows, lr_comms):
        """Set up grids, descriptors and redistributors.

        sl_lrtddft: (mprocs, nprocs, block_size) for the
            diagonalization grid.
        nrows: dimension of the (square) Omega matrix.
        lr_comms: communicator bundle with parent_comm, dd_comm and
            eh_comm.
        """
        self.mprocs, self.nprocs, self.block_size = tuple(sl_lrtddft)
        self.lr_comms = lr_comms

        parent = self.lr_comms.parent_comm

        # Original grid, ie, how the matrix is stored: one process
        # dimension per dd_comm rank, the other per eh_comm rank.
        self.matrix_grid = BlacsGrid(parent,
                                     self.lr_comms.dd_comm.size,
                                     self.lr_comms.eh_comm.size)
        # Grid actually used for the diagonalization.
        self.diag_grid = BlacsGrid(parent, self.mprocs, self.nprocs)

        # -------------------------------------------------------------
        # For ScaLAPACK we need the TRANSPOSED MATRIX (and vector), so
        # descriptors are created as (cols, rows); both equal nrows.
        side = nrows
        self.matrix_descr = self.matrix_grid.new_descriptor(side, side, 1, 1)
        self.diag_descr = self.diag_grid.new_descriptor(side, side,
                                                        self.block_size,
                                                        self.block_size)

        # Redistributors: storage layout -> diagonalization layout and
        # back again.
        self.diag_in_redist = Redistributor(parent,
                                            self.matrix_descr,
                                            self.diag_descr)
        self.diag_out_redist = Redistributor(parent,
                                             self.diag_descr,
                                             self.matrix_descr)

    def diagonalize(self, eigenvectors, eigenvalues):
        """Diagonalize the symmetric distributed Casida matrix with
        ScaLAPACK.

        Parameters:

        eigenvectors:
            distributed Casida matrix on input, distributed
            eigenvectors on output (updated in place)
        eigenvalues:
            zero array on input, eigenvalues on output (broadcast to
            every rank of the parent communicator)
        """
        work = self.diag_descr.empty(dtype=float)
        if self.matrix_descr.blacsgrid.is_active():
            stored = eigenvectors
        else:
            # Ranks outside the storage grid join in with an empty
            # local block.
            stored = np.empty((0, 0), dtype=float)

        self.diag_in_redist.redistribute(stored, work)
        self.diag_descr.diagonalize_dc(work.copy(), work, eigenvalues, 'L')
        self.diag_out_redist.redistribute(work, stored)
        self.lr_comms.parent_comm.broadcast(eigenvalues, 0)
# Exercise Redistributor with a general ('G') submatrix copy: a 4x5
# window of a 12x8 source matrix is copied into a 7x7 target matrix
# with different blocking; both are then collected on the master rank
# and the windows compared element by element.
grid = BlacsGrid(world, 2, world.size // 2)
desc = grid.new_descriptor(12, 8, 2, 3)
a = desc.zeros()
a[:] = world.rank
subdesc = grid.new_descriptor(7, 7, 2, 2)
b = subdesc.zeros()

r = Redistributor(grid.comm, desc, subdesc, uplo='G')

# Global offsets of the M x N window: (ia, ja) in the source,
# (ib, jb) in the target.
ia = 3
ja = 2
ib = 1
jb = 1
M = 4
N = 5

r.redistribute(a, b, M, N, ia, ja, ib, jb)

a0 = desc.collect_on_master(a)
b0 = subdesc.collect_on_master(b)

if world.rank == 0:
    # Bug fix: these were Python 2 print statements ("print a0"),
    # which are a SyntaxError under Python 3; the rest of the file
    # uses the print() function.
    print(a0)
    print(b0)
    xa = a0[ia:ia + M, ja:ja + N]
    xb = b0[ib:ib + M, jb:jb + N]
    assert (xa == xb).all()