def mpiverify(data, id):
    """Debugging aid: check that ``data`` agrees on both ranks of a 2-rank run.

    Nothing happens unless ``world.size == 2``.  Rank 0 contributes ``-data``
    and the other rank ``+data`` to ``world.sum``; the summed array is then
    expected to vanish.  If its squared norm exceeds ``1e-10`` the data is
    printed and the function fails with ``assert False``.

    ``id`` is only used as a label in the failure message.
    """
    # Do some debugging when running on two procs XXX REMOVE
    if world.size != 2:
        return

    # Opposite signs on the two ranks, so equal data cancels in the sum.
    # The result of world.sum is read back from ``residual`` itself (the
    # return value is not used).
    residual = data.copy()
    if world.rank == 0:
        residual = -residual
    world.sum(residual)

    err = sum(abs(residual).ravel()**2)
    if not err > 1e-10:
        return

    if world.rank == 0:
        print('Parallel assert failed: ', id, ' norm: ',
              sum(residual.ravel()**2))
    print('Data from proc ', world.rank)
    print(data)
    assert False
def test_addition_theorem(self):
    """Check the addition theorem for complex spherical harmonics.

    For every ``l`` up to ``lmax`` the sum over ``m`` of
    ``4*pi/(2l+1) * Y(l, m, theta, phi) * conj(Y(l, m, theta', phi'))`` is
    compared with ``legendre(l, 0, cos(v))``, where ``cos(v)`` is built from
    the two sets of angles.  The (l, m) pairs come from ``lmiter`` with
    ``comm=world`` and the partial sums are combined with ``world.sum``.
    """
    lmax = 9

    # Test that the complex spherical harmonic addition theorem holds.
    # The random primed angles are passed through world.broadcast(..., 0).
    thetam_L = np.random.uniform(0, np.pi, size=theta_L.shape)
    world.broadcast(thetam_L, 0)
    phim_L = np.random.uniform(0, 2 * np.pi, size=phi_L.shape)
    world.broadcast(phim_L, 0)

    # Cosine of the angle between (theta, phi) and (theta', phi').
    polar_term = np.cos(theta_L) * np.cos(thetam_L)
    azimuthal_term = (np.sin(theta_L) * np.sin(thetam_L)
                      * np.cos(phi_L - phim_L))
    cosv_L = polar_term + azimuthal_term

    # Reference values, one row per l.
    reference_rows = []
    for l in range(lmax + 1):
        reference_rows.append(legendre(l, 0, cosv_L))
    P0_lL = np.array(reference_rows)

    # Accumulate this rank's share of the m-sum, then reduce.
    P_lL = np.zeros_like(P0_lL)
    for l, m in lmiter(lmax, comm=world):
        prefactor = 4 * np.pi / (2 * l + 1.)
        P_lL[l] += prefactor * Y(l, m, theta_L, phi_L) \
            * Y(l, m, thetam_L, phim_L).conj()
    world.sum(P_lL)

    largest_deviation = np.abs(P_lL - P0_lL).max()
    self.assertAlmostEqual(largest_deviation, 0, 6)
def test_addition_theorem(self):
    """Verify the complex spherical harmonic addition theorem up to l = 9.

    The m-summed product ``4*pi/(2l+1) * Y(l, m, ...) * conj(Y(l, m, ...))``
    evaluated at two sets of angles is compared, to 6 decimal places, with
    ``legendre(l, 0, cosv_L)``.  Work is split over ``world`` through
    ``lmiter(lmax, comm=world)`` and recombined with ``world.sum``.
    """
    lmax = 9

    # Second set of angles: drawn at random, then passed through
    # world.broadcast with 0 as the second argument.
    thetam_L = np.random.uniform(0, np.pi, size=theta_L.shape)
    world.broadcast(thetam_L, 0)
    phim_L = np.random.uniform(0, 2 * np.pi, size=phi_L.shape)
    world.broadcast(phim_L, 0)

    # cos(v) = cos(t)cos(t') + sin(t)sin(t')cos(p - p')
    cos_product = np.cos(theta_L) * np.cos(thetam_L)
    sin_product = (np.sin(theta_L) * np.sin(thetam_L)
                   * np.cos(phi_L - phim_L))
    cosv_L = cos_product + sin_product

    expected_rows = []
    for l in range(lmax + 1):
        expected_rows.append(legendre(l, 0, cosv_L))
    P0_lL = np.array(expected_rows)

    P_lL = np.zeros_like(P0_lL)
    for l, m in lmiter(lmax, comm=world):
        weight = 4 * np.pi / (2 * l + 1.)
        P_lL[l] += weight * Y(l, m, theta_L, phi_L) \
            * Y(l, m, thetam_L, phim_L).conj()
    world.sum(P_lL)

    worst = np.abs(P_lL - P0_lL).max()
    self.assertAlmostEqual(worst, 0, 6)
def test_multipole_expansion(self):
    """Check the multipole expansion of 1/|r - r'| in spherical harmonics.

    Outer points have radius in ``[R + dR, 10 R)`` and inner points radius in
    ``[0, R - dR)``.  ``dR`` is chosen so that
    ``((R - dR) / (R + dR))**(lmax + 1)`` equals ``tol``.  The expansion
    truncated at ``lmax`` is compared with the directly evaluated
    ``1/|r - r'|`` to 9 decimal places.
    """
    lmax = 9
    R = 1.0
    npts = 1000
    tol = 1e-9

    # Solve ((R-dR)/(R+dR))**(lmax+1) = tol for dR
    root = tol**(1. / (lmax + 1))
    dR = R * (1 - root) / (1 + root)
    assert abs(((R - dR) / (R + dR))**(lmax + 1) - tol) < 1e-12

    def cartesian(radius_g, polar_g, azimuth_g):
        # Spherical -> Cartesian coordinates, returned with shape (3, npts).
        xyz_vg = np.empty((3, npts), dtype=float)
        xyz_vg[0] = radius_g * np.cos(azimuth_g) * np.sin(polar_g)
        xyz_vg[1] = radius_g * np.sin(azimuth_g) * np.sin(polar_g)
        xyz_vg[2] = radius_g * np.cos(polar_g)
        return xyz_vg

    # Test multipole expansion of 1/|r-r'| in complex spherical harmonics.
    # Every random array is passed through world.broadcast(..., 0); the
    # draw order below is the same as the order of the broadcasts.
    r_g = np.random.uniform(R + dR, 10 * R, size=npts)
    world.broadcast(r_g, 0)
    theta_g = np.random.uniform(0, np.pi, size=npts)
    world.broadcast(theta_g, 0)
    phi_g = np.random.uniform(0, np.pi, size=npts)
    world.broadcast(phi_g, 0)
    r_vg = cartesian(r_g, theta_g, phi_g)

    rm_g = np.random.uniform(0, R - dR, size=npts)
    world.broadcast(rm_g, 0)
    thetam_g = np.random.uniform(0, np.pi, size=npts)
    world.broadcast(thetam_g, 0)
    phim_g = np.random.uniform(0, np.pi, size=npts)
    world.broadcast(phim_g, 0)
    rm_vg = cartesian(rm_g, thetam_g, phim_g)

    # Reference: direct evaluation of 1/|r - r'|.
    f0_g = np.sum((r_vg - rm_vg)**2, axis=0)**(-0.5)

    # Truncated expansion; the (l, m) terms are split over world.
    f_g = np.zeros_like(f0_g)
    for l, m in lmiter(lmax, comm=world):
        prefactor = 4 * np.pi / (2 * l + 1.)
        f_g += prefactor * r_g**(-1) * (rm_g / r_g)**l \
            * Y(l, m, theta_g, phi_g) * Y(l, m, thetam_g, phim_g).conj()
    world.sum(f_g)

    e = np.abs(f_g - f0_g).max()
    self.assertAlmostEqual(e, 0, 9)
def test_multipole_expansion(self):
    """Compare a truncated multipole expansion of 1/|r - r'| with the exact value.

    Points ``r`` lie outside radius ``R + dR`` and points ``r'`` inside radius
    ``R - dR``, where ``dR`` solves
    ``((R - dR) / (R + dR))**(lmax + 1) = tol``.  The expansion in complex
    spherical harmonics up to ``lmax`` must agree with the direct evaluation
    to 9 decimal places.
    """
    lmax = 9
    R = 1.0
    npts = 1000
    tol = 1e-9

    # Solve ((R-dR)/(R+dR))**(lmax+1) = tol for dR
    tol_root = tol**(1. / (lmax + 1))
    dR = R * (1 - tol_root) / (1 + tol_root)
    assert abs(((R - dR) / (R + dR))**(lmax + 1) - tol) < 1e-12

    def to_cartesian(rad_g, pol_g, azi_g):
        # Build the (3, npts) Cartesian array from spherical coordinates.
        out_vg = np.empty((3, npts), dtype=float)
        out_vg[0] = rad_g * np.cos(azi_g) * np.sin(pol_g)
        out_vg[1] = rad_g * np.sin(azi_g) * np.sin(pol_g)
        out_vg[2] = rad_g * np.cos(pol_g)
        return out_vg

    # Test multipole expansion of 1/|r-r'| in complex spherical harmonics
    # Outer points (each array goes through world.broadcast(..., 0)):
    r_g = np.random.uniform(R + dR, 10 * R, size=npts)
    world.broadcast(r_g, 0)
    theta_g = np.random.uniform(0, np.pi, size=npts)
    world.broadcast(theta_g, 0)
    phi_g = np.random.uniform(0, np.pi, size=npts)
    world.broadcast(phi_g, 0)
    r_vg = to_cartesian(r_g, theta_g, phi_g)

    # Inner points:
    rm_g = np.random.uniform(0, R - dR, size=npts)
    world.broadcast(rm_g, 0)
    thetam_g = np.random.uniform(0, np.pi, size=npts)
    world.broadcast(thetam_g, 0)
    phim_g = np.random.uniform(0, np.pi, size=npts)
    world.broadcast(phim_g, 0)
    rm_vg = to_cartesian(rm_g, thetam_g, phim_g)

    # Exact 1/|r - r'|.
    f0_g = np.sum((r_vg - rm_vg)**2, axis=0)**(-0.5)

    # Sum this rank's (l, m) terms, then reduce over world.
    f_g = np.zeros_like(f0_g)
    for l, m in lmiter(lmax, comm=world):
        weight = 4 * np.pi / (2 * l + 1.)
        f_g += weight * r_g**(-1) * (rm_g / r_g)**l \
            * Y(l, m, theta_g, phi_g) * Y(l, m, thetam_g, phim_g).conj()
    world.sum(f_g)

    e = np.abs(f_g - f0_g).max()
    self.assertAlmostEqual(e, 0, 9)
def parallel_eigh(matrixfile, blacsgrid=(4, 2), blocksize=64):
    """Diagonalize the square matrix stored in ``matrixfile`` in parallel.

    The matrix is loaded with ``np.load`` on rank ``MASTER`` only,
    redistributed into ``blocksize`` x ``blocksize`` blocks on a BLACS grid of
    shape ``blacsgrid``, diagonalized with ``diagonalize_ex`` and the
    eigenvectors are redistributed back to the single-block layout.

    The product of ``blacsgrid`` must equal ``world.size``.

    Returns ``(eps_M, C_MM)`` on ``MASTER`` and ``(None, None)`` elsewhere.
    """
    assert np.prod(blacsgrid) == world.size
    grid = BlacsGrid(world, *blacsgrid)
    on_master = world.rank == MASTER

    NM = 0
    if on_master:
        H_MM = np.load(matrixfile)
        assert H_MM.ndim == 2
        assert H_MM.shape[0] == H_MM.shape[1]
        NM = len(H_MM)
    # Only MASTER contributes a non-zero value, so the sum hands the matrix
    # dimension to all ranks.
    NM = world.sum(NM)

    # Descriptor for the individual blocks
    block_desc = grid.new_descriptor(NM, NM, blocksize, blocksize)

    # Descriptor for the global array on MASTER
    local_desc = grid.new_descriptor(NM, NM, NM, NM)

    # The other ranks need a placeholder array for the redistribution.
    if not on_master:
        H_MM = local_desc.zeros()
    assert local_desc.check(H_MM)

    # Block-distributed copy of the matrix
    H_mm = block_desc.empty()
    to_blocks = Redistributor(world, local_desc, block_desc)
    to_blocks.redistribute(H_MM, H_mm)

    # Eigenvalues and block-distributed eigenvectors
    eps_M = np.empty(NM)
    C_mm = block_desc.empty()
    block_desc.diagonalize_ex(H_mm, C_mm, eps_M)

    # Bring the eigenvectors back to the single-block layout.
    C_MM = local_desc.empty()
    to_global = Redistributor(world, block_desc, local_desc)
    to_global.redistribute(C_mm, C_MM)

    # Only MASTER returns the results.
    if not on_master:
        return None, None
    return eps_M, C_MM
def calculate_exx(self):
    """Non-selfconsistent calculation.

    Calls ``self.apply`` for every (k, q) pair of each spin, sums
    ``self.exx`` over ``world``, adds ``calculate_exx_paw_correction()``
    and writes a per-q summary to ``self.txt``.
    """
    def emit(*fields):
        # All report output goes to the text stream of this object.
        print(*fields, file=self.txt)

    kd = self.kd
    K = len(kd.bzk_kc)
    # NOTE: the next two values are computed but not used in this method.
    W = world.size // self.nspins
    parallel = (W > 1)

    self.exx = 0.0
    self.exx_kq = np.zeros((K, len(self.ibzq_qc)), float)

    for s in range(self.nspins):
        ibz_kpts = [KPoint(kd, kpt) for kpt in self.kpt_u if kpt.s == s]
        for ik, kpt in enumerate(kd.bzk_kc):
            emit('K %s %s ...' % (ik, kpt))
            for iq, q in enumerate(self.ibzq_qc):
                kpq = kd.find_k_plus_q(q, kpts_k=[ik])
                ikq = kpq[0]
                kpt1 = ibz_kpts[kd.bz2ibz_k[ik]]
                kpt2 = ibz_kpts[kd.bz2ibz_k[ikq]]
                self.apply(kpt1, kpt2, ik, ikq, iq)

    self.exx = world.sum(self.exx)
    self.exx += self.calculate_exx_paw_correction()

    exx_q = np.sum(self.exx_kq, 0)

    emit()
    emit('------------------------------------------------------')
    emit()
    emit('Contributions: q w E_q (eV)')
    for q, exx in enumerate(exx_q):
        q_c = self.ibzq_qc[q]
        weight = self.q_weights[q] / len(self.bzq_qc)
        energy = exx / self.q_weights[q] * len(self.bzq_qc) * Ha
        emit('[%1.3f %1.3f %1.3f] %1.3f %s'
             % (q_c[0], q_c[1], q_c[2], weight, energy))
    emit('E_EXX = %s eV' % (self.exx * Ha))
    emit()
    emit('Calculation completed at: ', ctime())
    emit()
    emit('------------------------------------------------------')
    emit()
def calculate_exx(self):
    """Non-selfconsistent calculation.

    For every spin, every k-point in ``kd.bzk_kc`` and every q-point in
    ``self.ibzq_qc`` the pair contribution is accumulated through
    ``self.apply``.  ``self.exx`` is then summed over ``world``, the value of
    ``calculate_exx_paw_correction()`` is added, and a per-q breakdown is
    written to ``self.txt``.

    Output uses ``print(..., file=self.txt)``; the Python 2 only
    ``print >>stream`` statement form fails at runtime on Python 3.
    """
    kd = self.kd
    K = len(kd.bzk_kc)

    self.exx = 0.0
    self.exx_kq = np.zeros((K, len(self.ibzq_qc)), float)

    for s in range(self.nspins):
        ibz_kpts = [KPoint(kd, kpt) for kpt in self.kpt_u if kpt.s == s]
        for ik, kpt in enumerate(kd.bzk_kc):
            print("K %s %s ..." % (ik, kpt), file=self.txt)
            for iq, q in enumerate(self.ibzq_qc):
                kpq = kd.find_k_plus_q(q, kpts_k=[ik])
                self.apply(ibz_kpts[kd.bz2ibz_k[ik]],
                           ibz_kpts[kd.bz2ibz_k[kpq[0]]],
                           ik, kpq[0], iq)

    self.exx = world.sum(self.exx)
    self.exx += self.calculate_exx_paw_correction()

    # Per-q contributions: sum over the k axis.
    exx_q = np.sum(self.exx_kq, 0)

    print(file=self.txt)
    print("------------------------------------------------------",
          file=self.txt)
    print(file=self.txt)
    print("Contributions: q w E_q (eV)", file=self.txt)
    for q, exx in enumerate(exx_q):
        print("[%1.3f %1.3f %1.3f] %1.3f %s" % (
            self.ibzq_qc[q][0],
            self.ibzq_qc[q][1],
            self.ibzq_qc[q][2],
            self.q_weights[q] / len(self.bzq_qc),
            exx / self.q_weights[q] * len(self.bzq_qc) * Ha,
        ), file=self.txt)
    print("E_EXX = %s eV" % (self.exx * Ha), file=self.txt)
    print(file=self.txt)
    print("Calculation completed at: ", ctime(), file=self.txt)
    print(file=self.txt)
    print("------------------------------------------------------",
          file=self.txt)
    print(file=self.txt)
def calculate_exx(self):
    """Non-selfconsistent calculation.

    Requires ``self.nspins == 1`` and a number of k-points
    (``self.fullkd.nibzkpts``) that is divisible by ``world.size``; both are
    enforced with asserts.  Each rank builds ``Q = K // world.size`` k-point
    objects from the ground-state data, then pairs its own k-points with the
    k-points that are shifted between ranks, calling ``self.apply`` on every
    pair.  Finally ``self.exx`` and ``self.exx_skn`` are summed over ``world``
    and ``calculate_paw_correction()`` is added to ``self.exx``.
    """
    kd = self.kd
    K = self.fullkd.nibzkpts
    assert self.nspins == 1
    Q = K // world.size
    assert Q * world.size == K
    parallel = (world.size > self.nspins)

    self.exx = 0.0
    self.exx_skn = np.zeros((self.nspins, K, self.bd.nbands))

    # Build the k-points handled by this rank: indices
    # [rank * Q, (rank + 1) * Q) of self.fullkd.ibzk_kc.
    kpt_u = []
    for k in range(world.rank * Q, (world.rank + 1) * Q):
        k_c = self.fullkd.ibzk_kc[k]
        # Find the point k1 in kd.bzk_kc that coincides with k_c.
        # NOTE(review): if nothing matches, k1 is left at the last index
        # without any error being raised.
        for k1, k1_c in enumerate(kd.bzk_kc):
            if abs(k1_c - k_c).max() < 1e-10:
                break

        # Index of symmetry related point in the irreducible BZ
        ik = kd.kibz_k[k1]
        kpt = self.kpt_u[ik]

        # KPoint from ground-state calculation
        phase_cd = np.exp(2j * pi * self.gd.sdisp_cd * k_c[:, np.newaxis])
        kpt2 = KPoint0(kpt.weight, kpt.s, k, None, phase_cd)
        kpt2.psit_nG = np.empty_like(kpt.psit_nG)
        kpt2.f_n = kpt.f_n / kpt.weight / K * 2
        # Fill the wave functions band by band from the transformed
        # ground-state wave functions.
        for n, psit_G in enumerate(kpt2.psit_nG):
            psit_G[:] = kd.transform_wave_function(kpt.psit_nG[n], k1)

        kpt2.P_ani = self.pt.dict(len(kpt.psit_nG))
        self.pt.integrate(kpt2.psit_nG, kpt2.P_ani, k)
        kpt_u.append(kpt2)

    for s in range(self.nspins):
        kpt1_q = [KPoint(self.fullkd, kpt) for kpt in kpt_u if kpt.s == s]
        # Shallow copy: kpt2_q is the list that gets rotated below while
        # kpt1_q stays fixed.
        kpt2_q = kpt1_q[:]

        if len(kpt1_q) == 0:
            # No s-spins on this CPU:
            continue

        # Send rank:
        srank = self.fullkd.get_rank_and_index(s, (kpt1_q[0].k - 1) % K)[0]

        # Receive rank:
        rrank = self.fullkd.get_rank_and_index(s, (kpt1_q[-1].k + 1) % K)[0]

        # Shift k-points K // 2 times:
        for i in range(K // 2 + 1):
            if i < K // 2:
                if parallel:
                    # Start the communication before the computation
                    # below; it is completed by the wait() calls after it.
                    kpt = kpt2_q[-1].next()
                    kpt.start_receiving(rrank)
                    kpt2_q[0].start_sending(srank)
                else:
                    kpt = kpt2_q[0]

            for kpt1, kpt2 in zip(kpt1_q, kpt2_q):
                if 2 * i == K:
                    # Shift of exactly K / 2: apply once per pair, with
                    # the direction chosen by the order of the k indices.
                    self.apply(kpt1, kpt2, invert=(kpt1.k > kpt2.k))
                else:
                    self.apply(kpt1, kpt2)
                    self.apply(kpt1, kpt2, invert=True)

            if i < K // 2:
                if parallel:
                    kpt.wait()
                    kpt2_q[0].wait()
                # Rotate the list: drop the first k-point, append the new
                # one.
                kpt2_q.pop(0)
                kpt2_q.append(kpt)

    self.exx = world.sum(self.exx)
    world.sum(self.exx_skn)
    self.exx += self.calculate_paw_correction()
def main(N=72, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Compare ScaLAPACK results with serial reference results.

    Random ``N`` x ``N`` matrices H0 and S0 are generated from ``seed``.
    On rank 0 they are symmetrized and shifted on the diagonal, and serial
    reference results are computed.  The matrices are then redistributed
    onto an ``mprocs`` x ``nprocs`` BLACS grid with 8 x 8 blocks, and the
    divide-and-conquer eigensolvers, the inverse Cholesky factorization and
    (for ``dtype == complex`` only) the matrix inverse are run.

    The errors are evaluated on rank 0, summed over ``world`` and compared
    with ``tol``.

    NOTE(review): ``rank`` and ``tol`` are not defined in this function;
    presumably they are module-level names -- confirm.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    # Imaginary unit for the complex case, zero otherwise, so the
    # off-diagonal imaginary parts added below vanish for real matrices.
    if (dtype == complex):
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    glob = grid.new_descriptor(N, N, N, N)

    # print globA.asarray()
    # Populate matrices local to master:
    H0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    S0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    C0 = glob.empty(dtype=dtype)
    if rank == 0:
        # Complex case must have real numbers on the diagonal.
        # We make a simple complex Hermitian matrix below.
        H0 = H0 + epsilon * (0.1 * np.tri(N, N, k=-N // nprocs) +
                             0.3 * np.tri(N, N, k=-1))
        S0 = S0 + epsilon * (0.2 * np.tri(N, N, k=-N // nprocs) +
                             0.4 * np.tri(N, N, k=-1))
        # Make matrices symmetric
        rk(1.0, H0.copy(), 0.0, H0)
        rk(1.0, S0.copy(), 0.0, S0)
        # Overlap matrix must be semi-positive definite
        S0 = S0 + 50.0 * np.eye(N, N, 0)
        # Hamiltonian is usually diagonally dominant
        H0 = H0 + 75.0 * np.eye(N, N, 0)
        C0 = S0.copy()
        S0_inv = S0.copy()

    # Local result matrices
    W0 = np.empty((N), dtype=float)
    W0_g = np.empty((N), dtype=float)

    # Calculate eigenvalues / other serial results
    if rank == 0:
        diagonalize(H0.copy(), W0)
        general_diagonalize(H0.copy(), W0_g, S0.copy())
        inverse_cholesky(C0)  # result returned in lower triangle
        tri2full(S0_inv, 'L')
        S0_inv = inv(S0_inv)
        # tri2full(C0) # symmetrize

    assert glob.check(H0) and glob.check(S0) and glob.check(C0)

    # Create distributed descriptors with various block sizes:
    dist = grid.new_descriptor(N, N, 8, 8)

    # Distributed matrices:
    # We can use empty here, but end up with garbage on
    # the other half of the triangle when we redistribute.
    # This is fine because ScaLAPACK does not care.
    H = dist.empty(dtype=dtype)
    S = dist.empty(dtype=dtype)
    Sinv = dist.empty(dtype=dtype)
    Z = dist.empty(dtype=dtype)
    C = dist.empty(dtype=dtype)
    # NOTE(review): Sinv is allocated a second time here; the first
    # allocation above is simply replaced.
    Sinv = dist.empty(dtype=dtype)

    # Eigenvalues are non-BLACS matrices
    W = np.empty((N), dtype=float)
    W_dc = np.empty((N), dtype=float)
    W_mr3 = np.empty((N), dtype=float)
    W_g = np.empty((N), dtype=float)
    W_g_dc = np.empty((N), dtype=float)
    W_g_mr3 = np.empty((N), dtype=float)

    Glob2dist = Redistributor(world, glob, dist)
    Glob2dist.redistribute(H0, H, uplo='L')
    Glob2dist.redistribute(S0, S, uplo='L')
    Glob2dist.redistribute(S0, C, uplo='L')  # C0 was previously overwritten
    Glob2dist.redistribute(S0, Sinv, uplo='L')

    # we don't test the expert drivers anymore since there
    # might be a buffer overflow error
    ## scalapack_diagonalize_ex(dist, H.copy(), Z, W, 'L')
    scalapack_diagonalize_dc(dist, H.copy(), Z, W_dc, 'L')
    ## scalapack_diagonalize_mr3(dist, H.copy(), Z, W_mr3, 'L')
    ## scalapack_general_diagonalize_ex(dist, H.copy(), S.copy(), Z, W_g, 'L')
    scalapack_general_diagonalize_dc(dist, H.copy(), S.copy(), Z, W_g_dc, 'L')
    ## scalapack_general_diagonalize_mr3(dist, H.copy(), S.copy(), Z, W_g_mr3, 'L')

    scalapack_inverse_cholesky(dist, C, 'L')

    if dtype == complex:  # Only supported for complex for now
        scalapack_inverse(dist, Sinv, 'L')

    # Undo redistribute
    C_test = glob.empty(dtype=dtype)
    Sinv_test = glob.empty(dtype=dtype)
    Dist2glob = Redistributor(world, dist, glob)
    Dist2glob.redistribute(C, C_test)
    Dist2glob.redistribute(Sinv, Sinv_test)

    if rank == 0:
        ## diag_ex_err = abs(W - W0).max()
        diag_dc_err = abs(W_dc - W0).max()
        ## diag_mr3_err = abs(W_mr3 - W0).max()
        ## general_diag_ex_err = abs(W_g - W0_g).max()
        general_diag_dc_err = abs(W_g_dc - W0_g).max()
        ## general_diag_mr3_err = abs(W_g_mr3 - W0_g).max()
        inverse_chol_err = abs(C_test - C0).max()

        # inverse_err is computed for every dtype, but it is only printed
        # and asserted in the complex case (the only case in which
        # scalapack_inverse was called above).
        tri2full(Sinv_test, 'L')
        inverse_err = abs(Sinv_test - S0_inv).max()

        ## print 'diagonalize ex err', diag_ex_err
        print('diagonalize dc err', diag_dc_err)
        ## print 'diagonalize mr3 err', diag_mr3_err
        ## print 'general diagonalize ex err', general_diag_ex_err
        print('general diagonalize dc err', general_diag_dc_err)
        ## print 'general diagonalize mr3 err', general_diag_mr3_err
        print('inverse chol err', inverse_chol_err)
        if dtype == complex:
            print('inverse err', inverse_err)
    else:
        ## diag_ex_err = 0.0
        diag_dc_err = 0.0
        ## diag_mr3_err = 0.0
        ## general_diag_ex_err = 0.0
        general_diag_dc_err = 0.0
        ## general_diag_mr3_err = 0.0
        inverse_chol_err = 0.0
        inverse_err = 0.0

    # We don't like exceptions on only one cpu
    # (the other ranks contribute 0.0, so after the sum every rank holds
    # the errors measured on rank 0 and the asserts agree everywhere).
    ## diag_ex_err = world.sum(diag_ex_err)
    diag_dc_err = world.sum(diag_dc_err)
    ## diag_mr3_err = world.sum(diag_mr3_err)
    ## general_diag_ex_err = world.sum(general_diag_ex_err)
    general_diag_dc_err = world.sum(general_diag_dc_err)
    ## general_diag_mr3_err = world.sum(general_diag_mr3_err)
    inverse_chol_err = world.sum(inverse_chol_err)
    inverse_err = world.sum(inverse_err)
    ## assert diag_ex_err < tol
    assert diag_dc_err < tol
    ## assert diag_mr3_err < tol
    ## assert general_diag_ex_err < tol
    assert general_diag_dc_err < tol
    ## assert general_diag_mr3_err < tol
    assert inverse_chol_err < tol
    if dtype == complex:
        assert inverse_err < tol
def parallel_transport(calc, direction=0, spinors=True, name=None, scale=1.0,
                       bands=None, theta=0.0, phi=0.0):
    """Compute phases by parallel transport along one k-space direction.

    For every k-point string perpendicular to ``direction`` the wave
    functions are aligned from point to point using the SVD of their
    overlap matrix.  The phases are the angles of the eigenvalues of the
    overlap between the first point and its transported image.

    Parameters:

    calc: a calculator object, or a str which is passed to ``GPAW(...)``.
    direction: index (0, 1 or 2) of the k-point grid direction along which
        the states are transported.  One of the two remaining grid
        dimensions must be 1 (asserted).
    spinors: if true, the projector overlaps ``dO_ii`` are doubled in size.
    name: inserted into the output file name ``'phases_%s.npz'``.
    scale, theta, phi: forwarded to ``get_spinorbit_eigenvalues``.
    bands: band indices to include; defaults to
        ``range(int(calc.get_number_of_electrons()))``.

    The arrays ``phi_km`` and ``S_km`` are summed over ``world`` and written
    with ``np.savez``; nothing is returned.

    NOTE(review): ``rank`` is not defined in this function; presumably a
    module-level name -- confirm.
    """
    if isinstance(calc, str):
        calc = GPAW(calc, txt=None, communicator=serial_comm)

    if bands is None:
        nv = int(calc.get_number_of_electrons())
        bands = range(nv)

    cell_cv = calc.wfs.gd.cell_cv
    icell_cv = (2 * np.pi) * np.linalg.inv(cell_cv).T
    r_g = calc.wfs.gd.get_grid_point_coordinates()
    # Number of grid points, doubled when spinors is true.
    Ng = np.prod(np.shape(r_g)[1:]) * (spinors + 1)

    dO_aii = []
    for ia in calc.wfs.kpt_u[0].P_ani.keys():
        dO_ii = calc.wfs.setups[ia].dO_ii
        if spinors:
            # Spinor projections require doubling of the (identical) orbitals
            dO_jj = np.zeros((2 * len(dO_ii), 2 * len(dO_ii)), complex)
            dO_jj[::2, ::2] = dO_ii
            dO_jj[1::2, 1::2] = dO_ii
            dO_aii.append(dO_jj)
        else:
            dO_aii.append(dO_ii)

    N_c = calc.wfs.kd.N_c
    assert 1 in np.delete(N_c, direction)
    Nkx = N_c[0]
    Nky = N_c[1]
    Nkz = N_c[2]
    Nk = Nkx * Nky * Nkz
    Nloc = N_c[direction]  # number of k-points along the transport direction
    Npar = Nk // Nloc      # number of independent k-point strings

    # Parallelization stuff
    # -(-a // b) is ceiling division: strings per rank, rounded up.
    myKsize = -(-Npar // (world.size))
    myKrange = range(rank * myKsize, min((rank + 1) * myKsize, Npar))
    myKsize = len(myKrange)

    # Get array of k-point indices of the path. q index is loc direction
    kpts_kq = []
    for k in range(Npar):
        if direction == 0:
            kpts_kq.append(list(range(k, Nkx * Nky, Nky)))
        if direction == 1:
            if Nkz == 1:
                kpts_kq.append(list(range(k * Nky, (k + 1) * Nky)))
            else:
                kpts_kq.append(list(range(k, Nkz * Nky, Nkz)))
        if direction == 2:
            kpts_kq.append(list(range(k * Nloc, (k + 1) * Nloc)))

    # Reciprocal lattice vector along the transport direction.
    G_c = np.array([0, 0, 0])
    G_c[direction] = 1
    G_v = np.dot(G_c, icell_cv)

    kpts_kc = calc.get_bz_k_points()
    kpts_kv = np.dot(kpts_kc, icell_cv)
    # Step between neighbouring k-points of a string; with a single point
    # per string the full vector G_v is used instead.
    if Nloc > 1:
        b_c = kpts_kc[kpts_kq[0][1]] - kpts_kc[kpts_kq[0][0]]
        b_v = np.dot(b_c, icell_cv)
    else:
        b_v = G_v

    e_mk, v_knm = get_spinorbit_eigenvalues(calc, return_wfs=True,
                                            scale=scale,
                                            theta=theta, phi=phi)

    phi_km = np.zeros((Npar, len(bands)), float)
    S_km = np.zeros((Npar, len(bands)), float)
    # Loop over the direction parallel components
    for k in myKrange:
        U_qmm = [np.eye(len(bands))]
        print(k)
        qpts_q = kpts_kq[k]
        # Loop over kpoints in the phase direction
        for q in range(Nloc - 1):
            iq1 = qpts_q[q]
            iq2 = qpts_q[q + 1]
            # print(kpts_kc[iq1], kpts_kc[iq2])
            if q == 0:
                # Only the first point is loaded here; afterwards u1/P1 are
                # taken over from the rotated u2/P2 of the previous step.
                u1_nsG = get_spinorbit_wavefunctions(calc, iq1,
                                                     v_knm[iq1])[bands]
                # Transform from psi-like to u-like
                u1_nsG[:] *= np.exp(-1.0j * gemmdot(kpts_kv[iq1],
                                                    r_g, beta=0.0))
                P1_ani = get_spinorbit_projections(calc, iq1, v_knm[iq1])

            u2_nsG = get_spinorbit_wavefunctions(calc, iq2, v_knm[iq2])[bands]
            u2_nsG[:] *= np.exp(-1.0j * gemmdot(kpts_kv[iq2], r_g, beta=0.0))
            P2_ani = get_spinorbit_projections(calc, iq2, v_knm[iq2])

            M_mm = get_overlap(calc,
                               bands,
                               np.reshape(u1_nsG, (len(u1_nsG), Ng)),
                               np.reshape(u2_nsG, (len(u2_nsG), Ng)),
                               P1_ani,
                               P2_ani,
                               dO_aii,
                               b_v)
            # Rotation built from the SVD factors of the overlap matrix.
            V_mm, sing_m, W_mm = np.linalg.svd(M_mm)
            U_mm = np.dot(V_mm, W_mm).conj()
            # Apply U_mm to the band index of u2; the swapaxes calls move
            # the band axis to the position np.dot contracts and then
            # restore the original axis order.
            u_nysxz = np.dot(U_mm, np.swapaxes(u2_nsG, 0, 3))
            u_nxsyz = np.swapaxes(u_nysxz, 1, 3)
            u_nsxyz = np.swapaxes(u_nxsyz, 1, 2)
            u2_nsG = u_nsxyz
            # Rotate the projections with the same matrix.
            for a in range(len(calc.atoms)):
                P2_ni = P2_ani[a][bands]
                P2_ni = np.dot(U_mm, P2_ni)
                P2_ani[a][bands] = P2_ni
            U_qmm.append(U_mm)
            u1_nsG = u2_nsG
            P1_ani = P2_ani
        U_qmm = np.array(U_qmm)

        # Fix phases for last point
        iq0 = qpts_q[0]
        if Nloc == 1:
            # The loop above did not run, so u1/P1 have to be loaded here.
            u1_nsG = get_spinorbit_wavefunctions(calc, iq0, v_knm[iq0])[bands]
            u1_nsG[:] *= np.exp(-1.0j * gemmdot(kpts_kv[iq0], r_g, beta=0.0))
            P1_ani = get_spinorbit_projections(calc, iq0, v_knm[iq0])
        # Image of the first point, shifted by G_v.
        u2_nsG = get_spinorbit_wavefunctions(calc, iq0, v_knm[iq0])[bands]
        u2_nsG[:] *= np.exp(-1.0j * gemmdot(kpts_kv[iq0], r_g, beta=0.0))
        u2_nsG[:] *= np.exp(-1.0j * gemmdot(G_v, r_g, beta=0.0))
        P2_ani = get_spinorbit_projections(calc, iq0, v_knm[iq0])
        # With the phase factor commented out this loop leaves P2_ani
        # unchanged.
        for a in range(len(calc.atoms)):
            P2_ni = P2_ani[a][bands]
            # P2_ni *= np.exp(-1.0j * np.dot(G_v, r_av[a]))
            P2_ani[a][bands] = P2_ni
        M_mm = get_overlap(calc,
                           bands,
                           np.reshape(u1_nsG, (len(u1_nsG), Ng)),
                           np.reshape(u2_nsG, (len(u2_nsG), Ng)),
                           P1_ani,
                           P2_ani,
                           dO_aii,
                           b_v)
        V_mm, sing_m, W_mm = np.linalg.svd(M_mm)
        U_mm = np.dot(V_mm, W_mm).conj()
        u_nysxz = np.dot(U_mm, np.swapaxes(u2_nsG, 0, 3))
        u_nxsyz = np.swapaxes(u_nysxz, 1, 3)
        u_nsxyz = np.swapaxes(u_nxsyz, 1, 2)
        u2_nsG = u_nsxyz
        for a in range(len(calc.atoms)):
            P2_ni = P2_ani[a][bands]
            P2_ni = np.dot(U_mm, P2_ni)
            P2_ani[a][bands] = P2_ni

        # Get overlap between first kpts and its smoothly translated image
        u2_nsG[:] *= np.exp(1.0j * gemmdot(G_v, r_g, beta=0.0))
        # As above, this loop is a no-op while the phase factor is
        # commented out.
        for a in range(len(calc.atoms)):
            P2_ni = P2_ani[a][bands]
            # P2_ni *= np.exp(1.0j * np.dot(G_v, r_av[a]))
            P2_ani[a][bands] = P2_ni
        u1_nsG = get_spinorbit_wavefunctions(calc, iq0, v_knm[iq0])[bands]
        u1_nsG[:] *= np.exp(-1.0j * gemmdot(kpts_kv[iq0], r_g, beta=0.0))
        P1_ani = get_spinorbit_projections(calc, iq0, v_knm[iq0])
        M_mm = get_overlap(calc,
                           bands,
                           np.reshape(u1_nsG, (len(u1_nsG), Ng)),
                           np.reshape(u2_nsG, (len(u2_nsG), Ng)),
                           P1_ani,
                           P2_ani,
                           dO_aii,
                           np.array([0.0, 0.0, 0.0]))
        # The phases are the angles of the eigenvalues of this overlap.
        l_m, l_mm = np.linalg.eig(M_mm)
        phi_km[k] = np.angle(l_m)
        print(phi_km[k] / 2 / np.pi)

        # Average over the string of (even rows) - (odd rows) of the
        # rotated eigenvector coefficients.
        # NOTE(review): presumably even/odd rows are the two spin
        # components -- confirm.
        A_mm = np.zeros_like(l_mm, complex)
        for q in range(Nloc):
            iq = qpts_q[q]
            U_mm = U_qmm[q]
            v_nm = U_mm.dot(v_knm[iq][:, bands].T).T
            A_mm += np.dot(v_nm[::2].T.conj(), v_nm[::2])
            A_mm -= np.dot(v_nm[1::2].T.conj(), v_nm[1::2])
        A_mm /= Nloc
        S_km[k] = np.diag(l_mm.T.conj().dot(A_mm).dot(l_mm)).real

    # Each rank filled only its own rows; combine them.
    world.sum(phi_km)
    world.sum(S_km)

    np.savez('phases_%s.npz' % name, phi_km=phi_km, S_km=S_km)
def calculate_Kxc(pd, calc, functional='ALDA', density_cut=None):
    """ALDA kernel

    Builds the kernel matrix ``Kxc_sGG`` from two contributions: the
    Fourier transform of the kernel on the regular grid ("soft part") and
    atom-centered corrections ("PAW part"), which are distributed over
    ranks by atom index and summed over ``world``.

    Parameters:

    pd: object providing ``gd``, ``ngmax``, ``get_reciprocal_vectors()``,
        ``Q_qG`` and ``kd.gamma``.
    calc: calculator providing ``density``, ``atoms`` and ``wfs.setups``.
    functional: ``'ALDA_X'`` selects the closed-form exchange-only
        expression; any other value is passed, without its first
        character, to ``XC(...)`` and requires a single spin channel.
    density_cut: if not None, the soft kernel is set to zero where
        ``nt_sG * len(nt_sG) < density_cut``.

    Returns ``Kxc_sGG / vol``.

    NOTE(review): ``rank``, ``size``, ``weight_n`` and ``R_nv`` are not
    defined in this function; presumably module-level names -- confirm.
    """
    gd = pd.gd
    npw = pd.ngmax
    nG = pd.gd.N_c
    vol = pd.gd.volume
    G_Gv = pd.get_reciprocal_vectors()
    nt_sG = calc.density.nt_sG
    R_av = calc.atoms.positions / Bohr
    setups = calc.wfs.setups
    D_asp = calc.density.D_asp

    # The soft part
    # assert np.abs(nt_sG[0].shape - nG).sum() == 0
    if functional == 'ALDA_X':
        x_only = True
        A_x = -3. / 4. * (3. / np.pi)**(1. / 3.)
        nspins = len(nt_sG)
        assert nspins in [1, 2]
        fxc_sg = nspins**(1. / 3.) * 4. / 9. * A_x * nt_sG**(-2. / 3.)
    else:
        assert len(nt_sG) == 1
        x_only = False
        fxc_sg = np.zeros_like(nt_sG)
        xc = XC(functional[1:])
        xc.calculate_fxc(gd, nt_sG, fxc_sg)

    if density_cut is not None:
        fxc_sg[np.where(nt_sG * len(nt_sG) < density_cut)] = 0.0

    # FFT fxc(r)
    nG0 = nG[0] * nG[1] * nG[2]
    tmp_sg = [np.fft.fftn(fxc_sg[s]) * vol / nG0 for s in range(len(nt_sG))]

    Kxc_sGG = np.zeros((len(fxc_sg), npw, npw), dtype=complex)
    for s in range(len(fxc_sg)):
        for iG, iQ in enumerate(pd.Q_qG[0]):
            # Map the flat FFT index to signed integer components.
            iQ_c = (np.unravel_index(iQ, nG) + nG // 2) % nG - nG // 2
            for jG, jQ in enumerate(pd.Q_qG[0]):
                jQ_c = (np.unravel_index(jQ, nG) + nG // 2) % nG - nG // 2
                ijQ_c = (iQ_c - jQ_c)
                # Differences outside the FFT grid keep the initial zero.
                if (abs(ijQ_c) < nG // 2).all():
                    Kxc_sGG[s, iG, jG] = tmp_sg[s][tuple(ijQ_c)]

    # The PAW part
    KxcPAW_sGG = np.zeros_like(Kxc_sGG)
    # Pairwise differences G - G' of the reciprocal vectors.
    dG_GGv = np.zeros((npw, npw, 3))
    for v in range(3):
        dG_GGv[:, :, v] = np.subtract.outer(G_Gv[:, v], G_Gv[:, v])

    for a, setup in enumerate(setups):
        # Atoms are handled round-robin by the ranks.
        if rank == a % size:
            rgd = setup.xc_correction.rgd
            n_qg = setup.xc_correction.n_qg
            nt_qg = setup.xc_correction.nt_qg
            nc_g = setup.xc_correction.nc_g
            nct_g = setup.xc_correction.nct_g
            Y_nL = setup.xc_correction.Y_nL
            dv_g = rgd.dv_g

            D_sp = D_asp[a]
            B_pqL = setup.xc_correction.B_pqL
            D_sLq = np.inner(D_sp, B_pqL.T)
            nspins = len(D_sp)

            f_sg = rgd.empty(nspins)
            ft_sg = rgd.empty(nspins)

            n_sLg = np.dot(D_sLq, n_qg)
            nt_sLg = np.dot(D_sLq, nt_qg)

            # Add core density
            n_sLg[:, 0] += np.sqrt(4. * np.pi) / nspins * nc_g
            nt_sLg[:, 0] += np.sqrt(4. * np.pi) / nspins * nct_g

            # Phase factor exp(-i (G - G') . R_a) for this atom.
            coefatoms_GG = np.exp(-1j * np.inner(dG_GGv, R_av[a]))

            for n, Y_L in enumerate(Y_nL):
                w = weight_n[n]
                f_sg[:] = 0.0
                n_sg = np.dot(Y_L, n_sLg)
                if x_only:
                    f_sg = nspins * (4 / 9.) * A_x * (nspins * n_sg)**(-2 / 3.)
                else:
                    xc.calculate_fxc(rgd, n_sg, f_sg)

                ft_sg[:] = 0.0
                nt_sg = np.dot(Y_L, nt_sLg)
                if x_only:
                    ft_sg = nspins * (4 / 9.) * (A_x
                                                 * (nspins * nt_sg)**(-2 / 3.))
                else:
                    xc.calculate_fxc(rgd, nt_sg, ft_sg)

                # Accumulate the difference f - ft point by point on the
                # radial grid.
                for i in range(len(rgd.r_g)):
                    coef_GG = np.exp(-1j * np.inner(dG_GGv, R_nv[n])
                                     * rgd.r_g[i])
                    for s in range(len(f_sg)):
                        KxcPAW_sGG[s] += w * np.dot(coef_GG,
                                                    (f_sg[s, i] -
                                                     ft_sg[s, i])
                                                    * dv_g[i]) * coefatoms_GG

    world.sum(KxcPAW_sGG)
    Kxc_sGG += KxcPAW_sGG

    # When pd.kd.gamma is true, the first row and column are zeroed.
    if pd.kd.gamma:
        Kxc_sGG[:, 0, :] = 0.0
        Kxc_sGG[:, :, 0] = 0.0

    return Kxc_sGG / vol
def calculate_Kxc(gd, nt_sG, npw, Gvec_Gc, nG, vol, bcell_cv, R_av, setups,
                  D_asp):
    """LDA kernel

    Builds the ``npw`` x ``npw`` kernel matrix from the Fourier transform of
    the kernel on the regular grid ("soft part") plus atom-centered
    corrections ("PAW part").  The corrections are distributed over ranks
    by atom index and summed over ``world``.  Only a single spin channel is
    supported (asserted for the PAW part; only ``fxc_sg[0]`` is used for
    the soft part).

    Returns ``Kxc_GG / vol``.

    NOTE(review): ``rank``, ``size``, ``weight_n``, ``R_nv``, ``sqrt`` and
    ``pi`` are not defined in this function; presumably module-level names
    -- confirm.
    """
    # The soft part
    assert np.abs(nt_sG[0].shape - nG).sum() == 0

    xc = XC('LDA')

    fxc_sg = np.zeros_like(nt_sG)
    xc.calculate_fxc(gd, nt_sG, fxc_sg)
    fxc_g = fxc_sg[0]

    # FFT fxc(r)
    nG0 = nG[0] * nG[1] * nG[2]
    tmp_g = np.fft.fftn(fxc_g) * vol / nG0

    r_vg = gd.get_grid_point_coordinates()
    Kxc_GG = np.zeros((npw, npw), dtype=complex)
    for iG in range(npw):
        for jG in range(npw):
            dG_c = Gvec_Gc[iG] - Gvec_Gc[jG]
            if (nG / 2 - np.abs(dG_c) > 0).all():
                # Difference lies inside the FFT grid: look it up, wrapping
                # negative components.
                index = (dG_c + nG) % nG
                Kxc_GG[iG, jG] = tmp_g[index[0], index[1], index[2]]
            else:  # not in the fft index
                # Fall back to an explicit integral over the grid.
                dG_v = np.dot(dG_c, bcell_cv)
                dGr_g = gemmdot(dG_v, r_vg, beta=0.0)
                Kxc_GG[iG, jG] = gd.integrate(np.exp(-1j * dGr_g) * fxc_g)

    KxcPAW_GG = np.zeros_like(Kxc_GG)
    # The PAW part
    # Pairwise differences G - G' transformed with bcell_cv.
    dG_GGv = np.zeros((npw, npw, 3))
    for iG in range(npw):
        for jG in range(npw):
            dG_c = Gvec_Gc[iG] - Gvec_Gc[jG]
            dG_GGv[iG, jG] = np.dot(dG_c, bcell_cv)

    for a, setup in enumerate(setups):
        # Atoms are handled round-robin by the ranks.
        if rank == a % size:
            rgd = setup.xc_correction.rgd
            n_qg = setup.xc_correction.n_qg
            nt_qg = setup.xc_correction.nt_qg
            nc_g = setup.xc_correction.nc_g
            nct_g = setup.xc_correction.nct_g
            Y_nL = setup.xc_correction.Y_nL
            dv_g = rgd.dv_g

            D_sp = D_asp[a]
            B_pqL = setup.xc_correction.B_pqL
            D_sLq = np.inner(D_sp, B_pqL.T)
            nspins = len(D_sp)
            assert nspins == 1

            f_sg = rgd.empty(nspins)
            ft_sg = rgd.empty(nspins)

            n_sLg = np.dot(D_sLq, n_qg)
            nt_sLg = np.dot(D_sLq, nt_qg)

            # Add core density
            n_sLg[:, 0] += sqrt(4 * pi) / nspins * nc_g
            nt_sLg[:, 0] += sqrt(4 * pi) / nspins * nct_g

            # Phase factor exp(-i (G - G') . R_a) for this atom.
            coefatoms_GG = np.exp(-1j * np.inner(dG_GGv, R_av[a]))

            for n, Y_L in enumerate(Y_nL):
                w = weight_n[n]
                f_sg[:] = 0.0
                n_sg = np.dot(Y_L, n_sLg)
                xc.calculate_fxc(rgd, n_sg, f_sg)

                ft_sg[:] = 0.0
                nt_sg = np.dot(Y_L, nt_sLg)
                xc.calculate_fxc(rgd, nt_sg, ft_sg)

                # Phase factors for all (G, G') pairs and radial points at
                # once; the dot product below sums over the radial grid.
                coef_GGg = np.exp(
                    -1j * np.outer(np.inner(dG_GGv, R_nv[n]), rgd.r_g)).reshape(
                        npw, npw, rgd.ng)
                KxcPAW_GG += w * np.dot(
                    coef_GGg, (f_sg[0] - ft_sg[0]) * dv_g) * coefatoms_GG

    world.sum(KxcPAW_GG)
    Kxc_GG += KxcPAW_GG

    return Kxc_GG / vol
"""Compare a rank-0-only plane-wave calculation of H with a parallel one."""
from ase.structure import molecule
from gpaw import GPAW
from gpaw.wavefunctions.pw import PW
from gpaw.mpi import world

atoms = molecule('H', pbc=1)
atoms.center(vacuum=2)

# Reference energy: computed on rank 0 only, on a communicator that
# contains just that rank.  The other ranks contribute 0.0 to the sum, so
# afterwards every rank holds the reference value.
rank0_comm = world.new_communicator([0])
reference_energy = 0.0
if world.rank == 0:
    atoms.calc = GPAW(mode=PW(250), communicator=rank0_comm, txt=None)
    reference_energy = atoms.get_potential_energy()
reference_energy = world.sum(reference_energy)

# Same system again with the default communicator; the log file is named
# after the number of ranks.
atoms.calc = GPAW(mode=PW(250),
                  eigensolver='rmm-diis',
                  basis='szp(dzp)',
                  txt='%d.txt' % world.size)
energy = atoms.get_potential_energy()
forces = atoms.get_forces()

energy_error = abs(energy - reference_energy)
assert energy_error < 7e-5, energy_error
largest_force = abs(forces).max()
assert largest_force < 1e-10, largest_force
if Fref is not None: Ferr = np.abs(F - Fref).max() assert Ferr < 1e-6, 'Bad F: err=%f; parallel=%s' % (Ferr, parallel) return E, F # First calculate reference energy and forces E and F # # If we want to really dumb things down, enable this to force an # entirely serial calculation: if 0: serial = world.new_communicator([0]) E = 0.0 F = np.zeros((len(system), 3)) if world.rank == 0: E, F = calculate({}, serial) E = world.sum(E) world.sum(F) else: # Normally we'll just do it in parallel; # that case is covered well by other tests, so we can probably trust it E, F = calculate({}, world) def check(parallel): return calculate(parallel, comm=world, Eref=E, Fref=F) assert world.size in [1, 2, 4, 8], ('Number of CPUs %d not supported' % world.size) parallel = dict(domain=1, band=1) sl_cpus = world.size if world.size % 2 == 0:
def calculate(self):
    """Build, save and return this rank's rows of the Hamiltonian ``H_sS``.

    Steps:

    1. Obtain ``phi_aGp``: computed directly in ``'RPA'`` mode, otherwise
       read from (or first written to) the file ``'phi_qaGp'``.
    2. Take the diagonal of ``self.V_qGG`` at the relevant q-point as the
       bare kernel ``kc_G``.
    3. In ``'BSE'`` mode, read the screened interaction ``W_qGG`` from
       ``self.kernel_file + '.pckl'``; if that fails for any reason it is
       calculated from scratch.  In other modes ``W_qGG`` is ``self.V_qGG``.
    4. Fill the kernel ``K_SS`` for the pair indices
       ``[self.nS_start, self.nS_end)`` owned by this rank.
    5. Assemble ``H_sS``, symmetrize it unless ``self.coupling`` is set,
       save it with ``self.par_save`` and return it.

    Compared with the earlier version, the kernel file is opened in binary
    mode inside a ``with`` block (``pickle.load`` requires a binary file on
    Python 3, and the handle was never closed), and the two bare
    ``except:`` clauses no longer swallow ``KeyboardInterrupt`` or
    ``SystemExit``.
    """
    calc = self.calc
    focc_S = self.focc_S
    e_S = self.e_S
    op_scc = calc.wfs.kd.symmetry.op_scc

    # Get phi_qaGp
    if self.mode == 'RPA':
        self.phi_aGp = self.get_phi_aGp()
    else:
        fd = opencew('phi_qaGp')
        if fd is None:
            self.reader = Reader('phi_qaGp')
            tmp = self.load_phi_aGp(self.reader, 0)[0]
            assert len(tmp) == self.npw
            self.printtxt('Finished reading phi_aGp')
        else:
            self.printtxt('Calculating phi_qaGp')
            self.get_phi_qaGp()
            world.barrier()
            self.reader = Reader('phi_qaGp')
        self.printtxt('Memory used %f M' % (maxrss() / 1024.**2))
        self.printtxt('')

    # Index of the q-point whose bare kernel is used.
    if self.optical_limit:
        iq = np.where(np.sum(abs(self.ibzq_qc), axis=1) < 1e-5)[0][0]
    else:
        iq = np.where(
            np.sum(abs(self.ibzq_qc - self.q_c), axis=1) < 1e-5)[0][0]
    kc_G = np.array([self.V_qGG[iq, iG, iG] for iG in range(self.npw)])
    if self.optical_limit:
        kc_G[0] = 0.

    # Get screened Coulomb kernel
    if self.mode == 'BSE':
        try:
            # Read.  NOTE: pickle must only be used on trusted files;
            # loading a pickle can execute arbitrary code.
            with open(self.kernel_file + '.pckl', 'rb') as kernel_fd:
                data = pickle.load(kernel_fd)
            W_qGG = data['W_qGG']
            assert np.shape(W_qGG) == np.shape(self.V_qGG)
            self.printtxt('Finished reading screening interaction kernel')
        except Exception:
            # Deliberate best effort: any failure to read a usable kernel
            # (missing file, bad content, wrong shape) means it is
            # calculated from scratch.
            self.printtxt('Calculating screening interaction kernel.')
            W_qGG = self.full_static_screened_interaction()
        self.printtxt('')
    else:
        W_qGG = self.V_qGG

    t0 = time()
    self.printtxt('Calculating %s matrix elements' % self.mode)

    # Calculate full kernel
    K_SS = np.zeros((self.nS_local, self.nS), dtype=complex)
    self.rhoG0_S = np.zeros(self.nS, dtype=complex)

    for iS in range(self.nS_start, self.nS_end):
        row = iS - self.nS_start  # local row index on this rank
        k1, n1, m1 = self.Sindex_S3[iS]
        rho1_G = self.density_matrix(n1, m1, k1)
        self.rhoG0_S[iS] = rho1_G[0]
        for jS in range(self.nS):
            k2, n2, m2 = self.Sindex_S3[jS]
            rho2_G = self.density_matrix(n2, m2, k2)
            K_SS[row, jS] = np.sum(rho1_G.conj() * rho2_G * kc_G)

            if not self.mode == 'RPA':
                rho3_G = self.density_matrix(n1, n2, k1, k2)
                rho4_G = self.density_matrix(m1, m2, self.kq_k[k1],
                                             self.kq_k[k2])

                # Bring the k-point difference back into the first zone.
                q_c = self.kd.bzk_kc[k2] - self.kd.bzk_kc[k1]
                q_c[np.where(q_c > 0.501)] -= 1.
                q_c[np.where(q_c < -0.499)] += 1.
                iq = self.kd.where_is_q(q_c, self.bzq_qc)

                if not self.qsymm:
                    W_GG = W_qGG[iq]
                else:
                    # W is only stored for irreducible q-points; map the
                    # G-vectors through the symmetry operation.
                    ibzq = self.ibzq_q[iq]
                    W_GG_tmp = W_qGG[ibzq]

                    iop = self.iop_q[iq]
                    timerev = self.timerev_q[iq]
                    diff_c = self.diff_qc[iq]
                    invop = np.linalg.inv(op_scc[iop])
                    Gindex = np.zeros(self.npw, dtype=int)
                    for iG in range(self.npw):
                        G_c = self.Gvec_Gc[iG]
                        if timerev:
                            RotG_c = -np.int8(
                                np.dot(invop, G_c + diff_c).round())
                        else:
                            RotG_c = np.int8(
                                np.dot(invop, G_c + diff_c).round())
                        tmp_G = np.abs(self.Gvec_Gc - RotG_c).sum(axis=1)
                        # -1 marks a rotated vector that is not in
                        # Gvec_Gc.
                        matches = np.where(tmp_G < 1e-5)[0]
                        if len(matches) > 0:
                            Gindex[iG] = matches[0]
                        else:
                            Gindex[iG] = -1

                    W_GG = np.zeros_like(W_GG_tmp)
                    for iG in range(self.npw):
                        for jG in range(self.npw):
                            if Gindex[iG] == -1 or Gindex[jG] == -1:
                                W_GG[iG, jG] = 0
                            else:
                                W_GG[iG, jG] = W_GG_tmp[Gindex[iG],
                                                        Gindex[jG]]

                if self.mode == 'BSE':
                    tmp_GG = np.outer(rho3_G.conj(), rho4_G) * W_GG
                    K_SS[row, jS] -= 0.5 * np.sum(tmp_GG)
                else:
                    # Other modes only use the diagonal of W.
                    tmp_G = rho3_G.conj() * rho4_G * np.diag(W_GG)
                    K_SS[row, jS] -= 0.5 * np.sum(tmp_G)
        self.timing(iS, t0, self.nS_local, 'pair orbital')

    K_SS /= self.vol

    # Each rank filled only its own entries of rhoG0_S; combine them.
    world.sum(self.rhoG0_S)

    # Get and solve Hamiltonian
    H_sS = np.zeros_like(K_SS)
    for iS in range(self.nS_start, self.nS_end):
        row = iS - self.nS_start
        H_sS[row, iS] = e_S[iS]
        for jS in range(self.nS):
            H_sS[row, jS] += focc_S[iS] * K_SS[row, jS]

    # Force matrix to be Hermitian
    if not self.coupling:
        if world.size > 1:
            H_Ss = self.redistribute_H(H_sS)
        else:
            H_Ss = H_sS
        H_sS = (np.real(H_sS) + np.real(H_Ss.T)) / 2. + 1j * (
            np.imag(H_sS) - np.imag(H_Ss.T)) / 2.

    # Save H_sS matrix
    self.par_save('H_SS', 'H_SS', H_sS)

    return H_sS
def main(N=73, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check ScaLAPACK eigensolvers and inverse Cholesky against LAPACK.

    Random test matrices are built in a single-block ("global")
    descriptor, reference results are computed on rank 0 with the serial
    routines, the matrices are redistributed to an 8x8-blocked descriptor,
    and the divide-and-conquer ScaLAPACK drivers plus the inverse Cholesky
    routine are run on the distributed data.

    Args:
        N: Dimension of the square test matrices.
        seed: Seed for the ``numpy.random.RandomState`` generator.
        mprocs, nprocs: Shape of the BLACS processor grid.
        dtype: ``float`` or ``complex``.

    Raises:
        AssertionError: if a descriptor check fails or an error exceeds
            the module-level tolerance ``tol``.

    Uses the module-level names ``world``, ``rank`` and ``tol``.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    glob = grid.new_descriptor(N, N, N, N)

    # Populate matrices local to master:
    H0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    S0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    C0 = glob.empty(dtype=dtype)
    if rank == 0:
        # Complex case must have real numbers on the diagonal.
        # We make a simple complex Hermitian matrix below.
        H0 = H0 + epsilon * (0.1 * np.tri(N, N, k=-N // nprocs) +
                             0.3 * np.tri(N, N, k=-1))
        S0 = S0 + epsilon * (0.2 * np.tri(N, N, k=-N // nprocs) +
                             0.4 * np.tri(N, N, k=-1))
        # Make matrices symmetric
        rk(1.0, H0.copy(), 0.0, H0)
        rk(1.0, S0.copy(), 0.0, S0)
        # Overlap matrix must be semi-positive definite
        S0 = S0 + 50.0 * np.eye(N, N, 0)
        # Hamiltonian is usually diagonally dominant
        H0 = H0 + 75.0 * np.eye(N, N, 0)
        C0 = S0.copy()

    # Local result matrices
    W0 = np.empty((N), dtype=float)
    W0_g = np.empty((N), dtype=float)

    # Calculate eigenvalues
    if rank == 0:
        diagonalize(H0.copy(), W0)
        general_diagonalize(H0.copy(), W0_g, S0.copy())
        inverse_cholesky(C0)  # result returned in lower triangle

    assert glob.check(H0) and glob.check(S0) and glob.check(C0)

    # Create distributed descriptors with various block sizes:
    dist = grid.new_descriptor(N, N, 8, 8)

    # Distributed matrices:
    # We can use empty here, but end up with garbage on
    # the other half of the triangle when we redistribute.
    # This is fine because ScaLAPACK does not care.
    H = dist.empty(dtype=dtype)
    S = dist.empty(dtype=dtype)
    Z = dist.empty(dtype=dtype)
    C = dist.empty(dtype=dtype)

    # Eigenvalues are non-BLACS matrices
    W_dc = np.empty((N), dtype=float)
    W_g_dc = np.empty((N), dtype=float)

    Glob2dist = Redistributor(world, glob, dist)
    Glob2dist.redistribute(H0, H, uplo='L')
    Glob2dist.redistribute(S0, S, uplo='L')
    Glob2dist.redistribute(S0, C, uplo='L')  # C0 was previously overwritten

    # Only the divide-and-conquer drivers are exercised: the expert ("ex")
    # and MRRR ("mr3") drivers are not tested anymore since there might be
    # a buffer overflow error.
    scalapack_diagonalize_dc(dist, H.copy(), Z, W_dc, 'L')
    scalapack_general_diagonalize_dc(dist, H.copy(), S.copy(), Z, W_g_dc,
                                     'L')
    scalapack_inverse_cholesky(dist, C, 'L')

    # Undo redistribute
    C_test = glob.empty(dtype=dtype)
    Dist2glob = Redistributor(world, dist, glob)
    Dist2glob.redistribute(C, C_test)

    if rank == 0:
        diag_dc_err = abs(W_dc - W0).max()
        general_diag_dc_err = abs(W_g_dc - W0_g).max()
        inverse_chol_err = abs(C_test - C0).max()
        print('diagonalize dc err', diag_dc_err)
        print('general diagonalize dc err', general_diag_dc_err)
        print('inverse chol err', inverse_chol_err)
    else:
        diag_dc_err = 0.0
        general_diag_dc_err = 0.0
        inverse_chol_err = 0.0

    # We don't like exceptions on only one cpu
    diag_dc_err = world.sum(diag_dc_err)
    general_diag_dc_err = world.sum(general_diag_dc_err)
    inverse_chol_err = world.sum(inverse_chol_err)
    assert diag_dc_err < tol
    assert general_diag_dc_err < tol
    assert inverse_chol_err < tol
from gpaw.mpi import world
from gpaw.utilities.dscftools import mpi_debug

# Split N consecutive integers evenly over the W ranks of the world
# communicator; every rank owns M of them.
W = world.size
N = 32
assert N % W == 0
M = N // W

# Create my share of data
data = np.arange(world.rank * M, (world.rank + 1) * M)

# Let's calculate the global sum
slocal = data.sum()
s = world.sum(slocal)
mpi_debug('data: %s, slocal=%d, s=%d' % (data, slocal, s))
assert s == N * (N - 1) // 2

# Subtract the global mean.  `data` is an integer array, so the mean is
# floored explicitly: under true division `s / N` is a float, and NumPy
# refuses to cast it back into the integer array in place.
data -= int(s) // N
mpi_debug('data: %s' % data)

# -------------------------------------------------------------------

if world.rank == 0:
    print('-' * 16)

# Who has global index 11? The master needs it!
i = 11
rank, ilocal = divmod(i, M)
def calculate(self):
    """Assemble this rank's rows of the two-particle Hamiltonian.

    Steps, in order:

    1. Make the ``phi_qaGp`` data available: computed directly in 'RPA'
       mode, otherwise read from the ``phi_qaGp`` file, which is generated
       first if reading fails.
    2. Obtain the interaction ``W_qGG``: ``self.V_qGG`` unless the mode is
       'BSE', in which case it is read from ``self.kernel_file + '.pckl'``
       or, if that fails, recomputed with
       ``self.full_static_screened_interaction()``.
    3. Fill the kernel ``K_SS`` for the pair-orbital rows
       ``self.nS_start .. self.nS_end - 1``.
    4. Put ``self.e_S`` on the diagonal, weight the kernel rows with
       ``self.focc_S`` and, when ``self.coupling`` is false, symmetrise the
       matrix so that it is Hermitian.

    Side effects: sets ``self.rhoG0_S`` (summed over ``world``), sets
    ``self.phi_aGp`` or ``self.reader``, and stores the matrix through
    ``self.par_save('H_SS', 'H_SS', H_sS)``.

    Returns:
        The local Hamiltonian block ``H_sS``; it is allocated with shape
        ``(self.nS_local, self.nS)`` and complex dtype.
    """
    calc = self.calc
    focc_S = self.focc_S
    e_S = self.e_S
    op_scc = calc.wfs.symmetry.op_scc

    # Get phi_qaGp
    if self.mode == 'RPA':
        self.phi_aGp = self.get_phi_aGp()
    else:
        try:
            self.reader = Reader('phi_qaGp')
            tmp = self.load_phi_aGp(self.reader, 0)[0]
            assert len(tmp) == self.npw
            self.printtxt('Finished reading phi_aGp')
        except Exception:
            # Missing or unusable file: regenerate it.  Unlike a bare
            # ``except:`` this lets KeyboardInterrupt/SystemExit through.
            self.printtxt('Calculating phi_qaGp')
            self.get_phi_qaGp()
            world.barrier()
            self.reader = Reader('phi_qaGp')
        self.printtxt('Memory used %f M' % (maxrss() / 1024.**2))
        self.printtxt('')

    # Locate the row of self.ibzq_qc that matches the requested q-point
    # (the zero vector in the optical limit).
    if self.optical_limit:
        iq = np.where(np.sum(abs(self.ibzq_qc), axis=1) < 1e-5)[0][0]
    else:
        iq = np.where(
            np.sum(abs(self.ibzq_qc - self.q_c), axis=1) < 1e-5)[0][0]

    # Diagonal of V_qGG at that q-point; the G=0 element is dropped in the
    # optical limit.
    kc_G = np.array([self.V_qGG[iq, iG, iG] for iG in range(self.npw)])
    if self.optical_limit:
        kc_G[0] = 0.

    # Get screened Coulomb kernel
    if self.mode == 'BSE':
        try:
            # Read.  The file is closed deterministically and opened in
            # binary mode, which pickle requires on Python 3.
            # NOTE: pickle must only be used on trusted files.
            with open(self.kernel_file + '.pckl', 'rb') as kernel_fd:
                data = pickle.load(kernel_fd)
            W_qGG = data['W_qGG']
            assert np.shape(W_qGG) == np.shape(self.V_qGG)
            self.printtxt('Finished reading screening interaction kernel')
        except Exception:
            # Calculate from scratch
            self.printtxt('Calculating screening interaction kernel.')
            W_qGG = self.full_static_screened_interaction()
        self.printtxt('')
    else:
        W_qGG = self.V_qGG

    t0 = time()
    self.printtxt('Calculating %s matrix elements' % self.mode)

    # Calculate full kernel
    K_SS = np.zeros((self.nS_local, self.nS), dtype=complex)
    self.rhoG0_S = np.zeros(self.nS, dtype=complex)

    for iS in range(self.nS_start, self.nS_end):
        row = iS - self.nS_start
        k1, n1, m1 = self.Sindex_S3[iS]
        rho1_G = self.density_matrix(n1, m1, k1)
        self.rhoG0_S[iS] = rho1_G[0]
        for jS in range(self.nS):
            k2, n2, m2 = self.Sindex_S3[jS]
            rho2_G = self.density_matrix(n2, m2, k2)
            K_SS[row, jS] = np.sum(rho1_G.conj() * rho2_G * kc_G)

            if not self.mode == 'RPA':
                rho3_G = self.density_matrix(n1, n2, k1, k2)
                rho4_G = self.density_matrix(m1, m2, self.kq_k[k1],
                                             self.kq_k[k2])

                # Fold the momentum transfer back before looking it up.
                q_c = self.kd.bzk_kc[k2] - self.kd.bzk_kc[k1]
                q_c[np.where(q_c > 0.501)] -= 1.
                q_c[np.where(q_c < -0.499)] += 1.
                iq = self.kd.where_is_q(q_c, self.bzq_qc)

                if not self.qsymm:
                    W_GG = W_qGG[iq]
                else:
                    ibzq = self.ibzq_q[iq]
                    W_GG_tmp = W_qGG[ibzq]

                    iop = self.iop_q[iq]
                    timerev = self.timerev_q[iq]
                    diff_c = self.diff_qc[iq]
                    invop = np.linalg.inv(op_scc[iop])

                    # Gindex[iG] is the position of the rotated G vector
                    # in self.Gvec_Gc, or -1 when it is not in the basis.
                    Gindex = np.zeros(self.npw, dtype=int)
                    for iG in range(self.npw):
                        G_c = self.Gvec_Gc[iG]
                        if timerev:
                            RotG_c = -np.int8(
                                np.dot(invop, G_c + diff_c).round())
                        else:
                            RotG_c = np.int8(
                                np.dot(invop, G_c + diff_c).round())
                        tmp_G = np.abs(self.Gvec_Gc - RotG_c).sum(axis=1)
                        hits = np.where(tmp_G < 1e-5)[0]
                        Gindex[iG] = hits[0] if len(hits) else -1

                    # Rows/columns of unmapped G vectors stay zero; the
                    # rest is gathered in one vectorised assignment.
                    found = np.flatnonzero(Gindex != -1)
                    src = Gindex[found]
                    W_GG = np.zeros_like(W_GG_tmp)
                    W_GG[np.ix_(found, found)] = W_GG_tmp[np.ix_(src, src)]

                if self.mode == 'BSE':
                    tmp_GG = np.outer(rho3_G.conj(), rho4_G) * W_GG
                    K_SS[row, jS] -= 0.5 * np.sum(tmp_GG)
                else:
                    tmp_G = rho3_G.conj() * rho4_G * np.diag(W_GG)
                    K_SS[row, jS] -= 0.5 * np.sum(tmp_G)
        self.timing(iS, t0, self.nS_local, 'pair orbital')

    K_SS /= self.vol
    world.sum(self.rhoG0_S)

    # Get and solve Hamiltonian
    H_sS = np.zeros_like(K_SS)
    for iS in range(self.nS_start, self.nS_end):
        row = iS - self.nS_start
        H_sS[row, iS] = e_S[iS]
        for jS in range(self.nS):
            H_sS[row, jS] += focc_S[iS] * K_SS[row, jS]

    # Force matrix to be Hermitian
    if not self.coupling:
        if world.size > 1:
            H_Ss = self.redistribute_H(H_sS)
        else:
            H_Ss = H_sS
        H_sS = (np.real(H_sS) + np.real(H_Ss.T)) / 2. + 1j * (
            np.imag(H_sS) - np.imag(H_Ss.T)) / 2.

    # Save H_sS matrix
    self.par_save('H_SS', 'H_SS', H_sS)

    return H_sS
def calculate(self, optical=True, ac=1.0):
    """Compute pair densities and build the local block of the Hamiltonian.

    The first half fills ``self.rhoG0_S`` from the pair densities summed
    over ``world``.  If the instance already has an ``H_sS`` attribute the
    method returns at that point.  Otherwise the exchange term (and, unless
    ``self.mode == 'RPA'``, the direct term built from ``self.W_qGG``) is
    accumulated, scaled and stored in ``self.H_sS``; ``self.df_S``,
    ``self.deps_s`` and (when ``self.td`` is false) ``self.excludef_S`` are
    set as well.

    Args:
        optical: When true, the first element of the Coulomb kernel
            ``v_G`` is set to zero.
        ac: Factor forwarded to ``self.get_screened_potential`` and
            multiplied onto every row of the Hamiltonian together with
            the occupation factor.

    Returns:
        None.  Results are stored on ``self``; the matrix is additionally
        written with ``self.par_save`` when ``self.write_h`` is true.
    """
    if self.spinors:
        # NOTE: the string below is a bare expression statement, not a
        # docstring; it is kept verbatim.
        """Calculate spinors. Here m is index of eigenvalues with SOC and n is the basis of eigenstates withour SOC. Below m is used for unoccupied states and n is used for occupied states so be careful!"""

        print('Diagonalizing spin-orbit Hamiltonian', file=self.fd)
        param = self.calc.parameters
        if not param['symmetry'] == 'off':
            print('Calculating KS wavefunctions without symmetry ' +
                  'for spin-orbit', file=self.fd)
            # The symmetry-free ground state is cached in gs_nosym.gpw and
            # only generated when that file does not exist yet.
            if not op.isfile('gs_nosym.gpw'):
                calc_so = GPAW(**param)
                calc_so.set(symmetry='off',
                            fixdensity=True,
                            txt='gs_nosym.txt')
                calc_so.atoms = self.calc.atoms
                calc_so.density = self.calc.density
                calc_so.get_potential_energy()
                calc_so.write('gs_nosym.gpw')
            calc_so = GPAW('gs_nosym.gpw', txt=None,
                           communicator=serial_comm)
            e_mk, v_knm = get_spinorbit_eigenvalues(calc_so,
                                                    return_wfs=True,
                                                    scale=self.scale)
            del calc_so
        else:
            e_mk, v_knm = get_spinorbit_eigenvalues(self.calc,
                                                    return_wfs=True,
                                                    scale=self.scale)
        e_mk /= Hartree

    # Parallelization stuff
    nK = self.kd.nbzkpts
    # NOTE: the mySsize returned here is overwritten further down before
    # it is used.
    myKrange, myKsize, mySsize = self.parallelisation_sizes()

    # Calculate exchange interaction
    qd0 = KPointDescriptor([self.q_c])
    pd0 = PWDescriptor(self.ecut, self.calc.wfs.gd, complex, qd0)
    ikq_k = self.kd.find_k_plus_q(self.q_c)
    v_G = get_coulomb_kernel(pd0, self.kd.N_c, truncation=self.truncation,
                             wstc=self.wstc)
    if optical:
        v_G[0] = 0.0

    self.pair = PairDensity(self.calc, self.ecut, world=serial_comm,
                            txt='pair.txt')

    # Calculate direct (screened) interaction and PAW corrections
    if self.mode == 'RPA':
        Q_aGii = self.pair.initialize_paw_corrections(pd0)
    else:
        self.get_screened_potential(ac=ac)
        # NOTE(review): this is a signed comparison (no abs()) over all
        # rows of ibzk_kc -- confirm that this is the intended test.
        if (self.qd.ibzk_kc - self.q_c < 1.0e-6).all():
            iq0 = self.qd.bz2ibz_k[self.kd.where_is_q(self.q_c,
                                                      self.qd.bzk_kc)]
            Q_aGii = self.Q_qaGii[iq0]
        else:
            Q_aGii = self.pair.initialize_paw_corrections(pd0)

    # Calculate pair densities, eigenvalues and occupations
    so = self.spinors + 1
    Nv, Nc = so * self.nv, so * self.nc
    Ns = self.spins
    rhoex_KsmnG = np.zeros((nK, Ns, Nv, Nc, len(v_G)), complex)
    # rhoG0_Ksmn = np.zeros((nK, Ns, Nv, Nc), complex)
    df_Ksmn = np.zeros((nK, Ns, Nv, Nc), float)  # occupation differences
    deps_ksmn = np.zeros((myKsize, Ns, Nv, Nc), float)  # -(ev - ec)
    if np.allclose(self.q_c, 0.0):
        optical_limit = True
    else:
        optical_limit = False
    get_pair = self.pair.get_kpoint_pair
    get_rho = self.pair.get_pair_density
    if self.spinors:
        # Get all pair densities to allow for SOC mixing
        # Use twice as many no-SOC states as BSE bands to allow mixing
        vi_s = [2 * self.val_sn[0, 0] - self.val_sn[0, -1] - 1]
        vf_s = [2 * self.con_sn[0, -1] - self.con_sn[0, 0] + 2]
        if vi_s[0] < 0:
            vi_s[0] = 0
        # Valence and conduction share the same no-SOC band window.
        ci_s, cf_s = vi_s, vf_s
        ni, nf = vi_s[0], vf_s[0]
        # Index ranges of the spinor states; the factor 2 follows from the
        # expressions below.
        mvi = 2 * self.val_sn[0, 0]
        mvf = 2 * (self.val_sn[0, -1] + 1)
        mci = 2 * self.con_sn[0, 0]
        mcf = 2 * (self.con_sn[0, -1] + 1)
    else:
        vi_s, vf_s = self.val_sn[:, 0], self.val_sn[:, -1] + 1
        ci_s, cf_s = self.con_sn[:, 0], self.con_sn[:, -1] + 1
    # ik is the local index into myKrange, iK the index used for the
    # nK-sized arrays.
    for ik, iK in enumerate(myKrange):
        for s in range(Ns):
            pair = get_pair(pd0, s, iK,
                            vi_s[s], vf_s[s], ci_s[s], cf_s[s])
            m_m = np.arange(vi_s[s], vf_s[s])
            n_n = np.arange(ci_s[s], cf_s[s])
            if self.gw_skn is not None:
                iKq = self.calc.wfs.kd.find_k_plus_q(self.q_c, [iK])[0]
                epsv_m = self.gw_skn[s, iK, :self.nv]
                epsc_n = self.gw_skn[s, iKq, self.nv:]
                # NOTE(review): assigns deps_ksmn[ik] for all spins at
                # once, whereas the other branches index [ik, s].
                deps_ksmn[ik] = -(epsv_m[:, np.newaxis] - epsc_n)
            elif self.spinors:
                iKq = self.calc.wfs.kd.find_k_plus_q(self.q_c, [iK])[0]
                epsv_m = e_mk[mvi:mvf, iK]
                epsc_n = e_mk[mci:mcf, iKq]
                deps_ksmn[ik, s] = -(epsv_m[:, np.newaxis] - epsc_n)
            else:
                deps_ksmn[ik, s] = -pair.get_transition_energies(m_m, n_n)

            df_mn = pair.get_occupation_differences(self.val_sn[s],
                                                    self.con_sn[s])
            rho_mnG = get_rho(pd0, pair,
                              m_m, n_n,
                              optical_limit=optical_limit,
                              direction=self.direction,
                              Q_aGii=Q_aGii,
                              extend_head=False)
            if self.spinors:
                if optical_limit:
                    # The first G component is multiplied by the no-SOC
                    # transition energies here and divided by deps_ksmn
                    # after the spinor transformation below.
                    deps0_mn = -pair.get_transition_energies(m_m, n_n)
                    rho_mnG[:, :, 0] *= deps0_mn
                # The same occupation difference is copied into all four
                # even/odd index combinations.
                df_Ksmn[iK, s, ::2, ::2] = df_mn
                df_Ksmn[iK, s, ::2, 1::2] = df_mn
                df_Ksmn[iK, s, 1::2, ::2] = df_mn
                df_Ksmn[iK, s, 1::2, 1::2] = df_mn
                # Transform with the even rows and the odd rows of v_knm
                # separately and add the two contributions.
                vecv0_nm = v_knm[iK][::2][ni:nf, mvi:mvf]
                vecc0_nm = v_knm[iKq][::2][ni:nf, mci:mcf]
                rho_0mnG = np.dot(vecv0_nm.T.conj(),
                                  np.dot(vecc0_nm.T, rho_mnG))
                vecv1_nm = v_knm[iK][1::2][ni:nf, mvi:mvf]
                vecc1_nm = v_knm[iKq][1::2][ni:nf, mci:mcf]
                rho_1mnG = np.dot(vecv1_nm.T.conj(),
                                  np.dot(vecc1_nm.T, rho_mnG))
                rhoex_KsmnG[iK, s] = rho_0mnG + rho_1mnG
                if optical_limit:
                    rhoex_KsmnG[iK, s, :, :, 0] /= deps_ksmn[ik, s]
            else:
                df_Ksmn[iK, s] = pair.get_occupation_differences(m_m, n_n)
                rhoex_KsmnG[iK, s] = rho_mnG

    # Optional shift of the transition energies; the sign follows the sign
    # of the occupation difference.
    if self.eshift is not None:
        deps_ksmn[np.where(df_Ksmn[myKrange] > 1.0e-3)] += self.eshift
        deps_ksmn[np.where(df_Ksmn[myKrange] < -1.0e-3)] -= self.eshift

    # Each rank only filled the entries of its own myKrange.
    world.sum(df_Ksmn)
    world.sum(rhoex_KsmnG)

    self.rhoG0_S = np.reshape(rhoex_KsmnG[:, :, :, :, 0], -1)

    # An existing Hamiltonian is not recomputed.
    if hasattr(self, 'H_sS'):
        return

    # Calculate Hamiltonian
    t0 = time()
    print('Calculating %s matrix elements at q_c = %s'
          % (self.mode, self.q_c), file=self.fd)
    H_ksmnKsmn = np.zeros((myKsize, Ns, Nv, Nc, nK, Ns, Nv, Nc), complex)
    for ik1, iK1 in enumerate(myKrange):
        for s1 in range(Ns):
            kptv1 = self.pair.get_k_point(s1, iK1, vi_s[s1], vf_s[s1])
            kptc1 = self.pair.get_k_point(s1, ikq_k[iK1], ci_s[s1],
                                          cf_s[s1])
            rho1_mnG = rhoex_KsmnG[iK1, s1]

            #rhoG0_Ksmn[iK1, s1] = rho1_mnG[:, :, 0]
            rho1ccV_mnG = rho1_mnG.conj()[:, :] * v_G
            for s2 in range(Ns):
                for Q_c in self.qd.bzk_kc:
                    iK2 = self.kd.find_k_plus_q(Q_c, [kptv1.K])[0]
                    rho2_mnG = rhoex_KsmnG[iK2, s2]
                    rho2_mGn = np.swapaxes(rho2_mnG, 1, 2)
                    # Exchange term
                    H_ksmnKsmn[ik1, s1, :, :, iK2, s2, :, :] += (
                        np.dot(rho1ccV_mnG, rho2_mGn))
                    # Direct (screened) term: skipped in RPA mode and for
                    # unequal spin indices.
                    if not self.mode == 'RPA' and s1 == s2:
                        ikq = ikq_k[iK2]
                        kptv2 = self.pair.get_k_point(s1, iK2, vi_s[s1],
                                                      vf_s[s1])
                        kptc2 = self.pair.get_k_point(s1, ikq, ci_s[s1],
                                                      cf_s[s1])
                        rho3_mmG, iq = self.get_density_matrix(kptv1,
                                                               kptv2)
                        rho4_nnG, iq = self.get_density_matrix(kptc1,
                                                               kptc2)
                        if self.spinors:
                            vec0_nm = v_knm[iK1][::2][ni:nf, mvi:mvf]
                            vec1_nm = v_knm[iK1][1::2][ni:nf, mvi:mvf]
                            vec2_nm = v_knm[iK2][::2][ni:nf, mvi:mvf]
                            vec3_nm = v_knm[iK2][1::2][ni:nf, mvi:mvf]
                            rho_0mnG = np.dot(vec0_nm.T.conj(),
                                              np.dot(vec2_nm.T, rho3_mmG))
                            rho_1mnG = np.dot(vec1_nm.T.conj(),
                                              np.dot(vec3_nm.T, rho3_mmG))
                            rho3_mmG = rho_0mnG + rho_1mnG
                            vec0_nm = v_knm[ikq_k[iK1]][::2][ni:nf, mci:mcf]
                            vec1_nm = v_knm[ikq_k[iK1]][1::2][ni:nf,mci:mcf]
                            vec2_nm = v_knm[ikq][::2][ni:nf, mci:mcf]
                            vec3_nm = v_knm[ikq][1::2][ni:nf, mci:mcf]
                            rho_0mnG = np.dot(vec0_nm.T.conj(),
                                              np.dot(vec2_nm.T, rho4_nnG))
                            rho_1mnG = np.dot(vec1_nm.T.conj(),
                                              np.dot(vec3_nm.T, rho4_nnG))
                            rho4_nnG = rho_0mnG + rho_1mnG

                        rho3ccW_mmG = np.dot(rho3_mmG.conj(),
                                             self.W_qGG[iq])
                        W_mmnn = np.dot(rho3ccW_mmG,
                                        np.swapaxes(rho4_nnG, 1, 2))
                        W_mnmn = np.swapaxes(W_mmnn, 1, 2) * Ns * so
                        H_ksmnKsmn[ik1, s1, :, :, iK2, s1] -= 0.5 * W_mnmn
        # Progress report.
        # NOTE(review): uses iK1 rather than the local counter ik1 --
        # confirm the estimate is meaningful on ranks whose myKrange does
        # not start at 0.
        if iK1 % (myKsize // 5 + 1) == 0:
            dt = time() - t0
            tleft = dt * myKsize / (iK1 + 1) - dt
            print(' Finished %s pair orbitals in %s - Estimated %s left' %
                  ((iK1 + 1) * Nv * Nc * Ns * world.size,
                   timedelta(seconds=round(dt)),
                   timedelta(seconds=round(tleft))), file=self.fd)

    #if self.mode == 'BSE':
    #    del self.Q_qaGii, self.W_qGG, self.pd_q

    H_ksmnKsmn /= self.vol

    mySsize = myKsize * Nv * Nc * Ns
    # iS0 is only defined for ranks that own k-points; for the others
    # mySsize is 0 and the loop below never reads it.
    if myKsize > 0:
        iS0 = myKrange[0] * Nv * Nc * Ns

    #world.sum(rhoG0_Ksmn)
    #self.rhoG0_S = np.reshape(rhoG0_Ksmn, -1)
    self.df_S = np.reshape(df_Ksmn, -1)
    if not self.td:
        self.excludef_S = np.where(np.abs(self.df_S) < 0.001)[0]
    # multiply by 2 when spin-paired and no SOC
    self.df_S *= 2.0 / nK / Ns / so
    self.deps_s = np.reshape(deps_ksmn, -1)
    H_sS = np.reshape(H_ksmnKsmn, (mySsize, self.nS))
    for iS in range(mySsize):
        # Multiply by occupations and adiabatic coupling
        H_sS[iS] *= self.df_S[iS0 + iS] * ac
        # add bare transition energies
        H_sS[iS, iS0 + iS] += self.deps_s[iS]

    self.H_sS = H_sS

    if self.write_h:
        self.par_save('H_SS.ulm', 'H_SS', self.H_sS)
try:
    # Location used by the sibling version of this test.
    from ase.build import molecule
except ImportError:
    # Fall back to the module path this script originally used.
    from ase.structure import molecule
from gpaw import GPAW
from gpaw.wavefunctions.pw import PW
from gpaw.mpi import world

# Compare a plane-wave calculation run on the world communicator with the
# same system computed by every rank on its own single-rank communicator.
a = molecule('H', pbc=1)
a.center(vacuum=2)

# Reference: one independent calculation per rank, averaged over ranks.
comm = world.new_communicator([world.rank])
a.calc = GPAW(mode=PW(250),
              communicator=comm,
              txt=None)
e0 = a.get_potential_energy()
e0 = world.sum(e0) / world.size

# Calculation on the default communicator, logged to '<world.size>.txt'.
a.calc = GPAW(mode=PW(250),
              eigensolver='rmm-diis',
              basis='szp(dzp)',
              txt='%d.txt' % world.size)
e = a.get_potential_energy()
f = a.get_forces()

assert abs(e - e0) < 7e-5, abs(e - e0)
assert abs(f).max() < 1e-10, abs(f).max()
def calculate_Kxc(gd, nt_sG, npw, Gvec_Gc, nG, vol, bcell_cv, R_av, setups, D_asp):
    """Return the LDA xc kernel as an ``(npw, npw)`` complex matrix.

    The result is the sum of a smooth part, taken from the FFT of fxc(r)
    (or integrated explicitly on the grid for G differences outside the
    FFT range), and a PAW part accumulated atom by atom and summed over
    ``world``.  The sum is divided by ``vol`` before it is returned.

    Uses the module-level names ``XC``, ``gemmdot``, ``rank``, ``size``,
    ``weight_n``, ``R_nv`` and ``world``.
    """
    # ---- smooth part ----
    assert np.abs(nt_sG[0].shape - nG).sum() == 0

    xc = XC('LDA')
    fxc_sg = np.zeros_like(nt_sG)
    xc.calculate_fxc(gd, nt_sG, fxc_sg)
    fxc_g = fxc_sg[0]

    # Fourier transform of fxc(r)
    ngpts = nG[0] * nG[1] * nG[2]
    fxc_fft_g = np.fft.fftn(fxc_g) * vol / ngpts

    r_vg = gd.get_grid_point_coordinates()

    Kxc_GG = np.zeros((npw, npw), dtype=complex)
    for row in range(npw):
        for col in range(npw):
            diff_c = Gvec_Gc[row] - Gvec_Gc[col]
            inside_fft = (nG / 2 - np.abs(diff_c) > 0).all()
            if not inside_fft:
                # G - G' is not an FFT index: integrate on the grid.
                diff_v = np.dot(diff_c, bcell_cv)
                phase_g = gemmdot(diff_v, r_vg, beta=0.0)
                Kxc_GG[row, col] = gd.integrate(np.exp(-1j*phase_g)*fxc_g)
                continue
            i0, i1, i2 = (diff_c + nG) % nG
            Kxc_GG[row, col] = fxc_fft_g[i0, i1, i2]

    # ---- PAW part ----
    KxcPAW_GG = np.zeros_like(Kxc_GG)

    # Cartesian G - G' for every pair of plane waves
    dG_GGv = np.zeros((npw, npw, 3))
    for row in range(npw):
        for col in range(npw):
            dG_GGv[row, col] = np.dot(Gvec_Gc[row] - Gvec_Gc[col], bcell_cv)

    for a, setup in enumerate(setups):
        # Atoms are dealt out to the ranks round-robin.
        if rank != a % size:
            continue

        rgd = setup.xc_correction.rgd
        n_qg = setup.xc_correction.n_qg
        nt_qg = setup.xc_correction.nt_qg
        nc_g = setup.xc_correction.nc_g
        nct_g = setup.xc_correction.nct_g
        Y_nL = setup.xc_correction.Y_nL
        dv_g = rgd.dv_g

        D_sp = D_asp[a]
        B_pqL = setup.xc_correction.B_pqL
        D_sLq = np.inner(D_sp, B_pqL.T)
        nspins = len(D_sp)
        assert nspins == 1

        f_sg = rgd.empty(nspins)
        ft_sg = rgd.empty(nspins)

        n_sLg = np.dot(D_sLq, n_qg)
        nt_sLg = np.dot(D_sLq, nt_qg)

        # Add core density
        n_sLg[:, 0] += sqrt(4 * pi) / nspins * nc_g
        nt_sLg[:, 0] += sqrt(4 * pi) / nspins * nct_g

        coefatoms_GG = np.exp(-1j * np.inner(dG_GGv, R_av[a]))

        for n, Y_L in enumerate(Y_nL):
            w = weight_n[n]

            # All-electron kernel along direction n
            f_sg[:] = 0.0
            n_sg = np.dot(Y_L, n_sLg)
            xc.calculate_fxc(rgd, n_sg, f_sg)

            # Pseudo kernel along direction n
            ft_sg[:] = 0.0
            nt_sg = np.dot(Y_L, nt_sLg)
            xc.calculate_fxc(rgd, nt_sg, ft_sg)

            dGR_GG = np.inner(dG_GGv, R_nv[n])
            coef_GGg = np.exp(-1j * np.outer(dGR_GG, rgd.r_g))
            coef_GGg = coef_GGg.reshape(npw,npw,rgd.ng)
            radial_GG = np.dot(coef_GGg, (f_sg[0]-ft_sg[0]) * dv_g)
            KxcPAW_GG += w * radial_GG * coefatoms_GG

    world.sum(KxcPAW_GG)
    Kxc_GG += KxcPAW_GG

    return Kxc_GG / vol
def calculate_Kxc(pd, nt_sG, R_av, setups, D_asp, functional='ALDA', density_cut=None):
    """Return the adiabatic-LDA xc kernel in the plane-wave basis of ``pd``.

    Args:
        pd: Plane-wave descriptor; ``pd.gd``, ``pd.ngmax``, ``pd.Q_qG[0]``
            and ``pd.get_reciprocal_vectors()`` are used.
        nt_sG: Pseudo density on the grid, indexed by spin first.
        R_av: Atomic positions, indexed by atom.
        setups: Iterable of PAW setups (one per atom).
        D_asp: Atomic density matrices, indexed by atom.
        functional: ``'ALDA_X'`` selects the analytic exchange-only
            expression (one or two spins allowed).  Any other value is
            passed to ``XC`` with its first character stripped and
            requires a single spin channel.
        density_cut: If given, fxc is set to zero wherever
            ``nt_sG * nspins`` is below this value.

    Returns:
        Complex array of shape ``(nspins, npw, npw)``: the smooth plus PAW
        kernel divided by the cell volume.

    Uses the module-level names ``XC``, ``rank``, ``size``, ``weight_n``,
    ``R_nv`` and ``world``.
    """
    gd = pd.gd
    npw = pd.ngmax
    nG = pd.gd.N_c
    vol = pd.gd.volume
    G_Gv = pd.get_reciprocal_vectors()

    # The soft part
    if functional == 'ALDA_X':
        x_only = True
        A_x = -3. / 4. * (3. / np.pi)**(1. / 3.)
        nspins = len(nt_sG)
        assert nspins in [1, 2]
        fxc_sg = nspins**(1. / 3.) * 4. / 9. * A_x * nt_sG**(-2. / 3.)
    else:
        assert len(nt_sG) == 1
        x_only = False
        fxc_sg = np.zeros_like(nt_sG)
        xc = XC(functional[1:])
        xc.calculate_fxc(gd, nt_sG, fxc_sg)

    if density_cut is not None:
        fxc_sg[np.where(nt_sG * len(nt_sG) < density_cut)] = 0.0

    # FFT fxc(r)
    nG0 = nG[0] * nG[1] * nG[2]
    tmp_sg = [np.fft.fftn(fxc_sg[s]) * vol / nG0 for s in range(len(nt_sG))]

    # Signed FFT index triple of every plane wave.  It depends neither on
    # the spin nor on the partner plane wave, so it is computed once
    # instead of inside the double loop.
    half_c = nG // 2
    Q_Gc = [(np.unravel_index(iQ, nG) + half_c) % nG - half_c
            for iQ in pd.Q_qG[0]]

    Kxc_sGG = np.zeros((len(fxc_sg), npw, npw), dtype=complex)
    for s in range(len(fxc_sg)):
        for iG, iQ_c in enumerate(Q_Gc):
            for jG, jQ_c in enumerate(Q_Gc):
                ijQ_c = iQ_c - jQ_c
                # Differences outside the FFT range are left at zero.
                if (abs(ijQ_c) < half_c).all():
                    Kxc_sGG[s, iG, jG] = tmp_sg[s][tuple(ijQ_c)]

    # The PAW part
    KxcPAW_sGG = np.zeros_like(Kxc_sGG)
    dG_GGv = np.zeros((npw, npw, 3))
    for v in range(3):
        dG_GGv[:, :, v] = np.subtract.outer(G_Gv[:, v], G_Gv[:, v])

    for a, setup in enumerate(setups):
        # Atoms are dealt out to the ranks round-robin.
        if rank != a % size:
            continue

        rgd = setup.xc_correction.rgd
        n_qg = setup.xc_correction.n_qg
        nt_qg = setup.xc_correction.nt_qg
        nc_g = setup.xc_correction.nc_g
        nct_g = setup.xc_correction.nct_g
        Y_nL = setup.xc_correction.Y_nL
        dv_g = rgd.dv_g

        D_sp = D_asp[a]
        B_pqL = setup.xc_correction.B_pqL
        D_sLq = np.inner(D_sp, B_pqL.T)
        nspins = len(D_sp)

        f_sg = rgd.empty(nspins)
        ft_sg = rgd.empty(nspins)

        n_sLg = np.dot(D_sLq, n_qg)
        nt_sLg = np.dot(D_sLq, nt_qg)

        # Add core density
        n_sLg[:, 0] += np.sqrt(4. * np.pi) / nspins * nc_g
        nt_sLg[:, 0] += np.sqrt(4. * np.pi) / nspins * nct_g

        coefatoms_GG = np.exp(-1j * np.inner(dG_GGv, R_av[a]))
        for n, Y_L in enumerate(Y_nL):
            w = weight_n[n]
            f_sg[:] = 0.0
            n_sg = np.dot(Y_L, n_sLg)
            if x_only:
                f_sg = nspins * (4 / 9.) * A_x * (nspins * n_sg)**(-2 / 3.)
            else:
                xc.calculate_fxc(rgd, n_sg, f_sg)

            ft_sg[:] = 0.0
            nt_sg = np.dot(Y_L, nt_sLg)
            if x_only:
                ft_sg = nspins * (4 / 9.) * (A_x
                                             * (nspins * nt_sg)**(-2 / 3.))
            else:
                xc.calculate_fxc(rgd, nt_sg, ft_sg)

            # (G - G') . R_n does not depend on the radial point; hoisted
            # out of the radial loop.
            phase_GG = -1j * np.inner(dG_GGv, R_nv[n])
            for i in range(len(rgd.r_g)):
                coef_GG = np.exp(phase_GG * rgd.r_g[i])
                for s in range(len(f_sg)):
                    KxcPAW_sGG[s] += w * np.dot(coef_GG,
                                                (f_sg[s, i] -
                                                 ft_sg[s, i])
                                                * dv_g[i]) * coefatoms_GG

    world.sum(KxcPAW_sGG)
    Kxc_sGG += KxcPAW_sGG

    return Kxc_sGG / vol
from ase.build import molecule
from gpaw import GPAW
from gpaw.wavefunctions.pw import PW
from gpaw.mpi import world

# Test system: a single H "molecule" in a periodic, vacuum-padded cell.
a = molecule('H', pbc=1)
a.center(vacuum=2)

# Reference energy: each rank runs the calculation on its own
# single-rank communicator; the results are averaged over all ranks.
comm = world.new_communicator([world.rank])
a.calc = GPAW(mode=PW(250), communicator=comm, txt=None)
e0 = world.sum(a.get_potential_energy()) / world.size

# Same system on the default communicator, logged to '<world.size>.txt'.
a.calc = GPAW(mode=PW(250),
              eigensolver='rmm-diis',
              basis='szp(dzp)',
              txt='%d.txt' % world.size)
e = a.get_potential_energy()
f = a.get_forces()

# Energy must agree with the reference; forces must vanish.
energy_error = abs(e - e0)
assert energy_error < 7e-5, energy_error
largest_force = abs(f).max()
assert largest_force < 1e-10, largest_force
from gpaw.atom.configurations import parameters
from gpaw import setup_paths
from gpaw.test import equal
from gpaw.mpi import world

atom = 'Ne'
# Put the current directory first in the setup search path
# (presumably so that locally generated setups are picked up -- confirm).
setup_paths.insert(0, '.')

for xcname in ['GLLBSC', 'GLLB']:
    # Only rank 0 runs the atomic generator; the other ranks contribute
    # 0.0, so the sum below hands rank 0's eigenvalue to every rank.
    if world.rank == 0:
        g = Generator(atom, xcname=xcname, scalarrel=False, nofiles=True)
        g.run(**parameters[atom])
        eps = g.e_j[-1]
    else:
        eps = 0.0
    eps = world.sum(eps)
    world.barrier()

    # Non-periodic cubic cell of side `a` with the atom centred in it.
    a = 5
    Ne = Atoms([Atom(atom, (0, 0, 0))],
               cell=(a, a, a), pbc=False)
    Ne.center()
    calc = GPAW(nbands=7, h=0.25, xc=xcname)
    Ne.set_calculator(calc)
    e = Ne.get_potential_energy()
    # Calculate the discontinuity
    response = calc.hamiltonian.xc.xcs['RESPONSE']
    response.calculate_delta_xc()
    response.calculate_delta_xc_perturbation()

    # Eigenvalue with band index 3 of the first k-point/spin entry.
    eps3d = calc.wfs.kpt_u[0].eps_n[3]
    #if world.rank == 0:
def calculate_Kxc(gd, nt_sG, npw, Gvec_Gc, nG, vol, bcell_cv, R_av, setups, D_asp, functional='ALDA', density_cut=None):
    """Return the adiabatic-LDA xc kernel in a plane-wave basis.

    Args:
        gd: Grid descriptor; used for fxc, the grid coordinates and the
            real-space integration.
        nt_sG: Pseudo density on the grid, indexed by spin first.
        npw: Number of plane waves.
        Gvec_Gc: Plane-wave vectors, indexed by plane wave; they are
            combined with ``nG`` to form FFT indices.
        nG: Number of grid points along the three axes.
        vol: Cell volume.
        bcell_cv: Matrix converting ``Gvec_Gc`` differences to Cartesian
            vectors via ``np.dot(dG_c, bcell_cv)``.
        R_av: Atomic positions, indexed by atom.
        setups: Iterable of PAW setups (one per atom).
        D_asp: Atomic density matrices, indexed by atom.
        functional: ``'ALDA_X'`` selects the analytic exchange-only
            expression (one or two spins allowed).  Any other value is
            passed to ``XC`` with its first character stripped and
            requires a single spin channel.
        density_cut: If given, fxc is set to zero wherever
            ``nt_sG * nspins`` is below this value.

    Returns:
        Complex array of shape ``(nspins, npw, npw)``: the smooth plus PAW
        kernel divided by ``vol``.

    Uses the module-level names ``XC``, ``gemmdot``, ``rank``, ``size``,
    ``weight_n``, ``R_nv`` and ``world``.
    """
    # The soft part
    if functional == 'ALDA_X':
        x_only = True
        A_x = -3. / 4. * (3. / np.pi)**(1. / 3.)
        nspins = len(nt_sG)
        assert nspins in [1, 2]
        fxc_sg = nspins**(1. / 3.) * 4. / 9. * A_x * nt_sG**(-2. / 3.)
    else:
        assert len(nt_sG) == 1
        x_only = False
        fxc_sg = np.zeros_like(nt_sG)
        xc = XC(functional[1:])
        xc.calculate_fxc(gd, nt_sG, fxc_sg)

    if density_cut is not None:
        fxc_sg[np.where(nt_sG * len(nt_sG) < density_cut)] = 0.0

    # FFT fxc(r)
    nG0 = nG[0] * nG[1] * nG[2]
    tmp_sg = [np.fft.fftn(fxc_sg[s]) * vol / nG0 for s in range(len(nt_sG))]

    r_vg = gd.get_grid_point_coordinates()
    Kxc_sGG = np.zeros((len(fxc_sg), npw, npw), dtype=complex)
    for s in range(len(fxc_sg)):
        for iG in range(npw):
            for jG in range(npw):
                dG_c = Gvec_Gc[iG] - Gvec_Gc[jG]
                if (nG / 2 - np.abs(dG_c) > 0).all():
                    index = (dG_c + nG) % nG
                    Kxc_sGG[s, iG, jG] = tmp_sg[s][index[0],
                                                   index[1],
                                                   index[2]]
                else:  # not in the fft index
                    dG_v = np.dot(dG_c, bcell_cv)
                    dGr_g = gemmdot(dG_v, r_vg, beta=0.0)
                    Kxc_sGG[s, iG, jG] = gd.integrate(
                        np.exp(-1j * dGr_g) * fxc_sg[s])

    # The PAW part
    KxcPAW_sGG = np.zeros_like(Kxc_sGG)
    dG_GGv = np.zeros((npw, npw, 3))
    for iG in range(npw):
        for jG in range(npw):
            dG_c = Gvec_Gc[iG] - Gvec_Gc[jG]
            dG_GGv[iG, jG] = np.dot(dG_c, bcell_cv)

    for a, setup in enumerate(setups):
        # Atoms are dealt out to the ranks round-robin.
        if rank != a % size:
            continue

        rgd = setup.xc_correction.rgd
        n_qg = setup.xc_correction.n_qg
        nt_qg = setup.xc_correction.nt_qg
        nc_g = setup.xc_correction.nc_g
        nct_g = setup.xc_correction.nct_g
        Y_nL = setup.xc_correction.Y_nL
        dv_g = rgd.dv_g

        D_sp = D_asp[a]
        B_pqL = setup.xc_correction.B_pqL
        D_sLq = np.inner(D_sp, B_pqL.T)
        nspins = len(D_sp)

        f_sg = rgd.empty(nspins)
        ft_sg = rgd.empty(nspins)

        n_sLg = np.dot(D_sLq, n_qg)
        nt_sLg = np.dot(D_sLq, nt_qg)

        # Add core density
        n_sLg[:, 0] += np.sqrt(4. * np.pi) / nspins * nc_g
        nt_sLg[:, 0] += np.sqrt(4. * np.pi) / nspins * nct_g

        coefatoms_GG = np.exp(-1j * np.inner(dG_GGv, R_av[a]))
        for n, Y_L in enumerate(Y_nL):
            w = weight_n[n]
            f_sg[:] = 0.0
            n_sg = np.dot(Y_L, n_sLg)
            if x_only:
                f_sg = nspins * (4 / 9.) * A_x * (nspins * n_sg)**(-2 / 3.)
            else:
                xc.calculate_fxc(rgd, n_sg, f_sg)

            ft_sg[:] = 0.0
            nt_sg = np.dot(Y_L, nt_sLg)
            if x_only:
                ft_sg = nspins * (4 / 9.) * (A_x
                                             * (nspins * nt_sg)**(-2 / 3.))
            else:
                xc.calculate_fxc(rgd, nt_sg, ft_sg)

            # (G - G') . R_n does not depend on the radial point; hoisted
            # out of the radial loop.
            phase_GG = -1j * np.inner(dG_GGv, R_nv[n])
            for i in range(len(rgd.r_g)):
                coef_GG = np.exp(phase_GG * rgd.r_g[i])
                for s in range(len(f_sg)):
                    KxcPAW_sGG[s] += w * np.dot(coef_GG,
                                                (f_sg[s, i] -
                                                 ft_sg[s, i])
                                                * dv_g[i]) * coefatoms_GG

    world.sum(KxcPAW_sGG)
    Kxc_sGG += KxcPAW_sGG

    return Kxc_sGG / vol
def calculate_Kxc(gd, nt_sG, npw, Gvec_Gc, nG, vol, bcell_cv, R_av, setups, D_asp, functional='ALDA', density_cut=None):
    """Return the adiabatic-LDA xc kernel in a plane-wave basis.

    The kernel is the sum of a smooth part and a PAW part, divided by
    ``vol``.  The smooth part is read from the FFT of fxc(r) when G - G'
    is inside the FFT range and integrated on the real-space grid
    otherwise.  The PAW part is accumulated atom by atom (atoms are
    distributed over ranks) and summed over ``world``.

    Args:
        gd: Grid descriptor; used for fxc, the grid coordinates and the
            real-space integration.
        nt_sG: Pseudo density on the grid, indexed by spin first.
        npw: Number of plane waves.
        Gvec_Gc: Plane-wave vectors, indexed by plane wave.
        nG: Number of grid points along the three axes.
        vol: Cell volume.
        bcell_cv: Matrix applied to ``Gvec_Gc`` differences with
            ``np.dot(dG_c, bcell_cv)``.
        R_av: Atomic positions, indexed by atom.
        setups: Iterable of PAW setups (one per atom).
        D_asp: Atomic density matrices, indexed by atom.
        functional: ``'ALDA_X'`` uses the analytic exchange-only kernel;
            any other value is handed to ``XC`` without its first
            character and requires exactly one spin channel.
        density_cut: Optional threshold; fxc is zeroed where
            ``nt_sG * nspins`` is smaller.

    Returns:
        Complex array of shape ``(nspins, npw, npw)``.

    Uses the module-level names ``XC``, ``gemmdot``, ``rank``, ``size``,
    ``weight_n``, ``R_nv`` and ``world``.
    """
    # The soft part
    if functional == 'ALDA_X':
        x_only = True
        A_x = -3. / 4. * (3. / np.pi)**(1. / 3.)
        nspins = len(nt_sG)
        assert nspins in [1, 2]
        fxc_sg = nspins**(1. / 3.) * 4. / 9. * A_x * nt_sG**(-2. / 3.)
    else:
        assert len(nt_sG) == 1
        x_only = False
        fxc_sg = np.zeros_like(nt_sG)
        xc = XC(functional[1:])
        xc.calculate_fxc(gd, nt_sG, fxc_sg)

    if density_cut is not None:
        fxc_sg[np.where(nt_sG * len(nt_sG) < density_cut)] = 0.0

    # FFT fxc(r)
    nG0 = nG[0] * nG[1] * nG[2]
    tmp_sg = [np.fft.fftn(fxc_sg[s]) * vol / nG0 for s in range(len(nt_sG))]

    r_vg = gd.get_grid_point_coordinates()
    Kxc_sGG = np.zeros((len(fxc_sg), npw, npw), dtype=complex)
    for s in range(len(fxc_sg)):
        for iG in range(npw):
            for jG in range(npw):
                dG_c = Gvec_Gc[iG] - Gvec_Gc[jG]
                if (nG / 2 - np.abs(dG_c) > 0).all():
                    index = (dG_c + nG) % nG
                    Kxc_sGG[s, iG, jG] = tmp_sg[s][index[0],
                                                   index[1],
                                                   index[2]]
                else:  # not in the fft index
                    dG_v = np.dot(dG_c, bcell_cv)
                    dGr_g = gemmdot(dG_v, r_vg, beta=0.0)
                    Kxc_sGG[s, iG, jG] = gd.integrate(
                        np.exp(-1j * dGr_g) * fxc_sg[s])

    # The PAW part
    KxcPAW_sGG = np.zeros_like(Kxc_sGG)
    dG_GGv = np.zeros((npw, npw, 3))
    for iG in range(npw):
        for jG in range(npw):
            dG_c = Gvec_Gc[iG] - Gvec_Gc[jG]
            dG_GGv[iG, jG] = np.dot(dG_c, bcell_cv)

    for a, setup in enumerate(setups):
        # Skip atoms that belong to another rank.
        if rank != a % size:
            continue

        rgd = setup.xc_correction.rgd
        n_qg = setup.xc_correction.n_qg
        nt_qg = setup.xc_correction.nt_qg
        nc_g = setup.xc_correction.nc_g
        nct_g = setup.xc_correction.nct_g
        Y_nL = setup.xc_correction.Y_nL
        dv_g = rgd.dv_g

        D_sp = D_asp[a]
        B_pqL = setup.xc_correction.B_pqL
        D_sLq = np.inner(D_sp, B_pqL.T)
        nspins = len(D_sp)

        f_sg = rgd.empty(nspins)
        ft_sg = rgd.empty(nspins)

        n_sLg = np.dot(D_sLq, n_qg)
        nt_sLg = np.dot(D_sLq, nt_qg)

        # Add core density
        n_sLg[:, 0] += np.sqrt(4. * np.pi) / nspins * nc_g
        nt_sLg[:, 0] += np.sqrt(4. * np.pi) / nspins * nct_g

        coefatoms_GG = np.exp(-1j * np.inner(dG_GGv, R_av[a]))
        for n, Y_L in enumerate(Y_nL):
            w = weight_n[n]
            f_sg[:] = 0.0
            n_sg = np.dot(Y_L, n_sLg)
            if x_only:
                f_sg = nspins * (4 / 9.) * A_x * (nspins * n_sg)**(-2 / 3.)
            else:
                xc.calculate_fxc(rgd, n_sg, f_sg)

            ft_sg[:] = 0.0
            nt_sg = np.dot(Y_L, nt_sLg)
            if x_only:
                ft_sg = nspins * (4 / 9.) * (A_x
                                             * (nspins * nt_sg)**(-2 / 3.))
            else:
                xc.calculate_fxc(rgd, nt_sg, ft_sg)

            # The inner product with R_nv[n] is the same for every radial
            # point, so evaluate it once per direction n.
            phase_GG = -1j * np.inner(dG_GGv, R_nv[n])
            for i in range(len(rgd.r_g)):
                coef_GG = np.exp(phase_GG * rgd.r_g[i])
                for s in range(len(f_sg)):
                    KxcPAW_sGG[s] += w * np.dot(coef_GG,
                                                (f_sg[s, i] -
                                                 ft_sg[s, i])
                                                * dv_g[i]) * coefatoms_GG

    world.sum(KxcPAW_sGG)
    Kxc_sGG += KxcPAW_sGG

    return Kxc_sGG / vol
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check the simple PBLAS wrappers against serial reference results.

    Random matrices are created in single-block ("global") descriptors,
    reference products are evaluated on rank 0, the inputs are
    redistributed to block-cyclic descriptors, the PBLAS wrappers (gemm,
    gemv, r2k, rk, hemm) are run, and the results are gathered back and
    compared with the references using ``equal(..., 0, tol)``.

    Uses the module-level names ``world``, ``rank`` and ``tol``.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    # Imaginary unit for complex runs, zero otherwise.
    epsilon = 1.0j if dtype == complex else 0.0

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)
    globHEC = grid.new_descriptor(K, K, K, K)

    def draw(desc):
        # Real part is drawn first, then the (optional) imaginary part.
        return gen.rand(*desc.shape) + epsilon * gen.rand(*desc.shape)

    # Populate matrices local to master:
    A0 = draw(globA)
    B0 = draw(globB)
    D0 = draw(globD)
    X0 = draw(globX)

    # HEC = HEA * B
    HEA0 = draw(globHEC)
    if world.rank == 0:
        HEA0 = HEA0 + HEA0.T.conjugate()  # Make H0 hermitean
        HEA0 = np.ascontiguousarray(HEA0)

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC0 = globB.zeros(dtype=dtype)

    # Local reference matrix product:
    if rank == 0:
        # C0[:] = np.dot(A0, B0)
        gemm(1.0, B0, A0, 0.0, C0)
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Y0[:] = np.dot(A0, X0)
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

        HEC0[:] = np.dot(HEA0, B0)
        sM, sN = HEA0.shape
        # We don't use upper diagonal: overwrite it with a sentinel.
        for i in range(sM):
            for j in range(i + 1, sN):
                HEA0[i][j] = 99999.0
        if world.rank == 0:
            print(HEA0)

    for desc, array in [(globA, A0), (globB, B0), (globC, C0),
                        (globX, X0), (globY, Y0),
                        (globD, D0), (globS, S0), (globU, U0)]:
        assert desc.check(array)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)
    distHE = grid.new_descriptor(K, K, 2, 4)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC = distB.zeros(dtype=dtype)
    HEA = distHE.zeros(dtype=dtype)

    # Scatter the inputs from the master layout to the block layouts.
    for src_desc, dst_desc, src, dst in [(globA, distA, A0, A),
                                         (globB, distB, B0, B),
                                         (globX, distX, X0, X),
                                         (globD, distD, D0, D),
                                         (globHEC, distHE, HEA0, HEA)]:
        Redistributor(world, src_desc, dst_desc).redistribute(src, dst)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa='T')
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)
    pblas_simple_hemm(distHE, distB, distB, HEA, B, HEC, uplo='L', side='L')

    # Collect result back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC1 = globB.zeros(dtype=dtype)
    for src_desc, dst_desc, src, dst in [(distC, globC, C, C1),
                                         (distY, globY, Y, Y1),
                                         (distS, globS, S, S1),
                                         (distU, globU, U, U1),
                                         (distB, globB, HEC, HEC1)]:
        Redistributor(world, src_desc, dst_desc).redistribute(src, dst)

    # (label, gathered result, reference) for every tested routine
    comparisons = [('gemm err', C1, C0),
                   ('gemv err', Y1, Y0),
                   ('r2k err', S1, S0),
                   ('rk_err', U1, U0),
                   ('hemm_err', HEC1, HEC0)]
    if rank == 0:
        errors = [abs(got - ref).max() for _, got, ref in comparisons]
        for (label, _, _), err in zip(comparisons, errors):
            print(label, err)
    else:
        errors = [0.0] * len(comparisons)

    # We don't like exceptions on only one cpu
    errors = [world.sum(err) for err in errors]

    for err in errors:
        equal(err, 0, tol)
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check the distributed PBLAS wrappers against serial reference results.

    Random input matrices are created from the master-side descriptors,
    the reference products are evaluated on rank 0 with the serial
    ``gemm``/``gemv``/``r2k``/``rk`` routines, and the same operations are
    then run through the ``pblas_simple_*`` wrappers on matrices that were
    redistributed with small block sizes.  The results are collected back
    and the largest absolute deviations are compared to zero with
    ``equal(..., tol)``.

    Parameters
    ----------
    M, N, K : int
        Matrix dimensions: A and D are M x K, B is K x N, X is K x 1,
        C is M x N, Y is M x 1, S and U are M x M and Z is K x K.
    seed : int
        Seed of the ``np.random.RandomState`` that generates the inputs.
    mprocs, nprocs : int
        Passed on to ``BlacsGrid(world, mprocs, nprocs)``.
    dtype : type
        ``float`` or ``complex``; for complex a random imaginary part is
        added to every input matrix.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    # For real runs the second random draw is still made, but scaled by zero.
    epsilon = 1.0j if dtype == complex else 0.0

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)

    # Populate matrices local to master (the order of draws fixes the data):
    A0 = gen.rand(*globA.shape) + epsilon * gen.rand(*globA.shape)
    B0 = gen.rand(*globB.shape) + epsilon * gen.rand(*globB.shape)
    D0 = gen.rand(*globD.shape) + epsilon * gen.rand(*globD.shape)
    X0 = gen.rand(*globX.shape) + epsilon * gen.rand(*globX.shape)

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates

    # Local reference matrix product:
    if rank == 0:
        # Per the original author's note this is C0[:] = np.dot(A0, B0);
        # mind the operand order handed to gemm.
        gemm(1.0, B0, A0, 0.0, C0)
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Per the original author's note this is Y0[:] = np.dot(A0, X0).
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates

    # Scatter the master-side inputs onto the distributed layouts.
    Redistributor(world, globA, distA).redistribute(A0, A)
    Redistributor(world, globB, distB).redistribute(B0, B)
    Redistributor(world, globX, distX).redistribute(X0, X)
    Redistributor(world, globD, distD).redistribute(D0, D)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    # NOTE(review): Z (and its reference Z0) is computed but never compared
    # below -- confirm whether a check of the transposed gemm is intended.
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa='T')
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)

    # Collect result back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    Redistributor(world, distC, globC).redistribute(C, C1)
    Redistributor(world, distY, globY).redistribute(Y, Y1)
    Redistributor(world, distS, globS).redistribute(S, S1)
    Redistributor(world, distU, globU).redistribute(U, U1)

    if rank == 0:
        gemm_err = abs(C1 - C0).max()
        gemv_err = abs(Y1 - Y0).max()
        r2k_err = abs(S1 - S0).max()
        rk_err = abs(U1 - U0).max()
        print('gemm err', gemm_err)
        print('gemv err', gemv_err)
        print('r2k err', r2k_err)
        print('rk_err', rk_err)
    else:
        gemm_err = 0.0
        gemv_err = 0.0
        r2k_err = 0.0
        rk_err = 0.0

    # Share the errors so that every rank runs the same checks below
    # (we don't like exceptions on only one cpu).
    gemm_err = world.sum(gemm_err)
    gemv_err = world.sum(gemv_err)
    r2k_err = world.sum(r2k_err)
    rk_err = world.sum(rk_err)

    equal(gemm_err, 0, tol)
    equal(gemv_err, 0, tol)
    equal(r2k_err, 0, tol)
    equal(rk_err, 0, tol)
Ferr = np.abs(F - Fref).max() assert Ferr < 1e-6, 'Bad F: err=%f; parallel=%s' % (Ferr, parallel) return E, F # First calculate reference energy and forces E and F # # If we want to really dumb things down, enable this to force an # entirely serial calculation: if 0: serial = world.new_communicator([0]) E = 0.0 F = np.zeros((len(system), 3)) if world.rank == 0: E, F = calculate({}, serial) E = world.sum(E) world.sum(F) else: # Normally we'll just do it in parallel; # that case is covered well by other tests, so we can probably trust it E, F = calculate({}, world) def check(parallel): return calculate(parallel, comm=world, Eref=E, Fref=F) assert world.size in [1, 2, 4, 8], ('Number of CPUs %d not supported' % world.size) parallel = dict(domain=1, band=1)
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check the distributed PBLAS wrappers against serial reference results.

    Random input matrices are created from the master-side descriptors,
    the reference products are evaluated on rank 0 (serial
    ``gemm``/``gemv``/``r2k``/``rk`` and ``np.dot`` for the Hermitian
    product), and the same operations are then run through the
    ``pblas_simple_*`` wrappers on matrices that were redistributed with
    small block sizes.  The results are collected back and the largest
    absolute deviations are compared to zero with ``equal(..., tol)``.

    Parameters
    ----------
    M, N, K : int
        Matrix dimensions: A and D are M x K, B is K x N, X is K x 1,
        C is M x N, Y is M x 1, S and U are M x M, Z and HEA are K x K.
    seed : int
        Seed of the ``np.random.RandomState`` that generates the inputs.
    mprocs, nprocs : int
        Passed on to ``BlacsGrid(world, mprocs, nprocs)``.
    dtype : type
        ``float`` or ``complex``; for complex a random imaginary part is
        added to every input matrix.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    # For real runs the second random draw is still made, but scaled by zero.
    epsilon = 1.0j if dtype == complex else 0.0

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)
    globHEC = grid.new_descriptor(K, K, K, K)

    # Populate matrices local to master (the order of draws fixes the data):
    A0 = gen.rand(*globA.shape) + epsilon * gen.rand(*globA.shape)
    B0 = gen.rand(*globB.shape) + epsilon * gen.rand(*globB.shape)
    D0 = gen.rand(*globD.shape) + epsilon * gen.rand(*globD.shape)
    X0 = gen.rand(*globX.shape) + epsilon * gen.rand(*globX.shape)

    # HEC = HEA * B
    HEA0 = gen.rand(*globHEC.shape) + epsilon * gen.rand(*globHEC.shape)
    if world.rank == 0:
        HEA0 = HEA0 + HEA0.T.conjugate()  # Make HEA0 Hermitian

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC0 = globB.zeros(dtype=dtype)

    # Local reference matrix product:
    if rank == 0:
        # Per the original author's note this is C0[:] = np.dot(A0, B0);
        # mind the operand order handed to gemm.
        gemm(1.0, B0, A0, 0.0, C0)
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Per the original author's note this is Y0[:] = np.dot(A0, X0).
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

        # The reference product uses the full Hermitian matrix ...
        HEC0[:] = np.dot(HEA0, B0)
        # ... after which the strictly upper triangle is overwritten with a
        # sentinel.  The hemm call below passes uplo="L", so presumably a
        # correct result shows that these entries were never read.
        HEA0[np.triu_indices_from(HEA0, k=1)] = 99999.0
        if world.rank == 0:
            print(HEA0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)
    distHE = grid.new_descriptor(K, K, 2, 4)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC = distB.zeros(dtype=dtype)
    HEA = distHE.zeros(dtype=dtype)

    # Scatter the master-side inputs onto the distributed layouts.
    Redistributor(world, globA, distA).redistribute(A0, A)
    Redistributor(world, globB, distB).redistribute(B0, B)
    Redistributor(world, globX, distX).redistribute(X0, X)
    Redistributor(world, globD, distD).redistribute(D0, D)
    Redistributor(world, globHEC, distHE).redistribute(HEA0, HEA)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    # NOTE(review): Z (and its reference Z0) is computed but never compared
    # below -- confirm whether a check of the transposed gemm is intended.
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa="T")
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)
    pblas_simple_hemm(distHE, distB, distB, HEA, B, HEC, uplo="L", side="L")

    # Collect result back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC1 = globB.zeros(dtype=dtype)
    Redistributor(world, distC, globC).redistribute(C, C1)
    Redistributor(world, distY, globY).redistribute(Y, Y1)
    Redistributor(world, distS, globS).redistribute(S, S1)
    Redistributor(world, distU, globU).redistribute(U, U1)
    Redistributor(world, distB, globB).redistribute(HEC, HEC1)

    if rank == 0:
        gemm_err = abs(C1 - C0).max()
        gemv_err = abs(Y1 - Y0).max()
        r2k_err = abs(S1 - S0).max()
        rk_err = abs(U1 - U0).max()
        hemm_err = abs(HEC1 - HEC0).max()
        print("gemm err", gemm_err)
        print("gemv err", gemv_err)
        print("r2k err", r2k_err)
        print("rk_err", rk_err)
        print("hemm_err", hemm_err)
    else:
        gemm_err = 0.0
        gemv_err = 0.0
        r2k_err = 0.0
        rk_err = 0.0
        hemm_err = 0.0

    # Share the errors so that every rank runs the same checks below
    # (we don't like exceptions on only one cpu).
    gemm_err = world.sum(gemm_err)
    gemv_err = world.sum(gemv_err)
    r2k_err = world.sum(r2k_err)
    rk_err = world.sum(rk_err)
    hemm_err = world.sum(hemm_err)

    equal(gemm_err, 0, tol)
    equal(gemv_err, 0, tol)
    equal(r2k_err, 0, tol)
    equal(rk_err, 0, tol)
    equal(hemm_err, 0, tol)