def _pseudo_braket(self, bra_xG, ket_yG, A_yx, square=None): """Calculate matrix elements of braket pairs of pseudo wave functions. Low-level helper function. Results will be put in the *A_yx* array:: / ~ * ~ A = | dG bra (G) ket (G) nn' / n n' Parameters: bra_xG: ndarray Set of bra-like vectors in which the matrix elements are evaluated. key_yG: ndarray Set of ket-like vectors in which the matrix elements are evaluated. A_yx: ndarray Matrix in which to put calculated elements. Take care: Due to the difference in Fortran/C array order and the inherent BLAS nature, the matrix has to be filled in transposed (conjugated in future?). """ assert bra_xG.shape[1:] == ket_yG.shape[1:] assert (ket_yG.shape[0], bra_xG.shape[0]) == A_yx.shape if square is None: square = (bra_xG.shape[0] == ket_yG.shape[0]) dv = self.gd.dv if ket_yG is bra_xG: rk(dv, bra_xG, 0.0, A_yx) elif self.hermitian and square: r2k(0.5 * dv, bra_xG, ket_yG, 0.0, A_yx) else: gemm(dv, bra_xG, ket_yG, 0.0, A_yx, 'c')
def integrate(self, a_xg, b_yg=None,
              global_integral=True, hermitian=False,
              _transposed_result=None):
    """Integrate function(s) over domain.

    a_xg: ndarray
        Function(s) to be integrated.  The last three axes are the
        grid axes; any leading axes index the functions.
    b_yg: ndarray
        If present, integrate a_xg.conj() * b_yg.
    global_integral: bool
        If the array(s) are distributed over several domains, then the
        total sum will be returned.  To get the local contribution
        only, use global_integral=False.
    hermitian: bool
        Result is hermitian.
    _transposed_result: ndarray
        Long story.  Don't use this unless you are a method of the
        MatrixOperator class ...  When given, it receives the
        (transposed) result and no global sum is performed.
    """
    xshape = a_xg.shape[:-3]

    if b_yg is None:
        # Only one array: plain sum over the grid axes.
        flat_xg = a_xg.reshape(xshape + (-1,))
        result = flat_xg.sum(axis=-1) * self.dv
        if not global_integral:
            return result
        if result.ndim == 0:
            # Scalars cannot be summed in place.
            return self.comm.sum(result)
        self.comm.sum(result)
        return result

    # Two arrays: flatten the leading axes so BLAS sees 2D problems.
    A_xg = np.ascontiguousarray(a_xg.reshape((-1,) + a_xg.shape[-3:]))
    B_yg = np.ascontiguousarray(b_yg.reshape((-1,) + b_yg.shape[-3:]))

    if _transposed_result is not None:
        result_yx = _transposed_result
        global_integral = False
    else:
        result_yx = np.zeros((len(B_yg), len(A_xg)), A_xg.dtype)

    if a_xg is b_yg:
        rk(self.dv, A_xg, 0.0, result_yx)
    elif hermitian:
        r2k(0.5 * self.dv, A_xg, B_yg, 0.0, result_yx)
    else:
        gemm(self.dv, A_xg, B_yg, 0.0, result_yx, 'c')

    if global_integral:
        self.comm.sum(result_yx)

    yshape = b_yg.shape[:-3]
    result = result_yx.T.reshape(xshape + yshape)
    return result.item() if result.ndim == 0 else result
def _pseudo_braket(self, bra_xG, ket_yG, A_yx, square=None): """Calculate matrix elements of braket pairs of pseudo wave functions. Low-level helper function. Results will be put in the *A_yx* array:: / ~ * ~ A = | dG bra (G) ket (G) nn' / n n' Parameters: bra_xG: ndarray Set of bra-like vectors in which the matrix elements are evaluated. key_yG: ndarray Set of ket-like vectors in which the matrix elements are evaluated. A_yx: ndarray Matrix in which to put calculated elements. Take care: Due to the difference in Fortran/C array order and the inherent BLAS nature, the matrix has to be filled in transposed (conjugated in future?). """ assert bra_xG.shape[1:] == ket_yG.shape[1:] assert (ket_yG.shape[0], bra_xG.shape[0]) == A_yx.shape if square is None: square = (bra_xG.shape[0]==ket_yG.shape[0]) dv = self.gd.dv if ket_yG is bra_xG: rk(dv, bra_xG, 0.0, A_yx) elif self.hermitian and square: r2k(0.5 * dv, bra_xG, ket_yG, 0.0, A_yx) else: gemm(dv, bra_xG, ket_yG, 0.0, A_yx, 'c')
def update_hilbert(self, n_mG, deps_m, df_m, chi0_wGG):
    """Accumulate transitions into the two neighbouring frequency slots.

    For each transition (one entry of *deps_m*, *df_m* and *n_mG*) the
    energy is divided by ``self.omega_w[1]`` to get a fractional
    frequency index.  The transition weight *df* is split linearly
    between the slots ``iw`` and ``iw + 1`` and added to
    ``chi0_wGG[iw:iw + 2]`` with ``rk`` (scaled by ``self.prefactor``).

    NOTE(review): using ``omega_w[1]`` as the spacing presumably
    assumes a uniform grid starting at zero -- confirm with the caller.
    """
    spacing = self.omega_w[1]
    for eps, occ_diff, pair_G in zip(deps_m, df_m, n_mG):
        w = eps / spacing
        iw = int(w)
        # Linear interpolation weights for slots iw and iw + 1.
        lower = 1 - w + iw
        upper = w - iw
        weights = occ_diff * np.array([[lower], [upper]])
        # rk squares its input, hence the square root of the weights.
        x_2G = pair_G * weights**0.5
        target_2GG = chi0_wGG[iw:iw + 2]
        rk(self.prefactor, x_2G, 1.0, target_2GG)
def integrate(self, a_xg, b_yg=None,
              global_integral=True, hermitian=False,
              _transposed_result=None):
    """Integrate function(s) over domain.

    a_xg: ndarray
        Function(s) to be integrated (grid axes last).
    b_yg: ndarray
        If present, integrate a_xg.conj() * b_yg.
    global_integral: bool
        If the array(s) are distributed over several domains, then the
        total sum will be returned.  To get the local contribution
        only, use global_integral=False.
    hermitian: bool
        Result is hermitian.
    _transposed_result: ndarray
        Long story.  Don't use this unless you are a method of the
        MatrixOperator class ...  Supplying it also switches off the
        global sum.
    """

    def as_rows(f_xg):
        # Collapse all leading axes into one; keep the three grid axes.
        return np.ascontiguousarray(f_xg.reshape((-1,) + f_xg.shape[-3:]))

    xshape = a_xg.shape[:-3]

    if b_yg is None:
        # Only one array:
        result = a_xg.reshape(xshape + (-1,)).sum(axis=-1) * self.dv
        if global_integral:
            if result.ndim != 0:
                self.comm.sum(result)  # in-place sum for arrays
            else:
                result = self.comm.sum(result)
        return result

    A_xg = as_rows(a_xg)
    B_yg = as_rows(b_yg)

    if _transposed_result is None:
        result_yx = np.zeros((len(B_yg), len(A_xg)), A_xg.dtype)
    else:
        result_yx = _transposed_result
        global_integral = False

    dv = self.dv
    if a_xg is b_yg:
        rk(dv, A_xg, 0.0, result_yx)
    elif hermitian:
        r2k(0.5 * dv, A_xg, B_yg, 0.0, result_yx)
    else:
        gemm(dv, A_xg, B_yg, 0.0, result_yx, 'c')

    if global_integral:
        self.comm.sum(result_yx)

    # The BLAS result is transposed with respect to the x, y ordering.
    result = result_yx.T.reshape(xshape + b_yg.shape[:-3])
    if result.ndim == 0:
        return result.item()
    return result
def update_hermitian(self, n_mG, deps_m, df_m, chi0_wGG):
    """Add the transitions to *chi0_wGG* for every frequency.

    Loops over ``self.omega_w`` and updates ``chi0_wGG[w]`` in place.
    Only the imaginary part of each frequency enters the weights.

    * With a single rank in ``self.blockcomm`` the update is a rank-k
      update (``rk``) of the conjugated, square-root weighted pair
      densities.
    * Otherwise a general product (``mmm``) is used with the columns
      ``self.Ga:self.Gb`` of *n_mG* on the left-hand side.
    """
    for w, omega in enumerate(self.omega_w):
        chi0_GG = chi0_wGG[w]

        if self.blockcomm.size != 1:
            # Block-distributed case: only this rank's G-columns.
            x_m = 2 * df_m * deps_m / (omega.imag**2 + deps_m**2)
            mynx_mG = n_mG[:, self.Ga:self.Gb] * x_m[:, None]
            mmm(self.prefactor, mynx_mG, 'c', n_mG, 'n', 1.0, chi0_GG)
            continue

        # Serial case: rk multiplies the operand with itself, so the
        # square root of the weight is folded into the pair densities.
        x_m = (-2 * df_m * deps_m / (omega.imag**2 + deps_m**2))**0.5
        nx_mG = n_mG.conj() * x_m[:, None]
        rk(-self.prefactor, nx_mG, 1.0, chi0_GG, 'n')
def multiply(self, alpha, a, opa, b, opb, beta, c, symmetric):
    """Dispatch ``c <- alpha * op(a) * op(b) + beta * c`` to a BLAS call.

    If *symmetric* is false the general ``blas.mmm`` routine is used.
    Otherwise *opa* must be 'N' and *opb* must be 'C' (or 'T' for
    float matrices); then ``blas.rk`` is used when *a* and *b* are the
    same object and ``blas.r2k`` (with half of *alpha*) when they are
    not.  In the ``r2k`` case nothing is done if ``beta == 1.0`` and
    *a* has zero columns.

    The operands are expected to expose their data as ``.array``.
    """
    if not symmetric:
        blas.mmm(alpha, a.array, opa, b.array, opb, beta, c.array)
        return

    assert opa == 'N'
    assert opb == 'C' or opb == 'T' and a.dtype == float

    if a is b:
        blas.rk(alpha, a.array, beta, c.array)
        return

    nothing_to_add = beta == 1.0 and a.shape[1] == 0
    if nothing_to_add:
        return

    blas.r2k(0.5 * alpha, a.array, b.array, beta, c.array)
def update_hermitian(self, n_mG, deps_m, wd, chi0_wGG):
    """If eta=0 use hermitian update.

    The frequencies are taken from ``wd.get_data()`` and
    ``chi0_wGG[w]`` is updated in place for each of them.

    Note that *deps_m* is modified in place: every entry is shifted
    away from zero by ``self.eshift`` (sign-preserving) before use.

    With a single rank in ``self.blockcomm`` the update is done with
    ``rk``; otherwise ``mmm`` is used with the columns
    ``self.Ga:self.Gb`` of *n_mG*.
    """
    omega_w = wd.get_data()

    # In-place shift; callers see the modified array.
    deps_m += self.eshift * np.sign(deps_m)

    for w, omega in enumerate(omega_w):
        chi0_GG = chi0_wGG[w]

        if self.blockcomm.size != 1:
            x_m = 2 * deps_m / (omega.imag**2 + deps_m**2)
            mynx_mG = n_mG[:, self.Ga:self.Gb] * x_m[:, None]
            mmm(1.0, mynx_mG, 'C', n_mG, 'N', 1.0, chi0_GG)
            continue

        # "+ 0j" makes the base complex so that the square root of a
        # negative weight is imaginary instead of nan.
        x_m = (-2 * deps_m / (omega.imag**2 + deps_m**2) + 0j)**0.5
        nx_mG = n_mG.conj() * x_m[:, None]
        rk(-1.0, nx_mG, 1.0, chi0_GG, 'n')
def integrate(self, a_xg, b_yg=None,
              global_integral=True, hermitian=False,
              _transposed_result=None):
    """Integrate function(s) over domain.

    a_xg: ndarray
        Function(s) to be integrated.  May also be a
        ``mic.OffloadArray``, in which case the two-array products are
        carried out with the ``mic_*`` routines.
    b_yg: ndarray
        If present, integrate a_xg.conj() * b_yg.
    global_integral: bool
        If the array(s) are distributed over several domains, then the
        total sum will be returned.  To get the local contribution
        only, use global_integral=False.
    hermitian: bool
        Result is hermitian.
    _transposed_result: ndarray
        Long story.  Don't use this unless you are a method of the
        MatrixOperator class ...

    NOTE(review): in the hermitian case the offload path calls
    ``mic_r2k`` with ``self.dv`` while the host path calls ``r2k`` with
    ``0.5 * self.dv`` -- confirm that ``mic_r2k`` applies the factor
    one half itself.
    """
    xshape = a_xg.shape[:-3]

    if b_yg is None:
        # Only one array:
        result = a_xg.reshape(xshape + (-1,)).sum(axis=-1) * self.dv
        if global_integral:
            if result.ndim == 0:
                result = self.comm.sum(result)
            else:
                self.comm.sum(result)
        return result

    offloaded = isinstance(a_xg, mic.OffloadArray)

    if offloaded:
        # offload arrays have to be contiguous in any case
        A_xg, B_yg = a_xg, b_yg
    else:
        A_xg = np.ascontiguousarray(a_xg.reshape((-1,) + a_xg.shape[-3:]))
        B_yg = np.ascontiguousarray(b_yg.reshape((-1,) + b_yg.shape[-3:]))

    if _transposed_result is None:
        result_yx = np.zeros((len(B_yg), len(A_xg)), A_xg.dtype)
    else:
        result_yx = _transposed_result
        global_integral = False

    if offloaded:
        # Bind the host result buffer to the device before computing.
        result_yx_mic = stream.bind(result_yx)
        stream.sync()

        if a_xg is b_yg:
            # dsyrk performs badly in MIC so use dgemm here
            mic_gemm(self.dv, A_xg, A_xg, 0.0, result_yx_mic, 'c')
        elif hermitian:
            mic_r2k(self.dv, A_xg, B_yg, 0.0, result_yx_mic)
        else:
            mic_gemm(self.dv, A_xg, B_yg, 0.0, result_yx_mic, 'c')

        # Copy the device result back into result_yx.
        result_yx_mic.update_host()
        stream.sync()
    elif a_xg is b_yg:
        rk(self.dv, A_xg, 0.0, result_yx)
    elif hermitian:
        r2k(0.5 * self.dv, A_xg, B_yg, 0.0, result_yx)
    else:
        gemm(self.dv, A_xg, B_yg, 0.0, result_yx, 'c')

    if global_integral:
        self.comm.sum(result_yx)

    yshape = b_yg.shape[:-3]
    result = result_yx.T.reshape(xshape + yshape)
    if result.ndim == 0:
        return result.item()
    return result
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check the simple PBLAS wrappers against serial BLAS results.

    Random matrices are created, reference results for gemm, gemv, r2k
    and rk are computed with the serial routines on rank 0, the same
    operations are carried out on block-cyclically distributed copies,
    and the collected results are compared within ``tol``.

    M, N, K: int
        Matrix dimensions: A and D are M x K, B is K x N.
    seed: int
        Seed for the random number generator.
    mprocs, nprocs: int
        Shape of the BLACS process grid.
    dtype: type
        ``float`` or ``complex``; for complex a random imaginary part
        is added to the input matrices.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)

    # Populate matrices local to master:
    A0 = gen.rand(*globA.shape) + epsilon * gen.rand(*globA.shape)
    B0 = gen.rand(*globB.shape) + epsilon * gen.rand(*globB.shape)
    D0 = gen.rand(*globD.shape) + epsilon * gen.rand(*globD.shape)
    X0 = gen.rand(*globX.shape) + epsilon * gen.rand(*globX.shape)

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates

    # Local reference matrix product:
    if rank == 0:
        # C0[:] = np.dot(A0, B0)
        gemm(1.0, B0, A0, 0.0, C0)
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Y0[:] = np.dot(A0, X0)
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates

    Redistributor(world, globA, distA).redistribute(A0, A)
    Redistributor(world, globB, distB).redistribute(B0, B)
    Redistributor(world, globX, distX).redistribute(X0, X)
    Redistributor(world, globD, distD).redistribute(D0, D)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa='T')
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)

    # Collect result back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    Redistributor(world, distC, globC).redistribute(C, C1)
    Redistributor(world, distY, globY).redistribute(Y, Y1)
    Redistributor(world, distS, globS).redistribute(S, S1)
    Redistributor(world, distU, globU).redistribute(U, U1)

    if rank == 0:
        gemm_err = abs(C1 - C0).max()
        gemv_err = abs(Y1 - Y0).max()
        r2k_err = abs(S1 - S0).max()
        rk_err = abs(U1 - U0).max()
        print('gemm err', gemm_err)
        print('gemv err', gemv_err)
        print('r2k err', r2k_err)
        print('rk_err', rk_err)
    else:
        gemm_err = 0.0
        gemv_err = 0.0
        r2k_err = 0.0
        rk_err = 0.0

    # We don't like exceptions on only one cpu, so every rank gets the
    # errors measured on rank 0 before they are checked.
    gemm_err = world.sum(gemm_err)
    gemv_err = world.sum(gemv_err)
    r2k_err = world.sum(r2k_err)
    rk_err = world.sum(rk_err)

    equal(gemm_err, 0, tol)
    equal(gemv_err, 0, tol)
    equal(r2k_err, 0, tol)
    equal(rk_err, 0, tol)
def main(N=73, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check the ScaLAPACK wrappers against serial LAPACK results.

    A random symmetric/Hermitian Hamiltonian and overlap matrix are
    built on rank 0 and solved with the serial routines.  The same
    matrices are then distributed and treated with the
    divide-and-conquer eigensolvers and the inverse Cholesky routine,
    and the results are compared within ``tol``.

    N: int
        Dimension of the square matrices.
    seed: int
        Seed for the random number generator.
    mprocs, nprocs: int
        Shape of the BLACS process grid.
    dtype: type
        ``float`` or ``complex``.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    glob = grid.new_descriptor(N, N, N, N)

    # Populate matrices local to master:
    H0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    S0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    C0 = glob.empty(dtype=dtype)
    if rank == 0:
        # Complex case must have real numbers on the diagonal.
        # We make a simple complex Hermitian matrix below.
        H0 = H0 + epsilon * (0.1 * np.tri(N, N, k=-N // nprocs) +
                             0.3 * np.tri(N, N, k=-1))
        S0 = S0 + epsilon * (0.2 * np.tri(N, N, k=-N // nprocs) +
                             0.4 * np.tri(N, N, k=-1))
        # Make matrices symmetric
        rk(1.0, H0.copy(), 0.0, H0)
        rk(1.0, S0.copy(), 0.0, S0)
        # Overlap matrix must be semi-positive definite
        S0 = S0 + 50.0 * np.eye(N, N, 0)
        # Hamiltonian is usually diagonally dominant
        H0 = H0 + 75.0 * np.eye(N, N, 0)
        C0 = S0.copy()

    # Local result matrices
    W0 = np.empty((N), dtype=float)
    W0_g = np.empty((N), dtype=float)

    # Calculate eigenvalues
    if rank == 0:
        diagonalize(H0.copy(), W0)
        general_diagonalize(H0.copy(), W0_g, S0.copy())
        inverse_cholesky(C0)  # result returned in lower triangle

    assert glob.check(H0) and glob.check(S0) and glob.check(C0)

    # Create distributed descriptors with various block sizes:
    dist = grid.new_descriptor(N, N, 8, 8)

    # Distributed matrices:
    # We can use empty here, but end up with garbage on
    # on the other half of the triangle when we redistribute.
    # This is fine because ScaLAPACK does not care.
    H = dist.empty(dtype=dtype)
    S = dist.empty(dtype=dtype)
    Z = dist.empty(dtype=dtype)
    C = dist.empty(dtype=dtype)

    # Eigenvalues are non-BLACS matrices
    W_dc = np.empty((N), dtype=float)
    W_g_dc = np.empty((N), dtype=float)

    Glob2dist = Redistributor(world, glob, dist)
    Glob2dist.redistribute(H0, H, uplo='L')
    Glob2dist.redistribute(S0, S, uplo='L')
    Glob2dist.redistribute(S0, C, uplo='L')  # C0 was previously overwritten

    # We don't test the expert drivers (the "_ex" and "_mr3" variants)
    # anymore since there might be a buffer overflow error; only the
    # divide-and-conquer ("_dc") drivers are exercised.
    scalapack_diagonalize_dc(dist, H.copy(), Z, W_dc, 'L')
    scalapack_general_diagonalize_dc(dist, H.copy(), S.copy(), Z,
                                     W_g_dc, 'L')
    scalapack_inverse_cholesky(dist, C, 'L')

    # Undo redistribute
    C_test = glob.empty(dtype=dtype)
    Dist2glob = Redistributor(world, dist, glob)
    Dist2glob.redistribute(C, C_test)

    if rank == 0:
        diag_dc_err = abs(W_dc - W0).max()
        general_diag_dc_err = abs(W_g_dc - W0_g).max()
        inverse_chol_err = abs(C_test - C0).max()
        print('diagonalize dc err', diag_dc_err)
        print('general diagonalize dc err', general_diag_dc_err)
        print('inverse chol err', inverse_chol_err)
    else:
        diag_dc_err = 0.0
        general_diag_dc_err = 0.0
        inverse_chol_err = 0.0

    # We don't like exceptions on only one cpu
    diag_dc_err = world.sum(diag_dc_err)
    general_diag_dc_err = world.sum(general_diag_dc_err)
    inverse_chol_err = world.sum(inverse_chol_err)
    assert diag_dc_err < tol
    assert general_diag_dc_err < tol
    assert inverse_chol_err < tol
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check the simple PBLAS wrappers against serial reference results.

    Random matrices are created, reference results for gemm, gemv,
    r2k, rk and hemm are computed on rank 0, the same operations are
    carried out on block-cyclically distributed copies, and the
    collected results are compared within ``tol``.

    M, N, K: int
        Matrix dimensions: A and D are M x K, B is K x N and the
        Hermitian matrix used for hemm is K x K.
    seed: int
        Seed for the random number generator.
    mprocs, nprocs: int
        Shape of the BLACS process grid.
    dtype: type
        ``float`` or ``complex``; for complex a random imaginary part
        is added to the input matrices.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)
    globHEC = grid.new_descriptor(K, K, K, K)

    # Populate matrices local to master:
    A0 = gen.rand(*globA.shape) + epsilon * gen.rand(*globA.shape)
    B0 = gen.rand(*globB.shape) + epsilon * gen.rand(*globB.shape)
    D0 = gen.rand(*globD.shape) + epsilon * gen.rand(*globD.shape)
    X0 = gen.rand(*globX.shape) + epsilon * gen.rand(*globX.shape)

    # HEC = HEA * B
    HEA0 = gen.rand(*globHEC.shape) + epsilon * gen.rand(*globHEC.shape)
    if world.rank == 0:
        HEA0 = HEA0 + HEA0.T.conjugate()  # Make HEA0 hermitian

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC0 = globB.zeros(dtype=dtype)

    # Local reference matrix product:
    if rank == 0:
        # C0[:] = np.dot(A0, B0)
        gemm(1.0, B0, A0, 0.0, C0)
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Y0[:] = np.dot(A0, X0)
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

        HEC0[:] = np.dot(HEA0, B0)
        sM, sN = HEA0.shape
        # We don't use upper diagonal: overwrite every element with
        # i < j by a marker value, so the hemm call below (uplo="L")
        # would give a wrong result if it ever read them.
        HEA0[np.triu_indices(sM, k=1, m=sN)] = 99999.0
        if world.rank == 0:
            print(HEA0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)
    distHE = grid.new_descriptor(K, K, 2, 4)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC = distB.zeros(dtype=dtype)
    HEA = distHE.zeros(dtype=dtype)

    Redistributor(world, globA, distA).redistribute(A0, A)
    Redistributor(world, globB, distB).redistribute(B0, B)
    Redistributor(world, globX, distX).redistribute(X0, X)
    Redistributor(world, globD, distD).redistribute(D0, D)
    Redistributor(world, globHEC, distHE).redistribute(HEA0, HEA)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa="T")
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)
    pblas_simple_hemm(distHE, distB, distB, HEA, B, HEC,
                      uplo="L", side="L")

    # Collect result back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC1 = globB.zeros(dtype=dtype)
    Redistributor(world, distC, globC).redistribute(C, C1)
    Redistributor(world, distY, globY).redistribute(Y, Y1)
    Redistributor(world, distS, globS).redistribute(S, S1)
    Redistributor(world, distU, globU).redistribute(U, U1)
    Redistributor(world, distB, globB).redistribute(HEC, HEC1)

    if rank == 0:
        gemm_err = abs(C1 - C0).max()
        gemv_err = abs(Y1 - Y0).max()
        r2k_err = abs(S1 - S0).max()
        rk_err = abs(U1 - U0).max()
        hemm_err = abs(HEC1 - HEC0).max()
        print("gemm err", gemm_err)
        print("gemv err", gemv_err)
        print("r2k err", r2k_err)
        print("rk_err", rk_err)
        print("hemm_err", hemm_err)
    else:
        gemm_err = 0.0
        gemv_err = 0.0
        r2k_err = 0.0
        rk_err = 0.0
        hemm_err = 0.0

    # We don't like exceptions on only one cpu, so every rank gets the
    # errors measured on rank 0 before they are checked.
    gemm_err = world.sum(gemm_err)
    gemv_err = world.sum(gemv_err)
    r2k_err = world.sum(r2k_err)
    rk_err = world.sum(rk_err)
    hemm_err = world.sum(hemm_err)

    equal(gemm_err, 0, tol)
    equal(gemv_err, 0, tol)
    equal(r2k_err, 0, tol)
    equal(rk_err, 0, tol)
    equal(hemm_err, 0, tol)
assert not c.any() # Check gemm for transa='c' a = np.arange(4 * 5 * 1 * 3).reshape(4, 5, 1, 3) * (3. - 2.j) + 4. c = np.tensordot(a, a2.conj(), [[1, 2, 3], [1, 2, 3]]) gemm(1., a2, a, -1., c, 'c') assert not c.any() # Check axpy c = 5.j * a axpy(-5.j, a, c) assert not c.any() # Check rk c = np.tensordot(a, a.conj(), [[1, 2, 3], [1, 2, 3]]) rk(1., a, -1., c) tri2full(c) assert not c.any() # Check gemmdot for transa='c' c = np.tensordot(a, a2.conj(), [-1, -1]) gemmdot(a, a2, beta=-1., out=c, trans='c') assert not c.any() # Check gemmdot for transa='n' a2.shape = 3, 7, 5, 1 c = np.tensordot(a, a2, [-1, 0]) gemmdot(a, a2, beta=-1., out=c, trans='n') assert not c.any() # Check r2k
def update_hermitian(self, n_mG, deps_m, df_m, chi0_wGG):
    """Add the transitions to *chi0_wGG* with a rank-k update.

    For every frequency in ``self.omega_w`` (only its imaginary part
    is used) the transposed pair densities are scaled by the square
    root of the transition weights and accumulated into
    ``chi0_wGG[w]`` with ``rk``, scaled by ``-self.prefactor``.
    """
    for w, omega in enumerate(self.omega_w):
        denominator_m = omega.imag**2 + deps_m**2
        # rk multiplies the operand with itself, hence the square root.
        x_m = (-2 * df_m * deps_m / denominator_m)**0.5
        # Contiguous transposed copy, scaled along the last axis.
        nx_mG = n_mG.T.copy() * x_m
        rk(-self.prefactor, nx_mG, 1.0, chi0_wGG[w])
def main(N=72, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check the ScaLAPACK wrappers against serial results.

    A random symmetric/Hermitian Hamiltonian and overlap matrix are
    built on rank 0 and treated with the serial routines.  The same
    matrices are then distributed and treated with the
    divide-and-conquer eigensolvers, the inverse Cholesky routine and
    (complex only) the matrix inverse, and the results are compared
    within ``tol``.

    N: int
        Dimension of the square matrices.
    seed: int
        Seed for the random number generator.
    mprocs, nprocs: int
        Shape of the BLACS process grid.
    dtype: type
        ``float`` or ``complex``.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    glob = grid.new_descriptor(N, N, N, N)

    # Populate matrices local to master:
    H0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    S0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    C0 = glob.empty(dtype=dtype)
    if rank == 0:
        # Complex case must have real numbers on the diagonal.
        # We make a simple complex Hermitian matrix below.
        H0 = H0 + epsilon * (0.1 * np.tri(N, N, k=-N // nprocs) +
                             0.3 * np.tri(N, N, k=-1))
        S0 = S0 + epsilon * (0.2 * np.tri(N, N, k=-N // nprocs) +
                             0.4 * np.tri(N, N, k=-1))
        # Make matrices symmetric
        rk(1.0, H0.copy(), 0.0, H0)
        rk(1.0, S0.copy(), 0.0, S0)
        # Overlap matrix must be semi-positive definite
        S0 = S0 + 50.0 * np.eye(N, N, 0)
        # Hamiltonian is usually diagonally dominant
        H0 = H0 + 75.0 * np.eye(N, N, 0)
        C0 = S0.copy()
        S0_inv = S0.copy()

    # Local result matrices
    W0 = np.empty((N), dtype=float)
    W0_g = np.empty((N), dtype=float)

    # Calculate eigenvalues / other serial results
    if rank == 0:
        diagonalize(H0.copy(), W0)
        general_diagonalize(H0.copy(), W0_g, S0.copy())
        inverse_cholesky(C0)  # result returned in lower triangle
        tri2full(S0_inv, 'L')
        S0_inv = inv(S0_inv)

    assert glob.check(H0) and glob.check(S0) and glob.check(C0)

    # Create distributed descriptors with various block sizes:
    dist = grid.new_descriptor(N, N, 8, 8)

    # Distributed matrices:
    # We can use empty here, but end up with garbage on
    # on the other half of the triangle when we redistribute.
    # This is fine because ScaLAPACK does not care.
    H = dist.empty(dtype=dtype)
    S = dist.empty(dtype=dtype)
    Sinv = dist.empty(dtype=dtype)
    Z = dist.empty(dtype=dtype)
    C = dist.empty(dtype=dtype)

    # Eigenvalues are non-BLACS matrices
    W_dc = np.empty((N), dtype=float)
    W_g_dc = np.empty((N), dtype=float)

    Glob2dist = Redistributor(world, glob, dist)
    Glob2dist.redistribute(H0, H, uplo='L')
    Glob2dist.redistribute(S0, S, uplo='L')
    Glob2dist.redistribute(S0, C, uplo='L')  # C0 was previously overwritten
    Glob2dist.redistribute(S0, Sinv, uplo='L')

    # We don't test the expert drivers (the "_ex" and "_mr3" variants)
    # anymore since there might be a buffer overflow error; only the
    # divide-and-conquer ("_dc") drivers are exercised.
    scalapack_diagonalize_dc(dist, H.copy(), Z, W_dc, 'L')
    scalapack_general_diagonalize_dc(dist, H.copy(), S.copy(), Z,
                                     W_g_dc, 'L')
    scalapack_inverse_cholesky(dist, C, 'L')

    if dtype == complex:  # Only supported for complex for now
        scalapack_inverse(dist, Sinv, 'L')

    # Undo redistribute
    C_test = glob.empty(dtype=dtype)
    Sinv_test = glob.empty(dtype=dtype)
    Dist2glob = Redistributor(world, dist, glob)
    Dist2glob.redistribute(C, C_test)
    Dist2glob.redistribute(Sinv, Sinv_test)

    if rank == 0:
        diag_dc_err = abs(W_dc - W0).max()
        general_diag_dc_err = abs(W_g_dc - W0_g).max()
        inverse_chol_err = abs(C_test - C0).max()

        # For float dtype Sinv was never inverted, so inverse_err is
        # only meaningful (and only checked) in the complex case.
        tri2full(Sinv_test, 'L')
        inverse_err = abs(Sinv_test - S0_inv).max()

        print('diagonalize dc err', diag_dc_err)
        print('general diagonalize dc err', general_diag_dc_err)
        print('inverse chol err', inverse_chol_err)
        if dtype == complex:
            print('inverse err', inverse_err)
    else:
        diag_dc_err = 0.0
        general_diag_dc_err = 0.0
        inverse_chol_err = 0.0
        inverse_err = 0.0

    # We don't like exceptions on only one cpu
    diag_dc_err = world.sum(diag_dc_err)
    general_diag_dc_err = world.sum(general_diag_dc_err)
    inverse_chol_err = world.sum(inverse_chol_err)
    inverse_err = world.sum(inverse_err)
    assert diag_dc_err < tol
    assert general_diag_dc_err < tol
    assert inverse_chol_err < tol
    if dtype == complex:
        assert inverse_err < tol
def makeU(gpwfile='grid.gpw', orbitalfile='w_wG__P_awi.pckl',
          rotationfile='eps_q__U_pq.pckl', tolerance=1e-5,
          writeoptimizedpairs=False, dppname='D_pp.pckl', S_w=None):
    """Build and diagonalize the pair-orbital overlap matrix.

    Orbitals are read from *orbitalfile*, all products of two orbitals
    ("pair orbitals") are formed on the grid and their overlap matrix
    D_pp (including atomic corrections) is summed to the master.  The
    master dumps D_pp to *dppname*, diagonalizes it, drops eigenpairs
    with eigenvalue not above *tolerance* and pickles ``(eps_q, U_pq)``
    to *rotationfile*.

    gpwfile: str
        GPAW restart file providing grid descriptor and setups.
    orbitalfile: str
        Pickle file holding ``(w_wG, P_awi)``.
    rotationfile: str
        Output pickle file for eigenvalues and rotation matrix.
    tolerance: float
        Tolerance used for truncation of optimized pair orbitals.
    writeoptimizedpairs: False or str
        If not False, file name to which the optimized pair orbitals
        and their projections are pickled (serial runs only).
    dppname: str
        File to which the pair-orbital overlap matrix is dumped.
    S_w: None or sequence
        None or diagonal of overlap matrix.  In the latter case the
        optimized and truncated pair orbitals are obtained from
        normalized (to 1) orbitals.
    """
    from gpaw import GPAW
    from gpaw.utilities import pack
    from gpaw.utilities.blas import rk, gemm
    from gpaw.mpi import world, MASTER

    calc = GPAW(gpwfile, txt='pairorb.txt')  # XXX
    gd = calc.wfs.gd
    setups = calc.wfs.setups
    myatoms = calc.density.D_asp.keys()
    del calc

    # Load orbitals on master and distribute to slaves
    if world.rank == MASTER:
        # Pickle files must be opened in binary mode.
        # NOTE: pickle can execute arbitrary code while loading; only
        # use orbital files from a trusted source.
        with open(orbitalfile, 'rb') as fd:
            wglobal_wG, P_awi = pickle.load(fd)
        Nw = len(wglobal_wG)
        print('Estimated total (serial) mem usage: %0.3f GB' % (
            np.prod(gd.N_c) * Nw**2 * 8 / 1024.**3))
    else:
        wglobal_wG = None
        Nw = 0
    Nw = gd.comm.sum(Nw)  # distribute Nw to all nodes
    w_wG = gd.empty(n=Nw)
    gd.distribute(wglobal_wG, w_wG)
    del wglobal_wG

    # Make pairorbitals
    f_pG = gd.zeros(n=Nw**2)
    for p, (w1, w2) in enumerate(np.ndindex(Nw, Nw)):
        np.multiply(w_wG[w1], w_wG[w2], f_pG[p])
    del w_wG
    assert f_pG.flags.contiguous

    # Make pairorbital overlap (lower triangle only)
    D_pp = np.zeros((Nw**2, Nw**2))
    rk(gd.dv, f_pG, 0., D_pp)

    # Add atomic corrections to pairorbital overlap
    # NOTE(review): P_awi is only loaded on the master rank, so this
    # loop looks like it would fail on other ranks owning non-ghost
    # atoms -- confirm whether parallel runs are supported.
    for a in myatoms:
        if setups[a].type != 'ghost':
            P_pp = np.array([pack(np.outer(P_awi[a][w1], P_awi[a][w2]))
                             for w1, w2 in np.ndindex(Nw, Nw)])
            I4_pp = setups[a].four_phi_integrals()
            A = np.zeros((len(I4_pp), len(P_pp)))
            gemm(1.0, P_pp, I4_pp, 0.0, A, 't')
            gemm(1.0, A, P_pp, 1.0, D_pp)
            # D_pp += np.dot(P_pp, np.dot(I4_pp, P_pp.T))

    # Sum all contributions to master
    gd.comm.sum(D_pp, MASTER)

    if world.rank == MASTER:
        if S_w is not None:
            print('renormalizing pairorb overlap matrix (D_pp)')
            S2 = np.sqrt(S_w)
            for pa, (wa1, wa2) in enumerate(np.ndindex(Nw, Nw)):
                for pb, (wb1, wb2) in enumerate(np.ndindex(Nw, Nw)):
                    D_pp[pa, pb] /= S2[wa1] * S2[wa2] * S2[wb1] * S2[wb2]

        # XXX if the diagonalization below (on MASTER only)
        # fails, then one can always restart the stuff
        # below using only the stored D_pp matrix
        D_pp.dump(dppname)

        # Determine eigenvalues and vectors on master only
        eps_q, U_pq = np.linalg.eigh(D_pp, UPLO='L')
        del D_pp
        indices = np.argsort(-eps_q.real)  # largest eigenvalue first
        eps_q = np.ascontiguousarray(eps_q.real[indices])
        U_pq = np.ascontiguousarray(U_pq[:, indices])

        # Truncate
        indices = eps_q > tolerance
        U_pq = np.ascontiguousarray(U_pq[:, indices])
        eps_q = np.ascontiguousarray(eps_q[indices])

        # Dump to file
        with open(rotationfile, 'wb') as fd:
            pickle.dump((eps_q, U_pq), fd, 2)

    if writeoptimizedpairs is not False:
        assert world.size == 1  # works in parallel if U and eps are broadcast
        Uisq_qp = (U_pq / np.sqrt(eps_q)).T.copy()
        g_qG = gd.zeros(n=len(eps_q))
        gemm(1.0, f_pG, Uisq_qp, 0.0, g_qG)
        g_qG = gd.collect(g_qG)
        if world.rank == MASTER:
            P_app = {a: np.array([pack(np.outer(P_wi[w1], P_wi[w2]),
                                       tolerance=1e3)
                                  for w1, w2 in np.ndindex(Nw, Nw)])
                     for a, P_wi in P_awi.items()}
            P_aqp = {a: np.dot(Uisq_qp, P_pp) for a, P_pp in P_app.items()}
            with open(writeoptimizedpairs, 'wb') as fd:
                pickle.dump((g_qG, P_aqp), fd, 2)
def integrate(self, a_xg, b_yg=None,
              global_integral=True, hermitian=False,
              _transposed_result=None):
    """Integrate function(s) over domain.

    a_xg: ndarray
        Function(s) to be integrated; the last axis holds the
        coefficients.
    b_yg: ndarray
        If present, integrate a_xg.conj() * b_yg.
    global_integral: bool
        If the array(s) are distributed over several domains, then the
        total sum will be returned.  To get the local contribution
        only, use global_integral=False.  (Not used by this
        implementation.)
    hermitian: bool
        Result is hermitian.
    _transposed_result: ndarray
        Long story.  Don't use this unless you are a method of the
        MatrixOperator class ...
    """
    if b_yg is None:
        # Only one array: the integral is given by the first
        # coefficient alone.
        assert self.dtype == float
        return a_xg[..., 0].real * self.gd.dv

    real_mode = self.dtype == float

    A_xg = a_xg.reshape((-1, a_xg.shape[-1]))
    B_yg = b_yg.reshape((-1, b_yg.shape[-1]))

    alpha = self.gd.dv / self.gd.N_c.prod()

    if real_mode:
        # Treat the coefficients as pairs of floats and count every
        # one twice; the first column is corrected for further down.
        alpha *= 2
        A_xg = A_xg.view(float)
        B_yg = B_yg.view(float)

    if _transposed_result is not None:
        result_yx = _transposed_result
    else:
        result_yx = np.zeros((len(B_yg), len(A_xg)), self.dtype)

    if a_xg is b_yg:
        rk(alpha, A_xg, 0.0, result_yx)
    elif hermitian:
        r2k(0.5 * alpha, A_xg, B_yg, 0.0, result_yx)
    else:
        gemm(alpha, A_xg, B_yg, 0.0, result_yx, 'c')

    if real_mode:
        # Remove the excess contribution of the first column, which
        # was doubled like all the others.
        correction_yx = np.outer(B_yg[:, 0], A_xg[:, 0])
        if hermitian:
            result_yx -= 0.25 * alpha * (correction_yx + correction_yx.T)
        else:
            result_yx -= 0.5 * alpha * correction_yx

    xshape = a_xg.shape[:-1]
    yshape = b_yg.shape[:-1]
    result = result_yx.T.reshape(xshape + yshape)

    return result.item() if result.ndim == 0 else result
def makeU(gpwfile='grid.gpw', orbitalfile='w_wG__P_awi.pckl',
          rotationfile='eps_q__U_pq.pckl', tolerance=1e-5,
          writeoptimizedpairs=False, dppname='D_pp.pckl', S_w=None):
    """Diagonalize the pair-orbital overlap matrix and store the rotation.

    All Nw**2 products of the orbitals read from *orbitalfile* are formed
    on the grid of the calculation in *gpwfile*.  Their overlap matrix
    D_pp (grid part plus atomic corrections) is summed to the master,
    optionally renormalized, dumped to *dppname* and diagonalized.  The
    eigenpairs with eigenvalue above *tolerance* are pickled to
    *rotationfile* as ``(eps_q, U_pq)``, sorted by decreasing eigenvalue.

    Parameters:

    gpwfile: str
        File handed to ``GPAW(...)``; only its grid descriptor, setups
        and the keys of ``density.D_asp`` are used.
    orbitalfile: str
        Pickle file holding the tuple ``(w_wG, P_awi)``; read on the
        master only.
    rotationfile: str
        Output pickle file for ``(eps_q, U_pq)``.
    tolerance: float
        Eigenvalues less than or equal to this are truncated away.
    writeoptimizedpairs: False or str
        If not False, file name to which the optimized pair orbitals and
        their projections ``(g_qG, P_aqp)`` are pickled.  Serial only.
    dppname: str
        File to which D_pp is dumped before the diagonalization, so that
        a failed diagonalization can be restarted from it.
    S_w: None or sequence of float
        Diagonal of the orbital overlap matrix.  If given, D_pp is
        rescaled so that it corresponds to orbitals normalized to 1.
    """
    from gpaw import GPAW
    from gpaw.utilities import pack
    from gpaw.utilities.blas import rk, gemm
    from gpaw.mpi import world, MASTER

    calc = GPAW(gpwfile, txt='pairorb.txt')  # XXX
    gd = calc.wfs.gd
    setups = calc.wfs.setups
    myatoms = list(calc.density.D_asp.keys())
    del calc

    # Load orbitals on master and distribute to slaves
    if world.rank == MASTER:
        # Pickles are binary data: open in 'rb' and close the file again.
        # NOTE: unpickling executes code -- only use trusted files.
        with open(orbitalfile, 'rb') as fd:
            wglobal_wG, P_awi = pickle.load(fd)
        Nw = len(wglobal_wG)
        print('Estimated total (serial) mem usage: %0.3f GB' % (
            np.prod(gd.N_c) * Nw**2 * 8 / 1024.**3))
    else:
        wglobal_wG = None
        Nw = 0
    Nw = gd.comm.sum(Nw)  # distribute Nw to all nodes
    w_wG = gd.empty(n=Nw)
    gd.distribute(wglobal_wG, w_wG)
    del wglobal_wG

    # Make pairorbitals; pair p = (w1, w2) follows np.ndindex (C) order
    f_pG = gd.zeros(n=Nw**2)
    for p, (w1, w2) in enumerate(np.ndindex(Nw, Nw)):
        np.multiply(w_wG[w1], w_wG[w2], f_pG[p])
    del w_wG
    assert f_pG.flags.contiguous

    # Make pairorbital overlap (lower triangle only)
    D_pp = np.zeros((Nw**2, Nw**2))
    rk(gd.dv, f_pG, 0., D_pp)

    # Add atomic corrections to pairorbital overlap
    # NOTE(review): P_awi is only loaded on MASTER above, so a non-master
    # rank owning atoms would fail with a NameError here -- confirm how
    # this is meant to run in parallel.
    for a in myatoms:
        if setups[a].type != 'ghost':
            P_pp = np.array([pack(np.outer(P_awi[a][w1], P_awi[a][w2]))
                             for w1, w2 in np.ndindex(Nw, Nw)])
            I4_pp = setups[a].four_phi_integrals()
            A = np.zeros((len(I4_pp), len(P_pp)))
            # Per the original author's note, the two gemm calls amount to
            # D_pp += np.dot(P_pp, np.dot(I4_pp, P_pp.T))
            gemm(1.0, P_pp, I4_pp, 0.0, A, 't')
            gemm(1.0, A, P_pp, 1.0, D_pp)

    # Sum all contributions to master
    gd.comm.sum(D_pp, MASTER)

    if world.rank == MASTER:
        if S_w is not None:
            print('renormalizing pairorb overlap matrix (D_pp)')
            S2 = np.sqrt(S_w)
            # Norm of pair p = (w1, w2) is S2[w1] * S2[w2]; raveling the
            # outer product gives it in the same order as np.ndindex.
            norm_p = np.outer(S2, S2).ravel()
            # Scale rows and columns in place; this avoids both the
            # Nw**4 Python loop and a second Np x Np temporary.
            D_pp /= norm_p[:, np.newaxis]
            D_pp /= norm_p[np.newaxis, :]

        # XXX if the diagonalization below (on MASTER only) fails, then
        # one can always restart the stuff below using only the stored
        # D_pp matrix
        D_pp.dump(dppname)

        # Determine eigenvalues and vectors on master only
        eps_q, U_pq = np.linalg.eigh(D_pp, UPLO='L')
        del D_pp
        indices = np.argsort(-eps_q.real)
        eps_q = np.ascontiguousarray(eps_q.real[indices])
        U_pq = np.ascontiguousarray(U_pq[:, indices])

        # Truncate
        indices = eps_q > tolerance
        U_pq = np.ascontiguousarray(U_pq[:, indices])
        eps_q = np.ascontiguousarray(eps_q[indices])

        # Dump to file
        with open(rotationfile, 'wb') as fd:
            pickle.dump((eps_q, U_pq), fd, 2)

    if writeoptimizedpairs is not False:
        assert world.size == 1  # works in parallel if U and eps are broadcast
        Uisq_qp = (U_pq / np.sqrt(eps_q)).T.copy()
        g_qG = gd.zeros(n=len(eps_q))
        gemm(1.0, f_pG, Uisq_qp, 0.0, g_qG)
        g_qG = gd.collect(g_qG)
        if world.rank == MASTER:
            P_app = dict([(a, np.array([pack(np.outer(P_wi[w1], P_wi[w2]),
                                             tolerance=1e3)
                                        for w1, w2 in np.ndindex(Nw, Nw)]))
                          for a, P_wi in P_awi.items()])
            P_aqp = dict([(a, np.dot(Uisq_qp, P_pp))
                          for a, P_pp in P_app.items()])
            with open(writeoptimizedpairs, 'wb') as fd:
                pickle.dump((g_qG, P_aqp), fd, 2)
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check the simple PBLAS wrappers against serial reference results.

    Random matrices are generated, reference results are computed on
    rank 0 with the serial BLAS wrappers and numpy, the same operations
    are carried out with the pblas_simple_* functions on matrices
    distributed with various block sizes, and the collected results are
    compared with the references to within the module-level ``tol``.

    M, N, K: int
        Matrix dimensions (A is M x K, B is K x N).
    seed: int
        Seed of the random number generator.
    mprocs, nprocs: int
        Shape of the BLACS process grid.
    dtype: type
        float or complex; complex input gets a random imaginary part.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    # Weight of the random imaginary part
    epsilon = 1.0j if dtype == complex else 0.0

    def random_matrix(desc):
        # Two draws per matrix, real part first, as the reference data
        # depends on the order in which the generator is consumed.
        real_part = gen.rand(*desc.shape)
        imag_part = gen.rand(*desc.shape)
        return real_part + epsilon * imag_part

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)
    globHEC = grid.new_descriptor(K, K, K, K)

    # Populate matrices local to master:
    A0 = random_matrix(globA)
    B0 = random_matrix(globB)
    D0 = random_matrix(globD)
    X0 = random_matrix(globX)

    # HEC = HEA * B
    HEA0 = random_matrix(globHEC)
    if world.rank == 0:
        # Make HEA0 hermitian
        HEA0 = np.ascontiguousarray(HEA0 + HEA0.T.conjugate())

    # Local result matrices (zeros needed for rank-updates)
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)
    U0 = globU.zeros(dtype=dtype)
    HEC0 = globB.zeros(dtype=dtype)

    # Local reference matrix product:
    if rank == 0:
        # C0[:] = np.dot(A0, B0)
        gemm(1.0, B0, A0, 0.0, C0)
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Y0[:] = np.dot(A0, X0)
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

        HEC0[:] = np.dot(HEA0, B0)
        # We don't use the upper triangle: fill it with a sentinel value
        # row by row (everything to the right of the diagonal).
        nrows = HEA0.shape[0]
        for row in range(nrows):
            HEA0[row, row + 1:] = 99999.0
        if world.rank == 0:
            print(HEA0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)
    distHE = grid.new_descriptor(K, K, 2, 4)

    # Distributed matrices (zeros needed for rank-updates):
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)
    U = distU.zeros(dtype=dtype)
    HEC = distB.zeros(dtype=dtype)
    HEA = distHE.zeros(dtype=dtype)

    # Scatter the input matrices from the master
    scatter_jobs = [(globA, distA, A0, A),
                    (globB, distB, B0, B),
                    (globX, distX, X0, X),
                    (globD, distD, D0, D),
                    (globHEC, distHE, HEA0, HEA)]
    for src_desc, dst_desc, src, dst in scatter_jobs:
        Redistributor(world, src_desc, dst_desc).redistribute(src, dst)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa='T')
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)
    pblas_simple_hemm(distHE, distB, distB, HEA, B, HEC, uplo='L', side='L')

    # Collect result back on master (zeros needed for rank-updates)
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)
    U1 = globU.zeros(dtype=dtype)
    HEC1 = globB.zeros(dtype=dtype)
    gather_jobs = [(distC, globC, C, C1),
                   (distY, globY, Y, Y1),
                   (distS, globS, S, S1),
                   (distU, globU, U, U1),
                   (distB, globB, HEC, HEC1)]
    for src_desc, dst_desc, src, dst in gather_jobs:
        Redistributor(world, src_desc, dst_desc).redistribute(src, dst)

    labels = ('gemm err', 'gemv err', 'r2k err', 'rk_err', 'hemm_err')
    if rank == 0:
        compared = ((C1, C0), (Y1, Y0), (S1, S0), (U1, U0), (HEC1, HEC0))
        errors = [abs(got - ref).max() for got, ref in compared]
        for label, err in zip(labels, errors):
            print(label, err)
    else:
        errors = [0.0] * len(labels)

    # We don't like exceptions on only one cpu, so every rank gets the
    # summed errors before anything is checked.
    errors = [world.sum(err) for err in errors]
    for err in errors:
        equal(err, 0, tol)
def iterate_one_k_point(self, hamiltonian, wfs, kpt):
    """Do Davidson iterations for the kpoint.

    After an initial subspace diagonalization, ``self.niter`` iterations
    are performed.  In each one the residuals are preconditioned to give
    a second set of vectors ``psit2_nG``, the Hamiltonian and overlap
    matrices are assembled in the combined space of ``kpt.psit_nG`` and
    ``psit2_nG``, the generalized eigenproblem is solved on rank 0 of
    ``self.gd.comm`` and broadcast, and the wave functions, projections
    and eigenvalues of *kpt* are updated from the result.

    Returns the weighted sum of squared residual norms, summed over
    ``self.gd.comm``.  The sum is reset at the start of every iteration,
    so the value belongs to the residuals entering the last iteration.
    """
    niter = self.niter
    nbands = self.nbands

    self.subspace_diagonalize(hamiltonian, wfs, kpt)

    # Work arrays; presumably of dimension 2 * nbands, as the slicing
    # below addresses the blocks [:nbands] and [nbands:] -- confirm.
    H_2n2n = self.H_2n2n
    S_2n2n = self.S_2n2n
    eps_2n = self.eps_2n
    psit2_nG = wfs.matrixoperator.suggest_temporary_buffer()

    self.timer.start('Davidson')
    # R_nG is an alias of self.Htpsit_nG, not a copy.  Presumably the
    # array holds H|psit> at this point and calculate_residuals turns it
    # into residuals in place -- confirm.
    R_nG = self.Htpsit_nG
    self.calculate_residuals(kpt, wfs, hamiltonian, kpt.psit_nG,
                             kpt.P_ani, kpt.eps_n, R_nG)

    for nit in range(niter):
        H_2n2n[:] = 0.0
        S_2n2n[:] = 0.0

        error = 0.0
        for n in range(nbands):
            # Weight of band n in the error: occupation number if
            # available, else the k-point weight ...
            if kpt.f_n is None:
                weight = kpt.weight
            else:
                weight = kpt.f_n[n]
            # ... unless a fixed number of bands is to be converged; then
            # the first nbands_converge bands count fully, the rest not.
            if self.nbands_converge != 'occupied':
                if n < self.nbands_converge:
                    weight = kpt.weight
                else:
                    weight = 0.0
            error += weight * np.vdot(R_nG[n], R_nG[n]).real

            # Upper-left block: diagonal in the current wave functions
            H_2n2n[n,n] = kpt.eps_n[n]
            S_2n2n[n,n] = 1.0
            # The residual must be consumed here: self.Htpsit_nG (aliased
            # by R_nG) is overwritten right after this loop.
            psit2_nG[n] = self.preconditioner(R_nG[n], kpt)

        # Calculate projections
        P2_ani = wfs.pt.dict(nbands)
        wfs.pt.integrate(psit2_nG, P2_ani, kpt.q)

        # Hamiltonian matrix
        # <psi2 | H | psi>
        wfs.kin.apply(psit2_nG, self.Htpsit_nG, kpt.phase_cd)
        hamiltonian.apply_local_potential(psit2_nG, self.Htpsit_nG, kpt.s)
        gemm(self.gd.dv, kpt.psit_nG, self.Htpsit_nG, 0.0, self.H_nn, 'c')

        # Atomic corrections from the projections
        for a, P_ni in kpt.P_ani.items():
            P2_ni = P2_ani[a]
            dH_ii = unpack(hamiltonian.dH_asp[a][kpt.s])
            self.H_nn += np.dot(P2_ni, np.dot(dH_ii, P_ni.T.conj()))

        # Presumably a sum onto rank 0 only; the diagonalization below is
        # done on rank 0 -- confirm.
        self.gd.comm.sum(self.H_nn, 0)
        H_2n2n[nbands:, :nbands] = self.H_nn

        # <psi2 | H | psi2>
        r2k(0.5 * self.gd.dv, psit2_nG, self.Htpsit_nG, 0.0, self.H_nn)
        for a, P2_ni in P2_ani.items():
            dH_ii = unpack(hamiltonian.dH_asp[a][kpt.s])
            self.H_nn += np.dot(P2_ni, np.dot(dH_ii, P2_ni.T.conj()))

        self.gd.comm.sum(self.H_nn, 0)
        H_2n2n[nbands:, nbands:] = self.H_nn

        # Overlap matrix
        # <psi2 | S | psi>
        gemm(self.gd.dv, kpt.psit_nG, psit2_nG, 0.0, self.S_nn, "c")

        for a, P_ni in kpt.P_ani.items():
            P2_ni = P2_ani[a]
            dO_ii = wfs.setups[a].dO_ii
            self.S_nn += np.dot(P2_ni, np.inner(dO_ii, P_ni.conj()))

        self.gd.comm.sum(self.S_nn, 0)
        S_2n2n[nbands:, :nbands] = self.S_nn

        # <psi2 | S | psi2>
        rk(self.gd.dv, psit2_nG, 0.0, self.S_nn)
        for a, P2_ni in P2_ani.items():
            dO_ii = wfs.setups[a].dO_ii
            self.S_nn += np.dot(P2_ni, np.dot(dO_ii, P2_ni.T.conj()))

        self.gd.comm.sum(self.S_nn, 0)
        S_2n2n[nbands:, nbands:] = self.S_nn

        # Only the diagonal of the upper-left block and the two lower
        # blocks have been filled; the upper-right blocks stay zero, so
        # general_diagonalize presumably reads one triangle only -- confirm.
        if self.gd.comm.rank == 0:
            general_diagonalize(H_2n2n, eps_2n, S_2n2n)

        self.gd.comm.broadcast(H_2n2n, 0)
        self.gd.comm.broadcast(eps_2n, 0)

        kpt.eps_n[:] = eps_2n[:nbands]

        # Rotate psit_nG; H_2n2n is used as the coefficient matrix from
        # here on (presumably it now holds the eigenvectors -- confirm).
        gemm(1.0, kpt.psit_nG, H_2n2n[:nbands, :nbands],
             0.0, self.Htpsit_nG)
        gemm(1.0, psit2_nG, H_2n2n[:nbands, nbands:],
             1.0, self.Htpsit_nG)
        # The rotated functions were accumulated in self.Htpsit_nG: swap
        # the references so they become kpt.psit_nG and the old array is
        # recycled as scratch space.
        kpt.psit_nG, self.Htpsit_nG = self.Htpsit_nG, kpt.psit_nG

        # Rotate P_uni:
        for a, P_ni in kpt.P_ani.items():
            P2_ni = P2_ani[a]
            # P_ni is also the output array, hence the copy of the input
            gemm(1.0, P_ni.copy(), H_2n2n[:nbands, :nbands], 0.0, P_ni)
            gemm(1.0, P2_ni, H_2n2n[:nbands, nbands:], 1.0, P_ni)

        # New residuals are only needed if another iteration follows
        if nit < niter - 1 :
            wfs.kin.apply(kpt.psit_nG, self.Htpsit_nG, kpt.phase_cd)
            hamiltonian.apply_local_potential(kpt.psit_nG, self.Htpsit_nG,
                                              kpt.s)
            R_nG = self.Htpsit_nG
            self.calculate_residuals(kpt, wfs, hamiltonian, kpt.psit_nG,
                                     kpt.P_ani, kpt.eps_n, R_nG)

    self.timer.stop('Davidson')
    error = self.gd.comm.sum(error)
    return error