def calculate_blocked_density_matrix(self, f_n, C_nM):
    nbands = self.bd.nbands
    mynbands = self.bd.mynbands
    nao = self.nao
    dtype = C_nM.dtype

    self.nMdescriptor.checkassert(C_nM)
    if self.gd.rank == 0:
        Cf_nM = (C_nM * f_n[:, None]).conj()
    else:
        C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
        Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

    r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                      self.mmdescriptor)

    Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
    r.redistribute(Cf_nM, Cf_mm, nbands, nao)
    del Cf_nM

    C_mm = self.mmdescriptor.zeros(dtype=dtype)
    r.redistribute(C_nM, C_mm, nbands, nao)
    # No use deleting C_nM as it is the caller's input array.

    rho_mm = self.mmdescriptor.zeros(dtype=dtype)
    pblas_simple_gemm(self.mmdescriptor, self.mmdescriptor,
                      self.mmdescriptor,
                      Cf_mm, C_mm, rho_mm, transa='T')
    return rho_mm
def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
    nbands = self.bd.nbands
    mynbands = self.bd.mynbands
    nao = self.nao

    if rho_mM is None:
        rho_mM = self.mMdescriptor.zeros(dtype=C_nM.dtype)

    Cf_nM = (C_nM * f_n[:, None]).conj()
    pblas_simple_gemm(self.nMdescriptor, self.nMdescriptor,
                      self.mMdescriptor,
                      Cf_nM, C_nM, rho_mM, transa='T')
    return rho_mM
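For reference, the distributed GEMM above contracts the occupied orbitals into
rho_MN = sum_n f_n conj(C_nM) C_nN.  Below is a minimal serial NumPy sketch of
the same contraction; the helper name and the toy data are hypothetical and
only illustrate what the pblas_simple_gemm call with transa='T' computes, they
are not part of the class above.

import numpy as np


def density_matrix_reference(f_n, C_nM):
    # rho_MN = sum_n f_n * conj(C_nM) * C_nN
    Cf_nM = (C_nM * f_n[:, None]).conj()
    return Cf_nM.T @ C_nM


# Toy example: 3 bands, 4 atomic orbitals
f_n = np.array([2.0, 2.0, 0.0])
C_nM = np.random.rand(3, 4)
rho_MM = density_matrix_reference(f_n, C_nM)
assert rho_MM.shape == (4, 4)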
def oldcalculate_density_matrix(self, f_n, C_nM, rho_mM=None):
    # This version is parallel over the band descriptor only.
    # This is inefficient, but let's keep it for a while in case
    # there's trouble with the more efficient version
    nbands = self.bd.nbands
    mynbands = self.bd.mynbands
    nao = self.nao

    if rho_mM is None:
        rho_mM = self.mMdescriptor.zeros(dtype=C_nM.dtype)

    Cf_nM = (C_nM * f_n[:, None]).conj()
    pblas_simple_gemm(self.nMdescriptor, self.nMdescriptor,
                      self.mMdescriptor,
                      Cf_nM, C_nM, rho_mM, transa='T')
    return rho_mM
# simulate state-parallelization=2 and
# domain-decomposition.prod=32
B = 2
D = 32
mb = 32
grid = BlacsGrid(world, B, D)

nbands = 500
nG = 80**3

nGdesc = grid.new_descriptor(nbands, nG, nbands // B, nG // D)
nndesc = grid.new_descriptor(nbands, nbands, mb, mb)

psit_nG = gen.rand(*nGdesc.shape)
A_nn = gen.rand(*nndesc.shape)

assert nGdesc.check(psit_nG)
assert nndesc.check(A_nn)

parallelprint(world, (A_nn.shape, nndesc.shape, nndesc.lld))

pblas_simple_gemm(nGdesc, nGdesc, nndesc, psit_nG, psit_nG, A_nn,
                  transa='N', transb='T')
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)
    globHEC = grid.new_descriptor(K, K, K, K)

    # print(globA.asarray())
    # Populate matrices local to master:
    A0 = gen.rand(*globA.shape) + epsilon * gen.rand(*globA.shape)
    B0 = gen.rand(*globB.shape) + epsilon * gen.rand(*globB.shape)
    D0 = gen.rand(*globD.shape) + epsilon * gen.rand(*globD.shape)
    X0 = gen.rand(*globX.shape) + epsilon * gen.rand(*globX.shape)

    # HEC = HEA * B
    HEA0 = gen.rand(*globHEC.shape) + epsilon * gen.rand(*globHEC.shape)
    if world.rank == 0:
        HEA0 = HEA0 + HEA0.T.conjugate()  # Make H0 Hermitian
        HEA0 = np.ascontiguousarray(HEA0)

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC0 = globB.zeros(dtype=dtype)

    # Local reference matrix products:
    if rank == 0:
        # C0[:] = np.dot(A0, B0)
        gemm(1.0, B0, A0, 0.0, C0)
        # gemm(1.0, A0, A0, 0.0, Z0, transa='t')
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Y0[:] = np.dot(A0, X0)
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)
        HEC0[:] = np.dot(HEA0, B0)

        sM, sN = HEA0.shape
        # We don't use the upper triangle
        for i in range(sM):
            for j in range(sN):
                if i < j:
                    HEA0[i][j] = 99999.0
        if world.rank == 0:
            print(HEA0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)
    distHE = grid.new_descriptor(K, K, 2, 4)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC = distB.zeros(dtype=dtype)
    HEA = distHE.zeros(dtype=dtype)

    Redistributor(world, globA, distA).redistribute(A0, A)
    Redistributor(world, globB, distB).redistribute(B0, B)
    Redistributor(world, globX, distX).redistribute(X0, X)
    Redistributor(world, globD, distD).redistribute(D0, D)
    Redistributor(world, globHEC, distHE).redistribute(HEA0, HEA)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa='T')
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)
    pblas_simple_hemm(distHE, distB, distB, HEA, B, HEC, uplo='L', side='L')

    # Collect results back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC1 = globB.zeros(dtype=dtype)

    Redistributor(world, distC, globC).redistribute(C, C1)
    Redistributor(world, distY, globY).redistribute(Y, Y1)
    Redistributor(world, distS, globS).redistribute(S, S1)
    Redistributor(world, distU, globU).redistribute(U, U1)
    Redistributor(world, distB, globB).redistribute(HEC, HEC1)

    if rank == 0:
        gemm_err = abs(C1 - C0).max()
        gemv_err = abs(Y1 - Y0).max()
        r2k_err = abs(S1 - S0).max()
        rk_err = abs(U1 - U0).max()
        hemm_err = abs(HEC1 - HEC0).max()
        print('gemm err', gemm_err)
        print('gemv err', gemv_err)
        print('r2k err', r2k_err)
        print('rk err', rk_err)
        print('hemm err', hemm_err)
    else:
        gemm_err = 0.0
        gemv_err = 0.0
        r2k_err = 0.0
        rk_err = 0.0
        hemm_err = 0.0

    gemm_err = world.sum(gemm_err)  # We don't like exceptions on only one cpu
    gemv_err = world.sum(gemv_err)
    r2k_err = world.sum(r2k_err)
    rk_err = world.sum(rk_err)
    hemm_err = world.sum(hemm_err)

    equal(gemm_err, 0, tol)
    equal(gemv_err, 0, tol)
    equal(r2k_err, 0, tol)
    equal(rk_err, 0, tol)
    equal(hemm_err, 0, tol)
import numpy as np

from gpaw.blacs import BlacsGrid, parallelprint
from gpaw.mpi import world, rank, size
from gpaw.utilities.scalapack import pblas_simple_gemm

gen = np.random.RandomState(42)

# simulate state-parallelization=2 and
# domain-decomposition.prod=32
B = 2
D = 32
mb = 32
grid = BlacsGrid(world, B, D)

nbands = 500
nG = 80**3

# Integer division keeps the block sizes integral
nGdesc = grid.new_descriptor(nbands, nG, nbands // B, nG // D)
nndesc = grid.new_descriptor(nbands, nbands, mb, mb)

psit_nG = gen.rand(*nGdesc.shape)
A_nn = gen.rand(*nndesc.shape)

assert nGdesc.check(psit_nG)
assert nndesc.check(A_nn)

parallelprint(world, (A_nn.shape, nndesc.shape, nndesc.lld))

pblas_simple_gemm(nGdesc, nGdesc, nndesc, psit_nG, psit_nG, A_nn,
                  transa='N', transb='T')
def calculate_blocked_density_matrix(self, f_n, C_nM):
    nbands = self.bd.nbands
    nao = self.nao
    dtype = C_nM.dtype

    self.nMdescriptor.checkassert(C_nM)
    if self.gd.rank == 0:
        Cf_nM = C_nM * f_n[:, None]
    else:
        C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
        Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

    r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                      self.mmdescriptor)

    Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
    r.redistribute(Cf_nM, Cf_mm, nbands, nao)
    del Cf_nM

    C_mm = self.mmdescriptor.zeros(dtype=dtype)
    r.redistribute(C_nM, C_mm, nbands, nao)
    # No use deleting C_nM as it is the caller's input array.

    rho_mm = self.mmdescriptor.zeros(dtype=dtype)

    if 1:  # if self.libelpa is None:
        pblas_simple_gemm(self.mmdescriptor, self.mmdescriptor,
                          self.mmdescriptor,
                          Cf_mm, C_mm, rho_mm, transa='C')
    else:
        # elpa_hermitian_multiply was not faster than the ordinary
        # multiplication in the test.  The way we have things distributed,
        # we need to transpose things at the moment.
        #
        # Rather than enabling this, we should store the coefficients
        # in an appropriate 2D block-cyclic format (c_nm) and not the
        # current C_nM format.  This makes it possible to avoid
        # redistributing the coefficients at all.  But we don't have time
        # to implement this at the moment.
        mul = self.libelpa.hermitian_multiply
        desc = self.mmdescriptor
        from gpaw.utilities.scalapack import pblas_tran

        def T(array):
            tmp = array.copy()
            pblas_tran(alpha=1.0, a_MN=tmp,
                       beta=0.0, c_NM=array,
                       desca=desc, descc=desc)

        T(C_mm)
        T(Cf_mm)
        mul(C_mm, Cf_mm, rho_mm,
            desc, desc, desc,
            uplo_a='X', uplo_c='X')

    return rho_mm
def linear_propagator(self, sourceC_nM, targetC_nM, S_MM, H_MM, dt):
    self.timer.start('Linear solve')

    if self.blacs:
        # XXX, Preallocate
        target_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex)
        temp_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex)
        temp_block_mm = self.mm_block_descriptor.empty(dtype=complex)
        if self.density.gd.comm.rank != 0:
            # XXX Fake blacs nbands, nao, nbands, nao grid because of some
            # weird asserts
            # (these are 0,x or x,0 arrays)
            sourceC_nM = self.CnM_unique_descriptor.zeros(dtype=complex)

        # 1. target = (S + 0.5j*H*dt) * source
        # Wave functions to target
        self.CnM2nm.redistribute(sourceC_nM, temp_blockC_nm)

        # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
        # Remove upper triangle
        scalapack_zero(self.mm_block_descriptor, H_MM, 'U')
        # Lower triangular matrix:
        temp_block_mm[:] = S_MM - (0.5j * dt) * H_MM
        scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U')
        # Note: it is now a strictly lower triangular matrix
        # Add transpose of H
        pblas_tran(-0.5j * dt, H_MM, 1.0, temp_block_mm,
                   self.mm_block_descriptor, self.mm_block_descriptor)
        # Add transpose of S
        pblas_tran(1.0, S_MM, 1.0, temp_block_mm,
                   self.mm_block_descriptor, self.mm_block_descriptor)

        pblas_simple_gemm(self.Cnm_block_descriptor,
                          self.mm_block_descriptor,
                          self.Cnm_block_descriptor,
                          temp_blockC_nm,
                          temp_block_mm,
                          target_blockC_nm)

        # 2. target = (S - 0.5j*H*dt)^-1 * target
        # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
        # Lower triangular matrix:
        temp_block_mm[:] = S_MM + (0.5j * dt) * H_MM
        # Now it is a strictly lower triangular matrix:
        scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U')
        # Add transpose of H:
        pblas_tran(+0.5j * dt, H_MM, 1.0, temp_block_mm,
                   self.mm_block_descriptor, self.mm_block_descriptor)
        # Add transpose of S
        pblas_tran(1.0, S_MM, 1.0, temp_block_mm,
                   self.mm_block_descriptor, self.mm_block_descriptor)

        scalapack_solve(self.mm_block_descriptor,
                        self.Cnm_block_descriptor,
                        temp_block_mm,
                        target_blockC_nm)

        if self.density.gd.comm.rank != 0:  # XXX is this correct?
            # XXX Fake blacs nbands, nao, nbands, nao grid because of some
            # weird asserts
            # (these are 0,x or x,0 arrays)
            target = self.CnM_unique_descriptor.zeros(dtype=complex)
        else:
            target = targetC_nM
        self.Cnm2nM.redistribute(target_blockC_nm, target)
        self.density.gd.comm.broadcast(targetC_nM, 0)  # Is this required?
    else:
        # Note: The full equation is conjugated (therefore -+, not +-)
        targetC_nM[:] = \
            solve(S_MM - 0.5j * H_MM * dt,
                  np.dot(S_MM + 0.5j * H_MM * dt,
                         sourceC_nM.T.conjugate())).T.conjugate()

    self.timer.stop('Linear solve')
def linear_propagator(self, sourceC_nM, targetC_nM, S_MM, H_MM, dt):
    self.timer.start('Linear solve')

    # XXX Debugging stuff. Remove
    if self.propagator_debug:
        if self.blacs:
            globalH_MM = self.blacs_mm_to_global(H_MM)
            globalS_MM = self.blacs_mm_to_global(S_MM)
            if world.rank == 0:
                tri2full(globalS_MM, 'L')
                tri2full(globalH_MM, 'L')
                U_MM = dot(inv(globalS_MM - 0.5j * globalH_MM * dt),
                           globalS_MM + 0.5j * globalH_MM * dt)
                debugC_nM = dot(sourceC_nM, U_MM.T.conjugate())
                # print('PASS PROPAGATOR')
                # debugC_nM = sourceC_nM.copy()
        else:
            if world.rank == 0:
                U_MM = dot(inv(S_MM - 0.5j * H_MM * dt),
                           S_MM + 0.5j * H_MM * dt)
                debugC_nM = dot(sourceC_nM, U_MM.T.conjugate())
                # print('PASS PROPAGATOR')
                # debugC_nM = sourceC_nM.copy()

    if self.blacs:
        # XXX, Preallocate
        target_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex)
        temp_blockC_nm = self.Cnm_block_descriptor.empty(dtype=complex)
        temp_block_mm = self.mm_block_descriptor.empty(dtype=complex)
        if self.density.gd.comm.rank != 0:
            # XXX Fake blacs nbands, nao, nbands, nao grid because of some
            # weird asserts (these are 0,x or x,0 arrays)
            sourceC_nM = self.CnM_unique_descriptor.zeros(dtype=complex)

        # 1. target = (S + 0.5j*H*dt) * source
        # Wave functions to target
        self.CnM2nm.redistribute(sourceC_nM, temp_blockC_nm)

        # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
        # Remove upper triangle
        scalapack_zero(self.mm_block_descriptor, H_MM, 'U')
        # Lower triangular matrix
        temp_block_mm[:] = S_MM - (0.5j * dt) * H_MM
        # Note: it is now a strictly lower triangular matrix
        scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U')
        # Add transpose of H
        pblas_tran(-0.5j * dt, H_MM, 1.0, temp_block_mm,
                   self.mm_block_descriptor, self.mm_block_descriptor)
        # Add transpose of S
        pblas_tran(1.0, S_MM, 1.0, temp_block_mm,
                   self.mm_block_descriptor, self.mm_block_descriptor)

        pblas_simple_gemm(self.Cnm_block_descriptor,
                          self.mm_block_descriptor,
                          self.Cnm_block_descriptor,
                          temp_blockC_nm, temp_block_mm, target_blockC_nm)

        # 2. target = (S - 0.5j*H*dt)^-1 * target
        # XXX It can't be this f'n hard to symmetrize a matrix (tri2full)
        # Lower triangular matrix
        temp_block_mm[:] = S_MM + (0.5j * dt) * H_MM
        # Now it is a strictly lower triangular matrix
        scalapack_set(self.mm_block_descriptor, temp_block_mm, 0, 0, 'U')
        # Add transpose of H
        pblas_tran(+0.5j * dt, H_MM, 1.0, temp_block_mm,
                   self.mm_block_descriptor, self.mm_block_descriptor)
        # Add transpose of S
        pblas_tran(1.0, S_MM, 1.0, temp_block_mm,
                   self.mm_block_descriptor, self.mm_block_descriptor)

        scalapack_solve(self.mm_block_descriptor,
                        self.Cnm_block_descriptor,
                        temp_block_mm, target_blockC_nm)

        if self.density.gd.comm.rank != 0:  # XXX is this correct?
            # XXX Fake blacs nbands, nao, nbands, nao grid because of some
            # weird asserts (these are 0,x or x,0 arrays)
            target = self.CnM_unique_descriptor.zeros(dtype=complex)
        else:
            target = targetC_nM
        self.Cnm2nM.redistribute(target_blockC_nm, target)
        self.density.gd.comm.broadcast(targetC_nM, 0)  # Is this required?
    else:
        # Note: The full equation is conjugated (therefore -+, not +-)
        targetC_nM[:] = \
            solve(S_MM - 0.5j * H_MM * dt,
                  np.dot(S_MM + 0.5j * H_MM * dt,
                         sourceC_nM.T.conjugate())).T.conjugate()

    # XXX Debugging stuff. Remove
    if self.propagator_debug:
        if world.rank == 0:
            verify(targetC_nM, debugC_nM,
                   'Linear solver propagator vs. reference')

    self.timer.stop('Linear solve')
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)

    # print(globA.asarray())
    # Populate matrices local to master:
    A0 = gen.rand(*globA.shape) + epsilon * gen.rand(*globA.shape)
    B0 = gen.rand(*globB.shape) + epsilon * gen.rand(*globB.shape)
    D0 = gen.rand(*globD.shape) + epsilon * gen.rand(*globD.shape)
    X0 = gen.rand(*globX.shape) + epsilon * gen.rand(*globX.shape)

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates

    # Local reference matrix products:
    if rank == 0:
        # C0[:] = np.dot(A0, B0)
        gemm(1.0, B0, A0, 0.0, C0)
        # gemm(1.0, A0, A0, 0.0, Z0, transa='t')
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Y0[:] = np.dot(A0, X0)
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates

    Redistributor(world, globA, distA).redistribute(A0, A)
    Redistributor(world, globB, distB).redistribute(B0, B)
    Redistributor(world, globX, distX).redistribute(X0, X)
    Redistributor(world, globD, distD).redistribute(D0, D)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa='T')
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)

    # Collect results back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates

    Redistributor(world, distC, globC).redistribute(C, C1)
    Redistributor(world, distY, globY).redistribute(Y, Y1)
    Redistributor(world, distS, globS).redistribute(S, S1)
    Redistributor(world, distU, globU).redistribute(U, U1)

    if rank == 0:
        gemm_err = abs(C1 - C0).max()
        gemv_err = abs(Y1 - Y0).max()
        r2k_err = abs(S1 - S0).max()
        rk_err = abs(U1 - U0).max()
        print('gemm err', gemm_err)
        print('gemv err', gemv_err)
        print('r2k err', r2k_err)
        print('rk err', rk_err)
    else:
        gemm_err = 0.0
        gemv_err = 0.0
        r2k_err = 0.0
        rk_err = 0.0

    gemm_err = world.sum(gemm_err)  # We don't like exceptions on only one cpu
    gemv_err = world.sum(gemv_err)
    r2k_err = world.sum(r2k_err)
    rk_err = world.sum(rk_err)

    equal(gemm_err, 0, tol)
    equal(gemv_err, 0, tol)
    equal(r2k_err, 0, tol)
    equal(rk_err, 0, tol)
import numpy as np

from gpaw.blacs import BlacsGrid, parallelprint
from gpaw.mpi import world, rank, size
from gpaw.utilities.scalapack import pblas_simple_gemm

gen = np.random.RandomState(42)

# simulate state-parallelization=2 and
# domain-decomposition.prod=32
B = 2
D = 32
mb = 32
grid = BlacsGrid(world, B, D)

nbands = 500
nG = 80**3

nGdesc = grid.new_descriptor(nbands, nG, nbands // B, nG // D)
nndesc = grid.new_descriptor(nbands, nbands, mb, mb)

psit_nG = gen.rand(*nGdesc.shape)
A_nn = gen.rand(*nndesc.shape)

assert nGdesc.check(psit_nG)
assert nndesc.check(A_nn)

parallelprint(world, (A_nn.shape, nndesc.shape, nndesc.lld))

pblas_simple_gemm(nGdesc, nGdesc, nndesc, psit_nG, psit_nG, A_nn,
                  transa='N', transb='T')
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)
    globHEC = grid.new_descriptor(K, K, K, K)

    # print(globA.asarray())
    # Populate matrices local to master:
    A0 = gen.rand(*globA.shape) + epsilon * gen.rand(*globA.shape)
    B0 = gen.rand(*globB.shape) + epsilon * gen.rand(*globB.shape)
    D0 = gen.rand(*globD.shape) + epsilon * gen.rand(*globD.shape)
    X0 = gen.rand(*globX.shape) + epsilon * gen.rand(*globX.shape)

    # HEC = HEA * B
    HEA0 = gen.rand(*globHEC.shape) + epsilon * gen.rand(*globHEC.shape)
    if world.rank == 0:
        HEA0 = HEA0 + HEA0.T.conjugate()  # Make H0 Hermitian

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC0 = globB.zeros(dtype=dtype)

    # Local reference matrix products:
    if rank == 0:
        # C0[:] = np.dot(A0, B0)
        gemm(1.0, B0, A0, 0.0, C0)
        # gemm(1.0, A0, A0, 0.0, Z0, transa='t')
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Y0[:] = np.dot(A0, X0)
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)
        HEC0[:] = np.dot(HEA0, B0)

        sM, sN = HEA0.shape
        # We don't use the upper triangle
        for i in range(sM):
            for j in range(sN):
                if i < j:
                    HEA0[i][j] = 99999.0
        if world.rank == 0:
            print(HEA0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)
    distHE = grid.new_descriptor(K, K, 2, 4)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC = distB.zeros(dtype=dtype)
    HEA = distHE.zeros(dtype=dtype)

    Redistributor(world, globA, distA).redistribute(A0, A)
    Redistributor(world, globB, distB).redistribute(B0, B)
    Redistributor(world, globX, distX).redistribute(X0, X)
    Redistributor(world, globD, distD).redistribute(D0, D)
    Redistributor(world, globHEC, distHE).redistribute(HEA0, HEA)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa="T")
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)
    pblas_simple_hemm(distHE, distB, distB, HEA, B, HEC, uplo="L", side="L")

    # Collect results back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC1 = globB.zeros(dtype=dtype)

    Redistributor(world, distC, globC).redistribute(C, C1)
    Redistributor(world, distY, globY).redistribute(Y, Y1)
    Redistributor(world, distS, globS).redistribute(S, S1)
    Redistributor(world, distU, globU).redistribute(U, U1)
    Redistributor(world, distB, globB).redistribute(HEC, HEC1)

    if rank == 0:
        gemm_err = abs(C1 - C0).max()
        gemv_err = abs(Y1 - Y0).max()
        r2k_err = abs(S1 - S0).max()
        rk_err = abs(U1 - U0).max()
        hemm_err = abs(HEC1 - HEC0).max()
        print("gemm err", gemm_err)
        print("gemv err", gemv_err)
        print("r2k err", r2k_err)
        print("rk err", rk_err)
        print("hemm err", hemm_err)
    else:
        gemm_err = 0.0
        gemv_err = 0.0
        r2k_err = 0.0
        rk_err = 0.0
        hemm_err = 0.0

    gemm_err = world.sum(gemm_err)  # We don't like exceptions on only one cpu
    gemv_err = world.sum(gemv_err)
    r2k_err = world.sum(r2k_err)
    rk_err = world.sum(rk_err)
    hemm_err = world.sum(hemm_err)

    equal(gemm_err, 0, tol)
    equal(gemv_err, 0, tol)
    equal(r2k_err, 0, tol)
    equal(rk_err, 0, tol)
    equal(hemm_err, 0, tol)