def matrix_multiply(self, C_NN, psit_nG, P_ani=None, out_nG=None):
    """Calculate new linear combinations of wave functions.

    Results will be put in the *P_ani* dict and a new psit_nG returned::

                 __                                __
        ~       \       ~           ~a  ~         \       ~a  ~
       psi  <--  ) C   psi    and  <p |psi >  <--  ) C   <p |psi >
          n     /__ nn'   n'         i    n       /__ nn'   i    n'
                 n'                                n'

    Parameters:

    C_NN: ndarray
        Matrix representation of the requested linear combinations.
        Even with a hermitian operator, this matrix need not be
        self-adjoint.  However, unlike the results from
        calculate_matrix_elements, it is assumed that all matrix
        elements are filled in (use e.g. tri2full).
    psit_nG: ndarray
        Set of vectors in which the matrix elements are evaluated.
    P_ani: dict
        Dictionary of projector overlap integrals P_ni = <p_i | psit_nG>.

    """
    if self.A_nn is None:
        self.allocate_arrays()

    band_comm = self.bd.comm
    B = band_comm.size
    J = self.nblocks
    N = self.bd.mynbands

    C_NN = self.bmd.redistribute_input(C_NN)

    if B == 1 and J == 1:  # Simple case:
        if use_mic:
            work_nG = self.work1_xG_mic
        else:
            work_nG = reshape(self.work1_xG, psit_nG.shape)
        if out_nG is None:
            out_nG = work_nG
            # out_nG[:] = 117  # gemm may not like nan's
        elif out_nG is psit_nG:
            work_nG[:] = psit_nG
            psit_nG = work_nG
        if use_mic:
            if self.gd.comm.rank == 0:
                offload_report(1)
            C_NN_mic = self.A_nn_mic
            C_NN_mic.array[:] = C_NN[:]
            C_NN_mic.update_device()
            stream.sync()
            mic_gemm(1.0, psit_nG, C_NN_mic, 0.0, out_nG)
            if self.gd.comm.rank == 0:
                offload_report(0)
        else:
            self.gd.gemm(1.0, psit_nG, C_NN, 0.0, out_nG)
        if P_ani:
            for P_ni in P_ani.values():
                gemm(1.0, P_ni.copy(), C_NN, 0.0, P_ni)
        return out_nG

    # Now it gets nasty!  We parallelize over B groups of bands and
    # divide each grid chunk into J smaller slices (less memory).

    Q = B  # always non-hermitian XXX
    rank = band_comm.rank
    shape = psit_nG.shape
    psit_nG = psit_nG.reshape(N, -1)
    G = psit_nG.shape[1]  # number of grid-points
    g = int(np.ceil(G / float(J)))

    # Buffers for send/receive of pre-multiplication versions of P_ani's.
    sbuf_nI = rbuf_nI = None
    if P_ani:
        sbuf_nI = np.hstack([P_ni for P_ni in P_ani.values()])
        sbuf_nI = np.ascontiguousarray(sbuf_nI)
        if B > 1:
            rbuf_nI = np.empty_like(sbuf_nI)

    # Because of the amount of communication involved, we need to be
    # synchronized up to this point, but only on the 1D band_comm
    # communication ring.
    band_comm.barrier()
    while g * J >= G + g:  # remove extra slice(s)
        J -= 1
    assert 0 < g * J < G + g

    work1_xG = reshape(self.work1_xG, (self.X,) + psit_nG.shape[1:])
    work2_xG = reshape(self.work2_xG, (self.X,) + psit_nG.shape[1:])

    for j in range(J):
        G1 = j * g
        G2 = G1 + g
        if G2 > G:
            G2 = G
            g = G2 - G1
        sbuf_ng = reshape(work1_xG, (N, g))
        rbuf_ng = reshape(work2_xG, (N, g))
        sbuf_ng[:] = psit_nG[:, G1:G2]
        beta = 0.0
        cycle_P_ani = (j == J - 1 and P_ani)
        for q in range(Q):
            # Start sending the currently buffered kets to the rank below
            # and receiving the next set of kets from the rank above us.
            # If we're at the last slice, start cycling P_ani too.
            if q < Q - 1:
                self._initialize_cycle(sbuf_ng, rbuf_ng,
                                       sbuf_nI, rbuf_nI, cycle_P_ani)

            # Calculate wave-function contributions from the current
            # slice of grid data by the current mynbands x mynbands
            # matrix block.
            C_nn = self.bmd.extract_block(C_NN, (rank + q) % B, rank)
            self.gd.gemm(1.0, sbuf_ng, C_nn, beta, psit_nG[:, G1:G2])

            # If we're at the last slice, add contributions to P_ani's.
            if cycle_P_ani:
                I1 = 0
                for P_ni in P_ani.values():
                    I2 = I1 + P_ni.shape[1]
                    gemm(1.0, sbuf_nI[:, I1:I2], C_nn, beta, P_ni)
                    I1 = I2

            # Wait for all send/receives to finish before the next
            # iteration.  Swap send and receive buffers such that next
            # becomes current.  If we're at the last slice, this also
            # finishes the P_ani cycle.
            if q < Q - 1:
                sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI = self._finish_cycle(
                    sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI, cycle_P_ani)

            # The first iteration was special because we initialized
            # the kets.
            if q == 0:
                beta = 1.0

    psit_nG.shape = shape
    return psit_nG
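# Minimal serial reference for the B == 1, J == 1 case (a sketch, not
# GPAW API; the function name is illustrative).  It spells out the
# docstring's sums, psi_n <- sum_n' C_nn' psi_n' and
# <p_i|psi_n> <- sum_n' C_nn' <p_i|psi_n'>, with plain NumPy, flattening
# the grid axes so the band index becomes an ordinary matrix axis.
import numpy as np

def matrix_multiply_reference(C_NN, psit_nG, P_ani=None):
    N = len(psit_nG)
    out_nG = np.dot(C_NN, psit_nG.reshape(N, -1)).reshape(psit_nG.shape)
    if P_ani is not None:
        for a, P_ni in P_ani.items():
            # Projector overlaps transform with the same coefficients.
            P_ani[a] = np.dot(C_NN, P_ni)
    return out_nG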
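# Toy serial model of the ring-pipelined algorithm above (an
# illustration under stated assumptions, not GPAW code): B "ranks" each
# own N/B bands and, over Q = B steps, multiply the kets they currently
# buffer by one mynbands x mynbands block of C_NN, then pass their
# buffers one step around the ring, so after B steps every rank has
# accumulated the full sum over n'.  Block indexing here follows the
# docstring's math; the argument order of the real extract_block may
# differ.
import numpy as np

def ring_multiply_model(C_NN, psit_nx, B):
    N = len(psit_nx)
    n = N // B  # mynbands; assume N is divisible by B
    out = np.zeros_like(psit_nx)
    buf = [psit_nx[r * n:(r + 1) * n].copy() for r in range(B)]
    for q in range(B):
        for r in range(B):
            src = (r + q) % B  # band group whose kets rank r holds now
            C_nn = C_NN[r * n:(r + 1) * n, src * n:(src + 1) * n]
            out[r * n:(r + 1) * n] += np.dot(C_nn, buf[r])
        # Cycle buffers: each rank receives from the rank above it.
        buf = buf[1:] + buf[:1]
    return out

# Sanity check of the model against a direct matrix product:
# C = np.random.rand(8, 8); psi = np.random.rand(8, 5)
# assert np.allclose(ring_multiply_model(C, psi, 4), np.dot(C, psi))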
from ase.data.extra_molecules import data
from ase.structure import molecule
from gpaw import GPAW, ConvergenceError, use_mic
from gpaw.eigensolvers import RMM_DIIS
from gpaw.mpi import rank, size
from _gpaw import offload_report
import time

offload_report(0)

if use_mic:
    txt = 'out_C60_mic_p%d.txt' % size
else:
    txt = 'out_C60_p%d.txt' % size

if rank == 0:
    print "Starting solver..."
tstart = time.time()

atoms = molecule('C60', data=data)
atoms.center(3.5)
calc = GPAW(h=0.18,
            nbands=400,
            eigensolver=RMM_DIIS(keep_htpsit=False),
            txt=txt,
            maxiter=10)
atoms.set_calculator(calc)
try:
    atoms.get_potential_energy()
except ConvergenceError:
    pass
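# The script records tstart but never reports the elapsed time; a
# plausible ending in the same Python 2 style (an assumption, not part
# of the original benchmark):
tend = time.time()
if rank == 0:
    print "Solver finished in %.2f s" % (tend - tstart)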