def matrix_multiply(self, C_NN, psit_nG, P_ani=None, out_nG=None):
        """Calculate new linear combinations of wave functions.

        Results will be put in the *P_ani* dict and a new psit_nG returned::

                     __                                __
            ~       \       ~           ~a  ~         \       ~a  ~
           psi  <--  ) C   psi    and  <p |psi >  <--  ) C   <p |psi >
              n     /__ nn'   n'         i    n       /__ nn'  i    n'
                     n'                                n'


        Parameters:

        C_NN: ndarray
            Matrix representation of the requested linear combinations. Even
            with a hermitian operator, this matrix need not be self-adjoint.
            However, unlike the results from calculate_matrix_elements, it is
            assumed that all matrix elements are filled in (use e.g. tri2full).
        psit_nG: ndarray
            Set of input wave functions from which the new linear
            combinations are formed.
        P_ani: dict
            Dictionary of projector overlap integrals P_ni = <p_i | psit_nG>.

        """

        if self.A_nn is None:
            self.allocate_arrays()

        band_comm = self.bd.comm
        B = band_comm.size
        J = self.nblocks
        N = self.bd.mynbands

        C_NN = self.bmd.redistribute_input(C_NN)

        if B == 1 and J == 1:
            # Simple case:
            if use_mic:
                work_nG = self.work1_xG_mic
            else:
                work_nG = reshape(self.work1_xG, psit_nG.shape)
            if out_nG is None:
                out_nG = work_nG
                # out_nG[:] = 117  # gemm may not like nan's
            elif out_nG is psit_nG:
                work_nG[:] = psit_nG
                psit_nG = work_nG

            if use_mic:
                if self.gd.comm.rank == 0:
                    offload_report(1)
                C_NN_mic = self.A_nn_mic
                C_NN_mic.array[:] = C_NN[:]
                C_NN_mic.update_device()
                stream.sync()
                mic_gemm(1.0, psit_nG, C_NN_mic, 0.0, out_nG)
                if self.gd.comm.rank == 0:
                    offload_report(0)
            else:
                self.gd.gemm(1.0, psit_nG, C_NN, 0.0, out_nG)
            if P_ani:
                for P_ni in P_ani.values():
                    gemm(1.0, P_ni.copy(), C_NN, 0.0, P_ni)
            return out_nG

        # Now it gets nasty! We parallelize over B groups of bands and
        # each grid chunk is divided in J smaller slices (less memory).

        Q = B  # always non-hermitian XXX
        rank = band_comm.rank
        shape = psit_nG.shape
        psit_nG = psit_nG.reshape(N, -1)
        G = psit_nG.shape[1]  # number of grid-points
        g = int(np.ceil(G / float(J)))

        # Buffers for send/receive of pre-multiplication versions of P_ani's.
        sbuf_nI = rbuf_nI = None
        if P_ani:
            sbuf_nI = np.hstack([P_ni for P_ni in P_ani.values()])
            sbuf_nI = np.ascontiguousarray(sbuf_nI)
            if B > 1:
                rbuf_nI = np.empty_like(sbuf_nI)
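        # (Packing all P_ni blocks into one contiguous N x I array lets the
        # ring below cycle them with a single send/receive per step instead
        # of one message per atom; the I1:I2 slices in the q-loop unpack
        # them again.)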

        # Because of the amount of communication involved, we need to be
        # synchronized up to this point, but only on the 1D band_comm
        # communication ring.
        band_comm.barrier()
        # Trim surplus slices, e.g. G=10, J=6 gives g=2 and 2*6 >= 10+2,
        # so J is reduced to 5 (five slices of 2 grid points each).
        while g * J >= G + g:  # remove extra slice(s)
            J -= 1
        assert 0 < g * J < G + g

        work1_xG = reshape(self.work1_xG, (self.X, ) + psit_nG.shape[1:])
        work2_xG = reshape(self.work2_xG, (self.X, ) + psit_nG.shape[1:])

        for j in range(J):
            G1 = j * g
            G2 = G1 + g
            if G2 > G:
                G2 = G
                g = G2 - G1
            sbuf_ng = reshape(work1_xG, (N, g))
            rbuf_ng = reshape(work2_xG, (N, g))
            sbuf_ng[:] = psit_nG[:, G1:G2]
            beta = 0.0
            cycle_P_ani = (j == J - 1 and P_ani)
            for q in range(Q):
                # Start sending currently buffered kets to rank below
                # and receiving next set of kets from rank above us.
                # If we're at the last slice, start cycling P_ani too.
                if q < Q - 1:
                    self._initialize_cycle(sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI,
                                           cycle_P_ani)

                # Calculate wave-function contributions from the current slice
                # of grid data by the current mynbands x mynbands matrix block.
                C_nn = self.bmd.extract_block(C_NN, (rank + q) % B, rank)
                self.gd.gemm(1.0, sbuf_ng, C_nn, beta, psit_nG[:, G1:G2])

                # If we're at the last slice, add contributions to P_ani's.
                if cycle_P_ani:
                    I1 = 0
                    for P_ni in P_ani.values():
                        I2 = I1 + P_ni.shape[1]
                        gemm(1.0, sbuf_nI[:, I1:I2], C_nn, beta, P_ni)
                        I1 = I2

                # Wait for all send/receives to finish before next iteration.
                # Swap send and receive buffer such that next becomes current.
                # If we're at the last slice, also finishes the P_ani cycle.
                if q < Q - 1:
                    sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI = self._finish_cycle(
                        sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI, cycle_P_ani)

                # First iteration was special because we initialized the kets
                if q == 0:
                    beta = 1.0

        psit_nG.shape = shape
        return psit_nG
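
For orientation, the simple path above (B == 1 and J == 1) reduces to one
dense matrix product over the band index, matching the docstring equation.
A minimal self-contained NumPy sketch of that update (toy sizes; np.dot
stands in for the gd.gemm/gemm calls, assuming GPAW's
gemm(alpha, a, b, beta, c) convention c <- alpha * b.a + beta * c):

import numpy as np

N, G, I = 4, 10, 3                     # bands, grid points, projectors (toy)
C_NN = np.random.rand(N, N)            # requested linear combinations
psit_nG = np.random.rand(N, G)         # input wave functions (flattened grid)
P_ni = np.random.rand(N, I)            # projector overlaps <p_i|psit_n>

# psi_n <- sum_n' C_nn' psi_n'  (the gd.gemm call in the simple path)
out_nG = np.dot(C_NN, psit_nG)

# <p_i|psi_n> <- sum_n' C_nn' <p_i|psi_n'>  (the P_ani loop)
P_ni = np.dot(C_NN, P_ni)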
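
The band-parallel branch can be checked against the dense product: each of
the B ranks keeps its row of mynbands x mynbands blocks of C_NN fixed while
the kets travel around the ring, so after Q = B steps every block column has
been accumulated. A single-process NumPy sketch of that decomposition (toy
sizes; plain array slicing stands in for MPI and for what bmd.extract_block
is assumed to return):

import numpy as np

B, n, G = 3, 2, 8                      # band groups, bands per group, grid points
N = B * n
C_NN = np.random.rand(N, N)
psit_nG = np.random.rand(N, G)

result = np.zeros((N, G))
for rank in range(B):                  # each rank of the 1D band ring
    for q in range(B):                 # Q = B cycle steps
        src = (rank + q) % B           # group whose kets this rank holds now
        # mynbands x mynbands block: rows owned by rank, columns of src
        C_nn = C_NN[rank * n:(rank + 1) * n, src * n:(src + 1) * n]
        sbuf_ng = psit_nG[src * n:(src + 1) * n]
        result[rank * n:(rank + 1) * n] += np.dot(C_nn, sbuf_ng)

# The ring-accumulated result equals the full matrix product.
assert np.allclose(result, np.dot(C_NN, psit_nG))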
Example #4
from ase.data.extra_molecules import data
from ase.structure import molecule
from gpaw import GPAW, ConvergenceError
from gpaw.eigensolvers import RMM_DIIS
from gpaw.mpi import size
from _gpaw import offload_report
from gpaw import use_mic
from gpaw.mpi import rank

import time

offload_report(0)

if use_mic:
    txt = 'out_C60_mic_p%d.txt' % size
else:
    txt = 'out_C60_p%d.txt' % size

if rank == 0:
    print "Starting solver..."
tstart = time.time()
atoms = molecule('C60', data=data)
atoms.center(3.5)
calc = GPAW(h=0.18, nbands=400, eigensolver=RMM_DIIS(keep_htpsit=False),
            txt=txt,
            maxiter=10)
atoms.set_calculator(calc)
try:
    atoms.get_potential_energy()
except ConvergenceError:
    pass
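
# The listing breaks off here; the unused `tstart` above suggests the
# original example ends by reporting the elapsed wall time. A hedged
# completion along those lines (not from the source):
tend = time.time()
if rank == 0:
    print("Solver took %.1f s" % (tend - tstart))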