    def initialize(self, wfs):
        self.timer = wfs.timer
        self.world = wfs.world
        self.kpt_comm = wfs.kd.comm
        self.band_comm = wfs.band_comm
        self.dtype = wfs.dtype
        self.bd = wfs.bd
        self.ksl = wfs.diagksl
        self.nbands = wfs.bd.nbands
        self.mynbands = wfs.bd.mynbands
        self.operator = wfs.matrixoperator

        if self.mynbands != self.nbands or self.operator.nblocks != 1:
            self.keep_htpsit = False

        if self.keep_htpsit:
            self.Htpsit_nG = wfs.empty(self.nbands)
            if use_mic:
                self.Htpsit_nG_mic = stream.bind(self.Htpsit_nG)
                stream.sync()

        # Preconditioner for the electronic gradients:
        self.preconditioner = wfs.make_preconditioner(self.blocksize)

        for kpt in wfs.kpt_u:
            if kpt.eps_n is None:
                kpt.eps_n = np.empty(self.mynbands)
        
        # Allocate arrays for matrix operator
        self.operator.allocate_arrays()

        self.initialized = True
def rk(alpha, a, beta, c, trans='c'):
    """Rank-k update of a matrix."""

    assert isinstance(a, mic.OffloadArray)
    assert isinstance(c, mic.OffloadArray)

    dt = map_dtype(a.dtype)
    
    # determine sizes of the matrices
    am = a.shape[0]
    ak = np.prod(a.shape[1:])
    ck = c.shape[0]
    cn = np.prod(c.shape[1:])

    n, k = am, ak
    ldc = c.array.strides[0] // c.array.strides[1]

    
    if a.dtype == complex:
        alpha = complex(alpha)
        beta = complex(beta)
        
    # perform the offload
    stream.invoke(library.mic_syrk, dt, a, c, n, k, 
                  ldc, alpha, beta)
    stream.sync()
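
# Host-side NumPy sketch of the rank-k update offloaded above (an assumed
# equivalent of library.mic_syrk: c <- alpha * a . a^H + beta * c with the
# grid axes of a flattened to (n, k); note that the real syrk only updates
# one triangle of c, while this sketch fills the full matrix):
import numpy as np

def rk_reference(alpha, a, beta, c):
    a2 = a.reshape(a.shape[0], -1)          # flatten grid axes, as am/ak above
    return alpha * a2 @ a2.conj().T + beta * c
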
    def initialize_from_lcao_coefficients(self, basis_functions, mynbands):
        for kpt in self.kpt_u:
            kpt.psit_nG = self.gd.zeros(self.bd.mynbands, self.dtype)
            basis_functions.lcao_to_grid(kpt.C_nM, kpt.psit_nG[:mynbands],
                                         kpt.q)
            kpt.C_nM = None
            if use_mic:
                kpt.psit_nG_mic = stream.bind(kpt.psit_nG)
                stream.sync()
    def empty(self, n=(), dtype=float, global_array=False, pad=False, usemic=False):
        """Return new uninitialized 3D array for this domain.

        The type can be set with the ``dtype`` keyword (default:
        ``float``).  Extra dimensions can be added with ``n=dim``.  A
        global array spanning all domains can be allocated with
        ``global_array=True``.  With ``usemic=True`` the array is bound
        to the offload stream and returned as a mic.OffloadArray."""

        array = self._new_array(n, dtype, False, global_array, pad)
        if usemic:
            oa = stream.bind(array)
            stream.sync()
            return oa
        else:
            return array
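
# Usage sketch for the usemic keyword above (a hypothetical helper; assumes a
# GridDescriptor instance gd and the module-level use_mic flag used
# throughout this file):
def make_work_array(gd, nbands, dtype=float):
    if use_mic:
        # OffloadArray: host data stays in .array, device copy lives on MIC
        return gd.empty(nbands, dtype, usemic=True)
    return gd.empty(nbands, dtype)
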
def gemm(alpha, a, b, beta, c, transa='n'):
    # we want to make sure that we only use OffloadArrays here
    assert isinstance(a, mic.OffloadArray)
    assert isinstance(b, mic.OffloadArray)
    assert isinstance(c, mic.OffloadArray)
    
    # determine the datatype and map it to int
    dt = map_dtype(a.dtype)
    
    # determine sizes of the matrices
    am = a.shape[0]
    ak = np.prod(a.shape[1:])
    bk = b.shape[0]
    bn = np.prod(b.shape[1:])
    cm = c.shape[0]
    cn = np.prod(c.shape[1:])
    
    # just some safety checks
    if transa == 'n':
        assert am == bn
        assert ak == cn
        assert bk == cm
        trans = 0
        m, n, k = ak, bk, am
        lda = a.array.strides[0] // a.array.strides[-1]
        ldb = b.array.strides[0] // b.array.strides[1]
        ldc = c.array.strides[0] // c.array.strides[-1]
    else:
        assert am == cn
        assert ak == bn
        assert bk == cm
        trans = 1
        m, n, k = am, bk, ak
        lda = k
        ldb = b.array.strides[0] // b.array.strides[-1]
        ldc = c.array.strides[0] // c.array.strides[1]

    if a.dtype == complex:
        alpha = complex(alpha)
        beta = complex(beta)
            
    # perform the offload
    stream.invoke(library.mic_gemm, dt, 
                  a, b, c, 
                  m, n, k, 
                  lda, ldb, ldc, 
                  alpha, beta, trans)
    stream.sync()
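
# Host-side NumPy sketch of the contraction offloaded above (an assumed
# equivalent, inferred from the shape asserts; the 'c' path is shown, a 't'
# path would use a plain transpose instead of the conjugate transpose):
#   transa == 'n':  c <- alpha * b . a   + beta * c
#   otherwise:      c <- alpha * b . a^H + beta * c   (grid axes contracted)
import numpy as np

def gemm_reference(alpha, a, b, beta, c, transa='n'):
    a2 = a.reshape(a.shape[0], -1)
    c2 = c.reshape(c.shape[0], -1)
    if transa == 'n':
        out = alpha * b @ a2 + beta * c2
    else:
        out = alpha * b.reshape(b.shape[0], -1) @ a2.conj().T + beta * c2
    return out.reshape(c.shape)
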
    def allocate_arrays(self):
        ngroups = self.bd.comm.size
        mynbands = self.bd.mynbands
        dtype = self.dtype
        if ngroups > 1:
            self.A_qnn = np.zeros((self.Q, mynbands, mynbands), dtype)
        self.A_nn = self.bmd.zeros(dtype=dtype)
        if use_mic:
            self.A_nn_mic = stream.bind(self.A_nn)
            stream.sync()

        if ngroups == 1 and self.nblocks == 1:
            self.work1_xG = self.gd.empty(self.bd.mynbands, self.dtype)
            if use_mic:
                self.work1_xG_mic = stream.bind(self.work1_xG)
                stream.sync()
        else:
            self.work1_xG = self.gd.empty(self.X, self.dtype)
            self.work2_xG = self.gd.empty(self.X, self.dtype)
def r2k(alpha, a, b, beta, c):
    """Rank-2k update of a matrix."""

    assert isinstance(a, mic.OffloadArray)
    assert isinstance(b, mic.OffloadArray)
    assert isinstance(c, mic.OffloadArray)

    assert (map_dtype(a.dtype) != 2)

    # determine sizes of the matrices
    am = a.shape[0]
    ak = np.prod(a.shape[1:])
    bm = b.shape[0]
    bk = np.prod(b.shape[1:])
    ck = c.shape[0]
    cn = np.prod(c.shape[1:])

    n, k = am, ak
    ldc = c.array.strides[0] // c.array.strides[1]

    stream.invoke(library.mic_dsyr2k, a, b, c, n, k, ldc, alpha, beta)
    stream.sync()
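
# Host-side sketch of the symmetric rank-2k update offloaded above (an
# assumed equivalent; real data only, as enforced by the map_dtype assert,
# and the real BLAS routine touches only one triangle of c):
#   c <- alpha * (a . b^T + b . a^T) + beta * c, with grid axes flattened.
import numpy as np

def r2k_reference(alpha, a, b, beta, c):
    a2 = a.reshape(a.shape[0], -1)
    b2 = b.reshape(b.shape[0], -1)
    return alpha * (a2 @ b2.T + b2 @ a2.T) + beta * c
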
    def subspace_diagonalize(self, hamiltonian, wfs, kpt):
        """Diagonalize the Hamiltonian in the subspace of kpt.psit_nG

        *Htpsit_nG* is a work array of the same size as psit_nG which
        contains the local part of the Hamiltonian times psit on exit.

        First, the Hamiltonian (defined by *kin*, *vt_sG*, and
        *dH_asp*) is applied to the wave functions, then the *H_nn*
        matrix is calculated and diagonalized, and finally, the wave
        functions (and also *Htpsit_nG*) are rotated.  Also the
        projections *P_ani* are rotated.

        It is assumed that the wave functions *psit_nG* are orthonormal
        and that the integrals of projector functions and wave functions
        *P_ani* are already calculated.

        Return the rotated wave functions and, if self.keep_htpsit is
        True, H applied to the rotated wave functions.
        """

        if self.band_comm.size > 1 and wfs.bd.strided:
            raise NotImplementedError

        self.timer.start('Subspace diag')

        if use_mic:
            psit_nG = kpt.psit_nG_mic
            # psit_nG.update_device()
            # stream.sync()
        else:
            psit_nG = kpt.psit_nG
        P_ani = kpt.P_ani

        if self.keep_htpsit:
            if use_mic:
                Htpsit_nG = self.Htpsit_nG_mic
            else:
                Htpsit_nG = reshape(self.Htpsit_nG, psit_nG.shape)
        else:
            Htpsit_nG = None

        def H(psit_xG):
            if self.keep_htpsit:
                result_xG = Htpsit_nG
            else:
                if use_mic:
                    result_xG = self.operator.work1_xG_mic
                else:
                    result_xG = reshape(self.operator.work1_xG, psit_xG.shape)
            if use_mic:
                psit_xG.update_device()
                wfs.apply_pseudo_hamiltonian(kpt, hamiltonian, psit_xG.array,
                                             result_xG.array)
                result_xG.update_device() 
                stream.sync()
            else:
                wfs.apply_pseudo_hamiltonian(kpt, hamiltonian, psit_xG,
                                             result_xG)
            hamiltonian.xc.apply_orbital_dependent_hamiltonian(
                kpt, psit_xG, result_xG, hamiltonian.dH_asp)
            return result_xG

        def dH(a, P_ni):
            return np.dot(P_ni, unpack(hamiltonian.dH_asp[a][kpt.s]))

        self.timer.start('calc_h_matrix')
        H_nn = self.operator.calculate_matrix_elements(psit_nG, P_ani,
                                                       H, dH)
        hamiltonian.xc.correct_hamiltonian_matrix(kpt, H_nn)
        self.timer.stop('calc_h_matrix')

        diagonalization_string = repr(self.ksl)
        wfs.timer.start(diagonalization_string)
        self.ksl.diagonalize(H_nn, kpt.eps_n)
        # H_nn now contains the result of the diagonalization.
        wfs.timer.stop(diagonalization_string)

        self.timer.start('rotate_psi')
        psit_nG = self.operator.matrix_multiply(H_nn, psit_nG, P_ani)
        if self.keep_htpsit:
            if use_mic:
                Htpsit_nG = self.operator.matrix_multiply(
                    H_nn, Htpsit_nG, out_nG=kpt.psit_nG_mic)
            else:
                Htpsit_nG = self.operator.matrix_multiply(H_nn, Htpsit_nG,
                                                          out_nG=kpt.psit_nG)

        # Rotate orbital dependent XC stuff:
        hamiltonian.xc.rotate(kpt, H_nn)

        self.timer.stop('rotate_psi')
        self.timer.stop('Subspace diag')

        if use_mic:
            psit_nG.update_host()
            stream.sync()
            if self.keep_htpsit:
                Htpsit_nG.update_host()
                stream.sync()
                return psit_nG.array, Htpsit_nG.array
            else:
                return psit_nG.array, Htpsit_nG
        else:
            return psit_nG, Htpsit_nG
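
# Minimal serial sketch of the subspace step above (hypothetical names; no
# PAW corrections, band parallelization or MIC offload): apply H within the
# current subspace, diagonalize H_nn, and rotate the orbitals into the
# eigenbasis, mirroring calculate_matrix_elements + diagonalize +
# matrix_multiply.
import numpy as np

def subspace_rotate(psit_nG, apply_h, dv):
    """psit_nG: (nbands, npoints) orthonormal orbitals; apply_h(psit) -> H psit."""
    Hpsit_nG = apply_h(psit_nG)
    H_nn = dv * psit_nG.conj() @ Hpsit_nG.T      # H_nn[n, m] = <psi_n|H|psi_m>
    eps_n, U_nn = np.linalg.eigh(H_nn)
    psit_nG = U_nn.T @ psit_nG                   # psi_n <- sum_m U[m, n] psi_m
    return eps_n, psit_nG
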
    def orthonormalize(self, wfs, kpt, psit_nG=None):
        """Orthonormalizes the vectors a_nG with respect to the overlap.

        First, a Cholesky factorization C is done for the overlap
        matrix S_nn = <a_nG | S | a_nG> = C*_nn C_nn Cholesky matrix C
        is inverted and orthonormal vectors a_nG' are obtained as::

          psit_nG' = inv(C_nn) psit_nG
                    __
           ~   _   \    -1   ~   _
          psi (r) = )  C    psi (r)
             n     /__  nm     m
                    m

        Parameters
        ----------

        psit_nG: ndarray, input/output
            On input the set of vectors to orthonormalize,
            on output the overlap-orthonormalized vectors.
        kpt: KPoint object:
            k-point object from kpoint.py.
        work_nG: ndarray
            Optional work array for overlap matrix times psit_nG.
        work_nn: ndarray
            Optional work array for overlap matrix.

        """
        self.timer.start('Orthonormalize')
        if psit_nG is None:
            psit_nG = kpt.psit_nG
            if use_mic:
                psit_nG_mic = kpt.psit_nG_mic
        else:
            if use_mic:
                psit_nG_mic = stream.bind(psit_nG, update_device=False)
                stream.sync()

        P_ani = kpt.P_ani
        self.timer.start('projections')
        wfs.pt.integrate(psit_nG, P_ani, kpt.q)
        self.timer.stop('projections')

        # Construct the overlap matrix:
        operator = wfs.matrixoperator

        def S(psit_G):
            return psit_G
        
        def dS(a, P_ni):
            return np.dot(P_ni, wfs.setups[a].dO_ii)

        if use_mic:
            self.timer.start('calc_s_matrix')
            psit_nG_mic.update_device()
            stream.sync()
            S_nn = operator.calculate_matrix_elements(psit_nG_mic, P_ani, S, dS)
            self.timer.stop('calc_s_matrix')
        else:
            self.timer.start('calc_s_matrix')
            S_nn = operator.calculate_matrix_elements(psit_nG, P_ani, S, dS)
            self.timer.stop('calc_s_matrix')

        orthonormalization_string = repr(self.ksl)
        self.timer.start(orthonormalization_string)
        #
        if extra_parameters.get('sic', False):
            #
            # symmetric Loewdin Orthonormalization
            tri2full(S_nn, UL='L', map=np.conj)
            nrm_n = np.empty(S_nn.shape[0])
            diagonalize(S_nn, nrm_n)
            nrm_nn = np.diag(1.0/np.sqrt(nrm_n))
            S_nn = np.dot(np.dot(S_nn.T.conj(), nrm_nn), S_nn)
        else:
            #
            self.ksl.inverse_cholesky(S_nn)
        # S_nn now contains the inverse of the Cholesky factorization.
        # Let's call it something different:
        C_nn = S_nn
        del S_nn
        self.timer.stop(orthonormalization_string)

        self.timer.start('rotate_psi')
        if use_mic:
            operator.matrix_multiply(C_nn, psit_nG_mic, P_ani,
                                     out_nG=kpt.psit_nG_mic)
            kpt.psit_nG_mic.update_host()
            stream.sync()
            # kpt.psit_nG[:] = self.psit_nG_mic.array[:]
        else:
            operator.matrix_multiply(C_nn, psit_nG, P_ani, out_nG=kpt.psit_nG)
        self.timer.stop('rotate_psi')
        self.timer.stop('Orthonormalize')
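
# Minimal serial sketch of the inverse-Cholesky orthonormalization above for
# real orbitals (hypothetical names; ignores the PAW dO_ii overlap
# corrections, parallelization and the MIC path):
import numpy as np

def cholesky_orthonormalize(psit_nG, dv):
    S_nn = dv * psit_nG @ psit_nG.T          # overlap S_nm = <psi_n|psi_m>
    L_nn = np.linalg.cholesky(S_nn)          # S = L L^T, L lower triangular
    return np.linalg.solve(L_nn, psit_nG)    # psi <- inv(L) psi, so S -> I
# Afterwards dv * psit @ psit.T equals the identity up to round-off.
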
    def matrix_multiply(self, C_NN, psit_nG, P_ani=None, out_nG=None):
        """Calculate new linear combinations of wave functions.

        Results will be put in the *P_ani* dict and a new psit_nG returned::

                     __                                __
            ~       \       ~           ~a  ~         \       ~a  ~
           psi  <--  ) C   psi    and  <p |psi >  <--  ) C   <p |psi >
              n     /__ nn'   n'         i    n       /__ nn'  i    n'
                     n'                                n'


        Parameters:

        C_NN: ndarray
            Matrix representation of the requested linear combinations. Even
            with a hermitian operator, this matrix need not be self-adjoint.
            However, unlike the results from calculate_matrix_elements, it is
            assumed that all matrix elements are filled in (use e.g. tri2full).
        psit_nG: ndarray
            Set of vectors in which the matrix elements are evaluated.
        P_ani: dict
            Dictionary of projector overlap integrals P_ni = <p_i | psit_nG>.

        """

        if self.A_nn is None:
            self.allocate_arrays()

        band_comm = self.bd.comm
        B = band_comm.size
        J = self.nblocks
        N = self.bd.mynbands

        C_NN = self.bmd.redistribute_input(C_NN)

        if B == 1 and J == 1:
            # Simple case:
            if use_mic:
                work_nG = self.work1_xG_mic
            else:
                work_nG = reshape(self.work1_xG, psit_nG.shape)
            if out_nG is None:
                out_nG = work_nG
                # out_nG[:] = 117  # gemm may not like nan's
            elif out_nG is psit_nG:
                work_nG[:] = psit_nG
                psit_nG = work_nG

            if use_mic:
                if self.gd.comm.rank == 0:
                    offload_report(1)
                C_NN_mic = self.A_nn_mic
                C_NN_mic.array[:] = C_NN[:]
                C_NN_mic.update_device()
                stream.sync()
                mic_gemm(1.0, psit_nG, C_NN_mic, 0.0, out_nG)
                if self.gd.comm.rank == 0:
                    offload_report(0)
            else:
                self.gd.gemm(1.0, psit_nG, C_NN, 0.0, out_nG)
            if P_ani:
                for P_ni in P_ani.values():
                    gemm(1.0, P_ni.copy(), C_NN, 0.0, P_ni)
            return out_nG
        
        # Now it gets nasty! We parallelize over B groups of bands and
        # each grid chunk is divided in J smaller slices (less memory).

        Q = B  # always non-hermitian XXX
        rank = band_comm.rank
        shape = psit_nG.shape
        psit_nG = psit_nG.reshape(N, -1)
        G = psit_nG.shape[1]  # number of grid-points
        g = int(np.ceil(G / float(J)))

        # Buffers for send/receive of pre-multiplication versions of P_ani's.
        sbuf_nI = rbuf_nI = None
        if P_ani:
            sbuf_nI = np.hstack([P_ni for P_ni in P_ani.values()])
            sbuf_nI = np.ascontiguousarray(sbuf_nI)
            if B > 1:
                rbuf_nI = np.empty_like(sbuf_nI)

        # Because of the amount of communication involved, we need to
        # be synchronized up to this point, but only on the 1D band_comm
        # communication ring.
        band_comm.barrier()
        while g * J >= G + g:  # remove extra slice(s)
            J -= 1
        assert 0 < g * J < G + g

        work1_xG = reshape(self.work1_xG, (self.X,) + psit_nG.shape[1:])
        work2_xG = reshape(self.work2_xG, (self.X,) + psit_nG.shape[1:])

        for j in range(J):
            G1 = j * g
            G2 = G1 + g
            if G2 > G:
                G2 = G
                g = G2 - G1
            sbuf_ng = reshape(work1_xG, (N, g))
            rbuf_ng = reshape(work2_xG, (N, g))
            sbuf_ng[:] = psit_nG[:, G1:G2]
            beta = 0.0
            cycle_P_ani = (j == J - 1 and P_ani)
            for q in range(Q):
                # Start sending currently buffered kets to rank below
                # and receiving next set of kets from rank above us.
                # If we're at the last slice, start cycling P_ani too.
                if q < Q - 1:
                    self._initialize_cycle(sbuf_ng, rbuf_ng,
                                           sbuf_nI, rbuf_nI, cycle_P_ani)

                # Calculate wave-function contributions from the current slice
                # of grid data by the current mynbands x mynbands matrix block.
                C_nn = self.bmd.extract_block(C_NN, (rank + q) % B, rank)
                self.gd.gemm(1.0, sbuf_ng, C_nn, beta, psit_nG[:, G1:G2])

                # If we're at the last slice, add contributions to P_ani's.
                if cycle_P_ani:
                    I1 = 0
                    for P_ni in P_ani.values():
                        I2 = I1 + P_ni.shape[1]
                        gemm(1.0, sbuf_nI[:, I1:I2], C_nn, beta, P_ni)
                        I1 = I2

                # Wait for all send/receives to finish before next iteration.
                # Swap send and receive buffer such that next becomes current.
                # If we're at the last slice, also finishes the P_ani cycle.
                if q < Q - 1:
                    sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI = self._finish_cycle(
                        sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI, cycle_P_ani)

                # First iteration was special because we initialized the kets
                if q == 0:
                    beta = 1.0

        psit_nG.shape = shape
        return psit_nG
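
# Dense-equivalent sketch of the B == 1, J == 1 branch above (hypothetical
# names): the same rotation applied to the grid part and to the projector
# coefficients, as stated in the docstring:
#   psi_n <- sum_n' C[n, n'] psi_n'   and   P_ni <- sum_n' C[n, n'] P_n'i
import numpy as np

def rotate_reference(C_NN, psit_nG, P_ani=None):
    out_nG = C_NN @ psit_nG.reshape(len(psit_nG), -1)
    if P_ani:
        for a, P_ni in P_ani.items():
            P_ani[a] = C_NN @ P_ni
    return out_nG.reshape(psit_nG.shape)
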
    def integrate(self, a_xg, b_yg=None,
                  global_integral=True, hermitian=False,
                  _transposed_result=None):
        """Integrate function(s) over domain.

        a_xg: ndarray
            Function(s) to be integrated.
        b_yg: ndarray
            If present, integrate a_xg.conj() * b_yg.
        global_integral: bool
            If the array(s) are distributed over several domains, then the
            total sum will be returned.  To get the local contribution
            only, use global_integral=False.
        hermitian: bool
            Result is hermitian.
        _transposed_result: ndarray
            Long story.  Don't use this unless you are a method of the
            MatrixOperator class ..."""
        
        xshape = a_xg.shape[:-3]
        
        if b_yg is None:
            # Only one array:
            result = a_xg.reshape(xshape + (-1,)).sum(axis=-1) * self.dv
            if global_integral:
                if result.ndim == 0:
                    result = self.comm.sum(result)
                else:
                    self.comm.sum(result)
            return result

        if isinstance(a_xg, mic.OffloadArray):
            # offload arrays have to be contiguous in any case
            A_xg = a_xg
            B_yg = b_yg
        else:
            A_xg = np.ascontiguousarray(a_xg.reshape((-1,) + a_xg.shape[-3:]))
            B_yg = np.ascontiguousarray(b_yg.reshape((-1,) + b_yg.shape[-3:]))

        if _transposed_result is None:
            result_yx = np.zeros((len(B_yg), len(A_xg)), A_xg.dtype)
        else:
            result_yx = _transposed_result
            global_integral = False

        if isinstance(a_xg, mic.OffloadArray):
            result_yx_mic = stream.bind(result_yx)
            stream.sync()
            # result_yx_mic.fillfrom(result_yx)
            # result_yx_mic.array[:] = result_yx[:]
            # result_yx_mic.update_device()

        if a_xg is b_yg:
            if isinstance(a_xg, mic.OffloadArray):
                # dsyrk performs badly on the MIC, so use dgemm here
                # mic_rk(self.dv, A_xg, 0.0, result_yx_mic)
                mic_gemm(self.dv, A_xg, A_xg, 0.0, result_yx_mic, 'c')
            else:
                rk(self.dv, A_xg, 0.0, result_yx)
        elif hermitian:
            if isinstance(a_xg, mic.OffloadArray):
                mic_r2k(self.dv, A_xg, B_yg, 0.0, result_yx_mic)
            else:
                r2k(0.5 * self.dv, A_xg, B_yg, 0.0, result_yx)
        else:
            if isinstance(a_xg, mic.OffloadArray):
                mic_gemm(self.dv, A_xg, B_yg, 0.0, result_yx_mic, 'c')
            else:
                gemm(self.dv, A_xg, B_yg, 0.0, result_yx, 'c')
        
        if isinstance(a_xg, mic.OffloadArray):
            result_yx_mic.update_host()
            stream.sync()

        if global_integral:
            self.comm.sum(result_yx)

        yshape = b_yg.shape[:-3]
        result = result_yx.T.reshape(xshape + yshape)
        
        if result.ndim == 0:
            return result.item()
        else:
            return result
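
# Reference for the two-array case of integrate() above (hypothetical names;
# ignores the domain sum over self.comm and the MIC path): the returned block
# is result[x, y] = dv * sum_g conj(a_xg[x, g]) * b_yg[y, g], i.e. the
# transpose of the result_yx that gemm/mic_gemm fill in.
import numpy as np

def integrate_reference(a_xg, b_yg, dv):
    A = a_xg.reshape(len(a_xg), -1)
    B = b_yg.reshape(len(b_yg), -1)
    return dv * np.einsum('xg,yg->xy', A.conj(), B)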