Esempio n. 1
0
File: uccsd.py Progetto: tmash/pyscf
def grad_elec(cc_grad, t1=None, t2=None, l1=None, l2=None, eris=None, atmlst=None,
              d1=None, d2=None, verbose=logger.INFO):
    mycc = cc_grad.base
    if eris is not None:
        if (abs(eris.focka - numpy.diag(eris.focka.diagonal())).max() > 1e-3 or
            abs(eris.fockb - numpy.diag(eris.fockb.diagonal())).max() > 1e-3):
            raise RuntimeError('UCCSD gradients does not support NHF (non-canonical HF)')

    if t1 is None: t1 = mycc.t1
    if t2 is None: t2 = mycc.t2
    if l1 is None: l1 = mycc.l1
    if l2 is None: l2 = mycc.l2

    log = logger.new_logger(mycc, verbose)
    time0 = time.clock(), time.time()

    log.debug('Build uccsd rdm1 intermediates')
    if d1 is None:
        d1 = uccsd_rdm._gamma1_intermediates(mycc, t1, t2, l1, l2)
    time1 = log.timer_debug1('rdm1 intermediates', *time0)
    log.debug('Build uccsd rdm2 intermediates')
    fdm2 = lib.H5TmpFile()
    if d2 is None:
        d2 = uccsd_rdm._gamma2_outcore(mycc, t1, t2, l1, l2, fdm2, True)
    time1 = log.timer_debug1('rdm2 intermediates', *time1)

    mol = cc_grad.mol
    mo_a, mo_b = mycc.mo_coeff
    mo_ea, mo_eb = mycc._scf.mo_energy
    nao, nmoa = mo_a.shape
    nmob = mo_b.shape[1]
    nocca = numpy.count_nonzero(mycc.mo_occ[0] > 0)
    noccb = numpy.count_nonzero(mycc.mo_occ[1] > 0)
    nvira = nmoa - nocca
    nvirb = nmob - noccb
    with_frozen = not (mycc.frozen is None or mycc.frozen is 0)
    moidx = mycc.get_frozen_mask()
    OA_a, VA_a, OF_a, VF_a = ccsd_grad._index_frozen_active(moidx[0], mycc.mo_occ[0])
    OA_b, VA_b, OF_b, VF_b = ccsd_grad._index_frozen_active(moidx[1], mycc.mo_occ[1])

    log.debug('symmetrized rdm2 and MO->AO transformation')
# Roughly, dm2*2 is computed in _rdm2_mo2ao
    mo_active = (mo_a[:,numpy.hstack((OA_a,VA_a))],
                 mo_b[:,numpy.hstack((OA_b,VA_b))])
    _rdm2_mo2ao(mycc, d2, mo_active, fdm2)  # transform the active orbitals
    time1 = log.timer_debug1('MO->AO transformation', *time1)
    hf_dm1a, hf_dm1b = mycc._scf.make_rdm1(mycc.mo_coeff, mycc.mo_occ)
    hf_dm1 = hf_dm1a + hf_dm1b

    if atmlst is None:
        atmlst = range(mol.natm)
    offsetdic = mol.offset_nr_by_atom()
    diagidx = numpy.arange(nao)
    diagidx = diagidx*(diagidx+1)//2 + diagidx
    de = numpy.zeros((len(atmlst),3))
    Imata = numpy.zeros((nao,nao))
    Imatb = numpy.zeros((nao,nao))
    vhf1 = fdm2.create_dataset('vhf1', (len(atmlst),2,3,nao,nao), 'f8')

# 2e AO integrals dot 2pdm
    max_memory = max(0, mycc.max_memory - lib.current_memory()[0])
    blksize = max(1, int(max_memory*.9e6/8/(nao**3*2.5)))

    for k, ia in enumerate(atmlst):
        shl0, shl1, p0, p1 = offsetdic[ia]
        ip1 = p0
        vhf = numpy.zeros((2,3,nao,nao))
        for b0, b1, nf in ccsd_grad._shell_prange(mol, shl0, shl1, blksize):
            ip0, ip1 = ip1, ip1 + nf
            dm2bufa = ccsd_grad._load_block_tril(fdm2['dm2aa+ab'], ip0, ip1, nao)
            dm2bufb = ccsd_grad._load_block_tril(fdm2['dm2bb+ab'], ip0, ip1, nao)
            dm2bufa[:,:,diagidx] *= .5
            dm2bufb[:,:,diagidx] *= .5
            shls_slice = (b0,b1,0,mol.nbas,0,mol.nbas,0,mol.nbas)
            eri0 = mol.intor('int2e', aosym='s2kl', shls_slice=shls_slice)
            Imata += lib.einsum('ipx,iqx->pq', eri0.reshape(nf,nao,-1), dm2bufa)
            Imatb += lib.einsum('ipx,iqx->pq', eri0.reshape(nf,nao,-1), dm2bufb)
            eri0 = None

            eri1 = mol.intor('int2e_ip1', comp=3, aosym='s2kl',
                             shls_slice=shls_slice).reshape(3,nf,nao,-1)
            de[k] -= numpy.einsum('xijk,ijk->x', eri1, dm2bufa) * 2
            de[k] -= numpy.einsum('xijk,ijk->x', eri1, dm2bufb) * 2
            dm2bufa = dm2bufb = None
# HF part
            for i in range(3):
                eri1tmp = lib.unpack_tril(eri1[i].reshape(nf*nao,-1))
                eri1tmp = eri1tmp.reshape(nf,nao,nao,nao)
                vhf[:,i] += numpy.einsum('ijkl,ij->kl', eri1tmp, hf_dm1[ip0:ip1])
                vhf[0,i] -= numpy.einsum('ijkl,il->kj', eri1tmp, hf_dm1a[ip0:ip1])
                vhf[1,i] -= numpy.einsum('ijkl,il->kj', eri1tmp, hf_dm1b[ip0:ip1])
                vhf[:,i,ip0:ip1] += numpy.einsum('ijkl,kl->ij', eri1tmp, hf_dm1)
                vhf[0,i,ip0:ip1] -= numpy.einsum('ijkl,jk->il', eri1tmp, hf_dm1a)
                vhf[1,i,ip0:ip1] -= numpy.einsum('ijkl,jk->il', eri1tmp, hf_dm1b)
            eri1 = eri1tmp = None
        vhf1[k] = vhf
        log.debug('2e-part grad of atom %d %s = %s', ia, mol.atom_symbol(ia), de[k])
        time1 = log.timer_debug1('2e-part grad of atom %d'%ia, *time1)

    s0 = mycc._scf.get_ovlp()
    Imata = reduce(numpy.dot, (mo_a.T, Imata, s0, mo_a)) * -1
    Imatb = reduce(numpy.dot, (mo_b.T, Imatb, s0, mo_b)) * -1

    dm1a = numpy.zeros((nmoa,nmoa))
    dm1b = numpy.zeros((nmob,nmob))
    doo, dOO = d1[0]
    dov, dOV = d1[1]
    dvo, dVO = d1[2]
    dvv, dVV = d1[3]
    if with_frozen:
        dco = Imata[OF_a[:,None],OA_a] / (mo_ea[OF_a,None] - mo_ea[OA_a])
        dfv = Imata[VF_a[:,None],VA_a] / (mo_ea[VF_a,None] - mo_ea[VA_a])
        dm1a[OA_a[:,None],OA_a] = (doo + doo.T) * .5
        dm1a[OF_a[:,None],OA_a] = dco
        dm1a[OA_a[:,None],OF_a] = dco.T
        dm1a[VA_a[:,None],VA_a] = (dvv + dvv.T) * .5
        dm1a[VF_a[:,None],VA_a] = dfv
        dm1a[VA_a[:,None],VF_a] = dfv.T
        dco = Imatb[OF_b[:,None],OA_b] / (mo_eb[OF_b,None] - mo_eb[OA_b])
        dfv = Imatb[VF_b[:,None],VA_b] / (mo_eb[VF_b,None] - mo_eb[VA_b])
        dm1b[OA_b[:,None],OA_b] = (dOO + dOO.T) * .5
        dm1b[OF_b[:,None],OA_b] = dco
        dm1b[OA_b[:,None],OF_b] = dco.T
        dm1b[VA_b[:,None],VA_b] = (dVV + dVV.T) * .5
        dm1b[VF_b[:,None],VA_b] = dfv
        dm1b[VA_b[:,None],VF_b] = dfv.T
    else:
        dm1a[:nocca,:nocca] = (doo + doo.T) * .5
        dm1a[nocca:,nocca:] = (dvv + dvv.T) * .5
        dm1b[:noccb,:noccb] = (dOO + dOO.T) * .5
        dm1b[noccb:,noccb:] = (dVV + dVV.T) * .5

    dm1 = (reduce(numpy.dot, (mo_a, dm1a, mo_a.T)),
           reduce(numpy.dot, (mo_b, dm1b, mo_b.T)))
    vhf = mycc._scf.get_veff(mycc.mol, dm1)
    Xvo = reduce(numpy.dot, (mo_a[:,nocca:].T, vhf[0], mo_a[:,:nocca]))
    XVO = reduce(numpy.dot, (mo_b[:,noccb:].T, vhf[1], mo_b[:,:noccb]))
    Xvo+= Imata[:nocca,nocca:].T - Imata[nocca:,:nocca]
    XVO+= Imatb[:noccb,noccb:].T - Imatb[noccb:,:noccb]

    dm1_resp = _response_dm1(mycc, (Xvo,XVO), eris)
    dm1a += dm1_resp[0]
    dm1b += dm1_resp[1]
    time1 = log.timer_debug1('response_rdm1 intermediates', *time1)

    Imata[nocca:,:nocca] = Imata[:nocca,nocca:].T
    Imatb[noccb:,:noccb] = Imatb[:noccb,noccb:].T
    im1 = reduce(numpy.dot, (mo_a, Imata, mo_a.T))
    im1+= reduce(numpy.dot, (mo_b, Imatb, mo_b.T))
    time1 = log.timer_debug1('response_rdm1', *time1)

    log.debug('h1 and JK1')
    # Initialize hcore_deriv with the underlying SCF object because some
    # extensions (e.g. QM/MM, solvent) modifies the SCF object only.
    mf_grad = cc_grad.base._scf.nuc_grad_method()
    hcore_deriv = mf_grad.hcore_generator(mol)
    s1 = mf_grad.get_ovlp(mol)

    zeta = (mo_ea[:,None] + mo_ea) * .5
    zeta[nocca:,:nocca] = mo_ea[:nocca]
    zeta[:nocca,nocca:] = mo_ea[:nocca].reshape(-1,1)
    zeta_a = reduce(numpy.dot, (mo_a, zeta*dm1a, mo_a.T))
    zeta = (mo_eb[:,None] + mo_eb) * .5
    zeta[noccb:,:noccb] = mo_eb[:noccb]
    zeta[:noccb,noccb:] = mo_eb[:noccb].reshape(-1,1)
    zeta_b = reduce(numpy.dot, (mo_b, zeta*dm1b, mo_b.T))

    dm1 = (reduce(numpy.dot, (mo_a, dm1a, mo_a.T)),
           reduce(numpy.dot, (mo_b, dm1b, mo_b.T)))
    vhf_s1occ = mycc._scf.get_veff(mol, (dm1[0]+dm1[0].T, dm1[1]+dm1[1].T))
    p1a = numpy.dot(mo_a[:,:nocca], mo_a[:,:nocca].T)
    p1b = numpy.dot(mo_b[:,:noccb], mo_b[:,:noccb].T)
    vhf_s1occ = (reduce(numpy.dot, (p1a, vhf_s1occ[0], p1a)) +
                 reduce(numpy.dot, (p1b, vhf_s1occ[1], p1b))) * .5
    time1 = log.timer_debug1('h1 and JK1', *time1)

    # Hartree-Fock part contribution
    dm1pa = hf_dm1a + dm1[0]*2
    dm1pb = hf_dm1b + dm1[1]*2
    dm1 = dm1[0] + dm1[1] + hf_dm1
    zeta_a += rhf_grad.make_rdm1e(mo_ea, mo_a, mycc.mo_occ[0])
    zeta_b += rhf_grad.make_rdm1e(mo_eb, mo_b, mycc.mo_occ[1])
    zeta = zeta_a + zeta_b

    for k, ia in enumerate(atmlst):
        shl0, shl1, p0, p1 = offsetdic[ia]
# s[1] dot I, note matrix im1 is not hermitian
        de[k] += numpy.einsum('xij,ij->x', s1[:,p0:p1], im1[p0:p1])
        de[k] += numpy.einsum('xji,ij->x', s1[:,p0:p1], im1[:,p0:p1])
# h[1] \dot DM, contribute to f1
        h1ao = hcore_deriv(ia)
        de[k] += numpy.einsum('xij,ji->x', h1ao, dm1)
# -s[1]*e \dot DM,  contribute to f1
        de[k] -= numpy.einsum('xij,ij->x', s1[:,p0:p1], zeta[p0:p1]  )
        de[k] -= numpy.einsum('xji,ij->x', s1[:,p0:p1], zeta[:,p0:p1])
# -vhf[s_ij[1]],  contribute to f1, *2 for s1+s1.T
        de[k] -= numpy.einsum('xij,ij->x', s1[:,p0:p1], vhf_s1occ[p0:p1]) * 2
        de[k] -= numpy.einsum('xij,ij->x', vhf1[k,0], dm1pa)
        de[k] -= numpy.einsum('xij,ij->x', vhf1[k,1], dm1pb)

    log.timer('%s gradients' % mycc.__class__.__name__, *time0)
    return de
Esempio n. 2
0
def kernel(mycc, t1=None, t2=None, l1=None, l2=None, eris=None, atmlst=None,
           mf_grad=None, d1=None, d2=None, verbose=logger.INFO):
    if eris is not None:
        if (abs(eris.focka - numpy.diag(eris.focka.diagonal())).max() > 1e-3 or
            abs(eris.fockb - numpy.diag(eris.fockb.diagonal())).max() > 1e-3):
            raise RuntimeError('UCCSD gradients does not support NHF (non-canonical HF)')

    if t1 is None: t1 = mycc.t1
    if t2 is None: t2 = mycc.t2
    if l1 is None: l1 = mycc.l1
    if l2 is None: l2 = mycc.l2
    if mf_grad is None: mf_grad = mycc._scf.nuc_grad_method()

    log = logger.new_logger(mycc, verbose)
    time0 = time.clock(), time.time()

    log.debug('Build uccsd rdm1 intermediates')
    if d1 is None:
        d1 = uccsd_rdm._gamma1_intermediates(mycc, t1, t2, l1, l2)
    time1 = log.timer_debug1('rdm1 intermediates', *time0)
    log.debug('Build uccsd rdm2 intermediates')
    fdm2 = lib.H5TmpFile()
    if d2 is None:
        d2 = uccsd_rdm._gamma2_outcore(mycc, t1, t2, l1, l2, fdm2, True)
    time1 = log.timer_debug1('rdm2 intermediates', *time1)

    mol = mycc.mol
    mo_a, mo_b = mycc.mo_coeff
    mo_ea, mo_eb = mycc._scf.mo_energy
    nao, nmoa = mo_a.shape
    nmob = mo_b.shape[1]
    nocca = numpy.count_nonzero(mycc.mo_occ[0] > 0)
    noccb = numpy.count_nonzero(mycc.mo_occ[1] > 0)
    nvira = nmoa - nocca
    nvirb = nmob - noccb
    with_frozen = not (mycc.frozen is None or mycc.frozen is 0)
    moidx = mycc.get_frozen_mask()
    OA_a, VA_a, OF_a, VF_a = ccsd_grad._index_frozen_active(moidx[0], mycc.mo_occ[0])
    OA_b, VA_b, OF_b, VF_b = ccsd_grad._index_frozen_active(moidx[1], mycc.mo_occ[1])

    log.debug('symmetrized rdm2 and MO->AO transformation')
# Roughly, dm2*2 is computed in _rdm2_mo2ao
    mo_active = (mo_a[:,numpy.hstack((OA_a,VA_a))],
                 mo_b[:,numpy.hstack((OA_b,VA_b))])
    _rdm2_mo2ao(mycc, d2, mo_active, fdm2)  # transform the active orbitals
    time1 = log.timer_debug1('MO->AO transformation', *time1)
    hf_dm1a, hf_dm1b = mycc._scf.make_rdm1(mycc.mo_coeff, mycc.mo_occ)
    hf_dm1 = hf_dm1a + hf_dm1b

    if atmlst is None:
        atmlst = range(mol.natm)
    offsetdic = mol.offset_nr_by_atom()
    diagidx = numpy.arange(nao)
    diagidx = diagidx*(diagidx+1)//2 + diagidx
    de = numpy.zeros((len(atmlst),3))
    Imata = numpy.zeros((nao,nao))
    Imatb = numpy.zeros((nao,nao))
    vhf1 = fdm2.create_dataset('vhf1', (len(atmlst),2,3,nao,nao), 'f8')

# 2e AO integrals dot 2pdm
    max_memory = max(0, mycc.max_memory - lib.current_memory()[0])
    blksize = max(1, int(max_memory*.9e6/8/(nao**3*2.5)))

    for k, ia in enumerate(atmlst):
        shl0, shl1, p0, p1 = offsetdic[ia]
        ip1 = p0
        vhf = numpy.zeros((2,3,nao,nao))
        for b0, b1, nf in ccsd_grad._shell_prange(mol, shl0, shl1, blksize):
            ip0, ip1 = ip1, ip1 + nf
            dm2bufa = ccsd_grad._load_block_tril(fdm2['dm2aa+ab'], ip0, ip1, nao)
            dm2bufb = ccsd_grad._load_block_tril(fdm2['dm2bb+ab'], ip0, ip1, nao)
            dm2bufa[:,:,diagidx] *= .5
            dm2bufb[:,:,diagidx] *= .5
            shls_slice = (b0,b1,0,mol.nbas,0,mol.nbas,0,mol.nbas)
            eri0 = mol.intor('int2e', aosym='s2kl', shls_slice=shls_slice)
            Imata += lib.einsum('ipx,iqx->pq', eri0.reshape(nf,nao,-1), dm2bufa)
            Imatb += lib.einsum('ipx,iqx->pq', eri0.reshape(nf,nao,-1), dm2bufb)
            eri0 = None

            eri1 = mol.intor('int2e_ip1', comp=3, aosym='s2kl',
                             shls_slice=shls_slice).reshape(3,nf,nao,-1)
            de[k] -= numpy.einsum('xijk,ijk->x', eri1, dm2bufa) * 2
            de[k] -= numpy.einsum('xijk,ijk->x', eri1, dm2bufb) * 2
            dm2bufa = dm2bufb = None
# HF part
            for i in range(3):
                eri1tmp = lib.unpack_tril(eri1[i].reshape(nf*nao,-1))
                eri1tmp = eri1tmp.reshape(nf,nao,nao,nao)
                vhf[:,i] += numpy.einsum('ijkl,ij->kl', eri1tmp, hf_dm1[ip0:ip1])
                vhf[0,i] -= numpy.einsum('ijkl,il->kj', eri1tmp, hf_dm1a[ip0:ip1])
                vhf[1,i] -= numpy.einsum('ijkl,il->kj', eri1tmp, hf_dm1b[ip0:ip1])
                vhf[:,i,ip0:ip1] += numpy.einsum('ijkl,kl->ij', eri1tmp, hf_dm1)
                vhf[0,i,ip0:ip1] -= numpy.einsum('ijkl,jk->il', eri1tmp, hf_dm1a)
                vhf[1,i,ip0:ip1] -= numpy.einsum('ijkl,jk->il', eri1tmp, hf_dm1b)
            eri1 = eri1tmp = None
        vhf1[k] = vhf
        log.debug('2e-part grad of atom %d %s = %s', ia, mol.atom_symbol(ia), de[k])
        time1 = log.timer_debug1('2e-part grad of atom %d'%ia, *time1)

    s0 = mycc._scf.get_ovlp()
    Imata = reduce(numpy.dot, (mo_a.T, Imata, s0, mo_a)) * -1
    Imatb = reduce(numpy.dot, (mo_b.T, Imatb, s0, mo_b)) * -1

    dm1a = numpy.zeros((nmoa,nmoa))
    dm1b = numpy.zeros((nmob,nmob))
    doo, dOO = d1[0]
    dov, dOV = d1[1]
    dvo, dVO = d1[2]
    dvv, dVV = d1[3]
    if with_frozen:
        dco = Imata[OF_a[:,None],OA_a] / (mo_ea[OF_a,None] - mo_ea[OA_a])
        dfv = Imata[VF_a[:,None],VA_a] / (mo_ea[VF_a,None] - mo_ea[VA_a])
        dm1a[OA_a[:,None],OA_a] = (doo + doo.T) * .5
        dm1a[OF_a[:,None],OA_a] = dco
        dm1a[OA_a[:,None],OF_a] = dco.T
        dm1a[VA_a[:,None],VA_a] = (dvv + dvv.T) * .5
        dm1a[VF_a[:,None],VA_a] = dfv
        dm1a[VA_a[:,None],VF_a] = dfv.T
        dco = Imatb[OF_b[:,None],OA_b] / (mo_eb[OF_b,None] - mo_eb[OA_b])
        dfv = Imatb[VF_b[:,None],VA_b] / (mo_eb[VF_b,None] - mo_eb[VA_b])
        dm1b[OA_b[:,None],OA_b] = (dOO + dOO.T) * .5
        dm1b[OF_b[:,None],OA_b] = dco
        dm1b[OA_b[:,None],OF_b] = dco.T
        dm1b[VA_b[:,None],VA_b] = (dVV + dVV.T) * .5
        dm1b[VF_b[:,None],VA_b] = dfv
        dm1b[VA_b[:,None],VF_b] = dfv.T
    else:
        dm1a[:nocca,:nocca] = (doo + doo.T) * .5
        dm1a[nocca:,nocca:] = (dvv + dvv.T) * .5
        dm1b[:noccb,:noccb] = (dOO + dOO.T) * .5
        dm1b[noccb:,noccb:] = (dVV + dVV.T) * .5

    dm1 = (reduce(numpy.dot, (mo_a, dm1a, mo_a.T)),
           reduce(numpy.dot, (mo_b, dm1b, mo_b.T)))
    vhf = mycc._scf.get_veff(mycc.mol, dm1)
    Xvo = reduce(numpy.dot, (mo_a[:,nocca:].T, vhf[0], mo_a[:,:nocca]))
    XVO = reduce(numpy.dot, (mo_b[:,noccb:].T, vhf[1], mo_b[:,:noccb]))
    Xvo+= Imata[:nocca,nocca:].T - Imata[nocca:,:nocca]
    XVO+= Imatb[:noccb,noccb:].T - Imatb[noccb:,:noccb]

    dm1_resp = _response_dm1(mycc, (Xvo,XVO), eris)
    dm1a += dm1_resp[0]
    dm1b += dm1_resp[1]
    time1 = log.timer_debug1('response_rdm1 intermediates', *time1)

    Imata[nocca:,:nocca] = Imata[:nocca,nocca:].T
    Imatb[noccb:,:noccb] = Imatb[:noccb,noccb:].T
    im1 = reduce(numpy.dot, (mo_a, Imata, mo_a.T))
    im1+= reduce(numpy.dot, (mo_b, Imatb, mo_b.T))
    time1 = log.timer_debug1('response_rdm1', *time1)

    log.debug('h1 and JK1')
    hcore_deriv = mf_grad.hcore_generator(mol)
    s1 = mf_grad.get_ovlp(mol)
    zeta = (mo_ea[:,None] + mo_ea) * .5
    zeta[nocca:,:nocca] = mo_ea[:nocca]
    zeta[:nocca,nocca:] = mo_ea[:nocca].reshape(-1,1)
    zeta_a = reduce(numpy.dot, (mo_a, zeta*dm1a, mo_a.T))
    zeta = (mo_eb[:,None] + mo_eb) * .5
    zeta[noccb:,:noccb] = mo_eb[:noccb]
    zeta[:noccb,noccb:] = mo_eb[:noccb].reshape(-1,1)
    zeta_b = reduce(numpy.dot, (mo_b, zeta*dm1b, mo_b.T))

    dm1 = (reduce(numpy.dot, (mo_a, dm1a, mo_a.T)),
           reduce(numpy.dot, (mo_b, dm1b, mo_b.T)))
    vhf_s1occ = mycc._scf.get_veff(mol, (dm1[0]+dm1[0].T, dm1[1]+dm1[1].T))
    p1a = numpy.dot(mo_a[:,:nocca], mo_a[:,:nocca].T)
    p1b = numpy.dot(mo_b[:,:noccb], mo_b[:,:noccb].T)
    vhf_s1occ = (reduce(numpy.dot, (p1a, vhf_s1occ[0], p1a)) +
                 reduce(numpy.dot, (p1b, vhf_s1occ[1], p1b))) * .5
    time1 = log.timer_debug1('h1 and JK1', *time1)

    # Hartree-Fock part contribution
    dm1pa = hf_dm1a + dm1[0]*2
    dm1pb = hf_dm1b + dm1[1]*2
    dm1 = dm1[0] + dm1[1] + hf_dm1
    zeta_a += rhf_grad.make_rdm1e(mo_ea, mo_a, mycc.mo_occ[0])
    zeta_b += rhf_grad.make_rdm1e(mo_eb, mo_b, mycc.mo_occ[1])
    zeta = zeta_a + zeta_b

    for k, ia in enumerate(atmlst):
        shl0, shl1, p0, p1 = offsetdic[ia]
# s[1] dot I, note matrix im1 is not hermitian
        de[k] += numpy.einsum('xij,ij->x', s1[:,p0:p1], im1[p0:p1])
        de[k] += numpy.einsum('xji,ij->x', s1[:,p0:p1], im1[:,p0:p1])
# h[1] \dot DM, contribute to f1
        h1ao = hcore_deriv(ia)
        de[k] += numpy.einsum('xij,ji->x', h1ao, dm1)
# -s[1]*e \dot DM,  contribute to f1
        de[k] -= numpy.einsum('xij,ij->x', s1[:,p0:p1], zeta[p0:p1]  )
        de[k] -= numpy.einsum('xji,ij->x', s1[:,p0:p1], zeta[:,p0:p1])
# -vhf[s_ij[1]],  contribute to f1, *2 for s1+s1.T
        de[k] -= numpy.einsum('xij,ij->x', s1[:,p0:p1], vhf_s1occ[p0:p1]) * 2
        de[k] -= numpy.einsum('xij,ij->x', vhf1[k,0], dm1pa)
        de[k] -= numpy.einsum('xij,ij->x', vhf1[k,1], dm1pb)

    de += mf_grad.grad_nuc(mol)
    log.timer('%s gradients' % mycc.__class__.__name__, *time0)
    return de