def _guess_shell_ranges(mol, buflen, aosym):
    '''Partition the shells of ``mol`` into ranges bounded by ``buflen``.

    The per-shell cumulative sizes handed to ``balance_partition`` are
    ``ao_loc * (ao_loc + 1) // 2`` when ``aosym`` contains ``'s2'`` and
    ``ao_loc * nao`` otherwise, where ``ao_loc = mol.ao_loc_nr()`` and
    ``nao = ao_loc[-1]``.

    Returns:
        Whatever ``balance_partition`` returns for those cumulative sizes.
    '''
    from pyscf.ao2mo.outcore import balance_partition
    shell_offsets = mol.ao_loc_nr()
    if 's2' not in aosym:
        # Full square: every AO offset is paired with all nao functions.
        cumulative = shell_offsets * shell_offsets[-1]
    else:
        # Packed lower triangle.
        cumulative = shell_offsets * (shell_offsets + 1) // 2
    return balance_partition(cumulative, buflen)
def _guess_shell_ranges(mol, buflen, aosym):
    '''Split the shells of ``mol`` into batches no larger than ``buflen``.

    ``balance_partition`` receives a cumulative-size array derived from
    ``mol.ao_loc_nr()``: triangular counts (``loc*(loc+1)//2``) for any
    ``aosym`` containing ``'s2'``, rectangular counts (``loc*nao``) otherwise.
    '''
    from pyscf.ao2mo.outcore import balance_partition
    loc = mol.ao_loc_nr()
    triangular = 's2' in aosym
    sizes = loc * (loc + 1) // 2 if triangular else loc * loc[-1]
    return balance_partition(sizes, buflen)
def get_int3c_mo(mol, auxmol, mo_coeff, compact=getattr(__config__, 'df_df_DF_ao2mo_compact', True), max_memory=None):
    ''' Transform three-center integrals to the MO basis: (P|uv) c_ui c_vj -> (P|ij)

    Args:
        mol: gto.Mole
        auxmol: gto.Mole, contains auxbasis
        mo_coeff: ndarray, list, or tuple containing MO coefficients
            A single 2D ndarray is used for both AO dimensions; otherwise
            mo_coeff[0] and mo_coeff[1] are used for the first and second
            AO dimension respectively.

    Kwargs:
        compact: bool
            Forwarded to _conc_mos, which decides whether the two MO
            dimensions are stored packed or as a full rectangle.
        max_memory: int
            Maximum memory consumption in MB; defaults to mol.max_memory

    Returns:
        int3c: ndarray of shape (naux, nmo0, nmo1) when the symmetry label
            returned by _conc_mos contains 's1', else (naux, nmo_pair)
    '''
    nao = mol.nao
    naux = auxmol.nao
    nbas = mol.nbas
    nauxbas = auxmol.nbas  # read as in the original; not used further below
    n_ao_pair = nao * (nao + 1) // 2
    if max_memory is None:
        max_memory = mol.max_memory

    # Resolve the bra/ket orbital sets
    shared_set = isinstance(mo_coeff, np.ndarray) and mo_coeff.ndim == 2
    if shared_set:
        mo_bra, mo_ket = mo_coeff, mo_coeff
    else:
        mo_bra, mo_ket = mo_coeff[0], mo_coeff[1]
    n_bra = mo_bra.shape[-1]
    n_ket = mo_ket.shape[-1]
    mosym, nmo_pair, mo_conc, mo_slice = _conc_mos(mo_bra, mo_ket, compact=compact)

    # Block-wise AO -> MO transformation over auxiliary shells
    eval_block = _int3c_wrapper(mol, auxmol, 'int3c2e', 's2ij')
    result = np.zeros((naux, nmo_pair), dtype=mo_bra.dtype)
    max_memory -= lib.current_memory()[0]
    words_per_aux = n_ao_pair * 2
    blksize = int(min(max(max_memory * 1e6 / 8 / words_per_aux, 20), 240))
    aux_loc = auxmol.ao_loc
    for shl0, shl1, _ in balance_partition(aux_loc, blksize):
        p0, p1 = aux_loc[shl0], aux_loc[shl1]
        # The integral block comes back as (uv|P); the transpose puts the
        # auxiliary index first, as nr_e2 expects, without copying.
        ao_block = eval_block((0, nbas, 0, nbas, shl0, shl1)).T
        result[p0:p1] = _ao2mo.nr_e2(ao_block, mo_conc, mo_slice, aosym='s2', mosym=mosym, out=result[p0:p1])
        ao_block = None

    # Unpacked symmetry gets an explicit (nmo0, nmo1) trailing shape
    return result.reshape(naux, n_bra, n_ket) if 's1' in mosym else result
def get_jk(mf_grad, mol=None, dm=None, hermi=0, with_j=True, with_k=True):
    '''Density-fitted derivative Coulomb (vj) and exchange (vk) matrices.

    Args:
        mf_grad: gradients object. ``mf_grad.base`` supplies ``with_df``,
            ``make_rdm1()``, ``mo_coeff`` and ``mo_occ``; ``mf_grad.max_memory``
            and ``mf_grad.auxbasis_response`` are read as well.

    Kwargs:
        mol: molecule; defaults to ``mf_grad.mol``.
        dm: currently ignored -- the density matrix is always taken from
            ``mf_grad.base.make_rdm1()`` (see the TODO in the body).
        hermi, with_j: accepted but not read in this function.
        with_k: when false only the Coulomb part is evaluated and the second
            return value is ``None``.

    Returns:
        (vj, vk), each of shape ``dm.shape[:-2] + (3,) + dm.shape[-2:]`` and
        negated relative to the accumulated integrals. When
        ``mf_grad.auxbasis_response`` is true each array is tagged with an
        ``aux`` attribute holding one length-3 vector per atom of the
        auxiliary molecule.
    '''
    if mol is None: mol = mf_grad.mol
    #if dm is None: dm = mf_grad.base.make_rdm1()
    #TODO: dm has to be the SCF density matrix in this version. dm should be
    # extended to any 1-particle density matrix
    dm = mf_grad.base.make_rdm1()

    with_df = mf_grad.base.with_df
    auxmol = with_df.auxmol
    if auxmol is None:
        auxmol = df.addons.make_auxmol(with_df.mol, with_df.auxbasis)
    pmol = mol + auxmol  # NOTE(review): not referenced again in this function
    ao_loc = mol.ao_loc
    nbas = mol.nbas
    nauxbas = auxmol.nbas

    get_int3c_s1 = _int3c_wrapper(mol, auxmol, 'int3c2e', 's1')
    get_int3c_s2 = _int3c_wrapper(mol, auxmol, 'int3c2e', 's2ij')
    get_int3c_ip1 = _int3c_wrapper(mol, auxmol, 'int3c2e_ip1', 's1')
    get_int3c_ip2 = _int3c_wrapper(mol, auxmol, 'int3c2e_ip2', 's2ij')

    nao = mol.nao
    naux = auxmol.nao
    dms = numpy.asarray(dm)
    out_shape = dms.shape[:-2] + (3, ) + dms.shape[-2:]
    dms = dms.reshape(-1, nao, nao)
    nset = dms.shape[0]

    auxslices = auxmol.aoslice_by_atom()
    aux_loc = auxmol.ao_loc
    max_memory = mf_grad.max_memory - lib.current_memory()[0]
    blksize = int(min(max(max_memory * .5e6 / 8 / (nao**2 * 3), 20), naux, 240))
    ao_ranges = balance_partition(aux_loc, blksize)

    if not with_k:
        # Pack dm + dm^T into lower-triangular form, halving the diagonal so
        # that a contraction with s2-packed integrals counts it once.
        idx = numpy.arange(nao)
        dm_tril = dms + dms.transpose(0, 2, 1)
        dm_tril[:, idx, idx] *= .5
        dm_tril = lib.pack_tril(dm_tril)

        # (i,j|P)
        rhoj = numpy.empty((nset, naux))
        for shl0, shl1, nL in ao_ranges:
            int3c = get_int3c_s2((0, nbas, 0, nbas, shl0, shl1))  # (i,j|P)
            p0, p1 = aux_loc[shl0], aux_loc[shl1]
            rhoj[:, p0:p1] = numpy.einsum('wp,nw->np', int3c, dm_tril)
            int3c = None

        # (P|Q)
        int2c = auxmol.intor('int2c2e', aosym='s1')
        # assume_a='pos' replaces the sym_pos=True keyword, which newer SciPy
        # releases no longer accept.
        rhoj = scipy.linalg.solve(int2c, rhoj.T, assume_a='pos').T
        int2c = None

        # (d/dX i,j|P)
        vj = numpy.zeros((nset, 3, nao, nao))
        for shl0, shl1, nL in ao_ranges:
            int3c = get_int3c_ip1((0, nbas, 0, nbas, shl0, shl1))  # (i,j|P)
            p0, p1 = aux_loc[shl0], aux_loc[shl1]
            vj += numpy.einsum('xijp,np->nxij', int3c, rhoj[:, p0:p1])
            int3c = None

        if mf_grad.auxbasis_response:
            # (i,j|d/dX P)
            vjaux = numpy.empty((3, naux))
            for shl0, shl1, nL in ao_ranges:
                int3c = get_int3c_ip2((0, nbas, 0, nbas, shl0, shl1))  # (i,j|P)
                p0, p1 = aux_loc[shl0], aux_loc[shl1]
                vjaux[:, p0:p1] = numpy.einsum('xwp,mw,np->xp', int3c, dm_tril, rhoj[:, p0:p1])
                int3c = None

            # (d/dX P|Q)
            int2c_e1 = auxmol.intor('int2c2e_ip1', aosym='s1')
            vjaux -= numpy.einsum('xpq,mp,nq->xp', int2c_e1, rhoj, rhoj)

            # Sum the auxiliary-function contributions belonging to each atom
            vjaux = [-vjaux[:, p0:p1].sum(axis=1) for p0, p1 in auxslices[:, 2:]]
            vj = lib.tag_array(-vj.reshape(out_shape), aux=numpy.array(vjaux))
        else:
            vj = -vj.reshape(out_shape)
        return vj, None

    mo_coeff = mf_grad.base.mo_coeff
    mo_occ = mf_grad.base.mo_occ
    nmo = mo_occ.shape[-1]
    if isinstance(mf_grad.base, scf.rohf.ROHF):
        # Split the ROHF occupations into two 0/1 occupation vectors
        mo_coeff = numpy.vstack((mo_coeff, mo_coeff))
        mo_occa = numpy.array(mo_occ > 0, dtype=numpy.double)
        mo_occb = numpy.array(mo_occ == 2, dtype=numpy.double)
        assert (mo_occa.sum() + mo_occb.sum() == mo_occ.sum())
        mo_occ = numpy.vstack((mo_occa, mo_occb))

    mo_coeff = numpy.asarray(mo_coeff).reshape(-1, nao, nmo)
    mo_occ = numpy.asarray(mo_occ).reshape(-1, nmo)
    rhoj = numpy.zeros((nset, naux))
    f_rhok = lib.H5TmpFile()
    # Occupied orbitals of each set, scaled by sqrt(occupation)
    orbo = []
    for i in range(nset):
        c = numpy.einsum('pi,i->pi', mo_coeff[i][:, mo_occ[i] > 0],
                         numpy.sqrt(mo_occ[i][mo_occ[i] > 0]))
        orbo.append(c)

    # (P|Q)
    int2c = scipy.linalg.cho_factor(auxmol.intor('int2c2e', aosym='s1'))

    max_memory = mf_grad.max_memory - lib.current_memory()[0]
    blksize = max_memory * .5e6 / 8 / (naux * nao)
    mol_ao_ranges = balance_partition(ao_loc, blksize)
    nsteps = len(mol_ao_ranges)
    for istep, (shl0, shl1, nd) in enumerate(mol_ao_ranges):
        int3c = get_int3c_s1((0, nbas, shl0, shl1, 0, nauxbas))
        p0, p1 = ao_loc[shl0], ao_loc[shl1]
        rhoj += numpy.einsum('nlk,klp->np', dms[:, p0:p1], int3c)
        for i in range(nset):
            v = lib.einsum('ko,klp->plo', orbo[i], int3c)
            v = scipy.linalg.cho_solve(int2c, v.reshape(naux, -1))
            # Cached per (set, AO block) in the temporary HDF5 file
            f_rhok['%s/%s' % (i, istep)] = v.reshape(naux, p1 - p0, -1)
        int3c = v = None

    rhoj = scipy.linalg.cho_solve(int2c, rhoj.T).T
    int2c = None

    def load(set_id, p0, p1):
        # Reassemble rows p0:p1 of the cached intermediate for one set from
        # its AO blocks, returned as (p1-p0, nocc, nao).
        nocc = orbo[set_id].shape[1]
        buf = numpy.empty((p1 - p0, nocc, nao))
        col1 = 0
        for istep in range(nsteps):
            dat = f_rhok['%s/%s' % (set_id, istep)][p0:p1]
            col0, col1 = col1, col1 + dat.shape[1]
            buf[:p1 - p0, :, col0:col1] = dat.transpose(0, 2, 1)
        return buf

    vj = numpy.zeros((nset, 3, nao, nao))
    vk = numpy.zeros((nset, 3, nao, nao))
    # (d/dX i,j|P)
    for shl0, shl1, nL in ao_ranges:
        int3c = get_int3c_ip1((0, nbas, 0, nbas, shl0, shl1))  # (i,j|P)
        p0, p1 = aux_loc[shl0], aux_loc[shl1]
        vj += numpy.einsum('xijp,np->nxij', int3c, rhoj[:, p0:p1])
        for i in range(nset):
            tmp = lib.einsum('xijp,jo->xipo', int3c, orbo[i])
            rhok = load(i, p0, p1)
            vk[i] += lib.einsum('xipo,pok->xik', tmp, rhok)
            tmp = rhok = None
        int3c = None

    if mf_grad.auxbasis_response:
        # rhok_oo is only consumed in this branch, so it is built here.
        # Size the blocks from the largest occupied count (floored at 1) so a
        # set with no occupied orbitals cannot cause a division by zero.
        nocc_max = max(1, max(c.shape[1] for c in orbo))
        max_memory = mf_grad.max_memory - lib.current_memory()[0]
        blksize = int(min(max(max_memory * .5e6 / 8 / (nao * nocc_max), 20), naux))
        rhok_oo = []
        for i in range(nset):
            nocc = orbo[i].shape[1]
            tmp = numpy.empty((naux, nocc, nocc))
            for p0, p1 in lib.prange(0, naux, blksize):
                rhok = load(i, p0, p1)
                tmp[p0:p1] = lib.einsum('pok,kr->por', rhok, orbo[i])
            rhok_oo.append(tmp)
            rhok = tmp = None

        vjaux = numpy.zeros((3, naux))
        vkaux = numpy.zeros((3, naux))
        # (i,j|d/dX P)
        for shl0, shl1, nL in ao_ranges:
            int3c = get_int3c_ip2((0, nbas, 0, nbas, shl0, shl1))  # (i,j|P)
            p0, p1 = aux_loc[shl0], aux_loc[shl1]
            # Unpack the s2-packed AO pair index into a full (nao, nao) block
            int3c = int3c.transpose(0, 2, 1).reshape(3 * (p1 - p0), -1)
            int3c = lib.unpack_tril(int3c)
            int3c = int3c.reshape(3, p1 - p0, nao, nao)
            vjaux[:, p0:p1] = numpy.einsum('xpij,mji,np->xp', int3c, dms, rhoj[:, p0:p1])
            for i in range(nset):
                tmp = rhok_oo[i][p0:p1]
                tmp = lib.einsum('por,ir->pio', tmp, orbo[i])
                tmp = lib.einsum('pio,jo->pij', tmp, orbo[i])
                vkaux[:, p0:p1] += lib.einsum('xpij,pij->xp', int3c, tmp)
        int3c = tmp = None

        # (d/dX P|Q)
        int2c_e1 = auxmol.intor('int2c2e_ip1')
        vjaux -= numpy.einsum('xpq,mp,nq->xp', int2c_e1, rhoj, rhoj)
        for i in range(nset):
            tmp = lib.einsum('pij,qij->pq', rhok_oo[i], rhok_oo[i])
            vkaux -= numpy.einsum('xpq,pq->xp', int2c_e1, tmp)

        # Sum the auxiliary-function contributions belonging to each atom
        vjaux = [-vjaux[:, p0:p1].sum(axis=1) for p0, p1 in auxslices[:, 2:]]
        vkaux = [-vkaux[:, p0:p1].sum(axis=1) for p0, p1 in auxslices[:, 2:]]
        vj = lib.tag_array(-vj.reshape(out_shape), aux=numpy.array(vjaux))
        vk = lib.tag_array(-vk.reshape(out_shape), aux=numpy.array(vkaux))
    else:
        vj = -vj.reshape(out_shape)
        vk = -vk.reshape(out_shape)
    return vj, vk
def tril_prange(start, stop, step):
    '''Partition the index range with a quadratic cumulative cost.

    Builds the cumulative cost array ``i**2`` for ``i`` in ``0..stop`` and
    returns the result of ``balance_partition(costs, step, start, stop)``.
    '''
    return balance_partition(numpy.arange(stop + 1)**2, step, start, stop)
def get_jk(mf_grad, mol=None, dm=None, hermi=0, with_j=True, with_k=True, ishf=True):
    '''Density-fitted derivative Coulomb (vj) and exchange (vk) matrices.

    Args:
        mf_grad: gradients object. ``mf_grad.base`` supplies ``with_df`` and,
            depending on the branch taken, ``make_rdm1()``, ``mo_coeff`` and
            ``mo_occ``; ``mf_grad.max_memory`` and
            ``mf_grad.auxbasis_response`` are read as well.

    Kwargs:
        mol: molecule; defaults to ``mf_grad.mol``.
        dm: density matrix or stack of density matrices; defaults to
            ``mf_grad.base.make_rdm1()``. If it carries ``mo_coeff`` and
            ``mo_occ`` attributes those are used for the exchange part.
        hermi, with_j: accepted but not read in this function.
        with_k: when false only the Coulomb part is evaluated and the second
            return value is ``None``.
        ishf: when true (and ``dm`` carries no orbitals) orbitals and
            occupations come from ``mf_grad.base`` and the auxiliary response
            is reduced to one (natm, 3) array. When false, orbitals are
            obtained by diagonalizing each density matrix in the overlap
            metric and the auxiliary response keeps both set indices, with
            shape (nset, nset, natm, 3).

    Returns:
        (vj, vk), each of shape ``dm.shape[:-2] + (3,) + dm.shape[-2:]`` and
        negated relative to the accumulated integrals. When
        ``mf_grad.auxbasis_response`` is true each array is tagged with an
        ``aux`` attribute as described under ``ishf``.
    '''
    # time.clock() was removed in Python 3.8; process_time() is the CPU-time
    # counter that replaces it.
    t0 = (time.process_time(), time.time())
    if mol is None: mol = mf_grad.mol
    if dm is None: dm = mf_grad.base.make_rdm1()

    with_df = mf_grad.base.with_df
    auxmol = with_df.auxmol
    if auxmol is None:
        auxmol = df.addons.make_auxmol(with_df.mol, with_df.auxbasis)
    pmol = mol + auxmol  # NOTE(review): not referenced again in this function
    ao_loc = mol.ao_loc
    nbas = mol.nbas
    nauxbas = auxmol.nbas

    get_int3c_s1 = _int3c_wrapper(mol, auxmol, 'int3c2e', 's1')
    get_int3c_s2 = _int3c_wrapper(mol, auxmol, 'int3c2e', 's2ij')
    get_int3c_ip1 = _int3c_wrapper(mol, auxmol, 'int3c2e_ip1', 's1')
    get_int3c_ip2 = _int3c_wrapper(mol, auxmol, 'int3c2e_ip2', 's2ij')

    nao = mol.nao
    naux = auxmol.nao
    dms = numpy.asarray(dm)
    out_shape = dms.shape[:-2] + (3, ) + dms.shape[-2:]
    dms = dms.reshape(-1, nao, nao)
    nset = dms.shape[0]

    # Pack dm + dm^T into lower-triangular form and halve the packed diagonal
    # entries (positions i*(i+1)//2 + i) so the diagonal is counted once.
    idx = numpy.arange(nao)
    idx = idx * (idx + 1) // 2 + idx
    dm_tril = dms + dms.transpose(0, 2, 1)
    dm_tril = lib.pack_tril(dm_tril)
    dm_tril[:, idx] *= .5

    auxslices = auxmol.aoslice_by_atom()
    aux_loc = auxmol.ao_loc
    max_memory = mf_grad.max_memory - lib.current_memory()[0]
    blksize = int(min(max(max_memory * .5e6 / 8 / (nao**2 * 3), 20), naux, 240))
    ao_ranges = balance_partition(aux_loc, blksize)

    if not with_k:
        # (i,j|P)
        rhoj = numpy.empty((nset, naux))
        for shl0, shl1, nL in ao_ranges:
            int3c = get_int3c_s2((0, nbas, 0, nbas, shl0, shl1))  # (i,j|P)
            p0, p1 = aux_loc[shl0], aux_loc[shl1]
            rhoj[:, p0:p1] = lib.einsum('wp,nw->np', int3c, dm_tril)
            int3c = None

        # (P|Q)
        int2c = auxmol.intor('int2c2e', aosym='s1')
        # assume_a='pos' replaces the sym_pos=True keyword, which newer SciPy
        # releases no longer accept.
        rhoj = scipy.linalg.solve(int2c, rhoj.T, assume_a='pos').T
        int2c = None

        # (d/dX i,j|P)
        vj = numpy.zeros((nset, 3, nao, nao))
        for shl0, shl1, nL in ao_ranges:
            int3c = get_int3c_ip1((0, nbas, 0, nbas, shl0, shl1))  # (i,j|P)
            p0, p1 = aux_loc[shl0], aux_loc[shl1]
            vj += lib.einsum('xijp,np->nxij', int3c, rhoj[:, p0:p1])
            int3c = None

        if mf_grad.auxbasis_response:
            # (i,j|d/dX P)
            vjaux = numpy.empty((nset, nset, 3, naux))
            for shl0, shl1, nL in ao_ranges:
                int3c = get_int3c_ip2((0, nbas, 0, nbas, shl0, shl1))  # (i,j|P)
                p0, p1 = aux_loc[shl0], aux_loc[shl1]
                vjaux[:, :, :, p0:p1] = lib.einsum('xwp,mw,np->mnxp', int3c, dm_tril, rhoj[:, p0:p1])
                int3c = None

            # (d/dX P|Q)
            int2c_e1 = auxmol.intor('int2c2e_ip1', aosym='s1')
            vjaux -= lib.einsum('xpq,mp,nq->mnxp', int2c_e1, rhoj, rhoj)

            # Sum per atom; leading axis becomes the atom index
            vjaux = numpy.array([-vjaux[:, :, :, p0:p1].sum(axis=3) for p0, p1 in auxslices[:, 2:]])
            if ishf:
                vjaux = vjaux.sum((1, 2))
            else:
                vjaux = numpy.ascontiguousarray(vjaux.transpose(1, 2, 0, 3))
            vj = lib.tag_array(-vj.reshape(out_shape), aux=numpy.array(vjaux))
        else:
            vj = -vj.reshape(out_shape)
        logger.timer(mf_grad, 'df vj', *t0)
        return vj, None

    if hasattr(dm, 'mo_coeff') and hasattr(dm, 'mo_occ'):
        mo_coeff = dm.mo_coeff
        mo_occ = dm.mo_occ
    elif ishf:
        mo_coeff = mf_grad.base.mo_coeff
        mo_occ = mf_grad.base.mo_occ
        if isinstance(mf_grad.base, scf.rohf.ROHF):
            # Split the ROHF occupations into two 0/1 occupation vectors
            mo_coeff = numpy.vstack((mo_coeff, mo_coeff))
            mo_occa = numpy.array(mo_occ > 0, dtype=numpy.double)
            mo_occb = numpy.array(mo_occ == 2, dtype=numpy.double)
            assert (mo_occa.sum() + mo_occb.sum() == mo_occ.sum())
            mo_occ = numpy.vstack((mo_occa, mo_occb))
    else:
        # Generalized eigenproblem (S D S) c = n S c for each density matrix
        s0 = mol.intor('int1e_ovlp')
        mo_occ = []
        mo_coeff = []
        for dm in dms:
            sdms = reduce(lib.dot, (s0, dm, s0))
            n, c = scipy.linalg.eigh(sdms, b=s0)
            mo_occ.append(n)
            mo_coeff.append(c)
        mo_occ = numpy.stack(mo_occ, axis=0)

    nmo = mo_occ.shape[-1]
    mo_coeff = numpy.asarray(mo_coeff).reshape(-1, nao, nmo)
    mo_occ = numpy.asarray(mo_occ).reshape(-1, nmo)
    rhoj = numpy.zeros((nset, naux))
    f_rhok = lib.H5TmpFile()
    # orbol: orbitals with |occupation| > 1e-8; orbor: the same orbitals
    # multiplied by their occupations. Both are column slices of stacks.
    orbor = []
    orbol = []
    nocc = []
    orbor_stack = numpy.zeros((nao, 0), dtype=mo_coeff.dtype, order='F')
    orbol_stack = numpy.zeros((nao, 0), dtype=mo_coeff.dtype, order='F')
    offs = 0
    for i in range(nset):
        idx = numpy.abs(mo_occ[i]) > 1e-8
        nocc.append(numpy.count_nonzero(idx))
        c = mo_coeff[i][:, idx]
        orbol_stack = numpy.append(orbol_stack, c, axis=1)
        orbol.append(orbol_stack[:, offs:offs + nocc[-1]])
        cn = lib.einsum('pi,i->pi', c, mo_occ[i][idx])
        orbor_stack = numpy.append(orbor_stack, cn, axis=1)
        orbor.append(orbor_stack[:, offs:offs + nocc[-1]])
        offs += nocc[-1]

    # (P|Q)
    int2c = scipy.linalg.cho_factor(auxmol.intor('int2c2e', aosym='s1'))

    t1 = (time.process_time(), time.time())
    max_memory = mf_grad.max_memory - lib.current_memory()[0]
    blksize = max_memory * .5e6 / 8 / (naux * nao)
    mol_ao_ranges = balance_partition(ao_loc, blksize)
    nsteps = len(mol_ao_ranges)
    t2 = t1
    for istep, (shl0, shl1, nd) in enumerate(mol_ao_ranges):
        int3c = get_int3c_s1((0, nbas, shl0, shl1, 0, nauxbas))
        t2 = logger.timer_debug1(mf_grad, 'df grad intor (P|mn)', *t2)
        p0, p1 = ao_loc[shl0], ao_loc[shl1]
        for i in range(nset):
            # MRH 05/21/2020: De-vectorize this because array contiguity -> parallel scaling
            v = lib.dot(int3c.reshape(nao, -1, order='F').T, orbor[i]).reshape(naux, (p1 - p0) * nocc[i])
            t2 = logger.timer_debug1(mf_grad, 'df grad einsum (P|mn) u_ni N_i = v_Pmi', *t2)
            rhoj[i] += numpy.dot(v, orbol[i][p0:p1].ravel())
            t2 = logger.timer_debug1(mf_grad, 'df grad einsum v_Pmi u_mi = rho_P', *t2)
            v = scipy.linalg.cho_solve(int2c, v)
            t2 = logger.timer_debug1(mf_grad, 'df grad cho_solve (P|Q) D_Qmi = v_Pmi', *t2)
            f_rhok['%s/%s' % (i, istep)] = v.reshape(naux, p1 - p0, -1)
            t2 = logger.timer_debug1(mf_grad, 'df grad cache D_Pmi (m <-> i transpose upon retrieval)', *t2)
        int3c = v = None

    rhoj = scipy.linalg.cho_solve(int2c, rhoj.T).T
    int2c = None
    t1 = logger.timer_debug1(mf_grad, 'df grad vj and vk AO (P|Q) D_Q = (P|mn) D_mn solve', *t1)

    def load(set_id, p0, p1):
        # Reassemble rows p0:p1 of the cached intermediate for one set from
        # its AO blocks, returned as (p1-p0, nocc, nao).
        buf = numpy.empty((p1 - p0, nocc[set_id], nao))
        col1 = 0
        for istep in range(nsteps):
            dat = f_rhok['%s/%s' % (set_id, istep)][p0:p1]
            col0, col1 = col1, col1 + dat.shape[1]
            buf[:p1 - p0, :, col0:col1] = dat.transpose(0, 2, 1)
        return buf

    vj = numpy.zeros((nset, 3, nao, nao))
    vk = numpy.zeros((nset, 3, nao, nao))
    # (d/dX i,j|P)
    fmmm = _ao2mo.libao2mo.AO2MOmmm_bra_nr_s1  # MO output index slower than AO output index; input AOs are asymmetric
    fdrv = _ao2mo.libao2mo.AO2MOnr_e2_drv  # comp and aux indices are slower
    ftrans = _ao2mo.libao2mo.AO2MOtranse2_nr_s1  # input is not tril_packed
    null = lib.c_null_ptr()
    t2 = t1
    for shl0, shl1, nL in ao_ranges:
        int3c = get_int3c_ip1((0, nbas, 0, nbas, shl0, shl1)).transpose(0, 3, 2, 1)  # (P|mn'), row-major order
        t2 = logger.timer_debug1(mf_grad, "df grad intor (P|mn')", *t2)
        p0, p1 = aux_loc[shl0], aux_loc[shl1]
        for i in range(nset):
            # MRH 05/21/2020: De-vectorize this because array contiguity -> parallel scaling
            vj[i, 0] += numpy.dot(rhoj[i, p0:p1], int3c[0].reshape(p1 - p0, -1)).reshape(nao, nao).T
            vj[i, 1] += numpy.dot(rhoj[i, p0:p1], int3c[1].reshape(p1 - p0, -1)).reshape(nao, nao).T
            vj[i, 2] += numpy.dot(rhoj[i, p0:p1], int3c[2].reshape(p1 - p0, -1)).reshape(nao, nao).T
            t2 = logger.timer_debug1(mf_grad, "df grad einsum rho_P (P|mn') rho_P", *t2)
            tmp = numpy.empty((3, p1 - p0, nocc[i], nao), dtype=orbol_stack.dtype)
            fdrv(ftrans, fmmm,  # xPmn u_mi -> xPin
                 tmp.ctypes.data_as(ctypes.c_void_p),
                 int3c.ctypes.data_as(ctypes.c_void_p),
                 orbol[i].ctypes.data_as(ctypes.c_void_p),
                 ctypes.c_int(3 * (p1 - p0)), ctypes.c_int(nao),
                 (ctypes.c_int * 4)(0, nocc[i], 0, nao),
                 null, ctypes.c_int(0))
            t2 = logger.timer_debug1(mf_grad, "df grad einsum (P|mn') u_mi = dg_Pin", *t2)
            rhok = load(i, p0, p1)
            vk[i] += lib.einsum('xpoi,pok->xik', tmp, rhok)
            t2 = logger.timer_debug1(mf_grad, "df grad einsum D_Pim dg_Pin = v_ij", *t2)
            rhok = tmp = None
        int3c = None
    t1 = logger.timer_debug1(mf_grad, 'df grad vj and vk AO (P|mn) D_P eval', *t1)

    if mf_grad.auxbasis_response:
        # Cache (P|uv) D_ui c_vj. Must include both upper and lower triangles
        # over nset.
        max_memory = mf_grad.max_memory - lib.current_memory()[0]
        blksize = int(min(max(max_memory * .5e6 / 8 / (nao * max(nocc)), 20), naux))
        rhok_oo = []
        for i, j in product(range(nset), repeat=2):
            tmp = numpy.empty((naux, nocc[i], nocc[j]))
            for p0, p1 in lib.prange(0, naux, blksize):
                rhok = load(i, p0, p1).reshape((p1 - p0) * nocc[i], nao)
                tmp[p0:p1] = lib.dot(rhok, orbol[j]).reshape(p1 - p0, nocc[i], nocc[j])
            rhok_oo.append(tmp)
            rhok = tmp = None
        t1 = logger.timer_debug1(mf_grad, 'df grad vj and vk aux d_Pim u_mj = d_Pij eval', *t1)

        vjaux = numpy.zeros((nset, nset, 3, naux))
        vkaux = numpy.zeros((nset, nset, 3, naux))
        # (i,j|d/dX P)
        t2 = t1
        fmmm = _ao2mo.libao2mo.AO2MOmmm_bra_nr_s2  # MO output index slower than AO output index; input AOs are symmetric
        fdrv = _ao2mo.libao2mo.AO2MOnr_e2_drv  # comp and aux indices are slower
        ftrans = _ao2mo.libao2mo.AO2MOtranse2_nr_s2  # input is tril_packed
        null = lib.c_null_ptr()
        for shl0, shl1, nL in ao_ranges:
            int3c = get_int3c_ip2((0, nbas, 0, nbas, shl0, shl1))  # (i,j|P)
            t2 = logger.timer_debug1(mf_grad, "df grad intor (P'|mn)", *t2)
            p0, p1 = aux_loc[shl0], aux_loc[shl1]
            drhoj = lib.dot(int3c.transpose(0, 2, 1).reshape(3 * (p1 - p0), -1),
                            dm_tril.T).reshape(3, p1 - p0, -1)  # xpij,mij->xpm
            vjaux[:, :, :, p0:p1] = lib.einsum('xpm,np->mnxp', drhoj, rhoj[:, p0:p1])
            t2 = logger.timer_debug1(mf_grad, "df grad einsum rho_P (P'|mn) D_mn = v_P", *t2)
            tmp = [numpy.empty((3, p1 - p0, nocc_i, nao), dtype=orbor_stack.dtype) for nocc_i in nocc]
            assert (orbor_stack.flags.f_contiguous), '{} {}'.format(orbor_stack.shape, orbor_stack.strides)
            for orb, buf, nocc_i in zip(orbol, tmp, nocc):
                fdrv(ftrans, fmmm,  # gPmn u_ni -> gPim
                     buf.ctypes.data_as(ctypes.c_void_p),
                     int3c.ctypes.data_as(ctypes.c_void_p),
                     orb.ctypes.data_as(ctypes.c_void_p),
                     ctypes.c_int(3 * (p1 - p0)), ctypes.c_int(nao),
                     (ctypes.c_int * 4)(0, nocc_i, 0, nao),
                     null, ctypes.c_int(0))
            int3c = [[lib.dot(buf.reshape(-1, nao), orb).reshape(3, p1 - p0, -1, norb)
                      for orb, norb in zip(orbor, nocc)] for buf in tmp]  # pim,mj,j -> pij
            t2 = logger.timer_debug1(mf_grad, "df grad einsum (P'|mn) u_mi u_nj N_j = v_Pmn", *t2)
            for i, j in product(range(nset), repeat=2):
                k = (i * nset) + j
                tmp = rhok_oo[k][p0:p1]
                vkaux[i, j, :, p0:p1] += lib.einsum('xpij,pij->xp', int3c[i][j], tmp)
                t2 = logger.timer_debug1(mf_grad, "df grad einsum d_Pij v_Pij = v_P", *t2)
        int3c = tmp = None
        t1 = logger.timer_debug1(mf_grad, "df grad vj and vk aux (P'|mn) eval", *t1)

        # (d/dX P|Q)
        int2c_e1 = auxmol.intor('int2c2e_ip1')
        vjaux -= lib.einsum('xpq,mp,nq->mnxp', int2c_e1, rhoj, rhoj)
        for i, j in product(range(nset), repeat=2):
            k = (i * nset) + j
            l = (j * nset) + i
            tmp = lib.einsum('pij,qji->pq', rhok_oo[k], rhok_oo[l])
            vkaux[i, j] -= lib.einsum('xpq,pq->xp', int2c_e1, tmp)
        t1 = logger.timer_debug1(mf_grad, "df grad vj and vk aux (P'|Q) eval", *t1)

        # Sum per atom; both arrays now have shape (natm, nset, nset, 3)
        vjaux = numpy.array([-vjaux[:, :, :, p0:p1].sum(axis=3) for p0, p1 in auxslices[:, 2:]])
        vkaux = numpy.array([-vkaux[:, :, :, p0:p1].sum(axis=3) for p0, p1 in auxslices[:, 2:]])
        if ishf:
            vjaux = vjaux.sum((1, 2))
            # Only the diagonal (i == j) set pairs contribute to exchange.
            # Sum them with the atom axis kept first so the result is
            # (natm, 3) like vjaux; a bare reshape to (nset**2, 3, natm)
            # would reinterpret the atom-major buffer and scramble entries.
            vkaux = numpy.einsum('aiix->ax', vkaux)
        else:
            vjaux = numpy.ascontiguousarray(vjaux.transpose(1, 2, 0, 3))
            vkaux = numpy.ascontiguousarray(vkaux.transpose(1, 2, 0, 3))
        vj = lib.tag_array(-vj.reshape(out_shape), aux=numpy.array(vjaux))
        vk = lib.tag_array(-vk.reshape(out_shape), aux=numpy.array(vkaux))
    else:
        vj = -vj.reshape(out_shape)
        vk = -vk.reshape(out_shape)
    logger.timer(mf_grad, 'df grad vj and vk', *t0)
    return vj, vk
def grad_elec_dferi (mc_grad, mo_cas=None, ci=None, dfcasdm2=None, casdm2=None, atmlst=None, max_memory=None):
    ''' Evaluate the (P|i'j) d_Pij contribution to the electronic gradient, where d_Pij is the
    DF-2RDM obtained by solve_df_rdm2. The caller must symmetrize (i.e., [(P|i'j) + (P|ij')] d_Pij / 2)
    if necessary.

    Args:
        mc_grad: MC-SCF gradients method object

    Kwargs:
        mo_cas: ndarray, list, or tuple containing active-space MO coefficients
            If a tuple of length 2, the same pair of MO sets are assumed to apply to
            the internally-contracted and externally-contracted indices of the DF-2rdm:
            (P|Q)d_Qij = (P|kl)d_ijkl -> (P|Q)d_Qij = (P|ij)d_ijij
            If a tuple of length 4, the 4 MO sets are applied to ijkl above in that order
            (first two external, last two internal).
        ci: ndarray, tuple, or list containing CI coefficients in mo_cas basis.
            Not used if dfcasdm2 is provided.
        dfcasdm2: ndarray, tuple, or list containing DF-2rdm in mo_cas basis.
            Computed by solve_df_rdm2 if omitted.
        casdm2: ndarray, tuple, or list containing rdm2 in mo_cas basis.
            Only forwarded to solve_df_rdm2 when dfcasdm2 is omitted.
        atmlst: list of integers
            List of nonfrozen atoms, as in grad_elec functions.
            Defaults to list (range (mol.natm)).
            NOTE(review): the value is not applied below; the result always
            covers every atom of mol.
        max_memory: int
            Maximum memory usage in MB

    Returns:
        dE: ndarray of shape (len (dfcasdm2), mol.natm, 3)

    Raises:
        RuntimeError: if mo_cas is neither a 2D ndarray nor a sequence of
            length 2 or 4.
    '''
    if isinstance (mc_grad, GradientsBasics):
        mc = mc_grad.base
    else:
        mc = mc_grad
    mol = mc_grad.mol
    auxmol = mc.with_df.auxmol
    ncore, ncas, nao, naux, nbas = mc.ncore, mc.ncas, mol.nao, auxmol.nao, mol.nbas
    nocc = ncore + ncas
    if mo_cas is None: mo_cas = mc.mo_coeff[:,ncore:nocc]
    if max_memory is None: max_memory = mc_grad.max_memory
    if isinstance (mo_cas, np.ndarray) and mo_cas.ndim == 2:
        mo_cas = (mo_cas,)*4
    elif len (mo_cas) == 2:
        mo_cas = (mo_cas[0], mo_cas[1], mo_cas[0], mo_cas[1])
    elif len (mo_cas) == 4:
        mo_cas = tuple (mo_cas)
    else:
        # mo_cas may be a plain list/tuple here, which has no ``.shape``;
        # measure it through numpy so the intended RuntimeError is raised
        # instead of an AttributeError.
        try:
            bad_shape = np.asarray (mo_cas).shape
        except ValueError: # ragged sequence of arrays
            bad_shape = (len (mo_cas),)
        raise RuntimeError ('Invalid shape of np.asarray (mo_cas): {}'.format (bad_shape))
    nmo = [mo.shape[1] for mo in mo_cas]
    if atmlst is None: atmlst = list (range (mol.natm))
    if ci is None: ci = mc.ci
    if dfcasdm2 is None: dfcasdm2 = solve_df_rdm2 (mc, mo_cas=mo_cas[2:], ci=ci, casdm2=casdm2) # d_Pij
    nset = len (dfcasdm2)
    dE = np.zeros ((nset, nao, 3))
    dfcasdm2 = np.array (dfcasdm2)

    # Set up (P|u'v) calculation
    get_int3c = _int3c_wrapper(mol, auxmol, 'int3c2e_ip1', 's1')
    max_memory -= lib.current_memory()[0]
    # Per-auxiliary-function word count of the integral block and the two
    # half-transformed buffers held at once inside the loop
    blklen = nao*((3*nao) + (3*nmo[1]) + (nset*nmo[1]))
    blksize = int (min (max (max_memory * 1e6 / 8 / blklen, 20), 240))
    aux_loc = auxmol.ao_loc
    aux_ranges = balance_partition(aux_loc, blksize)

    # Iterate over auxbasis range
    for shl0, shl1, nL in aux_ranges:
        p0, p1 = aux_loc[shl0], aux_loc[shl1]
        int3c = get_int3c ((0, nbas, 0, nbas, shl0, shl1))  # (u'v|P); shape = (3,nao,nao,p1-p0)
        # Derivative on the AO paired with mo_cas[0]
        intbuf = lib.einsum ('xuvp,vj->xupj', int3c, mo_cas[1])
        dm2buf = lib.einsum ('ui,npij->nupj', mo_cas[0], dfcasdm2[:,p0:p1,:,:])
        dE -= np.einsum ('nupj,xupj->nux', dm2buf, intbuf)
        intbuf = dm2buf = None
        # Derivative on the AO paired with mo_cas[1]
        intbuf = lib.einsum ('xuvp,vj->xupj', int3c, mo_cas[0])
        dm2buf = lib.einsum ('uj,npij->nupi', mo_cas[1], dfcasdm2[:,p0:p1,:,:])
        dE -= np.einsum ('nupj,xupj->nux', dm2buf, intbuf)
        intbuf = dm2buf = int3c = None

    # Sum AO contributions per atom, then put the set index first
    aoslices = mol.aoslice_by_atom ()
    dE = np.array ([dE[:,p0:p1].sum (axis=1) for p0, p1 in aoslices[:,2:]]).transpose (1,0,2)
    return np.ascontiguousarray (dE)
def grad_elec_auxresponse_dferi (mc_grad, mo_cas=None, ci=None, dfcasdm2=None, casdm2=None, atmlst=None, max_memory=None, dferi=None, incl_2c=True):
    ''' Evaluate the [(P'|ij) + (P'|Q) g_Qij] d_Pij contribution to the electronic gradient, where d_Pij is the
    DF-2RDM obtained by solve_df_rdm2 and g_Qij solves (P|Q) g_Qij = (P|ij). The caller must symmetrize
    if necessary (i.e., (P|Q) d_Qij = (P|kl) d_ijkl <-> (P|Q) d_Qkl = (P|ij) d_ijkl in order to get at Q').

    Args:
        mc_grad: MC-SCF gradients method object

    Kwargs:
        mo_cas: ndarray, list, or tuple containing active-space MO coefficients
            If a tuple of length 2, the same pair of MO sets are assumed to apply to
            the internally-contracted and externally-contracted indices of the DF-2rdm:
            (P|Q)d_Qij = (P|kl)d_ijkl -> (P|Q)d_Qij = (P|ij)d_ijij
            If a tuple of length 4, the 4 MO sets are applied to ijkl above in that order
            (first two external, last two internal).
        ci: ndarray, tuple, or list containing CI coefficients in mo_cas basis.
            Not used if dfcasdm2 is provided.
        dfcasdm2: ndarray, tuple, or list containing DF-2rdm in mo_cas basis.
            Computed by solve_df_rdm2 if omitted.
        casdm2: ndarray, tuple, or list containing rdm2 in mo_cas basis.
            Only forwarded to solve_df_rdm2 when dfcasdm2 is omitted.
        atmlst: list of integers
            List of nonfrozen atoms, as in grad_elec functions.
            Defaults to list (range (mol.natm)).
            NOTE(review): the value is not applied below; the result always
            covers every atom of the auxiliary molecule.
        max_memory: int
            Maximum memory usage in MB
        dferi: ndarray containing g_Pij for optional precalculation
            Used as the right operand of np.dot with the (P'|Q) integrals;
            computed by solve_df_eri and reshaped to (naux, nmo_pair) if omitted.
        incl_2c: bool
            If False, omit the terms depending on (P'|Q)

    Returns:
        dE: ndarray of shape (len (dfcasdm2), natm, 3), where natm is the
            number of rows of auxmol.aoslice_by_atom ()

    Raises:
        RuntimeError: if mo_cas is neither a 2D ndarray nor a sequence of
            length 2 or 4.
    '''
    if isinstance (mc_grad, GradientsBasics):
        mc = mc_grad.base
    else:
        mc = mc_grad
    mol = mc_grad.mol
    auxmol = mc.with_df.auxmol
    ncore, ncas, nao, naux, nbas = mc.ncore, mc.ncas, mol.nao, auxmol.nao, mol.nbas
    nocc = ncore + ncas
    npair = nao * (nao + 1) // 2
    if mo_cas is None: mo_cas = mc.mo_coeff[:,ncore:nocc]
    if max_memory is None: max_memory = mc.max_memory
    if isinstance (mo_cas, np.ndarray) and mo_cas.ndim == 2:
        mo_cas = (mo_cas,)*4
    elif len (mo_cas) == 2:
        mo_cas = (mo_cas[0], mo_cas[1], mo_cas[0], mo_cas[1])
    elif len (mo_cas) == 4:
        mo_cas = tuple (mo_cas)
    else:
        # mo_cas may be a plain list/tuple here, which has no ``.shape``;
        # measure it through numpy so the intended RuntimeError is raised
        # instead of an AttributeError.
        try:
            bad_shape = np.asarray (mo_cas).shape
        except ValueError: # ragged sequence of arrays
            bad_shape = (len (mo_cas),)
        raise RuntimeError ('Invalid shape of np.asarray (mo_cas): {}'.format (bad_shape))
    nmo = [mo.shape[1] for mo in mo_cas]
    if atmlst is None: atmlst = list (range (mol.natm))
    if ci is None: ci = mc.ci
    if dfcasdm2 is None: dfcasdm2 = solve_df_rdm2 (mc, mo_cas=mo_cas[2:], ci=ci, casdm2=casdm2) # d_Pij = (P|Q)^{-1} (Q|kl) d_ijkl
    nset = len (dfcasdm2)
    dE = np.zeros ((nset, naux, 3))
    dfcasdm2 = np.array (dfcasdm2)

    # Shape dfcasdm2
    mosym, nmo_pair, mo_conc, mo_slice = _conc_mos(mo_cas[0], mo_cas[1], compact=True)
    if 's2' in mosym:
        assert (nmo[0] == nmo[1]), 'How did I get {} with nmo[0] = {} and nmo[1] = {}'.format (mosym, nmo[0], nmo[1])
        # Fold d_Pij + d_Pji into packed lower-triangular storage and halve
        # the packed diagonal entries so the diagonal is counted once
        dfcasdm2 = dfcasdm2.reshape (nset*naux, nmo[0], nmo[1])
        dfcasdm2 += dfcasdm2.transpose (0,2,1)
        diag_idx = np.arange(nmo[0])
        diag_idx = diag_idx * (diag_idx+1) // 2 + diag_idx
        dfcasdm2 = lib.pack_tril (np.ascontiguousarray (dfcasdm2))
        dfcasdm2[:,diag_idx] *= 0.5
    dfcasdm2 = dfcasdm2.reshape (nset, naux, nmo_pair)

    # Do 2c part. Assume memory is no object
    if incl_2c:
        int2c = auxmol.intor('int2c2e_ip1')
        if (dferi is None): dferi = solve_df_eri (mc, mo_cas=mo_cas[:2]).reshape (naux, nmo_pair) # g_Pij = (P|Q)^{-1} (Q|ij)
        int3c = np.dot (int2c, dferi) # (P'|Q) g_Qij
        dE += lib.einsum ('npi,xpi->npx', dfcasdm2, int3c) # d_Pij (P'|Q) g_Qij
        int2c = int3c = dferi = None

    # Set up 3c part
    get_int3c = _int3c_wrapper(mol, auxmol, 'int3c2e_ip2', 's2ij')
    max_memory -= lib.current_memory()[0]
    blklen = 6*npair
    blksize = int (min (max (max_memory * 1e6 / 8 / blklen, 20), 240))
    aux_loc = auxmol.ao_loc
    aux_ranges = balance_partition(aux_loc, blksize)

    # Iterate over auxbasis range and do 3c part
    for shl0, shl1, nL in aux_ranges:
        p0, p1 = aux_loc[shl0], aux_loc[shl1]
        int3c = get_int3c ((0, nbas, 0, nbas, shl0, shl1))  # (uv|P'); shape = (3,npair,p1-p0)
        # Move the AO-pair index last and merge (component, aux) into rows
        # before handing the block to nr_e2
        int3c = np.ascontiguousarray (int3c.transpose (0,2,1).reshape (3*(p1-p0), npair))
        int3c = _ao2mo.nr_e2(int3c, mo_conc, mo_slice, aosym='s2', mosym=mosym)
        int3c = int3c.reshape (3,p1-p0,nmo_pair)
        int3c = np.ascontiguousarray (int3c)
        dE[:,p0:p1,:] -= lib.einsum ('npi,xpi->npx', dfcasdm2[:,p0:p1,:], int3c)

    # Ravel to atoms
    auxslices = auxmol.aoslice_by_atom ()
    dE = np.array ([dE[:,p0:p1].sum (axis=1) for p0, p1 in auxslices[:,2:]]).transpose (1,0,2)
    return np.ascontiguousarray (dE)