def gen_int3c(job_id, ish0, ish1): dataname = 'j3c-chunks/%d' % job_id i0 = ao_loc[ish0] i1 = ao_loc[ish1] dii = i1 * (i1 + 1) // 2 - i0 * (i0 + 1) // 2 if j_only: dij = dii buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dii + dii))) else: dij = (i1 - i0) * nao buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dij + dij))) auxranges = balance_segs(aux_loc[1:] - aux_loc[:-1], buflen) buflen = max([x[2] for x in auxranges]) buf = numpy.empty(nkptij * dij * buflen, dtype=dtype) buf1 = numpy.empty(dij * buflen, dtype=dtype) naux = aux_loc[-1] for kpt_id, kptij in enumerate(kptij_lst): key = '%s/%d' % (dataname, kpt_id) if aosym_s2[kpt_id]: shape = (naux, dii) else: shape = (naux, dij) if gamma_point(kptij): fswap.create_dataset(key, shape, 'f8') else: fswap.create_dataset(key, shape, 'c16') naux0 = 0 for istep, auxrange in enumerate(auxranges): log.alldebug2("aux_e1 job_id %d step %d", job_id, istep) sh0, sh1, nrow = auxrange sub_slice = (ish0, ish1, 0, cell.nbas, sh0, sh1) mat = numpy.ndarray((nkptij, dij, nrow), dtype=dtype, buffer=buf) mat = int3c(sub_slice, mat) for k, kptij in enumerate(kptij_lst): h5dat = fswap['%s/%d' % (dataname, k)] v = lib.transpose(mat[k], out=buf1) if not j_only and aosym_s2[k]: idy = idxb[i0 * (i0 + 1) // 2:i1 * (i1 + 1) // 2] - i0 * nao out = numpy.ndarray((nrow, dii), dtype=v.dtype, buffer=mat[k]) v = numpy.take(v, idy, axis=1, out=out) if gamma_point(kptij): h5dat[naux0:naux0 + nrow] = v.real else: h5dat[naux0:naux0 + nrow] = v naux0 += nrow
def gen_int3c(job_id, ish0, ish1): dataname = 'j3c-chunks/%d' % job_id i0 = ao_loc[ish0] i1 = ao_loc[ish1] dii = i1*(i1+1)//2 - i0*(i0+1)//2 dij = (i1 - i0) * nao if j_only: buflen = max(8, int(max_memory*1e6/16/(nkptij*dii+dii))) else: buflen = max(8, int(max_memory*1e6/16/(nkptij*dij+dij))) auxranges = balance_segs(aux_loc[1:]-aux_loc[:-1], buflen) buflen = max([x[2] for x in auxranges]) buf = numpy.empty(nkptij*dij*buflen, dtype=dtype) buf1 = numpy.empty(dij*buflen, dtype=dtype) naux = aux_loc[-1] for kpt_id, kptij in enumerate(kptij_lst): key = '%s/%d' % (dataname, kpt_id) if aosym_s2[kpt_id]: shape = (naux, dii) else: shape = (naux, dij) if gamma_point(kptij): fswap.create_dataset(key, shape, 'f8') else: fswap.create_dataset(key, shape, 'c16') naux0 = 0 for istep, auxrange in enumerate(auxranges): log.alldebug2("aux_e1 job_id %d step %d", job_id, istep) sh0, sh1, nrow = auxrange sub_slice = (ish0, ish1, 0, cell.nbas, sh0, sh1) if j_only: mat = numpy.ndarray((nkptij,dii,nrow), dtype=dtype, buffer=buf) else: mat = numpy.ndarray((nkptij,dij,nrow), dtype=dtype, buffer=buf) mat = int3c(sub_slice, mat) for k, kptij in enumerate(kptij_lst): h5dat = fswap['%s/%d'%(dataname,k)] v = lib.transpose(mat[k], out=buf1) if not j_only and aosym_s2[k]: idy = idxb[i0*(i0+1)//2:i1*(i1+1)//2] - i0 * nao out = numpy.ndarray((nrow,dii), dtype=v.dtype, buffer=mat[k]) v = numpy.take(v, idy, axis=1, out=out) if gamma_point(kptij): h5dat[naux0:naux0+nrow] = v.real else: h5dat[naux0:naux0+nrow] = v naux0 += nrow
def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=numpy.zeros((1,3)), kpts_band=None): mydf = _sync_mydf(mydf) cell = mydf.cell mesh = mydf.mesh dm_kpts = lib.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) nset, nkpts, nao = dms.shape[:3] coulG = tools.get_coulG(cell, mesh=mesh) ngrids = len(coulG) vR = rhoR = numpy.zeros((nset,ngrids)) for ao_ks_etc, p0, p1 in mydf.mpi_aoR_loop(mydf.grids, kpts): ao_ks = ao_ks_etc[0] for k, ao in enumerate(ao_ks): for i in range(nset): rhoR[i,p0:p1] += numint.eval_rho(cell, ao, dms[i,k]) ao = ao_ks = None rhoR = mpi.allreduce(rhoR) for i in range(nset): rhoR[i] *= 1./nkpts rhoG = tools.fft(rhoR[i], mesh) vG = coulG * rhoG vR[i] = tools.ifft(vG, mesh).real kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band nband = len(kpts_band) weight = cell.vol / ngrids vR *= weight if gamma_point(kpts_band): vj_kpts = numpy.zeros((nset,nband,nao,nao)) else: vj_kpts = numpy.zeros((nset,nband,nao,nao), dtype=numpy.complex128) for ao_ks_etc, p0, p1 in mydf.mpi_aoR_loop(mydf.grids, kpts_band): ao_ks = ao_ks_etc[0] for k, ao in enumerate(ao_ks): for i in range(nset): vj_kpts[i,k] += lib.dot(ao.T.conj()*vR[i,p0:p1], ao) vj_kpts = mpi.reduce(vj_kpts) if gamma_point(kpts_band): vj_kpts = vj_kpts.real return _format_jks(vj_kpts, dm_kpts, input_band, kpts)
def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1, 3)), kpts_band=None): '''Get the Coulomb (J) AO matrix at sampled k-points. Args: dm_kpts : (nkpts, nao, nao) ndarray or a list of (nkpts,nao,nao) ndarray Density matrix at each k-point. If a list of k-point DMs, eg, UHF alpha and beta DM, the alpha and beta DMs are contracted separately. kpts : (nkpts, 3) ndarray Kwargs: kpts_band : (3,) ndarray or (*,3) ndarray A list of arbitrary "band" k-points at which to evalute the matrix. Returns: vj : (nkpts, nao, nao) ndarray or list of vj if the input dm_kpts is a list of DMs ''' cell = mydf.cell gs = mydf.gs dm_kpts = lib.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) nset, nkpts, nao = dms.shape[:3] coulG = tools.get_coulG(cell, gs=gs) ngs = len(coulG) vR = rhoR = np.zeros((nset, ngs)) for k, aoR in mydf.aoR_loop(gs, kpts): for i in range(nset): rhoR[i] += numint.eval_rho(cell, aoR, dms[i, k]) for i in range(nset): rhoR[i] *= 1. / nkpts rhoG = tools.fft(rhoR[i], gs) vG = coulG * rhoG vR[i] = tools.ifft(vG, gs).real kpts_band, single_kpt_band = _format_kpts_band(kpts_band, kpts) nband = len(kpts_band) vj_kpts = [] weight = cell.vol / ngs if gamma_point(kpts_band): vj_kpts = np.empty((nset, nband, nao, nao)) else: vj_kpts = np.empty((nset, nband, nao, nao), dtype=np.complex128) for k, aoR in mydf.aoR_loop(gs, kpts_band): for i in range(nset): vj_kpts[i, k] = weight * lib.dot(aoR.T.conj() * vR[i], aoR) return _format_jks(vj_kpts, dm_kpts, kpts_band, kpts, single_kpt_band)
def ft_fuse(job_id, uniq_kptji_id, sh0, sh1): kpt = uniq_kpts[uniq_kptji_id] # kpt = kptj - kpti adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0] adapted_kptjs = kptjs[adapted_ji_idx] nkptj = len(adapted_kptjs) shls_slice = (auxcell.nbas, fused_cell.nbas) Gaux = ft_ao.ft_ao(fused_cell, Gv, shls_slice, b, gxyz, Gvbase, kpt) Gaux *= mydf.weighted_coulG(kpt, False, gs).reshape(-1, 1) kLR = Gaux.real.copy('C') kLI = Gaux.imag.copy('C') j2c = numpy.asarray(feri['j2c/%d' % uniq_kptji_id]) j2ctag = j2ctags[uniq_kptji_id] naux0 = j2c.shape[0] if is_zero(kpt): aosym = 's2' else: aosym = 's1' j3cR = [None] * nkptj j3cI = [None] * nkptj i0 = ao_loc[sh0] i1 = ao_loc[sh1] for k, idx in enumerate(adapted_ji_idx): key = 'j3c-chunks/%d/%d' % (job_id, idx) v = numpy.asarray(feri[key]) if is_zero(kpt): for i, c in enumerate(vbar): if c != 0: v[i] -= c * ovlp[k][i0 * (i0 + 1) // 2:i1 * (i1 + 1) // 2].ravel() j3cR[k] = numpy.asarray(v.real, order='C') if v.dtype == numpy.complex128: j3cI[k] = numpy.asarray(v.imag, order='C') v = None ncol = j3cR[0].shape[1] Gblksize = max(16, int(max_memory * 1e6 / 16 / ncol / (nkptj + 1))) # +1 for pqkRbuf/pqkIbuf Gblksize = min(Gblksize, ngs, 16384) pqkRbuf = numpy.empty(ncol * Gblksize) pqkIbuf = numpy.empty(ncol * Gblksize) buf = numpy.empty(nkptj * ncol * Gblksize, dtype=numpy.complex128) log.alldebug2(' blksize (%d,%d)', Gblksize, ncol) shls_slice = (sh0, sh1, 0, cell.nbas) for p0, p1 in lib.prange(0, ngs, Gblksize): dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, adapted_kptjs, out=buf) nG = p1 - p0 for k, ji in enumerate(adapted_ji_idx): aoao = dat[k].reshape(nG, ncol) pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf) pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf) pqkR[:] = aoao.real.T pqkI[:] = aoao.imag.T lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k][naux:], 1) lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k][naux:], 1) if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])): lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k][naux:], 1) lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k][naux:], 1) for k, idx in enumerate(adapted_ji_idx): if is_zero(kpt) and gamma_point(adapted_kptjs[k]): v = fuse(j3cR[k]) else: v = fuse(j3cR[k] + j3cI[k] * 1j) if j2ctag == 'CD': v = scipy.linalg.solve_triangular(j2c, v, lower=True, overwrite_b=True) else: v = lib.dot(j2c, v) feri['j3c-chunks/%d/%d' % (job_id, idx)][:naux0] = v
def get_jk(mydf, dm, hermi=1, kpt=numpy.zeros(3), kpt_band=None, with_j=True, with_k=True, exxdiv=None): '''JK for given k-point''' from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0 vj = vk = None if kpt_band is not None and abs(kpt-kpt_band).sum() > 1e-9: kpt = numpy.reshape(kpt, (1,3)) if with_k: vk = get_k_kpts(mydf, dm, hermi, kpt, kpt_band, exxdiv) if with_j: vj = get_j_kpts(mydf, dm, hermi, kpt, kpt_band) return vj, vk cell = mydf.cell log = logger.Logger(mydf.stdout, mydf.verbose) t1 = (time.clock(), time.time()) dm = numpy.asarray(dm, order='C') dms = _format_dms(dm, [kpt]) nset, _, nao = dms.shape[:3] dms = dms.reshape(nset,nao,nao) j_real = gamma_point(kpt) k_real = gamma_point(kpt) and not numpy.iscomplexobj(dms) kptii = numpy.asarray((kpt,kpt)) kpt_allow = numpy.zeros(3) if with_j: vjcoulG = mydf.weighted_coulG(kpt_allow, False, mydf.gs) vjR = numpy.zeros((nset,nao,nao)) vjI = numpy.zeros((nset,nao,nao)) if with_k: mydf.exxdiv = exxdiv vkcoulG = mydf.weighted_coulG(kpt_allow, True, mydf.gs) vkR = numpy.zeros((nset,nao,nao)) vkI = numpy.zeros((nset,nao,nao)) dmsR = numpy.asarray(dms.real.reshape(nset,nao,nao), order='C') dmsI = numpy.asarray(dms.imag.reshape(nset,nao,nao), order='C') mem_now = lib.current_memory()[0] max_memory = max(2000, (mydf.max_memory - mem_now)) * .8 log.debug1('max_memory = %d MB (%d in use)', max_memory, mem_now) t2 = t1 # rho_rs(-G+k_rs) is computed as conj(rho_{rs^*}(G-k_rs)) # == conj(transpose(rho_sr(G+k_sr), (0,2,1))) blksize = max(int(max_memory*.25e6/16/nao**2), 16) bufR = numpy.empty(blksize*nao**2) bufI = numpy.empty(blksize*nao**2) for pqkR, pqkI, p0, p1 in mydf.pw_loop(mydf.gs, kptii, max_memory=max_memory): t2 = log.timer_debug1('%d:%d ft_aopair'%(p0,p1), *t2) pqkR = pqkR.reshape(nao,nao,-1) pqkI = pqkI.reshape(nao,nao,-1) if with_j: #:v4 = numpy.einsum('ijL,lkL->ijkl', pqk, pqk.conj()) #:vj += numpy.einsum('ijkl,lk->ij', v4, dm) for i in range(nset): rhoR = numpy.einsum('pq,pqk->k', dmsR[i], pqkR) rhoR+= numpy.einsum('pq,pqk->k', dmsI[i], pqkI) rhoI = numpy.einsum('pq,pqk->k', dmsI[i], pqkR) rhoI-= numpy.einsum('pq,pqk->k', dmsR[i], pqkI) rhoR *= vjcoulG[p0:p1] rhoI *= vjcoulG[p0:p1] vjR[i] += numpy.einsum('pqk,k->pq', pqkR, rhoR) vjR[i] -= numpy.einsum('pqk,k->pq', pqkI, rhoI) if not j_real: vjI[i] += numpy.einsum('pqk,k->pq', pqkR, rhoI) vjI[i] += numpy.einsum('pqk,k->pq', pqkI, rhoR) #t2 = log.timer_debug1(' with_j', *t2) if with_k: coulG = numpy.sqrt(vkcoulG[p0:p1]) pqkR *= coulG pqkI *= coulG #:v4 = numpy.einsum('ijL,lkL->ijkl', pqk, pqk.conj()) #:vk += numpy.einsum('ijkl,jk->il', v4, dm) pLqR = lib.transpose(pqkR, axes=(0,2,1), out=bufR).reshape(-1,nao) pLqI = lib.transpose(pqkI, axes=(0,2,1), out=bufI).reshape(-1,nao) iLkR = numpy.ndarray((nao*(p1-p0),nao), buffer=pqkR) iLkI = numpy.ndarray((nao*(p1-p0),nao), buffer=pqkI) for i in range(nset): if k_real: lib.dot(pLqR, dmsR[i], 1, iLkR) lib.dot(pLqI, dmsR[i], 1, iLkI) lib.dot(iLkR.reshape(nao,-1), pLqR.reshape(nao,-1).T, 1, vkR[i], 1) lib.dot(iLkI.reshape(nao,-1), pLqI.reshape(nao,-1).T, 1, vkR[i], 1) else: zdotNN(pLqR, pLqI, dmsR[i], dmsI[i], 1, iLkR, iLkI) zdotNC(iLkR.reshape(nao,-1), iLkI.reshape(nao,-1), pLqR.reshape(nao,-1).T, pLqI.reshape(nao,-1).T, 1, vkR[i], vkI[i]) #t2 = log.timer_debug1(' with_k', *t2) pqkR = pqkI = coulG = pLqR = pLqI = iLkR = iLkI = None #t2 = log.timer_debug1('%d:%d'%(p0,p1), *t2) bufR = bufI = None t1 = log.timer_debug1('aft_jk.get_jk', *t1) if with_j: if j_real: vj = vjR else: vj = vjR + vjI * 1j vj = vj.reshape(dm.shape) if with_k: if k_real: vk = vkR else: vk = vkR + vkI * 1j if cell.dimension != 3 and exxdiv: assert(exxdiv.lower() == 'ewald') _ewald_exxdiv_for_G0(cell, kpt, dms, vk) vk = vk.reshape(dm.shape) return vj, vk
def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=numpy.zeros((1,3)), kpts_band=None, exxdiv=None): cell = mydf.cell log = logger.Logger(mydf.stdout, mydf.verbose) t1 = (time.clock(), time.time()) dm_kpts = lib.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) nset, nkpts, nao = dms.shape[:3] swap_2e = (kpts_band is None) kpts_band, single_kpt_band = _format_kpts_band(kpts_band, kpts) nband = len(kpts_band) kk_table = kpts_band.reshape(-1,1,3) - kpts.reshape(1,-1,3) kk_todo = numpy.ones(kk_table.shape[:2], dtype=bool) vkR = numpy.zeros((nset,nband,nao,nao)) vkI = numpy.zeros((nset,nband,nao,nao)) dmsR = numpy.asarray(dms.real, order='C') dmsI = numpy.asarray(dms.imag, order='C') mem_now = lib.current_memory()[0] max_memory = max(2000, (mydf.max_memory - mem_now)) * .8 log.debug1('max_memory = %d MB (%d in use)', max_memory, mem_now) # K_pq = ( p{k1} i{k2} | i{k2} q{k1} ) def make_kpt(kpt): # kpt = kptj - kpti # search for all possible ki and kj that has ki-kj+kpt=0 kk_match = numpy.einsum('ijx->ij', abs(kk_table + kpt)) < 1e-9 kpti_idx, kptj_idx = numpy.where(kk_todo & kk_match) nkptj = len(kptj_idx) log.debug1('kpt = %s', kpt) log.debug2('kpti_idx = %s', kpti_idx) log.debug2('kptj_idx = %s', kptj_idx) kk_todo[kpti_idx,kptj_idx] = False if swap_2e and not is_zero(kpt): kk_todo[kptj_idx,kpti_idx] = False max_memory1 = max_memory * (nkptj+1)/(nkptj+5) blksize = max(int(max_memory1*4e6/(nkptj+5)/16/nao**2), 16) bufR = numpy.empty((blksize*nao**2)) bufI = numpy.empty((blksize*nao**2)) # Use DF object to mimic KRHF/KUHF object in function get_coulG mydf.exxdiv = exxdiv vkcoulG = mydf.weighted_coulG(kpt, True, mydf.gs) kptjs = kpts[kptj_idx] # <r|-G+k_rs|s> = conj(<s|G-k_rs|r>) = conj(<s|G+k_sr|r>) for k, pqkR, pqkI, p0, p1 \ in mydf.ft_loop(mydf.gs, kpt, kptjs, max_memory=max_memory1): ki = kpti_idx[k] kj = kptj_idx[k] coulG = numpy.sqrt(vkcoulG[p0:p1]) # case 1: k_pq = (pi|iq) #:v4 = numpy.einsum('ijL,lkL->ijkl', pqk, pqk.conj()) #:vk += numpy.einsum('ijkl,jk->il', v4, dm) pqkR *= coulG pqkI *= coulG pLqR = lib.transpose(pqkR.reshape(nao,nao,-1), axes=(0,2,1), out=bufR) pLqI = lib.transpose(pqkI.reshape(nao,nao,-1), axes=(0,2,1), out=bufI) iLkR = numpy.empty((nao*(p1-p0),nao)) iLkI = numpy.empty((nao*(p1-p0),nao)) for i in range(nset): iLkR, iLkI = zdotNN(pLqR.reshape(-1,nao), pLqI.reshape(-1,nao), dmsR[i,kj], dmsI[i,kj], 1, iLkR, iLkI) zdotNC(iLkR.reshape(nao,-1), iLkI.reshape(nao,-1), pLqR.reshape(nao,-1).T, pLqI.reshape(nao,-1).T, 1, vkR[i,ki], vkI[i,ki], 1) # case 2: k_pq = (iq|pi) #:v4 = numpy.einsum('iLj,lLk->ijkl', pqk, pqk.conj()) #:vk += numpy.einsum('ijkl,li->kj', v4, dm) if swap_2e and not is_zero(kpt): iLkR = iLkR.reshape(nao,-1) iLkI = iLkI.reshape(nao,-1) for i in range(nset): iLkR, iLkI = zdotNN(dmsR[i,ki], dmsI[i,ki], pLqR.reshape(nao,-1), pLqI.reshape(nao,-1), 1, iLkR, iLkI) zdotCN(pLqR.reshape(-1,nao).T, pLqI.reshape(-1,nao).T, iLkR.reshape(-1,nao), iLkI.reshape(-1,nao), 1, vkR[i,kj], vkI[i,kj], 1) pqkR = pqkI = coulG = pLqR = pLqI = iLkR = iLkI = None for ki, kpti in enumerate(kpts_band): for kj, kptj in enumerate(kpts): if kk_todo[ki,kj]: make_kpt(kptj-kpti) if (gamma_point(kpts) and gamma_point(kpts_band) and not numpy.iscomplexobj(dm_kpts)): vk_kpts = vkR else: vk_kpts = vkR + vkI * 1j vk_kpts *= 1./nkpts # G=0 was not included in the non-uniform grids if cell.dimension != 3 and exxdiv: assert(exxdiv.lower() == 'ewald') _ewald_exxdiv_for_G0(cell, kpts_band, dms, vk_kpts, kpts_band) return _format_jks(vk_kpts, dm_kpts, kpts_band, kpts, single_kpt_band)
def get_jk(mydf, dm, hermi=1, kpt=numpy.zeros(3), kpt_band=None, with_j=True, with_k=True, exxdiv=None): '''JK for given k-point''' from pyscf.pbc.df.df_jk import _ewald_exxdiv_for_G0 vj = vk = None if kpt_band is not None and abs(kpt-kpt_band).sum() > 1e-9: kpt = numpy.reshape(kpt, (1,3)) if with_k: vk = get_k_kpts(mydf, [dm], hermi, kpt, kpt_band, exxdiv) if with_j: vj = get_j_kpts(mydf, [dm], hermi, kpt, kpt_band) return vj, vk cell = mydf.cell log = logger.Logger(mydf.stdout, mydf.verbose) t1 = (time.clock(), time.time()) dm = numpy.asarray(dm, order='C') dms = _format_dms(dm, [kpt]) nset, _, nao = dms.shape[:3] dms = dms.reshape(nset,nao,nao) j_real = gamma_point(kpt) k_real = gamma_point(kpt) and not numpy.iscomplexobj(dms) kptii = numpy.asarray((kpt,kpt)) kpt_allow = numpy.zeros(3) if with_j: vjcoulG = mydf.weighted_coulG(kpt_allow, False, mydf.gs) vjR = numpy.zeros((nset,nao,nao)) vjI = numpy.zeros((nset,nao,nao)) if with_k: mydf.exxdiv = exxdiv vkcoulG = mydf.weighted_coulG(kpt_allow, True, mydf.gs) vkR = numpy.zeros((nset,nao,nao)) vkI = numpy.zeros((nset,nao,nao)) dmsR = numpy.asarray(dms.real.reshape(nset,nao,nao), order='C') dmsI = numpy.asarray(dms.imag.reshape(nset,nao,nao), order='C') mem_now = lib.current_memory()[0] max_memory = max(2000, (mydf.max_memory - mem_now)) * .8 log.debug1('max_memory = %d MB (%d in use)', max_memory, mem_now) t2 = t1 # rho_rs(-G+k_rs) is computed as conj(rho_{rs^*}(G-k_rs)) # == conj(transpose(rho_sr(G+k_sr), (0,2,1))) blksize = max(int(max_memory*.25e6/16/nao**2), 16) bufR = numpy.empty(blksize*nao**2) bufI = numpy.empty(blksize*nao**2) for pqkR, pqkI, p0, p1 in mydf.pw_loop(mydf.gs, kptii, max_memory=max_memory): t2 = log.timer_debug1('%d:%d ft_aopair'%(p0,p1), *t2) pqkR = pqkR.reshape(nao,nao,-1) pqkI = pqkI.reshape(nao,nao,-1) if with_j: for i in range(nset): rhoR = numpy.einsum('pq,pqk->k', dmsR[i], pqkR) rhoR+= numpy.einsum('pq,pqk->k', dmsI[i], pqkI) rhoI = numpy.einsum('pq,pqk->k', dmsI[i], pqkR) rhoI-= numpy.einsum('pq,pqk->k', dmsR[i], pqkI) rhoR *= vjcoulG[p0:p1] rhoI *= vjcoulG[p0:p1] vjR[i] += numpy.einsum('pqk,k->pq', pqkR, rhoR) vjR[i] -= numpy.einsum('pqk,k->pq', pqkI, rhoI) if not j_real: vjI[i] += numpy.einsum('pqk,k->pq', pqkR, rhoI) vjI[i] += numpy.einsum('pqk,k->pq', pqkI, rhoR) #t2 = log.timer_debug1(' with_j', *t2) if with_k: coulG = numpy.sqrt(vkcoulG[p0:p1]) pqkR *= coulG pqkI *= coulG #:v4 = numpy.einsum('ijL,lkL->ijkl', pqk, pqk.conj()) #:vk += numpy.einsum('ijkl,jk->il', v4, dm) pLqR = lib.transpose(pqkR, axes=(0,2,1), out=bufR).reshape(-1,nao) pLqI = lib.transpose(pqkI, axes=(0,2,1), out=bufI).reshape(-1,nao) iLkR = numpy.ndarray((nao*(p1-p0),nao), buffer=pqkR) iLkI = numpy.ndarray((nao*(p1-p0),nao), buffer=pqkI) for i in range(nset): if k_real: lib.dot(pLqR, dmsR[i], 1, iLkR) lib.dot(pLqI, dmsR[i], 1, iLkI) lib.dot(iLkR.reshape(nao,-1), pLqR.reshape(nao,-1).T, 1, vkR[i], 1) lib.dot(iLkI.reshape(nao,-1), pLqI.reshape(nao,-1).T, 1, vkR[i], 1) else: zdotNN(pLqR, pLqI, dmsR[i], dmsI[i], 1, iLkR, iLkI) zdotNC(iLkR.reshape(nao,-1), iLkI.reshape(nao,-1), pLqR.reshape(nao,-1).T, pLqI.reshape(nao,-1).T, 1, vkR[i], vkI[i]) #t2 = log.timer_debug1(' with_k', *t2) pqkR = pqkI = coulG = pLqR = pLqI = iLkR = iLkI = None #t2 = log.timer_debug1('%d:%d'%(p0,p1), *t2) bufR = bufI = None t1 = log.timer_debug1('pwdf_jk.get_jk', *t1) if with_j: if j_real: vj = vjR else: vj = vjR + vjI * 1j vj = vj.reshape(dm.shape) if with_k: if k_real: vk = vkR else: vk = vkR + vkI * 1j if cell.dimension != 3 and exxdiv is not None: assert(exxdiv.lower() == 'ewald') _ewald_exxdiv_for_G0(cell, kpt, dms, vk) vk = vk.reshape(dm.shape) return vj, vk
def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=numpy.zeros((1,3)), kpts_band=None, exxdiv=None): mydf = _sync_mydf(mydf) cell = mydf.cell mesh = mydf.mesh coords = cell.gen_uniform_grids(mesh) ngrids = coords.shape[0] if hasattr(dm_kpts, 'mo_coeff'): if dm_kpts.ndim == 3: # KRHF mo_coeff = [dm_kpts.mo_coeff] mo_occ = [dm_kpts.mo_occ ] else: # KUHF mo_coeff = dm_kpts.mo_coeff mo_occ = dm_kpts.mo_occ elif hasattr(dm_kpts[0], 'mo_coeff'): mo_coeff = [dm.mo_coeff for dm in dm_kpts] mo_occ = [dm.mo_occ for dm in dm_kpts] else: mo_coeff = None kpts = numpy.asarray(kpts) dm_kpts = lib.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) nset, nkpts, nao = dms.shape[:3] weight = 1./nkpts * (cell.vol/ngrids) kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band nband = len(kpts_band) if gamma_point(kpts_band) and gamma_point(kpts): vk_kpts = numpy.zeros((nset,nband,nao,nao), dtype=dms.dtype) else: vk_kpts = numpy.zeros((nset,nband,nao,nao), dtype=numpy.complex128) coords = mydf.grids.coords ao2_kpts = [numpy.asarray(ao.T, order='C') for ao in mydf._numint.eval_ao(cell, coords, kpts=kpts)] if input_band is None: ao1_kpts = ao2_kpts else: ao1_kpts = [numpy.asarray(ao.T, order='C') for ao in mydf._numint.eval_ao(cell, coords, kpts=kpts_band)] mem_now = lib.current_memory()[0] max_memory = mydf.max_memory - mem_now blksize = int(min(nao, max(1, (max_memory-mem_now)*1e6/16/4/ngrids/nao))) lib.logger.debug1(mydf, 'max_memory %s blksize %d', max_memory, blksize) ao1_dtype = numpy.result_type(*ao1_kpts) ao2_dtype = numpy.result_type(*ao2_kpts) vR_dm = numpy.empty((nset,nao,ngrids), dtype=vk_kpts.dtype) ao_dms_buf = [None] * nkpts tasks = [(k1,k2) for k2 in range(nkpts) for k1 in range(nband)] for k1, k2 in mpi.static_partition(tasks): ao1T = ao1_kpts[k1] ao2T = ao2_kpts[k2] kpt1 = kpts_band[k1] kpt2 = kpts[k2] if ao2T.size == 0 or ao1T.size == 0: continue # If we have an ewald exxdiv, we add the G=0 correction near the # end of the function to bypass any discretization errors # that arise from the FFT. mydf.exxdiv = exxdiv if exxdiv == 'ewald' or exxdiv is None: coulG = tools.get_coulG(cell, kpt2-kpt1, False, mydf, mesh) else: coulG = tools.get_coulG(cell, kpt2-kpt1, True, mydf, mesh) if is_zero(kpt1-kpt2): expmikr = numpy.array(1.) else: expmikr = numpy.exp(-1j * numpy.dot(coords, kpt2-kpt1)) if ao_dms_buf[k2] is None: if mo_coeff is None: ao_dms = [lib.dot(dm[k2], ao2T.conj()) for dm in dms] else: ao_dms = [] for i, dm in enumerate(dms): occ = mo_occ[i][k2] mo_scaled = mo_coeff[i][k2][:,occ>0] * numpy.sqrt(occ[occ>0]) ao_dms.append(lib.dot(mo_scaled.T, ao2T).conj()) ao_dms_buf[k2] = ao_dms else: ao_dms = ao_dms_buf[k2] if mo_coeff is None: for p0, p1 in lib.prange(0, nao, blksize): rho1 = numpy.einsum('ig,jg->ijg', ao1T[p0:p1].conj()*expmikr, ao2T) vG = tools.fft(rho1.reshape(-1,ngrids), mesh) rho1 = None vG *= coulG vR = tools.ifft(vG, mesh).reshape(p1-p0,nao,ngrids) vG = None if vR_dm.dtype == numpy.double: vR = vR.real for i in range(nset): numpy.einsum('ijg,jg->ig', vR, ao_dms[i], out=vR_dm[i,p0:p1]) vR = None else: for p0, p1 in lib.prange(0, nao, blksize): for i in range(nset): rho1 = numpy.einsum('ig,jg->ijg', ao1T[p0:p1].conj()*expmikr, ao_dms[i].conj()) vG = tools.fft(rho1.reshape(-1,ngrids), mesh) rho1 = None vG *= coulG vR = tools.ifft(vG, mesh).reshape(p1-p0,-1,ngrids) vG = None if vR_dm.dtype == numpy.double: vR = vR.real numpy.einsum('ijg,jg->ig', vR, ao_dms[i], out=vR_dm[i,p0:p1]) vR = None vR_dm *= expmikr.conj() for i in range(nset): vk_kpts[i,k1] += weight * lib.dot(vR_dm[i], ao1T.T) vk_kpts = mpi.reduce(lib.asarray(vk_kpts)) if gamma_point(kpts_band) and gamma_point(kpts): vk_kpts = vk_kpts.real if rank == 0: if exxdiv == 'ewald': _ewald_exxdiv_for_G0(cell, kpts, dms, vk_kpts, kpts_band=kpts_band) return _format_jks(vk_kpts, dm_kpts, input_band, kpts)
def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=numpy.zeros((1,3)), kpt_band=None, exxdiv=None): cell = mydf.cell log = logger.Logger(mydf.stdout, mydf.verbose) t1 = (time.clock(), time.time()) dm_kpts = lib.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) nset, nkpts, nao = dms.shape[:3] if kpt_band is None: kpts_band = kpts swap_2e = True else: kpts_band = numpy.reshape(kpt_band, (-1,3)) nband = len(kpts_band) kk_table = kpts_band.reshape(-1,1,3) - kpts.reshape(1,-1,3) kk_todo = numpy.ones(kk_table.shape[:2], dtype=bool) vkR = numpy.zeros((nset,nband,nao,nao)) vkI = numpy.zeros((nset,nband,nao,nao)) dmsR = numpy.asarray(dms.real, order='C') dmsI = numpy.asarray(dms.imag, order='C') mem_now = lib.current_memory()[0] max_memory = max(2000, (mydf.max_memory - mem_now)) * .8 log.debug1('max_memory = %d MB (%d in use)', max_memory, mem_now) # K_pq = ( p{k1} i{k2} | i{k2} q{k1} ) def make_kpt(kpt): # kpt = kptj - kpti # search for all possible ki and kj that has ki-kj+kpt=0 kk_match = numpy.einsum('ijx->ij', abs(kk_table + kpt)) < 1e-9 kpti_idx, kptj_idx = numpy.where(kk_todo & kk_match) nkptj = len(kptj_idx) log.debug1('kpt = %s', kpt) log.debug2('kpti_idx = %s', kpti_idx) log.debug2('kptj_idx = %s', kptj_idx) kk_todo[kpti_idx,kptj_idx] = False if swap_2e and not is_zero(kpt): kk_todo[kptj_idx,kpti_idx] = False max_memory1 = max_memory * (nkptj+1)/(nkptj+5) blksize = max(int(max_memory1*4e6/(nkptj+5)/16/nao**2), 16) bufR = numpy.empty((blksize*nao**2)) bufI = numpy.empty((blksize*nao**2)) # Use DF object to mimic KRHF/KUHF object in function get_coulG mydf.exxdiv = exxdiv vkcoulG = mydf.weighted_coulG(kpt, True, mydf.gs) kptjs = kpts[kptj_idx] # <r|-G+k_rs|s> = conj(<s|G-k_rs|r>) = conj(<s|G+k_sr|r>) for k, pqkR, pqkI, p0, p1 \ in mydf.ft_loop(mydf.gs, kpt, kptjs, max_memory=max_memory1): ki = kpti_idx[k] kj = kptj_idx[k] coulG = numpy.sqrt(vkcoulG[p0:p1]) # case 1: k_pq = (pi|iq) #:v4 = numpy.einsum('ijL,lkL->ijkl', pqk, pqk.conj()) #:vk += numpy.einsum('ijkl,jk->il', v4, dm) pqkR *= coulG pqkI *= coulG pLqR = lib.transpose(pqkR.reshape(nao,nao,-1), axes=(0,2,1), out=bufR) pLqI = lib.transpose(pqkI.reshape(nao,nao,-1), axes=(0,2,1), out=bufI) iLkR = numpy.empty((nao*(p1-p0),nao)) iLkI = numpy.empty((nao*(p1-p0),nao)) for i in range(nset): iLkR, iLkI = zdotNN(pLqR.reshape(-1,nao), pLqI.reshape(-1,nao), dmsR[i,kj], dmsI[i,kj], 1, iLkR, iLkI) zdotNC(iLkR.reshape(nao,-1), iLkI.reshape(nao,-1), pLqR.reshape(nao,-1).T, pLqI.reshape(nao,-1).T, 1, vkR[i,ki], vkI[i,ki], 1) # case 2: k_pq = (iq|pi) #:v4 = numpy.einsum('iLj,lLk->ijkl', pqk, pqk.conj()) #:vk += numpy.einsum('ijkl,li->kj', v4, dm) if swap_2e and not is_zero(kpt): iLkR = iLkR.reshape(nao,-1) iLkI = iLkI.reshape(nao,-1) for i in range(nset): iLkR, iLkI = zdotNN(dmsR[i,ki], dmsI[i,ki], pLqR.reshape(nao,-1), pLqI.reshape(nao,-1), 1, iLkR, iLkI) zdotCN(pLqR.reshape(-1,nao).T, pLqI.reshape(-1,nao).T, iLkR.reshape(-1,nao), iLkI.reshape(-1,nao), 1, vkR[i,kj], vkI[i,kj], 1) pqkR = pqkI = coulG = pLqR = pLqI = iLkR = iLkI = None for ki, kpti in enumerate(kpts_band): for kj, kptj in enumerate(kpts): if kk_todo[ki,kj]: make_kpt(kptj-kpti) if (gamma_point(kpts) and gamma_point(kpts_band) and not numpy.iscomplexobj(dm_kpts)): vk_kpts = vkR else: vk_kpts = vkR + vkI * 1j vk_kpts *= 1./nkpts # G=0 was not included in the non-uniform grids if cell.dimension != 3 and exxdiv is not None: assert(exxdiv.lower() == 'ewald') _ewald_exxdiv_for_G0(cell, kpts_band, dms, vk_kpts) if kpt_band is not None and numpy.shape(kpt_band) == (3,): if dm_kpts.ndim == 3: # One set of dm_kpts for KRHF return vk_kpts[0,0] else: return vk_kpts[:,0] else: return vk_kpts.reshape(dm_kpts.shape)
def make_kpt(uniq_kptji_id): # kpt = kptj - kpti kpt = uniq_kpts[uniq_kptji_id] log.debug1('kpt = %s', kpt) adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0] adapted_kptjs = kptjs[adapted_ji_idx] nkptj = len(adapted_kptjs) log.debug1('adapted_ji_idx = %s', adapted_ji_idx) kLR = kLRs[uniq_kptji_id] kLI = kLIs[uniq_kptji_id] if is_zero(kpt): # kpti == kptj aosym = 's2' nao_pair = nao * (nao + 1) // 2 vbar = fuse(mydf.auxbar(fused_cell)) ovlp = cell.pbc_intor('cint1e_ovlp_sph', hermi=1, kpts=adapted_kptjs) for k, ji in enumerate(adapted_ji_idx): ovlp[k] = lib.pack_tril(ovlp[k]) else: aosym = 's1' nao_pair = nao**2 mem_now = lib.current_memory()[0] log.debug2('memory = %s', mem_now) max_memory = max(2000, mydf.max_memory - mem_now) # nkptj for 3c-coulomb arrays plus 1 Lpq array buflen = min( max(int(max_memory * .6 * 1e6 / 16 / naux / (nkptj + 1)), 1), nao_pair) shranges = _guess_shell_ranges(cell, buflen, aosym) buflen = max([x[2] for x in shranges]) # +1 for a pqkbuf if aosym == 's2': Gblksize = max( 16, int(max_memory * .2 * 1e6 / 16 / buflen / (nkptj + 1))) else: Gblksize = max( 16, int(max_memory * .4 * 1e6 / 16 / buflen / (nkptj + 1))) Gblksize = min(Gblksize, ngs, 16384) pqkRbuf = numpy.empty(buflen * Gblksize) pqkIbuf = numpy.empty(buflen * Gblksize) # buf for ft_aopair buf = numpy.zeros((nkptj, buflen * Gblksize), dtype=numpy.complex128) col1 = 0 for istep, sh_range in enumerate(shranges): log.debug1('int3c2e [%d/%d], AO [%d:%d], ncol = %d', \ istep+1, len(shranges), *sh_range) bstart, bend, ncol = sh_range col0, col1 = col1, col1 + ncol j3cR = [] j3cI = [] for k, idx in enumerate(adapted_ji_idx): v = numpy.asarray(feri['j3c/%d' % idx][:, col0:col1]) if is_zero(kpt): for i, c in enumerate(vbar): if c != 0: v[i] -= c * ovlp[k][col0:col1] j3cR.append(numpy.asarray(v.real, order='C')) if is_zero(kpt) and gamma_point(adapted_kptjs[k]): j3cI.append(None) else: j3cI.append(numpy.asarray(v.imag, order='C')) v = None if aosym == 's2': shls_slice = (bstart, bend, 0, bend) for p0, p1 in lib.prange(0, ngs, Gblksize): ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, adapted_kptjs, out=buf) nG = p1 - p0 for k, ji in enumerate(adapted_ji_idx): aoao = numpy.ndarray((nG, ncol), dtype=numpy.complex128, order='F', buffer=buf[k]) pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf) pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf) pqkR[:] = aoao.real.T pqkI[:] = aoao.imag.T aoao[:] = 0 lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k][naux:], 1) lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k][naux:], 1) if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])): lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k][naux:], 1) lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k][naux:], 1) else: shls_slice = (bstart, bend, 0, cell.nbas) ni = ncol // nao for p0, p1 in lib.prange(0, ngs, Gblksize): ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, adapted_kptjs, out=buf) nG = p1 - p0 for k, ji in enumerate(adapted_ji_idx): aoao = numpy.ndarray((nG, ni, nao), dtype=numpy.complex128, order='F', buffer=buf[k]) pqkR = numpy.ndarray((ni, nao, nG), buffer=pqkRbuf) pqkI = numpy.ndarray((ni, nao, nG), buffer=pqkIbuf) pqkR[:] = aoao.real.transpose(1, 2, 0) pqkI[:] = aoao.imag.transpose(1, 2, 0) aoao[:] = 0 pqkR = pqkR.reshape(-1, nG) pqkI = pqkI.reshape(-1, nG) zdotCN(kLR[p0:p1].T, kLI[p0:p1].T, pqkR.T, pqkI.T, -1, j3cR[k][naux:], j3cI[k][naux:], 1) naux0 = nauxs[uniq_kptji_id] for k, ji in enumerate(adapted_ji_idx): if is_zero(kpt) and gamma_point(adapted_kptjs[k]): v = fuse(j3cR[k]) else: v = fuse(j3cR[k] + j3cI[k] * 1j) if j2c[uniq_kptji_id][0] == 'CD': v = scipy.linalg.solve_triangular(j2c[uniq_kptji_id][1], v, lower=True, overwrite_b=True) else: v = lib.dot(j2c[uniq_kptji_id][1], v) feri['j3c/%d' % ji][:naux0, col0:col1] = v naux0 = nauxs[uniq_kptji_id] for k, ji in enumerate(adapted_ji_idx): v = feri['j3c/%d' % ji][:naux0] del (feri['j3c/%d' % ji]) feri['j3c/%d' % ji] = v
def _make_j3c(mydf, cell, auxcell, kptij_lst, cderi_file): log = logger.Logger(mydf.stdout, mydf.verbose) t1 = t0 = (time.clock(), time.time()) fused_cell, fuse = fuse_auxcell(mydf, mydf.auxcell) ao_loc = cell.ao_loc_nr() nao = ao_loc[-1] naux = auxcell.nao_nr() nkptij = len(kptij_lst) gs = mydf.gs Gv, Gvbase, kws = cell.get_Gv_weights(gs) b = cell.reciprocal_vectors() gxyz = lib.cartesian_prod([numpy.arange(len(x)) for x in Gvbase]) ngs = gxyz.shape[0] kptis = kptij_lst[:, 0] kptjs = kptij_lst[:, 1] kpt_ji = kptjs - kptis uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji) log.debug('Num uniq kpts %d', len(uniq_kpts)) log.debug2('uniq_kpts %s', uniq_kpts) # j2c ~ (-kpt_ji | kpt_ji) j2c = fused_cell.pbc_intor('int2c2e_sph', hermi=1, kpts=uniq_kpts) j2ctags = [] nauxs = [] t1 = log.timer_debug1('2c2e', *t1) if h5py.is_hdf5(cderi_file): feri = h5py.File(cderi_file) else: feri = h5py.File(cderi_file, 'w') for k, kpt in enumerate(uniq_kpts): aoaux = ft_ao.ft_ao(fused_cell, Gv, None, b, gxyz, Gvbase, kpt).T coulG = numpy.sqrt(mydf.weighted_coulG(kpt, False, gs)) kLR = (aoaux.real * coulG).T kLI = (aoaux.imag * coulG).T if not kLR.flags.c_contiguous: kLR = lib.transpose(kLR.T) if not kLI.flags.c_contiguous: kLI = lib.transpose(kLI.T) aoaux = None kLR1 = numpy.asarray(kLR[:, naux:], order='C') kLI1 = numpy.asarray(kLI[:, naux:], order='C') if is_zero(kpt): # kpti == kptj for p0, p1 in mydf.mpi_prange(0, ngs): j2cR = lib.ddot(kLR1[p0:p1].T, kLR[p0:p1]) j2cR = lib.ddot(kLI1[p0:p1].T, kLI[p0:p1], 1, j2cR, 1) j2c[k][naux:] -= mpi.allreduce(j2cR) j2c[k][:naux, naux:] = j2c[k][naux:, :naux].T else: for p0, p1 in mydf.mpi_prange(0, ngs): j2cR, j2cI = zdotCN(kLR1[p0:p1].T, kLI1[p0:p1].T, kLR[p0:p1], kLI[p0:p1]) j2cR = mpi.allreduce(j2cR) j2cI = mpi.allreduce(j2cI) j2c[k][naux:] -= j2cR + j2cI * 1j j2c[k][:naux, naux:] = j2c[k][naux:, :naux].T.conj() j2c[k] = fuse(fuse(j2c[k]).T).T try: feri['j2c/%d' % k] = scipy.linalg.cholesky(j2c[k], lower=True) j2ctags.append('CD') nauxs.append(naux) except scipy.linalg.LinAlgError as e: #msg =('===================================\n' # 'J-metric not positive definite.\n' # 'It is likely that gs is not enough.\n' # '===================================') #log.error(msg) #raise scipy.linalg.LinAlgError('\n'.join([e.message, msg])) w, v = scipy.linalg.eigh(j2c) log.debug2('metric linear dependency for kpt %s', uniq_kptji_id) log.debug2('cond = %.4g, drop %d bfns', w[0] / w[-1], numpy.count_nonzero(w < LINEAR_DEP_THR)) v = v[:, w > LINEAR_DEP_THR].T.conj() v /= numpy.sqrt(w[w > LINEAR_DEP_THR]).reshape(-1, 1) feri['j2c/%d' % k] = v j2ctags.append('eig') nauxs.append(v.shape[0]) kLR = kLI = kLR1 = kLI1 = coulG = None j2c = None aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9 j_only = numpy.all(aosym_s2) if gamma_point(kptij_lst): dtype = 'f8' else: dtype = 'c16' vbar = mydf.auxbar(fused_cell) vbar = fuse(vbar) ovlp = cell.pbc_intor('int1e_ovlp_sph', hermi=1, kpts=kptjs[aosym_s2]) ovlp = [lib.pack_tril(s) for s in ovlp] t1 = log.timer_debug1('aoaux and int2c', *t1) # Estimates the buffer size based on the last contraction in G-space. # This contraction requires to hold nkptj copies of (naux,?) array # simultaneously in memory. mem_now = max(comm.allgather(lib.current_memory()[0])) max_memory = max(2000, mydf.max_memory - mem_now) nkptj_max = max((uniq_inverse == x).sum() for x in set(uniq_inverse)) buflen = max( int( min(max_memory * .5e6 / 16 / naux / (nkptj_max + 2) / nao, nao / 3 / mpi.pool.size)), 1) chunks = (buflen, nao) j3c_jobs = grids2d_int3c_jobs(cell, auxcell, kptij_lst, chunks, j_only) log.debug1('max_memory = %d MB (%d in use) chunks %s', max_memory, mem_now, chunks) log.debug2('j3c_jobs %s', j3c_jobs) if j_only: int3c = wrap_int3c(cell, fused_cell, 'int3c2e_sph', 's2', 1, kptij_lst) else: int3c = wrap_int3c(cell, fused_cell, 'int3c2e_sph', 's1', 1, kptij_lst) idxb = numpy.tril_indices(nao) idxb = (idxb[0] * nao + idxb[1]).astype('i') aux_loc = fused_cell.ao_loc_nr('ssc' in 'int3c2e_sph') def gen_int3c(auxcell, job_id, ish0, ish1): dataname = 'j3c-chunks/%d' % job_id if dataname in feri: del (feri[dataname]) i0 = ao_loc[ish0] i1 = ao_loc[ish1] dii = i1 * (i1 + 1) // 2 - i0 * (i0 + 1) // 2 dij = (i1 - i0) * nao if j_only: buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dii + dii))) else: buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dij + dij))) auxranges = balance_segs(aux_loc[1:] - aux_loc[:-1], buflen) buflen = max([x[2] for x in auxranges]) buf = numpy.empty(nkptij * dij * buflen, dtype=dtype) buf1 = numpy.empty(dij * buflen, dtype=dtype) naux = aux_loc[-1] for kpt_id, kptij in enumerate(kptij_lst): key = '%s/%d' % (dataname, kpt_id) if aosym_s2[kpt_id]: shape = (naux, dii) else: shape = (naux, dij) if gamma_point(kptij): feri.create_dataset(key, shape, 'f8') else: feri.create_dataset(key, shape, 'c16') naux0 = 0 for istep, auxrange in enumerate(auxranges): log.alldebug2("aux_e2 job_id %d step %d", job_id, istep) sh0, sh1, nrow = auxrange sub_slice = (ish0, ish1, 0, cell.nbas, sh0, sh1) if j_only: mat = numpy.ndarray((nkptij, dii, nrow), dtype=dtype, buffer=buf) else: mat = numpy.ndarray((nkptij, dij, nrow), dtype=dtype, buffer=buf) mat = int3c(sub_slice, mat) for k, kptij in enumerate(kptij_lst): h5dat = feri['%s/%d' % (dataname, k)] v = lib.transpose(mat[k], out=buf1) if not j_only and aosym_s2[k]: idy = idxb[i0 * (i0 + 1) // 2:i1 * (i1 + 1) // 2] - i0 * nao out = numpy.ndarray((nrow, dii), dtype=v.dtype, buffer=mat[k]) v = numpy.take(v, idy, axis=1, out=out) if gamma_point(kptij): h5dat[naux0:naux0 + nrow] = v.real else: h5dat[naux0:naux0 + nrow] = v naux0 += nrow def ft_fuse(job_id, uniq_kptji_id, sh0, sh1): kpt = uniq_kpts[uniq_kptji_id] # kpt = kptj - kpti adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0] adapted_kptjs = kptjs[adapted_ji_idx] nkptj = len(adapted_kptjs) shls_slice = (auxcell.nbas, fused_cell.nbas) Gaux = ft_ao.ft_ao(fused_cell, Gv, shls_slice, b, gxyz, Gvbase, kpt) Gaux *= mydf.weighted_coulG(kpt, False, gs).reshape(-1, 1) kLR = Gaux.real.copy('C') kLI = Gaux.imag.copy('C') j2c = numpy.asarray(feri['j2c/%d' % uniq_kptji_id]) j2ctag = j2ctags[uniq_kptji_id] naux0 = j2c.shape[0] if is_zero(kpt): aosym = 's2' else: aosym = 's1' j3cR = [None] * nkptj j3cI = [None] * nkptj i0 = ao_loc[sh0] i1 = ao_loc[sh1] for k, idx in enumerate(adapted_ji_idx): key = 'j3c-chunks/%d/%d' % (job_id, idx) v = numpy.asarray(feri[key]) if is_zero(kpt): for i, c in enumerate(vbar): if c != 0: v[i] -= c * ovlp[k][i0 * (i0 + 1) // 2:i1 * (i1 + 1) // 2].ravel() j3cR[k] = numpy.asarray(v.real, order='C') if v.dtype == numpy.complex128: j3cI[k] = numpy.asarray(v.imag, order='C') v = None ncol = j3cR[0].shape[1] Gblksize = max(16, int(max_memory * 1e6 / 16 / ncol / (nkptj + 1))) # +1 for pqkRbuf/pqkIbuf Gblksize = min(Gblksize, ngs, 16384) pqkRbuf = numpy.empty(ncol * Gblksize) pqkIbuf = numpy.empty(ncol * Gblksize) buf = numpy.empty(nkptj * ncol * Gblksize, dtype=numpy.complex128) log.alldebug2(' blksize (%d,%d)', Gblksize, ncol) shls_slice = (sh0, sh1, 0, cell.nbas) for p0, p1 in lib.prange(0, ngs, Gblksize): dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, adapted_kptjs, out=buf) nG = p1 - p0 for k, ji in enumerate(adapted_ji_idx): aoao = dat[k].reshape(nG, ncol) pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf) pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf) pqkR[:] = aoao.real.T pqkI[:] = aoao.imag.T lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k][naux:], 1) lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k][naux:], 1) if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])): lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k][naux:], 1) lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k][naux:], 1) for k, idx in enumerate(adapted_ji_idx): if is_zero(kpt) and gamma_point(adapted_kptjs[k]): v = fuse(j3cR[k]) else: v = fuse(j3cR[k] + j3cI[k] * 1j) if j2ctag == 'CD': v = scipy.linalg.solve_triangular(j2c, v, lower=True, overwrite_b=True) else: v = lib.dot(j2c, v) feri['j3c-chunks/%d/%d' % (job_id, idx)][:naux0] = v t2 = t1 j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int) #for job_id, ish0, ish1 in mpi.work_share_partition(j3c_jobs): for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs): gen_int3c(fused_cell, job_id, ish0, ish1) t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2) for k, kpt in enumerate(uniq_kpts): ft_fuse(job_id, k, ish0, ish1) t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2) j3c_workers[job_id] = rank j3c_workers = mpi.allreduce(j3c_workers) log.debug2('j3c_workers %s', j3c_workers) j2c = kLRs = kLIs = ovlp = vbar = fuse = gen_int3c = ft_fuse = None t1 = log.timer_debug1('int3c and fuse', *t1) def get_segs_loc(aosym): off0 = numpy.asarray([ao_loc[i0] for x, i0, i1 in j3c_jobs]) off1 = numpy.asarray([ao_loc[i1] for x, i0, i1 in j3c_jobs]) if aosym: # s2 dims = off1 * (off1 + 1) // 2 - off0 * (off0 + 1) // 2 else: dims = (off1 - off0) * nao #dims = numpy.asarray([ao_loc[i1]-ao_loc[i0] for x,i0,i1 in j3c_jobs]) dims = numpy.hstack( [dims[j3c_workers == w] for w in range(mpi.pool.size)]) job_idx = numpy.hstack( [numpy.where(j3c_workers == w)[0] for w in range(mpi.pool.size)]) segs_loc = numpy.append(0, numpy.cumsum(dims)) segs_loc = [(segs_loc[j], segs_loc[j + 1]) for j in numpy.argsort(job_idx)] return segs_loc segs_loc_s1 = get_segs_loc(False) segs_loc_s2 = get_segs_loc(True) if 'j3c' in feri: del (feri['j3c']) segsize = (max(nauxs) + mpi.pool.size - 1) // mpi.pool.size naux0 = rank * segsize for k, kptij in enumerate(kptij_lst): naux1 = min(nauxs[uniq_inverse[k]], naux0 + segsize) nrow = max(0, naux1 - naux0) if gamma_point(kptij): dtype = 'f8' else: dtype = 'c16' if aosym_s2[k]: nao_pair = nao * (nao + 1) // 2 else: nao_pair = nao * nao feri.create_dataset('j3c/%d' % k, (nrow, nao_pair), dtype, maxshape=(None, nao_pair)) def load(k, p0, p1): naux1 = nauxs[uniq_inverse[k]] slices = [(min(i * segsize + p0, naux1), min(i * segsize + p1, naux1)) for i in range(mpi.pool.size)] segs = [] for p0, p1 in slices: val = [] for job_id, worker in enumerate(j3c_workers): if rank == worker: key = 'j3c-chunks/%d/%d' % (job_id, k) val.append(feri[key][p0:p1].ravel()) if val: segs.append(numpy.hstack(val)) else: segs.append(numpy.zeros(0)) return segs def save(k, p0, p1, segs): segs = mpi.alltoall(segs) naux1 = nauxs[uniq_inverse[k]] loc0, loc1 = min(p0, naux1 - naux0), min(p1, naux1 - naux0) nL = loc1 - loc0 if nL > 0: if aosym_s2[k]: segs = numpy.hstack([ segs[i0 * nL:i1 * nL].reshape(nL, -1) for i0, i1 in segs_loc_s2 ]) else: segs = numpy.hstack([ segs[i0 * nL:i1 * nL].reshape(nL, -1) for i0, i1 in segs_loc_s1 ]) feri['j3c/%d' % k][loc0:loc1] = segs mem_now = max(comm.allgather(lib.current_memory()[0])) max_memory = max(2000, min(8000, mydf.max_memory - mem_now)) if numpy.all(aosym_s2): if gamma_point(kptij_lst): blksize = max(16, int(max_memory * .5e6 / 8 / nao**2)) else: blksize = max(16, int(max_memory * .5e6 / 16 / nao**2)) else: blksize = max(16, int(max_memory * .5e6 / 16 / nao**2 / 2)) log.debug1('max_momory %d MB (%d in use), blksize %d', max_memory, mem_now, blksize) t2 = t1 with lib.call_in_background(save) as async_write: for k, kptji in enumerate(kptij_lst): for p0, p1 in lib.prange(0, segsize, blksize): segs = load(k, p0, p1) async_write(k, p0, p1, segs) t2 = log.timer_debug1( 'assemble k=%d %d:%d (in %d)' % (k, p0, p1, segsize), *t2) if 'j3c-chunks' in feri: del (feri['j3c-chunks']) if 'j3c-kptij' in feri: del (feri['j3c-kptij']) feri['j3c-kptij'] = kptij_lst t1 = log.alltimer_debug1('assembling j3c', *t1) feri.close()
def _assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file, fswap, log): t1 = (time.clock(), time.time()) cell = mydf.cell ao_loc = cell.ao_loc_nr() nao = ao_loc[-1] kptis = kptij_lst[:, 0] kptjs = kptij_lst[:, 1] kpt_ji = kptjs - kptis uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji) aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9 t2 = t1 j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int) #for job_id, ish0, ish1 in mpi.work_share_partition(j3c_jobs): for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs): gen_int3c(job_id, ish0, ish1) t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2) for k, kpt in enumerate(uniq_kpts): ft_fuse(job_id, k, ish0, ish1) t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2) j3c_workers[job_id] = rank j3c_workers = mpi.allreduce(j3c_workers) log.debug2('j3c_workers %s', j3c_workers) t1 = log.timer_debug1('int3c and fuse', *t1) # Pass 2 # Transpose 3-index tensor and save data in cderi_file feri = h5py.File(cderi_file, 'w') nauxs = [fswap['j2c/%d' % k].shape[0] for k, kpt in enumerate(uniq_kpts)] segsize = (max(nauxs) + mpi.pool.size - 1) // mpi.pool.size naux0 = rank * segsize for k, kptij in enumerate(kptij_lst): naux1 = min(nauxs[uniq_inverse[k]], naux0 + segsize) nrow = max(0, naux1 - naux0) if gamma_point(kptij): dtype = 'f8' else: dtype = 'c16' if aosym_s2[k]: nao_pair = nao * (nao + 1) // 2 else: nao_pair = nao * nao feri.create_dataset('j3c/%d' % k, (nrow, nao_pair), dtype, maxshape=(None, nao_pair)) def get_segs_loc(aosym): off0 = numpy.asarray([ao_loc[i0] for x, i0, i1 in j3c_jobs]) off1 = numpy.asarray([ao_loc[i1] for x, i0, i1 in j3c_jobs]) if aosym: # s2 dims = off1 * (off1 + 1) // 2 - off0 * (off0 + 1) // 2 else: dims = (off1 - off0) * nao #dims = numpy.asarray([ao_loc[i1]-ao_loc[i0] for x,i0,i1 in j3c_jobs]) dims = numpy.hstack( [dims[j3c_workers == w] for w in range(mpi.pool.size)]) job_idx = numpy.hstack( [numpy.where(j3c_workers == w)[0] for w in range(mpi.pool.size)]) segs_loc = numpy.append(0, numpy.cumsum(dims)) segs_loc = [(segs_loc[j], segs_loc[j + 1]) for j in numpy.argsort(job_idx)] return segs_loc segs_loc_s1 = get_segs_loc(False) segs_loc_s2 = get_segs_loc(True) job_ids = numpy.where(rank == j3c_workers)[0] def load(k, p0, p1): naux1 = nauxs[uniq_inverse[k]] slices = [(min(i * segsize + p0, naux1), min(i * segsize + p1, naux1)) for i in range(mpi.pool.size)] segs = [] for p0, p1 in slices: val = [ fswap['j3c-chunks/%d/%d' % (job, k)][p0:p1].ravel() for job in job_ids ] if val: segs.append(numpy.hstack(val)) else: segs.append(numpy.zeros(0)) return segs def save(k, p0, p1, segs): segs = mpi.alltoall(segs) naux1 = nauxs[uniq_inverse[k]] loc0, loc1 = min(p0, naux1 - naux0), min(p1, naux1 - naux0) nL = loc1 - loc0 if nL > 0: if aosym_s2[k]: segs = numpy.hstack([ segs[i0 * nL:i1 * nL].reshape(nL, -1) for i0, i1 in segs_loc_s2 ]) else: segs = numpy.hstack([ segs[i0 * nL:i1 * nL].reshape(nL, -1) for i0, i1 in segs_loc_s1 ]) feri['j3c/%d' % k][loc0:loc1] = segs mem_now = max(comm.allgather(lib.current_memory()[0])) max_memory = max(2000, min(8000, mydf.max_memory - mem_now)) if numpy.all(aosym_s2): if gamma_point(kptij_lst): blksize = max(16, int(max_memory * .5e6 / 8 / nao**2)) else: blksize = max(16, int(max_memory * .5e6 / 16 / nao**2)) else: blksize = max(16, int(max_memory * .5e6 / 16 / nao**2 / 2)) log.debug1('max_momory %d MB (%d in use), blksize %d', max_memory, mem_now, blksize) t2 = t1 with lib.call_in_background(save) as async_write: for k, kptji in enumerate(kptij_lst): for p0, p1 in lib.prange(0, segsize, blksize): segs = load(k, p0, p1) async_write(k, p0, p1, segs) t2 = log.timer_debug1( 'assemble k=%d %d:%d (in %d)' % (k, p0, p1, segsize), *t2) if 'j2c-' in fswap: j2c_kpts_lists = [] for k, kpt in enumerate(uniq_kpts): if ('j2c-/%d' % k) in fswap: adapted_ji_idx = numpy.where(uniq_inverse == k)[0] j2c_kpts_lists.append(adapted_ji_idx) for k in numpy.hstack(j2c_kpts_lists): val = [ numpy.asarray(fswap['j3c-/%d/%d' % (job, k)]).ravel() for job in job_ids ] val = mpi.gather(numpy.hstack(val)) if rank == 0: naux1 = fswap['j3c-/0/%d' % k].shape[0] if aosym_s2[k]: v = [ val[i0 * naux1:i1 * naux1].reshape(naux1, -1) for i0, i1 in segs_loc_s2 ] else: v = [ val[i0 * naux1:i1 * naux1].reshape(naux1, -1) for i0, i1 in segs_loc_s1 ] feri['j3c-/%d' % k] = numpy.hstack(v) if 'j3c-kptij' in feri: del (feri['j3c-kptij']) feri['j3c-kptij'] = kptij_lst t1 = log.alltimer_debug1('assembling j3c', *t1) feri.close()
def get_nuc(mydf, kpts=None): cell = mydf.cell if kpts is None: kpts_lst = numpy.zeros((1,3)) else: kpts_lst = numpy.reshape(kpts, (-1,3)) log = logger.Logger(mydf.stdout, mydf.verbose) t1 = t0 = (time.clock(), time.time()) nkpts = len(kpts_lst) nao = cell.nao_nr() nao_pair = nao * (nao+1) // 2 Gv, Gvbase, kws = cell.get_Gv_weights(mydf.gs) kpt_allow = numpy.zeros(3) if mydf.eta == 0: vpplocG = pseudo.pp_int.get_gth_vlocG_part1(cell, Gv) vpplocG = -numpy.einsum('ij,ij->j', cell.get_SI(Gv), vpplocG) vpplocG *= kws vGR = vpplocG.real vGI = vpplocG.imag vjR = numpy.zeros((nkpts,nao_pair)) vjI = numpy.zeros((nkpts,nao_pair)) else: nuccell = copy.copy(cell) half_sph_norm = .5/numpy.sqrt(numpy.pi) norm = half_sph_norm/gto.mole._gaussian_int(2, mydf.eta) chg_env = [mydf.eta, norm] ptr_eta = cell._env.size ptr_norm = ptr_eta + 1 chg_bas = [[ia, 0, 1, 1, 0, ptr_eta, ptr_norm, 0] for ia in range(cell.natm)] nuccell._atm = cell._atm nuccell._bas = numpy.asarray(chg_bas, dtype=numpy.int32) nuccell._env = numpy.hstack((cell._env, chg_env)) # PP-loc part1 is handled by fakenuc in _int_nuc_vloc vj = lib.asarray(mydf._int_nuc_vloc(nuccell, kpts_lst)) vjR = vj.real vjI = vj.imag t1 = log.timer_debug1('vnuc pass1: analytic int', *t1) charge = -cell.atom_charges() coulG = tools.get_coulG(cell, kpt_allow, gs=mydf.gs, Gv=Gv) coulG *= kws aoaux = ft_ao.ft_ao(nuccell, Gv) vGR = numpy.einsum('i,xi->x', charge, aoaux.real) * coulG vGI = numpy.einsum('i,xi->x', charge, aoaux.imag) * coulG max_memory = max(2000, mydf.max_memory-lib.current_memory()[0]) for k, pqkR, pqkI, p0, p1 \ in mydf.ft_loop(mydf.gs, kpt_allow, kpts_lst, max_memory=max_memory, aosym='s2'): # rho_ij(G) nuc(-G) / G^2 # = [Re(rho_ij(G)) + Im(rho_ij(G))*1j] [Re(nuc(G)) - Im(nuc(G))*1j] / G^2 if not gamma_point(kpts_lst[k]): vjI[k] += numpy.einsum('k,xk->x', vGR[p0:p1], pqkI) vjI[k] -= numpy.einsum('k,xk->x', vGI[p0:p1], pqkR) vjR[k] += numpy.einsum('k,xk->x', vGR[p0:p1], pqkR) vjR[k] += numpy.einsum('k,xk->x', vGI[p0:p1], pqkI) t1 = log.timer_debug1('contracting Vnuc', *t1) if mydf.eta != 0 and cell.dimension == 3: nucbar = sum([z/nuccell.bas_exp(i)[0] for i,z in enumerate(charge)]) nucbar *= numpy.pi/cell.vol ovlp = cell.pbc_intor('cint1e_ovlp_sph', 1, lib.HERMITIAN, kpts_lst) for k in range(nkpts): s = lib.pack_tril(ovlp[k]) vjR[k] -= nucbar * s.real vjI[k] -= nucbar * s.imag vj = [] for k, kpt in enumerate(kpts_lst): if gamma_point(kpt): vj.append(lib.unpack_tril(vjR[k])) else: vj.append(lib.unpack_tril(vjR[k]+vjI[k]*1j)) if kpts is None or numpy.shape(kpts) == (3,): vj = vj[0] return vj
def ft_loop(self, gs=None, kpt=numpy.zeros(3), kpts=None, shls_slice=None, max_memory=4000, aosym='s1'): ''' Fourier transform iterator for all kpti which satisfy kpt = kpts - kpti ''' cell = self.cell if gs is None: gs = self.gs if kpts is None: assert(gamma_point(kpt)) kpts = self.kpts kpts = numpy.asarray(kpts) nkpts = len(kpts) ao_loc = cell.ao_loc_nr() b = cell.reciprocal_vectors() Gv, Gvbase, kws = cell.get_Gv_weights(gs) gxyz = lib.cartesian_prod([numpy.arange(len(x)) for x in Gvbase]) ngs = gxyz.shape[0] if shls_slice is None: shls_slice = (0, cell.nbas, 0, cell.nbas) if aosym == 's2': assert(shls_slice[2] == 0) i0 = ao_loc[shls_slice[0]] i1 = ao_loc[shls_slice[1]] nij = i1*(i1+1)//2 - i0*(i0+1)//2 else: ni = ao_loc[shls_slice[1]] - ao_loc[shls_slice[0]] nj = ao_loc[shls_slice[3]] - ao_loc[shls_slice[2]] nij = ni*nj blksize = max(16, int(max_memory*.9e6/(nij*(nkpts+1)*16))) blksize = min(blksize, ngs, 16384) buf = [numpy.zeros(nij*blksize, dtype='D') for k in range(nkpts)] pqkRbuf = numpy.empty(nij*blksize) pqkIbuf = numpy.empty(nij*blksize) if aosym == 's2': for p0, p1 in self.prange(0, ngs, blksize): ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, kpts, out=buf) nG = p1 - p0 for k in range(nkpts): aoao = numpy.ndarray((nG,nij), dtype=numpy.complex128, order='F', buffer=buf[k]) pqkR = numpy.ndarray((nij,nG), buffer=pqkRbuf) pqkI = numpy.ndarray((nij,nG), buffer=pqkIbuf) pqkR[:] = aoao.real.T pqkI[:] = aoao.imag.T yield (k, pqkR, pqkI, p0, p1) aoao[:] = 0 # == buf[k][:] = 0 else: for p0, p1 in self.prange(0, ngs, blksize): ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, kpts, out=buf) nG = p1 - p0 for k in range(nkpts): aoao = numpy.ndarray((nG,ni,nj), dtype=numpy.complex128, order='F', buffer=buf[k]) pqkR = numpy.ndarray((ni,nj,nG), buffer=pqkRbuf) pqkI = numpy.ndarray((ni,nj,nG), buffer=pqkIbuf) pqkR[:] = aoao.real.transpose(1,2,0) pqkI[:] = aoao.imag.transpose(1,2,0) yield (k, pqkR.reshape(-1,nG), pqkI.reshape(-1,nG), p0, p1) aoao[:] = 0 # == buf[k][:] = 0
def get_nuc(mydf, kpts=None): cell = mydf.cell if kpts is None: kpts_lst = numpy.zeros((1, 3)) else: kpts_lst = numpy.reshape(kpts, (-1, 3)) log = logger.Logger(mydf.stdout, mydf.verbose) t1 = t0 = (time.clock(), time.time()) nkpts = len(kpts_lst) nao = cell.nao_nr() nao_pair = nao * (nao + 1) // 2 Gv, Gvbase, kws = cell.get_Gv_weights(mydf.gs) kpt_allow = numpy.zeros(3) if mydf.eta == 0: vpplocG = pseudo.pp_int.get_gth_vlocG_part1(cell, Gv) vpplocG = -numpy.einsum('ij,ij->j', cell.get_SI(Gv), vpplocG) vpplocG *= kws vGR = vpplocG.real vGI = vpplocG.imag vjR = numpy.zeros((nkpts, nao_pair)) vjI = numpy.zeros((nkpts, nao_pair)) else: nuccell = copy.copy(cell) half_sph_norm = .5 / numpy.sqrt(numpy.pi) norm = half_sph_norm / gto.mole._gaussian_int(2, mydf.eta) chg_env = [mydf.eta, norm] ptr_eta = cell._env.size ptr_norm = ptr_eta + 1 chg_bas = [[ia, 0, 1, 1, 0, ptr_eta, ptr_norm, 0] for ia in range(cell.natm)] nuccell._atm = cell._atm nuccell._bas = numpy.asarray(chg_bas, dtype=numpy.int32) nuccell._env = numpy.hstack((cell._env, chg_env)) # PP-loc part1 is handled by fakenuc in _int_nuc_vloc vj = lib.asarray(mydf._int_nuc_vloc(nuccell, kpts_lst)) vjR = vj.real vjI = vj.imag t1 = log.timer_debug1('vnuc pass1: analytic int', *t1) charge = -cell.atom_charges() coulG = tools.get_coulG(cell, kpt_allow, gs=mydf.gs, Gv=Gv) coulG *= kws aoaux = ft_ao.ft_ao(nuccell, Gv) vGR = numpy.einsum('i,xi->x', charge, aoaux.real) * coulG vGI = numpy.einsum('i,xi->x', charge, aoaux.imag) * coulG max_memory = max(2000, mydf.max_memory - lib.current_memory()[0]) for k, pqkR, pqkI, p0, p1 \ in mydf.ft_loop(mydf.gs, kpt_allow, kpts_lst, max_memory=max_memory, aosym='s2'): # rho_ij(G) nuc(-G) / G^2 # = [Re(rho_ij(G)) + Im(rho_ij(G))*1j] [Re(nuc(G)) - Im(nuc(G))*1j] / G^2 if not gamma_point(kpts_lst[k]): vjI[k] += numpy.einsum('k,xk->x', vGR[p0:p1], pqkI) vjI[k] -= numpy.einsum('k,xk->x', vGI[p0:p1], pqkR) vjR[k] += numpy.einsum('k,xk->x', vGR[p0:p1], pqkR) vjR[k] += numpy.einsum('k,xk->x', vGI[p0:p1], pqkI) t1 = log.timer_debug1('contracting Vnuc', *t1) vj = [] for k, kpt in enumerate(kpts_lst): if gamma_point(kpt): vj.append(lib.unpack_tril(vjR[k])) else: vj.append(lib.unpack_tril(vjR[k] + vjI[k] * 1j)) if kpts is None or numpy.shape(kpts) == (3, ): vj = vj[0] return vj
def _assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file, fswap, log): t1 = (time.clock(), time.time()) cell = mydf.cell ao_loc = cell.ao_loc_nr() nao = ao_loc[-1] kptis = kptij_lst[:,0] kptjs = kptij_lst[:,1] kpt_ji = kptjs - kptis uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji) aosym_s2 = numpy.einsum('ix->i', abs(kptis-kptjs)) < 1e-9 t2 = t1 j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int) #for job_id, ish0, ish1 in mpi.work_share_partition(j3c_jobs): for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs): gen_int3c(job_id, ish0, ish1) t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2) for k, kpt in enumerate(uniq_kpts): ft_fuse(job_id, k, ish0, ish1) t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2) j3c_workers[job_id] = rank j3c_workers = mpi.allreduce(j3c_workers) log.debug2('j3c_workers %s', j3c_workers) t1 = log.timer_debug1('int3c and fuse', *t1) # Pass 2 # Transpose 3-index tensor and save data in cderi_file feri = h5py.File(cderi_file, 'w') nauxs = [fswap['j2c/%d'%k].shape[0] for k, kpt in enumerate(uniq_kpts)] segsize = (max(nauxs)+mpi.pool.size-1) // mpi.pool.size naux0 = rank * segsize for k, kptij in enumerate(kptij_lst): naux1 = min(nauxs[uniq_inverse[k]], naux0+segsize) nrow = max(0, naux1-naux0) if gamma_point(kptij): dtype = 'f8' else: dtype = 'c16' if aosym_s2[k]: nao_pair = nao * (nao+1) // 2 else: nao_pair = nao * nao feri.create_dataset('j3c/%d'%k, (nrow,nao_pair), dtype, maxshape=(None,nao_pair)) def get_segs_loc(aosym): off0 = numpy.asarray([ao_loc[i0] for x,i0,i1 in j3c_jobs]) off1 = numpy.asarray([ao_loc[i1] for x,i0,i1 in j3c_jobs]) if aosym: # s2 dims = off1*(off1+1)//2 - off0*(off0+1)//2 else: dims = (off1-off0) * nao #dims = numpy.asarray([ao_loc[i1]-ao_loc[i0] for x,i0,i1 in j3c_jobs]) dims = numpy.hstack([dims[j3c_workers==w] for w in range(mpi.pool.size)]) job_idx = numpy.hstack([numpy.where(j3c_workers==w)[0] for w in range(mpi.pool.size)]) segs_loc = numpy.append(0, numpy.cumsum(dims)) segs_loc = [(segs_loc[j], segs_loc[j+1]) for j in numpy.argsort(job_idx)] return segs_loc segs_loc_s1 = get_segs_loc(False) segs_loc_s2 = get_segs_loc(True) job_ids = numpy.where(rank == j3c_workers)[0] def load(k, p0, p1): naux1 = nauxs[uniq_inverse[k]] slices = [(min(i*segsize+p0,naux1), min(i*segsize+p1,naux1)) for i in range(mpi.pool.size)] segs = [] for p0, p1 in slices: val = [fswap['j3c-chunks/%d/%d' % (job, k)][p0:p1].ravel() for job in job_ids] if val: segs.append(numpy.hstack(val)) else: segs.append(numpy.zeros(0)) return segs def save(k, p0, p1, segs): segs = mpi.alltoall(segs) naux1 = nauxs[uniq_inverse[k]] loc0, loc1 = min(p0, naux1-naux0), min(p1, naux1-naux0) nL = loc1 - loc0 if nL > 0: if aosym_s2[k]: segs = numpy.hstack([segs[i0*nL:i1*nL].reshape(nL,-1) for i0,i1 in segs_loc_s2]) else: segs = numpy.hstack([segs[i0*nL:i1*nL].reshape(nL,-1) for i0,i1 in segs_loc_s1]) feri['j3c/%d'%k][loc0:loc1] = segs mem_now = max(comm.allgather(lib.current_memory()[0])) max_memory = max(2000, min(8000, mydf.max_memory - mem_now)) if numpy.all(aosym_s2): if gamma_point(kptij_lst): blksize = max(16, int(max_memory*.5e6/8/nao**2)) else: blksize = max(16, int(max_memory*.5e6/16/nao**2)) else: blksize = max(16, int(max_memory*.5e6/16/nao**2/2)) log.debug1('max_momory %d MB (%d in use), blksize %d', max_memory, mem_now, blksize) t2 = t1 with lib.call_in_background(save) as async_write: for k, kptji in enumerate(kptij_lst): for p0, p1 in lib.prange(0, segsize, blksize): segs = load(k, p0, p1) async_write(k, p0, p1, segs) t2 = log.timer_debug1('assemble k=%d %d:%d (in %d)' % (k, p0, p1, segsize), *t2) if 'j2c-' in fswap: j2c_kpts_lists = [] for k, kpt in enumerate(uniq_kpts): if ('j2c-/%d' % k) in fswap: adapted_ji_idx = numpy.where(uniq_inverse == k)[0] j2c_kpts_lists.append(adapted_ji_idx) for k in numpy.hstack(j2c_kpts_lists): val = [numpy.asarray(fswap['j3c-/%d/%d' % (job, k)]).ravel() for job in job_ids] val = mpi.gather(numpy.hstack(val)) if rank == 0: naux1 = fswap['j3c-/0/%d'%k].shape[0] if aosym_s2[k]: v = [val[i0*naux1:i1*naux1].reshape(naux1,-1) for i0,i1 in segs_loc_s2] else: v = [val[i0*naux1:i1*naux1].reshape(naux1,-1) for i0,i1 in segs_loc_s1] feri['j3c-/%d'%k] = numpy.hstack(v) if 'j3c-kptij' in feri: del(feri['j3c-kptij']) feri['j3c-kptij'] = kptij_lst t1 = log.alltimer_debug1('assembling j3c', *t1) feri.close()
def make_kpt(uniq_kptji_id): # kpt = kptj - kpti kpt = uniq_kpts[uniq_kptji_id] log.debug1('kpt = %s', kpt) adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0] adapted_kptjs = kptjs[adapted_ji_idx] nkptj = len(adapted_kptjs) log.debug1('adapted_ji_idx = %s', adapted_ji_idx) kLR = kLRs[uniq_kptji_id] kLI = kLIs[uniq_kptji_id] if is_zero(kpt): # kpti == kptj aosym = 's2' nao_pair = nao*(nao+1)//2 vbar = fuse(mydf.auxbar(fused_cell)) ovlp = cell.pbc_intor('cint1e_ovlp_sph', hermi=1, kpts=adapted_kptjs) for k, ji in enumerate(adapted_ji_idx): ovlp[k] = lib.pack_tril(ovlp[k]) else: aosym = 's1' nao_pair = nao**2 max_memory = max(2000, mydf.max_memory-lib.current_memory()[0]) # nkptj for 3c-coulomb arrays plus 1 Lpq array buflen = min(max(int(max_memory*.6*1e6/16/naux/(nkptj+1)), 1), nao_pair) shranges = _guess_shell_ranges(cell, buflen, aosym) buflen = max([x[2] for x in shranges]) # +1 for a pqkbuf if aosym == 's2': Gblksize = max(16, int(max_memory*.2*1e6/16/buflen/(nkptj+1))) else: Gblksize = max(16, int(max_memory*.4*1e6/16/buflen/(nkptj+1))) Gblksize = min(Gblksize, ngs, 16384) pqkRbuf = numpy.empty(buflen*Gblksize) pqkIbuf = numpy.empty(buflen*Gblksize) # buf for ft_aopair buf = numpy.zeros((nkptj,buflen*Gblksize), dtype=numpy.complex128) col1 = 0 for istep, sh_range in enumerate(shranges): log.debug1('int3c2e [%d/%d], AO [%d:%d], ncol = %d', \ istep+1, len(shranges), *sh_range) bstart, bend, ncol = sh_range col0, col1 = col1, col1+ncol j3cR = [] j3cI = [] for k, idx in enumerate(adapted_ji_idx): v = numpy.asarray(feri['j3c/%d'%idx][:,col0:col1]) if is_zero(kpt): for i, c in enumerate(vbar): if c != 0: v[i] -= c * ovlp[k][col0:col1] j3cR.append(numpy.asarray(v.real, order='C')) if is_zero(kpt) and gamma_point(adapted_kptjs[k]): j3cI.append(None) else: j3cI.append(numpy.asarray(v.imag, order='C')) if aosym == 's2': shls_slice = (bstart, bend, 0, bend) for p0, p1 in lib.prange(0, ngs, Gblksize): ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, adapted_kptjs, out=buf) nG = p1 - p0 for k, ji in enumerate(adapted_ji_idx): aoao = numpy.ndarray((nG,ncol), dtype=numpy.complex128, order='F', buffer=buf[k]) pqkR = numpy.ndarray((ncol,nG), buffer=pqkRbuf) pqkI = numpy.ndarray((ncol,nG), buffer=pqkIbuf) pqkR[:] = aoao.real.T pqkI[:] = aoao.imag.T aoao[:] = 0 lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k][naux:], 1) lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k][naux:], 1) if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])): lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k][naux:], 1) lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k][naux:], 1) else: shls_slice = (bstart, bend, 0, cell.nbas) ni = ncol // nao for p0, p1 in lib.prange(0, ngs, Gblksize): ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, adapted_kptjs, out=buf) nG = p1 - p0 for k, ji in enumerate(adapted_ji_idx): aoao = numpy.ndarray((nG,ni,nao), dtype=numpy.complex128, order='F', buffer=buf[k]) pqkR = numpy.ndarray((ni,nao,nG), buffer=pqkRbuf) pqkI = numpy.ndarray((ni,nao,nG), buffer=pqkIbuf) pqkR[:] = aoao.real.transpose(1,2,0) pqkI[:] = aoao.imag.transpose(1,2,0) aoao[:] = 0 pqkR = pqkR.reshape(-1,nG) pqkI = pqkI.reshape(-1,nG) zdotCN(kLR[p0:p1].T, kLI[p0:p1].T, pqkR.T, pqkI.T, -1, j3cR[k][naux:], j3cI[k][naux:], 1) for k, ji in enumerate(adapted_ji_idx): if is_zero(kpt) and gamma_point(adapted_kptjs[k]): v = fuse(j3cR[k]) else: v = fuse(j3cR[k] + j3cI[k] * 1j) v = scipy.linalg.solve_triangular(j2c[uniq_kptji_id], v, lower=True, overwrite_b=True) feri['j3c/%d'%ji][:naux,col0:col1] = v
def ft_fuse(job_id, uniq_kptji_id, sh0, sh1): kpt = uniq_kpts[uniq_kptji_id] # kpt = kptj - kpti adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0] adapted_kptjs = kptjs[adapted_ji_idx] nkptj = len(adapted_kptjs) j2c = numpy.asarray(fswap['j2c/%d' % uniq_kptji_id]) j2ctag = j2ctags[uniq_kptji_id] naux0 = j2c.shape[0] if ('j2c-/%d' % uniq_kptji_id) in fswap: j2c_negative = numpy.asarray(fswap['j2c-/%d' % uniq_kptji_id]) else: j2c_negative = None if is_zero(kpt): aosym = 's2' else: aosym = 's1' if aosym == 's2' and cell.dimension == 3: vbar = fuse(mydf.auxbar(fused_cell)) ovlp = cell.pbc_intor('int1e_ovlp', hermi=1, kpts=adapted_kptjs) ovlp = [lib.pack_tril(s) for s in ovlp] j3cR = [None] * nkptj j3cI = [None] * nkptj i0 = ao_loc[sh0] i1 = ao_loc[sh1] for k, idx in enumerate(adapted_ji_idx): key = 'j3c-chunks/%d/%d' % (job_id, idx) v = numpy.asarray(fswap[key]) if aosym == 's2' and cell.dimension == 3: for i in numpy.where(vbar != 0)[0]: v[i] -= vbar[i] * ovlp[k][i0 * (i0 + 1) // 2:i1 * (i1 + 1) // 2].ravel() j3cR[k] = numpy.asarray(v.real, order='C') if v.dtype == numpy.complex128: j3cI[k] = numpy.asarray(v.imag, order='C') v = None ncol = j3cR[0].shape[1] Gblksize = max(16, int(max_memory * 1e6 / 16 / ncol / (nkptj + 1))) # +1 for pqkRbuf/pqkIbuf Gblksize = min(Gblksize, ngrids, 16384) pqkRbuf = numpy.empty(ncol * Gblksize) pqkIbuf = numpy.empty(ncol * Gblksize) buf = numpy.empty(nkptj * ncol * Gblksize, dtype=numpy.complex128) log.alldebug2('job_id %d blksize (%d,%d)', job_id, Gblksize, ncol) wcoulG = mydf.weighted_coulG(kpt, False, mesh) fused_cell_slice = (auxcell.nbas, fused_cell.nbas) if aosym == 's2': shls_slice = (sh0, sh1, 0, sh1) else: shls_slice = (sh0, sh1, 0, cell.nbas) for p0, p1 in lib.prange(0, ngrids, Gblksize): Gaux = ft_ao.ft_ao(fused_cell, Gv[p0:p1], fused_cell_slice, b, gxyz[p0:p1], Gvbase, kpt) Gaux *= wcoulG[p0:p1, None] kLR = Gaux.real.copy('C') kLI = Gaux.imag.copy('C') Gaux = None dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, adapted_kptjs, out=buf) nG = p1 - p0 for k, ji in enumerate(adapted_ji_idx): aoao = dat[k].reshape(nG, ncol) pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf) pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf) pqkR[:] = aoao.real.T pqkI[:] = aoao.imag.T lib.dot(kLR.T, pqkR.T, -1, j3cR[k][naux:], 1) lib.dot(kLI.T, pqkI.T, -1, j3cR[k][naux:], 1) if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])): lib.dot(kLR.T, pqkI.T, -1, j3cI[k][naux:], 1) lib.dot(kLI.T, pqkR.T, 1, j3cI[k][naux:], 1) kLR = kLI = None for k, idx in enumerate(adapted_ji_idx): if is_zero(kpt) and gamma_point(adapted_kptjs[k]): v = fuse(j3cR[k]) else: v = fuse(j3cR[k] + j3cI[k] * 1j) if j2ctag == 'CD': v = scipy.linalg.solve_triangular(j2c, v, lower=True, overwrite_b=True) fswap['j3c-chunks/%d/%d' % (job_id, idx)][:naux0] = v else: fswap['j3c-chunks/%d/%d' % (job_id, idx)][:naux0] = lib.dot( j2c, v) # low-dimension systems if j2c_negative is not None: fswap['j3c-/%d/%d' % (job_id, idx)] = lib.dot(j2c_negative, v)
def ft_fuse(job_id, uniq_kptji_id, sh0, sh1): kpt = uniq_kpts[uniq_kptji_id] # kpt = kptj - kpti adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0] adapted_kptjs = kptjs[adapted_ji_idx] nkptj = len(adapted_kptjs) Gaux = ft_ao.ft_ao(fused_cell, Gv, None, b, gxyz, Gvbase, kpt).T Gaux = fuse(Gaux) Gaux *= mydf.weighted_coulG(kpt, False, mesh) kLR = lib.transpose(numpy.asarray(Gaux.real, order='C')) kLI = lib.transpose(numpy.asarray(Gaux.imag, order='C')) j2c = numpy.asarray(fswap['j2c/%d'%uniq_kptji_id]) j2ctag = j2ctags[uniq_kptji_id] naux0 = j2c.shape[0] if ('j2c-/%d' % uniq_kptji_id) in fswap: j2c_negative = numpy.asarray(fswap['j2c-/%d'%uniq_kptji_id]) else: j2c_negative = None if is_zero(kpt): aosym = 's2' else: aosym = 's1' if aosym == 's2' and cell.dimension == 3: vbar = fuse(mydf.auxbar(fused_cell)) ovlp = cell.pbc_intor('int1e_ovlp', hermi=1, kpts=adapted_kptjs) ovlp = [lib.pack_tril(s) for s in ovlp] j3cR = [None] * nkptj j3cI = [None] * nkptj i0 = ao_loc[sh0] i1 = ao_loc[sh1] for k, idx in enumerate(adapted_ji_idx): key = 'j3c-chunks/%d/%d' % (job_id, idx) v = fuse(numpy.asarray(fswap[key])) if aosym == 's2' and cell.dimension == 3: for i in numpy.where(vbar != 0)[0]: v[i] -= vbar[i] * ovlp[k][i0*(i0+1)//2:i1*(i1+1)//2].ravel() j3cR[k] = numpy.asarray(v.real, order='C') if v.dtype == numpy.complex128: j3cI[k] = numpy.asarray(v.imag, order='C') v = None ncol = j3cR[0].shape[1] Gblksize = max(16, int(max_memory*1e6/16/ncol/(nkptj+1))) # +1 for pqkRbuf/pqkIbuf Gblksize = min(Gblksize, ngrids, 16384) pqkRbuf = numpy.empty(ncol*Gblksize) pqkIbuf = numpy.empty(ncol*Gblksize) buf = numpy.empty(nkptj*ncol*Gblksize, dtype=numpy.complex128) log.alldebug2(' blksize (%d,%d)', Gblksize, ncol) if aosym == 's2': shls_slice = (sh0, sh1, 0, sh1) else: shls_slice = (sh0, sh1, 0, cell.nbas) for p0, p1 in lib.prange(0, ngrids, Gblksize): dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, adapted_kptjs, out=buf) nG = p1 - p0 for k, ji in enumerate(adapted_ji_idx): aoao = dat[k].reshape(nG,ncol) pqkR = numpy.ndarray((ncol,nG), buffer=pqkRbuf) pqkI = numpy.ndarray((ncol,nG), buffer=pqkIbuf) pqkR[:] = aoao.real.T pqkI[:] = aoao.imag.T lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k], 1) lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k], 1) if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])): lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k], 1) lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k], 1) for k, idx in enumerate(adapted_ji_idx): if is_zero(kpt) and gamma_point(adapted_kptjs[k]): v = j3cR[k] else: v = j3cR[k] + j3cI[k] * 1j if j2ctag == 'CD': v = scipy.linalg.solve_triangular(j2c, v, lower=True, overwrite_b=True) fswap['j3c-chunks/%d/%d'%(job_id,idx)][:naux0] = v else: fswap['j3c-chunks/%d/%d'%(job_id,idx)][:naux0] = lib.dot(j2c, v) # low-dimension systems if j2c_negative is not None: fswap['j3c-/%d/%d'%(job_id,idx)] = lib.dot(j2c_negative, v)
def _make_j3c(mydf, cell, auxcell, kptij_lst, cderi_file): log = logger.Logger(mydf.stdout, mydf.verbose) t1 = t0 = (time.clock(), time.time()) fused_cell, fuse = fuse_auxcell(mydf, mydf.auxcell) ao_loc = cell.ao_loc_nr() nao = ao_loc[-1] naux = auxcell.nao_nr() nkptij = len(kptij_lst) mesh = mydf.mesh Gv, Gvbase, kws = cell.get_Gv_weights(mesh) b = cell.reciprocal_vectors() gxyz = lib.cartesian_prod([numpy.arange(len(x)) for x in Gvbase]) ngrids = gxyz.shape[0] kptis = kptij_lst[:, 0] kptjs = kptij_lst[:, 1] kpt_ji = kptjs - kptis uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji) log.debug('Num uniq kpts %d', len(uniq_kpts)) log.debug2('uniq_kpts %s', uniq_kpts) # j2c ~ (-kpt_ji | kpt_ji) j2c = fused_cell.pbc_intor('int2c2e', hermi=1, kpts=uniq_kpts) j2ctags = [] t1 = log.timer_debug1('2c2e', *t1) swapfile = tempfile.NamedTemporaryFile(dir=os.path.dirname(cderi_file)) fswap = lib.H5TmpFile(swapfile.name) # Unlink swapfile to avoid trash swapfile = None mem_now = max(comm.allgather(lib.current_memory()[0])) max_memory = max(2000, mydf.max_memory - mem_now) blksize = max(2048, int(max_memory * .5e6 / 16 / fused_cell.nao_nr())) log.debug2('max_memory %s (MB) blocksize %s', max_memory, blksize) for k, kpt in enumerate(uniq_kpts): coulG = mydf.weighted_coulG(kpt, False, mesh) j2c_k = numpy.zeros_like(j2c[k]) for p0, p1 in mydf.prange(0, ngrids, blksize): aoaux = ft_ao.ft_ao(fused_cell, Gv[p0:p1], None, b, gxyz[p0:p1], Gvbase, kpt).T LkR = numpy.asarray(aoaux.real, order='C') LkI = numpy.asarray(aoaux.imag, order='C') aoaux = None if is_zero(kpt): # kpti == kptj j2c_k[naux:] += lib.ddot(LkR[naux:] * coulG[p0:p1], LkR.T) j2c_k[naux:] += lib.ddot(LkI[naux:] * coulG[p0:p1], LkI.T) else: j2cR, j2cI = zdotCN(LkR[naux:] * coulG[p0:p1], LkI[naux:] * coulG[p0:p1], LkR.T, LkI.T) j2c_k[naux:] += j2cR + j2cI * 1j kLR = kLI = None j2c_k[:naux, naux:] = j2c_k[naux:, :naux].conj().T j2c[k] -= mpi.allreduce(j2c_k) j2c[k] = fuse(fuse(j2c[k]).T).T try: fswap['j2c/%d' % k] = scipy.linalg.cholesky(j2c[k], lower=True) j2ctags.append('CD') except scipy.linalg.LinAlgError as e: #msg =('===================================\n' # 'J-metric not positive definite.\n' # 'It is likely that mesh is not enough.\n' # '===================================') #log.error(msg) #raise scipy.linalg.LinAlgError('\n'.join([str(e), msg])) w, v = scipy.linalg.eigh(j2c[k]) log.debug2('metric linear dependency for kpt %s', k) log.debug2('cond = %.4g, drop %d bfns', w[0] / w[-1], numpy.count_nonzero(w < mydf.linear_dep_threshold)) v1 = v[:, w > mydf.linear_dep_threshold].T.conj() v1 /= numpy.sqrt(w[w > mydf.linear_dep_threshold]).reshape(-1, 1) fswap['j2c/%d' % k] = v1 if cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum': idx = numpy.where(w < -mydf.linear_dep_threshold)[0] if len(idx) > 0: fswap['j2c-/%d' % k] = (v[:, idx] / numpy.sqrt(-w[idx])).conj().T w = v = v1 = None j2ctags.append('eig') j2c = coulG = None aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9 j_only = numpy.all(aosym_s2) if gamma_point(kptij_lst): dtype = 'f8' else: dtype = 'c16' t1 = log.timer_debug1('aoaux and int2c', *t1) # Estimates the buffer size based on the last contraction in G-space. # This contraction requires to hold nkptj copies of (naux,?) array # simultaneously in memory. mem_now = max(comm.allgather(lib.current_memory()[0])) max_memory = max(2000, mydf.max_memory - mem_now) nkptj_max = max((uniq_inverse == x).sum() for x in set(uniq_inverse)) buflen = max( int( min(max_memory * .5e6 / 16 / naux / (nkptj_max + 2) / nao, nao / 3 / mpi.pool.size)), 1) chunks = (buflen, nao) j3c_jobs = grids2d_int3c_jobs(cell, auxcell, kptij_lst, chunks, j_only) log.debug1('max_memory = %d MB (%d in use) chunks %s', max_memory, mem_now, chunks) log.debug2('j3c_jobs %s', j3c_jobs) if j_only: int3c = wrap_int3c(cell, fused_cell, 'int3c2e', 's2', 1, kptij_lst) else: int3c = wrap_int3c(cell, fused_cell, 'int3c2e', 's1', 1, kptij_lst) idxb = numpy.tril_indices(nao) idxb = (idxb[0] * nao + idxb[1]).astype('i') aux_loc = fused_cell.ao_loc_nr('ssc' in 'int3c2e') def gen_int3c(job_id, ish0, ish1): dataname = 'j3c-chunks/%d' % job_id i0 = ao_loc[ish0] i1 = ao_loc[ish1] dii = i1 * (i1 + 1) // 2 - i0 * (i0 + 1) // 2 if j_only: dij = dii buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dii + dii))) else: dij = (i1 - i0) * nao buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dij + dij))) auxranges = balance_segs(aux_loc[1:] - aux_loc[:-1], buflen) buflen = max([x[2] for x in auxranges]) buf = numpy.empty(nkptij * dij * buflen, dtype=dtype) buf1 = numpy.empty(dij * buflen, dtype=dtype) naux = aux_loc[-1] for kpt_id, kptij in enumerate(kptij_lst): key = '%s/%d' % (dataname, kpt_id) if aosym_s2[kpt_id]: shape = (naux, dii) else: shape = (naux, dij) if gamma_point(kptij): fswap.create_dataset(key, shape, 'f8') else: fswap.create_dataset(key, shape, 'c16') naux0 = 0 for istep, auxrange in enumerate(auxranges): log.alldebug2("aux_e1 job_id %d step %d", job_id, istep) sh0, sh1, nrow = auxrange sub_slice = (ish0, ish1, 0, cell.nbas, sh0, sh1) mat = numpy.ndarray((nkptij, dij, nrow), dtype=dtype, buffer=buf) mat = int3c(sub_slice, mat) for k, kptij in enumerate(kptij_lst): h5dat = fswap['%s/%d' % (dataname, k)] v = lib.transpose(mat[k], out=buf1) if not j_only and aosym_s2[k]: idy = idxb[i0 * (i0 + 1) // 2:i1 * (i1 + 1) // 2] - i0 * nao out = numpy.ndarray((nrow, dii), dtype=v.dtype, buffer=mat[k]) v = numpy.take(v, idy, axis=1, out=out) if gamma_point(kptij): h5dat[naux0:naux0 + nrow] = v.real else: h5dat[naux0:naux0 + nrow] = v naux0 += nrow def ft_fuse(job_id, uniq_kptji_id, sh0, sh1): kpt = uniq_kpts[uniq_kptji_id] # kpt = kptj - kpti adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0] adapted_kptjs = kptjs[adapted_ji_idx] nkptj = len(adapted_kptjs) j2c = numpy.asarray(fswap['j2c/%d' % uniq_kptji_id]) j2ctag = j2ctags[uniq_kptji_id] naux0 = j2c.shape[0] if ('j2c-/%d' % uniq_kptji_id) in fswap: j2c_negative = numpy.asarray(fswap['j2c-/%d' % uniq_kptji_id]) else: j2c_negative = None if is_zero(kpt): aosym = 's2' else: aosym = 's1' if aosym == 's2' and cell.dimension == 3: vbar = fuse(mydf.auxbar(fused_cell)) ovlp = cell.pbc_intor('int1e_ovlp', hermi=1, kpts=adapted_kptjs) ovlp = [lib.pack_tril(s) for s in ovlp] j3cR = [None] * nkptj j3cI = [None] * nkptj i0 = ao_loc[sh0] i1 = ao_loc[sh1] for k, idx in enumerate(adapted_ji_idx): key = 'j3c-chunks/%d/%d' % (job_id, idx) v = numpy.asarray(fswap[key]) if aosym == 's2' and cell.dimension == 3: for i in numpy.where(vbar != 0)[0]: v[i] -= vbar[i] * ovlp[k][i0 * (i0 + 1) // 2:i1 * (i1 + 1) // 2].ravel() j3cR[k] = numpy.asarray(v.real, order='C') if v.dtype == numpy.complex128: j3cI[k] = numpy.asarray(v.imag, order='C') v = None ncol = j3cR[0].shape[1] Gblksize = max(16, int(max_memory * 1e6 / 16 / ncol / (nkptj + 1))) # +1 for pqkRbuf/pqkIbuf Gblksize = min(Gblksize, ngrids, 16384) pqkRbuf = numpy.empty(ncol * Gblksize) pqkIbuf = numpy.empty(ncol * Gblksize) buf = numpy.empty(nkptj * ncol * Gblksize, dtype=numpy.complex128) log.alldebug2('job_id %d blksize (%d,%d)', job_id, Gblksize, ncol) wcoulG = mydf.weighted_coulG(kpt, False, mesh) fused_cell_slice = (auxcell.nbas, fused_cell.nbas) if aosym == 's2': shls_slice = (sh0, sh1, 0, sh1) else: shls_slice = (sh0, sh1, 0, cell.nbas) for p0, p1 in lib.prange(0, ngrids, Gblksize): Gaux = ft_ao.ft_ao(fused_cell, Gv[p0:p1], fused_cell_slice, b, gxyz[p0:p1], Gvbase, kpt) Gaux *= wcoulG[p0:p1, None] kLR = Gaux.real.copy('C') kLI = Gaux.imag.copy('C') Gaux = None dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, adapted_kptjs, out=buf) nG = p1 - p0 for k, ji in enumerate(adapted_ji_idx): aoao = dat[k].reshape(nG, ncol) pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf) pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf) pqkR[:] = aoao.real.T pqkI[:] = aoao.imag.T lib.dot(kLR.T, pqkR.T, -1, j3cR[k][naux:], 1) lib.dot(kLI.T, pqkI.T, -1, j3cR[k][naux:], 1) if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])): lib.dot(kLR.T, pqkI.T, -1, j3cI[k][naux:], 1) lib.dot(kLI.T, pqkR.T, 1, j3cI[k][naux:], 1) kLR = kLI = None for k, idx in enumerate(adapted_ji_idx): if is_zero(kpt) and gamma_point(adapted_kptjs[k]): v = fuse(j3cR[k]) else: v = fuse(j3cR[k] + j3cI[k] * 1j) if j2ctag == 'CD': v = scipy.linalg.solve_triangular(j2c, v, lower=True, overwrite_b=True) fswap['j3c-chunks/%d/%d' % (job_id, idx)][:naux0] = v else: fswap['j3c-chunks/%d/%d' % (job_id, idx)][:naux0] = lib.dot( j2c, v) # low-dimension systems if j2c_negative is not None: fswap['j3c-/%d/%d' % (job_id, idx)] = lib.dot(j2c_negative, v) _assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file, fswap, log)
def _make_j3c(mydf, cell, auxcell, kptij_lst, cderi_file): log = logger.Logger(mydf.stdout, mydf.verbose) t1 = t0 = (time.clock(), time.time()) fused_cell, fuse = fuse_auxcell(mydf, mydf.auxcell) ao_loc = cell.ao_loc_nr() nao = ao_loc[-1] naux = auxcell.nao_nr() nkptij = len(kptij_lst) mesh = mydf.mesh Gv, Gvbase, kws = cell.get_Gv_weights(mesh) b = cell.reciprocal_vectors() gxyz = lib.cartesian_prod([numpy.arange(len(x)) for x in Gvbase]) ngrids = gxyz.shape[0] kptis = kptij_lst[:,0] kptjs = kptij_lst[:,1] kpt_ji = kptjs - kptis uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji) log.debug('Num uniq kpts %d', len(uniq_kpts)) log.debug2('uniq_kpts %s', uniq_kpts) # j2c ~ (-kpt_ji | kpt_ji) j2c = fused_cell.pbc_intor('int2c2e', hermi=1, kpts=uniq_kpts) j2ctags = [] t1 = log.timer_debug1('2c2e', *t1) swapfile = tempfile.NamedTemporaryFile(dir=os.path.dirname(cderi_file)) fswap = lib.H5TmpFile(swapfile.name) # Unlink swapfile to avoid trash swapfile = None for k, kpt in enumerate(uniq_kpts): coulG = mydf.weighted_coulG(kpt, False, mesh) j2c[k] = fuse(fuse(j2c[k]).T).T.copy() j2c_k = numpy.zeros_like(j2c[k]) for p0, p1 in mydf.mpi_prange(0, ngrids): aoaux = ft_ao.ft_ao(fused_cell, Gv[p0:p1], None, b, gxyz[p0:p1], Gvbase, kpt).T aoaux = fuse(aoaux) LkR = numpy.asarray(aoaux.real, order='C') LkI = numpy.asarray(aoaux.imag, order='C') aoaux = None if is_zero(kpt): # kpti == kptj j2cR = lib.dot(LkR*coulG[p0:p1], LkR.T) j2c_k += lib.dot(LkI*coulG[p0:p1], LkI.T, 1, j2cR, 1) else: # aoaux ~ kpt_ij, aoaux.conj() ~ kpt_kl j2cR, j2cI = zdotCN(LkR*coulG[p0:p1], LkI*coulG[p0:p1], LkR.T, LkI.T) j2c_k += j2cR + j2cI * 1j LkR = LkI = None j2c[k] -= mpi.allreduce(j2c_k) try: fswap['j2c/%d'%k] = scipy.linalg.cholesky(j2c[k], lower=True) j2ctags.append('CD') except scipy.linalg.LinAlgError: w, v = scipy.linalg.eigh(j2c[k]) log.debug2('metric linear dependency for kpt %s', k) log.debug2('cond = %.4g, drop %d bfns', w[0]/w[-1], numpy.count_nonzero(w<mydf.linear_dep_threshold)) v1 = v[:,w>mydf.linear_dep_threshold].T.conj() v1 /= numpy.sqrt(w[w>mydf.linear_dep_threshold]).reshape(-1,1) fswap['j2c/%d'%k] = v1 if cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum': idx = numpy.where(w < -mydf.linear_dep_threshold)[0] if len(idx) > 0: fswap['j2c-/%d'%k] = (v[:,idx]/numpy.sqrt(-w[idx])).conj().T w = v = v1 = v2 = None j2ctags.append('eig') aoaux = kLR = kLI = j2cR = j2cI = coulG = None j2c = None aosym_s2 = numpy.einsum('ix->i', abs(kptis-kptjs)) < 1e-9 j_only = numpy.all(aosym_s2) if gamma_point(kptij_lst): dtype = 'f8' else: dtype = 'c16' t1 = log.timer_debug1('aoaux and int2c', *t1) # Estimates the buffer size based on the last contraction in G-space. # This contraction requires to hold nkptj copies of (naux,?) array # simultaneously in memory. mem_now = max(comm.allgather(lib.current_memory()[0])) max_memory = max(2000, mydf.max_memory - mem_now) nkptj_max = max((uniq_inverse==x).sum() for x in set(uniq_inverse)) buflen = max(int(min(max_memory*.5e6/16/naux/(nkptj_max+2)/nao, nao/3/mpi.pool.size)), 1) chunks = (buflen, nao) j3c_jobs = mpi_df.grids2d_int3c_jobs(cell, auxcell, kptij_lst, chunks, j_only) log.debug1('max_memory = %d MB (%d in use) chunks %s', max_memory, mem_now, chunks) log.debug2('j3c_jobs %s', j3c_jobs) if j_only: int3c = wrap_int3c(cell, fused_cell, 'int3c2e', 's2', 1, kptij_lst) else: int3c = wrap_int3c(cell, fused_cell, 'int3c2e', 's1', 1, kptij_lst) idxb = numpy.tril_indices(nao) idxb = (idxb[0] * nao + idxb[1]).astype('i') aux_loc = fused_cell.ao_loc_nr(fused_cell.cart) def gen_int3c(job_id, ish0, ish1): dataname = 'j3c-chunks/%d' % job_id i0 = ao_loc[ish0] i1 = ao_loc[ish1] dii = i1*(i1+1)//2 - i0*(i0+1)//2 dij = (i1 - i0) * nao if j_only: buflen = max(8, int(max_memory*1e6/16/(nkptij*dii+dii))) else: buflen = max(8, int(max_memory*1e6/16/(nkptij*dij+dij))) auxranges = balance_segs(aux_loc[1:]-aux_loc[:-1], buflen) buflen = max([x[2] for x in auxranges]) buf = numpy.empty(nkptij*dij*buflen, dtype=dtype) buf1 = numpy.empty(dij*buflen, dtype=dtype) naux = aux_loc[-1] for kpt_id, kptij in enumerate(kptij_lst): key = '%s/%d' % (dataname, kpt_id) if aosym_s2[kpt_id]: shape = (naux, dii) else: shape = (naux, dij) if gamma_point(kptij): fswap.create_dataset(key, shape, 'f8') else: fswap.create_dataset(key, shape, 'c16') naux0 = 0 for istep, auxrange in enumerate(auxranges): log.alldebug2("aux_e1 job_id %d step %d", job_id, istep) sh0, sh1, nrow = auxrange sub_slice = (ish0, ish1, 0, cell.nbas, sh0, sh1) if j_only: mat = numpy.ndarray((nkptij,dii,nrow), dtype=dtype, buffer=buf) else: mat = numpy.ndarray((nkptij,dij,nrow), dtype=dtype, buffer=buf) mat = int3c(sub_slice, mat) for k, kptij in enumerate(kptij_lst): h5dat = fswap['%s/%d'%(dataname,k)] v = lib.transpose(mat[k], out=buf1) if not j_only and aosym_s2[k]: idy = idxb[i0*(i0+1)//2:i1*(i1+1)//2] - i0 * nao out = numpy.ndarray((nrow,dii), dtype=v.dtype, buffer=mat[k]) v = numpy.take(v, idy, axis=1, out=out) if gamma_point(kptij): h5dat[naux0:naux0+nrow] = v.real else: h5dat[naux0:naux0+nrow] = v naux0 += nrow def ft_fuse(job_id, uniq_kptji_id, sh0, sh1): kpt = uniq_kpts[uniq_kptji_id] # kpt = kptj - kpti adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0] adapted_kptjs = kptjs[adapted_ji_idx] nkptj = len(adapted_kptjs) Gaux = ft_ao.ft_ao(fused_cell, Gv, None, b, gxyz, Gvbase, kpt).T Gaux = fuse(Gaux) Gaux *= mydf.weighted_coulG(kpt, False, mesh) kLR = lib.transpose(numpy.asarray(Gaux.real, order='C')) kLI = lib.transpose(numpy.asarray(Gaux.imag, order='C')) j2c = numpy.asarray(fswap['j2c/%d'%uniq_kptji_id]) j2ctag = j2ctags[uniq_kptji_id] naux0 = j2c.shape[0] if ('j2c-/%d' % uniq_kptji_id) in fswap: j2c_negative = numpy.asarray(fswap['j2c-/%d'%uniq_kptji_id]) else: j2c_negative = None if is_zero(kpt): aosym = 's2' else: aosym = 's1' if aosym == 's2' and cell.dimension == 3: vbar = fuse(mydf.auxbar(fused_cell)) ovlp = cell.pbc_intor('int1e_ovlp', hermi=1, kpts=adapted_kptjs) ovlp = [lib.pack_tril(s) for s in ovlp] j3cR = [None] * nkptj j3cI = [None] * nkptj i0 = ao_loc[sh0] i1 = ao_loc[sh1] for k, idx in enumerate(adapted_ji_idx): key = 'j3c-chunks/%d/%d' % (job_id, idx) v = fuse(numpy.asarray(fswap[key])) if aosym == 's2' and cell.dimension == 3: for i in numpy.where(vbar != 0)[0]: v[i] -= vbar[i] * ovlp[k][i0*(i0+1)//2:i1*(i1+1)//2].ravel() j3cR[k] = numpy.asarray(v.real, order='C') if v.dtype == numpy.complex128: j3cI[k] = numpy.asarray(v.imag, order='C') v = None ncol = j3cR[0].shape[1] Gblksize = max(16, int(max_memory*1e6/16/ncol/(nkptj+1))) # +1 for pqkRbuf/pqkIbuf Gblksize = min(Gblksize, ngrids, 16384) pqkRbuf = numpy.empty(ncol*Gblksize) pqkIbuf = numpy.empty(ncol*Gblksize) buf = numpy.empty(nkptj*ncol*Gblksize, dtype=numpy.complex128) log.alldebug2(' blksize (%d,%d)', Gblksize, ncol) if aosym == 's2': shls_slice = (sh0, sh1, 0, sh1) else: shls_slice = (sh0, sh1, 0, cell.nbas) for p0, p1 in lib.prange(0, ngrids, Gblksize): dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b, gxyz[p0:p1], Gvbase, kpt, adapted_kptjs, out=buf) nG = p1 - p0 for k, ji in enumerate(adapted_ji_idx): aoao = dat[k].reshape(nG,ncol) pqkR = numpy.ndarray((ncol,nG), buffer=pqkRbuf) pqkI = numpy.ndarray((ncol,nG), buffer=pqkIbuf) pqkR[:] = aoao.real.T pqkI[:] = aoao.imag.T lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k], 1) lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k], 1) if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])): lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k], 1) lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k], 1) for k, idx in enumerate(adapted_ji_idx): if is_zero(kpt) and gamma_point(adapted_kptjs[k]): v = j3cR[k] else: v = j3cR[k] + j3cI[k] * 1j if j2ctag == 'CD': v = scipy.linalg.solve_triangular(j2c, v, lower=True, overwrite_b=True) fswap['j3c-chunks/%d/%d'%(job_id,idx)][:naux0] = v else: fswap['j3c-chunks/%d/%d'%(job_id,idx)][:naux0] = lib.dot(j2c, v) # low-dimension systems if j2c_negative is not None: fswap['j3c-/%d/%d'%(job_id,idx)] = lib.dot(j2c_negative, v) mpi_df._assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file, fswap, log)