def loop(self, blksize=None):
    """Yield the 'j3c' tensor stored in ``self._cderi`` in row blocks.

    ``self.build()`` is called first when ``self._cderi`` is None, and
    ``blksize`` falls back to ``self.blockdim`` when it is None.  Block
    boundaries are produced by ``self.prange``.

    A ``numpy.ndarray`` is sliced directly and each block is yielded as a
    C-ordered array.  Any other storage is read through
    ``lib.call_in_background``: the next block is requested before the
    current one is handed to the caller.
    """
    if self._cderi is None:
        self.build()
    if blksize is None:
        blksize = self.blockdim

    with addons.load(self._cderi, 'j3c') as feri:
        if isinstance(feri, numpy.ndarray):
            naoaux = feri.shape[0]
            for start, stop in self.prange(0, naoaux, blksize):
                yield numpy.asarray(feri[start:stop], order='C')
            return

        if isinstance(feri, h5py.Group):
            # starting from pyscf-1.7, DF tensor may be stored in
            # block format
            naoaux = feri['0'].shape[0]

            def read_rows(start, stop):
                return _load_from_h5g(feri, start, stop)
        else:
            naoaux = feri.shape[0]

            def read_rows(start, stop):
                return numpy.asarray(feri[start:stop])

        def load(start, stop, slot):
            # One-element lists are used as mutable slots so the background
            # call can hand its result back to this generator.
            slot[0] = read_rows(start, stop)

        ready = [None]
        pending = [None]
        with lib.call_in_background(load) as bload:
            bload(0, min(blksize, naoaux), pending)
            for start, stop in self.prange(blksize, naoaux, blksize):
                ready, pending = pending, ready
                bload(start, stop, pending)
                yield ready[0]
        # The last requested block is yielded once the background context
        # has been left.
        yield pending[0]
def bg_raise():
    """Submit a function that raises ValueError to a background call.

    The statement following the ``with`` block raises ``IndexError``.
    NOTE(review): which exception the caller finally observes depends on how
    ``lib.call_in_background`` reports background failures on exit -- confirm
    against its implementation.
    """
    def failing_task():
        raise ValueError

    with lib.call_in_background(failing_task) as submit:
        submit()

    raise IndexError
def kernel(mycc, eris=None):
    """Compute the CCSD(T) correction through the MPI contraction driver.

    Args:
        mycc: coupled-cluster object.  ``t1``, ``mo_coeff`` and ``_eris`` are
            read from it; ``mycc.ao2mo(mycc.mo_coeff)`` is called when
            ``_eris`` is missing or None.
        eris: kept for interface compatibility.  The value passed in is not
            used; the integrals are always taken from ``mycc._eris``.

    Returns:
        The real part of ``comm.allreduce`` applied to twice the locally
        accumulated energy.
    """
    # time.clock() was removed in Python 3.8; use the logger clock helpers.
    cpu0 = (logger.process_clock(), logger.perf_counter())
    ccsd._sync_(mycc)
    log = logger.new_logger(mycc)

    eris = getattr(mycc, '_eris', None)
    if eris is None:
        mycc.ao2mo(mycc.mo_coeff)
        eris = mycc._eris

    t1T = numpy.asarray(mycc.t1.T, order='C')
    nvir, nocc = t1T.shape

    fvo = eris.fock[nocc:, :nocc].copy()
    mo_energy = eris.mo_energy.copy()
    et_sum = numpy.zeros(1, dtype=t1T.dtype)
    drv = _ccsd.libcc.MPICCsd_t_contract
    # A list (not a tuple) so that ``contract`` can update it in place.
    cpu2 = [logger.process_clock(), logger.perf_counter()]

    def contract(slices, data):
        # Per the original note, ``data`` is expected to hold 12 arrays:
        # vvop_ab, vvop_ac, vvop_ba, vvop_bc, vvop_ca, vvop_cb,
        # vooo_a, vooo_b, vooo_c, t2T_a, t2T_b, t2T_c
        data_ptrs = [x.ctypes.data_as(ctypes.c_void_p) for x in data]
        data_ptrs = (ctypes.c_void_p*12)(*data_ptrs)
        drv(et_sum.ctypes.data_as(ctypes.c_void_p),
            mo_energy.ctypes.data_as(ctypes.c_void_p),
            t1T.ctypes.data_as(ctypes.c_void_p),
            fvo.ctypes.data_as(ctypes.c_void_p),
            ctypes.c_int(nocc), ctypes.c_int(nvir),
            (ctypes.c_int*6)(*slices), data_ptrs)
        cpu2[:] = log.alltimer_debug1('contract'+str(slices), *cpu2)

    with GlobalDataHandler(mycc) as daemon:
        v_seg_ranges = daemon.data_partition
        # Enumerate segment triples with c <= b <= a.
        tasks = []
        for ka, (a0, a1) in enumerate(v_seg_ranges):
            for kb, (b0, b1) in enumerate(v_seg_ranges[:ka+1]):
                for c0, c1 in v_seg_ranges[:kb+1]:
                    tasks.append((a0, a1, b0, b1, c0, c1))
        log.debug('ntasks = %d', len(tasks))

        task_count = 0
        with lib.call_in_background(contract) as async_contract:
            #for task in mpi.static_partition(tasks):
            #for task in mpi.work_stealing_partition(tasks):
            for task in mpi.work_share_partition(tasks, loadmin=2):
                log.alldebug2('request for segment %s', task)
                data = [None] * 12
                daemon.request_(task, data)
                async_contract(task, data)
                task_count += 1

        log.alldebug1('task_count = %d', task_count)

    et = comm.allreduce(et_sum[0] * 2).real
    log.timer('CCSD(T)', *cpu0)
    log.note('CCSD(T) correction = %.15g', et)
    return et
def start(self, interval=0.02):
    """Build the local 'vvop' dataset and launch the data-serving thread.

    The dataset is created in a fresh ``lib.H5TmpFile`` stored on
    ``self.eri_tmp`` and filled block by block from ``eris.ovov`` and the
    ``eris.vvvo`` pieces exchanged with ``mpi.alltoall_new``.

    Args:
        interval: seconds the serving thread sleeps between polls for
            ``INQUIRY`` messages.

    Returns:
        The started ``threading.Thread``.  It answers inquiries with
        ``self._get_tensor`` and returns when a task named 'Done' arrives.
    """
    mycc = self._cc
    log = logger.new_logger(mycc)
    cpu1 = (logger.process_clock(), logger.perf_counter())
    eris = mycc._eris
    t2T = mycc.t2.transpose(2, 3, 0, 1)

    nocc, nvir = mycc.t1.shape
    nmo = nocc + nvir
    vloc0, vloc1 = self.vranges[rank]
    nvir_seg = vloc1 - vloc0

    mem_avail = min(24000, mycc.max_memory - lib.current_memory()[0])
    rows_in_budget = int(mem_avail * .3e6 / 8 / (nvir * nocc * nmo))
    blksize = min(nvir_seg // 4 + 1, max(16, rows_in_budget))
    self.eri_tmp = lib.H5TmpFile()
    vvop = self.eri_tmp.create_dataset(
        'vvop', (nvir_seg, nvir, nocc, nmo), 'f8')

    def write_block(j0, j1, vvvo):
        nrow = j1 - j0
        block = numpy.empty((nrow, nvir, nocc, nmo), dtype=t2T.dtype)
        block[:, :, :, :nocc] = \
            eris.ovov[:, j0:j1].conj().transpose(1, 3, 0, 2)
        for iseg, (q0, q1) in enumerate(self.vranges):
            piece = vvvo[iseg].reshape(q1 - q0, nvir, nrow, nocc)
            block[:, q0:q1, :, nocc:] = piece.transpose(2, 0, 3, 1)
        vvop[j0:j1] = block

    with lib.call_in_background(write_block) as async_write:
        for p0, p1 in mpi.prange(vloc0, vloc1, blksize):
            sub_locs = comm.allgather((p0, p1))
            send_bufs = [eris.vvvo[:, :, q0:q1] for q0, q1 in sub_locs]
            vvvo = mpi.alltoall_new(send_bufs, split_recvbuf=True)
            # Dataset rows are indexed relative to this rank's first row.
            async_write(p0 - vloc0, p1 - vloc0, vvvo)
            cpu1 = log.timer_debug1('transpose %d:%d' % (p0, p1), *cpu1)

    def send_data():
        while True:
            while comm.Iprobe(source=MPI.ANY_SOURCE, tag=INQUIRY):
                tensors, dest = comm.recv(source=MPI.ANY_SOURCE, tag=INQUIRY)
                for task, slices in tensors:
                    if task == 'Done':
                        return
                    mpi.send(self._get_tensor(task, slices), dest,
                             tag=TRANSFER_DATA)
            time.sleep(interval)

    daemon = threading.Thread(target=send_data)
    daemon.start()
    return daemon
def start(self, interval=0.02):
    """Build the local 'vvop' dataset and launch the data-serving thread.

    The dataset is created in a fresh ``lib.H5TmpFile`` stored on
    ``self.eri_tmp`` and filled block by block from ``eris.ovov`` and the
    ``eris.vvvo`` pieces exchanged with ``mpi.alltoall``.

    Args:
        interval: seconds the serving thread sleeps between polls for
            ``INQUIRY`` messages.

    Returns:
        The started ``threading.Thread``.  It answers inquiries with
        ``self._get_tensor`` and returns when a task named 'Done' arrives.
    """
    mycc = self._cc
    log = logger.new_logger(mycc)
    # time.clock() was removed in Python 3.8; use the logger clock helpers.
    cpu1 = (logger.process_clock(), logger.perf_counter())
    eris = mycc._eris
    t2T = mycc.t2.transpose(2,3,0,1)

    nocc, nvir = mycc.t1.shape
    nmo = nocc + nvir
    vloc0, vloc1 = self.vranges[rank]
    nvir_seg = vloc1 - vloc0

    max_memory = min(24000, mycc.max_memory - lib.current_memory()[0])
    blksize = min(nvir_seg//4+1, max(16, int(max_memory*.3e6/8/(nvir*nocc*nmo))))
    self.eri_tmp = lib.H5TmpFile()
    vvop = self.eri_tmp.create_dataset('vvop', (nvir_seg,nvir,nocc,nmo), 'f8')

    def save_vvop(j0, j1, vvvo):
        buf = numpy.empty((j1-j0,nvir,nocc,nmo), dtype=t2T.dtype)
        buf[:,:,:,:nocc] = eris.ovov[:,j0:j1].conj().transpose(1,3,0,2)
        for k, (q0, q1) in enumerate(self.vranges):
            blk = vvvo[k].reshape(q1-q0,nvir,j1-j0,nocc)
            buf[:,q0:q1,:,nocc:] = blk.transpose(2,0,3,1)
        vvop[j0:j1] = buf

    with lib.call_in_background(save_vvop) as async_save:
        for p0, p1 in mpi.prange(vloc0, vloc1, blksize):
            # Dataset rows are indexed relative to this rank's first row.
            j0, j1 = p0 - vloc0, p1 - vloc0
            sub_locs = comm.allgather((p0,p1))
            vvvo = mpi.alltoall([eris.vvvo[:,:,q0:q1] for q0, q1 in sub_locs],
                                split_recvbuf=True)
            async_save(j0, j1, vvvo)
            cpu1 = log.timer_debug1('transpose %d:%d'%(p0,p1), *cpu1)

    def send_data():
        while True:
            while comm.Iprobe(source=MPI.ANY_SOURCE, tag=INQUIRY):
                tensors, dest = comm.recv(source=MPI.ANY_SOURCE, tag=INQUIRY)
                for task, slices in tensors:
                    if task == 'Done':
                        return
                    else:
                        mpi.send(self._get_tensor(task, slices), dest,
                                 tag=TRANSFER_DATA)
            time.sleep(interval)

    daemon = threading.Thread(target=send_data)
    daemon.start()
    return daemon
def _sort_eri(mycc, eris, nocc, nvir, vvop, log):
    """Fill ``vvop`` with irrep-sorted, conjugated ovov/ovvv integrals.

    For every virtual index ``j`` an array of shape (nvir, nocc, nmo) is
    written to ``vvop[vrank[j]]``.  Its first ``nocc`` entries along the last
    axis come from ``eris.ovov`` and the rest from ``eris.get_ovvv``.

    Args:
        mycc: provides ``mol``, ``max_memory`` and ``async_io``.
        eris: provides ``ovov``, ``get_ovvv`` and ``mo_coeff``.
        nocc, nvir: numbers of occupied and virtual orbitals.
        vvop: destination supporting ``__setitem__`` and ``dtype``.
        log: logger used for debug output and timing.

    Returns:
        int32 array of orbital symmetry labels: the labels from
        ``symm.addons.label_orb_symm`` modulo 10 when ``mol.symmetry`` is
        set, otherwise ``nmo`` zeros.
    """
    # time.clock() was removed in Python 3.8.
    # NOTE(review): assumes ``log.timer_debug1`` measures with the same
    # process-time / perf-counter clocks -- confirm against the logger.
    cpu1 = (time.process_time(), time.perf_counter())
    mol = mycc.mol
    nmo = nocc + nvir

    if mol.symmetry:
        orbsym = symm.addons.label_orb_symm(mol, mol.irrep_id, mol.symm_orb,
                                            eris.mo_coeff, check=False)
        orbsym = numpy.asarray(orbsym, dtype=numpy.int32) % 10
    else:
        orbsym = numpy.zeros(nmo, dtype=numpy.int32)

    o_sorted = _irrep_argsort(orbsym[:nocc])
    v_sorted = _irrep_argsort(orbsym[nocc:])
    # Position of each original virtual index in the sorted order.
    vrank = numpy.argsort(v_sorted)

    max_memory = max(0, mycc.max_memory - lib.current_memory()[0])
    max_memory = min(8000, max_memory*.9)
    blksize = min(nvir, max(16, int(max_memory*1e6/8/(nvir*nocc*nmo))))
    log.debug1('_sort_eri max_memory %g blksize %d', max_memory, blksize)
    dtype = vvop.dtype
    with lib.call_in_background(vvop.__setitem__, sync=not mycc.async_io) as save:
        # Two buffers are alternated so the one handed to ``save`` is not
        # overwritten while the next slab is being assembled.
        bufopv = numpy.empty((nocc,nmo,nvir), dtype=dtype)
        buf1 = numpy.empty_like(bufopv)
        for j0, j1 in lib.prange(0, nvir, blksize):
            ovov = numpy.asarray(eris.ovov[:,j0:j1])
            ovvv = eris.get_ovvv(slice(None), slice(j0,j1))
            for j in range(j0,j1):
                oov = ovov[o_sorted,j-j0]
                ovv = ovvv[o_sorted,j-j0]
                bufopv[:,:nocc,:] = oov[:,o_sorted][:,:,v_sorted].conj()
                bufopv[:,nocc:,:] = ovv[:,v_sorted][:,:,v_sorted].conj()
                save(vrank[j], bufopv.transpose(2,0,1))
                bufopv, buf1 = buf1, bufopv
            cpu1 = log.timer_debug1('transpose %d:%d'%(j0,j1), *cpu1)

    return orbsym
def _sort_eri(mycc, eris, nocc, nvir, vvop, log):
    """Fill ``vvop`` with irrep-sorted, conjugated ovov/ovvv integrals.

    For every virtual index ``j`` an array of shape (nvir, nocc, nmo) is
    written to ``vvop[vrank[j]]``.  Its first ``nocc`` entries along the last
    axis come from ``eris.ovov`` and the rest from ``eris.get_ovvv``.

    Args:
        mycc: provides ``mol``, ``max_memory`` and ``async_io``.
        eris: provides ``ovov``, ``get_ovvv`` and ``mo_coeff``.
        nocc, nvir: numbers of occupied and virtual orbitals.
        vvop: destination supporting ``__setitem__`` and ``dtype``.
        log: logger used for debug output and timing.

    Returns:
        int32 array of orbital symmetry labels: the labels from
        ``symm.addons.label_orb_symm`` modulo 10 when ``mol.symmetry`` is
        set, otherwise ``nmo`` zeros.
    """
    # time.clock() was removed in Python 3.8.
    # NOTE(review): assumes ``log.timer_debug1`` measures with the same
    # process-time / perf-counter clocks -- confirm against the logger.
    cpu1 = (time.process_time(), time.perf_counter())
    mol = mycc.mol
    nmo = nocc + nvir

    if mol.symmetry:
        orbsym = symm.addons.label_orb_symm(mol, mol.irrep_id, mol.symm_orb,
                                            eris.mo_coeff, check=False)
        orbsym = numpy.asarray(orbsym, dtype=numpy.int32) % 10
    else:
        orbsym = numpy.zeros(nmo, dtype=numpy.int32)

    o_sorted = _irrep_argsort(orbsym[:nocc])
    v_sorted = _irrep_argsort(orbsym[nocc:])
    # Position of each original virtual index in the sorted order.
    vrank = numpy.argsort(v_sorted)

    max_memory = max(0, mycc.max_memory - lib.current_memory()[0])
    max_memory = min(8000, max_memory*.9)
    blksize = min(nvir, max(16, int(max_memory*1e6/8/(nvir*nocc*nmo))))
    log.debug1('_sort_eri max_memory %g blksize %d', max_memory, blksize)
    dtype = vvop.dtype
    with lib.call_in_background(vvop.__setitem__, sync=not mycc.async_io) as save:
        # Two buffers are alternated so the one handed to ``save`` is not
        # overwritten while the next slab is being assembled.
        bufopv = numpy.empty((nocc,nmo,nvir), dtype=dtype)
        buf1 = numpy.empty_like(bufopv)
        for j0, j1 in lib.prange(0, nvir, blksize):
            ovov = numpy.asarray(eris.ovov[:,j0:j1])
            ovvv = eris.get_ovvv(slice(None), slice(j0,j1))
            for j in range(j0,j1):
                oov = ovov[o_sorted,j-j0]
                ovv = ovvv[o_sorted,j-j0]
                bufopv[:,:nocc,:] = oov[:,o_sorted][:,:,v_sorted].conj()
                bufopv[:,nocc:,:] = ovv[:,v_sorted][:,:,v_sorted].conj()
                save(vrank[j], bufopv.transpose(2,0,1))
                bufopv, buf1 = buf1, bufopv
            cpu1 = log.timer_debug1('transpose %d:%d'%(j0,j1), *cpu1)

    return orbsym
def general(eri, mo_coeffs, erifile, dataname='eri_mo',
            ioblk_size=IOBLK_SIZE, compact=True, verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.

    Args:
        eri : 8-fold reduced eri vector
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.

    Kwargs
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By
            assigning different dataname, the existed integral file can be
            reused.  If the erifile contains the dataname, the new integrals
            data will overwrite the old one.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve
            performance
        compact : bool
            When compact is True, depending on the four orbital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation
            symmetry, and return the "plain" MO integrals

    Returns:
        ``erifile`` as passed in.  When the number of ij pairs is zero an
        empty ndarray of shape (0, nkl_pair) is returned instead and nothing
        is written.

    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        Assume eri's are 8-fold reduced.
        nij/nkl_pair = npair or i*j/k*l if only transforming a subset

        First half transform:
            Initialize half_eri of size (nij_pair,npair)
                For lo = 1 -> npair
                    Unpack row lo
                    Unpack row lo to matrix E_{uv}^{lo}
                    Transform C_ui^+*E*C_nj -> E_{ij}^{lo}
                    Ravel or pack E_{ij}^{lo}
                    Save E_{ij}^{lo} -> half_eri[:,lo]

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
                For ij = 1 -> nij_pair
                    Load and unpack half_eri[ij,:] -> E_{lo}^{ij}
                    Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                    Repack E_{kl}^{ij}
                    Save E_{kl}^{ij} -> h5d_eri[ij,:]

        Each matrix is indexed by the composite index ij x kl, where ij/kl is
        either npair or ixj/kxl, if only a subset of MOs are being
        transformed.  Since entire rows or columns need to be read in, the
        arrays are chunked such that IOBLK_SIZE = row/col x chunking col/row.
        For example, for the first half transform, we would save in
        nij_pair x IOBLK_SIZE/nij_pair, then load in IOBLK_SIZE/nkl_pair x
        npair for the second half transform.

        ------ kl ----->
        |jxl
        |
        ij
        |
        |
        v

        As a first guess, the chunking size is jxl. If the super-rows/cols
        are larger than IOBLK_SIZE, then the chunk rectangle jxl is trimmed
        accordingly. The pathological limiting case is where the dimensions
        nao_pair, nij_pair, or nkl_pair are so large that the arrays are
        chunked 1x1, in which case IOBLK_SIZE needs to be increased.
    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    eri_ao = numpy.asarray(eri, order='C')
    nao, nmoi = mo_coeffs[0].shape
    nmoj = mo_coeffs[1].shape[1]
    nao_pair = nao*(nao+1)//2
    ijmosym, nij_pair, moij, ijshape = _conc_mos(mo_coeffs[0], mo_coeffs[1], compact)
    klmosym, nkl_pair, mokl, klshape = _conc_mos(mo_coeffs[2], mo_coeffs[3], compact)
    # Convert (start, stop) pairs into (start, count) pairs for the C driver.
    ijshape = (ijshape[0], ijshape[1]-ijshape[0],
               ijshape[2], ijshape[3]-ijshape[2])
    dtype = numpy.result_type(eri, *mo_coeffs)
    typesize = dtype.itemsize/1e6  # in MB

    if nij_pair == 0:
        return numpy.empty((nij_pair,nkl_pair))

    ij_red = ijmosym == 's1'
    kl_red = klmosym == 's1'

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            feri = h5py.File(erifile, 'a')
            if dataname in feri:
                del(feri[dataname])
        else:
            feri = h5py.File(erifile,'w',libver='latest')
    else:
        assert(isinstance(erifile, h5py.Group))
        feri = erifile

    h5d_eri = feri.create_dataset(dataname,(nij_pair,nkl_pair), dtype.char)
    feri_swap = lib.H5TmpFile(libver='latest')
    chunk_size = min(nao_pair, max(4, int(ioblk_size*1e6/8/nao_pair)))

    log.debug('Memory information:')
    log.debug(' IOBLK_SIZE (MB): {} chunk_size: {}'
              .format(ioblk_size, chunk_size))
    log.debug(' Final disk eri size (MB): {:.3g}'
              .format(nij_pair*nkl_pair*typesize))
    log.debug(' Half transformed eri size (MB): {:.3g}'
              .format(nij_pair*nao_pair*typesize))
    log.debug(' RAM buffer (MB): {:.3g}'
              .format(nij_pair*IOBLK_SIZE*typesize*2))

    if eri_ao.size == nao_pair**2:  # 4-fold symmetry
        # half_e1 first transforms the indices which are contiguous in memory
        # transpose the 4-fold integrals to make ij the contiguous indices
        eri_ao = lib.transpose(eri_ao)
        ftrans = _ao2mo.libao2mo.AO2MOtranse1_incore_s4
    elif eri_ao.size == nao_pair*(nao_pair+1)//2:
        ftrans = _ao2mo.libao2mo.AO2MOtranse1_incore_s8
    else:
        raise NotImplementedError

    if ijmosym == 's2':
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_s2
    elif nmoi <= nmoj:
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_iltj
    else:
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_igtj
    fdrv = getattr(_ao2mo.libao2mo, 'AO2MOnr_e1incore_drv')

    def save(piece, buf):
        feri_swap[str(piece)] = buf.T

    # transform \mu\nu -> ij
    # time.clock() was removed in Python 3.8; use the logger clock helpers.
    cput0 = logger.process_clock(), logger.perf_counter()
    with lib.call_in_background(save) as async_write:
        for istep, (p0, p1) in enumerate(lib.prange(0, nao_pair, chunk_size)):
            if dtype == numpy.double:
                buf = numpy.empty((p1-p0, nij_pair))
                fdrv(ftrans, fmmm,
                     buf.ctypes.data_as(ctypes.c_void_p),
                     eri_ao.ctypes.data_as(ctypes.c_void_p),
                     moij.ctypes.data_as(ctypes.c_void_p),
                     ctypes.c_int(p0), ctypes.c_int(p1-p0),
                     ctypes.c_int(nao),
                     ctypes.c_int(ijshape[0]), ctypes.c_int(ijshape[1]),
                     ctypes.c_int(ijshape[2]), ctypes.c_int(ijshape[3]))
            else:  # complex
                tmp = numpy.empty((p1-p0, nao_pair))
                if eri_ao.size == nao_pair**2:  # 4-fold symmetry
                    tmp = eri_ao[p0:p1]
                else:  # 8-fold symmetry
                    for i in range(p0, p1):
                        tmp[i-p0] = lib.unpack_row(eri_ao, i)
                tmp = lib.unpack_tril(tmp, filltriu=lib.SYMMETRIC)
                buf = lib.einsum('xpq,pi,qj->xij', tmp, mo_coeffs[0].conj(), mo_coeffs[1])
                if ij_red:
                    buf = buf.reshape(p1-p0,-1)  # grabs by row
                else:
                    buf = lib.pack_tril(buf)

            async_write(istep, buf)

    log.timer('(uv|lo) -> (ij|lo)', *cput0)

    # transform \lambda\sigma -> kl
    cput1 = logger.process_clock(), logger.perf_counter()
    Cklam = mo_coeffs[2].conj()
    buf_read = numpy.empty((chunk_size,nao_pair), dtype=dtype)
    buf_prefetch = numpy.empty_like(buf_read)

    def load(start, stop, buf):
        if start < stop:
            _load_from_h5g(feri_swap, start, stop, buf)

    def save(start, stop, buf):
        if start < stop:
            h5d_eri[start:stop] = buf[:stop-start]

    with lib.call_in_background(save,load) as (async_write, prefetch):
        for p0, p1 in lib.prange(0, nij_pair, chunk_size):
            if p0 == 0:
                # The first chunk is read synchronously; later chunks arrive
                # through the background prefetch issued one step ahead.
                load(p0, p1, buf_prefetch)

            buf_read, buf_prefetch = buf_prefetch, buf_read
            prefetch(p1, min(p1+chunk_size, nij_pair), buf_prefetch)

            lo = lib.unpack_tril(buf_read[:p1-p0], filltriu=lib.SYMMETRIC)
            lo = lib.einsum('xpq,pi,qj->xij', lo, Cklam, mo_coeffs[3])
            if kl_red:
                kl = lo.reshape(p1-p0,-1)
            else:
                kl = lib.pack_tril(lo)
            async_write(p0, p1, kl)

    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    # Only close the file when it was opened here from a path.
    if isinstance(erifile, str):
        feri.close()
    return erifile
def general(mol, mo_coeffs, erifile, auxbasis='weigend+etb', dataname='eri_mo', tmpdir=None,
            int3c='int3c2e', aosym='s2ij', int2c='int2c2e', comp=1,
            max_memory=MAX_MEMORY, verbose=0, compact=True):
    ''' Transform ij of (ij|L) to MOs.

    The AO tensor is first generated by ``cholesky_eri_b`` into a temporary
    file under ``tmpdir`` (``lib.param.TMPDIR`` when None), then transformed
    in row blocks with ``mo_coeffs[0]`` and ``mo_coeffs[1]`` and written to
    the dataset ``dataname`` of ``erifile``.

    The dataset has shape (naoaux, nij_pair) when ``comp == 1`` and
    (comp, naoaux, nij_pair) otherwise.

    Returns:
        ``erifile`` as passed in.
    '''
    assert (aosym in ('s1', 's2ij'))
    # time.clock() was removed in Python 3.8; use the logger clock helpers.
    time0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mol, verbose)

    if tmpdir is None:
        tmpdir = lib.param.TMPDIR
    swapfile = tempfile.NamedTemporaryFile(dir=tmpdir)
    cholesky_eri_b(mol, swapfile.name, auxbasis, dataname,
                   int3c, aosym, int2c, comp, max_memory, verbose=log)
    fswap = h5py.File(swapfile.name, 'r')
    time1 = log.timer('AO->MO eri transformation 1 pass', *time0)

    nao = mo_coeffs[0].shape[0]
    if aosym == 's1':
        nao_pair = nao * nao
        aosym_as_nr_e2 = 's1'
    else:
        nao_pair = nao * (nao + 1) // 2
        aosym_as_nr_e2 = 's2kl'

    ijmosym, nij_pair, moij, ijshape = \
        ao2mo.incore._conc_mos(mo_coeffs[0], mo_coeffs[1],
                               compact and aosym != 's1')

    naoaux = fswap['%s/0' % dataname].shape[-2]
    feri = _create_h5file(erifile, dataname)
    if comp == 1:
        h5d_eri = feri.create_dataset(dataname, (naoaux, nij_pair), 'f8')
    else:
        h5d_eri = feri.create_dataset(dataname, (comp, naoaux, nij_pair), 'f8')

    def save(row0, row1, buf):
        if comp == 1:
            h5d_eri[row0:row1] = buf
        else:
            h5d_eri[:, row0:row1] = buf

    iolen = min(max(int(max_memory * .45e6 / 8 / (nao_pair + nij_pair)), 28), naoaux)
    totstep = (naoaux + iolen - 1) // iolen
    ti0 = time1
    with lib.call_in_background(save) as bsave:
        for istep, (row0, row1) in enumerate(lib.prange(0, naoaux, iolen)):
            nrow = row1 - row0
            log.debug('step 2 [%d/%d], [%d:%d], row = %d',
                      istep + 1, totstep, row0, row1, nrow)
            buf = _load_from_h5g(fswap[dataname], row0, row1)
            if comp == 1:
                buf = _ao2mo.nr_e2(buf, moij, ijshape, aosym_as_nr_e2, ijmosym)
                bsave(row0, row1, buf)
            else:
                buf = _ao2mo.nr_e2(buf.reshape(comp * nrow, nao_pair),
                                   moij, ijshape, aosym_as_nr_e2, ijmosym)
                bsave(row0, row1, buf.reshape(comp, nrow, nij_pair))
            # Drop the local reference before the next block is loaded.
            buf = None
            ti0 = log.timer('step 2 [%d/%d], [%d:%d], row = %d' %
                            (istep + 1, totstep, row0, row1, nrow), *ti0)

    fswap.close()
    feri.close()
    log.timer('AO->MO CD eri transformation 2 pass', *time1)
    log.timer('AO->MO CD eri transformation', *time0)
    return erifile
def _ao2mo_ovov(mp, orbs, feri, max_memory=2000, verbose=None):
    """Write the 'ovov', 'ovOV' and 'OVOV' integral datasets into ``feri``.

    Pass 1 evaluates AO integrals in shell blocks and stores the
    occupied-transformed blocks in a temporary HDF5 file under the groups
    'aa', 'bb' and 'ab'.  Pass 2 reloads them in batches of occupied
    orbitals, applies the virtual-orbital transformation and writes the
    result to ``feri``.

    Args:
        mp: object providing ``mol``; also passed to ``logger.new_logger``.
        orbs: sequence whose first four items are used, in this order, as
            alpha occupied, alpha virtual, beta occupied and beta virtual
            orbital coefficients.
        feri: HDF5 file or group in which the three datasets are created.
        max_memory: memory budget in MB used to size the blocks.
        verbose: forwarded to ``logger.new_logger``.

    Returns:
        None.
    """
    # time.clock() was removed in Python 3.8; use the logger clock helpers.
    time0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mp, verbose)
    orboa = numpy.asarray(orbs[0], order='F')
    orbva = numpy.asarray(orbs[1], order='F')
    orbob = numpy.asarray(orbs[2], order='F')
    orbvb = numpy.asarray(orbs[3], order='F')
    nao, nocca = orboa.shape
    noccb = orbob.shape[1]
    nvira = orbva.shape[1]
    nvirb = orbvb.shape[1]

    mol = mp.mol
    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nbas = mol.nbas
    assert (nvira <= nao)
    assert (nvirb <= nao)

    ao_loc = mol.ao_loc_nr()
    dmax = max(4, min(nao / 3, numpy.sqrt(max_memory * .95e6 / 8 / (nao + nocca)**2)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao, dmax, dmax, nao))
    ftmp = lib.H5TmpFile()
    disk = (nocca**2 * (nao * (nao + dmax) / 2 + nvira**2) +
            noccb**2 * (nao * (nao + dmax) / 2 + nvirb**2) +
            nocca * noccb * (nao**2 + nvira * nvirb))
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax, disk * 8 / 1e6)

    fint = gto.moleintor.getints4c
    aa_blk_slices = []
    ab_blk_slices = []
    count_ab = 0
    count_aa = 0
    time1 = time0
    with lib.call_in_background(ftmp.__setitem__) as save:
        for ish0, ish1, ni in sh_ranges:
            for jsh0, jsh1, nj in sh_ranges:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]

                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=(0, nbas, ish0, ish1, jsh0, jsh1, 0, nbas),
                           aosym='s1', ao_loc=ao_loc, cintopt=ao2mopt._cintopt,
                           out=eribuf)
                tmp_i = lib.ddot(orboa.T, eri.reshape(nao, (i1 - i0) * (j1 - j0) * nao))
                tmp_li = lib.ddot(orbob.T, tmp_i.reshape(nocca * (i1 - i0) * (j1 - j0), nao).T)
                tmp_li = tmp_li.reshape(noccb, nocca, (i1 - i0), (j1 - j0))
                save('ab/%d' % count_ab, tmp_li.transpose(1, 0, 2, 3))
                ab_blk_slices.append((i0, i1, j0, j1))
                count_ab += 1

                # Same-spin blocks are stored only for ish0 >= jsh0; the
                # transposed blocks are rebuilt in load_aa.
                if ish0 >= jsh0:
                    tmp_li = lib.ddot(orboa.T, tmp_i.reshape(nocca * (i1 - i0) * (j1 - j0), nao).T)
                    tmp_li = tmp_li.reshape(nocca, nocca, (i1 - i0), (j1 - j0))
                    save('aa/%d' % count_aa, tmp_li.transpose(1, 0, 2, 3))

                    tmp_i = lib.ddot(orbob.T, eri.reshape(nao, (i1 - i0) * (j1 - j0) * nao))
                    tmp_li = lib.ddot(orbob.T, tmp_i.reshape(noccb * (i1 - i0) * (j1 - j0), nao).T)
                    tmp_li = tmp_li.reshape(noccb, noccb, (i1 - i0), (j1 - j0))
                    save('bb/%d' % count_aa, tmp_li.transpose(1, 0, 2, 3))
                    aa_blk_slices.append((i0, i1, j0, j1))
                    count_aa += 1

                time1 = log.timer_debug1('partial ao2mo [%d:%d,%d:%d]' %
                                         (ish0, ish1, jsh0, jsh1), *time1)
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)
    eri = eribuf = tmp_i = tmp_li = None

    fovov = feri.create_dataset('ovov', (nocca * nvira, nocca * nvira), 'f8',
                                chunks=(nvira, nvira))
    fovOV = feri.create_dataset('ovOV', (nocca * nvira, noccb * nvirb), 'f8',
                                chunks=(nvira, nvirb))
    fOVOV = feri.create_dataset('OVOV', (noccb * nvirb, noccb * nvirb), 'f8',
                                chunks=(nvirb, nvirb))
    occblk = int(min(max(nocca, noccb),
                     max(4, 250 / nocca, max_memory * .9e6 / 8 / (nao**2 * nocca) / 5)))

    def load_aa(h5g, nocc, i0, eri):
        # A request starting at or past nocc is a no-op; the loops below
        # rely on this when prefetching beyond the last batch.
        if i0 < nocc:
            i1 = min(i0 + occblk, nocc)
            for k, (p0, p1, q0, q1) in enumerate(aa_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = h5g[str(k)][i0:i1]
                if p0 != q0:
                    dat = numpy.asarray(h5g[str(k)][:, i0:i1])
                    eri[:i1 - i0, :, q0:q1, p0:p1] = dat.transpose(1, 0, 3, 2)

    def load_ab(h5g, nocca, i0, eri):
        if i0 < nocca:
            i1 = min(i0 + occblk, nocca)
            for k, (p0, p1, q0, q1) in enumerate(ab_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = h5g[str(k)][i0:i1]

    def save(h5dat, nvir, i0, i1, dat):
        for i in range(i0, i1):
            h5dat[i * nvir:(i + 1) * nvir] = dat[i - i0].reshape(nvir, -1)

    with lib.call_in_background(save) as bsave:
        with lib.call_in_background(load_aa) as prefetch:
            buf_prefecth = numpy.empty((occblk, nocca, nao, nao))
            buf = numpy.empty_like(buf_prefecth)
            load_aa(ftmp['aa'], nocca, 0, buf_prefecth)
            for i0, i1 in lib.prange(0, nocca, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(ftmp['aa'], nocca, i1, buf_prefecth)
                eri = buf[:i1 - i0].reshape((i1 - i0) * nocca, nao, nao)
                dat = _ao2mo.nr_e2(eri, orbva, (0, nvira, 0, nvira), 's1', 's1')
                bsave(fovov, nvira, i0, i1,
                      dat.reshape(i1 - i0, nocca, nvira, nvira).transpose(0, 2, 1, 3))
                time1 = log.timer_debug1('pass2 ao2mo for aa [%d:%d]' % (i0, i1), *time1)

            buf_prefecth = numpy.empty((occblk, noccb, nao, nao))
            buf = numpy.empty_like(buf_prefecth)
            load_aa(ftmp['bb'], noccb, 0, buf_prefecth)
            for i0, i1 in lib.prange(0, noccb, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(ftmp['bb'], noccb, i1, buf_prefecth)
                eri = buf[:i1 - i0].reshape((i1 - i0) * noccb, nao, nao)
                dat = _ao2mo.nr_e2(eri, orbvb, (0, nvirb, 0, nvirb), 's1', 's1')
                bsave(fOVOV, nvirb, i0, i1,
                      dat.reshape(i1 - i0, noccb, nvirb, nvirb).transpose(0, 2, 1, 3))
                time1 = log.timer_debug1('pass2 ao2mo for bb [%d:%d]' % (i0, i1), *time1)

        orbvab = numpy.asarray(numpy.hstack((orbva, orbvb)), order='F')
        with lib.call_in_background(load_ab) as prefetch:
            # Reuses the (occblk, noccb, nao, nao) buffers of the bb pass.
            load_ab(ftmp['ab'], nocca, 0, buf_prefecth)
            for i0, i1 in lib.prange(0, nocca, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(ftmp['ab'], nocca, i1, buf_prefecth)
                eri = buf[:i1 - i0].reshape((i1 - i0) * noccb, nao, nao)
                dat = _ao2mo.nr_e2(eri, orbvab, (0, nvira, nvira, nvira + nvirb), 's1', 's1')
                bsave(fovOV, nvira, i0, i1,
                      dat.reshape(i1 - i0, noccb, nvira, nvirb).transpose(0, 2, 1, 3))
                time1 = log.timer_debug1('pass2 ao2mo for ab [%d:%d]' % (i0, i1), *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
def kernel(mycc, eris, t1=None, t2=None, verbose=logger.NOTE):
    """Compute the CCSD(T) energy correction.

    Args:
        mycc: coupled-cluster object; supplies ``t1``/``t2`` when they are
            not given, plus ``max_memory``, ``async_io`` and
            ``incore_complete``.
        eris: integral container passed on to ``_sort_eri`` and
            ``_sort_t2_vooo_``; ``eris.ovoo.dtype`` takes part in choosing
            the working dtype.
        t1, t2: amplitudes; default to ``mycc.t1`` and ``mycc.t2``.
        verbose: forwarded to ``logger.new_logger``.

    Returns:
        The real part of twice the accumulated contraction sum.  A warning
        is logged when the imaginary part exceeds 1e-4 in magnitude.
    """
    cpu1 = cpu0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mycc, verbose)
    if t1 is None:
        t1 = mycc.t1
    if t2 is None:
        t2 = mycc.t2

    nocc, nvir = t1.shape
    nmo = nocc + nvir

    dtype = numpy.result_type(t1, t2, eris.ovoo.dtype)
    if mycc.incore_complete:
        ftmp = None
        eris_vvop = numpy.zeros((nvir, nvir, nocc, nmo), dtype)
    else:
        ftmp = lib.H5TmpFile()
        eris_vvop = ftmp.create_dataset('vvop', (nvir, nvir, nocc, nmo), dtype)

    orbsym = _sort_eri(mycc, eris, nocc, nvir, eris_vvop, log)

    mo_energy, t1T, t2T, vooo, fvo, restore_t2_inplace = \
        _sort_t2_vooo_(mycc, orbsym, t1, t2, eris)
    cpu1 = log.timer_debug1('CCSD(T) sort_eri', *cpu1)

    # A list so that the background ``contract`` calls can update it in place.
    cpu2 = list(cpu1)
    orbsym = numpy.hstack((numpy.sort(orbsym[:nocc]), numpy.sort(orbsym[nocc:])))
    o_ir_loc = numpy.append(0, numpy.cumsum(numpy.bincount(orbsym[:nocc], minlength=8)))
    v_ir_loc = numpy.append(0, numpy.cumsum(numpy.bincount(orbsym[nocc:], minlength=8)))
    o_sym = orbsym[:nocc]
    oo_sym = (o_sym[:, None] ^ o_sym).ravel()
    oo_ir_loc = numpy.append(0, numpy.cumsum(numpy.bincount(oo_sym, minlength=8)))
    nirrep = max(oo_sym) + 1

    # The index arrays are handed to the C driver as raw pointers.
    orbsym = orbsym.astype(numpy.int32)
    o_ir_loc = o_ir_loc.astype(numpy.int32)
    v_ir_loc = v_ir_loc.astype(numpy.int32)
    oo_ir_loc = oo_ir_loc.astype(numpy.int32)
    # numpy.complex (an alias of the builtin complex) was removed in
    # NumPy 1.24; complex128 is the dtype it compared equal to.
    if dtype == numpy.complex128:
        drv = _ccsd.libcc.CCsd_t_zcontract
    else:
        drv = _ccsd.libcc.CCsd_t_contract
    et_sum = numpy.zeros(1, dtype=dtype)

    def contract(a0, a1, b0, b1, cache):
        cache_row_a, cache_col_a, cache_row_b, cache_col_b = cache
        drv(et_sum.ctypes.data_as(ctypes.c_void_p),
            mo_energy.ctypes.data_as(ctypes.c_void_p),
            t1T.ctypes.data_as(ctypes.c_void_p),
            t2T.ctypes.data_as(ctypes.c_void_p),
            vooo.ctypes.data_as(ctypes.c_void_p),
            fvo.ctypes.data_as(ctypes.c_void_p),
            ctypes.c_int(nocc), ctypes.c_int(nvir),
            ctypes.c_int(a0), ctypes.c_int(a1),
            ctypes.c_int(b0), ctypes.c_int(b1),
            ctypes.c_int(nirrep),
            o_ir_loc.ctypes.data_as(ctypes.c_void_p),
            v_ir_loc.ctypes.data_as(ctypes.c_void_p),
            oo_ir_loc.ctypes.data_as(ctypes.c_void_p),
            orbsym.ctypes.data_as(ctypes.c_void_p),
            cache_row_a.ctypes.data_as(ctypes.c_void_p),
            cache_col_a.ctypes.data_as(ctypes.c_void_p),
            cache_row_b.ctypes.data_as(ctypes.c_void_p),
            cache_col_b.ctypes.data_as(ctypes.c_void_p))
        cpu2[:] = log.timer_debug1('contract %d:%d,%d:%d' % (a0, a1, b0, b1), *cpu2)

    # The rest 20% memory for cache b
    mem_now = lib.current_memory()[0]
    max_memory = max(0, mycc.max_memory - mem_now)
    bufsize = (max_memory * .5e6 / 8 - nocc**3 * 3 * lib.num_threads()) / (nocc * nmo)  #*.5 for async_io
    bufsize *= .5  #*.5 upper triangular part is loaded
    bufsize *= .8  #*.8 for [a0:a1]/[b0:b1] partition
    bufsize = max(8, bufsize)
    log.debug('max_memory %d MB (%d MB in use)', max_memory, mem_now)
    with lib.call_in_background(contract, sync=not mycc.async_io) as async_contract:
        for a0, a1 in reversed(list(lib.prange_tril(0, nvir, bufsize))):
            cache_row_a = numpy.asarray(eris_vvop[a0:a1, :a1], order='C')
            if a0 == 0:
                cache_col_a = cache_row_a
            else:
                cache_col_a = numpy.asarray(eris_vvop[:a0, a0:a1], order='C')
            async_contract(a0, a1, a0, a1, (cache_row_a, cache_col_a,
                                            cache_row_a, cache_col_a))

            for b0, b1 in lib.prange_tril(0, a0, bufsize / 8):
                cache_row_b = numpy.asarray(eris_vvop[b0:b1, :b1], order='C')
                if b0 == 0:
                    cache_col_b = cache_row_b
                else:
                    cache_col_b = numpy.asarray(eris_vvop[:b0, b0:b1], order='C')
                async_contract(a0, a1, b0, b1, (cache_row_a, cache_col_a,
                                                cache_row_b, cache_col_b))

    t2 = restore_t2_inplace(t2T)
    et_sum *= 2
    if abs(et_sum[0].imag) > 1e-4:
        logger.warn(mycc, 'Non-zero imaginary part of CCSD(T) energy was found %s',
                    et_sum[0])
    et = et_sum[0].real
    log.timer('CCSD(T)', *cpu0)
    log.note('CCSD(T) correction = %.15g', et)
    return et
def half_e1(mol, mo_coeffs, swapfile,
            intor='int2e', aosym='s4', comp=1,
            max_memory=MAX_MEMORY, ioblk_size=IOBLK_SIZE, verbose=logger.WARN,
            compact=True, ao2mopt=None):
    r'''Half transform arbitrary spherical AO integrals to MO integrals
    for the given two sets of orbitals

    Args:
        mol : :class:`Mole` object
            AO integrals will be generated in terms of mol._atm, mol._bas, mol._env
        mo_coeffs : sequence of ndarray
            Orbital coefficients.  Only the first two items are used here;
            they define the transformation of the i and j indices of
            (ij|kl).  Complex coefficients are not supported.
        swapfile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.  The
            transformed integrals are saved in blocks.

    Kwargs
        intor : str
            Name of the 2-electron integral.  Ref to :func:`getints_by_shell`
            for the complete list of available 2-electron integral names
        aosym : int or str
            Permutation symmetry for the AO integrals

            | 4 or '4' or 's4': 4-fold symmetry (default)
            | '2ij' or 's2ij' : symmetry between i, j in (ij|kl)
            | '2kl' or 's2kl' : symmetry between k, l in (ij|kl)
            | 1 or '1' or 's1': no symmetry
            | 'a4ij' : 4-fold symmetry with anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a4kl' : 4-fold symmetry with anti-symmetry between k, l in (ij|kl) (TODO)
            | 'a2ij' : anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a2kl' : anti-symmetry between k, l in (ij|kl) (TODO)

        comp : int
            Components of the integrals, e.g. int2e_ip_sph has 3 components.
        max_memory : float or int
            The maximum size of cache to use (in MB), large cache may **not**
            improve performance.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        verbose : int
            Print level
        compact : bool
            When compact is True, depending on the four orbital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals
        ao2mopt : :class:`AO2MOpt` object
            Precomputed data to improve performance

    Returns:
        The ``swapfile`` argument, unchanged.

    Raises:
        NotImplementedError : if any item of ``mo_coeffs`` has dtype
            complex128.
    '''
    if any(c.dtype == numpy.complex128 for c in mo_coeffs):
        raise NotImplementedError('Integral transformation for complex orbitals')

    intor = mol._add_suffix(intor)
    time0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mol, verbose)

    nao = mo_coeffs[0].shape[0]
    aosym = _stand_sym_code(aosym)
    if aosym in ('s4', 's2ij'):
        nao_pair = nao * (nao+1) // 2
    else:
        nao_pair = nao * nao

    ijmosym, nij_pair, moij, ijshape = \
            incore._conc_mos(mo_coeffs[0], mo_coeffs[1],
                             compact and aosym in ('s4', 's2ij'))

    e1buflen, mem_words, iobuf_words, ioblk_words = \
            guess_e1bufsize(max_memory, ioblk_size, nij_pair, nao_pair, comp)
    # Convert the block size from 8-byte words back to MB.
    ioblk_size = ioblk_words * 8/1e6
    # The buffer to hold AO integrals in C code, see line (@)
    aobuflen = max(int((mem_words - 2*comp*e1buflen*nij_pair) // (nao_pair*comp)),
                   IOBUF_ROW_MIN)
    ao_loc = mol.ao_loc_nr('_cart' in intor)
    shranges = guess_shell_ranges(mol, (aosym in ('s4', 's2kl')), e1buflen,
                                  aobuflen, ao_loc)
    if ao2mopt is None:
        if intor == 'int2e_cart' or intor == 'int2e_sph':
            ao2mopt = _ao2mo.AO2MOpt(mol, intor, 'CVHFnr_schwarz_cond',
                                     'CVHFsetnr_direct_scf')
        else:
            ao2mopt = _ao2mo.AO2MOpt(mol, intor)

    if isinstance(swapfile, h5py.Group):
        fswap = swapfile
    else:
        fswap = lib.H5TmpFile(swapfile)
    for icomp in range(comp):
        fswap.create_group(str(icomp)) # for h5py old version

    log.debug('step1: tmpfile %s %.8g MB', fswap.filename, nij_pair*nao_pair*8/1e6)
    log.debug('step1: (ij,kl) = (%d,%d), mem cache %.8g MB, iobuf %.8g MB',
              nij_pair, nao_pair, mem_words*8/1e6, iobuf_words*8/1e6)
    nstep = len(shranges)
    # Size the buffers for the largest shell range actually produced.
    e1buflen = max([x[2] for x in shranges])

    e2buflen, chunks = guess_e2bufsize(ioblk_size, nij_pair, e1buflen)
    def save(istep, iobuf):
        # One dataset per (component, step), named '<icomp>/<istep>'.
        for icomp in range(comp):
            _transpose_to_h5g(fswap, '%d/%d'%(icomp,istep), iobuf[icomp],
                              e2buflen, None)

    # transform e1
    ti0 = log.timer('Initializing ao2mo.outcore.half_e1', *time0)
    with lib.call_in_background(save) as async_write:
        buf1 = numpy.empty((comp*e1buflen,nao_pair))
        buf2 = numpy.empty((comp*e1buflen,nij_pair))
        buf_write = numpy.empty_like(buf2)
        fill = _ao2mo.nr_e1fill
        f_e1 = _ao2mo.nr_e1
        for istep,sh_range in enumerate(shranges):
            log.debug1('step 1 [%d/%d], AO [%d:%d], len(buf) = %d',
                       istep+1, nstep, *(sh_range[:3]))
            buflen = sh_range[2]
            # View onto buf2; filled slice by slice by the micro steps below.
            iobuf = numpy.ndarray((comp,buflen,nij_pair), buffer=buf2)
            nmic = len(sh_range[3])
            p1 = 0
            for imic, aoshs in enumerate(sh_range[3]):
                log.debug2(' fill iobuf micro [%d/%d], AO [%d:%d], len(aobuf) = %d',
                           imic+1, nmic, *aoshs)
                buf = fill(intor, aoshs, mol._atm, mol._bas, mol._env, aosym, comp,
                           ao2mopt, out=buf1).reshape(-1,nao_pair)
                buf = f_e1(buf, moij, ijshape, aosym, ijmosym)
                p0, p1 = p1, p1 + aoshs[2]
                iobuf[:,p0:p1] = buf.reshape(comp,aoshs[2],nij_pair)
            ti0 = log.timer_debug1('gen AO/transform MO [%d/%d]'%(istep+1,nstep), *ti0)

            async_write(istep, iobuf)
            # Swap buffers so the next step fills the one that is not being
            # written out.
            buf2, buf_write = buf_write, buf2

    fswap = None
    return swapfile
def make_kpt(uniq_kptji_id):  # kpt = kptj - kpti
    '''Assemble and store the fitted 3-centre integrals for one unique
    k-point difference.

    This is a closure: every name that is not assigned below (``uniq_kpts``,
    ``uniq_inverse``, ``kptjs``, ``fswap``, ``feri``, ``mydf``, ``cell``,
    ``auxcell``, ``fused_cell``, ``fuse``, ``Gv``, ``b``, ``gxyz``,
    ``Gvbase``, ``mesh``, ``ngrids``, ``nao``, ``naux``, ``nsegs``,
    ``plain_ints``, ``log`` ...) is taken from the surrounding scope.

    Reads ``fswap['j2c/<id>']`` and ``fswap['j3c-junk/<ji>/<seg>']``, writes
    ``feri['j3c/<ji>/<istep>']`` and finally deletes the consumed
    ``fswap['j3c-junk/<ji>']`` groups.

    Args:
        uniq_kptji_id : index into ``uniq_kpts`` selecting the k-point
            difference to process.

    Returns:
        None
    '''
    kpt = uniq_kpts[uniq_kptji_id]
    log.debug1('kpt = %s', kpt)
    # All (ki, kj) pairs whose difference maps onto this unique k-point.
    adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0]
    adapted_kptjs = kptjs[adapted_ji_idx]
    nkptj = len(adapted_kptjs)
    log.debug1('adapted_ji_idx = %s', adapted_ji_idx)

    # Fourier transform restricted to the shells of fused_cell that follow
    # the auxcell shells.
    shls_slice = (auxcell.nbas, fused_cell.nbas)
    Gaux = ft_ao.ft_ao(fused_cell, Gv, shls_slice, b, gxyz, Gvbase, kpt)
    if (cell.dimension == 1 or cell.dimension == 2) and is_zero(kpt):
        G0idx, SI_on_z = pbcgto.cell._SI_for_uniform_model_charge(cell, Gv)
        s = plain_ints[-Gaux.shape[1]:]  # Only compensated Gaussians
        Gaux[G0idx] -= numpy.einsum('g,i->gi', SI_on_z, s)

    wcoulG = mydf.weighted_coulG(kpt, False, mesh)
    Gaux *= wcoulG.reshape(-1,1)
    # Keep real and imaginary parts as separate C-contiguous arrays.
    kLR = Gaux.real.copy('C')
    kLI = Gaux.imag.copy('C')
    Gaux = None
    j2c = numpy.asarray(fswap['j2c/%d'%uniq_kptji_id])
    # Prefer a Cholesky factor of the metric; if that fails, fall back to an
    # eigendecomposition and discard eigenvalues that are not above
    # mydf.linear_dep_threshold.
    try:
        j2c = scipy.linalg.cholesky(j2c, lower=True)
        j2ctag = 'CD'
    except scipy.linalg.LinAlgError as e:
        #msg =('===================================\n'
        #      'J-metric not positive definite.\n'
        #      'It is likely that mesh is not enough.\n'
        #      '===================================')
        #log.error(msg)
        #raise scipy.linalg.LinAlgError('\n'.join([e.message, msg]))
        w, v = scipy.linalg.eigh(j2c)
        log.debug('DF metric linear dependency for kpt %s', uniq_kptji_id)
        log.debug('cond = %.4g, drop %d bfns',
                  w[-1]/w[0], numpy.count_nonzero(w<mydf.linear_dep_threshold))
        v = v[:,w>mydf.linear_dep_threshold].T.conj()
        v /= numpy.sqrt(w[w>mydf.linear_dep_threshold]).reshape(-1,1)
        j2c = v
        j2ctag = 'eig'
    naux0 = j2c.shape[0]  # not used further in this function

    if is_zero(kpt):  # kpti == kptj
        aosym = 's2'
        nao_pair = nao*(nao+1)//2

        vbar = mydf.auxbar(fused_cell)
        ovlp = cell.pbc_intor('int1e_ovlp', hermi=1, kpts=adapted_kptjs)
        # Store each overlap matrix in packed lower-triangular form.
        ovlp = [lib.pack_tril(s) for s in ovlp]
    else:
        aosym = 's1'
        nao_pair = nao**2

    mem_now = lib.current_memory()[0]
    log.debug2('memory = %s', mem_now)
    max_memory = max(2000, mydf.max_memory-mem_now)
    # nkptj for 3c-coulomb arrays plus 1 Lpq array
    buflen = min(max(int(max_memory*.38e6/16/naux/(nkptj+1)), 1), nao_pair)
    shranges = _guess_shell_ranges(cell, buflen, aosym)
    # Use the largest column count actually present in the shell ranges.
    buflen = max([x[2] for x in shranges])
    # +1 for a pqkbuf
    if aosym == 's2':
        Gblksize = max(16, int(max_memory*.1e6/16/buflen/(nkptj+1)))
    else:
        Gblksize = max(16, int(max_memory*.2e6/16/buflen/(nkptj+1)))
    Gblksize = min(Gblksize, ngrids, 16384)
    # Scratch buffers reused by every call of pw_contract.
    pqkRbuf = numpy.empty(buflen*Gblksize)
    pqkIbuf = numpy.empty(buflen*Gblksize)
    # buf for ft_aopair
    buf = numpy.empty(nkptj*buflen*Gblksize, dtype=numpy.complex128)

    def pw_contract(istep, sh_range, j3cR, j3cI):
        '''Update the j3c blocks of one shell range in place with the
        reciprocal-space term, apply the metric factor and write the result
        to ``feri['j3c/<ji>/<istep>']``.

        ``j3cR``/``j3cI`` are lists (one entry per adapted k-point) of the
        real/imaginary parts; an imaginary entry may be None when it is not
        needed (see the caller).
        '''
        bstart, bend, ncol = sh_range
        if aosym == 's2':
            shls_slice = (bstart, bend, 0, bend)
        else:
            shls_slice = (bstart, bend, 0, cell.nbas)

        # Walk over the G-vector grid in chunks of at most Gblksize points.
        for p0, p1 in lib.prange(0, ngrids, Gblksize):
            dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym,
                                        b, gxyz[p0:p1], Gvbase, kpt,
                                        adapted_kptjs, out=buf)

            if (cell.dimension == 1 or cell.dimension == 2) and is_zero(kpt):
                G0idx, SI_on_z = pbcgto.cell._SI_for_uniform_model_charge(cell, Gv[p0:p1])
                if SI_on_z.size > 0:
                    for k, aoao in enumerate(dat):
                        aoao[G0idx] -= numpy.einsum('g,i->gi', SI_on_z, ovlp[k])
                        aux = fuse(ft_ao.ft_ao(fused_cell, Gv[p0:p1][G0idx]).T)
                        vG_mod = numpy.einsum('ig,g,g->i', aux.conj(),
                                              wcoulG[p0:p1][G0idx], SI_on_z)
                        if gamma_point(adapted_kptjs[k]):
                            j3cR[k][:naux] -= vG_mod[:,None].real * ovlp[k]
                        else:
                            tmp = vG_mod[:,None] * ovlp[k]
                            j3cR[k][:naux] -= tmp.real
                            j3cI[k][:naux] -= tmp.imag
                        tmp = aux = vG_mod

            nG = p1 - p0
            for k, ji in enumerate(adapted_ji_idx):
                aoao = dat[k].reshape(nG,ncol)
                # Views on the shared scratch buffers, filled with the
                # transposed real/imaginary parts.
                pqkR = numpy.ndarray((ncol,nG), buffer=pqkRbuf)
                pqkI = numpy.ndarray((ncol,nG), buffer=pqkIbuf)
                pqkR[:] = aoao.real.T
                pqkI[:] = aoao.imag.T

                # NOTE(review): lib.dot(a, b, alpha, c, beta) presumably
                # accumulates alpha*a.dot(b) + beta*c into c -- confirm.
                lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k][naux:], 1)
                lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k][naux:], 1)
                # The imaginary part is skipped when both the k-point
                # difference and kptj are zero / Gamma.
                if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])):
                    lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k][naux:], 1)
                    lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k][naux:], 1)

        for k, ji in enumerate(adapted_ji_idx):
            if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                v = fuse(j3cR[k])
            else:
                v = fuse(j3cR[k] + j3cI[k] * 1j)
            # Apply the metric factor chosen above: triangular solve for the
            # Cholesky factor, plain matrix product for the eigen form.
            if j2ctag == 'CD':
                v = scipy.linalg.solve_triangular(j2c, v, lower=True, overwrite_b=True)
            else:
                v = lib.dot(j2c, v)
            feri['j3c/%d/%d'%(ji,istep)] = v

    # pw_contract is submitted through lib.call_in_background while this
    # loop reads the next shell range from fswap.
    with lib.call_in_background(pw_contract) as compute:
        col1 = 0
        for istep, sh_range in enumerate(shranges):
            log.debug1('int3c2e [%d/%d], AO [%d:%d], ncol = %d', \
                       istep+1, len(shranges), *sh_range)
            bstart, bend, ncol = sh_range
            col0, col1 = col1, col1+ncol
            j3cR = []
            j3cI = []
            for k, idx in enumerate(adapted_ji_idx):
                # Stack the column window [col0:col1] of every stored segment.
                v = numpy.vstack([fswap['j3c-junk/%d/%d'%(idx,i)][0,col0:col1].T
                                  for i in range(nsegs)])
                # The vbar correction is applied only for 3D cells at zero
                # k-point difference.
                if is_zero(kpt) and cell.dimension == 3:
                    for i in numpy.where(vbar != 0)[0]:
                        v[i] -= vbar[i] * ovlp[k][col0:col1]
                j3cR.append(numpy.asarray(v.real, order='C'))
                if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                    j3cI.append(None)
                else:
                    j3cI.append(numpy.asarray(v.imag, order='C'))
            v = None
            compute(istep, sh_range, j3cR, j3cI)
    # The intermediate data is no longer needed once everything is written.
    for ji in adapted_ji_idx:
        del(fswap['j3c-junk/%d'%ji])
def _make_eris(mp, mo_coeff=None, verbose=None):
    '''Transform the (ia|jb) integrals out of core, distributed over the MPI
    tasks, and attach them to ``mp``.

    Pass 1 computes AO integral blocks, contracts two indices with the
    occupied orbitals and exchanges the pieces between tasks into a
    temporary HDF5 file.  Pass 2 reads them back in blocks of occupied
    orbitals, transforms the remaining two indices with the virtual orbitals
    and writes the dataset ``eris.ovov`` of shape
    ``(nocc, nvir, nocc_seg, nvir)``.

    Args:
        mp : MP2 object providing ``mol``, ``nocc``, ``nmo`` and
            ``max_memory``.
        mo_coeff : orbital coefficients forwarded to ``mp2._ChemistsERIs``.
        verbose : verbosity forwarded to ``logger.new_logger``.

    Returns:
        The ``mp2._ChemistsERIs`` object, which is also stored as
        ``mp._eris``.
    '''
    log = logger.new_logger(mp, verbose)
    # time.clock() was removed in Python 3.8; use the clock pair provided by
    # the logger module so the values stay compatible with log.timer().
    time0 = (logger.process_clock(), logger.perf_counter())

    log.debug('transform (ia|jb) outcore')
    mol = mp.mol
    nocc = mp.nocc
    nmo = mp.nmo
    nvir = nmo - nocc

    eris = mp2._ChemistsERIs(mp, mo_coeff)
    nao = eris.mo_coeff.shape[0]
    assert (nvir <= nao)
    orbo = eris.mo_coeff[:, :nocc]
    orbv = numpy.asarray(eris.mo_coeff[:, nocc:], order='F')
    eris.feri = lib.H5TmpFile()

    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    fint = gto.moleintor.getints4c

    # Segment of occupied orbitals owned by each task.
    ntasks = mpi.pool.size
    olocs = [_task_location(nocc, task_id) for task_id in range(ntasks)]
    oloc0, oloc1 = olocs[rank]
    nocc_seg = oloc1 - oloc0
    log.debug2('olocs %s', olocs)

    # Segment of AO shells owned by this task.
    ao_loc = mol.ao_loc_nr()
    task_sh_locs = lib.misc._balanced_partition(ao_loc, ntasks)
    log.debug2('task_sh_locs %s', task_sh_locs)
    ao_sh0 = task_sh_locs[rank]
    ao_sh1 = task_sh_locs[rank + 1]
    ao_loc0 = ao_loc[ao_sh0]
    ao_loc1 = ao_loc[ao_sh1]
    nao_seg = ao_loc1 - ao_loc0
    orbo_seg = orbo[ao_loc0:ao_loc1]

    mem_now = lib.current_memory()[0]
    max_memory = max(0, mp.max_memory - mem_now)
    dmax = numpy.sqrt(max_memory * .9e6 / 8 / ((nao + nocc) * (nao_seg + nocc)))
    # Every task must use the same block size: take the smallest estimate.
    dmax = min(nao // 4 + 2, max(BLKMIN, min(comm.allgather(dmax))))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    sh_ranges = comm.bcast(sh_ranges)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao, dmax, dmax, nao_seg))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax,
              nocc * nocc_seg * (nao * (nao + dmax) / 2 + nvir**2) * 8 / 1e6)

    def save(count, tmp_xo):
        # Exchange the half-transformed block between tasks and store both
        # index orderings ('b' and 'a') under the running block counter.
        di, dj = tmp_xo.shape[2:4]
        tmp_xo = [tmp_xo[p0:p1] for p0, p1 in olocs]
        tmp_xo = mpi.alltoall(tmp_xo, split_recvbuf=True)
        tmp_xo = sum(tmp_xo).reshape(nocc_seg, nocc, di, dj)
        ftmp[str(count) + 'b'] = tmp_xo

        tmp_ox = mpi.alltoall([tmp_xo[:, p0:p1] for p0, p1 in olocs],
                              split_recvbuf=True)
        tmp_ox = [
            tmp_ox[i].reshape(p1 - p0, nocc_seg, di, dj)
            for i, (p0, p1) in enumerate(olocs)
        ]
        ftmp[str(count) + 'a'] = numpy.vstack(tmp_ox)

    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(save) as bg_save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            # Only the lower triangle of shell-range pairs is computed.
            for jsh0, jsh1, nj in sh_ranges[:ip + 1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0, i1, j0, j1))

                shls_slice = (0, mol.nbas, ish0, ish1, jsh0, jsh1,
                              ao_sh0, ao_sh1)
                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=shls_slice, aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                tmp_xo = lib.einsum('pi,pqrs->iqrs', orbo, eri)
                tmp_xo = lib.einsum('iqrs,sl->ilqr', tmp_xo, orbo_seg)
                bg_save(count, tmp_xo)
                tmp_xo = None
                count += 1
                time1 = log.timer_debug1(
                    'partial ao2mo [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                    *time1)
    eri = eribuf = None
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)

    eris.ovov = eris.feri.create_dataset('ovov', (nocc, nvir, nocc_seg, nvir),
                                         'f8')
    occblk = int(
        min(nocc,
            max(BLKMIN, max_memory * .9e6 / 8 / (nao**2 * nocc_seg + 1) / 5)))

    def load(i0, eri):
        # Assemble the full (nao, nao) blocks for occupied orbitals
        # [i0:i0+occblk] from the pieces written in pass 1.
        if i0 < nocc:
            i1 = min(i0 + occblk, nocc)
            for k, (p0, p1, q0, q1) in enumerate(jk_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = ftmp[str(k) + 'a'][i0:i1]
                if p0 != q0:
                    # Off-diagonal blocks: fill the transposed counterpart.
                    dat = numpy.asarray(ftmp[str(k) + 'b'][:, i0:i1])
                    eri[:i1 - i0, :, q0:q1, p0:p1] = dat.transpose(1, 0, 3, 2)

    def save(i0, i1, dat):
        eris.ovov[i0:i1] = dat

    # Double buffers so that loading, transforming and writing can overlap.
    buf_prefetch = numpy.empty((occblk, nocc_seg, nao, nao))
    buf = numpy.empty_like(buf_prefetch)
    bufw = numpy.empty((occblk * nocc_seg, nvir**2))
    bufw1 = numpy.empty_like(bufw)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as bsave:
            load(0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocc, occblk):
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(i1, buf_prefetch)
                eri = buf[:i1 - i0].reshape((i1 - i0) * nocc_seg, nao, nao)

                dat = _ao2mo.nr_e2(eri, orbv, (0, nvir, 0, nvir), 's1', 's1',
                                   out=bufw)
                bsave(i0, i1,
                      dat.reshape(i1 - i0, nocc_seg, nvir,
                                  nvir).transpose(0, 2, 1, 3))
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0, i1),
                                         *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    mp._eris = eris
    return eris
def make_kpt(uniq_kptji_id):  # kpt = kptj - kpti
    '''Assemble and store the fitted 3-centre integrals for one unique
    k-point difference (variant that always uses the eigendecomposition of
    the metric).

    This is a closure: every name that is not assigned below (``uniq_kpts``,
    ``uniq_inverse``, ``kptjs``, ``fswap``, ``feri``, ``mydf``, ``cell``,
    ``fused_cell``, ``fuse``, ``Gv``, ``b``, ``gxyz``, ``Gvbase``, ``mesh``,
    ``ngrids``, ``nao``, ``naux``, ``nsegs``, ``log`` ...) is taken from the
    surrounding scope.

    Reads ``fswap['j2c/<id>']`` and ``feri['j3c-junk/<ji>/<seg>']``, writes
    ``feri['j3c/<ji>/<istep>']`` and finally deletes the consumed
    ``feri['j3c-junk/<ji>']`` groups.

    Args:
        uniq_kptji_id : index into ``uniq_kpts`` selecting the k-point
            difference to process.

    Returns:
        None
    '''
    kpt = uniq_kpts[uniq_kptji_id]
    log.debug1('kpt = %s', kpt)
    # All (ki, kj) pairs whose difference maps onto this unique k-point.
    adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0]
    adapted_kptjs = kptjs[adapted_ji_idx]
    nkptj = len(adapted_kptjs)
    log.debug1('adapted_ji_idx = %s', adapted_ji_idx)

    Gaux = ft_ao.ft_ao(fused_cell, Gv, None, b, gxyz, Gvbase, kpt).T
    Gaux = fuse(Gaux)
    Gaux *= mydf.weighted_coulG(kpt, False, mesh)
    # Keep real and imaginary parts as separate C-contiguous arrays.
    kLR = Gaux.T.real.copy('C')
    kLI = Gaux.T.imag.copy('C')
    j2c = numpy.asarray(fswap['j2c/%d' % uniq_kptji_id])
    # Note large difference may be found in results between the CD/eig treatments.
    # In some systems, small integral errors can lead to different treatments of
    # linear dependency which can be observed in the total energy/orbital energy
    # around 4th decimal place.
    #        try:
    #            j2c = scipy.linalg.cholesky(j2c, lower=True)
    #            j2ctag = 'CD'
    #        except scipy.linalg.LinAlgError as e:
    #
    # Abandon CD treatment for better numerical stability
    w, v = scipy.linalg.eigh(j2c)
    log.debug('MDF metric for kpt %s cond = %.4g, drop %d bfns',
              uniq_kptji_id, w[-1] / w[0],
              numpy.count_nonzero(w < mydf.linear_dep_threshold))
    # Keep only eigenvectors whose eigenvalue exceeds the threshold and
    # scale each by 1/sqrt(eigenvalue).
    v = v[:, w > mydf.linear_dep_threshold].T.conj()
    v /= numpy.sqrt(w[w > mydf.linear_dep_threshold]).reshape(-1, 1)
    j2c = v
    j2ctag = 'eig'
    naux0 = j2c.shape[0]  # not used further in this function

    if is_zero(kpt):  # kpti == kptj
        aosym = 's2'
        nao_pair = nao * (nao + 1) // 2

        vbar = fuse(mydf.auxbar(fused_cell))
        ovlp = cell.pbc_intor('int1e_ovlp', hermi=1, kpts=adapted_kptjs)
        # Store each overlap matrix in packed lower-triangular form.
        for k, ji in enumerate(adapted_ji_idx):
            ovlp[k] = lib.pack_tril(ovlp[k])
    else:
        aosym = 's1'
        nao_pair = nao**2

    mem_now = lib.current_memory()[0]
    log.debug2('memory = %s', mem_now)
    max_memory = max(2000, mydf.max_memory - mem_now)
    # nkptj for 3c-coulomb arrays plus 1 Lpq array
    buflen = min(max(int(max_memory * .38e6 / 16 / naux / (nkptj + 1)), 1),
                 nao_pair)
    shranges = _guess_shell_ranges(cell, buflen, aosym)
    # Use the largest column count actually present in the shell ranges.
    buflen = max([x[2] for x in shranges])
    # +1 for a pqkbuf
    if aosym == 's2':
        Gblksize = max(16,
                       int(max_memory * .1e6 / 16 / buflen / (nkptj + 1)))
    else:
        Gblksize = max(16,
                       int(max_memory * .2e6 / 16 / buflen / (nkptj + 1)))
    Gblksize = min(Gblksize, ngrids, 16384)
    # Scratch buffers reused by every call of pw_contract.
    pqkRbuf = numpy.empty(buflen * Gblksize)
    pqkIbuf = numpy.empty(buflen * Gblksize)
    # buf for ft_aopair
    buf = numpy.empty((nkptj, buflen * Gblksize), dtype=numpy.complex128)

    def pw_contract(istep, sh_range, j3cR, j3cI):
        '''Update the j3c blocks of one shell range in place with the
        reciprocal-space term, apply the metric factor and write the result
        to ``feri['j3c/<ji>/<istep>']``.

        ``j3cR``/``j3cI`` are lists (one entry per adapted k-point) of the
        real/imaginary parts; an imaginary entry may be None when it is not
        needed (see the caller).
        '''
        bstart, bend, ncol = sh_range
        if aosym == 's2':
            shls_slice = (bstart, bend, 0, bend)
        else:
            shls_slice = (bstart, bend, 0, cell.nbas)

        # Walk over the G-vector grid in chunks of at most Gblksize points.
        for p0, p1 in lib.prange(0, ngrids, Gblksize):
            dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym,
                                        b, gxyz[p0:p1], Gvbase, kpt,
                                        adapted_kptjs, out=buf)
            nG = p1 - p0
            for k, ji in enumerate(adapted_ji_idx):
                aoao = dat[k].reshape(nG, ncol)
                # Views on the shared scratch buffers, filled with the
                # transposed real/imaginary parts.
                pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf)
                pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf)
                pqkR[:] = aoao.real.T
                pqkI[:] = aoao.imag.T

                # NOTE(review): lib.dot(a, b, alpha, c, beta) presumably
                # accumulates alpha*a.dot(b) + beta*c into c -- confirm.
                lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k], 1)
                lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k], 1)
                # The imaginary part is skipped when both the k-point
                # difference and kptj are zero / Gamma.
                if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])):
                    lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k], 1)
                    lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k], 1)

        for k, ji in enumerate(adapted_ji_idx):
            if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                v = j3cR[k]
            else:
                v = j3cR[k] + j3cI[k] * 1j
            # j2ctag is always 'eig' in this function, so only the matrix
            # product branch is taken; the 'CD' branch is kept as written.
            if j2ctag == 'CD':
                v = scipy.linalg.solve_triangular(j2c, v, lower=True,
                                                  overwrite_b=True)
            else:
                v = lib.dot(j2c, v)
            feri['j3c/%d/%d' % (ji, istep)] = v

    # pw_contract is submitted through lib.call_in_background while this
    # loop reads the next shell range from feri.
    with lib.call_in_background(pw_contract) as compute:
        col1 = 0
        for istep, sh_range in enumerate(shranges):
            log.debug1('int3c2e [%d/%d], AO [%d:%d], ncol = %d', \
                       istep+1, len(shranges), *sh_range)
            bstart, bend, ncol = sh_range
            col0, col1 = col1, col1 + ncol
            j3cR = []
            j3cI = []
            for k, idx in enumerate(adapted_ji_idx):
                # Collect the column window [col0:col1] of every stored
                # segment, then stack and fuse them.
                v = [
                    feri['j3c-junk/%d/%d' % (idx, i)][0, col0:col1].T
                    for i in range(nsegs)
                ]
                v = fuse(numpy.vstack(v))
                # The vbar correction is applied only for 3D cells at zero
                # k-point difference.
                if is_zero(kpt) and cell.dimension == 3:
                    for i, c in enumerate(vbar):
                        if c != 0:
                            v[i] -= c * ovlp[k][col0:col1]
                j3cR.append(numpy.asarray(v.real, order='C'))
                if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                    j3cI.append(None)
                else:
                    j3cI.append(numpy.asarray(v.imag, order='C'))
            v = None
            compute(istep, sh_range, j3cR, j3cI)
    # The intermediate data is no longer needed once everything is written.
    for ji in adapted_ji_idx:
        del (feri['j3c-junk/%d' % ji])
def _aux_e2(cell, auxcell, erifile, intor='int3c2e', aosym='s2ij', comp=None,
            kptij_lst=None, dataname='eri_mo', shls_slice=None,
            max_memory=2000, verbose=0):
    r'''3-center AO integrals (ij|L) with double lattice sum:
    \sum_{lm} (i[l]j[m]|L[0]), where L is the auxiliary basis.
    Three-index integral tensor (kptij_idx, nao_pair, naux) or four-index
    integral tensor (kptij_idx, comp, nao_pair, naux) are stored on disk.

    **This function should be only used by df and mdf initialization function
    _make_j3c**

    Args:
        cell, auxcell : cells providing the orbital and auxiliary basis.
        erifile : an ``h5py.Group`` to write into, or a file name.  An
            existing HDF5 file is opened for update, anything else is
            created/truncated.  A file opened here is closed before
            returning, also when an error occurs.
        intor, aosym, comp : integral name, AO-pair symmetry and number of
            components, forwarded to ``wrap_int3c``.
        kptij_lst : (*,2,3) array
            A list of (kpti, kptj).  Defaults to a single zero pair.
        dataname : name of the HDF5 group that receives the integrals;
            ``dataname + '-kptij'`` receives ``kptij_lst``.  Existing entries
            with these names are replaced.
        shls_slice : 6-tuple of shell bounds; defaults to all shells.
        max_memory : memory budget (MB) used to size the work buffers.
        verbose : not used by this function.

    Returns:
        ``erifile``, as passed in.
    '''
    intor, comp = gto.moleintor._get_intor_and_comp(cell._add_suffix(intor),
                                                    comp)

    if isinstance(erifile, h5py.Group):
        feri = erifile
        owns_file = False
    else:
        owns_file = True
        if h5py.is_hdf5(erifile):
            # The mode must be explicit: h5py >= 3 opens files read-only by
            # default, which would make the deletions/writes below fail.
            feri = h5py.File(erifile, 'a')
        else:
            feri = h5py.File(erifile, 'w')

    try:
        # Drop stale results so the datasets can be recreated.
        if dataname in feri:
            del(feri[dataname])
        if dataname+'-kptij' in feri:
            del(feri[dataname+'-kptij'])

        if kptij_lst is None:
            kptij_lst = numpy.zeros((1,2,3))
        feri[dataname+'-kptij'] = kptij_lst

        if shls_slice is None:
            shls_slice = (0, cell.nbas, 0, cell.nbas, 0, auxcell.nbas)

        ao_loc = cell.ao_loc_nr()
        aux_loc = auxcell.ao_loc_nr(auxcell.cart or 'ssc' in intor)[:shls_slice[5]+1]
        ni = ao_loc[shls_slice[1]] - ao_loc[shls_slice[0]]
        nj = ao_loc[shls_slice[3]] - ao_loc[shls_slice[2]]
        naux = aux_loc[shls_slice[5]] - aux_loc[shls_slice[4]]
        nkptij = len(kptij_lst)

        # Number of AO pairs with (nii) and without (nij) i>=j packing.
        nii = (ao_loc[shls_slice[1]]*(ao_loc[shls_slice[1]]+1)//2 -
               ao_loc[shls_slice[0]]*(ao_loc[shls_slice[0]]+1)//2)
        nij = ni * nj

        kpti = kptij_lst[:,0]
        kptj = kptij_lst[:,1]
        # Pairs with kpti == kptj (within KPT_DIFF_TOL) can use s2 symmetry.
        aosym_ks2 = abs(kpti-kptj).sum(axis=1) < KPT_DIFF_TOL
        j_only = numpy.all(aosym_ks2)
        #aosym_ks2 &= (aosym[:2] == 's2' and shls_slice[:2] == shls_slice[2:4])
        aosym_ks2 &= aosym[:2] == 's2'

        if j_only and aosym[:2] == 's2':
            assert(shls_slice[2] == 0)
            nao_pair = nii
        else:
            nao_pair = nij

        if gamma_point(kptij_lst):
            dtype = numpy.double
        else:
            dtype = numpy.complex128

        buflen = max(8, int(max_memory*.47e6/16/(nkptij*ni*nj*comp)))
        auxdims = (aux_loc[shls_slice[4]+1:shls_slice[5]+1] -
                   aux_loc[shls_slice[4]:shls_slice[5]])
        auxranges = balance_segs(auxdims, buflen)
        buflen = max([x[2] for x in auxranges])
        # Two buffers: one is filled while the other is being written.
        buf = numpy.empty(nkptij*comp*ni*nj*buflen, dtype=dtype)
        buf1 = numpy.empty_like(buf)

        int3c = wrap_int3c(cell, auxcell, intor, aosym, comp, kptij_lst)

        kpt_ji = kptj - kpti
        uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
        # sorted_ij_idx: Sort and group the kptij_lst according to the ordering in
        # df._make_j3c to reduce the data fragment in the hdf5 file.  When datasets
        # are written to hdf5, they are saved sequentially. If the integral data are
        # saved as the order of kptij_lst, removing the datasets in df._make_j3c will
        # lead to holes that can not be reused.
        sorted_ij_idx = numpy.hstack([numpy.where(uniq_inverse == k)[0]
                                      for k, kpt in enumerate(uniq_kpts)])
        # Flat indices of the lower triangle of an (ni, ni) matrix.
        tril_idx = numpy.tril_indices(ni)
        tril_idx = tril_idx[0] * ni + tril_idx[1]

        def save(istep, mat):
            for k in sorted_ij_idx:
                v = mat[k]
                if gamma_point(kptij_lst[k]):
                    v = v.real
                # Pack to the lower triangle when this pair is s2-symmetric
                # but the block was computed without packing.
                if aosym_ks2[k] and nao_pair == ni**2:
                    v = v[:,tril_idx]
                feri['%s/%d/%d' % (dataname,k,istep)] = v

        with lib.call_in_background(save) as bsave:
            for istep, auxrange in enumerate(auxranges):
                sh0, sh1, nrow = auxrange
                sub_slice = (shls_slice[0], shls_slice[1],
                             shls_slice[2], shls_slice[3],
                             shls_slice[4]+sh0, shls_slice[4]+sh1)
                mat = numpy.ndarray((nkptij,comp,nao_pair,nrow), dtype=dtype,
                                    buffer=buf)
                bsave(istep, int3c(sub_slice, mat))
                buf, buf1 = buf1, buf
    finally:
        # Only close what was opened here; a caller-supplied group is left
        # open for the caller.
        if owns_file:
            feri.close()
    return erifile
def update_amps(mycc, t1, t2, eris):
    '''Perform one update of the CCSD amplitudes with the virtual index
    distributed over the MPI tasks.

    Internally the amplitudes are handled in transposed form
    (``t1T = t1.T``, ``t2T = t2.transpose(2,3,0,1)``); the first axis of
    ``t2T`` is the segment of virtual orbitals owned by this task.

    Args:
        mycc : CCSD object providing ``max_memory``, ``level_shift``,
            ``direct``, ``_add_vvvv``, ``stdout`` and ``verbose``.
        t1, t2 : current amplitudes.
        eris : integral container providing ``fock``, ``mo_energy`` and the
            datasets ``oooo``, ``ovoo``, ``oovv``, ``ovvo`` and ``vvvo``.

    Returns:
        Tuple ``(t1new, t2new)`` in the same layout as the inputs.

    Raises:
        NotImplementedError: if ``mycc.direct`` is false.
    '''
    # time.clock() was removed in Python 3.8; use the clock pair provided by
    # the logger module so the values stay compatible with log.timer_debug1().
    time1 = time0 = logger.process_clock(), logger.perf_counter()
    log = logger.Logger(mycc.stdout, mycc.verbose)

    t1T = t1.T
    t2T = numpy.asarray(t2.transpose(2,3,0,1), order='C')
    nvir_seg, nvir, nocc = t2T.shape[:3]
    t1 = t2 = None
    ntasks = mpi.pool.size
    vlocs = [_task_location(nvir, task_id) for task_id in range(ntasks)]
    vloc0, vloc1 = vlocs[rank]
    log.debug2('vlocs %s', vlocs)
    assert(vloc1-vloc0 == nvir_seg)

    fock = eris.fock
    mo_e_o = eris.mo_energy[:nocc]
    mo_e_v = eris.mo_energy[nocc:] + mycc.level_shift

    def _rotate_vir_block(buf):
        # Yield each task's block together with the virtual range it covers.
        for task_id, buf in _rotate_tensor_block(buf):
            loc0, loc1 = vlocs[task_id]
            yield task_id, buf, loc0, loc1

    fswap = lib.H5TmpFile()
    wVooV = numpy.zeros((nvir_seg,nocc,nocc,nvir))
    eris_voov = _cp(eris.ovvo).transpose(1,0,3,2)
    tau = t2T * .5
    tau += numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
    for task_id, tau, p0, p1 in _rotate_vir_block(tau):
        wVooV += lib.einsum('bkic,cajk->bija', eris_voov[:,:,:,p0:p1], tau)
    fswap['wVooV1'] = wVooV
    wVooV = tau = None
    time1 = log.timer_debug1('wVooV', *time1)

    # wVOov aliases eris_voov here, so the accumulation below is in place.
    wVOov = eris_voov
    eris_VOov = eris_voov - eris_voov.transpose(0,2,1,3)*.5
    tau = t2T.transpose(2,0,3,1) - t2T.transpose(3,0,2,1)*.5
    tau -= numpy.einsum('ai,bj->jaib', t1T[vloc0:vloc1], t1T)
    for task_id, tau, p0, p1 in _rotate_vir_block(tau):
        wVOov += lib.einsum('dlkc,kcjb->dljb', eris_VOov[:,:,:,p0:p1], tau)
    fswap['wVOov1'] = wVOov
    wVOov = tau = eris_VOov = eris_voov = None
    time1 = log.timer_debug1('wVOov', *time1)

    t1Tnew = numpy.zeros_like(t1T)
    t2Tnew = mycc._add_vvvv(t1T, t2T, eris, t2sym='jiba')
    time1 = log.timer_debug1('vvvv', *time1)

#** make_inter_F
    fov = fock[:nocc,nocc:].copy()
    t1Tnew += fock[nocc:,:nocc]

    foo = fock[:nocc,:nocc] - numpy.diag(mo_e_o)
    foo += .5 * numpy.einsum('ia,aj->ij', fock[:nocc,nocc:], t1T)

    fvv = fock[nocc:,nocc:] - numpy.diag(mo_e_v)
    fvv -= .5 * numpy.einsum('ai,ib->ab', t1T, fock[:nocc,nocc:])

    # Per-task partial sums; combined with mpi.allreduce further below.
    foo_priv = numpy.zeros_like(foo)
    fov_priv = numpy.zeros_like(fov)
    fvv_priv = numpy.zeros_like(fvv)
    t1T_priv = numpy.zeros_like(t1T)

    max_memory = mycc.max_memory - lib.current_memory()[0]
    unit = nocc*nvir**2*3 + nocc**2*nvir + 1
    blksize = min(nvir, max(BLKMIN, int((max_memory*.9e6/8-t2T.size)/unit)))
    log.debug1('pass 1, max_memory %d MB, nocc,nvir = %d,%d blksize = %d',
               max_memory, nocc, nvir, blksize)

    buf = numpy.empty((blksize,nvir,nvir,nocc))
    def load_vvvo(p0):
        # Reads into whatever array `buf` is bound to at call time.
        p1 = min(nvir_seg, p0+blksize)
        if p0 < p1:
            buf[:p1-p0] = eris.vvvo[p0:p1]
    fswap.create_dataset('wVooV', (nvir_seg,nocc,nocc,nvir), 'f8')
    wVOov = []

    with lib.call_in_background(load_vvvo) as prefetch:
        load_vvvo(0)
        for p0, p1 in lib.prange(vloc0, vloc1, blksize):
            i0, i1 = p0 - vloc0, p1 - vloc0
            # Hand the loaded block to the compute step and give the
            # prefetcher a fresh buffer for the next block.
            eris_vvvo, buf = buf[:p1-p0], numpy.empty_like(buf)
            prefetch(i1)

            fvv_priv[p0:p1] += 2*numpy.einsum('ck,abck->ab', t1T, eris_vvvo)
            fvv_priv -= numpy.einsum('ck,cabk->ab', t1T[p0:p1], eris_vvvo)

            if not mycc.direct:
                # The non-direct contraction of eris_vvvo with tau is not
                # implemented in this routine.
                raise NotImplementedError

            fswap['wVooV'][i0:i1] = lib.einsum('cj,baci->bija', -t1T, eris_vvvo)

            theta = t2T[i0:i1].transpose(0,2,1,3) * 2
            theta -= t2T[i0:i1].transpose(0,3,1,2)
            t1T_priv += lib.einsum('bicj,bacj->ai', theta, eris_vvvo)
            wVOov.append(lib.einsum('acbi,cj->abij', eris_vvvo, t1T))
            theta = eris_vvvo = None
            time1 = log.timer_debug1('vvvo [%d:%d]'%(p0, p1), *time1)

    # Redistribute wVOov so that each task holds its own virtual segment.
    wVOov = numpy.vstack(wVOov)
    wVOov = mpi.alltoall([wVOov[:,q0:q1] for q0,q1 in vlocs],
                         split_recvbuf=True)
    wVOov = numpy.vstack([x.reshape(-1,nvir_seg,nocc,nocc) for x in wVOov])
    fswap['wVOov'] = wVOov.transpose(1,2,3,0)
    wVooV = None

    unit = nocc**2*nvir*7 + nocc**3 + nocc*nvir**2
    max_memory = max(0, mycc.max_memory - lib.current_memory()[0])
    blksize = min(nvir, max(BLKMIN, int((max_memory*.9e6/8-nocc**4)/unit)))
    log.debug1('pass 2, max_memory %d MB, nocc,nvir = %d,%d blksize = %d',
               max_memory, nocc, nvir, blksize)

    woooo = numpy.zeros((nocc,nocc,nocc,nocc))

    for p0, p1 in lib.prange(vloc0, vloc1, blksize):
        i0, i1 = p0 - vloc0, p1 - vloc0
        wVOov = fswap['wVOov'][i0:i1]
        wVooV = fswap['wVooV'][i0:i1]
        eris_ovoo = eris.ovoo[:,i0:i1]
        eris_oovv = numpy.empty((nocc,nocc,i1-i0,nvir))
        def load_oovv(p0, p1):
            eris_oovv[:] = eris.oovv[:,:,p0:p1]
        # eris_oovv is loaded in the background; it is only used after this
        # `with` block has finished.
        with lib.call_in_background(load_oovv) as prefetch_oovv:
            #:eris_oovv = eris.oovv[:,:,i0:i1]
            prefetch_oovv(i0, i1)
            foo_priv += numpy.einsum('ck,kcji->ij', 2*t1T[p0:p1], eris_ovoo)
            foo_priv += numpy.einsum('ck,icjk->ij', -t1T[p0:p1], eris_ovoo)
            tmp = lib.einsum('al,jaik->lkji', t1T[p0:p1], eris_ovoo)
            woooo += tmp + tmp.transpose(1,0,3,2)
            tmp = None

            wVOov -= lib.einsum('jbik,ak->bjia', eris_ovoo, t1T)
            t2Tnew[i0:i1] += wVOov.transpose(0,3,1,2)

            wVooV += lib.einsum('kbij,ak->bija', eris_ovoo, t1T)
            eris_ovoo = None
        load_oovv = prefetch_oovv = None

        eris_ovvo = numpy.empty((nocc,i1-i0,nvir,nocc))
        def load_ovvo(p0, p1):
            eris_ovvo[:] = eris.ovvo[:,p0:p1]
        # Same pattern: eris_ovvo is loaded while eris_oovv is consumed.
        with lib.call_in_background(load_ovvo) as prefetch_ovvo:
            #:eris_ovvo = eris.ovvo[:,i0:i1]
            prefetch_ovvo(i0, i1)
            t1T_priv[p0:p1] -= numpy.einsum('bj,jiab->ai', t1T, eris_oovv)
            wVooV -= eris_oovv.transpose(2,0,1,3)
            wVOov += wVooV*.5  #: bjia + bija*.5
        eris_voov = eris_ovvo.transpose(1,0,3,2)
        eris_ovvo = None
        load_ovvo = prefetch_ovvo = None

        def update_wVooV(i0, i1):
            # Add the contributions stored earlier and write the totals back.
            wVooV[:] += fswap['wVooV1'][i0:i1]
            fswap['wVooV1'][i0:i1] = wVooV
            wVOov[:] += fswap['wVOov1'][i0:i1]
            fswap['wVOov1'][i0:i1] = wVOov
        # The body below must not touch wVooV/wVOov: they are modified by
        # the background task until the `with` block exits.
        with lib.call_in_background(update_wVooV) as update_wVooV:
            update_wVooV(i0, i1)
            t2Tnew[i0:i1] += eris_voov.transpose(0,3,1,2) * .5
            t1T_priv[p0:p1] += 2*numpy.einsum('bj,aijb->ai', t1T, eris_voov)

            tmp = lib.einsum('ci,kjbc->bijk', t1T, eris_oovv)
            tmp += lib.einsum('bjkc,ci->bjik', eris_voov, t1T)
            t2Tnew[i0:i1] -= lib.einsum('bjik,ak->baji', tmp, t1T)
            eris_oovv = tmp = None

            fov_priv[:,p0:p1] += numpy.einsum('ck,aikc->ia', t1T, eris_voov) * 2
            fov_priv[:,p0:p1] -= numpy.einsum('ck,akic->ia', t1T, eris_voov)

            tau = numpy.einsum('ai,bj->abij', t1T[p0:p1]*.5, t1T)
            tau += t2T[i0:i1]
            theta = tau.transpose(0,1,3,2) * 2
            theta -= tau
            fvv_priv -= lib.einsum('caij,cjib->ab', theta, eris_voov)
            foo_priv += lib.einsum('aikb,abkj->ij', eris_voov, theta)
            tau = theta = None

            tau = t2T[i0:i1] + numpy.einsum('ai,bj->abij', t1T[p0:p1], t1T)
            woooo += lib.einsum('abij,aklb->ijkl', tau, eris_voov)
            tau = None
        eris_VOov = wVOov = wVooV = update_wVooV = None
        time1 = log.timer_debug1('voov [%d:%d]'%(p0, p1), *time1)

    wVooV = _cp(fswap['wVooV1'])
    for task_id, wVooV, p0, p1 in _rotate_vir_block(wVooV):
        tmp = lib.einsum('ackj,ckib->ajbi', t2T[:,p0:p1], wVooV)
        t2Tnew += tmp.transpose(0,2,3,1)
        t2Tnew += tmp.transpose(0,2,1,3) * .5
    wVooV = tmp = None
    time1 = log.timer_debug1('contracting wVooV', *time1)

    wVOov = _cp(fswap['wVOov1'])
    theta = t2T * 2
    theta -= t2T.transpose(0,1,3,2)
    for task_id, wVOov, p0, p1 in _rotate_vir_block(wVOov):
        t2Tnew += lib.einsum('acik,ckjb->abij', theta[:,p0:p1], wVOov)
    wVOov = theta = None
    fswap = None
    time1 = log.timer_debug1('contracting wVOov', *time1)

    # Combine the per-task partial intermediates.
    foo += mpi.allreduce(foo_priv)
    fov += mpi.allreduce(fov_priv)
    fvv += mpi.allreduce(fvv_priv)

    theta = t2T.transpose(0,1,3,2) * 2 - t2T
    t1T_priv[vloc0:vloc1] += numpy.einsum('jb,abji->ai', fov, theta)
    ovoo = _cp(eris.ovoo)
    for task_id, ovoo, p0, p1 in _rotate_vir_block(ovoo):
        t1T_priv[vloc0:vloc1] -= lib.einsum('jbki,abjk->ai', ovoo, theta[:,p0:p1])
    theta = ovoo = None

    woooo = mpi.allreduce(woooo)
    woooo += _cp(eris.oooo).transpose(0,2,1,3)
    tau = t2T + numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
    t2Tnew += .5 * lib.einsum('abkl,ijkl->abij', tau, woooo)
    tau = woooo = None

    t1Tnew += mpi.allreduce(t1T_priv)

    ft_ij = foo + numpy.einsum('aj,ia->ij', .5*t1T, fov)
    ft_ab = fvv - numpy.einsum('ai,ib->ab', .5*t1T, fov)
    t2Tnew += lib.einsum('acij,bc->abij', t2T, ft_ab)
    t2Tnew -= lib.einsum('ki,abkj->abij', ft_ij, t2T)

    eia = mo_e_o[:,None] - mo_e_v
    t1Tnew += numpy.einsum('bi,ab->ai', t1T, fvv)
    t1Tnew -= numpy.einsum('aj,ji->ai', t1T, foo)
    t1Tnew /= eia.T

    # Add the index-swapped counterpart, which lives on other tasks.
    t2tmp = mpi.alltoall([t2Tnew[:,p0:p1] for p0,p1 in vlocs],
                         split_recvbuf=True)
    for task_id, (p0, p1) in enumerate(vlocs):
        tmp = t2tmp[task_id].reshape(p1-p0,nvir_seg,nocc,nocc)
        t2Tnew[:,p0:p1] += tmp.transpose(1,0,3,2)

    # Divide by the orbital-energy denominators, one owned virtual at a time.
    for i in range(vloc0, vloc1):
        t2Tnew[i-vloc0] /= lib.direct_sum('i+jb->bij', eia[:,i], eia)

    time0 = log.timer_debug1('update t1 t2', *time0)
    return t1Tnew.T, t2Tnew.transpose(2,3,0,1)
def _make_eris_outcore(mycc, mo_coeff=None):
    '''Transform the two-electron integrals out of core and store the
    blocks needed by CCSD in a temporary HDF5 file, with the virtual index
    distributed over the MPI tasks.

    Task 0 initialises the common data (orbitals, Fock matrix, ``nocc``,
    orbital energies) and broadcasts it.  The datasets ``oooo``, ``oovv``,
    ``ovoo``, ``ovvo``, ``ovov`` and ``vvvo`` are created on ``eris``.

    Args:
        mycc : CCSD object; ``mycc.direct`` must be true.
        mo_coeff : orbital coefficients forwarded to ``_common_init_``.

    Returns:
        The ``ccsd._ChemistsERIs`` object, which is also stored as
        ``mycc._eris``.
    '''
    # time.clock() was removed in Python 3.8; use the clock pair provided by
    # the logger module so the values stay compatible with log.timer().
    cput0 = (logger.process_clock(), logger.perf_counter())
    log = logger.Logger(mycc.stdout, mycc.verbose)
    _sync_(mycc)
    eris = ccsd._ChemistsERIs()
    if rank == 0:
        eris._common_init_(mycc, mo_coeff)
        comm.bcast((eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy))
    else:
        eris.mol = mycc.mol
        eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy = comm.bcast(None)

    mol = mycc.mol
    mo_coeff = numpy.asarray(eris.mo_coeff, order='F')
    nocc = eris.nocc
    nao, nmo = mo_coeff.shape
    nvir = nmo - nocc
    orbo = mo_coeff[:,:nocc]
    orbv = mo_coeff[:,nocc:]
    nvpair = nvir * (nvir+1) // 2
    # Segment of virtual orbitals owned by each task.
    vlocs = [_task_location(nvir, task_id) for task_id in range(mpi.pool.size)]
    vloc0, vloc1 = vlocs[rank]
    vseg = vloc1 - vloc0

    eris.feri1 = lib.H5TmpFile()
    eris.oooo = eris.feri1.create_dataset('oooo', (nocc,nocc,nocc,nocc), 'f8')
    eris.oovv = eris.feri1.create_dataset('oovv', (nocc,nocc,vseg,nvir), 'f8', chunks=(nocc,nocc,1,nvir))
    eris.ovoo = eris.feri1.create_dataset('ovoo', (nocc,vseg,nocc,nocc), 'f8', chunks=(nocc,1,nocc,nocc))
    eris.ovvo = eris.feri1.create_dataset('ovvo', (nocc,vseg,nvir,nocc), 'f8', chunks=(nocc,1,nvir,nocc))
    eris.ovov = eris.feri1.create_dataset('ovov', (nocc,vseg,nocc,nvir), 'f8', chunks=(nocc,1,nocc,nvir))
#    eris.ovvv = eris.feri1.create_dataset('ovvv', (nocc,vseg,nvpair), 'f8', chunks=(nocc,1,nvpair))
    eris.vvvo = eris.feri1.create_dataset('vvvo', (vseg,nvir,nvir,nocc), 'f8', chunks=(1,nvir,1,nocc))
    assert(mycc.direct)

    def save_occ_frac(p0, p1, eri):
        eri = eri.reshape(p1-p0,nocc,nmo,nmo)
        eris.oooo[p0:p1] = eri[:,:,:nocc,:nocc]
        eris.oovv[p0:p1] = eri[:,:,nocc+vloc0:nocc+vloc1,nocc:]

    def save_vir_frac(p0, p1, eri):
        log.alldebug1('save_vir_frac %d %d %s', p0, p1, eri.shape)
        eri = eri.reshape(p1-p0,nocc,nmo,nmo)
        eris.ovoo[:,p0:p1] = eri[:,:,:nocc,:nocc].transpose(1,0,2,3)
        eris.ovvo[:,p0:p1] = eri[:,:,nocc:,:nocc].transpose(1,0,2,3)
        eris.ovov[:,p0:p1] = eri[:,:,:nocc,nocc:].transpose(1,0,2,3)
#        vvv = lib.pack_tril(eri[:,:,nocc:,nocc:].reshape((p1-p0)*nocc,nvir,nvir))
#        eris.ovvv[:,p0:p1] = vvv.reshape(p1-p0,nocc,nvpair).transpose(1,0,2)

        cput2 = logger.process_clock(), logger.perf_counter()
        # Exchange the vvvo pieces so that every task receives the rows of
        # its own virtual segment.
        ovvv_segs = [eri[:,:,nocc+q0:nocc+q1,nocc:].transpose(2,3,0,1) for q0,q1 in vlocs]
        ovvv_segs = mpi.alltoall(ovvv_segs, split_recvbuf=True)
        cput2 = log.timer_debug1('vvvo alltoall', *cput2)
        for task_id, (q0, q1) in enumerate(comm.allgather((p0,p1))):
            ip0 = q0 + vlocs[task_id][0]
            ip1 = q1 + vlocs[task_id][0]
            eris.vvvo[:,:,ip0:ip1] = ovvv_segs[task_id].reshape(vseg,nvir,q1-q0,nocc)

    fswap = lib.H5TmpFile()
    max_memory = max(MEMORYMIN, mycc.max_memory-lib.current_memory()[0])

    int2e = mol._add_suffix('int2e')
    # Occupied orbitals followed by this task's virtual segment.
    orbov = numpy.hstack((orbo, orbv[:,vloc0:vloc1]))
    ao2mo.outcore.half_e1(mol, (orbov,orbo), fswap, int2e,
                          's4', 1, max_memory, verbose=log)

    ao_loc = mol.ao_loc_nr()
    nao_pair = nao * (nao+1) // 2
    blksize = int(min(8e9,max_memory*.5e6)/8/(nao_pair+nmo**2)/nocc)
    blksize = min(nvir, max(BLKMIN, blksize))
    fload = ao2mo.outcore._load_from_h5g

    buf = numpy.empty((blksize*nocc,nao_pair))
    buf_prefetch = numpy.empty_like(buf)
    def prefetch(p0, p1, rowmax):
        # Load the block that FOLLOWS [p0:p1] into the current prefetch
        # buffer (buf_prefetch and blksize are read at call time).
        p0, p1 = p1, min(rowmax, p1+blksize)
        if p0 < p1:
            fload(fswap['0'], p0*nocc, p1*nocc, buf_prefetch)

    cput1 = logger.process_clock(), logger.perf_counter()
    outbuf = numpy.empty((blksize*nocc,nmo**2))
    with lib.call_in_background(prefetch) as bprefetch:
        fload(fswap['0'], 0, min(nocc,blksize)*nocc, buf_prefetch)
        for p0, p1 in lib.prange(0, nocc, blksize):
            nrow = (p1 - p0) * nocc
            buf, buf_prefetch = buf_prefetch, buf
            bprefetch(p0, p1, nocc)
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff, (0,nmo,0,nmo),
                                     's4', 's1', out=outbuf, ao_loc=ao_loc)
            save_occ_frac(p0, p1, dat)

        # save_vir_frac communicates, so all tasks must agree on the block
        # size used for the virtual rows.
        blksize = min(comm.allgather(blksize))
        norb_max = nocc + vseg
        fload(fswap['0'], nocc**2, min(nocc+blksize,norb_max)*nocc, buf_prefetch)
        for p0, p1 in mpi.prange(vloc0, vloc1, blksize):
            i0, i1 = p0 - vloc0, p1 - vloc0
            nrow = (p1 - p0) * nocc
            buf, buf_prefetch = buf_prefetch, buf
            bprefetch(nocc+i0, nocc+i1, norb_max)
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff, (0,nmo,0,nmo),
                                     's4', 's1', out=outbuf, ao_loc=ao_loc)
            save_vir_frac(i0, i1, dat)
    # Release the work buffers only after the background worker has been
    # joined, because prefetch() writes into buf_prefetch.
    buf = buf_prefetch = outbuf = None

    cput1 = log.timer_debug1('transforming oppp', *cput1)
    log.timer('CCSD integral transformation', *cput0)
    mycc._eris = eris
    return eris
def transform_integrals_outcore(myadc):
    '''Transform the two-electron integrals out of core for ADC and store
    them in a temporary HDF5 file.

    The datasets ``oooo``, ``oovv``, ``ovoo``, ``ovvo``, ``ovov`` and
    ``ovvv`` are always created.  For the methods ``"adc(2)-x"`` and
    ``"adc(3)"`` the attribute ``vvvv`` is additionally set to a list with
    one entry (as returned by ``write_dataset``) per chunk of virtual
    orbitals.

    Args:
        myadc : ADC object providing ``mol``, ``mo_coeff``, ``_nmo``,
            ``_nocc``, ``max_memory``, ``memorymin``, ``blkmin``,
            ``async_io``, ``method``, ``stdout`` and ``verbose``.

    Returns:
        An attribute container holding the datasets described above.
    '''
    import os

    # time.clock() was removed in Python 3.8; use the clock pair provided by
    # the logger module so the values stay compatible with log.timer().
    cput0 = (logger.process_clock(), logger.perf_counter())
    log = logger.Logger(myadc.stdout, myadc.verbose)

    mol = myadc.mol
    mo_coeff = myadc.mo_coeff
    nao = mo_coeff.shape[0]
    nmo = myadc._nmo

    occ = myadc.mo_coeff[:, :myadc._nocc]
    vir = myadc.mo_coeff[:, myadc._nocc:]

    nocc = occ.shape[1]
    nvir = vir.shape[1]
    nvpair = nvir * (nvir + 1) // 2

    # A bare function object is used as a simple attribute container.
    eris = lambda: None

    eris.feri1 = lib.H5TmpFile()
    eris.oooo = eris.feri1.create_dataset('oooo', (nocc, nocc, nocc, nocc), 'f8')
    eris.oovv = eris.feri1.create_dataset('oovv', (nocc, nocc, nvir, nvir), 'f8',
                                          chunks=(nocc, nocc, 1, nvir))
    eris.ovoo = eris.feri1.create_dataset('ovoo', (nocc, nvir, nocc, nocc), 'f8',
                                          chunks=(nocc, 1, nocc, nocc))
    eris.ovvo = eris.feri1.create_dataset('ovvo', (nocc, nvir, nvir, nocc), 'f8',
                                          chunks=(nocc, 1, nvir, nocc))
    eris.ovov = eris.feri1.create_dataset('ovov', (nocc, nvir, nocc, nvir), 'f8',
                                          chunks=(nocc, 1, nocc, nvir))
    eris.ovvv = eris.feri1.create_dataset('ovvv', (nocc, nvir, nvpair), 'f8')

    def save_occ_frac(p0, p1, eri):
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.oooo[p0:p1] = eri[:, :, :nocc, :nocc]
        eris.oovv[p0:p1] = eri[:, :, nocc:, nocc:]

    def save_vir_frac(p0, p1, eri):
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.ovoo[:, p0:p1] = eri[:, :, :nocc, :nocc].transpose(1, 0, 2, 3)
        eris.ovvo[:, p0:p1] = eri[:, :, nocc:, :nocc].transpose(1, 0, 2, 3)
        eris.ovov[:, p0:p1] = eri[:, :, :nocc, nocc:].transpose(1, 0, 2, 3)
        # The vv block is stored in packed lower-triangular form.
        vvv = lib.pack_tril(eri[:, :, nocc:, nocc:].reshape((p1 - p0) * nocc, nvir, nvir))
        eris.ovvv[:, p0:p1] = vvv.reshape(p1 - p0, nocc, nvpair).transpose(1, 0, 2)

    cput1 = logger.process_clock(), logger.perf_counter()

    fswap = lib.H5TmpFile()
    max_memory = myadc.max_memory - lib.current_memory()[0]
    if max_memory <= 0:
        max_memory = myadc.memorymin
    int2e = mol._add_suffix('int2e')
    ao2mo.outcore.half_e1(mol, (mo_coeff, occ), fswap, int2e,
                          's4', 1, max_memory=max_memory, verbose=log)

    ao_loc = mol.ao_loc_nr()
    nao_pair = nao * (nao + 1) // 2
    blksize = int(min(8e9, max_memory * .5e6) / 8 / (nao_pair + nmo**2) / nocc)
    blksize = min(nmo, max(myadc.blkmin, blksize))
    log.debug1('blksize %d', blksize)
    cput2 = cput1

    fload = ao2mo.outcore._load_from_h5g
    # Two buffers: one is transformed while the other is being loaded.
    buf = np.empty((blksize * nocc, nao_pair))
    buf_prefetch = np.empty_like(buf)

    def load(buf_prefetch, p0, rowmax):
        if p0 < rowmax:
            p1 = min(rowmax, p0 + blksize)
            fload(fswap['0'], p0 * nocc, p1 * nocc, buf_prefetch)

    outbuf = np.empty((blksize * nocc, nmo**2))
    with lib.call_in_background(load, sync=not myadc.async_io) as prefetch:
        prefetch(buf_prefetch, 0, nocc)
        for p0, p1 in lib.prange(0, nocc, blksize):
            buf, buf_prefetch = buf_prefetch, buf
            prefetch(buf_prefetch, p1, nocc)

            nrow = (p1 - p0) * nocc
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff, (0, nmo, 0, nmo),
                                     's4', 's1', out=outbuf, ao_loc=ao_loc)
            save_occ_frac(p0, p1, dat)
        cput2 = log.timer_debug1('transforming oopp', *cput2)

        prefetch(buf_prefetch, nocc, nmo)
        for p0, p1 in lib.prange(0, nvir, blksize):
            buf, buf_prefetch = buf_prefetch, buf
            prefetch(buf_prefetch, nocc + p1, nmo)

            nrow = (p1 - p0) * nocc
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff, (0, nmo, 0, nmo),
                                     's4', 's1', out=outbuf, ao_loc=ao_loc)
            save_vir_frac(p0, p1, dat)
            cput2 = log.timer_debug1('transforming ovpp [%d:%d]' % (p0, p1), *cput2)

    cput1 = log.timer_debug1('transforming oppp', *cput1)

    ############### forming eris_vvvv ########################################

    if (myadc.method == "adc(2)-x" or myadc.method == "adc(3)"):
        eris.vvvv = []

        cput3 = logger.process_clock(), logger.perf_counter()
        avail_mem = (myadc.max_memory - lib.current_memory()[0]) * 0.5
        chnk_size = calculate_chunk_size(myadc)

        for p in range(0, vir.shape[1], chnk_size):

            if chnk_size < vir.shape[1]:
                orb_slice = vir[:, p:p + chnk_size]
            else:
                orb_slice = vir[:, p:]

            # mkstemp returns an OPEN descriptor; only the path is needed,
            # so close it at once instead of leaking one per chunk.
            fd, tmp = tempfile.mkstemp()
            os.close(fd)
            try:
                ao2mo.outcore.general(mol, (orb_slice, vir, vir, vir), tmp,
                                      max_memory=avail_mem, ioblk_size=100,
                                      compact=False)
                vvvv = read_dataset(tmp, 'eri_mo')
            finally:
                # Best-effort removal of the scratch file; a failure to
                # delete it must not mask the real outcome.
                try:
                    os.remove(tmp)
                except OSError:
                    pass
            vvvv = vvvv.reshape(orb_slice.shape[1], vir.shape[1], vir.shape[1], vir.shape[1])
            vvvv = np.ascontiguousarray(vvvv.transpose(0, 2, 1, 3)).reshape(
                -1, nvir, nvir * nvir)

            vvvv_p = write_dataset(vvvv)
            del vvvv
            eris.vvvv.append(vvvv_p)
            cput3 = log.timer_debug1('transforming vvvv', *cput3)

    log.timer('ADC integral transformation', *cput0)
    return eris
def _sort_eri(mycc, eris, h5tmp, log):
    """Re-sort the unrestricted integrals into four (vir, vir, occ, mo) arrays.

    Four targets are filled: ``vvop`` (alpha/alpha), ``VVOP`` (beta/beta),
    ``vVoP`` (alpha/beta) and ``VvOp`` (beta/alpha).  For every index ``j``
    of the leading virtual axis, the complex conjugate of the matching
    ``ovov``-type slice is placed in the occupied columns of a scratch buffer,
    the conjugate of the ``ovvv``-type slice in the remaining columns, and the
    buffer (transposed with ``(2, 0, 1)``) is written to ``target[j]``.

    Args:
        mycc: object providing ``nocc``, ``t2``, ``incore_complete``,
            ``max_memory`` and ``async_io``.
        eris: integral container providing ``focka``/``fockb``, ``ovov``,
            ``OVOV``, ``ovOV`` and the ``get_ovvv``/``get_OVVV``/``get_ovVV``/
            ``get_OVvv`` accessors.
        h5tmp: HDF5 file/group used for the outputs, or ``None`` to keep
            them in memory.
        log: logger exposing ``timer_debug1``.

    Returns:
        tuple ``(eris_vvop, eris_VVOP, eris_vVoP, eris_VvOp)``.
    """
    clock = (logger.process_clock(), logger.perf_counter())
    nocca, noccb = mycc.nocc
    nmoa = eris.focka.shape[0]
    nmob = eris.fockb.shape[0]
    nvira = nmoa - nocca
    nvirb = nmob - noccb

    if mycc.t2 is None:
        dtype = eris.ovov.dtype
    else:
        dtype = numpy.result_type(mycc.t2[0], eris.ovov.dtype)

    # Name and shape of every output, in the order they are created.
    layouts = (
        ('vvop', (nvira, nvira, nocca, nmoa)),
        ('VVOP', (nvirb, nvirb, noccb, nmob)),
        ('vVoP', (nvira, nvirb, nocca, nmob)),
        ('VvOp', (nvirb, nvira, noccb, nmoa)),
    )
    if mycc.incore_complete or h5tmp is None:
        targets = [numpy.empty(shape, dtype) for _, shape in layouts]
    else:
        targets = [h5tmp.create_dataset(name, shape, dtype)
                   for name, shape in layouts]
    eris_vvop, eris_VVOP, eris_vVoP, eris_VvOp = targets

    max_memory = max(2000, mycc.max_memory - lib.current_memory()[0])
    max_memory = min(8000, max_memory * .9)

    def fill(target, n_outer, n_split, buf_shape, blk, fetch_ovov, fetch_ovvv,
             clock):
        # Two scratch buffers alternate so that the background writer can
        # still read one of them while the next slice is being assembled.
        with lib.call_in_background(target.__setitem__,
                                    sync=not mycc.async_io) as save:
            work = numpy.empty(buf_shape, dtype=dtype)
            spare = numpy.empty_like(work)
            for j0, j1 in lib.prange(0, n_outer, blk):
                occ_blk = fetch_ovov(j0, j1)
                vir_blk = fetch_ovvv(slice(None), slice(j0, j1))
                for j in range(j0, j1):
                    work[:, :n_split, :] = occ_blk[:, j - j0].conj()
                    work[:, n_split:, :] = vir_blk[:, j - j0].conj()
                    save(j, work.transpose(2, 0, 1))
                    work, spare = spare, work
                occ_blk = vir_blk = None
                clock = log.timer_debug1('transpose %d:%d' % (j0, j1), *clock)
        return clock

    # alpha / alpha
    blksize = min(
        nvira,
        max(16, int(max_memory * 1e6 / 8 / (nvira * nocca * nmoa))))
    clock = fill(eris_vvop, nvira, nocca, (nocca, nmoa, nvira), blksize,
                 lambda j0, j1: numpy.asarray(eris.ovov[:, j0:j1]),
                 lambda *idx: eris.get_ovvv(*idx), clock)

    # beta / beta
    blksize = min(
        nvirb,
        max(16, int(max_memory * 1e6 / 8 / (nvirb * noccb * nmob))))
    clock = fill(eris_VVOP, nvirb, noccb, (noccb, nmob, nvirb), blksize,
                 lambda j0, j1: numpy.asarray(eris.OVOV[:, j0:j1]),
                 lambda *idx: eris.get_OVVV(*idx), clock)

    # alpha / beta
    blksize = min(
        nvira,
        max(16, int(max_memory * 1e6 / 8 / (nvirb * nocca * nmob))))
    clock = fill(eris_vVoP, nvira, noccb, (nocca, nmob, nvirb), blksize,
                 lambda j0, j1: numpy.asarray(eris.ovOV[:, j0:j1]),
                 lambda *idx: eris.get_ovVV(*idx), clock)

    # beta / alpha: ovOV is read once and viewed with the two index pairs
    # exchanged.
    blksize = min(
        nvirb,
        max(16, int(max_memory * 1e6 / 8 / (nvira * noccb * nmoa))))
    OVov = numpy.asarray(eris.ovOV).transpose(2, 3, 0, 1)
    clock = fill(eris_VvOp, nvirb, nocca, (noccb, nmoa, nvira), blksize,
                 lambda j0, j1: OVov[:, j0:j1],
                 lambda *idx: eris.get_OVvv(*idx), clock)

    return eris_vvop, eris_VVOP, eris_vVoP, eris_VvOp
def kernel(mycc, eris, t1=None, t2=None, verbose=logger.NOTE):
    """Accumulate the UCCSD(T) correction and return its real part.

    The amplitudes are split into spin blocks, the integrals are re-sorted by
    ``_sort_eri``, and four contraction passes (labelled aaa, bbb, baa and abb
    below) add their contributions into the one-element array ``et_sum``,
    which is finally scaled by 0.25.

    Args:
        mycc: object providing ``t1``, ``t2``, ``nocc``, ``incore_complete``,
            ``max_memory`` and ``async_io``.
        eris: integral container (``focka``, ``fockb``, ``mo_energy``,
            ``ovoo``, ``OVOO``, ``ovOO``, ``OVoo`` plus whatever ``_sort_eri``
            reads).
        t1: pair ``(t1a, t1b)``; defaults to ``mycc.t1``.
        t2: triple ``(t2aa, t2ab, t2bb)``; defaults to ``mycc.t2``.
        verbose: verbosity level or logger handed to ``logger.new_logger``.

    Returns:
        The real part of the accumulated correction.

    Note:
        The memory of ``t2ab`` is overwritten with transposed copies while
        the mixed-spin passes run and is written back at the end of the
        function.  ``t2ab`` must be C-contiguous (asserted).
    """
    cpu1 = cpu0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mycc, verbose)
    if t1 is None:
        t1 = mycc.t1
    if t2 is None:
        t2 = mycc.t2
    t1a, t1b = t1
    t2aa, t2ab, t2bb = t2

    nocca, noccb = mycc.nocc
    nmoa = eris.focka.shape[0]
    nmob = eris.fockb.shape[0]
    nvira = nmoa - nocca
    nvirb = nmob - noccb

    # Without incore_complete the sorted integrals go to a temporary HDF5 file.
    if mycc.incore_complete:
        ftmp = None
    else:
        ftmp = lib.H5TmpFile()
    # Transposed copies of the amplitudes: virtual indices first.
    t1aT = t1a.T.copy()
    t1bT = t1b.T.copy()
    t2aaT = t2aa.transpose(2, 3, 0, 1).copy()
    t2bbT = t2bb.transpose(2, 3, 0, 1).copy()

    # ovoo-type integrals, axes permuted with (1, 3, 0, 2) and conjugated.
    eris_vooo = numpy.asarray(eris.ovoo).transpose(1, 3, 0, 2).conj().copy()
    eris_VOOO = numpy.asarray(eris.OVOO).transpose(1, 3, 0, 2).conj().copy()
    eris_vOoO = numpy.asarray(eris.ovOO).transpose(1, 3, 0, 2).conj().copy()
    eris_VoOo = numpy.asarray(eris.OVoo).transpose(1, 3, 0, 2).conj().copy()

    eris_vvop, eris_VVOP, eris_vVoP, eris_VvOp = _sort_eri(
        mycc, eris, ftmp, log)
    cpu1 = log.timer_debug1('UCCSD(T) sort_eri', *cpu1)

    dtype = numpy.result_type(t1a.dtype, t2aa.dtype, eris_vooo.dtype)
    # One-element accumulator passed to every contraction call.
    et_sum = numpy.zeros(1, dtype=dtype)
    mem_now = lib.current_memory()[0]
    max_memory = max(0, mycc.max_memory - mem_now)
    # aaa
    bufsize = max(
        8,
        int((max_memory * .5e6 / 8 - nocca**3 * 3 * lib.num_threads()) * .4 /
            (nocca * nmoa)))
    log.debug('max_memory %d MB (%d MB in use)', max_memory, mem_now)
    orbsym = numpy.zeros(nocca, dtype=int)
    contract = _gen_contract_aaa(t1aT, t2aaT, eris_vooo, eris.focka,
                                 eris.mo_energy[0], orbsym, log)
    with lib.call_in_background(contract, sync=not mycc.async_io) as ctr:
        # Triangular blocks of the virtual range, visited last-to-first.
        for a0, a1 in reversed(list(lib.prange_tril(0, nvira, bufsize))):
            cache_row_a = numpy.asarray(eris_vvop[a0:a1, :a1], order='C')
            if a0 == 0:
                cache_col_a = cache_row_a
            else:
                cache_col_a = numpy.asarray(eris_vvop[:a0, a0:a1], order='C')
            # Diagonal block: the same caches serve as both "a" and "b".
            ctr(et_sum, a0, a1, a0, a1, (cache_row_a, cache_col_a,
                                         cache_row_a, cache_col_a))

            # Off-diagonal blocks below a0, cut with a smaller block size.
            for b0, b1 in lib.prange_tril(0, a0, bufsize / 8):
                cache_row_b = numpy.asarray(eris_vvop[b0:b1, :b1], order='C')
                if b0 == 0:
                    cache_col_b = cache_row_b
                else:
                    cache_col_b = numpy.asarray(eris_vvop[:b0, b0:b1],
                                                order='C')
                ctr(et_sum, a0, a1, b0, b1, (cache_row_a, cache_col_a,
                                             cache_row_b, cache_col_b))
    cpu1 = log.timer_debug1('contract_aaa', *cpu1)

    # bbb
    bufsize = max(
        8,
        int((max_memory * .5e6 / 8 - noccb**3 * 3 * lib.num_threads()) * .4 /
            (noccb * nmob)))
    log.debug('max_memory %d MB (%d MB in use)', max_memory, mem_now)
    orbsym = numpy.zeros(noccb, dtype=int)
    contract = _gen_contract_aaa(t1bT, t2bbT, eris_VOOO, eris.fockb,
                                 eris.mo_energy[1], orbsym, log)
    with lib.call_in_background(contract, sync=not mycc.async_io) as ctr:
        for a0, a1 in reversed(list(lib.prange_tril(0, nvirb, bufsize))):
            cache_row_a = numpy.asarray(eris_VVOP[a0:a1, :a1], order='C')
            if a0 == 0:
                cache_col_a = cache_row_a
            else:
                cache_col_a = numpy.asarray(eris_VVOP[:a0, a0:a1], order='C')
            ctr(et_sum, a0, a1, a0, a1, (cache_row_a, cache_col_a,
                                         cache_row_a, cache_col_a))

            for b0, b1 in lib.prange_tril(0, a0, bufsize / 8):
                cache_row_b = numpy.asarray(eris_VVOP[b0:b1, :b1], order='C')
                if b0 == 0:
                    cache_col_b = cache_row_b
                else:
                    cache_col_b = numpy.asarray(eris_VVOP[:b0, b0:b1],
                                                order='C')
                ctr(et_sum, a0, a1, b0, b1, (cache_row_a, cache_col_a,
                                             cache_row_b, cache_col_b))
    cpu1 = log.timer_debug1('contract_bbb', *cpu1)

    # Cache t2abT in t2ab to reduce memory footprint
    assert (t2ab.flags.c_contiguous)
    # The transpose is written into the memory of t2ab itself (out=t2ab).
    t2abT = lib.transpose(t2ab.copy().reshape(nocca * noccb, nvira * nvirb),
                          out=t2ab)
    t2abT = t2abT.reshape(nvira, nvirb, nocca, noccb)
    # baa
    bufsize = int(
        max(12, (max_memory * .5e6 / 8 - noccb * nocca**2 * 5) * .7 /
            (nocca * nmob)))
    ts = t1aT, t1bT, t2aaT, t2abT
    fock = (eris.focka, eris.fockb)
    vooo = (eris_vooo, eris_vOoO, eris_VoOo)
    # NOTE(review): orbsym here is still the length-noccb zero array built for
    # the bbb pass - confirm that is what _gen_contract_baa expects.
    contract = _gen_contract_baa(ts, vooo, fock, eris.mo_energy, orbsym, log)
    with lib.call_in_background(contract, sync=not mycc.async_io) as ctr:
        for a0, a1 in lib.prange(0, nvirb, int(bufsize / nvira + 1)):
            cache_row_a = numpy.asarray(eris_VvOp[a0:a1, :], order='C')
            cache_col_a = numpy.asarray(eris_vVoP[:, a0:a1], order='C')
            for b0, b1 in lib.prange_tril(0, nvira, bufsize / 6 / 2):
                cache_row_b = numpy.asarray(eris_vvop[b0:b1, :b1], order='C')
                cache_col_b = numpy.asarray(eris_vvop[:b0, b0:b1], order='C')
                ctr(et_sum, a0, a1, b0, b1, (cache_row_a, cache_col_a,
                                             cache_row_b, cache_col_b))
    cpu1 = log.timer_debug1('contract_baa', *cpu1)

    # t2baT is a new array header over the same memory as t2abT; the
    # assignment below overwrites that memory with the spin-swapped layout.
    t2baT = numpy.ndarray((nvirb, nvira, noccb, nocca), buffer=t2abT,
                          dtype=t2abT.dtype)
    t2baT[:] = t2abT.copy().transpose(1, 0, 3, 2)
    # abb
    ts = t1bT, t1aT, t2bbT, t2baT
    fock = (eris.fockb, eris.focka)
    mo_energy = (eris.mo_energy[1], eris.mo_energy[0])
    vooo = (eris_VOOO, eris_VoOo, eris_vOoO)
    contract = _gen_contract_baa(ts, vooo, fock, mo_energy, orbsym, log)
    # Unlike the baa pass, the background context is re-entered for every
    # a-block here.
    for a0, a1 in lib.prange(0, nvira, int(bufsize / nvirb + 1)):
        with lib.call_in_background(contract, sync=not mycc.async_io) as ctr:
            cache_row_a = numpy.asarray(eris_vVoP[a0:a1, :], order='C')
            cache_col_a = numpy.asarray(eris_VvOp[:, a0:a1], order='C')
            for b0, b1 in lib.prange_tril(0, nvirb, bufsize / 6 / 2):
                cache_row_b = numpy.asarray(eris_VVOP[b0:b1, :b1], order='C')
                cache_col_b = numpy.asarray(eris_VVOP[:b0, b0:b1], order='C')
                ctr(et_sum, a0, a1, b0, b1, (cache_row_a, cache_col_a,
                                             cache_row_b, cache_col_b))
    cpu1 = log.timer_debug1('contract_abb', *cpu1)

    # Restore t2ab
    lib.transpose(t2baT.transpose(1, 0, 3, 2).copy().reshape(
        nvira * nvirb, nocca * noccb),
                  out=t2ab)
    et_sum *= .25
    if abs(et_sum[0].imag) > 1e-4:
        logger.warn(mycc,
                    'Non-zero imaginary part of UCCSD(T) energy was found %s',
                    et_sum[0])
    et = et_sum[0].real
    log.timer('UCCSD(T)', *cpu0)
    log.note('UCCSD(T) correction = %.15g', et)
    return et
def _assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file, fswap,
              log):
    """Generate the 3-index integrals job by job and assemble them on disk.

    Pass 1 runs ``gen_int3c`` and, for every unique k-point difference,
    ``ft_fuse`` on each job obtained from ``mpi.work_stealing_partition`` and
    records which rank processed which job.  Pass 2 exchanges the per-job
    chunks between ranks with ``mpi.alltoall`` so that every rank writes its
    own segment of auxiliary rows of each ``j3c/<k>`` dataset into
    ``cderi_file``.  If ``fswap`` contains ``'j2c-'``, the ``j3c-`` data are
    gathered and written by rank 0.

    Args:
        mydf: object providing ``cell`` and ``max_memory``.
        kptij_lst: array whose columns 0 and 1 hold the k-points of each pair.
        j3c_jobs: sequence of ``(job_id, ish0, ish1)`` tuples.
        gen_int3c: callable ``(job_id, ish0, ish1)``.
        ft_fuse: callable ``(job_id, k, ish0, ish1)``.
        cderi_file: path of the HDF5 file to create (opened with mode ``'w'``).
        fswap: HDF5 container holding ``j2c/<k>``, ``j3c-chunks/<job>/<k>`` and
            optionally ``j2c-/<k>`` / ``j3c-/<job>/<k>``.
        log: logger exposing the timer/debug methods used below.

    Returns:
        None.  The output file is closed on exit, also when an error occurs.
    """
    # time.clock() was removed in Python 3.8; use the logger's clock helpers
    # like the other routines in this file.
    t1 = (logger.process_clock(), logger.perf_counter())
    cell = mydf.cell
    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    kptis = kptij_lst[:, 0]
    kptjs = kptij_lst[:, 1]
    kpt_ji = kptjs - kptis
    uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
    # Pairs with ki == kj are stored with the packed (s2) AO-pair layout.
    aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9

    t2 = t1
    j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int)
    #for job_id, ish0, ish1 in mpi.work_share_partition(j3c_jobs):
    for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs):
        gen_int3c(job_id, ish0, ish1)
        t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2)

        for k, kpt in enumerate(uniq_kpts):
            ft_fuse(job_id, k, ish0, ish1)
            t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2)

        j3c_workers[job_id] = rank
    # After the reduction every rank knows the owner of every job.
    j3c_workers = mpi.allreduce(j3c_workers)
    log.debug2('j3c_workers %s', j3c_workers)
    t1 = log.timer_debug1('int3c and fuse', *t1)

    # Pass 2
    # Transpose 3-index tensor and save data in cderi_file
    with h5py.File(cderi_file, 'w') as feri:
        nauxs = [fswap['j2c/%d' % k].shape[0]
                 for k, kpt in enumerate(uniq_kpts)]
        # Each rank owns a contiguous segment of at most `segsize` aux rows.
        segsize = (max(nauxs) + mpi.pool.size - 1) // mpi.pool.size
        naux0 = rank * segsize
        for k, kptij in enumerate(kptij_lst):
            naux1 = min(nauxs[uniq_inverse[k]], naux0 + segsize)
            nrow = max(0, naux1 - naux0)
            if gamma_point(kptij):
                dtype = 'f8'
            else:
                dtype = 'c16'
            if aosym_s2[k]:
                nao_pair = nao * (nao + 1) // 2
            else:
                nao_pair = nao * nao
            feri.create_dataset('j3c/%d' % k, (nrow, nao_pair), dtype,
                                maxshape=(None, nao_pair))

        def get_segs_loc(aosym):
            # Column ranges of every job inside the buffer received from
            # alltoall, where jobs arrive grouped by owning rank; the result
            # is re-ordered by job index.
            off0 = numpy.asarray([ao_loc[i0] for x, i0, i1 in j3c_jobs])
            off1 = numpy.asarray([ao_loc[i1] for x, i0, i1 in j3c_jobs])
            if aosym:  # s2
                dims = off1 * (off1 + 1) // 2 - off0 * (off0 + 1) // 2
            else:
                dims = (off1 - off0) * nao
            #dims = numpy.asarray([ao_loc[i1]-ao_loc[i0] for x,i0,i1 in j3c_jobs])
            dims = numpy.hstack([dims[j3c_workers == w]
                                 for w in range(mpi.pool.size)])
            job_idx = numpy.hstack([numpy.where(j3c_workers == w)[0]
                                    for w in range(mpi.pool.size)])
            segs_loc = numpy.append(0, numpy.cumsum(dims))
            segs_loc = [(segs_loc[j], segs_loc[j + 1])
                        for j in numpy.argsort(job_idx)]
            return segs_loc
        segs_loc_s1 = get_segs_loc(False)
        segs_loc_s2 = get_segs_loc(True)

        job_ids = numpy.where(rank == j3c_workers)[0]

        def load(k, p0, p1):
            # One flattened piece per destination rank, built from the jobs
            # owned by this rank.
            naux1 = nauxs[uniq_inverse[k]]
            slices = [(min(i * segsize + p0, naux1),
                       min(i * segsize + p1, naux1))
                      for i in range(mpi.pool.size)]
            segs = []
            for q0, q1 in slices:
                val = [fswap['j3c-chunks/%d/%d' % (job, k)][q0:q1].ravel()
                       for job in job_ids]
                if val:
                    segs.append(numpy.hstack(val))
                else:
                    segs.append(numpy.zeros(0))
            return segs

        def save(k, p0, p1, segs):
            segs = mpi.alltoall(segs)
            naux1 = nauxs[uniq_inverse[k]]
            loc0, loc1 = min(p0, naux1 - naux0), min(p1, naux1 - naux0)
            nL = loc1 - loc0
            if nL > 0:
                if aosym_s2[k]:
                    segs = numpy.hstack([segs[i0 * nL:i1 * nL].reshape(nL, -1)
                                         for i0, i1 in segs_loc_s2])
                else:
                    segs = numpy.hstack([segs[i0 * nL:i1 * nL].reshape(nL, -1)
                                         for i0, i1 in segs_loc_s1])
                feri['j3c/%d' % k][loc0:loc1] = segs

        mem_now = max(comm.allgather(lib.current_memory()[0]))
        max_memory = max(2000, min(8000, mydf.max_memory - mem_now))
        if numpy.all(aosym_s2):
            if gamma_point(kptij_lst):
                blksize = max(16, int(max_memory * .5e6 / 8 / nao**2))
            else:
                blksize = max(16, int(max_memory * .5e6 / 16 / nao**2))
        else:
            blksize = max(16, int(max_memory * .5e6 / 16 / nao**2 / 2))
        log.debug1('max_momory %d MB (%d in use), blksize %d',
                   max_memory, mem_now, blksize)

        t2 = t1
        with lib.call_in_background(save) as async_write:
            for k, kptji in enumerate(kptij_lst):
                for p0, p1 in lib.prange(0, segsize, blksize):
                    segs = load(k, p0, p1)
                    async_write(k, p0, p1, segs)
                    t2 = log.timer_debug1('assemble k=%d %d:%d (in %d)' %
                                          (k, p0, p1, segsize), *t2)

        if 'j2c-' in fswap:
            j2c_kpts_lists = []
            for k, kpt in enumerate(uniq_kpts):
                if ('j2c-/%d' % k) in fswap:
                    adapted_ji_idx = numpy.where(uniq_inverse == k)[0]
                    j2c_kpts_lists.append(adapted_ji_idx)

            for k in numpy.hstack(j2c_kpts_lists):
                val = [numpy.asarray(fswap['j3c-/%d/%d' % (job, k)]).ravel()
                       for job in job_ids]
                # A rank that owns no job contributes an empty buffer, as in
                # load(); numpy.hstack([]) would raise.
                if val:
                    val = numpy.hstack(val)
                else:
                    val = numpy.zeros(0)
                val = mpi.gather(val)
                if rank == 0:
                    naux1 = fswap['j3c-/0/%d' % k].shape[0]
                    if aosym_s2[k]:
                        v = [val[i0 * naux1:i1 * naux1].reshape(naux1, -1)
                             for i0, i1 in segs_loc_s2]
                    else:
                        v = [val[i0 * naux1:i1 * naux1].reshape(naux1, -1)
                             for i0, i1 in segs_loc_s1]
                    feri['j3c-/%d' % k] = numpy.hstack(v)

        if 'j3c-kptij' in feri:
            del(feri['j3c-kptij'])
        feri['j3c-kptij'] = kptij_lst
        t1 = log.alltimer_debug1('assembling j3c', *t1)
def _add_ovvv_(mycc, t1, t2, eris, fvv, t1new, t2new, fswap):
    """Contract ``eris.ovvv`` block-wise with ``t1``/``t2``.

    ``eris.ovvv`` is read in blocks of its second axis (prefetched in the
    background), unpacked from lower-triangular storage, and used to

    * fill ``wVOov[p0:p1]`` with ``einsum('biac,jc->bija', eris_vovv, t1)``;
    * add ``einsum('icjb,cjba->ia', theta, eris_vovv)`` to ``t1new`` in place,
      where ``theta`` is built from the matching slice of ``t2``.

    Args:
        mycc: object providing ``stdout``, ``verbose``, ``max_memory``,
            ``direct`` and ``async_io``.
        t1: array of shape ``(nocc, nvir)``.
        t2: array sliced as ``t2[:, :, p0:p1]``.
        eris: object providing ``ovvv``.
        fvv, t2new: not used by the visible code.
        t1new: array updated in place.
        fswap: HDF5 container for the intermediates, or ``None`` to keep them
            in memory.

    Returns:
        ``(wVOov, wVooV)`` as arrays when ``fswap`` is ``None``, otherwise the
        datasets ``fswap['wVOov']`` and ``fswap['wVooV']``.
    """
    tick = logger.process_clock(), logger.perf_counter()
    log = logger.Logger(mycc.stdout, mycc.verbose)
    nocc, nvir = t1.shape
    nvir_pair = nvir * (nvir + 1) // 2

    in_memory = fswap is None
    if in_memory:
        wVOov = numpy.zeros((nvir, nocc, nocc, nvir))
    else:
        wVOov = fswap.create_dataset('wVOov', (nvir, nocc, nocc, nvir), 'f8')
    wooVV = numpy.zeros((nocc, nocc * nvir_pair))

    # Block size from the memory left after the fixed-size intermediates.
    max_memory = mycc.max_memory - lib.current_memory()[0]
    words_left = max_memory * .95e6 / 8 - wooVV.size
    unit = nocc * nvir**2 * 3 + nocc**2 * nvir + 2
    if not mycc.direct:
        unit = unit + nocc * nvir**2 + nocc * nvir
        words_left = words_left - nocc**2 * nvir
    blksize = min(nvir, max(BLKMIN, int(words_left / unit)))
    log.debug1('max_memory %d MB, nocc,nvir = %d,%d blksize = %d',
               max_memory, nocc, nvir, blksize)

    def load_ovvv(out, start):
        # Nothing to read once the block start runs past the last virtual.
        if start >= nvir:
            return
        stop = min(nvir, start + blksize)
        out[:stop - start] = eris.ovvv[:, start:stop].transpose(1, 0, 2)

    with lib.call_in_background(load_ovvv,
                                sync=not mycc.async_io) as prefetch:
        current = numpy.empty((blksize, nocc, nvir_pair))
        upcoming = numpy.empty((blksize, nocc, nvir_pair))

        load_ovvv(upcoming, 0)
        for p0, p1 in lib.prange(0, nvir, blksize):
            # Swap buffers, then start reading the following block.
            current, upcoming = upcoming, current
            prefetch(upcoming, p1)

            nblk = p1 - p0
            packed = current[:nblk].reshape(nblk * nocc, nvir_pair)
            eris_vovv = lib.unpack_tril(packed).reshape(nblk, nocc, nvir, nvir)

            wVOov[p0:p1] = lib.einsum('biac,jc->bija', eris_vovv, t1)

            t2_blk = t2[:, :, p0:p1]
            theta = t2_blk.transpose(1, 2, 0, 3) * 2
            theta -= t2_blk.transpose(0, 2, 1, 3)
            t1new += lib.einsum('icjb,cjba->ia', theta, eris_vovv)
            theta = None
            tick = log.timer_debug1('vovv [%d:%d]' % (p0, p1), *tick)

    if in_memory:
        wooVV = lib.unpack_tril(wooVV.reshape(nocc**2, nvir_pair))
        wVooV = wooVV.reshape(nocc, nocc, nvir, nvir).transpose(2, 1, 0, 3)
        return wVOov, wVooV

    fswap.create_dataset('wVooV', (nvir, nocc, nocc, nvir), 'f8')
    wooVV = wooVV.reshape(nocc, nocc, nvir_pair)
    tril2sq = lib.square_mat_in_trilu_indices(nvir)
    for p0, p1 in lib.prange(0, nvir, blksize):
        expanded = wooVV[:, :, tril2sq[p0:p1]]
        fswap['wVooV'][p0:p1] = expanded.transpose(2, 1, 0, 3)
    return fswap['wVOov'], fswap['wVooV']
def _ao2mo_ovov(mp, orbo, orbv, feri, max_memory=2000, verbose=None):
    """Transform AO integrals to the ``ovov`` block and store it in ``feri``.

    Pass 1 evaluates the AO integrals in shell-pair blocks, contracts the two
    outer AO indices with ``orbo`` and writes every half-transformed block to
    a temporary HDF5 file.  Pass 2 reads those blocks back ``occblk`` occupied
    indices at a time, transforms the remaining two AO indices with ``orbv``
    and writes the rows of the ``'ovov'`` dataset.

    Args:
        mp: object providing ``mol``.
        orbo: occupied coefficients, shape ``(nao, nocc)``.
        orbv: virtual coefficients, shape ``(nao, nvir)``.
        feri: HDF5 file/group in which the ``'ovov'`` dataset is created.
        max_memory: memory budget in MB used to size the blocks.
        verbose: verbosity level or logger handed to ``logger.new_logger``.

    Returns:
        The ``'ovov'`` dataset of shape ``(nocc*nvir, nocc*nvir)``.
    """
    # time.clock() no longer exists on Python >= 3.8; use the logger's clock
    # helpers like the other routines in this file.
    time0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mp, verbose)

    mol = mp.mol
    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nao, nocc = orbo.shape
    nvir = orbv.shape[1]
    nbas = mol.nbas
    assert(nvir <= nao)

    ao_loc = mol.ao_loc_nr()
    dmax = max(4, min(nao/3, numpy.sqrt(max_memory*.95e6/8/(nao+nocc)**2)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    # From here on dmax is the largest block size actually produced.
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao,dmax,dmax,nao))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax, nocc**2*(nao*(nao+dmax)/2+nvir**2)*8/1e6)

    buf_i = numpy.empty((nocc*dmax**2*nao))
    buf_li = numpy.empty((nocc**2*dmax**2))
    buf1 = numpy.empty_like(buf_li)

    fint = gto.moleintor.getints4c
    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(ftmp.__setitem__) as save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            for jsh0, jsh1, nj in sh_ranges[:ip+1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0,i1,j0,j1))

                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=(0,nbas,ish0,ish1, jsh0,jsh1,0,nbas),
                           aosym='s1', ao_loc=ao_loc, cintopt=ao2mopt._cintopt,
                           out=eribuf)
                tmp_i = numpy.ndarray((nocc,(i1-i0)*(j1-j0)*nao), buffer=buf_i)
                tmp_li = numpy.ndarray((nocc,nocc*(i1-i0)*(j1-j0)), buffer=buf_li)
                lib.ddot(orbo.T, eri.reshape(nao,(i1-i0)*(j1-j0)*nao), c=tmp_i)
                lib.ddot(orbo.T, tmp_i.reshape(nocc*(i1-i0)*(j1-j0),nao).T, c=tmp_li)
                tmp_li = tmp_li.reshape(nocc,nocc,(i1-i0),(j1-j0))
                save(str(count), tmp_li.transpose(1,0,2,3))
                # Alternate the output buffers: the background writer may
                # still be reading the one just handed to save().
                buf_li, buf1 = buf1, buf_li
                count += 1
                time1 = log.timer_debug1('partial ao2mo [%d:%d,%d:%d]' %
                                         (ish0,ish1,jsh0,jsh1), *time1)
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)
    # Drop the references to the pass-1 work arrays.
    eri = eribuf = tmp_i = tmp_li = buf_i = buf_li = buf1 = None

    h5dat = feri.create_dataset('ovov', (nocc*nvir,nocc*nvir), 'f8',
                                chunks=(nvir,nvir))
    occblk = int(min(nocc, max(4, 250/nocc, max_memory*.9e6/8/(nao**2*nocc)/5)))

    def load(i0, eri):
        # Rebuild the (i, j, ao, ao) block for occupied indices i0:i1 from the
        # pass-1 pieces; off-diagonal pieces also fill the mirrored position.
        if i0 < nocc:
            i1 = min(i0+occblk, nocc)
            for k, (p0,p1,q0,q1) in enumerate(jk_blk_slices):
                eri[:i1-i0,:,p0:p1,q0:q1] = ftmp[str(k)][i0:i1]
                if p0 != q0:
                    dat = numpy.asarray(ftmp[str(k)][:,i0:i1])
                    eri[:i1-i0,:,q0:q1,p0:p1] = dat.transpose(1,0,3,2)

    def save(i0, i1, dat):
        for i in range(i0, i1):
            h5dat[i*nvir:(i+1)*nvir] = dat[i-i0].reshape(nvir,nocc*nvir)

    orbv = numpy.asarray(orbv, order='F')
    buf_prefetch = numpy.empty((occblk,nocc,nao,nao))
    buf = numpy.empty_like(buf_prefetch)
    bufw = numpy.empty((occblk*nocc,nvir**2))
    bufw1 = numpy.empty_like(bufw)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as bsave:
            # The first block is read synchronously, later ones in background.
            load(0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocc, occblk):
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(i1, buf_prefetch)
                eri = buf[:i1-i0].reshape((i1-i0)*nocc,nao,nao)

                dat = _ao2mo.nr_e2(eri, orbv, (0,nvir,0,nvir), 's1', 's1', out=bufw)
                bsave(i0, i1, dat.reshape(i1-i0,nocc,nvir,nvir).transpose(0,2,1,3))
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0,i1), *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    return h5dat
def general(eri,
            mo_coeffs,
            erifile,
            dataname='eri_mo',
            ioblk_size=IOBLK_SIZE,
            compact=True,
            verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.

    Args:
        eri : 8-fold reduced eri vector
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.

    Kwargs
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html). By
            assigning different dataname, the existed integral file can be
            reused. If the erifile contains the dataname, the new integrals
            data will overwrite the old one.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve
            performance
        compact : bool
            When compact is True, depending on the four oribital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals

    Returns:
        ``erifile``, unchanged.  A file opened here from a path is closed
        before returning.

    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        Assume eri's are 8-fold reduced.
        nij/nkl_pair = npair or i*j/k*l if only transforming a subset

        First half transform:
            Initialize half_eri of size (nij_pair,npair)
            For lo = 1 -> npair
                Unpack row lo
                Unpack row lo to matrix E_{uv}^{lo}
                Transform C_ui^+*E*C_nj -> E_{ij}^{lo}
                Ravel or pack E_{ij}^{lo}
                Save E_{ij}^{lo} -> half_eri[:,lo]

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
            For ij = 1 -> nij_pair
                Load and unpack half_eri[ij,:] -> E_{lo}^{ij}
                Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                Repack E_{kl}^{ij}
                Save E_{kl}^{ij} -> h5d_eri[ij,:]

        Each matrix is indexed by the composite index ij x kl, where ij/kl is
        either npair or ixj/kxl, if only a subset of MOs are being transformed.
        Since entire rows or columns need to be read in, the arrays are chunked
        such that IOBLK_SIZE = row/col x chunking col/row. For example, for the
        first half transform, we would save in nij_pair x IOBLK_SIZE/nij_pair,
        then load in IOBLK_SIZE/nkl_pair x npair for the second half transform.

        ------ kl ----->
        |jxl |
        ij
        |
        |
        v

        As a first guess, the chunking size is jxl. If the super-rows/cols are
        larger than IOBLK_SIZE, then the chunk rectangle jxl is trimmed
        accordingly. The pathological limiting case is where the dimensions
        nao_pair, nij_pair, or nkl_pair are so large that the arrays are
        chunked 1x1, in which case IOBLK_SIZE needs to be increased.
    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    nmoi = mo_coeffs[0].shape[1]
    nmoj = mo_coeffs[1].shape[1]
    nmok = mo_coeffs[2].shape[1]
    nmol = mo_coeffs[3].shape[1]
    nao = mo_coeffs[0].shape[0]
    nao_pair = nao * (nao + 1) // 2

    # NOTE: ij_red / kl_red are True when the pair index is NOT packed, i.e.
    # when the transformed block is ravelled instead of tril-packed.
    if compact and iden_coeffs(mo_coeffs[0], mo_coeffs[1]):
        ij_red = False
        nij_pair = nmoi * (nmoi + 1) // 2
    else:
        ij_red = True
        nij_pair = nmoi * nmoj

    if compact and iden_coeffs(mo_coeffs[2], mo_coeffs[3]):
        kl_red = False
        nkl_pair = nmok * (nmok + 1) // 2
    else:
        kl_red = True
        nkl_pair = nmok * nmol

    chunks_half = (max(1, numpy.minimum(
                       int(ioblk_size // (nao_pair * f8_size)), nmoj)),
                   max(1, numpy.minimum(
                       int(ioblk_size // (nij_pair * f8_size)), nmol)))
    # Ideally, the final transformed eris should have a chunk of nmoj x nmol
    # to optimize read operations.  However, the row size is chunked so that
    # the write operations during the transform can be done as fast as
    # possible.  The row count is kept >= 1: a zero would be an invalid HDF5
    # chunk shape and a zero modulus in the loop below.
    chunks_full = (max(1, numpy.minimum(
                       int(ioblk_size // (nkl_pair * f8_size)), nmoj)),
                   nmol)

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            # Explicit append mode: h5py >= 3 opens read-only by default.
            feri = h5py.File(erifile, 'a')
        else:
            feri = h5py.File(erifile, 'w', libver='latest')
    else:
        assert isinstance(erifile, h5py.Group)
        feri = erifile
    # Overwrite an existing dataset, as documented, for every kind of erifile.
    if dataname in feri:
        del feri[dataname]
    h5d_eri = feri.create_dataset(dataname, (nij_pair, nkl_pair), 'f8',
                                  chunks=chunks_full)
    feri_swap = lib.H5TmpFile(libver='latest')
    half_eri = feri_swap.create_dataset(dataname, (nij_pair, nao_pair), 'f8',
                                        chunks=chunks_half)

    log.debug('Memory information:')
    log.debug(' IOBLK_SIZE (MB): {}'.format(ioblk_size))
    log.debug(' jxl {}x{}, half eri chunk dim {}x{}'.format(
        nmoj, nmol, chunks_half[0], chunks_half[1]))
    log.debug(' jxl {}x{}, full eri chunk dim {}x{}'.format(
        nmoj, nmol, chunks_full[0], chunks_full[1]))
    log.debug(' Final disk eri size (MB): {:.3g}, chunked {:.3g}'.format(
        nij_pair * nkl_pair * f8_size, numpy.prod(chunks_full) * f8_size))
    log.debug(
        ' Half transformed eri size (MB): {:.3g}, chunked {:.3g}'.format(
            nij_pair * nao_pair * f8_size, numpy.prod(chunks_half) * f8_size))
    log.debug(' RAM buffer for half transform (MB): {:.3g}'.format(
        nij_pair * chunks_half[1] * f8_size * 2))
    log.debug(' RAM buffer for full transform (MB): {:.3g}'.format(
        f8_size * chunks_full[0] * nkl_pair * 2 +
        chunks_half[0] * nao_pair * f8_size * 2))

    def save1(piece, buf):
        # Write one block of columns of the half-transformed integrals.
        start = piece * chunks_half[1]
        stop = min((piece + 1) * chunks_half[1], nao_pair)
        half_eri[:, start:stop] = buf[:, :stop - start]

    def load2(piece, out):
        # Read one block of rows of the half-transformed integrals into `out`.
        # The destination is an explicit argument: resolving it through a
        # closure variable in the background thread could pick up the wrong
        # buffer after the main thread has swapped the two read buffers.
        start = piece * chunks_half[0]
        stop = min((piece + 1) * chunks_half[0], nij_pair)
        if start >= nij_pair:
            # Read-ahead past the end: fetch the last row, which is never used.
            start = stop - 1
        out[:stop - start, :] = half_eri[start:stop, :]

    def save2(piece, buf):
        # Write one block of rows of the fully transformed integrals.
        start = piece * chunks_full[0]
        stop = min((piece + 1) * chunks_full[0], nij_pair)
        h5d_eri[start:stop, :] = buf[:stop - start, :]

    # transform \mu\nu -> ij
    # time.clock() was removed in Python 3.8; use the logger's clock helpers.
    cput0 = logger.process_clock(), logger.perf_counter()
    Cimu = mo_coeffs[0].conj().transpose()
    buf_write = numpy.empty((nij_pair, chunks_half[1]))
    buf_out = numpy.empty_like(buf_write)
    wpiece = 0
    with lib.call_in_background(save1) as async_write:
        for lo in range(nao_pair):
            if lo % chunks_half[1] == 0 and lo > 0:
                # Hand the full buffer to the writer and fill the other one.
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece, buf_out)
                wpiece += 1
            buf = lib.unpack_row(eri, lo)
            uv = lib.unpack_tril(buf)
            uv = Cimu.dot(uv).dot(mo_coeffs[1])
            if ij_red:
                ij = numpy.ravel(uv)  # grabs by row
            else:
                ij = lib.pack_tril(uv)
            buf_write[:, lo % chunks_half[1]] = ij
    # final write operation & cleanup
    save1(wpiece, buf_write)
    log.timer('(uv|lo) -> (ij|lo)', *cput0)
    uv = None
    ij = None
    buf = None

    # transform \lambda\sigma -> kl
    cput1 = logger.process_clock(), logger.perf_counter()
    Cklam = mo_coeffs[2].conj().transpose()
    buf_write = numpy.empty((chunks_full[0], nkl_pair))
    buf_out = numpy.empty_like(buf_write)
    buf_read = numpy.empty((chunks_half[0], nao_pair))
    buf_prefetch = numpy.empty_like(buf_read)
    rpiece = 0
    wpiece = 0
    with lib.call_in_background(save2, load2) as (async_write, prefetch):
        load2(rpiece, buf_read)
        prefetch(rpiece + 1, buf_prefetch)
        for ij in range(nij_pair):
            if ij % chunks_full[0] == 0 and ij > 0:
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece, buf_out)
                wpiece += 1
            if ij % chunks_half[0] == 0 and ij > 0:
                # prefetch() waits for the pending read before starting the
                # next one, so buf_read is complete once it returns.
                buf_read, buf_prefetch = buf_prefetch, buf_read
                rpiece += 1
                prefetch(rpiece + 1, buf_prefetch)
            lo = lib.unpack_tril(buf_read[ij % chunks_half[0], :])
            lo = Cklam.dot(lo).dot(mo_coeffs[3])
            if kl_red:
                kl = numpy.ravel(lo)
            else:
                kl = lib.pack_tril(lo)
            buf_write[ij % chunks_full[0], :] = kl
    save2(wpiece, buf_write)
    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    if isinstance(erifile, str):
        feri.close()
    return erifile
def general(eri, mo_coeffs, erifile, dataname='eri_mo',
            ioblk_size=IOBLK_SIZE, compact=True, verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.

    Args:
        eri : 8-fold reduced eri vector
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.

    Kwargs
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html). By
            assigning different dataname, the existed integral file can be
            reused. If the erifile contains the dataname, the new integrals
            data will overwrite the old one.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve
            performance
        compact : bool
            When compact is True, depending on the four oribital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals

    Returns:
        ``erifile``, unchanged.  A file opened here from a path is closed
        before returning.

    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        Assume eri's are 8-fold reduced.
        nij/nkl_pair = npair or i*j/k*l if only transforming a subset

        First half transform:
            Initialize half_eri of size (nij_pair,npair)
            For lo = 1 -> npair
                Unpack row lo
                Unpack row lo to matrix E_{uv}^{lo}
                Transform C_ui^+*E*C_nj -> E_{ij}^{lo}
                Ravel or pack E_{ij}^{lo}
                Save E_{ij}^{lo} -> half_eri[:,lo]

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
            For ij = 1 -> nij_pair
                Load and unpack half_eri[ij,:] -> E_{lo}^{ij}
                Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                Repack E_{kl}^{ij}
                Save E_{kl}^{ij} -> h5d_eri[ij,:]

        Each matrix is indexed by the composite index ij x kl, where ij/kl is
        either npair or ixj/kxl, if only a subset of MOs are being transformed.
        Since entire rows or columns need to be read in, the arrays are chunked
        such that IOBLK_SIZE = row/col x chunking col/row. For example, for the
        first half transform, we would save in nij_pair x IOBLK_SIZE/nij_pair,
        then load in IOBLK_SIZE/nkl_pair x npair for the second half transform.

        ------ kl ----->
        |jxl |
        ij
        |
        |
        v

        As a first guess, the chunking size is jxl. If the super-rows/cols are
        larger than IOBLK_SIZE, then the chunk rectangle jxl is trimmed
        accordingly. The pathological limiting case is where the dimensions
        nao_pair, nij_pair, or nkl_pair are so large that the arrays are
        chunked 1x1, in which case IOBLK_SIZE needs to be increased.
    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    nmoi = mo_coeffs[0].shape[1]
    nmoj = mo_coeffs[1].shape[1]
    nmok = mo_coeffs[2].shape[1]
    nmol = mo_coeffs[3].shape[1]
    nao = mo_coeffs[0].shape[0]
    nao_pair = nao*(nao+1) // 2

    # NOTE: ij_red / kl_red are True when the pair index is NOT packed, i.e.
    # when the transformed block is ravelled instead of tril-packed.
    if compact and iden_coeffs(mo_coeffs[0], mo_coeffs[1]):
        ij_red = False
        nij_pair = nmoi*(nmoi+1) // 2
    else:
        ij_red = True
        nij_pair = nmoi*nmoj

    if compact and iden_coeffs(mo_coeffs[2], mo_coeffs[3]):
        kl_red = False
        nkl_pair = nmok*(nmok+1) // 2
    else:
        kl_red = True
        nkl_pair = nmok*nmol

    chunks_half = (max(1, numpy.minimum(int(ioblk_size//(nao_pair*f8_size)),nmoj)),
                   max(1, numpy.minimum(int(ioblk_size//(nij_pair*f8_size)),nmol)))
    # Ideally, the final transformed eris should have a chunk of nmoj x nmol
    # to optimize read operations.  However, the row size is chunked so that
    # the write operations during the transform can be done as fast as
    # possible.  The row count is kept >= 1: a zero would be an invalid HDF5
    # chunk shape and a zero modulus in the loop below.
    chunks_full = (max(1, numpy.minimum(int(ioblk_size//(nkl_pair*f8_size)),nmoj)),
                   nmol)

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            # Explicit append mode: h5py >= 3 opens read-only by default.
            feri = h5py.File(erifile, 'a')
        else:
            feri = h5py.File(erifile,'w',libver='latest')
    else:
        assert isinstance(erifile, h5py.Group)
        feri = erifile
    # Overwrite an existing dataset, as documented, for every kind of erifile.
    if dataname in feri:
        del feri[dataname]
    h5d_eri = feri.create_dataset(dataname,(nij_pair,nkl_pair),'f8',chunks=chunks_full)
    feri_swap = lib.H5TmpFile(libver='latest')
    half_eri = feri_swap.create_dataset(dataname,(nij_pair,nao_pair),'f8',chunks=chunks_half)

    log.debug('Memory information:')
    log.debug(' IOBLK_SIZE (MB): {}'.format(ioblk_size))
    log.debug(' jxl {}x{}, half eri chunk dim {}x{}'.format(nmoj,nmol,chunks_half[0],chunks_half[1]))
    log.debug(' jxl {}x{}, full eri chunk dim {}x{}'.format(nmoj,nmol,chunks_full[0],chunks_full[1]))
    log.debug(' Final disk eri size (MB): {:.3g}, chunked {:.3g}'
              .format(nij_pair*nkl_pair*f8_size,numpy.prod(chunks_full)*f8_size))
    log.debug(' Half transformed eri size (MB): {:.3g}, chunked {:.3g}'
              .format(nij_pair*nao_pair*f8_size,numpy.prod(chunks_half)*f8_size))
    log.debug(' RAM buffer for half transform (MB): {:.3g}'
              .format(nij_pair*chunks_half[1]*f8_size*2))
    log.debug(' RAM buffer for full transform (MB): {:.3g}'
              .format(f8_size*chunks_full[0]*nkl_pair*2 + chunks_half[0]*nao_pair*f8_size*2))

    def save1(piece,buf):
        # Write one block of columns of the half-transformed integrals.
        start = piece*chunks_half[1]
        stop = min((piece+1)*chunks_half[1], nao_pair)
        half_eri[:,start:stop] = buf[:,:stop-start]

    def load2(piece,out):
        # Read one block of rows of the half-transformed integrals into `out`.
        # The destination is an explicit argument: resolving it through a
        # closure variable in the background thread could pick up the wrong
        # buffer after the main thread has swapped the two read buffers.
        start = piece*chunks_half[0]
        stop = min((piece+1)*chunks_half[0], nij_pair)
        if start >= nij_pair:
            # Read-ahead past the end: fetch the last row, which is never used.
            start = stop - 1
        out[:stop-start,:] = half_eri[start:stop,:]

    def save2(piece,buf):
        # Write one block of rows of the fully transformed integrals.
        start = piece*chunks_full[0]
        stop = min((piece+1)*chunks_full[0], nij_pair)
        h5d_eri[start:stop,:] = buf[:stop-start,:]

    # transform \mu\nu -> ij
    # time.clock() was removed in Python 3.8; use the logger's clock helpers.
    cput0 = logger.process_clock(), logger.perf_counter()
    Cimu = mo_coeffs[0].conj().transpose()
    buf_write = numpy.empty((nij_pair,chunks_half[1]))
    buf_out = numpy.empty_like(buf_write)
    wpiece = 0
    with lib.call_in_background(save1) as async_write:
        for lo in range(nao_pair):
            if lo % chunks_half[1] == 0 and lo > 0:
                # Hand the full buffer to the writer and fill the other one.
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece,buf_out)
                wpiece += 1
            buf = lib.unpack_row(eri,lo)
            uv = lib.unpack_tril(buf)
            uv = Cimu.dot(uv).dot(mo_coeffs[1])
            if ij_red:
                ij = numpy.ravel(uv)  # grabs by row
            else:
                ij = lib.pack_tril(uv)
            buf_write[:,lo % chunks_half[1]] = ij
    # final write operation & cleanup
    save1(wpiece,buf_write)
    log.timer('(uv|lo) -> (ij|lo)', *cput0)
    uv = None
    ij = None
    buf = None

    # transform \lambda\sigma -> kl
    cput1 = logger.process_clock(), logger.perf_counter()
    Cklam = mo_coeffs[2].conj().transpose()
    buf_write = numpy.empty((chunks_full[0],nkl_pair))
    buf_out = numpy.empty_like(buf_write)
    buf_read = numpy.empty((chunks_half[0],nao_pair))
    buf_prefetch = numpy.empty_like(buf_read)
    rpiece = 0
    wpiece = 0
    with lib.call_in_background(save2,load2) as (async_write,prefetch):
        load2(rpiece,buf_read)
        prefetch(rpiece+1,buf_prefetch)
        for ij in range(nij_pair):
            if ij % chunks_full[0] == 0 and ij > 0:
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece,buf_out)
                wpiece += 1
            if ij % chunks_half[0] == 0 and ij > 0:
                # prefetch() waits for the pending read before starting the
                # next one, so buf_read is complete once it returns.
                buf_read, buf_prefetch = buf_prefetch, buf_read
                rpiece += 1
                prefetch(rpiece+1,buf_prefetch)
            lo = lib.unpack_tril(buf_read[ij % chunks_half[0],:])
            lo = Cklam.dot(lo).dot(mo_coeffs[3])
            if kl_red:
                kl = numpy.ravel(lo)
            else:
                kl = lib.pack_tril(lo)
            buf_write[ij % chunks_full[0],:] = kl
    save2(wpiece,buf_write)
    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    if isinstance(erifile, str):
        feri.close()
    return erifile
def kernel(mycc, eris, t1=None, t2=None, verbose=logger.NOTE):
    '''Evaluate the CCSD(T) energy correction.

    Args:
        mycc : CC object. ``t1``/``t2`` (when not given), ``incore_complete``,
            ``max_memory`` and ``async_io`` are read from it.
        eris : integral container. ``eris.ovoo.dtype`` is read here; the
            object is otherwise only passed on to the sorting helpers.
        t1 : 2D array of shape (nocc, nvir). Defaults to ``mycc.t1``.
        t2 : doubles amplitudes. Defaults to ``mycc.t2``.
        verbose : print level, forwarded to ``logger.new_logger``.

    Returns:
        The real part of the accumulated energy (after the factor of 2).
    '''
    # time.clock() was removed in Python 3.8.  Prefer the clocks exposed by
    # the logger module and fall back to the legacy pair on older PySCF.
    cpu_clock = getattr(logger, 'process_clock', None) or time.clock
    wall_clock = getattr(logger, 'perf_counter', None) or time.time
    cpu1 = cpu0 = (cpu_clock(), wall_clock())
    log = logger.new_logger(mycc, verbose)
    if t1 is None:
        t1 = mycc.t1
    if t2 is None:
        t2 = mycc.t2

    nocc, nvir = t1.shape
    nmo = nocc + nvir

    dtype = numpy.result_type(t1, t2, eris.ovoo.dtype)
    if mycc.incore_complete:
        # Keep the sorted integrals in memory.
        ftmp = None
        eris_vvop = numpy.zeros((nvir,nvir,nocc,nmo), dtype)
    else:
        # Keep the sorted integrals in a temporary HDF5 file.
        ftmp = lib.H5TmpFile()
        eris_vvop = ftmp.create_dataset('vvop', (nvir,nvir,nocc,nmo), dtype)

    orbsym = _sort_eri(mycc, eris, nocc, nvir, eris_vvop, log)

    mo_energy, t1T, t2T, vooo, fvo, restore_t2_inplace = \
            _sort_t2_vooo_(mycc, orbsym, t1, t2, eris)
    cpu1 = log.timer_debug1('CCSD(T) sort_eri', *cpu1)

    # Mutable copy so that the background ``contract`` can update it in place.
    cpu2 = list(cpu1)
    orbsym = numpy.hstack((numpy.sort(orbsym[:nocc]),numpy.sort(orbsym[nocc:])))
    o_ir_loc = numpy.append(0, numpy.cumsum(numpy.bincount(orbsym[:nocc], minlength=8)))
    v_ir_loc = numpy.append(0, numpy.cumsum(numpy.bincount(orbsym[nocc:], minlength=8)))
    o_sym = orbsym[:nocc]
    oo_sym = (o_sym[:,None] ^ o_sym).ravel()
    oo_ir_loc = numpy.append(0, numpy.cumsum(numpy.bincount(oo_sym, minlength=8)))
    nirrep = max(oo_sym) + 1

    # The C driver receives these arrays as raw pointers; force 32-bit ints.
    orbsym = orbsym.astype(numpy.int32)
    o_ir_loc = o_ir_loc.astype(numpy.int32)
    v_ir_loc = v_ir_loc.astype(numpy.int32)
    oo_ir_loc = oo_ir_loc.astype(numpy.int32)
    # ``numpy.complex`` (an alias of the builtin complex) was removed in
    # NumPy 1.24; complex128 is the equivalent dtype.
    if dtype == numpy.complex128:
        drv = _ccsd.libcc.CCsd_t_zcontract
    else:
        drv = _ccsd.libcc.CCsd_t_contract
    et_sum = numpy.zeros(1, dtype=dtype)

    def contract(a0, a1, b0, b1, cache):
        # Hand one (a-block, b-block) pair to the C driver, which accumulates
        # into et_sum.
        cache_row_a, cache_col_a, cache_row_b, cache_col_b = cache
        drv(et_sum.ctypes.data_as(ctypes.c_void_p),
            mo_energy.ctypes.data_as(ctypes.c_void_p),
            t1T.ctypes.data_as(ctypes.c_void_p),
            t2T.ctypes.data_as(ctypes.c_void_p),
            vooo.ctypes.data_as(ctypes.c_void_p),
            fvo.ctypes.data_as(ctypes.c_void_p),
            ctypes.c_int(nocc), ctypes.c_int(nvir),
            ctypes.c_int(a0), ctypes.c_int(a1),
            ctypes.c_int(b0), ctypes.c_int(b1),
            ctypes.c_int(nirrep),
            o_ir_loc.ctypes.data_as(ctypes.c_void_p),
            v_ir_loc.ctypes.data_as(ctypes.c_void_p),
            oo_ir_loc.ctypes.data_as(ctypes.c_void_p),
            orbsym.ctypes.data_as(ctypes.c_void_p),
            cache_row_a.ctypes.data_as(ctypes.c_void_p),
            cache_col_a.ctypes.data_as(ctypes.c_void_p),
            cache_row_b.ctypes.data_as(ctypes.c_void_p),
            cache_col_b.ctypes.data_as(ctypes.c_void_p))
        cpu2[:] = log.timer_debug1('contract %d:%d,%d:%d'%(a0,a1,b0,b1), *cpu2)

    # The rest 20% memory for cache b
    mem_now = lib.current_memory()[0]
    max_memory = max(0, mycc.max_memory - mem_now)
    bufsize = (max_memory*.5e6/8-nocc**3*3*lib.num_threads())/(nocc*nmo)  #*.5 for async_io
    bufsize *= .5  #*.5 upper triangular part is loaded
    bufsize *= .8  #*.8 for [a0:a1]/[b0:b1] partition
    bufsize = max(8, bufsize)
    log.debug('max_memory %d MB (%d MB in use)', max_memory, mem_now)
    with lib.call_in_background(contract, sync=not mycc.async_io) as async_contract:
        for a0, a1 in reversed(list(lib.prange_tril(0, nvir, bufsize))):
            cache_row_a = numpy.asarray(eris_vvop[a0:a1,:a1], order='C')
            if a0 == 0:
                cache_col_a = cache_row_a
            else:
                cache_col_a = numpy.asarray(eris_vvop[:a0,a0:a1], order='C')
            # Diagonal pair: the a-block contracted with itself.
            async_contract(a0, a1, a0, a1, (cache_row_a,cache_col_a,
                                            cache_row_a,cache_col_a))

            # Off-diagonal pairs: every b-block below the current a-block.
            for b0, b1 in lib.prange_tril(0, a0, bufsize/8):
                cache_row_b = numpy.asarray(eris_vvop[b0:b1,:b1], order='C')
                if b0 == 0:
                    cache_col_b = cache_row_b
                else:
                    cache_col_b = numpy.asarray(eris_vvop[:b0,b0:b1], order='C')
                async_contract(a0, a1, b0, b1, (cache_row_a,cache_col_a,
                                                cache_row_b,cache_col_b))

    t2 = restore_t2_inplace(t2T)
    et_sum *= 2
    if abs(et_sum[0].imag) > 1e-4:
        logger.warn(mycc, 'Non-zero imaginary part of CCSD(T) energy was found %s',
                    et_sum[0])
    et = et_sum[0].real
    log.timer('CCSD(T)', *cpu0)
    log.note('CCSD(T) correction = %.15g', et)
    return et
'''
This example shows how to use the call_in_background macro
'''

import time
from pyscf import lib


def fa():
    # Task "a": announce itself, then block for 0.5 s.
    print('a')
    time.sleep(0.5)


def fb():
    # Task "b": announce itself, then block for 0.8 s.
    print('b')
    time.sleep(0.8)


# Type 1: each function gets its own call_in_background context.
print('type 1')
started = time.time()
with lib.call_in_background(fa) as afa, lib.call_in_background(fb) as afb:
    for _ in range(3):
        afa()
        afb()
elapsed = time.time() - started
print('total time = %.1f s = [fb]0.8 * 3 seconds' % elapsed)

# Type 2: both functions are registered in one call_in_background context.
print('type 2')
started = time.time()
with lib.call_in_background(fa, fb) as (afa, afb):
    for _ in range(3):
        afa()
        afb()
elapsed = time.time() - started
print('total time = %.1f s = ([fa]0.5 + [fb]0.8) * 3 seconds' % elapsed)
def _make_j3c(mydf, cell, auxcell, kptij_lst, cderi_file):
    '''Build the density-fitting 3-index tensors and store them in
    ``cderi_file`` (HDF5).

    The work is done in three stages:

    1. For each unique ``kptj - kpti`` the 2-centre metric is corrected in
       G-space, decomposed (Cholesky, or eigen-decomposition as fallback)
       and stored under ``j2c/<k>``.
    2. Jobs from ``grids2d_int3c_jobs`` are distributed over the MPI
       workers; each job writes its 3-centre integrals under
       ``j3c-chunks/<job>/<kpt pair>`` and then applies the G-space
       correction and the decomposed metric.
    3. The chunks are exchanged between the workers (``mpi.alltoall``) and
       assembled into ``j3c/<kpt pair>``; the temporary ``j3c-chunks``
       group is removed and ``j3c-kptij`` is written.

    Args:
        mydf : DF object (``stdout``, ``verbose``, ``auxcell``, ``gs``,
            ``max_memory``, ``weighted_coulG``, ``auxbar``, ``mpi_prange``
            are used).
        cell : the cell providing the AO basis.
        auxcell : the cell providing the auxiliary basis.
        kptij_lst : array whose columns 0 and 1 are read as kpti and kptj.
        cderi_file : path of the HDF5 file to create or update.

    Returns:
        None.  Results are written to ``cderi_file``.
    '''
    log = logger.Logger(mydf.stdout, mydf.verbose)
    # time.clock() was removed in Python 3.8.  Prefer the clocks exposed by
    # the logger module and fall back to the legacy pair on older PySCF.
    cpu_clock = getattr(logger, 'process_clock', None) or time.clock
    wall_clock = getattr(logger, 'perf_counter', None) or time.time
    t1 = t0 = (cpu_clock(), wall_clock())

    fused_cell, fuse = fuse_auxcell(mydf, mydf.auxcell)
    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    naux = auxcell.nao_nr()
    nkptij = len(kptij_lst)
    gs = mydf.gs
    Gv, Gvbase, kws = cell.get_Gv_weights(gs)
    b = cell.reciprocal_vectors()
    gxyz = lib.cartesian_prod([numpy.arange(len(x)) for x in Gvbase])
    ngs = gxyz.shape[0]

    kptis = kptij_lst[:, 0]
    kptjs = kptij_lst[:, 1]
    kpt_ji = kptjs - kptis
    uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
    log.debug('Num uniq kpts %d', len(uniq_kpts))
    log.debug2('uniq_kpts %s', uniq_kpts)
    # j2c ~ (-kpt_ji | kpt_ji)
    j2c = fused_cell.pbc_intor('int2c2e_sph', hermi=1, kpts=uniq_kpts)
    j2ctags = []
    nauxs = []
    t1 = log.timer_debug1('2c2e', *t1)

    if h5py.is_hdf5(cderi_file):
        # The file is written below, so request read/write explicitly:
        # h5py >= 3.0 opens read-only when no mode is given.
        feri = h5py.File(cderi_file, 'a')
    else:
        feri = h5py.File(cderi_file, 'w')
    for k, kpt in enumerate(uniq_kpts):
        aoaux = ft_ao.ft_ao(fused_cell, Gv, None, b, gxyz, Gvbase, kpt).T
        coulG = numpy.sqrt(mydf.weighted_coulG(kpt, False, gs))
        kLR = (aoaux.real * coulG).T
        kLI = (aoaux.imag * coulG).T
        if not kLR.flags.c_contiguous:
            kLR = lib.transpose(kLR.T)
        if not kLI.flags.c_contiguous:
            kLI = lib.transpose(kLI.T)
        aoaux = None

        kLR1 = numpy.asarray(kLR[:, naux:], order='C')
        kLI1 = numpy.asarray(kLI[:, naux:], order='C')
        if is_zero(kpt):  # kpti == kptj
            for p0, p1 in mydf.mpi_prange(0, ngs):
                j2cR = lib.ddot(kLR1[p0:p1].T, kLR[p0:p1])
                j2cR = lib.ddot(kLI1[p0:p1].T, kLI[p0:p1], 1, j2cR, 1)
                j2c[k][naux:] -= mpi.allreduce(j2cR)
                j2c[k][:naux, naux:] = j2c[k][naux:, :naux].T
        else:
            for p0, p1 in mydf.mpi_prange(0, ngs):
                j2cR, j2cI = zdotCN(kLR1[p0:p1].T, kLI1[p0:p1].T,
                                    kLR[p0:p1], kLI[p0:p1])
                j2cR = mpi.allreduce(j2cR)
                j2cI = mpi.allreduce(j2cI)
                j2c[k][naux:] -= j2cR + j2cI * 1j
                j2c[k][:naux, naux:] = j2c[k][naux:, :naux].T.conj()
        j2c[k] = fuse(fuse(j2c[k]).T).T
        try:
            feri['j2c/%d' % k] = scipy.linalg.cholesky(j2c[k], lower=True)
            j2ctags.append('CD')
            nauxs.append(naux)
        except scipy.linalg.LinAlgError:
            # The metric is not positive definite (gs is likely too small).
            # Fall back to an eigen-decomposition of *this* k-point's metric
            # and drop the near-linearly-dependent functions.
            w, v = scipy.linalg.eigh(j2c[k])
            log.debug2('metric linear dependency for kpt %s', k)
            log.debug2('cond = %.4g, drop %d bfns',
                       w[0] / w[-1], numpy.count_nonzero(w < LINEAR_DEP_THR))
            v = v[:, w > LINEAR_DEP_THR].T.conj()
            v /= numpy.sqrt(w[w > LINEAR_DEP_THR]).reshape(-1, 1)
            feri['j2c/%d' % k] = v
            j2ctags.append('eig')
            nauxs.append(v.shape[0])
        kLR = kLI = kLR1 = kLI1 = coulG = None
    j2c = None

    # True for the k-point pairs with kpti == kptj.
    aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9
    j_only = numpy.all(aosym_s2)
    if gamma_point(kptij_lst):
        dtype = 'f8'
    else:
        dtype = 'c16'
    vbar = mydf.auxbar(fused_cell)
    vbar = fuse(vbar)
    ovlp = cell.pbc_intor('int1e_ovlp_sph', hermi=1, kpts=kptjs[aosym_s2])
    ovlp = [lib.pack_tril(s) for s in ovlp]
    t1 = log.timer_debug1('aoaux and int2c', *t1)

    # Estimates the buffer size based on the last contraction in G-space.
    # This contraction requires to hold nkptj copies of (naux,?) array
    # simultaneously in memory.
    mem_now = max(comm.allgather(lib.current_memory()[0]))
    max_memory = max(2000, mydf.max_memory - mem_now)
    nkptj_max = max((uniq_inverse == x).sum() for x in set(uniq_inverse))
    buflen = max(int(min(max_memory * .5e6 / 16 / naux / (nkptj_max + 2) / nao,
                         nao / 3 / mpi.pool.size)), 1)
    chunks = (buflen, nao)

    j3c_jobs = grids2d_int3c_jobs(cell, auxcell, kptij_lst, chunks, j_only)
    log.debug1('max_memory = %d MB (%d in use) chunks %s',
               max_memory, mem_now, chunks)
    log.debug2('j3c_jobs %s', j3c_jobs)

    if j_only:
        int3c = wrap_int3c(cell, fused_cell, 'int3c2e_sph', 's2', 1, kptij_lst)
    else:
        int3c = wrap_int3c(cell, fused_cell, 'int3c2e_sph', 's1', 1, kptij_lst)
        # Flat indices of the lower triangle; only needed to pack s1 -> s2.
        idxb = numpy.tril_indices(nao)
        idxb = (idxb[0] * nao + idxb[1]).astype('i')
    aux_loc = fused_cell.ao_loc_nr('ssc' in 'int3c2e_sph')

    def gen_int3c(auxcell, job_id, ish0, ish1):
        # Compute the 3-centre integrals for AO shells [ish0:ish1) and store
        # them in feri['j3c-chunks/<job_id>/<kpt pair>'].
        dataname = 'j3c-chunks/%d' % job_id
        if dataname in feri:
            del (feri[dataname])

        i0 = ao_loc[ish0]
        i1 = ao_loc[ish1]
        dii = i1 * (i1 + 1) // 2 - i0 * (i0 + 1) // 2
        dij = (i1 - i0) * nao
        if j_only:
            buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dii + dii)))
        else:
            buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dij + dij)))
        auxranges = balance_segs(aux_loc[1:] - aux_loc[:-1], buflen)
        buflen = max([x[2] for x in auxranges])
        buf = numpy.empty(nkptij * dij * buflen, dtype=dtype)
        buf1 = numpy.empty(dij * buflen, dtype=dtype)

        naux = aux_loc[-1]
        for kpt_id, kptij in enumerate(kptij_lst):
            key = '%s/%d' % (dataname, kpt_id)
            if aosym_s2[kpt_id]:
                shape = (naux, dii)
            else:
                shape = (naux, dij)
            if gamma_point(kptij):
                feri.create_dataset(key, shape, 'f8')
            else:
                feri.create_dataset(key, shape, 'c16')

        naux0 = 0
        for istep, auxrange in enumerate(auxranges):
            log.alldebug2("aux_e2 job_id %d step %d", job_id, istep)
            sh0, sh1, nrow = auxrange
            sub_slice = (ish0, ish1, 0, cell.nbas, sh0, sh1)
            if j_only:
                mat = numpy.ndarray((nkptij, dii, nrow), dtype=dtype, buffer=buf)
            else:
                mat = numpy.ndarray((nkptij, dij, nrow), dtype=dtype, buffer=buf)
            mat = int3c(sub_slice, mat)

            for k, kptij in enumerate(kptij_lst):
                h5dat = feri['%s/%d' % (dataname, k)]
                v = lib.transpose(mat[k], out=buf1)
                if not j_only and aosym_s2[k]:
                    # Integrals were generated with s1 symmetry; keep only
                    # the lower-triangular pairs for this k-point pair.
                    idy = idxb[i0 * (i0 + 1) // 2:i1 * (i1 + 1) // 2] - i0 * nao
                    out = numpy.ndarray((nrow, dii), dtype=v.dtype, buffer=mat[k])
                    v = numpy.take(v, idy, axis=1, out=out)
                if gamma_point(kptij):
                    h5dat[naux0:naux0 + nrow] = v.real
                else:
                    h5dat[naux0:naux0 + nrow] = v
            naux0 += nrow

    def ft_fuse(job_id, uniq_kptji_id, sh0, sh1):
        # Apply the G-space correction and the decomposed metric to the
        # chunks of one job, for all k-point pairs sharing uniq_kptji_id.
        kpt = uniq_kpts[uniq_kptji_id]  # kpt = kptj - kpti
        adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0]
        adapted_kptjs = kptjs[adapted_ji_idx]
        nkptj = len(adapted_kptjs)

        shls_slice = (auxcell.nbas, fused_cell.nbas)
        Gaux = ft_ao.ft_ao(fused_cell, Gv, shls_slice, b, gxyz, Gvbase, kpt)
        Gaux *= mydf.weighted_coulG(kpt, False, gs).reshape(-1, 1)
        kLR = Gaux.real.copy('C')
        kLI = Gaux.imag.copy('C')
        j2c = numpy.asarray(feri['j2c/%d' % uniq_kptji_id])
        j2ctag = j2ctags[uniq_kptji_id]
        naux0 = j2c.shape[0]

        if is_zero(kpt):
            aosym = 's2'
        else:
            aosym = 's1'

        j3cR = [None] * nkptj
        j3cI = [None] * nkptj
        i0 = ao_loc[sh0]
        i1 = ao_loc[sh1]
        for k, idx in enumerate(adapted_ji_idx):
            key = 'j3c-chunks/%d/%d' % (job_id, idx)
            v = numpy.asarray(feri[key])
            if is_zero(kpt):
                for i, c in enumerate(vbar):
                    if c != 0:
                        v[i] -= c * ovlp[k][i0 * (i0 + 1) // 2:i1 * (i1 + 1) // 2].ravel()
            j3cR[k] = numpy.asarray(v.real, order='C')
            if v.dtype == numpy.complex128:
                j3cI[k] = numpy.asarray(v.imag, order='C')
            v = None

        ncol = j3cR[0].shape[1]
        Gblksize = max(16, int(max_memory * 1e6 / 16 / ncol / (nkptj + 1)))  # +1 for pqkRbuf/pqkIbuf
        Gblksize = min(Gblksize, ngs, 16384)
        pqkRbuf = numpy.empty(ncol * Gblksize)
        pqkIbuf = numpy.empty(ncol * Gblksize)
        buf = numpy.empty(nkptj * ncol * Gblksize, dtype=numpy.complex128)
        log.alldebug2(' blksize (%d,%d)', Gblksize, ncol)

        shls_slice = (sh0, sh1, 0, cell.nbas)
        for p0, p1 in lib.prange(0, ngs, Gblksize):
            dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym, b,
                                        gxyz[p0:p1], Gvbase, kpt,
                                        adapted_kptjs, out=buf)
            nG = p1 - p0
            for k, ji in enumerate(adapted_ji_idx):
                aoao = dat[k].reshape(nG, ncol)
                pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf)
                pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf)
                pqkR[:] = aoao.real.T
                pqkI[:] = aoao.imag.T

                lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k][naux:], 1)
                lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k][naux:], 1)
                if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])):
                    lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k][naux:], 1)
                    lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k][naux:], 1)

        for k, idx in enumerate(adapted_ji_idx):
            if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                v = fuse(j3cR[k])
            else:
                v = fuse(j3cR[k] + j3cI[k] * 1j)
            if j2ctag == 'CD':
                v = scipy.linalg.solve_triangular(j2c, v, lower=True,
                                                  overwrite_b=True)
            else:
                v = lib.dot(j2c, v)
            feri['j3c-chunks/%d/%d' % (job_id, idx)][:naux0] = v

    t2 = t1
    # j3c_workers[job_id] records which rank processed each job.
    j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int)
    #for job_id, ish0, ish1 in mpi.work_share_partition(j3c_jobs):
    for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs):
        gen_int3c(fused_cell, job_id, ish0, ish1)
        t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2)

        for k, kpt in enumerate(uniq_kpts):
            ft_fuse(job_id, k, ish0, ish1)
            t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2)

        j3c_workers[job_id] = rank
    j3c_workers = mpi.allreduce(j3c_workers)
    log.debug2('j3c_workers %s', j3c_workers)
    j2c = kLRs = kLIs = ovlp = vbar = fuse = gen_int3c = ft_fuse = None
    t1 = log.timer_debug1('int3c and fuse', *t1)

    def get_segs_loc(aosym):
        # Column ranges of every job inside the data gathered from all
        # workers, returned in job order.
        off0 = numpy.asarray([ao_loc[i0] for x, i0, i1 in j3c_jobs])
        off1 = numpy.asarray([ao_loc[i1] for x, i0, i1 in j3c_jobs])
        if aosym:  # s2
            dims = off1 * (off1 + 1) // 2 - off0 * (off0 + 1) // 2
        else:
            dims = (off1 - off0) * nao
        #dims = numpy.asarray([ao_loc[i1]-ao_loc[i0] for x,i0,i1 in j3c_jobs])
        dims = numpy.hstack([dims[j3c_workers == w]
                             for w in range(mpi.pool.size)])
        job_idx = numpy.hstack([numpy.where(j3c_workers == w)[0]
                                for w in range(mpi.pool.size)])
        segs_loc = numpy.append(0, numpy.cumsum(dims))
        segs_loc = [(segs_loc[j], segs_loc[j + 1])
                    for j in numpy.argsort(job_idx)]
        return segs_loc
    segs_loc_s1 = get_segs_loc(False)
    segs_loc_s2 = get_segs_loc(True)

    if 'j3c' in feri:
        del (feri['j3c'])
    # Each rank owns the auxiliary rows [naux0, naux0+segsize).
    segsize = (max(nauxs) + mpi.pool.size - 1) // mpi.pool.size
    naux0 = rank * segsize
    for k, kptij in enumerate(kptij_lst):
        naux1 = min(nauxs[uniq_inverse[k]], naux0 + segsize)
        nrow = max(0, naux1 - naux0)
        if gamma_point(kptij):
            dtype = 'f8'
        else:
            dtype = 'c16'
        if aosym_s2[k]:
            nao_pair = nao * (nao + 1) // 2
        else:
            nao_pair = nao * nao
        feri.create_dataset('j3c/%d' % k, (nrow, nao_pair), dtype,
                            maxshape=(None, nao_pair))

    def load(k, p0, p1):
        # Collect, for every destination rank, the rows of the locally
        # generated chunks that this rank has to send.
        naux1 = nauxs[uniq_inverse[k]]
        slices = [(min(i * segsize + p0, naux1), min(i * segsize + p1, naux1))
                  for i in range(mpi.pool.size)]
        segs = []
        for p0, p1 in slices:
            val = []
            for job_id, worker in enumerate(j3c_workers):
                if rank == worker:
                    key = 'j3c-chunks/%d/%d' % (job_id, k)
                    val.append(feri[key][p0:p1].ravel())
            if val:
                segs.append(numpy.hstack(val))
            else:
                segs.append(numpy.zeros(0))
        return segs

    def save(k, p0, p1, segs):
        # Exchange the segments and write the rows owned by this rank.
        segs = mpi.alltoall(segs)
        naux1 = nauxs[uniq_inverse[k]]
        loc0, loc1 = min(p0, naux1 - naux0), min(p1, naux1 - naux0)
        nL = loc1 - loc0
        if nL > 0:
            if aosym_s2[k]:
                segs = numpy.hstack([segs[i0 * nL:i1 * nL].reshape(nL, -1)
                                     for i0, i1 in segs_loc_s2])
            else:
                segs = numpy.hstack([segs[i0 * nL:i1 * nL].reshape(nL, -1)
                                     for i0, i1 in segs_loc_s1])
            feri['j3c/%d' % k][loc0:loc1] = segs

    mem_now = max(comm.allgather(lib.current_memory()[0]))
    max_memory = max(2000, min(8000, mydf.max_memory - mem_now))
    if numpy.all(aosym_s2):
        if gamma_point(kptij_lst):
            blksize = max(16, int(max_memory * .5e6 / 8 / nao**2))
        else:
            blksize = max(16, int(max_memory * .5e6 / 16 / nao**2))
    else:
        blksize = max(16, int(max_memory * .5e6 / 16 / nao**2 / 2))
    log.debug1('max_momory %d MB (%d in use), blksize %d',
               max_memory, mem_now, blksize)

    t2 = t1
    with lib.call_in_background(save) as async_write:
        for k, kptji in enumerate(kptij_lst):
            for p0, p1 in lib.prange(0, segsize, blksize):
                segs = load(k, p0, p1)
                async_write(k, p0, p1, segs)
                t2 = log.timer_debug1('assemble k=%d %d:%d (in %d)' %
                                      (k, p0, p1, segsize), *t2)

    if 'j3c-chunks' in feri:
        del (feri['j3c-chunks'])
    if 'j3c-kptij' in feri:
        del (feri['j3c-kptij'])
    feri['j3c-kptij'] = kptij_lst
    t1 = log.alltimer_debug1('assembling j3c', *t1)
    feri.close()
def general(mol, mo_coeffs, erifile, dataname='eri_mo',
            intor='int2e', aosym='s4', comp=None,
            max_memory=MAX_MEMORY, ioblk_size=IOBLK_SIZE, verbose=logger.WARN,
            compact=True):
    r'''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on the fly.

    The transformation runs in two passes: ``half_e1`` writes the
    half-transformed integrals to a temporary HDF5 file, then the second
    index pair is transformed block by block while the next input block is
    prefetched and the previous output block is written in the background.

    Args:
        mol : :class:`Mole` object
            AO integrals will be generated in terms of mol._atm, mol._bas, mol._env
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.

    Kwargs
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By assigning
            different dataname, the existing integral file can be reused.  If
            the erifile contains the dataname, the new integrals data will
            overwrite the old one.
        intor : str
            Name of the 2-electron integral.  Ref to :func:`getints_by_shell`
            for the complete list of available 2-electron integral names
        aosym : int or str
            Permutation symmetry for the AO integrals

            | 4 or '4' or 's4': 4-fold symmetry (default)
            | '2ij' or 's2ij' : symmetry between i, j in (ij|kl)
            | '2kl' or 's2kl' : symmetry between k, l in (ij|kl)
            | 1 or '1' or 's1': no symmetry
            | 'a4ij' : 4-fold symmetry with anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a4kl' : 4-fold symmetry with anti-symmetry between k, l in (ij|kl) (TODO)
            | 'a2ij' : anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a2kl' : anti-symmetry between k, l in (ij|kl) (TODO)

        comp : int
            Components of the integrals, e.g. int2e_ip_sph has 3 components.
        max_memory : float or int
            The maximum size of cache to use (in MB), large cache may **not**
            improve performance.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        verbose : int
            Print level
        compact : bool
            When compact is True, depending on the four orbital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals

    Returns:
        The ``erifile`` argument, unchanged.  The integrals themselves are
        written to the dataset ``dataname`` of that file/group.

    Raises:
        NotImplementedError: if any of ``mo_coeffs`` has dtype complex128.

    Examples:

    >>> from pyscf import gto
    >>> from pyscf import ao2mo
    >>> import h5py
    >>> def view(h5file, dataname='eri_mo'):
    ...     f5 = h5py.File(h5file, 'r')
    ...     print('dataset %s, shape %s' % (str(f5.keys()), str(f5[dataname].shape)))
    ...     f5.close()
    >>> mol = gto.M(atom='O 0 0 0; H 0 1 0; H 0 0 1', basis='sto3g')
    >>> mo1 = numpy.random.random((mol.nao_nr(), 10))
    >>> mo2 = numpy.random.random((mol.nao_nr(), 8))
    >>> mo3 = numpy.random.random((mol.nao_nr(), 6))
    >>> mo4 = numpy.random.random((mol.nao_nr(), 4))
    >>> ao2mo.outcore.general(mol, (mo1,mo2,mo3,mo4), 'oh2.h5')
    >>> view('oh2.h5')
    dataset ['eri_mo'], shape (80, 24)
    >>> ao2mo.outcore.general(mol, (mo1,mo2,mo3,mo3), 'oh2.h5')
    >>> view('oh2.h5')
    dataset ['eri_mo'], shape (80, 21)
    >>> ao2mo.outcore.general(mol, (mo1,mo2,mo3,mo3), 'oh2.h5', compact=False)
    >>> view('oh2.h5')
    dataset ['eri_mo'], shape (80, 36)
    >>> ao2mo.outcore.general(mol, (mo1,mo1,mo2,mo2), 'oh2.h5')
    >>> view('oh2.h5')
    dataset ['eri_mo'], shape (55, 36)
    >>> ao2mo.outcore.general(mol, (mo1,mo1,mo1,mo1), 'oh2.h5', dataname='new')
    >>> view('oh2.h5', 'new')
    dataset ['eri_mo', 'new'], shape (55, 55)
    >>> ao2mo.outcore.general(mol, (mo1,mo1,mo1,mo1), 'oh2.h5', intor='int2e_ip1_sph', aosym='s1', comp=3)
    >>> view('oh2.h5')
    dataset ['eri_mo', 'new'], shape (3, 100, 100)
    >>> ao2mo.outcore.general(mol, (mo1,mo1,mo1,mo1), 'oh2.h5', intor='int2e_ip1_sph', aosym='s2kl', comp=3)
    >>> view('oh2.h5')
    dataset ['eri_mo', 'new'], shape (3, 100, 55)
    '''
    if any(c.dtype == numpy.complex128 for c in mo_coeffs):
        raise NotImplementedError('Integral transformation for complex orbitals')

    time_0pass = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mol, verbose)

    nmoi = mo_coeffs[0].shape[1]
    nmoj = mo_coeffs[1].shape[1]
    nmol = mo_coeffs[3].shape[1]
    nao = mo_coeffs[0].shape[0]

    intor, comp = gto.moleintor._get_intor_and_comp(mol._add_suffix(intor), comp)
    assert(nao == mol.nao_nr('_cart' in intor))

    # Number of AO pairs in the second (kl) index of the half-transformed data.
    aosym = _stand_sym_code(aosym)
    if aosym in ('s4', 's2kl'):
        nao_pair = nao * (nao+1) // 2
    else:
        nao_pair = nao * nao

    # The ij pair is packed only when requested, the two orbital sets are
    # identical and the AO integrals carry the i<->j symmetry.
    if (compact and iden_coeffs(mo_coeffs[0], mo_coeffs[1]) and
        aosym in ('s4', 's2ij')):
        nij_pair = nmoi*(nmoi+1) // 2
    else:
        nij_pair = nmoi*nmoj

    klmosym, nkl_pair, mokl, klshape = \
            incore._conc_mos(mo_coeffs[2], mo_coeffs[3],
                             compact and aosym in ('s4', 's2kl'))

#    if nij_pair > nkl_pair:
#        log.warn('low efficiency for AO to MO trans!')

    # A file name is opened (and later closed) here; an h5py group passed by
    # the caller is used as is and left open.
    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            feri = h5py.File(erifile, 'a')
            if dataname in feri:
                del(feri[dataname])
        else:
            feri = h5py.File(erifile, 'w')
    else:
        assert(isinstance(erifile, h5py.Group))
        feri = erifile

    if comp == 1:
        chunks = (nmoj, nmol)
        shape = (nij_pair, nkl_pair)
    else:
        chunks = (1, nmoj, nmol)
        shape = (comp, nij_pair, nkl_pair)

    if nij_pair == 0 or nkl_pair == 0:
        # Nothing to transform: create the empty dataset and return early.
        feri.create_dataset(dataname, shape, 'f8')
        if isinstance(erifile, str):
            feri.close()
        return erifile
    else:
        h5d_eri = feri.create_dataset(dataname, shape, 'f8', chunks=chunks)

    log.debug('MO integrals %s are saved in %s/%s', intor, erifile, dataname)
    log.debug('num. MO ints = %.8g, required disk %.8g MB',
              float(nij_pair)*nkl_pair*comp, nij_pair*nkl_pair*comp*8/1e6)

# transform e1
    fswap = lib.H5TmpFile()
    half_e1(mol, mo_coeffs, fswap, intor, aosym, comp, max_memory, ioblk_size,
            log, compact)

    time_1pass = log.timer('AO->MO transformation for %s 1 pass'%intor,
                           *time_0pass)

    def load(icomp, row0, row1, buf):
        # Prefetch the block that FOLLOWS (icomp, row0:row1): the next
        # component of the same rows, or component 0 of the next row block.
        if icomp+1 < comp:
            icomp += 1
        else:  # move to next row-block
            row0, row1 = row1, min(nij_pair, row1+iobuflen)
            icomp = 0
        if row0 < row1:
            _load_from_h5g(fswap['%d'%icomp], row0, row1, buf)

    def save(icomp, row0, row1, buf):
        # Write one transformed block into the output dataset.
        if comp == 1:
            h5d_eri[row0:row1] = buf[:row1-row0]
        else:
            h5d_eri[icomp,row0:row1] = buf[:row1-row0]

    ioblk_size = max(max_memory*.1, ioblk_size)
    iobuflen = guess_e2bufsize(ioblk_size, nij_pair, max(nao_pair,nkl_pair))[0]
    # Two input and two output buffers, swapped every step so that the
    # background threads never touch the buffer being computed on.
    buf = numpy.empty((iobuflen,nao_pair))
    buf_prefetch = numpy.empty_like(buf)
    outbuf = numpy.empty((iobuflen,nkl_pair))
    buf_write = numpy.empty_like(outbuf)

    log.debug('step2: kl-pair (ao %d, mo %d), mem %.8g MB, ioblock %.8g MB',
              nao_pair, nkl_pair, iobuflen*nao_pair*8/1e6,
              iobuflen*nkl_pair*8/1e6)

    #klaoblks = len(fswap['0'])
    ijmoblks = int(numpy.ceil(float(nij_pair)/iobuflen)) * comp
    ao_loc = mol.ao_loc_nr('_cart' in intor)
    ti0 = time_1pass
    istep = 0
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as async_write:
            # Load the very first block synchronously.
            _load_from_h5g(fswap['0'], 0, min(nij_pair, iobuflen), buf_prefetch)

            for row0, row1 in prange(0, nij_pair, iobuflen):
                nrow = row1 - row0

                for icomp in range(comp):
                    istep += 1
                    log.debug1('step 2 [%d/%d], [%d,%d:%d], row = %d',
                               istep, ijmoblks, icomp, row0, row1, nrow)

                    buf, buf_prefetch = buf_prefetch, buf
                    prefetch(icomp, row0, row1, buf_prefetch)
                    _ao2mo.nr_e2(buf[:nrow], mokl, klshape, aosym, klmosym,
                                 ao_loc=ao_loc, out=outbuf)
                    async_write(icomp, row0, row1, outbuf)
                    outbuf, buf_write = buf_write, outbuf  # avoid flushing writing buffer

                    ti1 = (logger.process_clock(), logger.perf_counter())
                    log.debug1('step 2 [%d/%d] CPU time: %9.2f, Wall time: %9.2f',
                               istep, ijmoblks, ti1[0]-ti0[0], ti1[1]-ti0[1])
                    ti0 = ti1

    fswap = None
    if isinstance(erifile, str):
        feri.close()

    log.timer('AO->MO transformation for %s 2 pass'%intor, *time_1pass)
    log.timer('AO->MO transformation for %s '%intor, *time_0pass)
    return erifile
def _ao2mo_ovov(mp, orbo, orbv, feri, max_memory=2000, verbose=None):
    '''Transform AO integrals into the dataset ``'ovov'`` of ``feri``.

    Pass 1 generates AO integral blocks, contracts two AO indices with
    ``orbo`` and stores the result block-wise in a temporary HDF5 file.
    Pass 2 reads these blocks back, contracts the two remaining AO indices
    with ``orbv`` and writes the rows of the output dataset.

    Args:
        mp : object providing ``mol`` (also passed to ``logger.new_logger``).
        orbo : 2D array of shape (nao, nocc).
        orbv : 2D array whose second dimension is nvir.
        feri : h5py file/group in which the dataset 'ovov' is created.
        max_memory : memory budget (MB) used to size the buffers.
        verbose : print level or logger.

    Returns:
        The created dataset of shape (nocc*nvir, nocc*nvir).
    '''
    # time.clock() was removed in Python 3.8.  Prefer the clocks exposed by
    # the logger module and fall back to the legacy pair on older PySCF.
    cpu_clock = getattr(logger, 'process_clock', None) or time.clock
    wall_clock = getattr(logger, 'perf_counter', None) or time.time
    time0 = (cpu_clock(), wall_clock())
    log = logger.new_logger(mp, verbose)

    mol = mp.mol
    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nao, nocc = orbo.shape
    nvir = orbv.shape[1]
    nbas = mol.nbas
    assert (nvir <= nao)

    ao_loc = mol.ao_loc_nr()
    dmax = max(4, min(nao / 3,
                      numpy.sqrt(max_memory * .95e6 / 8 / (nao + nocc)**2)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao, dmax, dmax, nao))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax,
              nocc**2 * (nao * (nao + dmax) / 2 + nvir**2) * 8 / 1e6)

    buf_i = numpy.empty((nocc * dmax**2 * nao))
    # buf_li / buf1 alternate so the block being written in the background
    # is not overwritten by the next contraction.
    buf_li = numpy.empty((nocc**2 * dmax**2))
    buf1 = numpy.empty_like(buf_li)

    fint = gto.moleintor.getints4c
    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(ftmp.__setitem__) as save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            for jsh0, jsh1, nj in sh_ranges[:ip + 1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0, i1, j0, j1))

                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=(0, nbas, ish0, ish1, jsh0, jsh1, 0, nbas),
                           aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                tmp_i = numpy.ndarray((nocc, (i1 - i0) * (j1 - j0) * nao),
                                      buffer=buf_i)
                tmp_li = numpy.ndarray((nocc, nocc * (i1 - i0) * (j1 - j0)),
                                       buffer=buf_li)
                lib.ddot(orbo.T, eri.reshape(nao, (i1 - i0) * (j1 - j0) * nao),
                         c=tmp_i)
                lib.ddot(orbo.T,
                         tmp_i.reshape(nocc * (i1 - i0) * (j1 - j0), nao).T,
                         c=tmp_li)
                tmp_li = tmp_li.reshape(nocc, nocc, (i1 - i0), (j1 - j0))
                save(str(count), tmp_li.transpose(1, 0, 2, 3))
                buf_li, buf1 = buf1, buf_li
                count += 1
                time1 = log.timer_debug1('partial ao2mo [%d:%d,%d:%d]' %
                                         (ish0, ish1, jsh0, jsh1), *time1)
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)
    eri = eribuf = tmp_i = tmp_li = buf_i = buf_li = buf1 = None

    h5dat = feri.create_dataset('ovov', (nocc * nvir, nocc * nvir), 'f8',
                                chunks=(nvir, nvir))
    occblk = int(min(nocc, max(4, 250 / nocc,
                               max_memory * .9e6 / 8 / (nao**2 * nocc) / 5)))

    def load(i0, eri):
        # Assemble the blocks for occupied indices [i0:i0+occblk) from the
        # pass-1 blocks; a no-op once i0 runs past nocc.
        if i0 < nocc:
            i1 = min(i0 + occblk, nocc)
            for k, (p0, p1, q0, q1) in enumerate(jk_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = ftmp[str(k)][i0:i1]
                if p0 != q0:
                    # Only blocks with q <= p were stored; fill the
                    # transposed AO block from the same data.
                    dat = numpy.asarray(ftmp[str(k)][:, i0:i1])
                    eri[:i1 - i0, :, q0:q1, p0:p1] = dat.transpose(1, 0, 3, 2)

    def save(i0, i1, dat):
        for i in range(i0, i1):
            h5dat[i * nvir:(i + 1) * nvir] = dat[i - i0].reshape(
                nvir, nocc * nvir)

    orbv = numpy.asarray(orbv, order='F')
    buf_prefetch = numpy.empty((occblk, nocc, nao, nao))
    buf = numpy.empty_like(buf_prefetch)
    bufw = numpy.empty((occblk * nocc, nvir**2))
    bufw1 = numpy.empty_like(bufw)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as bsave:
            load(0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocc, occblk):
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(i1, buf_prefetch)
                eri = buf[:i1 - i0].reshape((i1 - i0) * nocc, nao, nao)

                dat = _ao2mo.nr_e2(eri, orbv, (0, nvir, 0, nvir), 's1', 's1',
                                   out=bufw)
                bsave(i0, i1,
                      dat.reshape(i1 - i0, nocc, nvir, nvir).transpose(0, 2, 1, 3))
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0, i1),
                                         *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    return h5dat
def cholesky_eri_b(mol, erifile, auxbasis='weigend+etb', dataname='j3c',
                   int3c='int3c2e', aosym='s2ij', int2c='int2c2e', comp=1,
                   max_memory=MAX_MEMORY, auxmol=None, verbose=logger.NOTE):
    '''3-center 2-electron DF tensor. Similar to cholesky_eri while this
    function stores DF tensor in blocks.

    Each AO shell range is written as a separate dataset
    ``<dataname>/<istep>`` of ``erifile``.

    Args:
        mol : molecule providing the AO basis.
        erifile : passed to ``_create_h5file`` together with ``dataname``.
        auxbasis : used to build ``auxmol`` when ``auxmol`` is None.
        dataname : name of the group holding the blocks.
        int3c, int2c : names of the 3-centre / 2-centre integrals.
        aosym : 's1' or 's2ij'.
        comp : number of integral components.
        max_memory : memory budget (MB) used to size the buffers.
        auxmol : auxiliary molecule; built from ``auxbasis`` if None.
        verbose : print level or logger.

    Returns:
        ``erifile``.
    '''
    assert (aosym in ('s1', 's2ij'))
    log = logger.new_logger(mol, verbose)
    # time.clock() was removed in Python 3.8.  Prefer the clocks exposed by
    # the logger module and fall back to the legacy pair on older PySCF.
    cpu_clock = getattr(logger, 'process_clock', None) or time.clock
    wall_clock = getattr(logger, 'perf_counter', None) or time.time
    time0 = (cpu_clock(), wall_clock())

    if auxmol is None:
        auxmol = make_auxmol(mol, auxbasis)
    j2c = auxmol.intor(int2c, hermi=1)
    log.debug('size of aux basis %d', j2c.shape[0])
    time1 = log.timer('2c2e', *time0)
    try:
        low = scipy.linalg.cholesky(j2c, lower=True)
        tag = 'cd'
    except scipy.linalg.LinAlgError:
        # Cholesky failed: use the eigenvectors above LINEAR_DEP_THR,
        # scaled by 1/sqrt(eigenvalue), instead.
        w, v = scipy.linalg.eigh(j2c)
        idx = w > LINEAR_DEP_THR
        low = (v[:, idx] / numpy.sqrt(w[idx]))
        v = None
        tag = 'eig'
    j2c = None
    naoaux, naux = low.shape
    time1 = log.timer('Cholesky 2c2e', *time1)

    int3c = gto.moleintor.ascint3(mol._add_suffix(int3c))
    atm, bas, env = gto.mole.conc_env(mol._atm, mol._bas, mol._env,
                                      auxmol._atm, auxmol._bas, auxmol._env)
    ao_loc = gto.moleintor.make_loc(bas, int3c)
    nao = ao_loc[mol.nbas]
    naoaux = ao_loc[-1] - nao
    if aosym == 's1':
        nao_pair = nao * nao
        buflen = min(max(int(max_memory * .24e6 / 8 / naoaux / comp), 1),
                     nao_pair)
        shranges = _guess_shell_ranges(mol, buflen, 's1')
    else:
        nao_pair = nao * (nao + 1) // 2
        buflen = min(max(int(max_memory * .24e6 / 8 / naoaux / comp), 1),
                     nao_pair)
        shranges = _guess_shell_ranges(mol, buflen, 's2ij')
    log.debug('erifile %.8g MB, IO buf size %.8g MB',
              naoaux * nao_pair * 8 / 1e6, comp * buflen * naoaux * 8 / 1e6)
    log.debug1('shranges = %s', shranges)
    # TODO: Libcint-3.14 and newer version support to compute int3c2e without
    # the opt for the 3rd index.
    #if '3c2e' in int3c:
    #    cintopt = gto.moleintor.make_cintopt(atm, mol._bas, env, int3c)
    #else:
    #    cintopt = gto.moleintor.make_cintopt(atm, bas, env, int3c)
    cintopt = gto.moleintor.make_cintopt(atm, bas, env, int3c)
    bufs1 = numpy.empty((comp * max([x[2] for x in shranges]), naoaux))
    feri = _create_h5file(erifile, dataname)

    def store(buf, label):
        # Write one block; multi-component data goes into one dataset with
        # the component as leading axis.
        if comp == 1:
            feri[label] = buf
        else:
            shape = (len(buf), ) + buf[0].shape
            fdat = feri.create_dataset(label, shape, buf[0].dtype.char)
            for i, b in enumerate(buf):
                fdat[i] = b

    def transform(b):
        # Bring the auxiliary index to the front, then apply the decomposed
        # metric (triangular solve for 'cd', matrix product for 'eig').
        if b.ndim == 3 and b.flags.f_contiguous:
            b = lib.transpose(b.T, axes=(0, 2, 1)).reshape(naoaux, -1)
        else:
            b = b.reshape((-1, naoaux)).T
        if tag == 'cd':
            if b.flags.c_contiguous:
                b = lib.transpose(b).T
            return scipy.linalg.solve_triangular(low, b, lower=True,
                                                 overwrite_b=True,
                                                 check_finite=False)
        else:
            return lib.dot(low.T, b)

    with lib.call_in_background(store) as bstore:
        for istep, sh_range in enumerate(shranges):
            log.debug('int3c2e [%d/%d], AO [%d:%d], nrow = %d',
                      istep + 1, len(shranges), *sh_range)
            bstart, bend, nrow = sh_range
            shls_slice = (bstart, bend, 0, mol.nbas,
                          mol.nbas, mol.nbas + auxmol.nbas)
            ints = gto.moleintor.getints3c(int3c, atm, bas, env, shls_slice,
                                           comp, aosym, ao_loc, cintopt,
                                           out=bufs1)
            if comp == 1:
                buf = transform(ints)
            else:
                buf = [transform(x) for x in ints]
            bstore(buf, '%s/%d' % (dataname, istep))
            buf = ints = None
            time1 = log.timer('gen CD eri [%d/%d]' % (istep + 1, len(shranges)),
                              *time1)
    bufs1 = None

    feri.close()
    return erifile
def _make_eris(mp, mo_coeff=None, verbose=None):
    '''Build the integral object for ``mp`` with the MPI-distributed,
    disk-based (ia|jb) transformation.

    Each rank generates AO integrals for its own segment of AO shells,
    half-transforms them to occupied indices and exchanges the pieces with
    ``mpi.alltoall``.  It ends up holding a dataset ``eris.ovov`` of shape
    (nocc, nvir, nocc_seg, nvir), where nocc_seg is the length of the
    occupied segment assigned to this rank.

    Args:
        mp : MP2 object (``mol``, ``nocc``, ``nmo``, ``max_memory`` are read).
        mo_coeff : forwarded to ``mp2._ChemistsERIs``.
        verbose : print level or logger.

    Returns:
        The integral object, which is also stored as ``mp._eris``.
    '''
    log = logger.new_logger(mp, verbose)
    # time.clock() was removed in Python 3.8.  Prefer the clocks exposed by
    # the logger module and fall back to the legacy pair on older PySCF.
    cpu_clock = getattr(logger, 'process_clock', None) or time.clock
    wall_clock = getattr(logger, 'perf_counter', None) or time.time
    time0 = (cpu_clock(), wall_clock())

    log.debug('transform (ia|jb) outcore')
    mol = mp.mol
    nocc = mp.nocc
    nmo = mp.nmo
    nvir = nmo - nocc

    eris = mp2._ChemistsERIs(mp, mo_coeff)
    nao = eris.mo_coeff.shape[0]
    assert(nvir <= nao)
    orbo = eris.mo_coeff[:,:nocc]
    orbv = numpy.asarray(eris.mo_coeff[:,nocc:], order='F')
    eris.feri = lib.H5TmpFile()

    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    fint = gto.moleintor.getints4c

    # Occupied-orbital segment owned by each rank.
    ntasks = mpi.pool.size
    olocs = [_task_location(nocc, task_id) for task_id in range(ntasks)]
    oloc0, oloc1 = olocs[rank]
    nocc_seg = oloc1 - oloc0
    log.debug2('olocs %s', olocs)

    # AO-shell segment for which this rank generates integrals.
    ao_loc = mol.ao_loc_nr()
    task_sh_locs = lib.misc._balanced_partition(ao_loc, ntasks)
    log.debug2('task_sh_locs %s', task_sh_locs)
    ao_sh0 = task_sh_locs[rank]
    ao_sh1 = task_sh_locs[rank+1]
    ao_loc0 = ao_loc[ao_sh0]
    ao_loc1 = ao_loc[ao_sh1]
    nao_seg = ao_loc1 - ao_loc0
    orbo_seg = orbo[ao_loc0:ao_loc1]

    mem_now = lib.current_memory()[0]
    max_memory = max(0, mp.max_memory - mem_now)
    dmax = numpy.sqrt(max_memory*.9e6/8/((nao+nocc)*(nao_seg+nocc)))
    # All ranks must use the same block size, hence the min over ranks and
    # the broadcast of the resulting shell ranges.
    dmax = min(nao//4+2, max(BLKMIN, min(comm.allgather(dmax))))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    sh_ranges = comm.bcast(sh_ranges)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao,dmax,dmax,nao_seg))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax,
              nocc*nocc_seg*(nao*(nao+dmax)/2+nvir**2)*8/1e6)

    def save(count, tmp_xo):
        # Sum the partial contractions over ranks and store the block under
        # two names, '<count>b' and '<count>a', for use in pass 2.
        di, dj = tmp_xo.shape[2:4]
        tmp_xo = [tmp_xo[p0:p1] for p0, p1 in olocs]
        tmp_xo = mpi.alltoall(tmp_xo, split_recvbuf=True)
        tmp_xo = sum(tmp_xo).reshape(nocc_seg,nocc,di,dj)
        ftmp[str(count)+'b'] = tmp_xo

        tmp_ox = mpi.alltoall([tmp_xo[:,p0:p1] for p0, p1 in olocs],
                              split_recvbuf=True)
        tmp_ox = [tmp_ox[i].reshape(p1-p0,nocc_seg,di,dj)
                  for i, (p0,p1) in enumerate(olocs)]
        ftmp[str(count)+'a'] = numpy.vstack(tmp_ox)

    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(save) as bg_save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            for jsh0, jsh1, nj in sh_ranges[:ip+1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0,i1,j0,j1))

                shls_slice = (0,mol.nbas,ish0,ish1, jsh0,jsh1,ao_sh0,ao_sh1)
                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=shls_slice, aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                tmp_xo = lib.einsum('pi,pqrs->iqrs', orbo, eri)
                tmp_xo = lib.einsum('iqrs,sl->ilqr', tmp_xo, orbo_seg)
                bg_save(count, tmp_xo)
                tmp_xo = None
                count += 1
                time1 = log.timer_debug1('partial ao2mo [%d:%d,%d:%d]' %
                                         (ish0,ish1,jsh0,jsh1), *time1)
    eri = eribuf = None
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)

    eris.ovov = eris.feri.create_dataset('ovov', (nocc,nvir,nocc_seg,nvir), 'f8')
    occblk = int(min(nocc, max(BLKMIN, max_memory*.9e6/8/(nao**2*nocc_seg+1)/5)))

    def load(i0, eri):
        # Assemble the blocks for occupied indices [i0:i0+occblk) from the
        # pass-1 blocks; a no-op once i0 runs past nocc.
        if i0 < nocc:
            i1 = min(i0+occblk, nocc)
            for k, (p0,p1,q0,q1) in enumerate(jk_blk_slices):
                eri[:i1-i0,:,p0:p1,q0:q1] = ftmp[str(k)+'a'][i0:i1]
                if p0 != q0:
                    # Only blocks with q <= p were generated; the transposed
                    # AO block is filled from the 'b' copy.
                    dat = numpy.asarray(ftmp[str(k)+'b'][:,i0:i1])
                    eri[:i1-i0,:,q0:q1,p0:p1] = dat.transpose(1,0,3,2)

    def save(i0, i1, dat):
        eris.ovov[i0:i1] = dat

    buf_prefetch = numpy.empty((occblk,nocc_seg,nao,nao))
    buf = numpy.empty_like(buf_prefetch)
    bufw = numpy.empty((occblk*nocc_seg,nvir**2))
    bufw1 = numpy.empty_like(bufw)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as bsave:
            load(0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocc, occblk):
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(i1, buf_prefetch)
                eri = buf[:i1-i0].reshape((i1-i0)*nocc_seg,nao,nao)

                dat = _ao2mo.nr_e2(eri, orbv, (0,nvir,0,nvir), 's1', 's1', out=bufw)
                bsave(i0, i1, dat.reshape(i1-i0,nocc_seg,nvir,nvir).transpose(0,2,1,3))
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0,i1), *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    mp._eris = eris
    return eris
def cholesky_eri(mol, erifile, auxbasis='weigend+etb', dataname='j3c',
                 tmpdir=None, int3c='int3c2e', aosym='s2ij', int2c='int2c2e',
                 comp=1, max_memory=MAX_MEMORY, auxmol=None,
                 verbose=logger.NOTE):
    '''3-index density-fitting tensor.

    Two passes.  Pass 1 calls cholesky_eri_b to write the tensor, in blocks,
    into a temporary swap file under ``tmpdir``.  Pass 2 copies the blocks
    into a single dataset ``dataname`` of shape (naoaux, nao_pair) in
    ``erifile``, overlapping reads with background writes.

    Args:
        mol : molecule object; only ``nao_nr()`` is used directly here.
        erifile : target passed to ``_create_h5file``.
        auxbasis, int3c, int2c, auxmol : forwarded to cholesky_eri_b;
            ``auxmol`` is built with ``make_auxmol(mol, auxbasis)`` if None.
        dataname : name of the output dataset.
        tmpdir : directory for the swap file (default ``lib.param.TMPDIR``).
        aosym : 's1' (nao*nao columns) or 's2ij' (nao*(nao+1)//2 columns).
        comp : must be 1.
        max_memory : memory budget in MB used to size the copy buffers.
        verbose : verbosity level or logger.

    Returns:
        ``erifile``, unchanged.
    '''
    assert (aosym in ('s1', 's2ij'))
    assert (comp == 1)
    log = logger.new_logger(mol, verbose)
    # time.clock() was removed in Python 3.8; use the logger's own clocks so
    # the reference points match what log.timer() subtracts from.
    time0 = (logger.process_clock(), logger.perf_counter())

    if auxmol is None:
        auxmol = make_auxmol(mol, auxbasis)
    if tmpdir is None:
        tmpdir = lib.param.TMPDIR

    swapfile = tempfile.NamedTemporaryFile(dir=tmpdir)
    fswap = None
    feri = None
    try:
        cholesky_eri_b(mol, swapfile.name, auxbasis, dataname,
                       int3c, aosym, int2c, comp, max_memory, auxmol,
                       verbose=log)
        fswap = h5py.File(swapfile.name, 'r')
        time1 = log.timer('generate (ij|L) 1 pass', *time0)

        # Cannot let naoaux = auxmol.nao_nr() if auxbasis has linear dependence
        nao = mol.nao_nr()
        if aosym == 's1':
            nao_pair = nao * nao
        else:
            nao_pair = nao * (nao + 1) // 2

        feri = _create_h5file(erifile, dataname)
        if comp == 1:
            naoaux = fswap['%s/0' % dataname].shape[0]
            h5d_eri = feri.create_dataset(dataname, (naoaux, nao_pair), 'f8')
        else:
            naoaux = fswap['%s/0' % dataname].shape[1]
            h5d_eri = feri.create_dataset(dataname, (comp, naoaux, nao_pair),
                                          'f8')

        def save(row0, row1, buf):
            if comp == 1:
                h5d_eri[row0:row1] = buf
            else:
                h5d_eri[:, row0:row1] = buf

        iolen = min(max(int(max_memory * .45e6 / 8 / nao_pair), 28), naoaux)
        totstep = (naoaux + iolen - 1) // iolen
        # Two buffers: one is being written in the background while the
        # other is being filled from the swap file.
        bufs1 = numpy.empty((iolen, nao_pair))
        bufs2 = numpy.empty_like(bufs1)
        ti0 = time1
        with lib.call_in_background(save) as bsave:
            for istep, (row0, row1) in enumerate(lib.prange(0, naoaux, iolen)):
                nrow = row1 - row0
                buf = _load_from_h5g(fswap[dataname], row0, row1, bufs1)
                bufs1, bufs2 = bufs2, bufs1
                bsave(row0, row1, buf)
                ti0 = log.timer('step 2 [%d/%d], [%d:%d], row = %d' %
                                (istep + 1, totstep, row0, row1, nrow), *ti0)
    finally:
        # Release file handles and the temporary file even if a step failed.
        if fswap is not None:
            fswap.close()
        if feri is not None:
            feri.close()
        swapfile.close()

    log.timer('cholesky_eri', *time0)
    return erifile
def make_kpt(uniq_kptji_id, cholesky_j2c):
    '''Assemble and store the fitted 3-index blocks for one unique k-point
    difference.

    For every (ji) pair whose difference maps to ``uniq_kptji_id`` the
    'j3c-junk' data are read from ``fswap``, a plane-wave term built from the
    Fourier-transformed AO pairs is subtracted from the rows ``[naux:]``, the
    result is passed through ``fuse`` and the 2-center factor ``j2c``, and
    written to ``feri`` under 'j3c/<ji>/<istep>' (and 'j3c-/<ji>/<istep>' when
    ``j2c_negative`` is given).  The consumed 'j3c-junk' groups are deleted.

    Args:
        uniq_kptji_id : index into ``uniq_kpts``.
        cholesky_j2c : tuple ``(j2c, j2c_negative, j2ctag)``.
    '''
    kpt = uniq_kpts[uniq_kptji_id]  # kpt = kptj - kpti
    log.debug1('kpt = %s', kpt)
    pair_ids = numpy.where(uniq_inverse == uniq_kptji_id)[0]
    kj_list = kptjs[pair_ids]
    nkj = len(kj_list)
    log.debug1('adapted_ji_idx = %s', pair_ids)

    j2c, j2c_negative, j2ctag = cholesky_j2c

    aux_slice = (auxcell.nbas, fused_cell.nbas)
    Gaux = ft_ao.ft_ao(fused_cell, Gv, aux_slice, b, gxyz, Gvbase, kpt)
    Gaux *= mydf.weighted_coulG(kpt, False, mesh).reshape(-1, 1)
    kLR = Gaux.real.copy('C')
    kLI = Gaux.imag.copy('C')
    del Gaux

    same_kpt = is_zero(kpt)  # kpti == kptj
    with_vbar = same_kpt and cell.dimension == 3
    if same_kpt:
        aosym = 's2'
        nao_pair = nao * (nao + 1) // 2
    else:
        aosym = 's1'
        nao_pair = nao**2
    if with_vbar:
        vbar = fuse(mydf.auxbar(fused_cell))
        ovlp = [lib.pack_tril(s)
                for s in cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kj_list)]

    # Pairs for which only the real part is kept (no imaginary buffer).
    real_only = [same_kpt and gamma_point(kj) for kj in kj_list]

    mem_now = lib.current_memory()[0]
    log.debug2('memory = %s', mem_now)
    mem_avail = max(2000, mydf.max_memory - mem_now)
    # nkptj for 3c-coulomb arrays plus 1 Lpq array
    buflen = min(max(int(mem_avail * .38e6 / 16 / naux / (nkj + 1)), 1),
                 nao_pair)
    shranges = _guess_shell_ranges(cell, buflen, aosym)
    buflen = max(x[2] for x in shranges)
    # +1 for a pqkbuf
    mem_frac = .1e6 if aosym == 's2' else .2e6
    Gblksize = max(16, int(mem_avail * mem_frac / 16 / buflen / (nkj + 1)))
    Gblksize = min(Gblksize, ngrids, 16384)

    pqkRbuf = numpy.empty(buflen * Gblksize)
    pqkIbuf = numpy.empty(buflen * Gblksize)
    # buf for ft_aopair
    ftbuf = numpy.empty(nkj * buflen * Gblksize, dtype=numpy.complex128)

    def pw_contract(istep, sh_range, j3cR, j3cI):
        sh0, sh1, ncol = sh_range
        ket_end = sh1 if aosym == 's2' else cell.nbas
        pair_slice = (sh0, sh1, 0, ket_end)

        for g0, g1 in lib.prange(0, ngrids, Gblksize):
            nG = g1 - g0
            dat = ft_ao._ft_aopair_kpts(cell, Gv[g0:g1], pair_slice, aosym,
                                        b, gxyz[g0:g1], Gvbase, kpt,
                                        kj_list, out=ftbuf)
            pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf)
            pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf)
            for k in range(nkj):
                aoao = dat[k].reshape(nG, ncol)
                pqkR[:] = aoao.real.T
                pqkI[:] = aoao.imag.T

                lib.dot(kLR[g0:g1].T, pqkR.T, -1, j3cR[k][naux:], 1)
                lib.dot(kLI[g0:g1].T, pqkI.T, -1, j3cR[k][naux:], 1)
                if real_only[k]:
                    continue
                lib.dot(kLR[g0:g1].T, pqkI.T, -1, j3cI[k][naux:], 1)
                lib.dot(kLI[g0:g1].T, pqkR.T, 1, j3cI[k][naux:], 1)

        for k, ji in enumerate(pair_ids):
            if real_only[k]:
                v = fuse(j3cR[k])
            else:
                v = fuse(j3cR[k] + j3cI[k] * 1j)

            if j2ctag == 'CD':
                v = scipy.linalg.solve_triangular(j2c, v, lower=True,
                                                  overwrite_b=True)
                feri['j3c/%d/%d' % (ji, istep)] = v
            else:
                feri['j3c/%d/%d' % (ji, istep)] = lib.dot(j2c, v)

            # low-dimension systems
            if j2c_negative is not None:
                feri['j3c-/%d/%d' % (ji, istep)] = lib.dot(j2c_negative, v)

    with lib.call_in_background(pw_contract) as compute:
        col_end = 0
        nsteps = len(shranges)
        for istep, sh_range in enumerate(shranges):
            log.debug1('int3c2e [%d/%d], AO [%d:%d], ncol = %d',
                       istep + 1, nsteps, *sh_range)
            ncol = sh_range[2]
            col0, col_end = col_end, col_end + ncol
            j3cR = []
            j3cI = []
            for k, idx in enumerate(pair_ids):
                v = numpy.vstack(
                    [fswap['j3c-junk/%d/%d' % (idx, i)][0, col0:col_end].T
                     for i in range(nsegs)])
                # vbar is the interaction between the background charge
                # and the auxiliary basis.  0D, 1D, 2D do not have vbar.
                if with_vbar:
                    for i in numpy.where(vbar != 0)[0]:
                        v[i] -= vbar[i] * ovlp[k][col0:col_end]
                j3cR.append(numpy.asarray(v.real, order='C'))
                if real_only[k]:
                    j3cI.append(None)
                else:
                    j3cI.append(numpy.asarray(v.imag, order='C'))
                v = None
            compute(istep, sh_range, j3cR, j3cI)

    for ji in pair_ids:
        del fswap['j3c-junk/%d' % ji]
def update_amps(mycc, t1, t2, eris):
    '''Compute updated t1/t2 amplitudes from the current ones (MPI version).

    The amplitudes are handled in transposed layout: ``t1T = t1.T`` and
    ``t2T = t2.transpose(2, 3, 0, 1)``.  Axis 0 of ``t2T`` is the virtual
    segment owned by this rank (``vlocs[rank]``); remote segments are
    obtained through ``_rotate_tensor_block`` and partial intermediates are
    combined with ``mpi.allreduce`` / ``mpi.alltoall``.

    Args:
        mycc : CC object; ``stdout``, ``verbose``, ``level_shift``,
            ``max_memory``, ``direct`` and ``_add_vvvv`` are used.
        t1 : singles amplitudes.
        t2 : doubles amplitudes whose axis 2 is the local virtual segment.
        eris : integrals object providing ``fock``, ``mo_energy``, ``oooo``,
            ``ovoo``, ``oovv``, ``ovvo`` and ``vvvo``.

    Returns:
        ``(t1new, t2new)`` in the same (untransposed) layout as the inputs.

    Raises:
        NotImplementedError: if ``mycc.direct`` is false.
    '''
    # time.clock() was removed in Python 3.8; use the logger's clocks so the
    # reference points match what log.timer_debug1() subtracts from.
    time1 = time0 = logger.process_clock(), logger.perf_counter()
    log = logger.Logger(mycc.stdout, mycc.verbose)

    t1T = t1.T
    t2T = numpy.asarray(t2.transpose(2, 3, 0, 1), order='C')
    nvir_seg, nvir, nocc = t2T.shape[:3]
    t1 = t2 = None
    ntasks = mpi.pool.size
    vlocs = [_task_location(nvir, task_id) for task_id in range(ntasks)]
    vloc0, vloc1 = vlocs[rank]
    log.debug2('vlocs %s', vlocs)
    assert (vloc1 - vloc0 == nvir_seg)

    fock = eris.fock
    mo_e_o = eris.mo_energy[:nocc]
    mo_e_v = eris.mo_energy[nocc:] + mycc.level_shift

    def _rotate_vir_block(buf):
        # Yield each rank's block together with its virtual index range.
        for task_id, buf in _rotate_tensor_block(buf):
            loc0, loc1 = vlocs[task_id]
            yield task_id, buf, loc0, loc1

    fswap = lib.H5TmpFile()
    wVooV = numpy.zeros((nvir_seg, nocc, nocc, nvir))
    eris_voov = _cp(eris.ovvo).transpose(1, 0, 3, 2)
    tau = t2T * .5
    tau += numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
    for task_id, tau, p0, p1 in _rotate_vir_block(tau):
        wVooV += lib.einsum('bkic,cajk->bija', eris_voov[:, :, :, p0:p1], tau)
    fswap['wVooV1'] = wVooV
    wVooV = tau = None
    time1 = log.timer_debug1('wVooV', *time1)

    wVOov = eris_voov
    eris_VOov = eris_voov - eris_voov.transpose(0, 2, 1, 3) * .5
    tau = t2T.transpose(2, 0, 3, 1) - t2T.transpose(3, 0, 2, 1) * .5
    tau -= numpy.einsum('ai,bj->jaib', t1T[vloc0:vloc1], t1T)
    for task_id, tau, p0, p1 in _rotate_vir_block(tau):
        wVOov += lib.einsum('dlkc,kcjb->dljb', eris_VOov[:, :, :, p0:p1], tau)
    fswap['wVOov1'] = wVOov
    wVOov = tau = eris_VOov = eris_voov = None
    time1 = log.timer_debug1('wVOov', *time1)

    t1Tnew = numpy.zeros_like(t1T)
    t2Tnew = mycc._add_vvvv(t1T, t2T, eris, t2sym='jiba')
    time1 = log.timer_debug1('vvvv', *time1)

    #** make_inter_F
    fov = fock[:nocc, nocc:].copy()
    t1Tnew += fock[nocc:, :nocc]

    foo = fock[:nocc, :nocc] - numpy.diag(mo_e_o)
    foo += .5 * numpy.einsum('ia,aj->ij', fock[:nocc, nocc:], t1T)

    fvv = fock[nocc:, nocc:] - numpy.diag(mo_e_v)
    fvv -= .5 * numpy.einsum('ai,ib->ab', t1T, fock[:nocc, nocc:])

    # Rank-private accumulators; summed over ranks with allreduce below.
    foo_priv = numpy.zeros_like(foo)
    fov_priv = numpy.zeros_like(fov)
    fvv_priv = numpy.zeros_like(fvv)
    t1T_priv = numpy.zeros_like(t1T)

    max_memory = mycc.max_memory - lib.current_memory()[0]
    unit = nocc * nvir**2 * 3 + nocc**2 * nvir + 1
    blksize = min(nvir, max(BLKMIN,
                            int((max_memory * .9e6 / 8 - t2T.size) / unit)))
    log.debug1('pass 1, max_memory %d MB, nocc,nvir = %d,%d blksize = %d',
               max_memory, nocc, nvir, blksize)

    buf = numpy.empty((blksize, nvir, nvir, nocc))

    def load_vvvo(p0):
        # Fills whatever array ``buf`` currently names in the enclosing
        # scope; the loop below rebinds ``buf`` before each prefetch.
        p1 = min(nvir_seg, p0 + blksize)
        if p0 < p1:
            buf[:p1 - p0] = eris.vvvo[p0:p1]

    fswap.create_dataset('wVooV', (nvir_seg, nocc, nocc, nvir), 'f8')
    wVOov = []

    with lib.call_in_background(load_vvvo) as prefetch:
        load_vvvo(0)
        for p0, p1 in lib.prange(vloc0, vloc1, blksize):
            i0, i1 = p0 - vloc0, p1 - vloc0
            eris_vvvo, buf = buf[:p1 - p0], numpy.empty_like(buf)
            prefetch(i1)

            fvv_priv[p0:p1] += 2 * numpy.einsum('ck,abck->ab', t1T, eris_vvvo)
            fvv_priv -= numpy.einsum('ck,cabk->ab', t1T[p0:p1], eris_vvvo)

            if not mycc.direct:
                # The statements that followed this raise in the original
                # were unreachable; they are kept here for reference only.
                #:tau = t2T[i0:i1] + numpy.einsum('ai,bj->abij', t1T[p0:p1], t1T)
                #:for task_id, tau, q0, q1 in _rotate_vir_block(tau):
                #:    tmp = lib.einsum('bdck,cdij->bkij', eris_vvvo[:,:,q0:q1], tau)
                #:    t2Tnew -= lib.einsum('ak,bkij->baji', t1T, tmp)
                raise NotImplementedError

            fswap['wVooV'][i0:i1] = lib.einsum('cj,baci->bija',
                                               -t1T, eris_vvvo)

            theta = t2T[i0:i1].transpose(0, 2, 1, 3) * 2
            theta -= t2T[i0:i1].transpose(0, 3, 1, 2)
            t1T_priv += lib.einsum('bicj,bacj->ai', theta, eris_vvvo)
            wVOov.append(lib.einsum('acbi,cj->abij', eris_vvvo, t1T))
            theta = eris_vvvo = None
            time1 = log.timer_debug1('vvvo [%d:%d]' % (p0, p1), *time1)

    # Redistribute wVOov so that each rank holds its own virtual segment.
    wVOov = numpy.vstack(wVOov)
    wVOov = mpi.alltoall([wVOov[:, q0:q1] for q0, q1 in vlocs],
                         split_recvbuf=True)
    wVOov = numpy.vstack([x.reshape(-1, nvir_seg, nocc, nocc) for x in wVOov])
    fswap['wVOov'] = wVOov.transpose(1, 2, 3, 0)
    wVooV = None

    unit = nocc**2 * nvir * 7 + nocc**3 + nocc * nvir**2
    max_memory = max(0, mycc.max_memory - lib.current_memory()[0])
    blksize = min(nvir, max(BLKMIN,
                            int((max_memory * .9e6 / 8 - nocc**4) / unit)))
    log.debug1('pass 2, max_memory %d MB, nocc,nvir = %d,%d blksize = %d',
               max_memory, nocc, nvir, blksize)

    woooo = numpy.zeros((nocc, nocc, nocc, nocc))

    for p0, p1 in lib.prange(vloc0, vloc1, blksize):
        i0, i1 = p0 - vloc0, p1 - vloc0
        wVOov = fswap['wVOov'][i0:i1]
        wVooV = fswap['wVooV'][i0:i1]
        eris_ovoo = eris.ovoo[:, i0:i1]
        eris_oovv = numpy.empty((nocc, nocc, i1 - i0, nvir))

        def load_oovv(p0, p1):
            eris_oovv[:] = eris.oovv[:, :, p0:p1]
        with lib.call_in_background(load_oovv) as prefetch_oovv:
            #:eris_oovv = eris.oovv[:,:,i0:i1]
            prefetch_oovv(i0, i1)
            foo_priv += numpy.einsum('ck,kcji->ij', 2 * t1T[p0:p1], eris_ovoo)
            foo_priv += numpy.einsum('ck,icjk->ij', -t1T[p0:p1], eris_ovoo)
            tmp = lib.einsum('al,jaik->lkji', t1T[p0:p1], eris_ovoo)
            woooo += tmp + tmp.transpose(1, 0, 3, 2)
            tmp = None

            wVOov -= lib.einsum('jbik,ak->bjia', eris_ovoo, t1T)
            t2Tnew[i0:i1] += wVOov.transpose(0, 3, 1, 2)

            wVooV += lib.einsum('kbij,ak->bija', eris_ovoo, t1T)
            eris_ovoo = None
        load_oovv = prefetch_oovv = None

        eris_ovvo = numpy.empty((nocc, i1 - i0, nvir, nocc))

        def load_ovvo(p0, p1):
            eris_ovvo[:] = eris.ovvo[:, p0:p1]
        with lib.call_in_background(load_ovvo) as prefetch_ovvo:
            #:eris_ovvo = eris.ovvo[:,i0:i1]
            prefetch_ovvo(i0, i1)
            t1T_priv[p0:p1] -= numpy.einsum('bj,jiab->ai', t1T, eris_oovv)
            wVooV -= eris_oovv.transpose(2, 0, 1, 3)
            wVOov += wVooV * .5  #: bjia + bija*.5
        eris_voov = eris_ovvo.transpose(1, 0, 3, 2)
        eris_ovvo = None
        load_ovvo = prefetch_ovvo = None

        def update_wVooV(i0, i1):
            wVooV[:] += fswap['wVooV1'][i0:i1]
            fswap['wVooV1'][i0:i1] = wVooV
            wVOov[:] += fswap['wVOov1'][i0:i1]
            fswap['wVOov1'][i0:i1] = wVOov
        # The statements inside this block do not touch wVooV / wVOov, which
        # the background call is updating.
        with lib.call_in_background(update_wVooV) as update_wVooV:
            update_wVooV(i0, i1)
            t2Tnew[i0:i1] += eris_voov.transpose(0, 3, 1, 2) * .5
            t1T_priv[p0:p1] += 2 * numpy.einsum('bj,aijb->ai', t1T, eris_voov)

            tmp = lib.einsum('ci,kjbc->bijk', t1T, eris_oovv)
            tmp += lib.einsum('bjkc,ci->bjik', eris_voov, t1T)
            t2Tnew[i0:i1] -= lib.einsum('bjik,ak->baji', tmp, t1T)
            eris_oovv = tmp = None

            fov_priv[:, p0:p1] += numpy.einsum('ck,aikc->ia',
                                               t1T, eris_voov) * 2
            fov_priv[:, p0:p1] -= numpy.einsum('ck,akic->ia', t1T, eris_voov)

            tau = numpy.einsum('ai,bj->abij', t1T[p0:p1] * .5, t1T)
            tau += t2T[i0:i1]
            theta = tau.transpose(0, 1, 3, 2) * 2
            theta -= tau
            fvv_priv -= lib.einsum('caij,cjib->ab', theta, eris_voov)
            foo_priv += lib.einsum('aikb,abkj->ij', eris_voov, theta)
            tau = theta = None

            tau = t2T[i0:i1] + numpy.einsum('ai,bj->abij', t1T[p0:p1], t1T)
            woooo += lib.einsum('abij,aklb->ijkl', tau, eris_voov)
            tau = None
        eris_VOov = wVOov = wVooV = update_wVooV = None
        time1 = log.timer_debug1('voov [%d:%d]' % (p0, p1), *time1)

    wVooV = _cp(fswap['wVooV1'])
    for task_id, wVooV, p0, p1 in _rotate_vir_block(wVooV):
        tmp = lib.einsum('ackj,ckib->ajbi', t2T[:, p0:p1], wVooV)
        t2Tnew += tmp.transpose(0, 2, 3, 1)
        t2Tnew += tmp.transpose(0, 2, 1, 3) * .5
    wVooV = tmp = None
    time1 = log.timer_debug1('contracting wVooV', *time1)

    wVOov = _cp(fswap['wVOov1'])
    theta = t2T * 2
    theta -= t2T.transpose(0, 1, 3, 2)
    for task_id, wVOov, p0, p1 in _rotate_vir_block(wVOov):
        t2Tnew += lib.einsum('acik,ckjb->abij', theta[:, p0:p1], wVOov)
    wVOov = theta = None
    fswap = None
    time1 = log.timer_debug1('contracting wVOov', *time1)

    foo += mpi.allreduce(foo_priv)
    fov += mpi.allreduce(fov_priv)
    fvv += mpi.allreduce(fvv_priv)

    theta = t2T.transpose(0, 1, 3, 2) * 2 - t2T
    t1T_priv[vloc0:vloc1] += numpy.einsum('jb,abji->ai', fov, theta)
    ovoo = _cp(eris.ovoo)
    for task_id, ovoo, p0, p1 in _rotate_vir_block(ovoo):
        t1T_priv[vloc0:vloc1] -= lib.einsum('jbki,abjk->ai',
                                            ovoo, theta[:, p0:p1])
    theta = ovoo = None

    woooo = mpi.allreduce(woooo)
    woooo += _cp(eris.oooo).transpose(0, 2, 1, 3)
    tau = t2T + numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
    t2Tnew += .5 * lib.einsum('abkl,ijkl->abij', tau, woooo)
    tau = woooo = None

    t1Tnew += mpi.allreduce(t1T_priv)

    ft_ij = foo + numpy.einsum('aj,ia->ij', .5 * t1T, fov)
    ft_ab = fvv - numpy.einsum('ai,ib->ab', .5 * t1T, fov)
    t2Tnew += lib.einsum('acij,bc->abij', t2T, ft_ab)
    t2Tnew -= lib.einsum('ki,abkj->abij', ft_ij, t2T)

    eia = mo_e_o[:, None] - mo_e_v
    t1Tnew += numpy.einsum('bi,ab->ai', t1T, fvv)
    t1Tnew -= numpy.einsum('aj,ji->ai', t1T, foo)
    t1Tnew /= eia.T

    # Add the index-swapped counterpart held by the other ranks:
    # t2Tnew[a,b,i,j] += t2Tnew[b,a,j,i]
    t2tmp = mpi.alltoall([t2Tnew[:, p0:p1] for p0, p1 in vlocs],
                         split_recvbuf=True)
    for task_id, (p0, p1) in enumerate(vlocs):
        tmp = t2tmp[task_id].reshape(p1 - p0, nvir_seg, nocc, nocc)
        t2Tnew[:, p0:p1] += tmp.transpose(1, 0, 3, 2)

    for i in range(vloc0, vloc1):
        t2Tnew[i - vloc0] /= lib.direct_sum('i+jb->bij', eia[:, i], eia)

    time0 = log.timer_debug1('update t1 t2', *time0)
    return t1Tnew.T, t2Tnew.transpose(2, 3, 0, 1)
def update_amps(mycc, t1, t2, eris):
    '''Compute updated t1/t2 amplitudes from the current ones.

    Args:
        mycc : solver object; ``stdout``, ``verbose``, ``level_shift``,
            ``incore_complete``, ``max_memory`` and ``async_io`` are read.
        t1 : singles amplitudes of shape (nocc, nvir).
        t2 : doubles amplitudes, indexed as [i, j, a, b] in the contractions
            below.
        eris : must be a ``_ChemistsERIs`` instance (asserted); ``fock``,
            ``mo_energy``, ``oooo``, ``ovoo``, ``oovv`` and ``ovvo`` are read.

    Returns:
        ``(t1new, t2new)``.
    '''
    assert (isinstance(eris, _ChemistsERIs))
    time0 = logger.process_clock(), logger.perf_counter()
    log = logger.Logger(mycc.stdout, mycc.verbose)
    nocc, nvir = t1.shape
    fock = eris.fock
    mo_e_o = eris.mo_energy[:nocc]
    mo_e_v = eris.mo_energy[nocc:] + mycc.level_shift
    t1new = numpy.zeros_like(t1)
    # A zero array is passed in place of t1 to _add_vvvv.
    t2new = _add_vvvv(mycc, 0 * t1, t2, eris, t2sym='jiba')
    t2new *= .5 # *.5 because t2+t2.transpose(1,0,3,2) in the end
    time1 = log.timer_debug1('vvvv', *time0)
    #** make_inter_F
    fov = fock[:nocc, nocc:].copy()
    t1new += fov
    foo = fock[:nocc, :nocc] - numpy.diag(mo_e_o)
    fvv = fock[nocc:, nocc:] - numpy.diag(mo_e_v)
    # fswap is a temporary HDF5 file handed to _add_ovvv_ unless
    # mycc.incore_complete is set.
    if mycc.incore_complete:
        fswap = None
    else:
        fswap = lib.H5TmpFile()
    # _add_ovvv_ receives fvv, t1new and t2new (presumably updated in place
    # -- confirm against its definition) and returns the two intermediates
    # sliced per virtual block below.
    fwVOov, fwVooV = _add_ovvv_(mycc, t1, t2, eris, fvv, t1new, t2new, fswap)
    time1 = log.timer_debug1('ovvv', *time1)
    woooo = numpy.asarray(eris.oooo).transpose(0, 2, 1, 3).copy()
    # Block size over the virtual index, derived from the remaining memory
    # budget (max_memory is in MB per the log message below).
    unit = nocc**2 * nvir * 7 + nocc**3 + nocc * nvir**2
    mem_now = lib.current_memory()[0]
    max_memory = max(0, mycc.max_memory - mem_now)
    blksize = min(nvir, max(BLKMIN, int((max_memory * .9e6 / 8 - nocc**4) / unit)))
    log.debug1('max_memory %d MB, nocc,nvir = %d,%d blksize = %d', max_memory, nocc, nvir, blksize)
    for p0, p1 in lib.prange(0, nvir, blksize):
        wVOov = fwVOov[p0:p1]
        wVooV = fwVooV[p0:p1]
        eris_ovoo = eris.ovoo[:, p0:p1]
        eris_oovv = numpy.empty((nocc, nocc, p1 - p0, nvir))
        def load_oovv(p0, p1):
            eris_oovv[:] = eris.oovv[:, :, p0:p1]
        # eris_oovv is filled in the background; it is only read after this
        # ``with`` block has exited.
        with lib.call_in_background(load_oovv, sync=not mycc.async_io) as prefetch_oovv:
            #:eris_oovv = eris.oovv[:,:,p0:p1]
            prefetch_oovv(p0, p1)
            wVOov -= lib.einsum('jbik,ka->bjia', eris_ovoo, t1)
            t2new[:, :, p0:p1] += wVOov.transpose(1, 2, 0, 3)
            eris_ovoo = None
        load_oovv = prefetch_oovv = None
        # The accumulated wVOov is reset to zero here, after it has been
        # added to t2new above.
        wVOov *= 0 # QCI
        eris_ovvo = numpy.empty((nocc, p1 - p0, nvir, nocc))
        def load_ovvo(p0, p1):
            eris_ovvo[:] = eris.ovvo[:, p0:p1]
        # Same pattern: eris_ovvo is loaded in the background while
        # eris_oovv is consumed.
        with lib.call_in_background(load_ovvo, sync=not mycc.async_io) as prefetch_ovvo:
            #:eris_ovvo = eris.ovvo[:,p0:p1]
            prefetch_ovvo(p0, p1)
            t1new[:, p0:p1] -= numpy.einsum('jb,jiab->ia', t1, eris_oovv)
            wVooV -= eris_oovv.transpose(2, 0, 1, 3)
            wVOov += wVooV * .5 #: bjia + bija*.5
        load_ovvo = prefetch_ovvo = None
        t2new[:, :, p0:p1] += (eris_ovvo * 0.5).transpose(0, 3, 1, 2)
        eris_voov = eris_ovvo.conj().transpose(1, 0, 3, 2)
        t1new[:, p0:p1] += 2 * numpy.einsum('jb,aijb->ia', t1, eris_voov)
        eris_ovvo = None
        eris_oovv = tmp = None
        fov[:, p0:p1] += numpy.einsum('kc,aikc->ia', t1, eris_voov) * 2
        fov[:, p0:p1] -= numpy.einsum('kc,akic->ia', t1, eris_voov)
        tau = t2[:, :, p0:p1]
        theta = tau.transpose(1, 0, 2, 3) * 2
        theta -= tau
        fvv -= lib.einsum('cjia,cjib->ab', theta.transpose(2, 1, 0, 3), eris_voov)
        foo += lib.einsum('aikb,kjab->ij', eris_voov, theta)
        theta = None
        woooo += lib.einsum('ijab,aklb->ijkl', tau, eris_voov)
        tau = None
        def update_wVooV(q0, q1, tau):
            wVooV[:] += lib.einsum('bkic,jkca->bija', eris_voov[:, :, :, q0:q1], tau)
        # The name update_wVooV is rebound to the background-call handle for
        # the duration of the block.
        with lib.call_in_background(update_wVooV, sync=not mycc.async_io) as update_wVooV:
            for q0, q1 in lib.prange(0, nvir, blksize):
                tau = t2[:, :, q0:q1] * .5
                #:wVooV += lib.einsum('bkic,jkca->bija', eris_voov[:,:,:,q0:q1], tau)
                update_wVooV(q0, q1, tau)
        tau = update_wVooV = None
        def update_t2(q0, q1, tmp):
            t2new[:, :, q0:q1] += tmp.transpose(2, 0, 1, 3)
            tmp *= .5
            t2new[:, :, q0:q1] += tmp.transpose(0, 2, 1, 3)
        with lib.call_in_background(update_t2, sync=not mycc.async_io) as update_t2:
            for q0, q1 in lib.prange(0, nvir, blksize):
                tmp = lib.einsum('jkca,ckib->jaib', t2[:, :, p0:p1, q0:q1], wVooV)
                #:t2new[:,:,q0:q1] += tmp.transpose(2,0,1,3)
                #:tmp *= .5
                #:t2new[:,:,q0:q1] += tmp.transpose(0,2,1,3)
                update_t2(q0, q1, tmp)
                tmp = None
        wVOov += eris_voov
        eris_VOov = -.5 * eris_voov.transpose(0, 2, 1, 3)
        eris_VOov += eris_voov
        eris_voov = None
        def update_wVOov(q0, q1, tau):
            wVOov[:, :, :, q0:q1] += .5 * lib.einsum('aikc,kcjb->aijb', eris_VOov, tau)
        with lib.call_in_background(update_wVOov, sync=not mycc.async_io) as update_wVOov:
            for q0, q1 in lib.prange(0, nvir, blksize):
                tau = t2[:, :, q0:q1].transpose(1, 3, 0, 2) * 2
                tau -= t2[:, :, q0:q1].transpose(0, 3, 1, 2)
                #:wVOov[:,:,:,q0:q1] += .5 * lib.einsum('aikc,kcjb->aijb', eris_VOov, tau)
                update_wVOov(q0, q1, tau)
                tau = None
        def update_t2(q0, q1, theta):
            t2new[:, :, q0:q1] += lib.einsum('kica,ckjb->ijab', theta, wVOov)
        with lib.call_in_background(update_t2, sync=not mycc.async_io) as update_t2:
            for q0, q1 in lib.prange(0, nvir, blksize):
                theta = t2[:, :, p0:p1, q0:q1] * 2
                theta -= t2[:, :, p0:p1, q0:q1].transpose(1, 0, 2, 3)
                #:t2new[:,:,q0:q1] += lib.einsum('kica,ckjb->ijab', theta, wVOov)
                update_t2(q0, q1, theta)
                theta = None
        eris_VOov = wVOov = wVooV = update_wVOov = None
        time1 = log.timer_debug1('voov [%d:%d]' % (p0, p1), *time1)
    fwVOov = fwVooV = fswap = None
    # Contributions that need the completed fov and woooo intermediates.
    for p0, p1 in lib.prange(0, nvir, blksize):
        theta = t2[:, :, p0:p1].transpose(1, 0, 2, 3) * 2 - t2[:, :, p0:p1]
        t1new += numpy.einsum('jb,ijba->ia', fov[:, p0:p1], theta)
        t1new -= lib.einsum('jbki,kjba->ia', eris.ovoo[:, p0:p1], theta)
        tau = t2[:, :, p0:p1]
        t2new[:, :, p0:p1] += .5 * lib.einsum('ijkl,klab->ijab', woooo, tau)
        theta = tau = None
    t2new += lib.einsum('ijac,bc->ijab', t2, fvv)
    t2new -= lib.einsum('ki,kjab->ijab', foo, t2)
    # eia[i, a] = mo_e_o[i] - mo_e_v[a]; mo_e_v already includes level_shift.
    eia = mo_e_o[:, None] - mo_e_v
    t1new += numpy.einsum('ib,ab->ia', t1, fvv)
    t1new -= numpy.einsum('ja,ji->ia', t1, foo)
    t1new /= eia
    #: t2new = t2new + t2new.transpose(1,0,3,2)
    # Done per occupied index i, handling the j < i block and its mirror
    # together, with the division by the energy denominators in the same pass.
    for i in range(nocc):
        if i > 0:
            t2new[i, :i] += t2new[:i, i].transpose(0, 2, 1)
            t2new[i, :i] /= lib.direct_sum('a,jb->jab', eia[i], eia[:i])
            t2new[:i, i] = t2new[i, :i].transpose(0, 2, 1)
        t2new[i, i] = t2new[i, i] + t2new[i, i].T
        t2new[i, i] /= lib.direct_sum('a,b->ab', eia[i], eia[i])
    time0 = log.timer_debug1('update t1 t2', *time0)
    return t1new, t2new
def make_kpt(uniq_kptji_id, cholesky_j2c):
    '''Assemble and store the fitted 3-index blocks for one unique k-point
    difference.

    For every (ji) pair whose difference maps to ``uniq_kptji_id`` the
    'j3c-junk' data are read from ``fswap``, a plane-wave term built from the
    Fourier-transformed AO pairs is subtracted from the rows ``[naux:]``, the
    result is passed through ``fuse`` and the 2-center factor ``j2c``, and
    written to ``feri`` under 'j3c/<ji>/<istep>' (and 'j3c-/<ji>/<istep>' when
    ``j2c_negative`` is given).  The consumed 'j3c-junk' groups are deleted.

    Args:
        uniq_kptji_id : index into ``uniq_kpts``.
        cholesky_j2c : tuple ``(j2c, j2c_negative, j2ctag)``.
    '''
    kpt = uniq_kpts[uniq_kptji_id]  # kpt = kptj - kpti
    log.debug1('kpt = %s', kpt)
    pair_ids = numpy.where(uniq_inverse == uniq_kptji_id)[0]
    kj_list = kptjs[pair_ids]
    nkj = len(kj_list)
    log.debug1('adapted_ji_idx = %s', pair_ids)

    j2c, j2c_negative, j2ctag = cholesky_j2c

    aux_slice = (auxcell.nbas, fused_cell.nbas)
    Gaux = ft_ao.ft_ao(fused_cell, Gv, aux_slice, b, gxyz, Gvbase, kpt)
    Gaux *= mydf.weighted_coulG(kpt, False, mesh).reshape(-1, 1)
    kLR = Gaux.real.copy('C')
    kLI = Gaux.imag.copy('C')
    del Gaux

    same_kpt = is_zero(kpt)  # kpti == kptj
    with_vbar = same_kpt and cell.dimension == 3
    if same_kpt:
        aosym = 's2'
        nao_pair = nao * (nao + 1) // 2
    else:
        aosym = 's1'
        nao_pair = nao**2
    if with_vbar:
        vbar = fuse(mydf.auxbar(fused_cell))
        ovlp = [lib.pack_tril(s)
                for s in cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kj_list)]

    # Pairs for which only the real part is kept (no imaginary buffer).
    real_only = [same_kpt and gamma_point(kj) for kj in kj_list]

    mem_now = lib.current_memory()[0]
    log.debug2('memory = %s', mem_now)
    mem_avail = max(2000, mydf.max_memory - mem_now)
    # nkptj for 3c-coulomb arrays plus 1 Lpq array
    buflen = min(max(int(mem_avail * .38e6 / 16 / naux / (nkj + 1)), 1),
                 nao_pair)
    shranges = _guess_shell_ranges(cell, buflen, aosym)
    buflen = max(x[2] for x in shranges)
    # +1 for a pqkbuf
    mem_frac = .1e6 if aosym == 's2' else .2e6
    Gblksize = max(16, int(mem_avail * mem_frac / 16 / buflen / (nkj + 1)))
    Gblksize = min(Gblksize, ngrids, 16384)

    pqkRbuf = numpy.empty(buflen * Gblksize)
    pqkIbuf = numpy.empty(buflen * Gblksize)
    # buf for ft_aopair
    ftbuf = numpy.empty(nkj * buflen * Gblksize, dtype=numpy.complex128)

    def pw_contract(istep, sh_range, j3cR, j3cI):
        sh0, sh1, ncol = sh_range
        ket_end = sh1 if aosym == 's2' else cell.nbas
        pair_slice = (sh0, sh1, 0, ket_end)

        for g0, g1 in lib.prange(0, ngrids, Gblksize):
            nG = g1 - g0
            dat = ft_ao._ft_aopair_kpts(cell, Gv[g0:g1], pair_slice, aosym,
                                        b, gxyz[g0:g1], Gvbase, kpt,
                                        kj_list, out=ftbuf)
            pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf)
            pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf)
            for k in range(nkj):
                aoao = dat[k].reshape(nG, ncol)
                pqkR[:] = aoao.real.T
                pqkI[:] = aoao.imag.T

                lib.dot(kLR[g0:g1].T, pqkR.T, -1, j3cR[k][naux:], 1)
                lib.dot(kLI[g0:g1].T, pqkI.T, -1, j3cR[k][naux:], 1)
                if real_only[k]:
                    continue
                lib.dot(kLR[g0:g1].T, pqkI.T, -1, j3cI[k][naux:], 1)
                lib.dot(kLI[g0:g1].T, pqkR.T, 1, j3cI[k][naux:], 1)

        for k, ji in enumerate(pair_ids):
            if real_only[k]:
                v = fuse(j3cR[k])
            else:
                v = fuse(j3cR[k] + j3cI[k] * 1j)

            if j2ctag == 'CD':
                v = scipy.linalg.solve_triangular(j2c, v, lower=True,
                                                  overwrite_b=True)
                feri['j3c/%d/%d' % (ji, istep)] = v
            else:
                feri['j3c/%d/%d' % (ji, istep)] = lib.dot(j2c, v)

            # low-dimension systems
            if j2c_negative is not None:
                feri['j3c-/%d/%d' % (ji, istep)] = lib.dot(j2c_negative, v)

    with lib.call_in_background(pw_contract) as compute:
        col_end = 0
        nsteps = len(shranges)
        for istep, sh_range in enumerate(shranges):
            log.debug1('int3c2e [%d/%d], AO [%d:%d], ncol = %d',
                       istep + 1, nsteps, *sh_range)
            ncol = sh_range[2]
            col0, col_end = col_end, col_end + ncol
            j3cR = []
            j3cI = []
            for k, idx in enumerate(pair_ids):
                v = numpy.vstack(
                    [fswap['j3c-junk/%d/%d' % (idx, i)][0, col0:col_end].T
                     for i in range(nsegs)])
                # vbar is the interaction between the background charge
                # and the auxiliary basis.  0D, 1D, 2D do not have vbar.
                if with_vbar:
                    for i in numpy.where(vbar != 0)[0]:
                        v[i] -= vbar[i] * ovlp[k][col0:col_end]
                j3cR.append(numpy.asarray(v.real, order='C'))
                if real_only[k]:
                    j3cI.append(None)
                else:
                    j3cI.append(numpy.asarray(v.imag, order='C'))
                v = None
            compute(istep, sh_range, j3cR, j3cI)

    for ji in pair_ids:
        del fswap['j3c-junk/%d' % ji]
def _make_eris_outcore(mycc, mo_coeff=None):
    '''Out-of-core integral transformation for the MPI-distributed CCSD.

    Rank 0 initialises the common data (orbitals, Fock matrix, nocc, orbital
    energies) and broadcasts it.  Each rank then half-transforms the AO
    integrals for the occupied orbitals plus its own virtual segment
    ``vlocs[rank]`` and stores the blocks oooo, oovv, ovoo, ovvo, ovov and
    vvvo in a temporary HDF5 file attached to the returned object.

    Args:
        mycc : CC object; ``mycc.direct`` must be true (asserted).
        mo_coeff : orbital coefficients forwarded to ``eris._common_init_``
            on rank 0.

    Returns:
        The ``_ChemistsERIs`` object, also stored as ``mycc._eris``.
    '''
    # time.clock() was removed in Python 3.8; use the logger's clocks so the
    # reference points match what log.timer() subtracts from.
    cput0 = (logger.process_clock(), logger.perf_counter())
    log = logger.Logger(mycc.stdout, mycc.verbose)
    _sync_(mycc)
    eris = ccsd._ChemistsERIs()
    if rank == 0:
        eris._common_init_(mycc, mo_coeff)
        comm.bcast((eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy))
    else:
        eris.mol = mycc.mol
        eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy = comm.bcast(None)

    mol = mycc.mol
    mo_coeff = numpy.asarray(eris.mo_coeff, order='F')
    nocc = eris.nocc
    nao, nmo = mo_coeff.shape
    nvir = nmo - nocc
    orbo = mo_coeff[:, :nocc]
    orbv = mo_coeff[:, nocc:]
    nvpair = nvir * (nvir + 1) // 2
    vlocs = [_task_location(nvir, task_id) for task_id in range(mpi.pool.size)]
    vloc0, vloc1 = vlocs[rank]
    vseg = vloc1 - vloc0

    eris.feri1 = lib.H5TmpFile()
    eris.oooo = eris.feri1.create_dataset('oooo', (nocc, nocc, nocc, nocc), 'f8')
    eris.oovv = eris.feri1.create_dataset('oovv', (nocc, nocc, vseg, nvir), 'f8',
                                          chunks=(nocc, nocc, 1, nvir))
    eris.ovoo = eris.feri1.create_dataset('ovoo', (nocc, vseg, nocc, nocc), 'f8',
                                          chunks=(nocc, 1, nocc, nocc))
    eris.ovvo = eris.feri1.create_dataset('ovvo', (nocc, vseg, nvir, nocc), 'f8',
                                          chunks=(nocc, 1, nvir, nocc))
    eris.ovov = eris.feri1.create_dataset('ovov', (nocc, vseg, nocc, nvir), 'f8',
                                          chunks=(nocc, 1, nocc, nvir))
    # eris.ovvv = eris.feri1.create_dataset('ovvv', (nocc,vseg,nvpair), 'f8', chunks=(nocc,1,nvpair))
    eris.vvvo = eris.feri1.create_dataset('vvvo', (vseg, nvir, nvir, nocc), 'f8',
                                          chunks=(1, nvir, 1, nocc))
    assert (mycc.direct)

    def save_occ_frac(p0, p1, eri):
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.oooo[p0:p1] = eri[:, :, :nocc, :nocc]
        eris.oovv[p0:p1] = eri[:, :, nocc + vloc0:nocc + vloc1, nocc:]

    def save_vir_frac(p0, p1, eri):
        log.alldebug1('save_vir_frac %d %d %s', p0, p1, eri.shape)
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.ovoo[:, p0:p1] = eri[:, :, :nocc, :nocc].transpose(1, 0, 2, 3)
        eris.ovvo[:, p0:p1] = eri[:, :, nocc:, :nocc].transpose(1, 0, 2, 3)
        eris.ovov[:, p0:p1] = eri[:, :, :nocc, nocc:].transpose(1, 0, 2, 3)
        # vvv = lib.pack_tril(eri[:,:,nocc:,nocc:].reshape((p1-p0)*nocc,nvir,nvir))
        # eris.ovvv[:,p0:p1] = vvv.reshape(p1-p0,nocc,nvpair).transpose(1,0,2)

        cput2 = logger.process_clock(), logger.perf_counter()
        # Exchange the virtual-virtual part so that each rank receives the
        # columns belonging to its own virtual segment.
        ovvv_segs = [eri[:, :, nocc + q0:nocc + q1, nocc:].transpose(2, 3, 0, 1)
                     for q0, q1 in vlocs]
        ovvv_segs = mpi.alltoall(ovvv_segs, split_recvbuf=True)
        cput2 = log.timer_debug1('vvvo alltoall', *cput2)
        for task_id, (q0, q1) in enumerate(comm.allgather((p0, p1))):
            ip0 = q0 + vlocs[task_id][0]
            ip1 = q1 + vlocs[task_id][0]
            eris.vvvo[:, :, ip0:ip1] = ovvv_segs[task_id].reshape(
                vseg, nvir, q1 - q0, nocc)

    fswap = lib.H5TmpFile()
    max_memory = max(MEMORYMIN, mycc.max_memory - lib.current_memory()[0])

    int2e = mol._add_suffix('int2e')
    orbov = numpy.hstack((orbo, orbv[:, vloc0:vloc1]))
    ao2mo.outcore.half_e1(mol, (orbov, orbo), fswap, int2e,
                          's4', 1, max_memory, verbose=log)

    ao_loc = mol.ao_loc_nr()
    nao_pair = nao * (nao + 1) // 2
    blksize = int(min(8e9, max_memory * .5e6) / 8 / (nao_pair + nmo**2) / nocc)
    blksize = min(nvir, max(BLKMIN, blksize))
    fload = ao2mo.outcore._load_from_h5g

    buf = numpy.empty((blksize * nocc, nao_pair))
    buf_prefetch = numpy.empty_like(buf)

    def prefetch(p0, p1, rowmax, out):
        # Load the block that follows [p0:p1] into ``out``.  The target
        # buffer is passed explicitly so the background thread cannot pick
        # up a buffer that the main loop has already swapped.
        p0, p1 = p1, min(rowmax, p1 + blksize)
        if p0 < p1:
            fload(fswap['0'], p0 * nocc, p1 * nocc, out)

    cput1 = logger.process_clock(), logger.perf_counter()
    outbuf = numpy.empty((blksize * nocc, nmo**2))
    with lib.call_in_background(prefetch) as bprefetch:
        fload(fswap['0'], 0, min(nocc, blksize) * nocc, buf_prefetch)
        for p0, p1 in lib.prange(0, nocc, blksize):
            nrow = (p1 - p0) * nocc
            buf, buf_prefetch = buf_prefetch, buf
            bprefetch(p0, p1, nocc, buf_prefetch)
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff, (0, nmo, 0, nmo),
                                     's4', 's1', out=outbuf, ao_loc=ao_loc)
            save_occ_frac(p0, p1, dat)

        # save_vir_frac contains collective calls, so every rank uses the
        # same (smallest) block size and iterates with mpi.prange.
        blksize = min(comm.allgather(blksize))
        norb_max = nocc + vseg
        fload(fswap['0'], nocc**2, min(nocc + blksize, norb_max) * nocc,
              buf_prefetch)
        for p0, p1 in mpi.prange(vloc0, vloc1, blksize):
            i0, i1 = p0 - vloc0, p1 - vloc0
            nrow = (p1 - p0) * nocc
            buf, buf_prefetch = buf_prefetch, buf
            bprefetch(nocc + i0, nocc + i1, norb_max, buf_prefetch)
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff, (0, nmo, 0, nmo),
                                     's4', 's1', out=outbuf, ao_loc=ao_loc)
            save_vir_frac(i0, i1, dat)

    # Drop the work buffers (the original misspelled buf_prefetch here, so
    # that buffer was never released).
    buf = buf_prefetch = outbuf = None
    cput1 = log.timer_debug1('transforming oppp', *cput1)
    log.timer('CCSD integral transformation', *cput0)
    mycc._eris = eris
    return eris
def _aux_e2(cell, auxcell, erifile, intor='int3c2e', aosym='s2ij', comp=None,
            kptij_lst=None, dataname='eri_mo', shls_slice=None,
            max_memory=2000, verbose=0):
    r'''3-center AO integrals (ij|L) with double lattice sum:
    \sum_{lm} (i[l]j[m]|L[0]), where L is the auxiliary basis.
    Three-index integral tensor (kptij_idx, nao_pair, naux) or four-index
    integral tensor (kptij_idx, comp, nao_pair, naux) are stored on disk.

    **This function should be only used by df and mdf initialization function
    _make_j3c**

    The integrals are written in segments over the auxiliary shells as
    ``<dataname>/<k>/<istep>``; the k-point pairs are stored under
    ``<dataname>-kptij``.  A file opened here is closed before returning,
    also on error; an ``h5py.Group`` passed in is left open.

    Args:
        kptij_lst : (*,2,3) array
            A list of (kpti, kptj)

    Returns:
        ``erifile``, unchanged.
    '''
    intor, comp = gto.moleintor._get_intor_and_comp(cell._add_suffix(intor),
                                                    comp)

    owns_file = not isinstance(erifile, h5py.Group)
    if not owns_file:
        feri = erifile
    elif h5py.is_hdf5(erifile):
        feri = h5py.File(erifile, 'a')
    else:
        feri = h5py.File(erifile, 'w')

    try:
        if dataname in feri:
            del feri[dataname]
        if dataname + '-kptij' in feri:
            del feri[dataname + '-kptij']

        if kptij_lst is None:
            kptij_lst = numpy.zeros((1, 2, 3))
        feri[dataname + '-kptij'] = kptij_lst

        if shls_slice is None:
            shls_slice = (0, cell.nbas, 0, cell.nbas, 0, auxcell.nbas)

        ao_loc = cell.ao_loc_nr()
        aux_loc = auxcell.ao_loc_nr(
            auxcell.cart or 'ssc' in intor)[:shls_slice[5] + 1]
        ni = ao_loc[shls_slice[1]] - ao_loc[shls_slice[0]]
        nj = ao_loc[shls_slice[3]] - ao_loc[shls_slice[2]]
        nkptij = len(kptij_lst)

        nii = (ao_loc[shls_slice[1]] * (ao_loc[shls_slice[1]] + 1) // 2 -
               ao_loc[shls_slice[0]] * (ao_loc[shls_slice[0]] + 1) // 2)
        nij = ni * nj

        kpti = kptij_lst[:, 0]
        kptj = kptij_lst[:, 1]
        aosym_ks2 = abs(kpti - kptj).sum(axis=1) < KPT_DIFF_TOL
        j_only = numpy.all(aosym_ks2)
        #aosym_ks2 &= (aosym[:2] == 's2' and shls_slice[:2] == shls_slice[2:4])
        aosym_ks2 &= aosym[:2] == 's2'

        if j_only and aosym[:2] == 's2':
            assert (shls_slice[2] == 0)
            nao_pair = nii
        else:
            nao_pair = nij

        if gamma_point(kptij_lst):
            dtype = numpy.double
        else:
            dtype = numpy.complex128

        buflen = max(8, int(max_memory * .47e6 / 16 /
                            (nkptij * ni * nj * comp)))
        auxdims = (aux_loc[shls_slice[4] + 1:shls_slice[5] + 1] -
                   aux_loc[shls_slice[4]:shls_slice[5]])
        auxranges = balance_segs(auxdims, buflen)
        buflen = max([x[2] for x in auxranges])
        # Two buffers: one is being saved in the background while the next
        # segment is computed into the other.
        buf = numpy.empty(nkptij * comp * ni * nj * buflen, dtype=dtype)
        buf1 = numpy.empty_like(buf)

        int3c = wrap_int3c(cell, auxcell, intor, aosym, comp, kptij_lst)

        kpt_ji = kptj - kpti
        uniq_kpts, _uniq_index, uniq_inverse = unique(kpt_ji)
        # sorted_ij_idx: Sort and group the kptij_lst according to the
        # ordering in df._make_j3c to reduce the data fragment in the hdf5
        # file.  When datasets are written to hdf5, they are saved
        # sequentially.  If the integral data are saved as the order of
        # kptij_lst, removing the datasets in df._make_j3c will lead to holes
        # that can not be reused.
        sorted_ij_idx = numpy.hstack(
            [numpy.where(uniq_inverse == k)[0]
             for k, kpt in enumerate(uniq_kpts)])

        tril_idx = numpy.tril_indices(ni)
        tril_idx = tril_idx[0] * ni + tril_idx[1]

        def save(istep, mat):
            for k in sorted_ij_idx:
                v = mat[k]
                if gamma_point(kptij_lst[k]):
                    v = v.real
                # Keep only the lower-triangular pairs when this k-pair has
                # s2 symmetry but the integrals were generated as full ni*ni.
                if aosym_ks2[k] and nao_pair == ni**2:
                    v = v[:, tril_idx]
                feri['%s/%d/%d' % (dataname, k, istep)] = v

        with lib.call_in_background(save) as bsave:
            for istep, auxrange in enumerate(auxranges):
                sh0, sh1, nrow = auxrange
                sub_slice = (shls_slice[0], shls_slice[1],
                             shls_slice[2], shls_slice[3],
                             shls_slice[4] + sh0, shls_slice[4] + sh1)
                mat = numpy.ndarray((nkptij, comp, nao_pair, nrow),
                                    dtype=dtype, buffer=buf)
                bsave(istep, int3c(sub_slice, mat))
                buf, buf1 = buf1, buf
    finally:
        # Close only a file this function opened; never a caller's group.
        if owns_file:
            feri.close()
    return erifile
def _make_eris_outcore(myci, mo_coeff=None):
    '''Transform the two-electron integrals to the MO basis, keeping the
    results in temporary HDF5 files.

    The datasets 'oooo', 'vvoo', 'vooo', 'voov' and 'vovv' (the last one with
    the two trailing virtual indices packed as a lower triangle) are created
    in ``eris.feri1``.  Unless ``myci.direct`` is set, ``eris.vvvv`` is
    additionally generated in ``eris.feri2`` by ``ao2mo.full``.

    Args:
        myci : object providing ``mol``, ``stdout``, ``verbose``,
            ``max_memory`` and ``direct``.
        mo_coeff : forwarded to ``_RCISD_ERIs``; the coefficients actually
            used are ``eris.mo_coeff``.

    Returns:
        The ``_RCISD_ERIs`` object with the attributes described above.
    '''
    # time.clock() was removed in Python 3.8; prefer time.process_time() and
    # fall back to time.clock() only on interpreters that lack it.
    cpu_clock = getattr(time, 'process_time', None) or time.clock
    cput0 = (cpu_clock(), time.time())
    log = logger.Logger(myci.stdout, myci.verbose)
    eris = _RCISD_ERIs(myci, mo_coeff)

    mol = myci.mol
    mo_coeff = eris.mo_coeff
    nocc = eris.nocc
    nao, nmo = mo_coeff.shape
    nvir = nmo - nocc
    orbv = mo_coeff[:, nocc:]
    nvir_pair = nvir * (nvir + 1) // 2
    eris.feri1 = lib.H5TmpFile()
    eris.oooo = eris.feri1.create_dataset('oooo', (nocc, nocc, nocc, nocc), 'f8')
    eris.vvoo = eris.feri1.create_dataset('vvoo', (nvir, nvir, nocc, nocc), 'f8')
    eris.vooo = eris.feri1.create_dataset('vooo', (nvir, nocc, nocc, nocc), 'f8')
    eris.voov = eris.feri1.create_dataset('voov', (nvir, nocc, nocc, nvir), 'f8')
    eris.vovv = eris.feri1.create_dataset('vovv', (nvir, nocc, nvir_pair), 'f8')

    # oovv is collected in memory and written transposed as vvoo at the end.
    oovv = numpy.empty((nocc, nocc, nvir, nvir))

    def save_occ_frac(p0, p1, eri):
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.oooo[p0:p1] = eri[:, :, :nocc, :nocc]
        oovv[p0:p1] = eri[:, :, nocc:, nocc:]

    def save_vir_frac(p0, p1, eri):
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.vooo[p0:p1] = eri[:, :, :nocc, :nocc]
        eris.voov[p0:p1] = eri[:, :, :nocc, nocc:]
        vv = _cp(eri[:, :, nocc:, nocc:].reshape((p1 - p0) * nocc, nvir, nvir))
        eris.vovv[p0:p1] = lib.pack_tril(vv).reshape(p1 - p0, nocc, nvir_pair)

    cput1 = cpu_clock(), time.time()
    if not myci.direct:
        max_memory = max(2000, myci.max_memory - lib.current_memory()[0])
        eris.feri2 = lib.H5TmpFile()
        ao2mo.full(mol, orbv, eris.feri2, max_memory=max_memory, verbose=log)
        eris.vvvv = eris.feri2['eri_mo']
        cput1 = log.timer_debug1('transforming vvvv', *cput1)

    tmpfile3 = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
    with h5py.File(tmpfile3.name, 'w') as fswap:
        mo_coeff = numpy.asarray(mo_coeff, order='F')
        max_memory = max(2000, myci.max_memory - lib.current_memory()[0])
        int2e = mol._add_suffix('int2e')
        # Half-transformed integrals are written to fswap and read back
        # below in row blocks.
        ao2mo.outcore.half_e1(mol, (mo_coeff, mo_coeff[:, :nocc]), fswap,
                              int2e, 's4', 1, max_memory, verbose=log)

        ao_loc = mol.ao_loc_nr()
        nao_pair = nao * (nao + 1) // 2
        blksize = int(min(8e9, max_memory * .5e6) / 8 /
                      (nao_pair + nmo**2) / nocc)
        blksize = max(1, min(nmo * nocc, blksize))
        fload = ao2mo.outcore._load_from_h5g

        def prefetch(p0, p1, rowmax, buf):
            # Load the block that FOLLOWS [p0:p1], clipped to rowmax.
            p0, p1 = p1, min(rowmax, p1 + blksize)
            if p0 < p1:
                fload(fswap['0'], p0 * nocc, p1 * nocc, buf)

        buf = numpy.empty((blksize * nocc, nao_pair))
        buf_prefetch = numpy.empty_like(buf)
        outbuf = numpy.empty((blksize * nocc, nmo**2))
        with lib.call_in_background(prefetch) as bprefetch:
            # First occupied block is loaded synchronously; later blocks are
            # fetched in the background while the current one is processed.
            fload(fswap['0'], 0, min(nocc, blksize) * nocc, buf_prefetch)
            for p0, p1 in lib.prange(0, nocc, blksize):
                nrow = (p1 - p0) * nocc
                buf, buf_prefetch = buf_prefetch, buf
                bprefetch(p0, p1, nocc, buf_prefetch)
                dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff,
                                         (0, nmo, 0, nmo), 's4', 's1',
                                         out=outbuf, ao_loc=ao_loc)
                save_occ_frac(p0, p1, dat)

            # Same scheme for the rows belonging to virtual orbitals.
            fload(fswap['0'], nocc**2, min(nmo, nocc + blksize) * nocc,
                  buf_prefetch)
            for p0, p1 in lib.prange(0, nvir, blksize):
                nrow = (p1 - p0) * nocc
                buf, buf_prefetch = buf_prefetch, buf
                bprefetch(nocc + p0, nocc + p1, nmo, buf_prefetch)
                dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff,
                                         (0, nmo, 0, nmo), 's4', 's1',
                                         out=outbuf, ao_loc=ao_loc)
                save_vir_frac(p0, p1, dat)
        cput1 = log.timer_debug1('transforming oppp', *cput1)

    eris.vvoo[:] = lib.transpose(oovv.reshape(nocc**2, -1)).reshape(
        nvir, nvir, nocc, nocc)
    log.timer('CISD integral transformation', *cput0)
    return eris
def _assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file,
              fswap, log):
    '''Generate the 3-index integrals job by job, then redistribute them
    between the MPI processes and write them to ``cderi_file``.

    Pass 1 calls ``gen_int3c(job_id, ish0, ish1)`` and, for every unique
    kptj-kpti difference k, ``ft_fuse(job_id, k, ish0, ish1)`` for the jobs
    this process obtains from ``mpi.work_stealing_partition``.

    Pass 2 reads the 'j3c-chunks/<job>/<k>' datasets from ``fswap``,
    exchanges them with ``mpi.alltoall`` and stores the rows owned by this
    process in 'j3c/<k>' of ``cderi_file``.  If ``fswap`` contains 'j2c-',
    the corresponding 'j3c-/<job>/<k>' data are gathered and written as
    'j3c-/<k>' by rank 0.  ``kptij_lst`` is stored as 'j3c-kptij'.

    Args:
        mydf : object providing ``cell`` and ``max_memory``.
        kptij_lst : array indexed as [:,0] (kpti) and [:,1] (kptj).
        j3c_jobs : sequence of (job_id, ish0, ish1).
        gen_int3c, ft_fuse : callables, invoked as described above.
        cderi_file : name of the output file; it is created (mode 'w') and
            is closed on return, also when an error occurs.
        fswap : HDF5-like container holding 'j2c/<k>' and the intermediate
            data produced in pass 1 (presumably by gen_int3c/ft_fuse --
            TODO confirm).
        log : logger object.

    Returns:
        None
    '''
    # time.clock() was removed in Python 3.8; prefer time.process_time() and
    # fall back to time.clock() only on interpreters that lack it.
    cpu_clock = getattr(time, 'process_time', None) or time.clock
    t1 = (cpu_clock(), time.time())
    cell = mydf.cell
    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    kptis = kptij_lst[:, 0]
    kptjs = kptij_lst[:, 1]
    kpt_ji = kptjs - kptis
    uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
    aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9

    t2 = t1
    # j3c_workers[job_id] records the rank that processed the job.  Each rank
    # fills in only its own jobs; allreduce merges the records.
    j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int)
    #for job_id, ish0, ish1 in mpi.work_share_partition(j3c_jobs):
    for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs):
        gen_int3c(job_id, ish0, ish1)
        t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2)

        for k, kpt in enumerate(uniq_kpts):
            ft_fuse(job_id, k, ish0, ish1)
            t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2)

        j3c_workers[job_id] = rank
    j3c_workers = mpi.allreduce(j3c_workers)
    log.debug2('j3c_workers %s', j3c_workers)
    t1 = log.timer_debug1('int3c and fuse', *t1)

    # Pass 2
    # Transpose 3-index tensor and save data in cderi_file
    with h5py.File(cderi_file, 'w') as feri:
        nauxs = [fswap['j2c/%d' % k].shape[0]
                 for k, kpt in enumerate(uniq_kpts)]
        # Every rank owns a contiguous segment of at most segsize aux rows.
        segsize = (max(nauxs) + mpi.pool.size - 1) // mpi.pool.size
        naux0 = rank * segsize
        for k, kptij in enumerate(kptij_lst):
            naux1 = min(nauxs[uniq_inverse[k]], naux0 + segsize)
            nrow = max(0, naux1 - naux0)
            if gamma_point(kptij):
                dtype = 'f8'
            else:
                dtype = 'c16'
            if aosym_s2[k]:
                nao_pair = nao * (nao + 1) // 2
            else:
                nao_pair = nao * nao
            feri.create_dataset('j3c/%d' % k, (nrow, nao_pair), dtype,
                                maxshape=(None, nao_pair))

        def get_segs_loc(aosym):
            # For every job (in job order), the (start, end) offsets of its
            # AO-pair columns inside the buffer that is concatenated
            # worker by worker.
            off0 = numpy.asarray([ao_loc[i0] for x, i0, i1 in j3c_jobs])
            off1 = numpy.asarray([ao_loc[i1] for x, i0, i1 in j3c_jobs])
            if aosym:  # s2
                dims = off1 * (off1 + 1) // 2 - off0 * (off0 + 1) // 2
            else:
                dims = (off1 - off0) * nao
            #dims = numpy.asarray([ao_loc[i1]-ao_loc[i0] for x,i0,i1 in j3c_jobs])
            dims = numpy.hstack(
                [dims[j3c_workers == w] for w in range(mpi.pool.size)])
            job_idx = numpy.hstack(
                [numpy.where(j3c_workers == w)[0]
                 for w in range(mpi.pool.size)])
            segs_loc = numpy.append(0, numpy.cumsum(dims))
            segs_loc = [(segs_loc[j], segs_loc[j + 1])
                        for j in numpy.argsort(job_idx)]
            return segs_loc
        segs_loc_s1 = get_segs_loc(False)
        segs_loc_s2 = get_segs_loc(True)

        job_ids = numpy.where(rank == j3c_workers)[0]

        def load(k, p0, p1):
            # One flattened segment per destination rank: rows [p0:p1] of
            # that rank's aux segment, taken from the jobs of this process.
            naux1 = nauxs[uniq_inverse[k]]
            slices = [(min(i * segsize + p0, naux1),
                       min(i * segsize + p1, naux1))
                      for i in range(mpi.pool.size)]
            segs = []
            for p0, p1 in slices:
                val = [fswap['j3c-chunks/%d/%d' % (job, k)][p0:p1].ravel()
                       for job in job_ids]
                if val:
                    segs.append(numpy.hstack(val))
                else:
                    segs.append(numpy.zeros(0))
            return segs

        def save(k, p0, p1, segs):
            segs = mpi.alltoall(segs)
            naux1 = nauxs[uniq_inverse[k]]
            loc0, loc1 = min(p0, naux1 - naux0), min(p1, naux1 - naux0)
            nL = loc1 - loc0
            if nL > 0:
                if aosym_s2[k]:
                    segs = numpy.hstack(
                        [segs[i0 * nL:i1 * nL].reshape(nL, -1)
                         for i0, i1 in segs_loc_s2])
                else:
                    segs = numpy.hstack(
                        [segs[i0 * nL:i1 * nL].reshape(nL, -1)
                         for i0, i1 in segs_loc_s1])
                feri['j3c/%d' % k][loc0:loc1] = segs

        mem_now = max(comm.allgather(lib.current_memory()[0]))
        max_memory = max(2000, min(8000, mydf.max_memory - mem_now))
        if numpy.all(aosym_s2):
            if gamma_point(kptij_lst):
                blksize = max(16, int(max_memory * .5e6 / 8 / nao**2))
            else:
                blksize = max(16, int(max_memory * .5e6 / 16 / nao**2))
        else:
            blksize = max(16, int(max_memory * .5e6 / 16 / nao**2 / 2))
        log.debug1('max_momory %d MB (%d in use), blksize %d',
                   max_memory, mem_now, blksize)

        t2 = t1
        with lib.call_in_background(save) as async_write:
            for k, kptji in enumerate(kptij_lst):
                for p0, p1 in lib.prange(0, segsize, blksize):
                    segs = load(k, p0, p1)
                    async_write(k, p0, p1, segs)
                    t2 = log.timer_debug1(
                        'assemble k=%d %d:%d (in %d)' %
                        (k, p0, p1, segsize), *t2)

        if 'j2c-' in fswap:
            j2c_kpts_lists = []
            for k, kpt in enumerate(uniq_kpts):
                if ('j2c-/%d' % k) in fswap:
                    adapted_ji_idx = numpy.where(uniq_inverse == k)[0]
                    j2c_kpts_lists.append(adapted_ji_idx)

            for k in numpy.hstack(j2c_kpts_lists):
                val = [numpy.asarray(fswap['j3c-/%d/%d' % (job, k)]).ravel()
                       for job in job_ids]
                val = mpi.gather(numpy.hstack(val))
                if rank == 0:
                    naux1 = fswap['j3c-/0/%d' % k].shape[0]
                    if aosym_s2[k]:
                        v = [val[i0 * naux1:i1 * naux1].reshape(naux1, -1)
                             for i0, i1 in segs_loc_s2]
                    else:
                        v = [val[i0 * naux1:i1 * naux1].reshape(naux1, -1)
                             for i0, i1 in segs_loc_s1]
                    feri['j3c-/%d' % k] = numpy.hstack(v)

        if 'j3c-kptij' in feri:
            del feri['j3c-kptij']
        feri['j3c-kptij'] = kptij_lst
        t1 = log.alltimer_debug1('assembling j3c', *t1)
def _ao2mo_ovov(mp, orbs, feri, max_memory=2000, verbose=None):
    '''Transform AO integrals to the datasets 'ovov', 'ovOV' and 'OVOV',
    which are created in ``feri``.

    Pass 1 evaluates the AO integrals in blocks of shell pairs, contracts the
    two outer AO indices with orbs[0]/orbs[2] and stores the intermediates in
    a temporary HDF5 file under 'aa/<n>', 'bb/<n>' and 'ab/<n>'.  For 'aa'
    and 'bb' only blocks with ish0 >= jsh0 are stored; the transposed blocks
    are rebuilt when the data are loaded.

    Pass 2 reloads the intermediates in batches of the first index,
    contracts the remaining two AO indices with orbs[1]/orbs[3] and writes
    the result to ``feri``.  Reading and writing run in background threads.

    Args:
        mp : object providing ``mol``.
        orbs : sequence of four coefficient arrays with nao rows, named
            orboa, orbva, orbob, orbvb below (presumably occupied/virtual
            orbitals of the two spins -- TODO confirm against the caller).
        feri : object with ``create_dataset`` (e.g. an open h5py file).
        max_memory : number
            Controls the block sizes (presumably in MB -- TODO confirm).
        verbose : passed to ``logger.new_logger``.

    Returns:
        None
    '''
    # time.clock() was removed in Python 3.8; prefer time.process_time() and
    # fall back to time.clock() only on interpreters that lack it.
    cpu_clock = getattr(time, 'process_time', None) or time.clock
    time0 = (cpu_clock(), time.time())
    log = logger.new_logger(mp, verbose)
    orboa = numpy.asarray(orbs[0], order='F')
    orbva = numpy.asarray(orbs[1], order='F')
    orbob = numpy.asarray(orbs[2], order='F')
    orbvb = numpy.asarray(orbs[3], order='F')
    nao, nocca = orboa.shape
    noccb = orbob.shape[1]
    nvira = orbva.shape[1]
    nvirb = orbvb.shape[1]

    mol = mp.mol
    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nbas = mol.nbas
    assert(nvira <= nao)
    assert(nvirb <= nao)

    ao_loc = mol.ao_loc_nr()
    dmax = max(4, min(nao/3, numpy.sqrt(max_memory*.95e6/8/(nao+nocca)**2)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao,dmax,dmax,nao))
    ftmp = lib.H5TmpFile()
    disk = (nocca**2*(nao*(nao+dmax)/2+nvira**2) +
            noccb**2*(nao*(nao+dmax)/2+nvirb**2) +
            nocca*noccb*(nao**2+nvira*nvirb))
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax, disk*8/1e6)

    fint = gto.moleintor.getints4c
    aa_blk_slices = []
    ab_blk_slices = []
    count_ab = 0
    count_aa = 0
    time1 = time0
    with lib.call_in_background(ftmp.__setitem__) as save:
        for ish0, ish1, ni in sh_ranges:
            for jsh0, jsh1, nj in sh_ranges:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]

                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=(0,nbas,ish0,ish1, jsh0,jsh1,0,nbas),
                           aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                tmp_i = lib.ddot(orboa.T,
                                 eri.reshape(nao,(i1-i0)*(j1-j0)*nao))
                tmp_li = lib.ddot(
                    orbob.T, tmp_i.reshape(nocca*(i1-i0)*(j1-j0),nao).T)
                tmp_li = tmp_li.reshape(noccb,nocca,(i1-i0),(j1-j0))
                save('ab/%d'%count_ab, tmp_li.transpose(1,0,2,3))
                ab_blk_slices.append((i0,i1,j0,j1))
                count_ab += 1

                # aa and bb: store only blocks with ish0 >= jsh0; load_aa
                # restores the other blocks by transposition.
                if ish0 >= jsh0:
                    tmp_li = lib.ddot(
                        orboa.T, tmp_i.reshape(nocca*(i1-i0)*(j1-j0),nao).T)
                    tmp_li = tmp_li.reshape(nocca,nocca,(i1-i0),(j1-j0))
                    save('aa/%d'%count_aa, tmp_li.transpose(1,0,2,3))

                    tmp_i = lib.ddot(orbob.T,
                                     eri.reshape(nao,(i1-i0)*(j1-j0)*nao))
                    tmp_li = lib.ddot(
                        orbob.T, tmp_i.reshape(noccb*(i1-i0)*(j1-j0),nao).T)
                    tmp_li = tmp_li.reshape(noccb,noccb,(i1-i0),(j1-j0))
                    save('bb/%d'%count_aa, tmp_li.transpose(1,0,2,3))
                    aa_blk_slices.append((i0,i1,j0,j1))
                    count_aa += 1

                time1 = log.timer_debug1('partial ao2mo [%d:%d,%d:%d]' %
                                         (ish0,ish1,jsh0,jsh1), *time1)
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)
    # Drop the references to the large pass-1 arrays.
    eri = eribuf = tmp_i = tmp_li = None

    fovov = feri.create_dataset('ovov', (nocca*nvira,nocca*nvira), 'f8',
                                chunks=(nvira,nvira))
    fovOV = feri.create_dataset('ovOV', (nocca*nvira,noccb*nvirb), 'f8',
                                chunks=(nvira,nvirb))
    fOVOV = feri.create_dataset('OVOV', (noccb*nvirb,noccb*nvirb), 'f8',
                                chunks=(nvirb,nvirb))
    occblk = int(min(max(nocca,noccb),
                     max(4, 250/nocca, max_memory*.9e6/8/(nao**2*nocca)/5)))

    def load_aa(h5g, nocc, i0, eri):
        # Fill eri with the rows [i0:i0+occblk] (clipped to nocc); nothing is
        # done when i0 is past the end.
        if i0 < nocc:
            i1 = min(i0+occblk, nocc)
            for k, (p0,p1,q0,q1) in enumerate(aa_blk_slices):
                eri[:i1-i0,:,p0:p1,q0:q1] = h5g[str(k)][i0:i1]
                if p0 != q0:
                    dat = numpy.asarray(h5g[str(k)][:,i0:i1])
                    eri[:i1-i0,:,q0:q1,p0:p1] = dat.transpose(1,0,3,2)

    def load_ab(h5g, nocca, i0, eri):
        if i0 < nocca:
            i1 = min(i0+occblk, nocca)
            for k, (p0,p1,q0,q1) in enumerate(ab_blk_slices):
                eri[:i1-i0,:,p0:p1,q0:q1] = h5g[str(k)][i0:i1]

    def save(h5dat, nvir, i0, i1, dat):
        for i in range(i0, i1):
            h5dat[i*nvir:(i+1)*nvir] = dat[i-i0].reshape(nvir,-1)

    with lib.call_in_background(save) as bsave:
        with lib.call_in_background(load_aa) as prefetch:
            # Two buffers: the next batch is read in the background while
            # the current one is transformed.
            buf_prefetch = numpy.empty((occblk,nocca,nao,nao))
            buf = numpy.empty_like(buf_prefetch)
            load_aa(ftmp['aa'], nocca, 0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocca, occblk):
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(ftmp['aa'], nocca, i1, buf_prefetch)
                eri = buf[:i1-i0].reshape((i1-i0)*nocca,nao,nao)
                dat = _ao2mo.nr_e2(eri, orbva, (0,nvira,0,nvira), 's1', 's1')
                bsave(fovov, nvira, i0, i1,
                      dat.reshape(i1-i0,nocca,nvira,nvira).transpose(0,2,1,3))
                time1 = log.timer_debug1('pass2 ao2mo for aa [%d:%d]' %
                                         (i0,i1), *time1)

            buf_prefetch = numpy.empty((occblk,noccb,nao,nao))
            buf = numpy.empty_like(buf_prefetch)
            load_aa(ftmp['bb'], noccb, 0, buf_prefetch)
            for i0, i1 in lib.prange(0, noccb, occblk):
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(ftmp['bb'], noccb, i1, buf_prefetch)
                eri = buf[:i1-i0].reshape((i1-i0)*noccb,nao,nao)
                dat = _ao2mo.nr_e2(eri, orbvb, (0,nvirb,0,nvirb), 's1', 's1')
                bsave(fOVOV, nvirb, i0, i1,
                      dat.reshape(i1-i0,noccb,nvirb,nvirb).transpose(0,2,1,3))
                time1 = log.timer_debug1('pass2 ao2mo for bb [%d:%d]' %
                                         (i0,i1), *time1)

        # orbva and orbvb are stacked side by side so that one call can
        # select a different column range for each of the two AO indices.
        orbvab = numpy.asarray(numpy.hstack((orbva, orbvb)), order='F')
        with lib.call_in_background(load_ab) as prefetch:
            # The (occblk,noccb,nao,nao) buffers of the bb step are reused.
            load_ab(ftmp['ab'], nocca, 0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocca, occblk):
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(ftmp['ab'], nocca, i1, buf_prefetch)
                eri = buf[:i1-i0].reshape((i1-i0)*noccb,nao,nao)
                dat = _ao2mo.nr_e2(eri, orbvab,
                                   (0,nvira,nvira,nvira+nvirb), 's1', 's1')
                bsave(fovOV, nvira, i0, i1,
                      dat.reshape(i1-i0,noccb,nvira,nvirb).transpose(0,2,1,3))
                time1 = log.timer_debug1('pass2 ao2mo for ab [%d:%d]' %
                                         (i0,i1), *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)