def kernel(mycc, eris=None, t1=None, t2=None, max_cycle=50, tol=1e-8,
           tolnormt=1e-6, verbose=None):
    """Drive the iterative CCSD amplitude solver for ``mycc``.

    The ``eris`` argument is not consulted: the integrals are always read
    from ``mycc._eris`` and are generated with ``mycc.ao2mo(mycc.mo_coeff)``
    when that attribute is missing or None.

    Args:
        mycc: CC object providing ``ao2mo``, ``energy``, ``update_amps``,
            ``run_diis``, ``get_init_guess`` and the ``diis*`` settings.
        eris: ignored (see above).
        t1, t2: optional starting amplitudes; ``mycc.t1`` / ``mycc.t2`` are
            used when omitted, and ``mycc.get_init_guess`` fills what is
            still missing.
        max_cycle: maximum number of amplitude updates.
        tol: threshold on the absolute energy change between cycles.
        tolnormt: threshold on the amplitude change reported by
            ``_diff_norm``.
        verbose: forwarded to ``logger.new_logger``.

    Returns:
        ``(conv, eccsd, t1, t2)``; the last three are also stored on
        ``mycc`` as ``e_corr``, ``t1`` and ``t2``.
    """
    log = logger.new_logger(mycc, verbose)
    t_total = (logger.process_clock(), logger.perf_counter())
    _sync_(mycc)

    eris = getattr(mycc, '_eris', None)
    if eris is None:
        mycc.ao2mo(mycc.mo_coeff)
        eris = mycc._eris
    t_cycle = (logger.process_clock(), logger.perf_counter())

    # Fall back on amplitudes already stored on the object, then on the
    # initial guess for whatever is still undefined.
    if t1 is None:
        t1 = mycc.t1
    if t2 is None:
        t2 = mycc.t2
    if t2 is None:
        guess = mycc.get_init_guess(eris)
        if t1 is None:
            t1, t2 = guess
        else:
            t2 = guess[1]

    e_prev = 0
    e_now = mycc.energy(t1, t2, eris)
    log.info('Init E(CCSD) = %.15g', e_now)

    diis_opt = mycc.diis
    if isinstance(diis_opt, diis.DistributedDIIS):
        adiis = diis_opt
    elif diis_opt:
        adiis = diis.DistributedDIIS(mycc, mycc.diis_file)
        adiis.space = mycc.diis_space
    else:
        adiis = None

    conv = False
    for cycle in range(max_cycle):
        t1_next, t2_next = mycc.update_amps(t1, t2, eris)
        normt = _diff_norm(mycc, t1_next, t2_next, t1, t2)
        t1 = t1_next
        t2 = t2_next
        # Drop the extra references before extrapolation.
        t1_next = t2_next = None
        t1, t2 = mycc.run_diis(t1, t2, cycle, normt, e_now - e_prev, adiis)

        e_prev = e_now
        e_now = mycc.energy(t1, t2, eris)
        log.info('cycle = %d E(CCSD) = %.15g dE = %.9g norm(t1,t2) = %.6g',
                 cycle + 1, e_now, e_now - e_prev, normt)
        t_cycle = log.timer('CCSD iter', *t_cycle)

        if abs(e_now - e_prev) < tol and normt < tolnormt:
            conv = True
            break

    mycc.e_corr = e_now
    mycc.t1 = t1
    mycc.t2 = t2
    log.timer('CCSD', *t_total)
    return conv, e_now, t1, t2
def init_amps(mycc, eris=None):
    """Build first-order starting amplitudes and the MP2 energy.

    The ``eris`` argument is not consulted; integrals come from
    ``mycc._eris`` (created with ``mycc.ao2mo()`` when absent).

    Only the slice of the leading virtual index returned by
    ``_task_location(nvir)`` is filled in the doubles array; the energy
    contributions are summed with ``comm.allreduce`` and stored in
    ``mycc.emp2``.

    Returns:
        ``(emp2, t1, t2)`` where ``t1`` and ``t2`` are transposed views of
        the internally built ``t1T`` / ``t2T`` arrays.
    """
    eris = getattr(mycc, '_eris', None)
    if eris is None:
        mycc.ao2mo()
        eris = mycc._eris

    started = logger.process_clock(), logger.perf_counter()
    orb_e = eris.mo_energy
    nocc = mycc.nocc
    nvir = orb_e.size - nocc
    # e_ov[i, a] = e_i - e_a
    e_ov = orb_e[:nocc, None] - orb_e[None, nocc:]
    t1T = eris.fock[nocc:, :nocc] / e_ov.T

    seg_lo, seg_hi = _task_location(nvir)
    nseg = seg_hi - seg_lo
    t2T = numpy.empty((nseg, nvir, nocc, nocc))

    free_mem = mycc.max_memory - lib.current_memory()[0]
    rows_fit = free_mem * .3e6 / 8 / (nocc**2 * nvir + 1)
    blksize = int(min(nvir, max(BLKMIN, rows_fit)))

    emp2 = 0
    for b0, b1 in lib.prange(0, nseg, blksize):
        ovov_blk = eris.ovov[:, b0:b1]
        denom = lib.direct_sum('ia,jb->abij',
                               e_ov[:, b0 + seg_lo:b1 + seg_lo], e_ov)
        t2T[b0:b1] = ovov_blk.transpose(1, 3, 0, 2) / denom
        emp2 += 2 * numpy.einsum('abij,iajb', t2T[b0:b1], ovov_blk)
        emp2 -= numpy.einsum('abji,iajb', t2T[b0:b1], ovov_blk)

    mycc.emp2 = comm.allreduce(emp2)
    logger.info(mycc, 'Init t2, MP2 energy = %.15g', mycc.emp2)
    logger.timer(mycc, 'init mp2', *started)
    return mycc.emp2, t1T.T, t2T.transpose(2, 3, 0, 1)
def init_amps(mycc, eris=None):
    """Build starting amplitudes and the MP2 energy from ``eris.xvoo``.

    The ``eris`` argument is not consulted; integrals come from
    ``mycc._eris`` (created with ``mycc.ao2mo()`` when absent).
    ``mycc.level_shift`` is added to the virtual orbital energies before the
    denominators are formed.

    Side effects: sets ``mycc.emp2``, ``mycc.t1`` and ``mycc.t2``.

    Returns:
        ``(mycc.emp2, mycc.t1, mycc.t2)``.
    """
    eris = getattr(mycc, '_eris', None)
    if eris is None:
        mycc.ao2mo()
        eris = mycc._eris

    started = logger.process_clock(), logger.perf_counter()
    orb_e = eris.mo_energy
    nocc = mycc.nocc
    nvir = orb_e.size - nocc
    e_occ = orb_e[:nocc]
    e_vir = orb_e[nocc:] + mycc.level_shift
    # e_ov[i, a] = e_i - (e_a + level_shift)
    e_ov = e_occ[:, None] - e_vir
    t1T = eris.fock[nocc:, :nocc] / e_ov.T

    seg_lo, seg_hi = _task_location(nvir)
    nseg = seg_hi - seg_lo
    t2T = np.empty((nseg, nvir, nocc, nocc))

    free_mem = mycc.max_memory - lib.current_memory()[0]
    rows_fit = free_mem*.3e6/8/(nocc**2*nvir+1)
    blksize = int(min(nvir, max(BLKMIN, rows_fit)))

    emp2 = 0
    for b0, b1 in lib.prange(0, nseg, blksize):
        vvoo_blk = eris.xvoo[b0:b1]
        denom = lib.direct_sum('ia, jb -> abij',
                               e_ov[:, seg_lo+b0:seg_lo+b1], e_ov)
        t2T[b0:b1] = (vvoo_blk / denom)
        emp2 += np.einsum('abij, abij', t2T[b0:b1], vvoo_blk.conj(),
                          optimize=True).real
        vvoo_blk = None

    mycc.emp2 = comm.allreduce(emp2) * 0.25
    logger.info(mycc, 'Init t2, MP2 energy = %.15g', mycc.emp2)
    logger.timer(mycc, 'init mp2', *started)
    mycc.t1 = t1T.T
    mycc.t2 = t2T.transpose(2, 3, 0, 1)
    return mycc.emp2, mycc.t1, mycc.t2
def _add_vvvv_full(mycc, t1T, t2T, eris, out=None, with_ovvv=False):
    '''Ht2 = numpy.einsum('ijcd,acdb->ijab', t2, vvvv)
    without using symmetry t2[ijab] = t2[jiba] in t2 or Ht2

    Only the AO-direct path (``mycc.direct`` true, ``with_ovvv`` false) is
    implemented; every other combination raises NotImplementedError.

    Args:
        mycc: CC object; ``direct``, ``mol``, ``mo_coeff``, ``stdout`` and
            ``verbose`` are read here.
        t1T: singles amplitudes indexed as [a, i], or None to skip the
            t1*t1 term in tau.
        t2T: doubles amplitudes; the first three axes are unpacked as
            (nvir_seg, nvir, nocc).
        eris: integral object; only its optional ``mo_coeff`` is read here.
        out: optional buffer used as the storage of the result array.
        with_ovvv: must be false in the AO-direct path.

    Returns:
        An array with the shape of ``t2T``.
    '''
    time0 = logger.process_clock(), logger.perf_counter()
    log = logger.Logger(mycc.stdout, mycc.verbose)
    nvir_seg, nvir, nocc = t2T.shape[:3]
    vloc0, vloc1 = _task_location(nvir, rank)
    nocc2 = nocc * (nocc + 1) // 2
    # NOTE(review): nocc2 is the size of a packed occupied pair, but the
    # reshape below is applied before lib.pack_tril and the einsum strings
    # further down treat tau as a 4-index 'abij' tensor.  This looks
    # inconsistent with the "without using symmetry" docstring -- confirm
    # whether this function is exercised and what layout is intended.
    if t1T is None:
        tau = lib.pack_tril(t2T.reshape(nvir_seg * nvir, nocc2))
    else:
        # tau = t2 + t1 (x) t1, restricted to this rank's virtual segment
        tau = t2T + numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
        tau = lib.pack_tril(tau.reshape(nvir_seg * nvir, nocc2))
    tau = tau.reshape(nvir_seg, nvir, nocc2)
    if mycc.direct:  # AO-direct CCSD
        if with_ovvv:
            raise NotImplementedError
        mo = getattr(eris, 'mo_coeff', None)
        if mo is None:  # If eris does not have the attribute mo_coeff
            mo = _mo_without_core(mycc, mycc.mo_coeff)
        ao_loc = mycc.mol.ao_loc_nr()
        nao, nmo = mo.shape
        ntasks = mpi.pool.size
        # AO shell ranges assigned to each task; this rank owns AO indices
        # ao_loc0:ao_loc1.
        task_sh_locs = lib.misc._balanced_partition(ao_loc, ntasks)
        ao_loc0 = ao_loc[task_sh_locs[rank]]
        ao_loc1 = ao_loc[task_sh_locs[rank + 1]]
        orbv = mo[:, nocc:]
        # Transform the second virtual index of tau to the AO basis.
        tau = lib.einsum('abij,pb->apij', tau, orbv)
        tau_priv = numpy.zeros((ao_loc1 - ao_loc0, nao, nocc, nocc))
        # Accumulate the first-index transformation from the blocks yielded
        # by _rotate_tensor_block (one per task_id).
        for task_id, tau in _rotate_tensor_block(tau):
            loc0, loc1 = _task_location(nvir, task_id)
            tau_priv += lib.einsum('pa,abij->pbij',
                                   orbv[ao_loc0:ao_loc1, loc0:loc1], tau)
        tau = None
        time1 = log.timer_debug1('vvvv-tau mo2ao', *time0)
        buf = _contract_vvvv_t2(mycc, None, tau_priv, task_sh_locs, None, log)
        buf = buf.reshape(tau_priv.shape)
        tau_priv = None
        time1 = log.timer_debug1('vvvv-tau contraction', *time1)
        # Back-transform the second AO index to virtual orbitals.
        buf = lib.einsum('apij,pb->abij', buf, orbv)
        # The result is a view on ``out`` when a buffer is supplied.
        Ht2 = numpy.ndarray(t2T.shape, buffer=out)
        Ht2[:] = 0
        for task_id, buf in _rotate_tensor_block(buf):
            ao_loc0 = ao_loc[task_sh_locs[task_id]]
            ao_loc1 = ao_loc[task_sh_locs[task_id + 1]]
            Ht2 += lib.einsum('pa,pbij->abij',
                              orbv[ao_loc0:ao_loc1, vloc0:vloc1], buf)
        time1 = log.timer_debug1('vvvv-tau ao2mo', *time1)
    else:
        raise NotImplementedError
    return Ht2.reshape(t2T.shape)
def start(self, interval=0.02):
    """Stage the 'vvop' tensor on disk and launch the data-serving thread.

    First, ``eris.ovov`` and blocks of ``eris.vvvo`` exchanged between the
    ranks are merged into the ``'vvop'`` dataset of a temporary HDF5 file
    kept on ``self.eri_tmp``.  Then a thread is started that answers
    ``INQUIRY`` messages with the tensors returned by ``self._get_tensor``
    until a task named ``'Done'`` is received.

    Args:
        interval: seconds to sleep between polls for incoming requests.

    Returns:
        The started ``threading.Thread``.
    """
    mycc = self._cc
    log = logger.new_logger(mycc)
    tick = (logger.process_clock(), logger.perf_counter())
    eris = mycc._eris
    t2T = mycc.t2.transpose(2, 3, 0, 1)

    nocc, nvir = mycc.t1.shape
    nmo = nocc + nvir
    seg_lo, seg_hi = self.vranges[rank]
    nvir_seg = seg_hi - seg_lo

    max_memory = min(24000, mycc.max_memory - lib.current_memory()[0])
    rows_fit = int(max_memory * .3e6 / 8 / (nvir * nocc * nmo))
    blksize = min(nvir_seg // 4 + 1, max(16, rows_fit))

    self.eri_tmp = lib.H5TmpFile()
    vvop = self.eri_tmp.create_dataset('vvop', (nvir_seg, nvir, nocc, nmo),
                                       'f8')

    def save_vvop(j0, j1, vvvo):
        # Rows j0:j1 of vvop: occupied part from ovov, virtual part from the
        # per-rank pieces received in ``vvvo``.
        nrow = j1 - j0
        staged = numpy.empty((nrow, nvir, nocc, nmo), dtype=t2T.dtype)
        staged[:, :, :, :nocc] = eris.ovov[:, j0:j1].conj().transpose(
            1, 3, 0, 2)
        for src, (q0, q1) in enumerate(self.vranges):
            piece = vvvo[src].reshape(q1 - q0, nvir, nrow, nocc)
            staged[:, q0:q1, :, nocc:] = piece.transpose(2, 0, 3, 1)
        vvop[j0:j1] = staged

    with lib.call_in_background(save_vvop) as async_save:
        for p0, p1 in mpi.prange(seg_lo, seg_hi, blksize):
            j0 = p0 - seg_lo
            j1 = p1 - seg_lo
            # Every rank learns which block each other rank is working on.
            sub_locs = comm.allgather((p0, p1))
            outgoing = [eris.vvvo[:, :, q0:q1] for q0, q1 in sub_locs]
            vvvo = mpi.alltoall_new(outgoing, split_recvbuf=True)
            async_save(j0, j1, vvvo)
            tick = log.timer_debug1('transpose %d:%d' % (p0, p1), *tick)

    def send_data():
        while True:
            while comm.Iprobe(source=MPI.ANY_SOURCE, tag=INQUIRY):
                tensors, dest = comm.recv(source=MPI.ANY_SOURCE, tag=INQUIRY)
                for task, slices in tensors:
                    if task == 'Done':
                        return
                    mpi.send(self._get_tensor(task, slices), dest,
                             tag=TRANSFER_DATA)
            time.sleep(interval)

    daemon = threading.Thread(target=send_data)
    daemon.start()
    return daemon
def build(mydf, j_only=None, with_j3c=True, kpts_band=None):
    """Prepare the auxiliary cell and (optionally) the 3-index integrals.

    With a single MPI process the call is delegated to ``df.DF.build``.
    Otherwise the object is synchronised, ``kpts_band`` is merged into
    ``mydf.kpts_band``, the list of k-point pairs is assembled and
    ``mydf._make_j3c`` is invoked.

    Args:
        mydf: density-fitting object.
        j_only: when None, ``mydf._j_only`` is used.  If true, only
            diagonal pairs (k, k) are generated.
        with_j3c: generate the 3-index integrals when true.
        kpts_band: extra k-points, reshaped to (-1, 3) and merged with any
            already stored in ``mydf.kpts_band``.

    Returns:
        The (synchronised) ``mydf`` object, or the return value of
        ``df.DF.build`` in the single-process case.
    """
    # Unlike DF and AFT class, here MDF objects are synced once
    if mpi.pool.size == 1:
        return df.DF.build(mydf, j_only, with_j3c, kpts_band)

    mydf = _sync_mydf(mydf)
    cell = mydf.cell
    log = logger.Logger(mydf.stdout, mydf.verbose)
    log.debug('MPI info (rank, host, pid) %s', mpi.platform_info())
    t1 = (logger.process_clock(), logger.perf_counter())

    if mydf.kpts_band is not None:
        mydf.kpts_band = numpy.reshape(mydf.kpts_band, (-1, 3))
    if kpts_band is not None:
        kpts_band = numpy.reshape(kpts_band, (-1, 3))
        if mydf.kpts_band is None:
            mydf.kpts_band = kpts_band
        else:
            mydf.kpts_band = unique(numpy.vstack(
                (mydf.kpts_band, kpts_band)))[0]

    mydf.dump_flags()
    mydf.auxcell = make_modrho_basis(cell, mydf.auxbasis, mydf.eta)

    kpts = mydf.kpts
    if mydf.kpts_band is None:
        kband_uniq = numpy.zeros((0, 3))
    else:
        kband_uniq = [k for k in mydf.kpts_band if len(member(k, kpts)) == 0]
        # Keep an (n, 3) array even when every band k-point is already in
        # ``kpts``: an empty list cannot be stacked with ``kpts`` below.
        kband_uniq = numpy.asarray(kband_uniq).reshape(-1, 3)

    if j_only is None:
        j_only = mydf._j_only
    if j_only:
        kall = numpy.vstack([kpts, kband_uniq])
        kptij_lst = numpy.hstack((kall, kall)).reshape(-1, 2, 3)
    else:
        # Lower triangle of kpts x kpts, then band points against kpts and
        # against themselves.
        kptij_lst = [(ki, kpts[j])
                     for i, ki in enumerate(kpts) for j in range(i + 1)]
        kptij_lst.extend([(ki, kj) for ki in kband_uniq for kj in kpts])
        kptij_lst.extend([(ki, ki) for ki in kband_uniq])
        kptij_lst = numpy.asarray(kptij_lst)

    if with_j3c:
        if isinstance(mydf._cderi_to_save, str):
            cderi = mydf._cderi_to_save
        else:
            cderi = mydf._cderi_to_save.name
        if isinstance(mydf._cderi, str):
            log.warn(
                'Value of _cderi is ignored. DF integrals will be '
                'saved in file %s .', cderi)
        mydf._cderi = cderi
        mydf._make_j3c(cell, mydf.auxcell, kptij_lst, cderi)
        t1 = log.timer_debug1('j3c', *t1)
    return mydf
def kernel(mycc, eris=None):
    """Evaluate the CCSD(T) correction with the MPI contraction driver.

    The ``eris`` argument is not consulted; integrals come from
    ``mycc._eris`` (created with ``mycc.ao2mo(mycc.mo_coeff)`` when absent).
    Triples of virtual segments (a >= b >= c in segment order) are handed
    out by ``mpi.work_share_partition``; for each one the required tensors
    are requested from the ``GlobalDataHandler`` and contracted in a
    background thread by ``MPICCsd_t_contract``.

    Returns:
        The real part of the all-reduced, doubled partial sum.
    """
    cpu0 = (logger.process_clock(), logger.perf_counter())
    ccsd._sync_(mycc)
    log = logger.new_logger(mycc)

    eris = getattr(mycc, '_eris', None)
    if eris is None:
        mycc.ao2mo(mycc.mo_coeff)
        eris = mycc._eris

    t1T = numpy.asarray(mycc.t1.T, order='C')
    nvir, nocc = t1T.shape
    fvo = eris.fock[nocc:, :nocc].copy()
    mo_energy = eris.mo_energy.copy()
    et_sum = numpy.zeros(1, dtype=t1T.dtype)
    drv = _ccsd.libcc.MPICCsd_t_contract
    # Mutable so the background worker can update the timestamps in place.
    cpu2 = [process_clock(), perf_counter()]

    def contract(slices, data):
        # data holds twelve arrays, in order:
        # vvop_ab, vvop_ac, vvop_ba, vvop_bc, vvop_ca, vvop_cb,
        # vooo_a, vooo_b, vooo_c, t2T_a, t2T_b, t2T_c
        void_p = ctypes.c_void_p
        data_ptrs = (void_p * 12)(*[x.ctypes.data_as(void_p) for x in data])
        drv(et_sum.ctypes.data_as(void_p),
            mo_energy.ctypes.data_as(void_p),
            t1T.ctypes.data_as(void_p),
            fvo.ctypes.data_as(void_p),
            ctypes.c_int(nocc), ctypes.c_int(nvir),
            (ctypes.c_int * 6)(*slices), data_ptrs)
        cpu2[:] = log.alltimer_debug1('contract' + str(slices), *cpu2)

    with GlobalDataHandler(mycc) as daemon:
        v_seg_ranges = daemon.data_partition
        tasks = [(a0, a1, b0, b1, c0, c1)
                 for ka, (a0, a1) in enumerate(v_seg_ranges)
                 for kb, (b0, b1) in enumerate(v_seg_ranges[:ka + 1])
                 for c0, c1 in v_seg_ranges[:kb + 1]]
        log.debug('ntasks = %d', len(tasks))

        task_count = 0
        with lib.call_in_background(contract) as async_contract:
            # Alternatives tried previously: mpi.static_partition,
            # mpi.work_stealing_partition.
            for task in mpi.work_share_partition(tasks, loadmin=2):
                log.alldebug2('request for segment %s', task)
                data = [None] * 12
                daemon.request_(task, data)
                async_contract(task, data)
                task_count += 1
        log.alldebug1('task_count = %d', task_count)

    et = comm.allreduce(et_sum[0] * 2).real
    log.timer('CCSD(T)', *cpu0)
    log.note('CCSD(T) correction = %.15g', et)
    return et
def save_vir_frac(p0, p1, eri):
    """Store one block of virtual-row integrals into the ``eris`` datasets.

    ``eri`` is viewed as (p1 - p0, nocc, nmo, nmo).  Its occupied/virtual
    sub-blocks are written to ``eris.ovoo``, ``eris.ovvo`` and ``eris.ovov``
    (rows p0:p1 of the second axis); the virtual-virtual part is exchanged
    between ranks with ``mpi.alltoall_new`` and written to ``eris.vvvo``.

    Relies on ``log``, ``eris``, ``nocc``, ``nmo``, ``nvir``, ``vseg`` and
    ``vlocs`` from the enclosing scope.
    """
    log.alldebug1('save_vir_frac %d %d %s', p0, p1, eri.shape)
    nrow = p1 - p0
    eri = eri.reshape(nrow, nocc, nmo, nmo)

    occ = slice(None, nocc)
    vir = slice(nocc, None)
    eris.ovoo[:, p0:p1] = eri[:, :, occ, occ].transpose(1, 0, 2, 3)
    eris.ovvo[:, p0:p1] = eri[:, :, vir, occ].transpose(1, 0, 2, 3)
    eris.ovov[:, p0:p1] = eri[:, :, occ, vir].transpose(1, 0, 2, 3)

    # A packed 'ovvv' dataset is intentionally not written; the data is
    # redistributed into 'vvvo' instead.
    tick = logger.process_clock(), logger.perf_counter()
    outgoing = []
    for q0, q1 in vlocs:
        outgoing.append(
            eri[:, :, nocc + q0:nocc + q1, nocc:].transpose(2, 3, 0, 1))
    received = mpi.alltoall_new(outgoing, split_recvbuf=True)
    tick = log.timer_debug1('vvvo alltoall', *tick)

    # Each rank reports the local row range it just processed; offset it by
    # that rank's segment start to get the position along the third axis.
    for task_id, (q0, q1) in enumerate(comm.allgather((p0, p1))):
        offset = vlocs[task_id][0]
        ip0 = q0 + offset
        ip1 = q1 + offset
        eris.vvvo[:, :, ip0:ip1] = received[task_id].reshape(
            vseg, nvir, q1 - q0, nocc)
def _make_eris_incore_ghf(mycc, mo_coeff=None, ao2mofn=None):
    """
    Make physicist eri with incore ao2mo, for GGHF.

    Rank 0 obtains the two-electron integrals (from ``mycc._scf._eri``) and
    evaluates the occupied/virtual sub-blocks one at a time; ``oooo`` is
    broadcast, while every block with a leading virtual index (the ``x``
    blocks) is sliced according to ``_task_location`` and scattered so each
    rank keeps only its own virtual segment.

    Args:
        mycc: CC object; ``_scf._eri``, ``save_mem`` and ``remove_h2`` are
            read here.
        mo_coeff: forwarded to ``eris._common_init_`` on rank 0.
        ao2mofn: a callable here raises NotImplementedError (rank 0 only).

    Returns:
        The populated ``gccsd._PhysicistsERIs`` object, also stored as
        ``mycc._eris``.
    """
    cput0 = (logger.process_clock(), logger.perf_counter())
    log = logger.Logger(mycc.stdout, mycc.verbose)
    _sync_(mycc)
    eris = gccsd._PhysicistsERIs()
    # Only rank 0 runs the common initialisation; the results are broadcast.
    if rank == 0:
        eris._common_init_(mycc, mo_coeff)
        comm.bcast((eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy))
    else:
        eris.mol = mycc.mol
        eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy = comm.bcast(None)
    nocc = eris.nocc
    nao, nmo = eris.mo_coeff.shape
    nvir = nmo - nocc
    # Virtual-index range owned by each rank.
    vlocs = [_task_location(nvir, task_id) for task_id in range(mpi.pool.size)]
    vloc0, vloc1 = vlocs[rank]
    vseg = vloc1 - vloc0
    # On rank 0 choose ``fn`` (block extractor) and ``o``/``v`` (its
    # occupied/virtual selectors): index arrays for take_eri, or MO
    # coefficient slices for the ao2mo.general path.
    if rank == 0:
        if callable(ao2mofn):
            raise NotImplementedError
        else:
            assert eris.mo_coeff.dtype == np.double
            eri = mycc._scf._eri
            if (nao == nmo) and (la.norm(eris.mo_coeff - np.eye(nmo)) < 1e-12):
                # ZHC NOTE special treatment for OO-CCD,
                # where the ao2mo is not needed for identity mo_coeff.
                from libdmet.utils import take_eri as fn
                o = np.arange(0, nocc)
                v = np.arange(nocc, nmo)
                if eri.size == nmo**4:
                    eri = ao2mo.restore(8, eri, nmo)
            else:
                if mycc.save_mem:
                    # ZHC NOTE the following is slower, although may save some memory.
                    def fn(x, mo0, mo1, mo2, mo3):
                        return ao2mo.general(x, (mo0, mo1, mo2, mo3),
                                             compact=False).reshape(
                                                 mo0.shape[-1], mo1.shape[-1],
                                                 mo2.shape[-1], mo3.shape[-1])
                    o = eris.mo_coeff[:, :nocc]
                    v = eris.mo_coeff[:, nocc:]
                    if eri.size == nao**4:
                        eri = ao2mo.restore(8, eri, nao)
                else:
                    from libdmet.utils import take_eri as fn
                    o = np.arange(0, nocc)
                    v = np.arange(nocc, nmo)
                    if mycc.remove_h2:
                        # Drop the AO integrals held by the SCF object
                        # before the full transformation.
                        mycc._scf._eri = None
                        _release_regs(mycc, remove_h2=True)
                    eri = ao2mo.kernel(eri, eris.mo_coeff)
                    if eri.size == nmo**4:
                        eri = ao2mo.restore(8, eri, nmo)
    comm.Barrier()
    cput2 = log.timer('CCSD ao2mo initialization: ', *cput0)
    # chunk and scatter:
    # Each block below is built as transpose(0, 2, 1, 3) minus
    # transpose(0, 2, 3, 1) of an fn(...) result (or an equivalent index
    # permutation); presumably this turns (pq|rs) into antisymmetrized
    # <pr||qs> -- confirm against take_eri's index convention.
    # 1. oooo
    if rank == 0:
        tmp = fn(eri, o, o, o, o)
        eris.oooo = tmp.transpose(0, 2, 1, 3) - tmp.transpose(0, 2, 3, 1)
        tmp = None
        mpi.bcast(eris.oooo)
    else:
        eris.oooo = mpi.bcast(None)
    cput3 = log.timer('CCSD bcast oooo: ', *cput2)
    # 2. xooo
    if rank == 0:
        tmp = fn(eri, v, o, o, o)
        eri_sliced = [tmp[p0:p1] for (p0, p1) in vlocs]
    else:
        tmp = None
        eri_sliced = None
    tmp = mpi.scatter_new(eri_sliced, root=0, data=tmp)
    eri_sliced = None
    eris.xooo = tmp.transpose(0, 2, 1, 3) - tmp.transpose(0, 2, 3, 1)
    tmp = None
    cput4 = log.timer('CCSD scatter xooo: ', *cput3)
    # 3. xovo
    if rank == 0:
        tmp_vvoo = fn(eri, v, v, o, o)
        tmp_voov = fn(eri, v, o, o, v)
        # ZHC NOTE need to keep tmp_voov for xvoo
        eri_1 = [tmp_vvoo[p0:p1] for (p0, p1) in vlocs]
        eri_2 = [tmp_voov[p0:p1] for (p0, p1) in vlocs]
    else:
        tmp_vvoo = None
        tmp_voov = None
        eri_1 = None
        eri_2 = None
    tmp_1 = mpi.scatter_new(eri_1, root=0, data=tmp_vvoo)
    eri_1 = None
    tmp_vvoo = None
    tmp_2 = mpi.scatter_new(eri_2, root=0, data=tmp_voov)
    eri_2 = None
    tmp_voov = None
    eris.xovo = tmp_1.transpose(0, 2, 1, 3) - tmp_2.transpose(0, 2, 3, 1)
    tmp_1 = None
    cput5 = log.timer('CCSD scatter xovo: ', *cput4)
    # 4. xvoo
    # Reuses the scattered voov segment from step 3; no extra communication.
    eris.xvoo = tmp_2.transpose(0, 3, 1, 2) - tmp_2.transpose(0, 3, 2, 1)
    tmp_2 = None
    cput6 = log.timer('CCSD scatter xvoo: ', *cput5)
    # 5. 6. xovv, xvvo
    if rank == 0:
        tmp = fn(eri, v, v, o, v)
        eri_sliced = [tmp[p0:p1] for (p0, p1) in vlocs]
    else:
        tmp = None
        eri_sliced = None
    tmp_1 = mpi.scatter_new(eri_sliced, root=0, data=tmp)
    eri_sliced = None
    eris.xovv = tmp_1.transpose(0, 2, 1, 3) - tmp_1.transpose(0, 2, 3, 1)
    if rank == 0:
        # Fully reversed copy of the vvov block, sliced along its new
        # leading axis.
        tmp_2 = np.asarray(tmp.transpose(3, 2, 1, 0), order='C')  # vovv
        tmp = None
        eri_sliced = [tmp_2[p0:p1] for (p0, p1) in vlocs]
    else:
        tmp_2 = None
        tmp = None
        eri_sliced = None
    tmp_2 = mpi.scatter_new(eri_sliced, root=0, data=tmp_2)
    eri_sliced = None
    eris.xvvo = tmp_1.transpose(0, 3, 1, 2) - tmp_2.transpose(0, 2, 3, 1)
    tmp_1 = None
    tmp_2 = None
    cput7 = log.timer('CCSD scatter xovv, xvvo: ', *cput6)
    # 7. xvvv
    if rank == 0:
        tmp = fn(eri, v, v, v, v)
        if mycc.remove_h2:
            # Last use of the full integral array on rank 0.
            eri = None
            if mycc._scf is not None:
                mycc._scf._eri = None
        eri_sliced = [tmp[p0:p1] for (p0, p1) in vlocs]
    else:
        tmp = None
        eri_sliced = None
    tmp = mpi.scatter_new(eri_sliced, root=0, data=tmp)
    eri_sliced = None
    eris.xvvv = tmp.transpose(0, 2, 1, 3) - tmp.transpose(0, 2, 3, 1)
    tmp = None
    eri = None
    cput8 = log.timer('CCSD scatter xvvv: ', *cput7)
    mycc._eris = eris
    log.timer('CCSD integral transformation ', *cput0)
    return eris
def _make_eris_incore(mycc, mo_coeff=None, ao2mofn=None):
    """
    Make physicist eri with incore ao2mo.

    Every rank transforms its own segment of the first MO index, writes the
    antisymmetrized result to a per-rank HDF5 file
    ``gccsd_eri_tmp_<rank>.h5``, and afterwards reads the pieces it needs
    from the files of all ranks to fill ``oooo`` and the ``x*`` blocks of
    its virtual segment.

    Args:
        mycc: CC object; ``_scf._eri`` and ``max_memory`` are read here.
        mo_coeff: forwarded to ``eris._common_init_`` on rank 0.
        ao2mofn: not used in this function.

    Returns:
        The populated ``gccsd._PhysicistsERIs`` object, also stored as
        ``mycc._eris``.
    """
    cput0 = (logger.process_clock(), logger.perf_counter())
    log = logger.Logger(mycc.stdout, mycc.verbose)
    _sync_(mycc)
    eris = gccsd._PhysicistsERIs()
    # Only rank 0 runs the common initialisation; the results are broadcast.
    if rank == 0:
        eris._common_init_(mycc, mo_coeff)
        comm.bcast((eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy))
    else:
        eris.mol = mycc.mol
        eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy = comm.bcast(None)
    # if workers does not have _eri, bcast from root
    if comm.allreduce(mycc._scf._eri is None, op=mpi.MPI.LOR):
        if rank == 0:
            mpi.bcast(mycc._scf._eri)
        else:
            mycc._scf._eri = mpi.bcast(None)
    cput1 = log.timer('CCSD ao2mo initialization: ', *cput0)
    nocc = eris.nocc
    nao, nmo = eris.mo_coeff.shape
    nvir = nmo - nocc
    # vlocs: virtual-index range per rank; plocs: range of the full MO index
    # per rank (used for the transformation itself).
    vlocs = [_task_location(nvir, task_id) for task_id in range(mpi.pool.size)]
    vloc0, vloc1 = vlocs[rank]
    vseg = vloc1 - vloc0
    plocs = [_task_location(nmo, task_id) for task_id in range(mpi.pool.size)]
    ploc0, ploc1 = plocs[rank]
    pseg = ploc1 - ploc0
    # Split the coefficient rows into two halves; presumably the alpha and
    # beta spin blocks of a generalized spin-orbital basis -- confirm.
    mo_a = eris.mo_coeff[:nao//2]
    mo_b = eris.mo_coeff[nao//2:]
    mo_seg_a = mo_a[:, ploc0:ploc1]
    mo_seg_b = mo_b[:, ploc0:ploc1]
    # NOTE(review): the file name is relative, so it is created in the
    # current working directory; the read loop below opens the files of all
    # ranks, which presumably requires a directory shared by every rank --
    # confirm for multi-node runs.
    fname = "gccsd_eri_tmp_%s.h5"%rank
    f = h5py.File(fname, 'w')
    eri_phys = f.create_dataset('eri_phys', (pseg, nmo, nmo, nmo), 'f8',
                                chunks=(pseg, 1, nmo, nmo))
    # First half-transformation for this rank's segment, one per row half.
    eri_a = ao2mo.incore.half_e1(mycc._scf._eri, (mo_seg_a, mo_a), compact=False)
    eri_b = ao2mo.incore.half_e1(mycc._scf._eri, (mo_seg_b, mo_b), compact=False)
    cput1 = log.timer('CCSD ao2mo half_e1: ', *cput1)
    unit = pseg * nmo * nmo * 2
    mem_now = lib.current_memory()[0]
    max_memory = max(0, mycc.max_memory - mem_now)
    blksize = min(nmo, max(BLKMIN, int((max_memory*0.9e6/8)/unit)))
    # Second half-transformation in blocks of the third index; the four
    # terms sum the contributions of both halves on each electron.
    for p0, p1 in lib.prange(0, nmo, blksize):
        klmosym_a, nkl_pair_a, mokl_a, klshape_a = \
            ao2mo.incore._conc_mos(mo_a[:, p0:p1], mo_a, compact=False)
        klmosym_b, nkl_pair_b, mokl_b, klshape_b = \
            ao2mo.incore._conc_mos(mo_b[:, p0:p1], mo_b, compact=False)
        eri = _ao2mo.nr_e2(eri_a, mokl_a, klshape_a, aosym='s4',
                           mosym=klmosym_a)
        eri += _ao2mo.nr_e2(eri_a, mokl_b, klshape_b, aosym='s4', mosym=klmosym_b)
        eri += _ao2mo.nr_e2(eri_b, mokl_a, klshape_a, aosym='s4', mosym=klmosym_a)
        eri += _ao2mo.nr_e2(eri_b, mokl_b, klshape_b, aosym='s4', mosym=klmosym_b)
        eri = eri.reshape(pseg, nmo, p1-p0, nmo)
        # Reorder the middle indices and subtract the exchanged term before
        # writing rows p0:p1 of the second axis.
        eri_phys[:, p0:p1] = eri.transpose(0, 2, 1, 3) - eri.transpose(0, 2, 3, 1)
        eri = None
    eri_a = None
    eri_b = None
    f.close()
    # Make sure every rank has finished writing before any file is read.
    comm.Barrier()
    cput1 = log.timer('CCSD ao2mo nr_e2: ', *cput1)
    # o_idx: rank whose segment holds the last occupied orbital;
    # v_idx: rank whose segment holds the first virtual orbital.
    o_idx = -1
    v_idx = mpi.pool.size
    for r, (p0, p1) in enumerate(plocs):
        if p0 <= nocc - 1 < p1:
            o_idx = r
        if p0 <= nocc < p1:
            v_idx = r
            break
    # Files containing occupied rows / virtual rows, respectively.
    o_files = np.arange(mpi.pool.size)[:(o_idx+1)]
    v_files = np.arange(mpi.pool.size)[v_idx:]
    eris.oooo = np.empty((nocc, nocc, nocc, nocc))
    eris.xooo = np.empty((vseg, nocc, nocc, nocc))
    eris.xovo = np.empty((vseg, nocc, nvir, nocc))
    eris.xovv = np.empty((vseg, nocc, nvir, nvir))
    eris.xvvo = np.empty((vseg, nvir, nvir, nocc))
    eris.xvoo = np.empty((vseg, nvir, nocc, nocc))
    eris.xvvv = np.empty((vseg, nvir, nvir, nvir))
    for r in range(mpi.pool.size):
        f = lib.H5TmpFile(filename="gccsd_eri_tmp_%s.h5"%r, mode='r')
        eri_phys = f["eri_phys"]
        if r in o_files:
            # Occupied rows stored in file r (clipped at nocc).
            p0, p1 = plocs[r]
            p1 = min(p1, nocc)
            pseg = p1 - p0
            if pseg > 0:
                eris.oooo[p0:p1] = eri_phys[:pseg, :nocc, :nocc, :nocc]
        if r in v_files:
            # Overlap between the rows stored in file r (p00:p10) and this
            # rank's virtual segment expressed in full-MO indices.
            p00, p10 = plocs[r]
            p0 = max(p00, nocc+vloc0)
            p1 = min(p10, nocc+vloc1)
            pseg = p1 - p0
            if pseg > 0:
                eris.xooo[p0-(nocc+vloc0):p1-(nocc+vloc0)] = eri_phys[p0-p00:p1-p00, :nocc, :nocc, :nocc]
                eris.xovo[p0-(nocc+vloc0):p1-(nocc+vloc0)] = eri_phys[p0-p00:p1-p00, :nocc, nocc:, :nocc]
                eris.xvoo[p0-(nocc+vloc0):p1-(nocc+vloc0)] = eri_phys[p0-p00:p1-p00, nocc:, :nocc, :nocc]
                eris.xvvo[p0-(nocc+vloc0):p1-(nocc+vloc0)] = eri_phys[p0-p00:p1-p00, nocc:, nocc:, :nocc]
                eris.xovv[p0-(nocc+vloc0):p1-(nocc+vloc0)] = eri_phys[p0-p00:p1-p00, :nocc, nocc:, nocc:]
                eris.xvvv[p0-(nocc+vloc0):p1-(nocc+vloc0)] = eri_phys[p0-p00:p1-p00, nocc:, nocc:, nocc:]
        cput1 = log.timer('CCSD ao2mo load: ', *cput1)
        f.close()
    # All ranks must be done reading before any file is deleted.
    comm.Barrier()
    os.remove("gccsd_eri_tmp_%s.h5"%rank)
    mycc._eris = eris
    log.timer('CCSD integral transformation ', *cput0)
    return eris
def _make_j3c(mydf, cell, auxcell, kptij_lst, cderi_file):
    """Generate the 3-index density-fitting tensors and store them on disk.

    Stage 1 (here): build the 2-centre metric for every unique k-point
    difference, decompose it (Cholesky, or eigen-decomposition when Cholesky
    fails) and store the factors in a swap file.
    Stage 2 (closures ``gen_int3c`` / ``ft_fuse``): evaluate the 3-centre
    integrals per job, apply the reciprocal-space correction and the metric
    factor.
    Stage 3: ``_assemble`` runs the jobs across MPI ranks and writes
    ``cderi_file``.

    Args:
        mydf: density-fitting object (mesh, max_memory, thresholds, ...).
        cell: the unit cell of the orbital basis.
        auxcell: the auxiliary-basis cell.
        kptij_lst: array of k-point pairs, indexed as [pair, 0/1, xyz].
        cderi_file: path of the output HDF5 file; the swap file is created
            in the same directory.
    """
    log = logger.Logger(mydf.stdout, mydf.verbose)
    t1 = t0 = (logger.process_clock(), logger.perf_counter())
    fused_cell, fuse = fuse_auxcell(mydf, mydf.auxcell)
    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    naux = auxcell.nao_nr()
    nkptij = len(kptij_lst)
    mesh = mydf.mesh
    Gv, Gvbase, kws = cell.get_Gv_weights(mesh)
    b = cell.reciprocal_vectors()
    gxyz = lib.cartesian_prod([numpy.arange(len(x)) for x in Gvbase])
    ngrids = gxyz.shape[0]
    kptis = kptij_lst[:, 0]
    kptjs = kptij_lst[:, 1]
    kpt_ji = kptjs - kptis
    # Pairs sharing the same k-point difference share one metric.
    uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
    log.debug('Num uniq kpts %d', len(uniq_kpts))
    log.debug2('uniq_kpts %s', uniq_kpts)
    # j2c ~ (-kpt_ji | kpt_ji)
    j2c = fused_cell.pbc_intor('int2c2e', hermi=1, kpts=uniq_kpts)
    j2ctags = []
    t1 = log.timer_debug1('2c2e', *t1)
    swapfile = tempfile.NamedTemporaryFile(dir=os.path.dirname(cderi_file))
    fswap = lib.H5TmpFile(swapfile.name)
    # Unlink swapfile to avoid trash
    swapfile = None
    # Use the largest memory footprint over all ranks so every rank derives
    # the same block size.
    mem_now = max(comm.allgather(lib.current_memory()[0]))
    max_memory = max(2000, mydf.max_memory - mem_now)
    blksize = max(2048, int(max_memory * .5e6 / 16 / fused_cell.nao_nr()))
    log.debug2('max_memory %s (MB) blocksize %s', max_memory, blksize)
    for k, kpt in enumerate(uniq_kpts):
        coulG = mydf.weighted_coulG(kpt, False, mesh)
        # Reciprocal-space contribution, accumulated over the grid blocks
        # handed out by mydf.prange and summed over ranks below.
        j2c_k = numpy.zeros_like(j2c[k])
        for p0, p1 in mydf.prange(0, ngrids, blksize):
            aoaux = ft_ao.ft_ao(fused_cell, Gv[p0:p1], None, b, gxyz[p0:p1],
                                Gvbase, kpt).T
            LkR = numpy.asarray(aoaux.real, order='C')
            LkI = numpy.asarray(aoaux.imag, order='C')
            aoaux = None
            if is_zero(kpt):  # kpti == kptj
                j2c_k[naux:] += lib.ddot(LkR[naux:] * coulG[p0:p1], LkR.T)
                j2c_k[naux:] += lib.ddot(LkI[naux:] * coulG[p0:p1], LkI.T)
            else:
                j2cR, j2cI = zdotCN(LkR[naux:] * coulG[p0:p1],
                                    LkI[naux:] * coulG[p0:p1], LkR.T, LkI.T)
                j2c_k[naux:] += j2cR + j2cI * 1j
            # NOTE(review): the arrays in this loop are named LkR/LkI, so
            # this assignment does not release them -- presumably a typo
            # for ``LkR = LkI = None``; confirm.
            kLR = kLI = None
        # Fill the upper-right block from the lower-left one.
        j2c_k[:naux, naux:] = j2c_k[naux:, :naux].conj().T
        j2c[k] -= mpi.allreduce(j2c_k)
        j2c[k] = fuse(fuse(j2c[k]).T).T
        try:
            fswap['j2c/%d' % k] = scipy.linalg.cholesky(j2c[k], lower=True)
            j2ctags.append('CD')
        except scipy.linalg.LinAlgError as e:
            #msg =('===================================\n'
            #      'J-metric not positive definite.\n'
            #      'It is likely that mesh is not enough.\n'
            #      '===================================')
            #log.error(msg)
            #raise scipy.linalg.LinAlgError('\n'.join([str(e), msg]))
            # Fallback: keep eigenvectors above the linear-dependency
            # threshold, scaled by 1/sqrt(eigenvalue).
            w, v = scipy.linalg.eigh(j2c[k])
            log.debug2('metric linear dependency for kpt %s', k)
            log.debug2('cond = %.4g, drop %d bfns', w[0] / w[-1],
                       numpy.count_nonzero(w < mydf.linear_dep_threshold))
            v1 = v[:, w > mydf.linear_dep_threshold].T.conj()
            v1 /= numpy.sqrt(w[w > mydf.linear_dep_threshold]).reshape(-1, 1)
            fswap['j2c/%d' % k] = v1
            if cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum':
                # Eigenvectors of significantly negative eigenvalues are
                # stored separately under 'j2c-'.
                idx = numpy.where(w < -mydf.linear_dep_threshold)[0]
                if len(idx) > 0:
                    fswap['j2c-/%d' % k] = (v[:, idx] /
                                            numpy.sqrt(-w[idx])).conj().T
            w = v = v1 = None
            j2ctags.append('eig')
    j2c = coulG = None
    # True for pairs with kpti == kptj (lower-triangular AO-pair storage).
    aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9
    j_only = numpy.all(aosym_s2)
    if gamma_point(kptij_lst):
        dtype = 'f8'
    else:
        dtype = 'c16'
    t1 = log.timer_debug1('aoaux and int2c', *t1)
    # Estimates the buffer size based on the last contraction in G-space.
    # This contraction requires to hold nkptj copies of (naux,?) array
    # simultaneously in memory.
    mem_now = max(comm.allgather(lib.current_memory()[0]))
    max_memory = max(2000, mydf.max_memory - mem_now)
    nkptj_max = max((uniq_inverse == x).sum() for x in set(uniq_inverse))
    buflen = max(
        int(
            min(max_memory * .5e6 / 16 / naux / (nkptj_max + 2) / nao,
                nao / 3 / mpi.pool.size)), 1)
    chunks = (buflen, nao)
    j3c_jobs = grids2d_int3c_jobs(cell, auxcell, kptij_lst, chunks, j_only)
    log.debug1('max_memory = %d MB (%d in use) chunks %s', max_memory,
               mem_now, chunks)
    log.debug2('j3c_jobs %s', j3c_jobs)
    if j_only:
        int3c = wrap_int3c(cell, fused_cell, 'int3c2e', 's2', 1, kptij_lst)
    else:
        int3c = wrap_int3c(cell, fused_cell, 'int3c2e', 's1', 1, kptij_lst)
        # Flat indices of the lower triangle of an (nao, nao) matrix, used
        # in gen_int3c to extract the s2 part from s1 integrals.
        idxb = numpy.tril_indices(nao)
        idxb = (idxb[0] * nao + idxb[1]).astype('i')
    aux_loc = fused_cell.ao_loc_nr('ssc' in 'int3c2e')

    def gen_int3c(job_id, ish0, ish1):
        """Evaluate the 3-centre integrals of one job (AO shells ish0:ish1)
        for every k-point pair and store them under
        'j3c-chunks/<job_id>/<pair>' in the swap file."""
        dataname = 'j3c-chunks/%d' % job_id
        i0 = ao_loc[ish0]
        i1 = ao_loc[ish1]
        # Number of lower-triangular AO pairs with row index in i0:i1.
        dii = i1 * (i1 + 1) // 2 - i0 * (i0 + 1) // 2
        if j_only:
            dij = dii
            buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dii + dii)))
        else:
            dij = (i1 - i0) * nao
            buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dij + dij)))
        auxranges = balance_segs(aux_loc[1:] - aux_loc[:-1], buflen)
        buflen = max([x[2] for x in auxranges])
        buf = numpy.empty(nkptij * dij * buflen, dtype=dtype)
        buf1 = numpy.empty(dij * buflen, dtype=dtype)
        # Shadows the outer ``naux``: here it counts the functions of the
        # fused cell.
        naux = aux_loc[-1]
        for kpt_id, kptij in enumerate(kptij_lst):
            key = '%s/%d' % (dataname, kpt_id)
            if aosym_s2[kpt_id]:
                shape = (naux, dii)
            else:
                shape = (naux, dij)
            if gamma_point(kptij):
                fswap.create_dataset(key, shape, 'f8')
            else:
                fswap.create_dataset(key, shape, 'c16')
        naux0 = 0
        for istep, auxrange in enumerate(auxranges):
            log.alldebug2("aux_e1 job_id %d step %d", job_id, istep)
            sh0, sh1, nrow = auxrange
            sub_slice = (ish0, ish1, 0, cell.nbas, sh0, sh1)
            mat = numpy.ndarray((nkptij, dij, nrow), dtype=dtype, buffer=buf)
            mat = int3c(sub_slice, mat)
            for k, kptij in enumerate(kptij_lst):
                h5dat = fswap['%s/%d' % (dataname, k)]
                v = lib.transpose(mat[k], out=buf1)
                if not j_only and aosym_s2[k]:
                    # Keep only the lower-triangular AO pairs; mat[k] has
                    # already been transposed into buf1, so its memory is
                    # reused as the output buffer.
                    idy = idxb[i0 * (i0 + 1) // 2:i1 *
                               (i1 + 1) // 2] - i0 * nao
                    out = numpy.ndarray((nrow, dii),
                                        dtype=v.dtype,
                                        buffer=mat[k])
                    v = numpy.take(v, idy, axis=1, out=out)
                if gamma_point(kptij):
                    h5dat[naux0:naux0 + nrow] = v.real
                else:
                    h5dat[naux0:naux0 + nrow] = v
            naux0 += nrow

    def ft_fuse(job_id, uniq_kptji_id, sh0, sh1):
        """Apply the reciprocal-space correction and the metric factor to
        the chunks of one job for all pairs with the given k-point
        difference, overwriting the first ``naux0`` rows of each chunk."""
        kpt = uniq_kpts[uniq_kptji_id]  # kpt = kptj - kpti
        adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0]
        adapted_kptjs = kptjs[adapted_ji_idx]
        nkptj = len(adapted_kptjs)
        j2c = numpy.asarray(fswap['j2c/%d' % uniq_kptji_id])
        j2ctag = j2ctags[uniq_kptji_id]
        naux0 = j2c.shape[0]
        if ('j2c-/%d' % uniq_kptji_id) in fswap:
            j2c_negative = numpy.asarray(fswap['j2c-/%d' % uniq_kptji_id])
        else:
            j2c_negative = None
        if is_zero(kpt):
            aosym = 's2'
        else:
            aosym = 's1'
        if aosym == 's2' and cell.dimension == 3:
            vbar = fuse(mydf.auxbar(fused_cell))
            ovlp = cell.pbc_intor('int1e_ovlp', hermi=1, kpts=adapted_kptjs)
            ovlp = [lib.pack_tril(s) for s in ovlp]
        j3cR = [None] * nkptj
        j3cI = [None] * nkptj
        i0 = ao_loc[sh0]
        i1 = ao_loc[sh1]
        for k, idx in enumerate(adapted_ji_idx):
            key = 'j3c-chunks/%d/%d' % (job_id, idx)
            v = numpy.asarray(fswap[key])
            if aosym == 's2' and cell.dimension == 3:
                # Subtract vbar[i] * overlap for auxiliary functions with a
                # non-zero vbar.
                for i in numpy.where(vbar != 0)[0]:
                    v[i] -= vbar[i] * ovlp[k][i0 * (i0 + 1) // 2:i1 *
                                              (i1 + 1) // 2].ravel()
            j3cR[k] = numpy.asarray(v.real, order='C')
            if v.dtype == numpy.complex128:
                j3cI[k] = numpy.asarray(v.imag, order='C')
            v = None
        ncol = j3cR[0].shape[1]
        Gblksize = max(16, int(max_memory * 1e6 / 16 / ncol /
                               (nkptj + 1)))  # +1 for pqkRbuf/pqkIbuf
        Gblksize = min(Gblksize, ngrids, 16384)
        pqkRbuf = numpy.empty(ncol * Gblksize)
        pqkIbuf = numpy.empty(ncol * Gblksize)
        buf = numpy.empty(nkptj * ncol * Gblksize, dtype=numpy.complex128)
        log.alldebug2('job_id %d blksize (%d,%d)', job_id, Gblksize, ncol)
        wcoulG = mydf.weighted_coulG(kpt, False, mesh)
        # Shells appended after the auxcell shells in the fused cell.
        fused_cell_slice = (auxcell.nbas, fused_cell.nbas)
        if aosym == 's2':
            shls_slice = (sh0, sh1, 0, sh1)
        else:
            shls_slice = (sh0, sh1, 0, cell.nbas)
        for p0, p1 in lib.prange(0, ngrids, Gblksize):
            Gaux = ft_ao.ft_ao(fused_cell, Gv[p0:p1], fused_cell_slice, b,
                               gxyz[p0:p1], Gvbase, kpt)
            Gaux *= wcoulG[p0:p1, None]
            kLR = Gaux.real.copy('C')
            kLI = Gaux.imag.copy('C')
            Gaux = None
            dat = ft_ao._ft_aopair_kpts(cell,
                                        Gv[p0:p1],
                                        shls_slice,
                                        aosym,
                                        b,
                                        gxyz[p0:p1],
                                        Gvbase,
                                        kpt,
                                        adapted_kptjs,
                                        out=buf)
            nG = p1 - p0
            for k, ji in enumerate(adapted_ji_idx):
                aoao = dat[k].reshape(nG, ncol)
                pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf)
                pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf)
                pqkR[:] = aoao.real.T
                pqkI[:] = aoao.imag.T
                # In-place accumulation into rows naux: of the chunk
                # (real part, then imaginary part for complex cases).
                lib.dot(kLR.T, pqkR.T, -1, j3cR[k][naux:], 1)
                lib.dot(kLI.T, pqkI.T, -1, j3cR[k][naux:], 1)
                if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])):
                    lib.dot(kLR.T, pqkI.T, -1, j3cI[k][naux:], 1)
                    lib.dot(kLI.T, pqkR.T, 1, j3cI[k][naux:], 1)
            kLR = kLI = None
        for k, idx in enumerate(adapted_ji_idx):
            if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                v = fuse(j3cR[k])
            else:
                v = fuse(j3cR[k] + j3cI[k] * 1j)
            if j2ctag == 'CD':
                v = scipy.linalg.solve_triangular(j2c,
                                                  v,
                                                  lower=True,
                                                  overwrite_b=True)
                fswap['j3c-chunks/%d/%d' % (job_id, idx)][:naux0] = v
            else:
                fswap['j3c-chunks/%d/%d' % (job_id, idx)][:naux0] = lib.dot(
                    j2c, v)
            # low-dimension systems
            if j2c_negative is not None:
                fswap['j3c-/%d/%d' % (job_id, idx)] = lib.dot(j2c_negative, v)

    _assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file,
              fswap, log)
def _assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file, fswap,
              log):
    """Run the j3c jobs across MPI ranks and write the final tensor file.

    Pass 1: jobs from ``j3c_jobs`` are distributed with
    ``mpi.work_stealing_partition``; each is processed with ``gen_int3c``
    followed by ``ft_fuse`` for every unique k-point difference.
    Pass 2: the per-job chunks in ``fswap`` are redistributed with
    ``mpi.alltoall`` so that each rank holds a contiguous range of
    auxiliary rows for all AO pairs, and written to ``cderi_file``.

    Args:
        mydf: density-fitting object (``cell`` and ``max_memory`` are read).
        kptij_lst: array of k-point pairs, indexed as [pair, 0/1, xyz].
        j3c_jobs: sequence of (job_id, ish0, ish1) tuples.
        gen_int3c, ft_fuse: callables produced by the caller.
        cderi_file: path of the HDF5 file to create (opened with mode 'w').
        fswap: open swap file holding 'j2c/*' and receiving the chunks.
        log: logger used for timing and debug output.
    """
    t1 = (logger.process_clock(), logger.perf_counter())
    cell = mydf.cell
    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    kptis = kptij_lst[:, 0]
    kptjs = kptij_lst[:, 1]
    kpt_ji = kptjs - kptis
    uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
    aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9
    t2 = t1
    # j3c_workers[job] = rank that processed the job; each entry is set by
    # exactly one rank, so the sum over ranks yields the owner table.
    j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int)
    #for job_id, ish0, ish1 in mpi.work_share_partition(j3c_jobs):
    for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs):
        gen_int3c(job_id, ish0, ish1)
        t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2)
        for k, kpt in enumerate(uniq_kpts):
            ft_fuse(job_id, k, ish0, ish1)
            t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2)
        j3c_workers[job_id] = rank
    j3c_workers = mpi.allreduce(j3c_workers)
    log.debug2('j3c_workers %s', j3c_workers)
    t1 = log.timer_debug1('int3c and fuse', *t1)
    # Pass 2
    # Transpose 3-index tensor and save data in cderi_file
    feri = h5py.File(cderi_file, 'w')
    nauxs = [fswap['j2c/%d' % k].shape[0] for k, kpt in enumerate(uniq_kpts)]
    # Each rank stores auxiliary rows naux0 : naux0 + segsize (clipped).
    segsize = (max(nauxs) + mpi.pool.size - 1) // mpi.pool.size
    naux0 = rank * segsize
    for k, kptij in enumerate(kptij_lst):
        naux1 = min(nauxs[uniq_inverse[k]], naux0 + segsize)
        nrow = max(0, naux1 - naux0)
        if gamma_point(kptij):
            dtype = 'f8'
        else:
            dtype = 'c16'
        if aosym_s2[k]:
            nao_pair = nao * (nao + 1) // 2
        else:
            nao_pair = nao * nao
        feri.create_dataset('j3c/%d' % k, (nrow, nao_pair),
                            dtype,
                            maxshape=(None, nao_pair))

    def get_segs_loc(aosym):
        """Return, per job (in job order), the (start, stop) column offsets
        of its data inside the buffer received from alltoall, where data
        arrives grouped by the rank that owns each job."""
        off0 = numpy.asarray([ao_loc[i0] for x, i0, i1 in j3c_jobs])
        off1 = numpy.asarray([ao_loc[i1] for x, i0, i1 in j3c_jobs])
        if aosym:  # s2
            dims = off1 * (off1 + 1) // 2 - off0 * (off0 + 1) // 2
        else:
            dims = (off1 - off0) * nao
        #dims = numpy.asarray([ao_loc[i1]-ao_loc[i0] for x,i0,i1 in j3c_jobs])
        dims = numpy.hstack(
            [dims[j3c_workers == w] for w in range(mpi.pool.size)])
        job_idx = numpy.hstack(
            [numpy.where(j3c_workers == w)[0] for w in range(mpi.pool.size)])
        segs_loc = numpy.append(0, numpy.cumsum(dims))
        segs_loc = [(segs_loc[j], segs_loc[j + 1])
                    for j in numpy.argsort(job_idx)]
        return segs_loc

    segs_loc_s1 = get_segs_loc(False)
    segs_loc_s2 = get_segs_loc(True)
    # Jobs whose chunks live in this rank's swap file.
    job_ids = numpy.where(rank == j3c_workers)[0]

    def load(k, p0, p1):
        """Read rows p0:p1 (relative to each destination rank's segment) of
        pair ``k`` from the local chunks; returns one flat array per
        destination rank."""
        naux1 = nauxs[uniq_inverse[k]]
        slices = [(min(i * segsize + p0, naux1), min(i * segsize + p1, naux1))
                  for i in range(mpi.pool.size)]
        segs = []
        for p0, p1 in slices:
            val = [
                fswap['j3c-chunks/%d/%d' % (job, k)][p0:p1].ravel()
                for job in job_ids
            ]
            if val:
                segs.append(numpy.hstack(val))
            else:
                segs.append(numpy.zeros(0))
        return segs

    def save(k, p0, p1, segs):
        """Exchange the loaded segments between ranks and write the local
        rows of pair ``k`` to the output file."""
        # Collective call: executed by every rank even when it has no rows
        # to write.
        segs = mpi.alltoall(segs)
        naux1 = nauxs[uniq_inverse[k]]
        loc0, loc1 = min(p0, naux1 - naux0), min(p1, naux1 - naux0)
        nL = loc1 - loc0
        if nL > 0:
            if aosym_s2[k]:
                segs = numpy.hstack([
                    segs[i0 * nL:i1 * nL].reshape(nL, -1)
                    for i0, i1 in segs_loc_s2
                ])
            else:
                segs = numpy.hstack([
                    segs[i0 * nL:i1 * nL].reshape(nL, -1)
                    for i0, i1 in segs_loc_s1
                ])
            feri['j3c/%d' % k][loc0:loc1] = segs

    mem_now = max(comm.allgather(lib.current_memory()[0]))
    max_memory = max(2000, min(8000, mydf.max_memory - mem_now))
    if numpy.all(aosym_s2):
        if gamma_point(kptij_lst):
            blksize = max(16, int(max_memory * .5e6 / 8 / nao**2))
        else:
            blksize = max(16, int(max_memory * .5e6 / 16 / nao**2))
    else:
        blksize = max(16, int(max_memory * .5e6 / 16 / nao**2 / 2))
    log.debug1('max_momory %d MB (%d in use), blksize %d', max_memory, mem_now,
               blksize)
    t2 = t1
    # Reading (foreground) overlaps with exchange + writing (background).
    with lib.call_in_background(save) as async_write:
        for k, kptji in enumerate(kptij_lst):
            for p0, p1 in lib.prange(0, segsize, blksize):
                segs = load(k, p0, p1)
                async_write(k, p0, p1, segs)
                t2 = log.timer_debug1(
                    'assemble k=%d %d:%d (in %d)' % (k, p0, p1, segsize), *t2)
    # Extra tensors stored by ft_fuse under 'j3c-' for metrics that have a
    # 'j2c-' entry: gathered on rank 0 and written there.
    if 'j2c-' in fswap:
        j2c_kpts_lists = []
        for k, kpt in enumerate(uniq_kpts):
            if ('j2c-/%d' % k) in fswap:
                adapted_ji_idx = numpy.where(uniq_inverse == k)[0]
                j2c_kpts_lists.append(adapted_ji_idx)
        for k in numpy.hstack(j2c_kpts_lists):
            val = [
                numpy.asarray(fswap['j3c-/%d/%d' % (job, k)]).ravel()
                for job in job_ids
            ]
            val = mpi.gather(numpy.hstack(val))
            if rank == 0:
                # NOTE(review): reads job 0 from rank 0's swap file, which
                # presumably assumes rank 0 processed job 0 -- confirm.
                naux1 = fswap['j3c-/0/%d' % k].shape[0]
                if aosym_s2[k]:
                    v = [
                        val[i0 * naux1:i1 * naux1].reshape(naux1, -1)
                        for i0, i1 in segs_loc_s2
                    ]
                else:
                    v = [
                        val[i0 * naux1:i1 * naux1].reshape(naux1, -1)
                        for i0, i1 in segs_loc_s1
                    ]
                feri['j3c-/%d' % k] = numpy.hstack(v)
    if 'j3c-kptij' in feri: del (feri['j3c-kptij'])
    feri['j3c-kptij'] = kptij_lst
    t1 = log.alltimer_debug1('assembling j3c', *t1)
    feri.close()
def _make_eris_outcore(mycc, mo_coeff=None):
    """Transform the AO integrals to the MO blocks needed by MPI-CCSD (out-of-core).

    Rank 0 initialises the common ERI data (MO coefficients, Fock matrix,
    ``nocc``, MO energies) and broadcasts it.  Every rank then half-transforms
    the integrals for the occupied orbitals plus its own slice of virtual
    orbitals, finishes the second half-transformation block by block and
    stores the results in HDF5 datasets attached to the returned object
    (``oooo``, ``oovv``, ``ovoo``, ``ovvo``, ``ovov``, ``vvvo``).

    Args:
        mycc: CC object; ``mycc.direct`` must be true (asserted below).
        mo_coeff: optional MO coefficients, forwarded to ``_common_init_``
            on rank 0 only.

    Returns:
        The ERI container, which is also stored as ``mycc._eris``.
    """
    cput0 = (logger.process_clock(), logger.perf_counter())
    log = logger.Logger(mycc.stdout, mycc.verbose)
    _sync_(mycc)
    eris = ccsd._ChemistsERIs()
    if rank == 0:
        eris._common_init_(mycc, mo_coeff)
        comm.bcast((eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy))
    else:
        eris.mol = mycc.mol
        eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy = comm.bcast(None)

    mol = mycc.mol
    mo_coeff = numpy.asarray(eris.mo_coeff, order='F')
    nocc = eris.nocc
    nao, nmo = mo_coeff.shape
    nvir = nmo - nocc
    orbo = mo_coeff[:, :nocc]
    orbv = mo_coeff[:, nocc:]
    nvpair = nvir * (nvir + 1) // 2  # only referenced by the disabled ovvv code
    # Virtual-orbital range owned by each MPI task.
    vlocs = [_task_location(nvir, task_id) for task_id in range(mpi.pool.size)]
    vloc0, vloc1 = vlocs[rank]
    vseg = vloc1 - vloc0

    eris.feri1 = lib.H5TmpFile()
    eris.oooo = eris.feri1.create_dataset('oooo', (nocc, nocc, nocc, nocc),
                                          'f8')
    eris.oovv = eris.feri1.create_dataset('oovv', (nocc, nocc, vseg, nvir),
                                          'f8', chunks=(nocc, nocc, 1, nvir))
    eris.ovoo = eris.feri1.create_dataset('ovoo', (nocc, vseg, nocc, nocc),
                                          'f8', chunks=(nocc, 1, nocc, nocc))
    eris.ovvo = eris.feri1.create_dataset('ovvo', (nocc, vseg, nvir, nocc),
                                          'f8', chunks=(nocc, 1, nvir, nocc))
    eris.ovov = eris.feri1.create_dataset('ovov', (nocc, vseg, nocc, nvir),
                                          'f8', chunks=(nocc, 1, nocc, nvir))
    # eris.ovvv = eris.feri1.create_dataset('ovvv', (nocc,vseg,nvpair), 'f8', chunks=(nocc,1,nvpair))
    eris.vvvo = eris.feri1.create_dataset('vvvo', (vseg, nvir, nvir, nocc),
                                          'f8', chunks=(1, nvir, 1, nocc))
    assert (mycc.direct)

    def save_occ_frac(p0, p1, eri):
        # Rows p0:p1 are occupied orbitals; store the oooo block and the
        # locally owned virtual slice of oovv.
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.oooo[p0:p1] = eri[:, :, :nocc, :nocc]
        eris.oovv[p0:p1] = eri[:, :, nocc + vloc0:nocc + vloc1, nocc:]

    def save_vir_frac(p0, p1, eri):
        # Rows p0:p1 index the local virtual segment.
        log.alldebug1('save_vir_frac %d %d %s', p0, p1, eri.shape)
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.ovoo[:, p0:p1] = eri[:, :, :nocc, :nocc].transpose(1, 0, 2, 3)
        eris.ovvo[:, p0:p1] = eri[:, :, nocc:, :nocc].transpose(1, 0, 2, 3)
        eris.ovov[:, p0:p1] = eri[:, :, :nocc, nocc:].transpose(1, 0, 2, 3)
        # vvv = lib.pack_tril(eri[:,:,nocc:,nocc:].reshape((p1-p0)*nocc,nvir,nvir))
        # eris.ovvv[:,p0:p1] = vvv.reshape(p1-p0,nocc,nvpair).transpose(1,0,2)

        cput2 = logger.process_clock(), logger.perf_counter()
        # Slice the vv part by the virtual range of every task and exchange,
        # so each rank receives the pieces belonging to its own segment.
        ovvv_segs = [
            eri[:, :, nocc + q0:nocc + q1, nocc:].transpose(2, 3, 0, 1)
            for q0, q1 in vlocs
        ]
        ovvv_segs = mpi.alltoall_new(ovvv_segs, split_recvbuf=True)
        cput2 = log.timer_debug1('vvvo alltoall', *cput2)
        # Each sender reports the local row range (p0, p1) it just processed;
        # shift it by the sender's virtual offset to get the global position.
        for task_id, (q0, q1) in enumerate(comm.allgather((p0, p1))):
            ip0 = q0 + vlocs[task_id][0]
            ip1 = q1 + vlocs[task_id][0]
            eris.vvvo[:, :, ip0:ip1] = ovvv_segs[task_id].reshape(
                vseg, nvir, q1 - q0, nocc)

    fswap = lib.H5TmpFile()
    max_memory = max(MEMORYMIN, mycc.max_memory - lib.current_memory()[0])

    int2e = mol._add_suffix('int2e')
    # First index: all occupied orbitals followed by the local virtuals.
    orbov = numpy.hstack((orbo, orbv[:, vloc0:vloc1]))
    ao2mo.outcore.half_e1(mol, (orbov, orbo), fswap, int2e, 's4', 1,
                          max_memory, verbose=log)

    ao_loc = mol.ao_loc_nr()
    nao_pair = nao * (nao + 1) // 2
    blksize = int(min(8e9, max_memory * .5e6) / 8 / (nao_pair + nmo**2) / nocc)
    blksize = min(nvir, max(BLKMIN, blksize))
    fload = ao2mo.outcore._load_from_h5g

    # Two buffers: one is transformed while the other is filled in the
    # background.
    buf = numpy.empty((blksize * nocc, nao_pair))
    buf_prefetch = numpy.empty_like(buf)

    def prefetch(p0, p1, rowmax):
        # Load the block that follows [p0:p1) into the spare buffer.
        p0, p1 = p1, min(rowmax, p1 + blksize)
        if p0 < p1:
            fload(fswap['0'], p0 * nocc, p1 * nocc, buf_prefetch)

    cput1 = logger.process_clock(), logger.perf_counter()
    outbuf = numpy.empty((blksize * nocc, nmo**2))
    with lib.call_in_background(prefetch) as bprefetch:
        fload(fswap['0'], 0, min(nocc, blksize) * nocc, buf_prefetch)
        for p0, p1 in lib.prange(0, nocc, blksize):
            nrow = (p1 - p0) * nocc
            buf, buf_prefetch = buf_prefetch, buf
            bprefetch(p0, p1, nocc)
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff, (0, nmo, 0, nmo),
                                     's4', 's1', out=outbuf, ao_loc=ao_loc)
            save_occ_frac(p0, p1, dat)

        # save_vir_frac contains collective calls, so every rank has to use
        # the same block size for the virtual part.
        blksize = min(comm.allgather(blksize))
        norb_max = nocc + vseg
        fload(fswap['0'], nocc**2, min(nocc + blksize, norb_max) * nocc,
              buf_prefetch)
        for p0, p1 in mpi.prange(vloc0, vloc1, blksize):
            i0, i1 = p0 - vloc0, p1 - vloc0
            nrow = (p1 - p0) * nocc
            buf, buf_prefetch = buf_prefetch, buf
            bprefetch(nocc + i0, nocc + i1, norb_max)
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff, (0, nmo, 0, nmo),
                                     's4', 's1', out=outbuf, ao_loc=ao_loc)
            save_vir_frac(i0, i1, dat)

    # Release the work buffers only after the background thread has been
    # joined.  (The original misspelled ``buf_prefetch`` here, so the
    # prefetch buffer was never released.)
    buf = buf_prefetch = outbuf = None

    cput1 = log.timer_debug1('transforming oppp', *cput1)
    log.timer('CCSD integral transformation', *cput0)
    mycc._eris = eris
    return eris
def update_lambda(mycc, t1, t2, l1, l2, eris, imds):
    """
    Update the lambda (doubles) amplitudes; singles are returned as zeros.

    NOTE(review): the original summary read "Update GCCSD lambda", but every
    T1/L1-dependent term below is commented out and ``l1`` is returned as a
    zero array, so in its current state this is a doubles-only update.

    All four-index arrays are handled in the transposed layout
    ``[a]bij`` in which the leading virtual index is the segment owned by
    this MPI rank (see the assertion on ``vlocs``).

    Args:
        mycc: CC object (provides ``stdout``, ``verbose``, ``level_shift``).
        t1, t2: T amplitudes; only ``t2`` enters the active terms.
        l1, l2: current lambda amplitudes; ``l1`` is only used for its shape.
        eris: integral object (``fock``, ``mo_energy``, ``xvoo``, ``xvvv``).
        imds: intermediates (``v1``, ``v2``, ``woooo``, ``wovvo``).

    Returns:
        (l1new, l2new) with ``l1new`` all zeros and ``l2new`` in the same
        layout as the input ``l2``.
    """
    time0 = logger.process_clock(), logger.perf_counter()
    log = logger.Logger(mycc.stdout, mycc.verbose)

    t1T = t1.T
    t2T = np.asarray(t2.transpose(2, 3, 0, 1), order='C')
    t1 = t2 = None
    nvir_seg, nvir, nocc = t2T.shape[:3]
    l1T = l1.T
    l2T = np.asarray(l2.transpose(2, 3, 0, 1), order='C')
    l1 = l2 = None

    ntasks = mpi.pool.size
    # Virtual-orbital range owned by each task.
    vlocs = [_task_location(nvir, task_id) for task_id in range(ntasks)]
    vloc0, vloc1 = vlocs[rank]
    log.debug2('vlocs %s', vlocs)
    assert vloc1 - vloc0 == nvir_seg

    fvo = eris.fock[nocc:, :nocc]
    mo_e_o = eris.mo_energy[:nocc]
    mo_e_v = eris.mo_energy[nocc:] + mycc.level_shift
    # Remove the orbital energies from the diagonal; they are divided out at
    # the end through ``eia``.
    v1 = imds.v1 - np.diag(mo_e_v)
    v2 = imds.v2 - np.diag(mo_e_o)

    # Partial sums over the local virtual segment, completed by allreduce.
    mba = einsum('cakl, cbkl -> ba', l2T, t2T) * 0.5
    mba = mpi.allreduce_inplace(mba)
    mij = einsum('cdki, cdkj -> ij', l2T, t2T) * 0.5
    mij = mpi.allreduce_inplace(mij)
    # m3 [a]bij
    m3 = einsum('abkl, ijkl -> abij', l2T, np.asarray(imds.woooo))
    tauT = t2T  #+ np.einsum('ai, bj -> abij', t1T[vloc0:vloc1] * 2.0, t1T, optimize=True)
    tmp = einsum('cdij, cdkl -> ijkl', l2T, tauT)
    tmp = mpi.allreduce_inplace(tmp)
    tauT = None
    vvoo = np.asarray(eris.xvoo)
    tmp = einsum('abkl, ijkl -> abij', vvoo, tmp)
    tmp *= 0.25
    m3 += tmp
    tmp = None

    #tmp = einsum('cdij, dk -> ckij', l2T, t1T)
    #for task_id, tmp, p0, p1 in _rotate_vir_block(tmp, vlocs=vlocs):
    #    m3 -= einsum('kcba, ckij -> abij', eris.ovvx[:, p0:p1], tmp)
    #    tmp = None

    eris_vvvv = eris.xvvv.transpose(2, 3, 0, 1)
    # tmp_2 collects the l2 * v1 contraction block by block while l2T is
    # being rotated anyway; it is consumed further down (``tmp = -tmp_2``).
    tmp_2 = np.empty_like(l2T)
    for task_id, l2T_tmp, p0, p1 in _rotate_vir_block(l2T, vlocs=vlocs):
        tmp = einsum('cdij, cdab -> abij', l2T_tmp, eris_vvvv[p0:p1])
        tmp *= 0.5
        m3 += tmp
        tmp_2[:, p0:p1] = einsum('acij, cb -> baij', l2T_tmp,
                                 v1[:, vloc0:vloc1])
        tmp = l2T_tmp = None
    eris_vvvv = None

    #l1Tnew = einsum('abij, bj -> ai', m3, t1T)
    #l1Tnew = mpi.allgather(l1Tnew)
    # With the singles terms disabled, lambda1 is simply zero.
    l1Tnew = np.zeros_like(l1T)

    # l2Tnew aliases m3: the remaining terms are accumulated in place.
    l2Tnew = m3
    l2Tnew += vvoo
    #fvo1 = fvo #+ mpi.allreduce(einsum('cbkj, ck -> bj', vvoo, t1T[vloc0:vloc1]))

    #tmp = np.einsum('ai, bj -> abij', l1T[vloc0:vloc1], fvo1, optimize=True)
    tmp = 0.0
    wvovo = np.asarray(imds.wovvo).transpose(1, 0, 2, 3)
    for task_id, w_tmp, p0, p1 in _rotate_vir_block(wvovo, vlocs=vlocs):
        tmp -= einsum('acki, cjbk -> abij', l2T[:, p0:p1], w_tmp)
        w_tmp = None
    wvovo = None
    # Antisymmetrise in (i, j) locally ...
    tmp = tmp - tmp.transpose(0, 1, 3, 2)
    l2Tnew += tmp
    # ... and in (a, b): the a<->b transpose lives on other ranks, so the
    # column blocks are exchanged with alltoall and subtracted.
    tmpT = mpi.alltoall_new([tmp[:, p0:p1] for p0, p1 in vlocs],
                            split_recvbuf=True)
    for task_id, (p0, p1) in enumerate(vlocs):
        tmp = tmpT[task_id].reshape(p1 - p0, nvir_seg, nocc, nocc)
        l2Tnew[:, p0:p1] -= tmp.transpose(1, 0, 2, 3)
        tmp = None

    #tmp = einsum('ak, ijkb -> baij', l1T, eris.ooox)
    #tmp -= tmp_2
    tmp = -tmp_2
    tmp1vv = mba  #+ np.dot(t1T, l1T.T) # ba
    tmp -= einsum('ca, bcij -> baij', tmp1vv, vvoo)
    l2Tnew += tmp
    # Same (a, b) antisymmetrisation via alltoall as above.
    tmpT = mpi.alltoall_new([tmp[:, p0:p1] for p0, p1 in vlocs],
                            split_recvbuf=True)
    for task_id, (p0, p1) in enumerate(vlocs):
        tmp = tmpT[task_id].reshape(p1 - p0, nvir_seg, nocc, nocc)
        l2Tnew[:, p0:p1] -= tmp.transpose(1, 0, 2, 3)
        tmp = None

    #tmp = einsum('jcab, ci -> baji', eris.ovvx, -l1T)
    tmp = einsum('abki, jk -> abij', l2T, v2)
    tmp1oo = mij  #+ np.dot(l1T.T, t1T) # ik
    tmp -= einsum('ik, abkj -> abij', tmp1oo, vvoo)
    vvoo = None
    l2Tnew += tmp
    l2Tnew -= tmp.transpose(0, 1, 3, 2)
    tmp = None

    # Disabled lambda1 equation, kept for reference:
    #l1Tnew += fvo
    #tmp = einsum('bj, ibja -> ai', -l1T[vloc0:vloc1], eris.oxov)
    #l1Tnew += np.dot(v1.T, l1T)
    #l1Tnew -= np.dot(l1T, v2.T)
    #tmp -= einsum('cakj, icjk -> ai', l2T, imds.wovoo)
    #tmp -= einsum('bcak, bcik -> ai', imds.wvvvo, l2T)
    #tmp += einsum('baji, bj -> ai', l2T, imds.w3[vloc0:vloc1])
    #tmp_2 = t1T[vloc0:vloc1] - np.dot(tmp1vv[vloc0:vloc1], t1T)
    #tmp_2 -= np.dot(t1T[vloc0:vloc1], mij)
    #tmp_2 += einsum('bcjk, ck -> bj', t2T, l1T)
    #tmp += einsum('baji, bj -> ai', vvoo, tmp_2)
    #tmp_2 = None
    #tmp += einsum('icab, bc -> ai', eris.oxvv, tmp1vv[:, vloc0:vloc1])
    #l1Tnew += mpi.allreduce(tmp)
    #l1Tnew -= mpi.allgather(einsum('jika, kj -> ai', eris.ooox, tmp1oo))
    #tmp = fvo - mpi.allreduce(einsum('bakj, bj -> ak', vvoo, t1T[vloc0:vloc1]))
    #vvoo = None
    #l1Tnew -= np.dot(tmp, mij.T)
    #l1Tnew -= np.dot(mba.T, tmp)

    eia = mo_e_o[:, None] - mo_e_v
    #l1Tnew /= eia.T
    # Divide by the orbital-energy denominator, one local virtual row at a
    # time (``i`` is the global virtual index of the row).
    for i in range(vloc0, vloc1):
        l2Tnew[i - vloc0] /= lib.direct_sum('i + jb -> bij', eia[:, i], eia)

    time0 = log.timer_debug1('update l1 l2', *time0)
    return l1Tnew.T, l2Tnew.transpose(2, 3, 0, 1)
def get_veff(mf, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
    """Build the unrestricted Kohn-Sham effective potential with MPI.

    The SCF object is synchronised across ranks first; the ``mol`` argument
    is ignored in favour of ``mf.mol``.  Arguments that were skipped when the
    call was dispatched (``mpi.Message.SkippedArg``) are broadcast here.

    Returns:
        ``vxc + vj - vk`` as a tagged array carrying ``ecoul``, ``exc``,
        ``vj`` and ``vk``.  ``vk`` is None when the functional has no exact
        exchange; ``ecoul`` is None unless ``dm`` is a (2, nao, nao) pair.

    Raises:
        NotImplementedError: if a non-local correlation functional is set.
    """
    t0 = (logger.process_clock(), logger.perf_counter())
    mf.unpack_(comm.bcast(mf.pack()))
    mol = mf.mol
    ni = mf._numint

    if mf.nlc != '':
        raise NotImplementedError
    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)

    # Broadcast the large input arrays here.
    skipped = mpi.Message.SkippedArg
    if any(comm.allgather(dm is skipped)):
        if rank == 0 and dm is None:
            dm = mf.make_rdm1()
        dm = mpi.bcast_tagged_array(dm)

    if any(comm.allgather(dm_last is skipped)):
        dm_last = mpi.bcast_tagged_array(dm_last)

    if any(comm.allgather(vhf_last is skipped)):
        vhf_last = mpi.bcast_tagged_array(vhf_last)

    ground_state = (dm.ndim == 3 and dm.shape[0] == 2)

    if mf.grids.coords is None:
        mpi_rks._setup_grids_(mf, dm[0]+dm[1])
        t0 = logger.timer(mf, 'setting up grids', *t0)

    if hermi == 2:  # because rho = 0
        n, exc, vxc = 0, 0, 0
    else:
        # Each rank integrates its share of the grid; combine the pieces.
        n, exc, vxc = ni.nr_uks(mol, mf.grids, mf.xc, dm)
        n = comm.allreduce(n)
        exc = comm.allreduce(exc)
        vxc = mpi.reduce(vxc)
        logger.debug(mf, 'nelec by numeric integration = %s', n)
        t0 = logger.timer(mf, 'vxc', *t0)

    pure_functional = abs(hyb) < 1e-10 and abs(alpha) < 1e-10
    if pure_functional:
        # Coulomb only.
        vk = None
        if getattr(vhf_last, 'vj', None) is None:
            vj = mf.get_j(mol, dm[0]+dm[1], hermi)
        else:
            # Incremental build from the density change.
            ddm = numpy.asarray(dm) - dm_last
            ddm = ddm[0] + ddm[1]
            vj = mf.get_j(mol, ddm, hermi)
            vj += vhf_last.vj
        vxc += vj
    else:
        # Hybrid: J and scaled K, built either from the density change
        # (incremental) or from the full density.
        incremental = getattr(vhf_last, 'vk', None) is not None
        if incremental:
            density = numpy.asarray(dm) - dm_last
        else:
            density = dm

        vj, vk = mf.get_jk(mol, density, hermi)
        vj = vj[0] + vj[1]
        vk *= hyb
        if abs(omega) > 1e-10:
            # Long-range exchange for range-separated functionals.
            vklr = mf.get_k(mol, density, hermi, omega=omega)
            vk += vklr * (alpha - hyb)

        if incremental:
            density = None
            vj += vhf_last.vj
            vk += vhf_last.vk

        vxc += vj
        vxc -= vk

        if ground_state:
            exc -=(numpy.einsum('ij,ji', dm[0], vk[0]) +
                   numpy.einsum('ij,ji', dm[1], vk[1])) * .5

    ecoul = None
    if ground_state:
        ecoul = numpy.einsum('ij,ji', dm[0]+dm[1], vj) * .5

    return lib.tag_array(vxc, ecoul=ecoul, exc=exc, vj=vj, vk=vk)
def update_amps(mycc, t1, t2, eris):
    """One iteration of the CCSD amplitude equations, distributed over MPI.

    ``t2`` holds only the slice of virtual orbitals owned by this rank; it is
    handled in the transposed layout ``t2T[a_local, b, i, j]``.  Integrals
    are read block by block from ``eris`` and large intermediates are kept
    in a temporary HDF5 file.

    Args:
        mycc: CC object (``max_memory``, ``level_shift``, ``direct``,
            ``_add_vvvv``).
        t1, t2: current amplitudes.
        eris: integral container (``fock``, ``mo_energy``, ``oooo``,
            ``ovoo``, ``oovv``, ``ovvo``, ``vvvo``).

    Returns:
        (t1new, t2new) in the same layout as the inputs.

    Raises:
        NotImplementedError: if ``mycc.direct`` is false.
    """
    time1 = time0 = logger.process_clock(), logger.perf_counter()
    log = logger.Logger(mycc.stdout, mycc.verbose)
    cpu1 = time0

    t1T = t1.T
    t2T = numpy.asarray(t2.transpose(2, 3, 0, 1), order='C')
    nvir_seg, nvir, nocc = t2T.shape[:3]
    t1 = t2 = None
    ntasks = mpi.pool.size
    # Virtual-orbital range owned by each task.
    vlocs = [_task_location(nvir, task_id) for task_id in range(ntasks)]
    vloc0, vloc1 = vlocs[rank]
    log.debug2('vlocs %s', vlocs)
    assert (vloc1 - vloc0 == nvir_seg)

    fock = eris.fock
    mo_e_o = eris.mo_energy[:nocc]
    mo_e_v = eris.mo_energy[nocc:] + mycc.level_shift

    def _rotate_vir_block(buf):
        # Yield each task's block together with the virtual range it covers.
        for task_id, buf in _rotate_tensor_block(buf):
            loc0, loc1 = vlocs[task_id]
            yield task_id, buf, loc0, loc1

    # Scratch file for the W intermediates.
    fswap = lib.H5TmpFile()
    wVooV = numpy.zeros((nvir_seg, nocc, nocc, nvir))
    eris_voov = _cp(eris.ovvo).transpose(1, 0, 3, 2)
    tau = t2T * .5
    tau += numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
    # NOTE: the loop variable deliberately rebinds ``tau`` to the block
    # received from each task.
    for task_id, tau, p0, p1 in _rotate_vir_block(tau):
        wVooV += lib.einsum('bkic,cajk->bija', eris_voov[:, :, :, p0:p1], tau)
    fswap['wVooV1'] = wVooV
    wVooV = tau = None
    time1 = log.timer_debug1('wVooV', *time1)

    # wVOov aliases eris_voov and is updated in place; eris_VOov is formed
    # from it before the accumulation starts.
    wVOov = eris_voov
    eris_VOov = eris_voov - eris_voov.transpose(0, 2, 1, 3) * .5
    tau = t2T.transpose(2, 0, 3, 1) - t2T.transpose(3, 0, 2, 1) * .5
    tau -= numpy.einsum('ai,bj->jaib', t1T[vloc0:vloc1], t1T)
    for task_id, tau, p0, p1 in _rotate_vir_block(tau):
        wVOov += lib.einsum('dlkc,kcjb->dljb', eris_VOov[:, :, :, p0:p1], tau)
    fswap['wVOov1'] = wVOov
    wVOov = tau = eris_VOov = eris_voov = None
    time1 = log.timer_debug1('wVOov', *time1)

    t1Tnew = numpy.zeros_like(t1T)
    t2Tnew = mycc._add_vvvv(t1T, t2T, eris, t2sym='jiba')
    time1 = log.timer_debug1('vvvv', *time1)

    #** make_inter_F
    fov = fock[:nocc, nocc:].copy()
    t1Tnew += fock[nocc:, :nocc]

    foo = fock[:nocc, :nocc] - numpy.diag(mo_e_o)
    foo += .5 * numpy.einsum('ia,aj->ij', fock[:nocc, nocc:], t1T)

    fvv = fock[nocc:, nocc:] - numpy.diag(mo_e_v)
    fvv -= .5 * numpy.einsum('ai,ib->ab', t1T, fock[:nocc, nocc:])

    # Per-rank partial sums; they are combined with allreduce further down.
    foo_priv = numpy.zeros_like(foo)
    fov_priv = numpy.zeros_like(fov)
    fvv_priv = numpy.zeros_like(fvv)
    t1T_priv = numpy.zeros_like(t1T)

    max_memory = mycc.max_memory - lib.current_memory()[0]
    unit = nocc * nvir**2 * 3 + nocc**2 * nvir + 1
    blksize = min(nvir,
                  max(BLKMIN, int((max_memory * .9e6 / 8 - t2T.size) / unit)))
    log.debug1('pass 1, max_memory %d MB, nocc,nvir = %d,%d blksize = %d',
               max_memory, nocc, nvir, blksize)

    buf = numpy.empty((blksize, nvir, nvir, nocc))

    def load_vvvo(p0):
        # Read the next block of local vvvo rows into the current ``buf``.
        p1 = min(nvir_seg, p0 + blksize)
        if p0 < p1:
            buf[:p1 - p0] = eris.vvvo[p0:p1]

    fswap.create_dataset('wVooV', (nvir_seg, nocc, nocc, nvir), 'f8')
    wVOov = []

    # Pass 1: terms that need the vvvo integrals.
    with lib.call_in_background(load_vvvo) as prefetch:
        load_vvvo(0)

        # p0:p1 are global virtual indices, i0:i1 the matching local ones.
        for p0, p1 in lib.prange(vloc0, vloc1, blksize):
            i0, i1 = p0 - vloc0, p1 - vloc0
            # Take the loaded block and hand a fresh buffer to the prefetcher.
            eris_vvvo, buf = buf[:p1 - p0], numpy.empty_like(buf)
            prefetch(i1)

            fvv_priv[p0:p1] += 2 * numpy.einsum('ck,abck->ab', t1T, eris_vvvo)
            fvv_priv -= numpy.einsum('ck,cabk->ab', t1T[p0:p1], eris_vvvo)

            if not mycc.direct:
                raise NotImplementedError
                # Unreachable: kept as a reference for the non-direct path.
                tau = t2T[i0:i1] + numpy.einsum('ai,bj->abij', t1T[p0:p1], t1T)
                for task_id, tau, q0, q1 in _rotate_vir_block(tau):
                    tmp = lib.einsum('bdck,cdij->bkij',
                                     eris_vvvo[:, :, q0:q1], tau)
                    t2Tnew -= lib.einsum('ak,bkij->baji', t1T, tmp)
                tau = tmp = None

            fswap['wVooV'][i0:i1] = lib.einsum('cj,baci->bija', -t1T,
                                               eris_vvvo)

            theta = t2T[i0:i1].transpose(0, 2, 1, 3) * 2
            theta -= t2T[i0:i1].transpose(0, 3, 1, 2)
            t1T_priv += lib.einsum('bicj,bacj->ai', theta, eris_vvvo)
            wVOov.append(lib.einsum('acbi,cj->abij', eris_vvvo, t1T))
            theta = eris_vvvo = None
            time1 = log.timer_debug1('vvvo [%d:%d]' % (p0, p1), *time1)

    # Redistribute wVOov: send each task the column block it owns.
    wVOov = numpy.vstack(wVOov)
    wVOov = mpi.alltoall_new([wVOov[:, q0:q1] for q0, q1 in vlocs],
                             split_recvbuf=True)
    wVOov = numpy.vstack([x.reshape(-1, nvir_seg, nocc, nocc) for x in wVOov])
    fswap['wVOov'] = wVOov.transpose(1, 2, 3, 0)
    wVooV = None

    unit = nocc**2 * nvir * 7 + nocc**3 + nocc * nvir**2
    max_memory = max(0, mycc.max_memory - lib.current_memory()[0])
    blksize = min(nvir,
                  max(BLKMIN, int((max_memory * .9e6 / 8 - nocc**4) / unit)))
    log.debug1('pass 2, max_memory %d MB, nocc,nvir = %d,%d blksize = %d',
               max_memory, nocc, nvir, blksize)

    woooo = numpy.zeros((nocc, nocc, nocc, nocc))

    # Pass 2: terms that need ovoo, oovv and ovvo.  Each ``with`` block
    # loads the next integral class in the background while the previous
    # one is being contracted.
    for p0, p1 in lib.prange(vloc0, vloc1, blksize):
        i0, i1 = p0 - vloc0, p1 - vloc0
        wVOov = fswap['wVOov'][i0:i1]
        wVooV = fswap['wVooV'][i0:i1]
        eris_ovoo = eris.ovoo[:, i0:i1]
        eris_oovv = numpy.empty((nocc, nocc, i1 - i0, nvir))

        def load_oovv(p0, p1):
            eris_oovv[:] = eris.oovv[:, :, p0:p1]

        with lib.call_in_background(load_oovv) as prefetch_oovv:
            #:eris_oovv = eris.oovv[:,:,i0:i1]
            prefetch_oovv(i0, i1)
            foo_priv += numpy.einsum('ck,kcji->ij', 2 * t1T[p0:p1], eris_ovoo)
            foo_priv += numpy.einsum('ck,icjk->ij', -t1T[p0:p1], eris_ovoo)
            tmp = lib.einsum('al,jaik->lkji', t1T[p0:p1], eris_ovoo)
            woooo += tmp + tmp.transpose(1, 0, 3, 2)
            tmp = None

            wVOov -= lib.einsum('jbik,ak->bjia', eris_ovoo, t1T)
            t2Tnew[i0:i1] += wVOov.transpose(0, 3, 1, 2)

            wVooV += lib.einsum('kbij,ak->bija', eris_ovoo, t1T)
            eris_ovoo = None
        load_oovv = prefetch_oovv = None

        eris_ovvo = numpy.empty((nocc, i1 - i0, nvir, nocc))

        def load_ovvo(p0, p1):
            eris_ovvo[:] = eris.ovvo[:, p0:p1]

        with lib.call_in_background(load_ovvo) as prefetch_ovvo:
            #:eris_ovvo = eris.ovvo[:,i0:i1]
            prefetch_ovvo(i0, i1)
            t1T_priv[p0:p1] -= numpy.einsum('bj,jiab->ai', t1T, eris_oovv)
            wVooV -= eris_oovv.transpose(2, 0, 1, 3)
            wVOov += wVooV * .5  #: bjia + bija*.5
        eris_voov = eris_ovvo.transpose(1, 0, 3, 2)
        eris_ovvo = None
        load_ovvo = prefetch_ovvo = None

        def update_wVooV(i0, i1):
            # Fold this block into the stored W intermediates.
            wVooV[:] += fswap['wVooV1'][i0:i1]
            fswap['wVooV1'][i0:i1] = wVooV
            wVOov[:] += fswap['wVOov1'][i0:i1]
            fswap['wVOov1'][i0:i1] = wVOov

        # The name ``update_wVooV`` is rebound to the background wrapper.
        with lib.call_in_background(update_wVooV) as update_wVooV:
            update_wVooV(i0, i1)

            t2Tnew[i0:i1] += eris_voov.transpose(0, 3, 1, 2) * .5
            t1T_priv[p0:p1] += 2 * numpy.einsum('bj,aijb->ai', t1T, eris_voov)

            tmp = lib.einsum('ci,kjbc->bijk', t1T, eris_oovv)
            tmp += lib.einsum('bjkc,ci->bjik', eris_voov, t1T)
            t2Tnew[i0:i1] -= lib.einsum('bjik,ak->baji', tmp, t1T)
            eris_oovv = tmp = None

            fov_priv[:, p0:p1] += numpy.einsum('ck,aikc->ia', t1T,
                                               eris_voov) * 2
            fov_priv[:, p0:p1] -= numpy.einsum('ck,akic->ia', t1T, eris_voov)

            tau = numpy.einsum('ai,bj->abij', t1T[p0:p1] * .5, t1T)
            tau += t2T[i0:i1]
            theta = tau.transpose(0, 1, 3, 2) * 2
            theta -= tau
            fvv_priv -= lib.einsum('caij,cjib->ab', theta, eris_voov)
            foo_priv += lib.einsum('aikb,abkj->ij', eris_voov, theta)
            tau = theta = None

            tau = t2T[i0:i1] + numpy.einsum('ai,bj->abij', t1T[p0:p1], t1T)
            woooo += lib.einsum('abij,aklb->ijkl', tau, eris_voov)
            tau = None
        eris_VOov = wVOov = wVooV = update_wVooV = None
        time1 = log.timer_debug1('voov [%d:%d]' % (p0, p1), *time1)

    # Contract the completed W intermediates with t2, rotating the W blocks
    # through all tasks.
    wVooV = _cp(fswap['wVooV1'])
    for task_id, wVooV, p0, p1 in _rotate_vir_block(wVooV):
        tmp = lib.einsum('ackj,ckib->ajbi', t2T[:, p0:p1], wVooV)
        t2Tnew += tmp.transpose(0, 2, 3, 1)
        t2Tnew += tmp.transpose(0, 2, 1, 3) * .5
    wVooV = tmp = None
    time1 = log.timer_debug1('contracting wVooV', *time1)

    wVOov = _cp(fswap['wVOov1'])
    theta = t2T * 2
    theta -= t2T.transpose(0, 1, 3, 2)
    for task_id, wVOov, p0, p1 in _rotate_vir_block(wVOov):
        t2Tnew += lib.einsum('acik,ckjb->abij', theta[:, p0:p1], wVOov)
    wVOov = theta = None
    fswap = None
    time1 = log.timer_debug1('contracting wVOov', *time1)

    # Combine the per-rank partial F intermediates.
    foo += mpi.allreduce(foo_priv)
    fov += mpi.allreduce(fov_priv)
    fvv += mpi.allreduce(fvv_priv)

    theta = t2T.transpose(0, 1, 3, 2) * 2 - t2T
    t1T_priv[vloc0:vloc1] += numpy.einsum('jb,abji->ai', fov, theta)
    ovoo = _cp(eris.ovoo)
    for task_id, ovoo, p0, p1 in _rotate_vir_block(ovoo):
        t1T_priv[vloc0:vloc1] -= lib.einsum('jbki,abjk->ai', ovoo,
                                            theta[:, p0:p1])
    theta = ovoo = None

    woooo = mpi.allreduce(woooo)
    woooo += _cp(eris.oooo).transpose(0, 2, 1, 3)
    tau = t2T + numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
    t2Tnew += .5 * lib.einsum('abkl,ijkl->abij', tau, woooo)
    tau = woooo = None

    t1Tnew += mpi.allreduce(t1T_priv)

    ft_ij = foo + numpy.einsum('aj,ia->ij', .5 * t1T, fov)
    ft_ab = fvv - numpy.einsum('ai,ib->ab', .5 * t1T, fov)
    t2Tnew += lib.einsum('acij,bc->abij', t2T, ft_ab)
    t2Tnew -= lib.einsum('ki,abkj->abij', ft_ij, t2T)

    eia = mo_e_o[:, None] - mo_e_v
    t1Tnew += numpy.einsum('bi,ab->ai', t1T, fvv)
    t1Tnew -= numpy.einsum('aj,ji->ai', t1T, foo)
    t1Tnew /= eia.T

    # Symmetrise: add the (a<->b, i<->j) transposed counterpart, whose rows
    # live on other ranks and therefore arrive through alltoall.
    t2tmp = mpi.alltoall_new([t2Tnew[:, p0:p1] for p0, p1 in vlocs],
                             split_recvbuf=True)
    for task_id, (p0, p1) in enumerate(vlocs):
        tmp = t2tmp[task_id].reshape(p1 - p0, nvir_seg, nocc, nocc)
        t2Tnew[:, p0:p1] += tmp.transpose(1, 0, 3, 2)

    # Divide by the orbital-energy denominator, one local virtual row at a
    # time (``i`` is the global virtual index).
    for i in range(vloc0, vloc1):
        t2Tnew[i - vloc0] /= lib.direct_sum('i+jb->bij', eia[:, i], eia)

    time0 = log.timer_debug1('update t1 t2', *time0)
    return t1Tnew.T, t2Tnew.transpose(2, 3, 0, 1)
def _contract_vvvv_t2(mycc, vvvv, t2T, task_locs, out=None, verbose=None):
    '''Ht2 = numpy.einsum('ijcd,acbd->ijab', t2, vvvv)
    where vvvv has to be real and has the 4-fold permutation symmetry

    Only the AO-direct algorithm (``vvvv is None``) is implemented: the
    integrals are generated on the fly in shell blocks and contracted with
    the blocks of ``t2T`` that are rotated through all MPI tasks.

    Args:
        vvvv : None or integral object
            if vvvv is None, contract t2 to AO-integrals using AO-direct
            algorithm
        t2T : array or callable returning the array; must be of dtype
            double.  The first dimension is the segment owned by this rank.
        task_locs : shell boundaries of the tasks; task ``k`` covers shells
            ``task_locs[k]:task_locs[k+1]``.
        out : optional buffer that backs the result array.

    Returns:
        Ht2 with the same shape as ``t2T`` after its trailing dimensions
        have been flattened into one.

    Raises:
        NotImplementedError: if ``vvvv`` is not None.
    '''
    time0 = logger.process_clock(), logger.perf_counter()
    mol = mycc.mol
    log = logger.new_logger(mycc, verbose)

    if callable(t2T):
        t2T = t2T()
    assert (t2T.dtype == numpy.double)
    nvira, nvirb = t2T.shape[:2]
    nvir2 = nvira * nvirb
    t2T = t2T.reshape(nvira, nvirb, -1)
    nocc2 = t2T.shape[2]
    Ht2 = numpy.ndarray(t2T.shape, dtype=t2T.dtype, buffer=out)
    Ht2[:] = 0

    _dgemm = lib.numpy_helper._dgemm

    def contract_blk_(Ht2, t2T, eri, i0, i1, j0, j1):
        # Accumulate one integral block into Ht2 in place.
        ic = i1 - i0
        jc = j1 - j0
        #:Ht2[j0:j1] += numpy.einsum('efx,efab->abx', t2T[i0:i1], eri)
        _dgemm('T', 'N', jc * nvirb, nocc2, ic * nvirb,
               eri.reshape(ic * nvirb, jc * nvirb), t2T.reshape(-1, nocc2),
               Ht2.reshape(nvir2, nocc2), 1, 1, 0, i0 * nvirb * nocc2,
               j0 * nvirb * nocc2)

    max_memory = max(MEMORYMIN, mycc.max_memory - lib.current_memory()[0])
    if vvvv is None:  # AO-direct CCSD
        ao_loc = mol.ao_loc_nr()
        intor = mol._add_suffix('int2e')
        ao2mopt = _ao2mo.AO2MOpt(mol, intor, 'CVHFnr_schwarz_cond',
                                 'CVHFsetnr_direct_scf')
        blksize = max(BLKMIN,
                      numpy.sqrt(max_memory * .9e6 / 8 / nvirb**2 / 2))
        fint = gto.moleintor.getints4c
        fload = ccsd._ccsd.libcc.CCload_eri

        ntasks = mpi.pool.size
        task_sh_locs = task_locs
        # Split the shell range of every task into blocks.
        sh_ranges_tasks = []
        for task in range(ntasks):
            sh0 = task_sh_locs[task]
            sh1 = task_sh_locs[task + 1]
            sh_ranges = ao2mo.outcore.balance_partition(
                ao_loc, blksize, sh0, sh1)
            sh_ranges_tasks.append(sh_ranges)

        # Size the buffers for the largest block over all tasks.
        blksize = max(
            max(x[2] for x in sh_ranges) if sh_ranges else 0
            for sh_ranges in sh_ranges_tasks)
        eribuf = numpy.empty((blksize, blksize, nvirb, nvirb))
        loadbuf = numpy.empty((blksize, blksize, nvirb, nvirb))

        ao_sh_ranges = sh_ranges_tasks[rank]
        ao_sh0 = task_sh_locs[rank]
        ao_sh1 = task_sh_locs[rank + 1]
        ao_offset = ao_loc[ao_sh0]
        # The local segment must span exactly this rank's AO range.
        assert (nvira == ao_loc[ao_sh1] - ao_loc[ao_sh0])

        # The loop variable deliberately rebinds ``t2T`` to the block
        # received from ``task_id``.
        for task_id, t2T in _rotate_tensor_block(t2T):
            sh_ranges = sh_ranges_tasks[task_id]
            sh0 = task_sh_locs[task_id]
            cur_offset = ao_loc[sh0]
            for ish0, ish1, ni in sh_ranges:
                for jsh0, jsh1, nj in ao_sh_ranges:
                    eri = fint(intor, mol._atm, mol._bas, mol._env,
                               shls_slice=(ish0, ish1, jsh0, jsh1),
                               aosym='s2kl', ao_loc=ao_loc,
                               cintopt=ao2mopt._cintopt, out=eribuf)
                    # AO ranges relative to the start of the remote (i) and
                    # local (j) segments.
                    i0, i1 = ao_loc[ish0] - cur_offset, ao_loc[
                        ish1] - cur_offset
                    j0, j1 = ao_loc[jsh0] - ao_offset, ao_loc[jsh1] - ao_offset
                    tmp = numpy.ndarray((i1 - i0, nvirb, j1 - j0, nvirb),
                                        buffer=loadbuf)
                    # NOTE(review): presumably unpacks the 's2kl' packed
                    # integrals into ``tmp`` -- confirm against the C source.
                    fload(tmp.ctypes.data_as(ctypes.c_void_p),
                          eri.ctypes.data_as(ctypes.c_void_p),
                          (ctypes.c_int * 4)(i0, i1, j0, j1),
                          ctypes.c_int(nvirb))
                    contract_blk_(Ht2, t2T, tmp, i0, i1, j0, j1)
                    time0 = log.timer_debug1(
                        'AO-vvvv [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                        *time0)
    else:
        raise NotImplementedError
    return Ht2
def _add_vvvv_tril(mycc, t1T, t2T, eris, out=None, with_ovvv=None):
    '''Ht2 = numpy.einsum('ijcd,acdb->ijab', t2, vvvv)
    Using symmetry t2[ijab] = t2[jiba] and Ht2[ijab] = Ht2[jiba], compute the
    lower triangular part of Ht2

    Only the AO-direct path (``mycc.direct``) is implemented.  tau is
    transformed to the AO basis, contracted with the AO integrals by
    ``_contract_vvvv_t2`` and transformed back; the blocks held by the other
    MPI tasks are obtained through ``_rotate_tensor_block``.

    Args:
        t1T : transposed t1, or None to use t2 alone as tau.
        t2T : local segment of the transposed t2, shape
            (nvir_seg, nvir, nocc, nocc).
        eris : integral object; ``eris.mo_coeff`` is used when available.
        out : optional buffer that backs the result array.
        with_ovvv : whether to add the t1-dependent terms as well; defaults
            to ``mycc.direct``.

    Returns:
        Ht2tril of shape (nvir_seg, nvir, nocc*(nocc+1)//2).

    Raises:
        NotImplementedError: if ``mycc.direct`` is false.
    '''
    time0 = logger.process_clock(), logger.perf_counter()
    log = logger.Logger(mycc.stdout, mycc.verbose)
    if with_ovvv is None:
        with_ovvv = mycc.direct
    nvir_seg, nvir, nocc = t2T.shape[:3]
    vloc0, vloc1 = _task_location(nvir, rank)
    nocc2 = nocc * (nocc + 1) // 2
    # Pack the (i, j) pair of tau into its lower triangle.
    if t1T is None:
        tau = lib.pack_tril(t2T.reshape(nvir_seg * nvir, nocc, nocc))
    else:
        tau = t2T + numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
        tau = lib.pack_tril(tau.reshape(nvir_seg * nvir, nocc, nocc))
    tau = tau.reshape(nvir_seg, nvir, nocc2)

    if mycc.direct:  # AO-direct CCSD
        mo = getattr(eris, 'mo_coeff', None)
        if mo is None:  # If eris does not have the attribute mo_coeff
            mo = _mo_without_core(mycc, mycc.mo_coeff)

        tau_shape = tau.shape
        ao_loc = mycc.mol.ao_loc_nr()
        orbv = mo[:, nocc:]
        nao, nvir = orbv.shape

        ntasks = mpi.pool.size
        # AO range handled by this rank.
        task_sh_locs = lib.misc._balanced_partition(ao_loc, ntasks)
        ao_loc0 = ao_loc[task_sh_locs[rank]]
        ao_loc1 = ao_loc[task_sh_locs[rank + 1]]

        # MO -> AO: the second virtual index locally, the first one by
        # rotating the blocks (the loop variable rebinds ``tau``).
        tau = lib.einsum('pb,abx->apx', orbv, tau)
        tau_priv = numpy.zeros((ao_loc1 - ao_loc0, nao, nocc2))
        for task_id, tau in _rotate_tensor_block(tau):
            loc0, loc1 = _task_location(nvir, task_id)
            tau_priv += lib.einsum('pa,abx->pbx',
                                   orbv[ao_loc0:ao_loc1, loc0:loc1], tau)
        tau = None
        time1 = log.timer_debug1('vvvv-tau mo2ao', *time0)

        buf = _contract_vvvv_t2(mycc, None, tau_priv, task_sh_locs, None, log)
        # buf_ao keeps the AO-basis result for the with_ovvv terms below.
        buf = buf_ao = buf.reshape(tau_priv.shape)
        tau_priv = None
        time1 = log.timer_debug1('vvvv-tau contraction', *time1)

        # AO -> MO, again rotating the blocks for the distributed index.
        buf = lib.einsum('apx,pb->abx', buf, orbv)
        Ht2tril = numpy.ndarray((nvir_seg, nvir, nocc2), buffer=out)
        Ht2tril[:] = 0
        for task_id, buf in _rotate_tensor_block(buf):
            ao_loc0 = ao_loc[task_sh_locs[task_id]]
            ao_loc1 = ao_loc[task_sh_locs[task_id + 1]]
            Ht2tril += lib.einsum('pa,pbx->abx',
                                  orbv[ao_loc0:ao_loc1, vloc0:vloc1], buf)

        time1 = log.timer_debug1('vvvv-tau ao2mo', *time1)

        if with_ovvv:
            #: tmp = numpy.einsum('ijcd,ak,kdcb->ijba', tau, t1T, eris.ovvv)
            #: t2new -= tmp + tmp.transpose(1,0,3,2)
            orbo = mo[:, :nocc]
            buf = lib.einsum('apx,pi->axi', buf_ao, orbo)
            tmp = numpy.zeros((nvir_seg, nocc2, nocc))
            for task_id, buf in _rotate_tensor_block(buf):
                ao_loc0 = ao_loc[task_sh_locs[task_id]]
                ao_loc1 = ao_loc[task_sh_locs[task_id + 1]]
                tmp += lib.einsum('pa,pxi->axi',
                                  orbv[ao_loc0:ao_loc1, vloc0:vloc1], buf)
            Ht2tril -= lib.einsum('axi,bi->abx', tmp, t1T)
            tmp = buf = None

            t1_ao = numpy.dot(orbo, t1T[vloc0:vloc1].T)
            buf = lib.einsum('apx,pb->abx', buf_ao, orbv)
            for task_id, buf in _rotate_tensor_block(buf):
                ao_loc0 = ao_loc[task_sh_locs[task_id]]
                ao_loc1 = ao_loc[task_sh_locs[task_id + 1]]
                Ht2tril -= lib.einsum('pa,pbx->abx', t1_ao[ao_loc0:ao_loc1],
                                      buf)

        # Timed from ``time0``, i.e. the total time spent in this function.
        time1 = log.timer_debug1('contracting vvvv-tau', *time0)
    else:
        raise NotImplementedError
    return Ht2tril
def update_amps(mycc, t1, t2, eris):
    """
    Update GCCD amplitudes.

    Only the doubles equation is active: every statement that would modify
    ``t1Tnew`` is commented out, so the returned singles are zeros.  ``t2``
    is handled in the transposed layout ``[a]bij`` in which the leading
    virtual index is the segment owned by this MPI rank.

    Args:
        mycc: CC object (``stdout``, ``verbose``, ``level_shift``).
        t1, t2: current amplitudes.
        eris: integral object (``fock``, ``mo_energy``, ``xvoo``, ``xooo``,
            ``xvvo``).

    Returns:
        (t1new, t2new) with ``t1new`` all zeros and ``t2new`` in the same
        layout as the input ``t2``.
    """
    time0 = logger.process_clock(), logger.perf_counter()
    log = logger.Logger(mycc.stdout, mycc.verbose)

    t1T = t1.T
    t2T = np.asarray(t2.transpose(2, 3, 0, 1), order='C')
    nvir_seg, nvir, nocc = t2T.shape[:3]
    t2 = None
    ntasks = mpi.pool.size
    # Virtual-orbital range owned by each task.
    vlocs = [_task_location(nvir, task_id) for task_id in range(ntasks)]
    vloc0, vloc1 = vlocs[rank]
    log.debug2('vlocs %s', vlocs)
    assert vloc1 - vloc0 == nvir_seg

    fock = eris.fock
    fvo = fock[nocc:, :nocc]
    mo_e_o = eris.mo_energy[:nocc]
    mo_e_v = eris.mo_energy[nocc:] + mycc.level_shift

    tauT_tilde = make_tauT(t1T, t2T, fac=0.5, vlocs=vlocs)
    Fvv = cc_Fvv(t1T, t2T, eris, tauT_tilde=tauT_tilde, vlocs=vlocs)
    Foo = cc_Foo(t1T, t2T, eris, tauT_tilde=tauT_tilde, vlocs=vlocs)
    tauT_tilde = None
    Fov = cc_Fov(t1T, eris, vlocs=vlocs)

    # Move energy terms to the other side
    Fvv[np.diag_indices(nvir)] -= mo_e_v
    Foo[np.diag_indices(nocc)] -= mo_e_o

    # T1 equation
    t1Tnew = np.zeros_like(t1T)
    #t1Tnew = np.dot(Fvv, t1T)
    #t1Tnew -= np.dot(t1T, Foo)

    # NOTE(review): ``tmp``/``tmp2`` below are still evaluated (including two
    # collective calls) although the statements that add them to t1Tnew are
    # commented out; the result is discarded when ``tmp`` is reassigned.
    tmp = einsum('aeim, me -> ai', t2T, Fov)
    #tmp -= np.einsum('fn, naif -> ai', t1T, eris.oxov, optimize=True)
    tmp = mpi.allgather(tmp)

    #tmp2 = einsum('eamn, mnie -> ai', t2T, eris.ooox)
    tmp2 = einsum('eamn, einm -> ai', t2T, eris.xooo)
    #tmp2 += einsum('efim, mafe -> ai', t2T, eris.ovvx)
    tmp2 += einsum('efim, efam -> ai', t2T, eris.xvvo)
    tmp2 *= 0.5
    tmp2 = mpi.allreduce_inplace(tmp2)
    tmp += tmp2
    tmp2 = None

    #t1Tnew += tmp
    #t1Tnew += fvo

    # T2 equation
    Ftmp = Fvv  #- 0.5 * np.dot(t1T, Fov)
    t2Tnew = einsum('aeij, be -> abij', t2T, Ftmp)
    # Antisymmetrise in (a, b): the transposed blocks live on other ranks,
    # so they are exchanged with alltoall and subtracted.
    t2T_tmp = mpi.alltoall_new([t2Tnew[:, p0:p1] for p0, p1 in vlocs],
                               split_recvbuf=True)
    for task_id, (p0, p1) in enumerate(vlocs):
        tmp = t2T_tmp[task_id].reshape(p1 - p0, nvir_seg, nocc, nocc)
        t2Tnew[:, p0:p1] -= tmp.transpose(1, 0, 2, 3)
        tmp = None
    t2T_tmp = None

    Ftmp = Foo  #+ 0.5 * np.dot(Fov, t1T)
    tmp = einsum('abim, mj -> abij', t2T, Ftmp)
    # Antisymmetrise in (i, j); both indices are local.
    t2Tnew -= tmp
    t2Tnew += tmp.transpose(0, 1, 3, 2)
    tmp = None

    t2Tnew += np.asarray(eris.xvoo)
    tauT = make_tauT(t1T, t2T, vlocs=vlocs)
    Woooo = cc_Woooo(t1T, t2T, eris, tauT=tauT, vlocs=vlocs)
    Woooo *= 0.5
    t2Tnew += einsum('abmn, mnij -> abij', tauT, Woooo)
    Woooo = None

    Wvvvv = cc_Wvvvv(t1T, t2T, eris, tauT=tauT, vlocs=vlocs)
    # Rotate the tau blocks through all tasks to complete the sum over e.
    for task_id, tauT_tmp, p0, p1 in _rotate_vir_block(tauT, vlocs=vlocs):
        tmp = einsum('abef, efij -> abij', Wvvvv[:, :, p0:p1], tauT_tmp)
        tmp *= 0.5
        t2Tnew += tmp
        tmp = tauT_tmp = None
    Wvvvv = None
    tauT = None

    #tmp = einsum('mbje, ei -> bmij', eris.oxov, t1T) # [b]mij
    #tmp = mpi.allgather(tmp) # bmij
    #tmp = einsum('am, bmij -> abij', t1T[vloc0:vloc1], tmp) # [a]bij
    tmp = 0.0
    Wvovo = cc_Wovvo(t1T, t2T, eris, vlocs=vlocs).transpose(2, 0, 1, 3)
    for task_id, w_tmp, p0, p1 in _rotate_vir_block(Wvovo, vlocs=vlocs):
        tmp += einsum('aeim, embj -> abij', t2T[:, p0:p1], w_tmp)
        w_tmp = None
    Wvovo = None
    # Antisymmetrise in (i, j) locally, then in (a, b) through alltoall.
    tmp = tmp - tmp.transpose(0, 1, 3, 2)

    t2Tnew += tmp
    tmpT = mpi.alltoall_new([tmp[:, p0:p1] for p0, p1 in vlocs],
                            split_recvbuf=True)
    for task_id, (p0, p1) in enumerate(vlocs):
        tmp = tmpT[task_id].reshape(p1 - p0, nvir_seg, nocc, nocc)
        t2Tnew[:, p0:p1] -= tmp.transpose(1, 0, 2, 3)
        tmp = None
    tmpT = None

    # Disabled t1-dependent doubles terms, kept for reference:
    #tmp = einsum('ei, jeba -> abij', t1T, eris.ovvx)
    #t2Tnew += tmp
    #t2Tnew -= tmp.transpose(0, 1, 3, 2)

    #tmp = einsum('am, ijmb -> baij', t1T, eris.ooox.conj())
    #t2Tnew += tmp
    #tmpT = mpi.alltoall([tmp[:, p0:p1] for p0, p1 in vlocs],
    #                    split_recvbuf=True)
    #for task_id, (p0, p1) in enumerate(vlocs):
    #    tmp = tmpT[task_id].reshape(p1-p0, nvir_seg, nocc, nocc)
    #    t2Tnew[:, p0:p1] -= tmp.transpose(1, 0, 2, 3)
    #    tmp = None
    #tmpT = None

    eia = mo_e_o[:, None] - mo_e_v
    #t1Tnew /= eia.T
    # Divide by the orbital-energy denominator, one local virtual row at a
    # time (``i`` is the global virtual index).
    for i in range(vloc0, vloc1):
        t2Tnew[i - vloc0] /= lib.direct_sum('i + jb -> bij', eia[:, i], eia)

    time0 = log.timer_debug1('update t1 t2', *time0)
    return t1Tnew.T, t2Tnew.transpose(2, 3, 0, 1)
def _eval_jk(mf, dm, hermi, gen_jobs):
    """Evaluate J/K-type matrices with dynamically distributed shell jobs.

    The basis is split into shell groups and ``gen_jobs`` turns them into
    jobs.  Every job is a sequence whose first item holds the four shell
    group ids and whose remaining items are "recipes"; each recipe is a list
    of index 4-tuples describing which density block is contracted and which
    block of the result it contributes to.  Jobs are handed out with
    ``mpi.work_stealing_partition`` and the partial results are reduced at
    the end.

    Args:
        mf: SCF object (``mol``, ``opt``, ``init_direct_scf``).
        dm: one density matrix or a stack of them.
        hermi: hermiticity flag, passed to ``gen_jobs`` and
            ``lib.hermi_triu``.
        gen_jobs: callable ``(ngroups, hermi) -> jobs``.

    Returns:
        Array of shape (n_recipes, n_dm, nao, nao).  It holds the reduced
        result on rank 0 and is zeroed on the other ranks.
    """
    cpu0 = (logger.process_clock(), logger.perf_counter())
    mol = mf.mol
    ao_loc = mol.ao_loc_nr()
    nao = ao_loc[-1]

    bas_groups = _partition_bas(mol)
    jobs = gen_jobs(len(bas_groups), hermi)
    njobs = len(jobs)
    logger.debug1(mf, 'njobs %d', njobs)

    # Each job has multiple recipes.
    n_recipes = len(jobs[0][1:])
    dm = numpy.asarray(dm).reshape(-1, nao, nao)
    n_dm = dm.shape[0]
    vk = numpy.zeros((n_recipes, n_dm, nao, nao))

    if mf.opt is None:
        vhfopt = mf.init_direct_scf(mol)
    else:
        vhfopt = mf.opt
    # Assign the entire dm_cond to vhfopt.
    # The prescreen function CVHFnrs8_prescreen will index q_cond and dm_cond
    # over the entire basis. "set_dm" in function jk.get_jk/direct_bindm only
    # creates a subblock of dm_cond which is not compatible with
    # CVHFnrs8_prescreen.
    vhfopt.set_dm(dm, mol._atm, mol._bas, mol._env)
    # Then skip the "set_dm" initialization in function jk.get_jk/direct_bindm.
    vhfopt._dmcondname = None

    logger.timer_debug1(mf, 'get_jk initialization', *cpu0)
    for job_id in mpi.work_stealing_partition(range(njobs)):
        group_ids = jobs[job_id][0]
        recipes = jobs[job_id][1:]

        shls_slice = lib.flatten([bas_groups[i] for i in group_ids])
        # AO (start, stop) for each of the four shell groups.
        loc = ao_loc[shls_slice].reshape(4, 2)

        # One density block per (density matrix, recipe, entry), in the
        # same order as ``scripts`` below.
        dm_blks = []
        for i_dm in range(n_dm):
            for recipe in recipes:
                for rec in recipe:
                    p0, p1 = loc[rec[0]]
                    q0, q1 = loc[rec[1]]
                    dm_blks.append(dm[i_dm, p0:p1, q0:q1])
        scripts = [
            'ijkl,%s%s->%s%s' % tuple(['ijkl'[x] for x in rec])
            for recipe in recipes for rec in recipe
        ] * n_dm

        kparts = jk.get_jk(mol, dm_blks, scripts, shls_slice=shls_slice,
                           vhfopt=vhfopt)

        # Walk through the results with an explicit offset.  The original
        # popped them with ``kparts[i + 1:]``, relying on the loop index
        # leaking out of the inner loop, which is stale or undefined when a
        # recipe is empty.
        offset = 0
        for i_dm in range(n_dm):
            for ir, recipe in enumerate(recipes):
                for i, rec in enumerate(recipe):
                    p0, p1 = loc[rec[2]]
                    q0, q1 = loc[rec[3]]
                    vk[ir, i_dm, p0:p1, q0:q1] += kparts[offset + i]
                offset += len(recipe)

    vk = mpi.reduce(vk)
    if rank == 0:
        if hermi:
            for i in range(n_recipes):
                for j in range(n_dm):
                    lib.hermi_triu(vk[i, j], hermi, inplace=True)
    else:
        # Zero out vk on workers. If reduce(get_jk()) is called twice,
        # non-zero vk on workers can cause error.
        vk[:] = 0
    logger.timer(mf, 'get_jk', *cpu0)
    return vk
def kernel(mycc, eris=None, t1=None, t2=None, l1=None, l2=None, max_cycle=50,
           tol=1e-6, verbose=None, fintermediates=None, fupdate=None,
           approx_l=False):
    """
    Iterative solver for the lambda amplitudes.

    The integrals are always taken from ``mycc._eris`` (built on demand);
    the ``eris`` argument is not used.  Missing amplitudes fall back to the
    ones stored on ``mycc``, and missing lambdas to the T amplitudes.  The
    singles (``t1``, ``l1``) are replaced by zero arrays before use.

    With ``approx_l`` the initial guess is stored and returned as converged
    without iterating.

    Returns:
        (conv, l1, l2); the lambdas are also stored on ``mycc``.
    """
    log = logger.new_logger(mycc, verbose)
    cput0 = (logger.process_clock(), logger.perf_counter())
    _sync_(mycc)

    eris = getattr(mycc, '_eris', None)
    if eris is None:
        mycc.ao2mo(mycc.mo_coeff)
        eris = mycc._eris

    # Starting amplitudes.
    if t1 is None:
        t1 = mycc.t1
    if t2 is None:
        t2 = mycc.t2
    if l1 is None:
        l1 = t1 if mycc.l1 is None else mycc.l1
    if l2 is None:
        l2 = t2 if mycc.l2 is None else mycc.l2

    # Singles are zeroed.
    t1 = np.zeros_like(t1)
    l1 = np.zeros_like(l1)

    if approx_l:
        mycc.l1 = l1
        mycc.l2 = l2
        return True, l1, l2

    if fintermediates is None:
        fintermediates = make_intermediates
    if fupdate is None:
        fupdate = update_lambda

    imds = fintermediates(mycc, t1, t2, eris)

    # Reuse a distributed DIIS object if one is given, create one if DIIS is
    # merely switched on, otherwise run without extrapolation.
    adiis = None
    if isinstance(mycc.diis, diis.DistributedDIIS):
        adiis = mycc.diis
    elif mycc.diis:
        adiis = diis.DistributedDIIS(mycc, mycc.diis_file)
        adiis.space = mycc.diis_space
    cput1 = log.timer('CCSD lambda initialization', *cput0)

    conv = False
    for cycle in range(max_cycle):
        l1_next, l2_next = fupdate(mycc, t1, t2, l1, l2, eris, imds)
        normt = _diff_norm(mycc, l1_next, l2_next, l1, l2)
        # Drop the previous iterate and the temporary names before DIIS.
        l1, l2 = l1_next, l2_next
        l1_next = l2_next = None
        l1, l2 = mycc.run_diis(l1, l2, cycle, normt, 0, adiis)
        log.info('cycle = %d norm(lambda1,lambda2) = %.6g', cycle + 1, normt)
        cput1 = log.timer('CCSD iter', *cput1)
        if normt < tol:
            conv = True
            break

    mycc.l1 = l1
    mycc.l2 = l2
    log.timer('CCSD lambda', *cput0)
    return conv, l1, l2
def _make_eris(mp, mo_coeff=None, verbose=None):
    """Build the (ov|ov) integrals for MP2 out of core, distributed over MPI.

    Pass 1 evaluates AO integral blocks whose last index is restricted to
    this process's shell segment, contracts the first and last indices with
    occupied orbitals, exchanges the pieces between processes and writes
    them to a temporary file.  Pass 2 reads them back in blocks of occupied
    orbitals, transforms the two remaining AO indices with the virtual
    orbitals and writes the result to the ``ovov`` dataset of ``eris.feri``.

    Args:
        mp: MP2 object; ``mol``, ``nocc``, ``nmo`` and ``max_memory`` are read.
        mo_coeff: Forwarded to ``mp2._ChemistsERIs``.
        verbose: Forwarded to ``logger.new_logger``.

    Returns:
        The ``eris`` object with ``feri`` and ``ovov`` set.  ``ovov`` has
        shape ``(nocc, nvir, nocc_seg, nvir)``.  The object is also stored
        as ``mp._eris``.
    """
    log = logger.new_logger(mp, verbose)
    time0 = (logger.process_clock(), logger.perf_counter())

    log.debug('transform (ia|jb) outcore')
    mol = mp.mol
    nocc = mp.nocc
    nmo = mp.nmo
    nvir = nmo - nocc

    eris = mp2._ChemistsERIs(mp, mo_coeff)
    nao = eris.mo_coeff.shape[0]
    assert(nvir <= nao)
    orbo = eris.mo_coeff[:,:nocc]
    # Virtual block copied as a Fortran-ordered array; it is handed to
    # _ao2mo.nr_e2 in pass 2.
    orbv = numpy.asarray(eris.mo_coeff[:,nocc:], order='F')
    eris.feri = lib.H5TmpFile()

    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    fint = gto.moleintor.getints4c

    # Range of occupied orbitals assigned to every task; (oloc0, oloc1) is
    # the range of this process.
    ntasks = mpi.pool.size
    olocs = [_task_location(nocc, task_id) for task_id in range(ntasks)]
    oloc0, oloc1 = olocs[rank]
    nocc_seg = oloc1 - oloc0
    log.debug2('olocs %s', olocs)

    # Shell boundaries per task; presumably chosen so that every task gets a
    # similar number of AOs -- confirm against lib.misc._balanced_partition.
    ao_loc = mol.ao_loc_nr()
    task_sh_locs = lib.misc._balanced_partition(ao_loc, ntasks)
    log.debug2('task_sh_locs %s', task_sh_locs)
    ao_sh0 = task_sh_locs[rank]
    ao_sh1 = task_sh_locs[rank+1]
    ao_loc0 = ao_loc[ao_sh0]
    ao_loc1 = ao_loc[ao_sh1]
    nao_seg = ao_loc1 - ao_loc0
    # Rows of the occupied coefficients that belong to this AO segment.
    orbo_seg = orbo[ao_loc0:ao_loc1]

    # Block size estimated from the memory still available.  The smallest
    # estimate over all processes is used, bounded below by BLKMIN and above
    # by nao//4+2.
    mem_now = lib.current_memory()[0]
    max_memory = max(0, mp.max_memory - mem_now)
    dmax = numpy.sqrt(max_memory*.9e6/8/((nao+nocc)*(nao_seg+nocc)))
    dmax = min(nao//4+2, max(BLKMIN, min(comm.allgather(dmax))))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    # Broadcast so that all processes loop over the same shell ranges
    # (presumably the root's partition -- confirm comm.bcast default root).
    sh_ranges = comm.bcast(sh_ranges)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao,dmax,dmax,nao_seg))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax, nocc*nocc_seg*(nao*(nao+dmax)/2+nvir**2)*8/1e6)

    def save(count, tmp_xo):
        # Split the first axis by the per-task occupied ranges, exchange the
        # pieces, add up what was received and store it as dataset
        # '<count>b' with shape (nocc_seg, nocc, di, dj).
        di, dj = tmp_xo.shape[2:4]
        tmp_xo = [tmp_xo[p0:p1] for p0, p1 in olocs]
        tmp_xo = mpi.alltoall(tmp_xo, split_recvbuf=True)
        tmp_xo = sum(tmp_xo).reshape(nocc_seg,nocc,di,dj)
        ftmp[str(count)+'b'] = tmp_xo

        # Second exchange, this time splitting the second axis; the received
        # pieces are stacked along the first axis and stored as '<count>a'.
        tmp_ox = mpi.alltoall([tmp_xo[:,p0:p1] for p0, p1 in olocs],
                              split_recvbuf=True)
        tmp_ox = [tmp_ox[i].reshape(p1-p0,nocc_seg,di,dj)
                  for i, (p0,p1) in enumerate(olocs)]
        ftmp[str(count)+'a'] = numpy.vstack(tmp_ox)

    # Pass 1: loop over pairs of shell ranges with j <= i only.
    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(save) as bg_save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            for jsh0, jsh1, nj in sh_ranges[:ip+1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                # Remember the AO ranges of this block; pass 2 uses them to
                # place the stored data.
                jk_blk_slices.append((i0,i1,j0,j1))

                shls_slice = (0,mol.nbas,ish0,ish1, jsh0,jsh1,ao_sh0,ao_sh1)
                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=shls_slice, aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                # Contract the first AO index with all occupied orbitals and
                # the last one with the local rows of the occupied orbitals.
                tmp_xo = lib.einsum('pi,pqrs->iqrs', orbo, eri)
                tmp_xo = lib.einsum('iqrs,sl->ilqr', tmp_xo, orbo_seg)
                bg_save(count, tmp_xo)
                tmp_xo = None
                count += 1
                time1 = log.timer_debug1('partial ao2mo [%d:%d,%d:%d]' %
                                         (ish0,ish1,jsh0,jsh1), *time1)
    eri = eribuf = None
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)

    eris.ovov = eris.feri.create_dataset('ovov', (nocc,nvir,nocc_seg,nvir), 'f8')
    occblk = int(min(nocc, max(BLKMIN, max_memory*.9e6/8/(nao**2*nocc_seg+1)/5)))

    def load(i0, eri):
        # Fill eri with the stored data for occupied orbitals
        # [i0, i0+occblk); does nothing once i0 has reached nocc.
        if i0 < nocc:
            i1 = min(i0+occblk, nocc)
            for k, (p0,p1,q0,q1) in enumerate(jk_blk_slices):
                eri[:i1-i0,:,p0:p1,q0:q1] = ftmp[str(k)+'a'][i0:i1]
                if p0 != q0:
                    # The mirrored AO block was not computed in pass 1; it is
                    # filled from the 'b' dataset with the occupied axes and
                    # the AO axes swapped.
                    dat = numpy.asarray(ftmp[str(k)+'b'][:,i0:i1])
                    eri[:i1-i0,:,q0:q1,p0:p1] = dat.transpose(1,0,3,2)

    def save(i0, i1, dat):
        eris.ovov[i0:i1] = dat

    # Two read buffers and two write buffers that are swapped every
    # iteration ("buf_prefecth" is the identifier's original spelling).
    buf_prefecth = numpy.empty((occblk,nocc_seg,nao,nao))
    buf = numpy.empty_like(buf_prefecth)
    bufw = numpy.empty((occblk*nocc_seg,nvir**2))
    bufw1 = numpy.empty_like(bufw)
    # Pass 2: transform the remaining AO indices block by block.
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as bsave:
            load(0, buf_prefecth)
            for i0, i1 in lib.prange(0, nocc, occblk):
                # The block loaded last becomes the working buffer; the next
                # block is requested in the background meanwhile.
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(i1, buf_prefecth)
                eri = buf[:i1-i0].reshape((i1-i0)*nocc_seg,nao,nao)

                dat = _ao2mo.nr_e2(eri, orbv, (0,nvir,0,nvir), 's1', 's1', out=bufw)
                bsave(i0, i1, dat.reshape(i1-i0,nocc_seg,nvir,nvir).transpose(0,2,1,3))
                # Alternate the output buffer, presumably so the next
                # transform does not overwrite data still being written.
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0,i1), *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    mp._eris = eris
    return eris