def kernel(mycc, eris=None, t1=None, t2=None, max_cycle=50, tol=1e-8,
           tolnormt=1e-6, verbose=None):
    """Drive the CCSD amplitude iterations for ``mycc``.

    Args:
        mycc: coupled-cluster object providing ``ao2mo``, ``energy``,
            ``update_amps``, ``run_diis`` and ``get_init_guess``.
        eris: not used as passed; the integrals are always taken from
            ``mycc._eris`` (built through ``mycc.ao2mo`` when missing).
        t1, t2: starting amplitudes.  Missing ones fall back to
            ``mycc.t1`` / ``mycc.t2`` and then to ``mycc.get_init_guess``.
        max_cycle: maximum number of amplitude updates.
        tol: threshold on the absolute energy change.
        tolnormt: threshold on the amplitude-change norm.
        verbose: forwarded to ``logger.new_logger``.

    Returns:
        ``(conv, e_corr, t1, t2)``.  The energy and amplitudes are also
        stored on ``mycc`` as ``e_corr``, ``t1`` and ``t2``.
    """
    log = logger.new_logger(mycc, verbose)
    t_total = (logger.process_clock(), logger.perf_counter())
    _sync_(mycc)

    eris = getattr(mycc, '_eris', None)
    if eris is None:
        mycc.ao2mo(mycc.mo_coeff)
        eris = mycc._eris

    # Per-iteration timer starts after the integrals are available.
    t_cycle = (logger.process_clock(), logger.perf_counter())

    # Prefer amplitudes already stored on the object as the initial guess.
    if t1 is None:
        t1 = mycc.t1
    if t2 is None:
        t2 = mycc.t2
    if t2 is None:
        guess = mycc.get_init_guess(eris)
        if t1 is None:
            t1, t2 = guess
        else:
            t2 = guess[1]

    e_prev = 0
    e_now = mycc.energy(t1, t2, eris)
    log.info('Init E(CCSD) = %.15g', e_now)

    # Reuse a ready-made distributed DIIS object, or build one if enabled.
    accel = None
    if isinstance(mycc.diis, diis.DistributedDIIS):
        accel = mycc.diis
    elif mycc.diis:
        accel = diis.DistributedDIIS(mycc, mycc.diis_file)
        accel.space = mycc.diis_space

    converged = False
    for cycle in range(max_cycle):
        next_t1, next_t2 = mycc.update_amps(t1, t2, eris)
        amp_change = _diff_norm(mycc, next_t1, next_t2, t1, t2)
        t1, t2 = next_t1, next_t2
        # Drop the extra references to the freshly computed amplitudes.
        next_t1 = next_t2 = None

        t1, t2 = mycc.run_diis(t1, t2, cycle, amp_change, e_now - e_prev,
                               accel)
        e_prev = e_now
        e_now = mycc.energy(t1, t2, eris)
        log.info('cycle = %d E(CCSD) = %.15g dE = %.9g norm(t1,t2) = %.6g',
                 cycle + 1, e_now, e_now - e_prev, amp_change)
        t_cycle = log.timer('CCSD iter', *t_cycle)

        if abs(e_now - e_prev) < tol and amp_change < tolnormt:
            converged = True
            break

    mycc.e_corr = e_now
    mycc.t1 = t1
    mycc.t2 = t2
    log.timer('CCSD', *t_total)
    return converged, e_now, t1, t2
def kernel(mycc, eris=None):
    """Evaluate the CCSD(T) correction with work shared across MPI ranks.

    Args:
        mycc: coupled-cluster object; ``mycc.t1`` must be available and
            ``mycc._eris`` is generated through ``mycc.ao2mo`` when missing.
        eris: not used as passed; the integrals are always read from
            ``mycc._eris``.

    Returns:
        The real part of the all-reduced sum of the per-rank contributions
        (each multiplied by 2), as logged under 'CCSD(T) correction'.
    """
    # time.clock() was removed in Python 3.8; use the logger's clock pair.
    cpu0 = (logger.process_clock(), logger.perf_counter())
    ccsd._sync_(mycc)
    log = logger.new_logger(mycc)

    eris = getattr(mycc, '_eris', None)
    if eris is None:
        mycc.ao2mo(mycc.mo_coeff)
        eris = mycc._eris

    t1T = numpy.asarray(mycc.t1.T, order='C')
    nvir, nocc = t1T.shape

    fvo = eris.fock[nocc:, :nocc].copy()
    mo_energy = eris.mo_energy.copy()
    # One-element accumulator that the C driver updates in place.
    et_sum = numpy.zeros(1, dtype=t1T.dtype)
    drv = _ccsd.libcc.MPICCsd_t_contract
    # A list (not a tuple) so the background closure can update it in place.
    cpu2 = [logger.process_clock(), logger.perf_counter()]

    def contract(slices, data):
        # ``data`` holds 12 arrays; the original notes name them
        # vvop_ab, vvop_ac, vvop_ba, vvop_bc, vvop_ca, vvop_cb,
        # vooo_a, vooo_b, vooo_c, t2T_a, t2T_b, t2T_c.
        data_ptrs = [x.ctypes.data_as(ctypes.c_void_p) for x in data]
        data_ptrs = (ctypes.c_void_p*12)(*data_ptrs)
        drv(et_sum.ctypes.data_as(ctypes.c_void_p),
            mo_energy.ctypes.data_as(ctypes.c_void_p),
            t1T.ctypes.data_as(ctypes.c_void_p),
            fvo.ctypes.data_as(ctypes.c_void_p),
            ctypes.c_int(nocc), ctypes.c_int(nvir),
            (ctypes.c_int*6)(*slices), data_ptrs)
        cpu2[:] = log.alltimer_debug1('contract'+str(slices), *cpu2)

    with GlobalDataHandler(mycc) as daemon:
        v_seg_ranges = daemon.data_partition
        # Every ordered triple of segments with index a >= b >= c.
        tasks = [(a0, a1, b0, b1, c0, c1)
                 for ka, (a0, a1) in enumerate(v_seg_ranges)
                 for kb, (b0, b1) in enumerate(v_seg_ranges[:ka+1])
                 for c0, c1 in v_seg_ranges[:kb+1]]
        log.debug('ntasks = %d', len(tasks))

        task_count = 0
        with lib.call_in_background(contract) as async_contract:
            # The original notes mention mpi.static_partition and
            # mpi.work_stealing_partition as alternative schedulers.
            for task in mpi.work_share_partition(tasks, loadmin=2):
                log.alldebug2('request for segment %s', task)
                data = [None] * 12
                daemon.request_(task, data)
                async_contract(task, data)
                task_count += 1
        log.alldebug1('task_count = %d', task_count)

    et = comm.allreduce(et_sum[0] * 2).real
    log.timer('CCSD(T)', *cpu0)
    log.note('CCSD(T) correction = %.15g', et)
    return et
def start(self, interval=0.02):
    """Stage the local 'vvop' data on disk and launch the serving thread.

    A temporary HDF5 dataset named 'vvop' is filled block by block from
    ``eris.ovov`` and the all-to-all exchanged ``eris.vvvo`` slices.  Then a
    thread is started that answers INQUIRY messages until a 'Done' task
    arrives.

    Args:
        interval: sleep time in seconds between polls for requests.

    Returns:
        The started ``threading.Thread``.
    """
    cc = self._cc
    log = logger.new_logger(cc)
    tick = (logger.process_clock(), logger.perf_counter())
    eris = cc._eris
    # Only the dtype of this transposed view is used below.
    t2T = cc.t2.transpose(2, 3, 0, 1)

    nocc, nvir = cc.t1.shape
    nmo = nocc + nvir
    seg_start, seg_stop = self.vranges[rank]
    nvir_seg = seg_stop - seg_start

    max_memory = min(24000, cc.max_memory - lib.current_memory()[0])
    rows_in_memory = int(max_memory * .3e6 / 8 / (nvir * nocc * nmo))
    blksize = min(nvir_seg // 4 + 1, max(16, rows_in_memory))

    self.eri_tmp = lib.H5TmpFile()
    vvop = self.eri_tmp.create_dataset(
        'vvop', (nvir_seg, nvir, nocc, nmo), 'f8')

    def save_vvop(j0, j1, vvvo):
        nrow = j1 - j0
        buf = numpy.empty((nrow, nvir, nocc, nmo), dtype=t2T.dtype)
        # Occupied part of the last axis comes from the local ovov block.
        buf[:, :, :, :nocc] = eris.ovov[:, j0:j1].conj().transpose(1, 3, 0, 2)
        # Remaining part is assembled from the pieces received per rank.
        for k, (q0, q1) in enumerate(self.vranges):
            piece = vvvo[k].reshape(q1 - q0, nvir, nrow, nocc)
            buf[:, q0:q1, :, nocc:] = piece.transpose(2, 0, 3, 1)
        vvop[j0:j1] = buf

    with lib.call_in_background(save_vvop) as async_save:
        for p0, p1 in mpi.prange(seg_start, seg_stop, blksize):
            sub_locs = comm.allgather((p0, p1))
            outgoing = [eris.vvvo[:, :, q0:q1] for q0, q1 in sub_locs]
            vvvo = mpi.alltoall_new(outgoing, split_recvbuf=True)
            async_save(p0 - seg_start, p1 - seg_start, vvvo)
            tick = log.timer_debug1('transpose %d:%d' % (p0, p1), *tick)

    def send_data():
        while True:
            # Drain every pending request before sleeping again.
            while comm.Iprobe(source=MPI.ANY_SOURCE, tag=INQUIRY):
                tensors, dest = comm.recv(source=MPI.ANY_SOURCE, tag=INQUIRY)
                for task, slices in tensors:
                    if task == 'Done':
                        return
                    mpi.send(self._get_tensor(task, slices), dest,
                             tag=TRANSFER_DATA)
            time.sleep(interval)

    daemon = threading.Thread(target=send_data)
    daemon.start()
    return daemon
def kernel(mycc, eris=None, t1=None, t2=None, max_cycle=50, tol=1e-8,
           tolnormt=1e-6, verbose=None):
    """Iterate the CCSD amplitude equations until converged.

    Args:
        mycc: coupled-cluster object providing ``ao2mo``, ``energy``,
            ``update_amps``, ``run_diis`` and ``get_init_guess``.
        eris: not used as passed; the integrals are always taken from
            ``mycc._eris`` (built through ``mycc.ao2mo`` when missing).
        t1, t2: starting amplitudes.  Missing ones fall back to
            ``mycc.t1`` / ``mycc.t2`` and then to ``mycc.get_init_guess``.
        max_cycle: maximum number of amplitude updates.
        tol: convergence threshold on the absolute energy change.
        tolnormt: convergence threshold on the amplitude-change norm.
        verbose: forwarded to ``logger.new_logger``.

    Returns:
        ``(conv, eccsd, t1, t2)``.  ``eccsd``, ``t1`` and ``t2`` are also
        stored on ``mycc`` as ``e_corr``, ``t1`` and ``t2``.
    """
    log = logger.new_logger(mycc, verbose)
    # time.clock() was removed in Python 3.8; use the logger's clock pair.
    cput1 = cput0 = (logger.process_clock(), logger.perf_counter())
    _sync_(mycc)

    eris = getattr(mycc, '_eris', None)
    if eris is None:
        mycc.ao2mo(mycc.mo_coeff)
        eris = mycc._eris

    # Use the existing amplitudes as the initial guess.
    if t1 is None:
        t1 = mycc.t1
    if t2 is None:
        t2 = mycc.t2
    if t1 is None and t2 is None:
        t1, t2 = mycc.get_init_guess(eris)
    elif t2 is None:
        t2 = mycc.get_init_guess(eris)[1]

    eold = 0
    eccsd = mycc.energy(t1, t2, eris)
    log.info('Init E(CCSD) = %.15g', eccsd)

    # Reuse a ready-made distributed DIIS object, or build one if enabled.
    if isinstance(mycc.diis, diis.DistributedDIIS):
        adiis = mycc.diis
    elif mycc.diis:
        adiis = diis.DistributedDIIS(mycc, mycc.diis_file)
        adiis.space = mycc.diis_space
    else:
        adiis = None

    conv = False
    for istep in range(max_cycle):
        t1new, t2new = mycc.update_amps(t1, t2, eris)
        normt = _diff_norm(mycc, t1new, t2new, t1, t2)
        t1, t2 = t1new, t2new
        # Drop the extra references to the freshly computed amplitudes.
        t1new = t2new = None

        t1, t2 = mycc.run_diis(t1, t2, istep, normt, eccsd-eold, adiis)
        eold, eccsd = eccsd, mycc.energy(t1, t2, eris)
        log.info('cycle = %d E(CCSD) = %.15g dE = %.9g norm(t1,t2) = %.6g',
                 istep+1, eccsd, eccsd - eold, normt)
        cput1 = log.timer('CCSD iter', *cput1)
        if abs(eccsd-eold) < tol and normt < tolnormt:
            conv = True
            break

    mycc.e_corr = eccsd
    mycc.t1 = t1
    mycc.t2 = t2
    log.timer('CCSD', *cput0)
    return conv, eccsd, t1, t2
def start(self, interval=0.02):
    """Stage the local 'vvop' data on disk and launch the serving thread.

    A temporary HDF5 dataset named 'vvop' is filled block by block from
    ``eris.ovov`` and the all-to-all exchanged ``eris.vvvo`` slices.  Then a
    thread is started that answers INQUIRY messages until a 'Done' task
    arrives.

    Args:
        interval: sleep time in seconds between polls for requests.

    Returns:
        The started ``threading.Thread``.
    """
    mycc = self._cc
    log = logger.new_logger(mycc)
    # time.clock() was removed in Python 3.8; use the logger's clock pair.
    cpu1 = (logger.process_clock(), logger.perf_counter())
    eris = mycc._eris
    # Only the dtype of this transposed view is used below.
    t2T = mycc.t2.transpose(2, 3, 0, 1)

    nocc, nvir = mycc.t1.shape
    nmo = nocc + nvir
    vloc0, vloc1 = self.vranges[rank]
    nvir_seg = vloc1 - vloc0

    max_memory = min(24000, mycc.max_memory - lib.current_memory()[0])
    blksize = min(nvir_seg//4+1,
                  max(16, int(max_memory*.3e6/8/(nvir*nocc*nmo))))
    self.eri_tmp = lib.H5TmpFile()
    vvop = self.eri_tmp.create_dataset('vvop', (nvir_seg, nvir, nocc, nmo),
                                       'f8')

    def save_vvop(j0, j1, vvvo):
        buf = numpy.empty((j1-j0, nvir, nocc, nmo), dtype=t2T.dtype)
        # Occupied part of the last axis comes from the local ovov block.
        buf[:, :, :, :nocc] = eris.ovov[:, j0:j1].conj().transpose(1, 3, 0, 2)
        # Remaining part is assembled from the pieces received per rank.
        for k, (q0, q1) in enumerate(self.vranges):
            blk = vvvo[k].reshape(q1-q0, nvir, j1-j0, nocc)
            buf[:, q0:q1, :, nocc:] = blk.transpose(2, 0, 3, 1)
        vvop[j0:j1] = buf

    with lib.call_in_background(save_vvop) as save_vvop:
        for p0, p1 in mpi.prange(vloc0, vloc1, blksize):
            j0, j1 = p0 - vloc0, p1 - vloc0
            sub_locs = comm.allgather((p0, p1))
            vvvo = mpi.alltoall([eris.vvvo[:, :, q0:q1]
                                 for q0, q1 in sub_locs],
                                split_recvbuf=True)
            save_vvop(j0, j1, vvvo)
            cpu1 = log.timer_debug1('transpose %d:%d' % (p0, p1), *cpu1)

    def send_data():
        while True:
            # Drain every pending request before sleeping again.
            while comm.Iprobe(source=MPI.ANY_SOURCE, tag=INQUIRY):
                tensors, dest = comm.recv(source=MPI.ANY_SOURCE, tag=INQUIRY)
                for task, slices in tensors:
                    if task == 'Done':
                        return
                    mpi.send(self._get_tensor(task, slices), dest,
                             tag=TRANSFER_DATA)
            time.sleep(interval)

    daemon = threading.Thread(target=send_data)
    daemon.start()
    return daemon
def kernel(mycc, eris=None, t1=None, t2=None, l1=None, l2=None,
           max_cycle=50, tol=1e-6, verbose=None,
           fintermediates=None, fupdate=None, approx_l=False):
    """
    CCSD lambda kernel.

    Args:
        mycc: coupled-cluster object; integrals are always taken from
            ``mycc._eris`` (built through ``mycc.ao2mo`` when missing), so
            the ``eris`` argument is not used as passed.
        t1, t2: amplitudes, defaulting to ``mycc.t1`` / ``mycc.t2``.
        l1, l2: starting lambda amplitudes, defaulting to ``mycc.l1`` /
            ``mycc.l2`` and then to ``t1`` / ``t2``.
        max_cycle: maximum number of lambda updates.
        tol: convergence threshold on the lambda-change norm.
        verbose: forwarded to ``logger.new_logger``.
        fintermediates: intermediates builder (default ``make_intermediates``).
        fupdate: lambda update function (default ``update_lambda``).
        approx_l: when true, store and return the starting lambdas without
            iterating.

    Returns:
        ``(conv, l1, l2)``; ``l1`` and ``l2`` are also stored on ``mycc``.
    """
    log = logger.new_logger(mycc, verbose)
    t_start = (logger.process_clock(), logger.perf_counter())
    _sync_(mycc)

    eris = getattr(mycc, '_eris', None)
    if eris is None:
        mycc.ao2mo(mycc.mo_coeff)
        eris = mycc._eris

    if t1 is None:
        t1 = mycc.t1
    if t2 is None:
        t2 = mycc.t2
    if l1 is None:
        l1 = t1 if mycc.l1 is None else mycc.l1
    if l2 is None:
        l2 = t2 if mycc.l2 is None else mycc.l2

    # The singles are replaced by zero arrays of the same shape.
    t1 = np.zeros_like(t1)
    l1 = np.zeros_like(l1)

    if approx_l:
        mycc.l1 = l1
        mycc.l2 = l2
        return True, l1, l2

    build_imds = make_intermediates if fintermediates is None else fintermediates
    step = update_lambda if fupdate is None else fupdate

    imds = build_imds(mycc, t1, t2, eris)

    # Reuse a ready-made distributed DIIS object, or build one if enabled.
    accel = None
    if isinstance(mycc.diis, diis.DistributedDIIS):
        accel = mycc.diis
    elif mycc.diis:
        accel = diis.DistributedDIIS(mycc, mycc.diis_file)
        accel.space = mycc.diis_space

    t_cycle = log.timer('CCSD lambda initialization', *t_start)

    converged = False
    for cycle in range(max_cycle):
        next_l1, next_l2 = step(mycc, t1, t2, l1, l2, eris, imds)
        change = _diff_norm(mycc, next_l1, next_l2, l1, l2)
        l1, l2 = next_l1, next_l2
        # Drop the extra references to the freshly computed lambdas.
        next_l1 = next_l2 = None

        l1, l2 = mycc.run_diis(l1, l2, cycle, change, 0, accel)
        log.info('cycle = %d norm(lambda1,lambda2) = %.6g', cycle + 1, change)
        t_cycle = log.timer('CCSD iter', *t_cycle)
        if change < tol:
            converged = True
            break

    mycc.l1 = l1
    mycc.l2 = l2
    log.timer('CCSD lambda', *t_start)
    return converged, l1, l2
def _contract_vvvv_t2(mycc, vvvv, t2T, task_locs, out=None, verbose=None):
    '''Ht2 = numpy.einsum('ijcd,acbd->ijab', t2, vvvv)
    where vvvv has to be real and has the 4-fold permutation symmetry

    Args:
        vvvv : None or integral object
            if vvvv is None, contract t2 to AO-integrals using AO-direct
            algorithm.  Any other value raises NotImplementedError.
        t2T : array or callable returning an array
            Must have dtype numpy.double.  Its first axis has to match the
            AO range owned by the current rank (asserted below).
        task_locs : sequence
            Shell boundaries; task ``k`` covers shells
            ``task_locs[k]:task_locs[k+1]``.
        out : buffer, optional
            Storage used for the result array.

    Returns:
        Ht2, an array shaped like ``t2T`` after its trailing axes are
        flattened into one.
    '''
    # time.clock() was removed in Python 3.8; use the logger's clock pair.
    time0 = logger.process_clock(), logger.perf_counter()
    mol = mycc.mol
    log = logger.new_logger(mycc, verbose)

    if callable(t2T):
        t2T = t2T()
    assert (t2T.dtype == numpy.double)
    nvira, nvirb = t2T.shape[:2]
    nvir2 = nvira * nvirb
    t2T = t2T.reshape(nvira, nvirb, -1)
    nocc2 = t2T.shape[2]
    Ht2 = numpy.ndarray(t2T.shape, dtype=t2T.dtype, buffer=out)
    Ht2[:] = 0

    _dgemm = lib.numpy_helper._dgemm

    def contract_blk_(Ht2, t2T, eri, i0, i1, j0, j1):
        ic = i1 - i0
        jc = j1 - j0
        #:Ht2[j0:j1] += numpy.einsum('efx,efab->abx', t2T[i0:i1], eri)
        _dgemm('T', 'N', jc * nvirb, nocc2, ic * nvirb,
               eri.reshape(ic * nvirb, jc * nvirb), t2T.reshape(-1, nocc2),
               Ht2.reshape(nvir2, nocc2), 1, 1, 0,
               i0 * nvirb * nocc2, j0 * nvirb * nocc2)

    max_memory = max(MEMORYMIN, mycc.max_memory - lib.current_memory()[0])
    if vvvv is not None:
        # Only the AO-direct path is implemented.
        raise NotImplementedError

    # AO-direct CCSD
    ao_loc = mol.ao_loc_nr()
    intor = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, intor, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    blksize = max(BLKMIN, numpy.sqrt(max_memory * .9e6 / 8 / nvirb**2 / 2))
    fint = gto.moleintor.getints4c
    fload = ccsd._ccsd.libcc.CCload_eri

    ntasks = mpi.pool.size
    task_sh_locs = task_locs
    # Shell-block partition of every task's shell range.
    sh_ranges_tasks = [
        ao2mo.outcore.balance_partition(ao_loc, blksize,
                                        task_sh_locs[task],
                                        task_sh_locs[task + 1])
        for task in range(ntasks)
    ]

    # Size the work buffers for the largest block found on any task.
    blksize = max(max(x[2] for x in sh_ranges) if sh_ranges else 0
                  for sh_ranges in sh_ranges_tasks)
    eribuf = numpy.empty((blksize, blksize, nvirb, nvirb))
    loadbuf = numpy.empty((blksize, blksize, nvirb, nvirb))

    ao_sh_ranges = sh_ranges_tasks[rank]
    ao_sh0 = task_sh_locs[rank]
    ao_sh1 = task_sh_locs[rank + 1]
    ao_offset = ao_loc[ao_sh0]
    assert (nvira == ao_loc[ao_sh1] - ao_loc[ao_sh0])

    for task_id, t2T in _rotate_tensor_block(t2T):
        sh_ranges = sh_ranges_tasks[task_id]
        sh0 = task_sh_locs[task_id]
        cur_offset = ao_loc[sh0]
        for ish0, ish1, _ in sh_ranges:
            for jsh0, jsh1, _ in ao_sh_ranges:
                eri = fint(intor, mol._atm, mol._bas, mol._env,
                           shls_slice=(ish0, ish1, jsh0, jsh1),
                           aosym='s2kl', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                # Offsets relative to the block being contracted (i) and
                # to the block owned by this rank (j).
                i0, i1 = ao_loc[ish0] - cur_offset, ao_loc[ish1] - cur_offset
                j0, j1 = ao_loc[jsh0] - ao_offset, ao_loc[jsh1] - ao_offset
                tmp = numpy.ndarray((i1 - i0, nvirb, j1 - j0, nvirb),
                                    buffer=loadbuf)
                fload(tmp.ctypes.data_as(ctypes.c_void_p),
                      eri.ctypes.data_as(ctypes.c_void_p),
                      (ctypes.c_int * 4)(i0, i1, j0, j1),
                      ctypes.c_int(nvirb))
                contract_blk_(Ht2, t2T, tmp, i0, i1, j0, j1)
                time0 = log.timer_debug1(
                    'AO-vvvv [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                    *time0)
    return Ht2
def _contract_vvvv_t2(mycc, vvvv, t2T, task_locs, out=None, verbose=None):
    '''Ht2 = numpy.einsum('ijcd,acbd->ijab', t2, vvvv)
    where vvvv has to be real and has the 4-fold permutation symmetry

    Args:
        vvvv : None or integral object
            if vvvv is None, contract t2 to AO-integrals using AO-direct
            algorithm.  Any other value raises NotImplementedError.
        t2T : array or callable returning an array
            Must have dtype numpy.double.  Its first axis has to match the
            AO range owned by the current rank (asserted below).
        task_locs : sequence
            Shell boundaries; task ``k`` covers shells
            ``task_locs[k]:task_locs[k+1]``.
        out : buffer, optional
            Storage used for the result array.

    Returns:
        Ht2, an array shaped like ``t2T`` after its trailing axes are
        flattened into one.
    '''
    # time.clock() was removed in Python 3.8; use the logger's clock pair.
    time0 = logger.process_clock(), logger.perf_counter()
    mol = mycc.mol
    log = logger.new_logger(mycc, verbose)

    if callable(t2T):
        t2T = t2T()
    assert(t2T.dtype == numpy.double)
    nvira, nvirb = t2T.shape[:2]
    nvir2 = nvira * nvirb
    t2T = t2T.reshape(nvira,nvirb,-1)
    nocc2 = t2T.shape[2]
    Ht2 = numpy.ndarray(t2T.shape, dtype=t2T.dtype, buffer=out)
    Ht2[:] = 0

    _dgemm = lib.numpy_helper._dgemm

    def contract_blk_(Ht2, t2T, eri, i0, i1, j0, j1):
        ic = i1 - i0
        jc = j1 - j0
        #:Ht2[j0:j1] += numpy.einsum('efx,efab->abx', t2T[i0:i1], eri)
        _dgemm('T', 'N', jc*nvirb, nocc2, ic*nvirb,
               eri.reshape(ic*nvirb,jc*nvirb), t2T.reshape(-1,nocc2),
               Ht2.reshape(nvir2,nocc2), 1, 1, 0,
               i0*nvirb*nocc2, j0*nvirb*nocc2)

    max_memory = max(MEMORYMIN, mycc.max_memory - lib.current_memory()[0])
    if vvvv is not None:
        # Only the AO-direct path is implemented.
        raise NotImplementedError

    # AO-direct CCSD
    ao_loc = mol.ao_loc_nr()
    intor = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, intor, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    blksize = max(BLKMIN, numpy.sqrt(max_memory*.9e6/8/nvirb**2/2))
    fint = gto.moleintor.getints4c
    fload = ccsd._ccsd.libcc.CCload_eri

    ntasks = mpi.pool.size
    task_sh_locs = task_locs
    # Shell-block partition of every task's shell range.
    sh_ranges_tasks = [
        ao2mo.outcore.balance_partition(ao_loc, blksize,
                                        task_sh_locs[task],
                                        task_sh_locs[task+1])
        for task in range(ntasks)
    ]

    # Size the work buffers for the largest block found on any task.
    blksize = max(max(x[2] for x in sh_ranges) if sh_ranges else 0
                  for sh_ranges in sh_ranges_tasks)
    eribuf = numpy.empty((blksize,blksize,nvirb,nvirb))
    loadbuf = numpy.empty((blksize,blksize,nvirb,nvirb))

    ao_sh_ranges = sh_ranges_tasks[rank]
    ao_sh0 = task_sh_locs[rank]
    ao_sh1 = task_sh_locs[rank+1]
    ao_offset = ao_loc[ao_sh0]
    assert(nvira == ao_loc[ao_sh1] - ao_loc[ao_sh0])

    for task_id, t2T in _rotate_tensor_block(t2T):
        sh_ranges = sh_ranges_tasks[task_id]
        sh0 = task_sh_locs[task_id]
        cur_offset = ao_loc[sh0]
        for ish0, ish1, _ in sh_ranges:
            for jsh0, jsh1, _ in ao_sh_ranges:
                eri = fint(intor, mol._atm, mol._bas, mol._env,
                           shls_slice=(ish0,ish1,jsh0,jsh1),
                           aosym='s2kl', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                # Offsets relative to the block being contracted (i) and
                # to the block owned by this rank (j).
                i0, i1 = ao_loc[ish0] - cur_offset, ao_loc[ish1] - cur_offset
                j0, j1 = ao_loc[jsh0] - ao_offset, ao_loc[jsh1] - ao_offset
                tmp = numpy.ndarray((i1-i0,nvirb,j1-j0,nvirb),
                                    buffer=loadbuf)
                fload(tmp.ctypes.data_as(ctypes.c_void_p),
                      eri.ctypes.data_as(ctypes.c_void_p),
                      (ctypes.c_int*4)(i0, i1, j0, j1),
                      ctypes.c_int(nvirb))
                contract_blk_(Ht2, t2T, tmp, i0, i1, j0, j1)
                time0 = log.timer_debug1('AO-vvvv [%d:%d,%d:%d]' %
                                         (ish0,ish1,jsh0,jsh1), *time0)
    return Ht2
def _make_eris(mp, mo_coeff=None, verbose=None):
    """Build the 'ovov' integrals for ``mp`` with an MPI out-of-core scheme.

    Pass 1 evaluates AO integral blocks, contracts two indices with the
    occupied orbitals, exchanges the pieces between ranks and writes them to
    a temporary HDF5 file.  Pass 2 reads them back in blocks of occupied
    indices, transforms the remaining AO pair with the virtual orbitals and
    stores the result in the 'ovov' dataset of ``eris.feri``.

    Args:
        mp: MP2 object providing ``mol``, ``nocc``, ``nmo`` and
            ``max_memory``.
        mo_coeff: forwarded to ``mp2._ChemistsERIs``.
        verbose: forwarded to ``logger.new_logger``.

    Returns:
        The integrals object, which is also stored as ``mp._eris``.
    """
    log = logger.new_logger(mp, verbose)
    # time.clock() was removed in Python 3.8; use the logger's clock pair.
    time0 = (logger.process_clock(), logger.perf_counter())

    log.debug('transform (ia|jb) outcore')
    mol = mp.mol
    nocc = mp.nocc
    nmo = mp.nmo
    nvir = nmo - nocc

    eris = mp2._ChemistsERIs(mp, mo_coeff)
    nao = eris.mo_coeff.shape[0]
    assert (nvir <= nao)
    orbo = eris.mo_coeff[:, :nocc]
    orbv = numpy.asarray(eris.mo_coeff[:, nocc:], order='F')
    eris.feri = lib.H5TmpFile()

    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    fint = gto.moleintor.getints4c

    # Occupied-index range handled by each rank.
    ntasks = mpi.pool.size
    olocs = [_task_location(nocc, task_id) for task_id in range(ntasks)]
    oloc0, oloc1 = olocs[rank]
    nocc_seg = oloc1 - oloc0
    log.debug2('olocs %s', olocs)

    # AO shell range handled by this rank.
    ao_loc = mol.ao_loc_nr()
    task_sh_locs = lib.misc._balanced_partition(ao_loc, ntasks)
    log.debug2('task_sh_locs %s', task_sh_locs)
    ao_sh0 = task_sh_locs[rank]
    ao_sh1 = task_sh_locs[rank + 1]
    ao_loc0 = ao_loc[ao_sh0]
    ao_loc1 = ao_loc[ao_sh1]
    nao_seg = ao_loc1 - ao_loc0
    orbo_seg = orbo[ao_loc0:ao_loc1]

    mem_now = lib.current_memory()[0]
    max_memory = max(0, mp.max_memory - mem_now)
    dmax = numpy.sqrt(max_memory * .9e6 / 8 /
                      ((nao + nocc) * (nao_seg + nocc)))
    # All ranks must agree on the block size, so take the smallest one.
    dmax = min(nao // 4 + 2, max(BLKMIN, min(comm.allgather(dmax))))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    sh_ranges = comm.bcast(sh_ranges)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao, dmax, dmax, nao_seg))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax,
              nocc * nocc_seg * (nao * (nao + dmax) / 2 + nvir**2) * 8 / 1e6)

    def save_pass1(count, tmp_xo):
        # Exchange the half-transformed block and store it under two keys:
        # '<count>b' right after the first exchange and '<count>a' after a
        # second exchange over the other occupied index.
        di, dj = tmp_xo.shape[2:4]
        tmp_xo = [tmp_xo[p0:p1] for p0, p1 in olocs]
        tmp_xo = mpi.alltoall(tmp_xo, split_recvbuf=True)
        tmp_xo = sum(tmp_xo).reshape(nocc_seg, nocc, di, dj)
        ftmp[str(count) + 'b'] = tmp_xo

        tmp_ox = mpi.alltoall([tmp_xo[:, p0:p1] for p0, p1 in olocs],
                              split_recvbuf=True)
        tmp_ox = [tmp_ox[i].reshape(p1 - p0, nocc_seg, di, dj)
                  for i, (p0, p1) in enumerate(olocs)]
        ftmp[str(count) + 'a'] = numpy.vstack(tmp_ox)

    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(save_pass1) as bg_save:
        for ip, (ish0, ish1, _) in enumerate(sh_ranges):
            # Only blocks with j <= i are computed.
            for jsh0, jsh1, _ in sh_ranges[:ip + 1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0, i1, j0, j1))

                shls_slice = (0, mol.nbas, ish0, ish1,
                              jsh0, jsh1, ao_sh0, ao_sh1)
                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=shls_slice, aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                tmp_xo = lib.einsum('pi,pqrs->iqrs', orbo, eri)
                tmp_xo = lib.einsum('iqrs,sl->ilqr', tmp_xo, orbo_seg)
                bg_save(count, tmp_xo)
                tmp_xo = None
                count += 1
                time1 = log.timer_debug1(
                    'partial ao2mo [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                    *time1)
    # Release the large integral buffer before pass 2 allocates its own.
    eri = eribuf = None
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)

    eris.ovov = eris.feri.create_dataset(
        'ovov', (nocc, nvir, nocc_seg, nvir), 'f8')
    occblk = int(min(nocc, max(BLKMIN,
                               max_memory * .9e6 / 8 /
                               (nao**2 * nocc_seg + 1) / 5)))

    def load(i0, eri):
        # Nothing to prefetch past the last occupied block.
        if i0 < nocc:
            i1 = min(i0 + occblk, nocc)
            for k, (p0, p1, q0, q1) in enumerate(jk_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = ftmp[str(k) + 'a'][i0:i1]
                if p0 != q0:
                    # Fill the transposed block that pass 1 skipped.
                    dat = numpy.asarray(ftmp[str(k) + 'b'][:, i0:i1])
                    eri[:i1 - i0, :, q0:q1, p0:p1] = dat.transpose(1, 0, 3, 2)

    def save_ovov(i0, i1, dat):
        eris.ovov[i0:i1] = dat

    # Two read buffers and two write buffers are swapped every iteration so
    # the background load/save can overlap with the transformation.
    buf_prefetch = numpy.empty((occblk, nocc_seg, nao, nao))
    buf = numpy.empty_like(buf_prefetch)
    bufw = numpy.empty((occblk * nocc_seg, nvir**2))
    bufw1 = numpy.empty_like(bufw)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save_ovov) as bsave:
            load(0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocc, occblk):
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(i1, buf_prefetch)
                eri = buf[:i1 - i0].reshape((i1 - i0) * nocc_seg, nao, nao)

                dat = _ao2mo.nr_e2(eri, orbv, (0, nvir, 0, nvir),
                                   's1', 's1', out=bufw)
                bsave(i0, i1,
                      dat.reshape(i1 - i0, nocc_seg, nvir, nvir)
                      .transpose(0, 2, 1, 3))
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0, i1),
                                         *time1)

    log.timer('mp2 ao2mo_ovov pass2', *time0)
    mp._eris = eris
    return eris
def _make_eris(mp, mo_coeff=None, verbose=None):
    """Build the 'ovov' integrals for ``mp`` with an MPI out-of-core scheme.

    Pass 1 evaluates AO integral blocks, contracts two indices with the
    occupied orbitals, exchanges the pieces between ranks and writes them to
    a temporary HDF5 file.  Pass 2 reads them back in blocks of occupied
    indices, transforms the remaining AO pair with the virtual orbitals and
    stores the result in the 'ovov' dataset of ``eris.feri``.

    Args:
        mp: MP2 object providing ``mol``, ``nocc``, ``nmo`` and
            ``max_memory``.
        mo_coeff: forwarded to ``mp2._ChemistsERIs``.
        verbose: forwarded to ``logger.new_logger``.

    Returns:
        The integrals object, which is also stored as ``mp._eris``.
    """
    log = logger.new_logger(mp, verbose)
    # time.clock() was removed in Python 3.8; use the logger's clock pair.
    time0 = (logger.process_clock(), logger.perf_counter())

    log.debug('transform (ia|jb) outcore')
    mol = mp.mol
    nocc = mp.nocc
    nmo = mp.nmo
    nvir = nmo - nocc

    eris = mp2._ChemistsERIs(mp, mo_coeff)
    nao = eris.mo_coeff.shape[0]
    assert(nvir <= nao)
    orbo = eris.mo_coeff[:,:nocc]
    orbv = numpy.asarray(eris.mo_coeff[:,nocc:], order='F')
    eris.feri = lib.H5TmpFile()

    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    fint = gto.moleintor.getints4c

    # Occupied-index range handled by each rank.
    ntasks = mpi.pool.size
    olocs = [_task_location(nocc, task_id) for task_id in range(ntasks)]
    oloc0, oloc1 = olocs[rank]
    nocc_seg = oloc1 - oloc0
    log.debug2('olocs %s', olocs)

    # AO shell range handled by this rank.
    ao_loc = mol.ao_loc_nr()
    task_sh_locs = lib.misc._balanced_partition(ao_loc, ntasks)
    log.debug2('task_sh_locs %s', task_sh_locs)
    ao_sh0 = task_sh_locs[rank]
    ao_sh1 = task_sh_locs[rank+1]
    ao_loc0 = ao_loc[ao_sh0]
    ao_loc1 = ao_loc[ao_sh1]
    nao_seg = ao_loc1 - ao_loc0
    orbo_seg = orbo[ao_loc0:ao_loc1]

    mem_now = lib.current_memory()[0]
    max_memory = max(0, mp.max_memory - mem_now)
    dmax = numpy.sqrt(max_memory*.9e6/8/((nao+nocc)*(nao_seg+nocc)))
    # All ranks must agree on the block size, so take the smallest one.
    dmax = min(nao//4+2, max(BLKMIN, min(comm.allgather(dmax))))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    sh_ranges = comm.bcast(sh_ranges)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao,dmax,dmax,nao_seg))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax,
              nocc*nocc_seg*(nao*(nao+dmax)/2+nvir**2)*8/1e6)

    def save_pass1(count, tmp_xo):
        # Exchange the half-transformed block and store it under two keys:
        # '<count>b' right after the first exchange and '<count>a' after a
        # second exchange over the other occupied index.
        di, dj = tmp_xo.shape[2:4]
        tmp_xo = [tmp_xo[p0:p1] for p0, p1 in olocs]
        tmp_xo = mpi.alltoall(tmp_xo, split_recvbuf=True)
        tmp_xo = sum(tmp_xo).reshape(nocc_seg,nocc,di,dj)
        ftmp[str(count)+'b'] = tmp_xo

        tmp_ox = mpi.alltoall([tmp_xo[:,p0:p1] for p0, p1 in olocs],
                              split_recvbuf=True)
        tmp_ox = [tmp_ox[i].reshape(p1-p0,nocc_seg,di,dj)
                  for i, (p0,p1) in enumerate(olocs)]
        ftmp[str(count)+'a'] = numpy.vstack(tmp_ox)

    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(save_pass1) as bg_save:
        for ip, (ish0, ish1, _) in enumerate(sh_ranges):
            # Only blocks with j <= i are computed.
            for jsh0, jsh1, _ in sh_ranges[:ip+1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0,i1,j0,j1))

                shls_slice = (0,mol.nbas,ish0,ish1, jsh0,jsh1,ao_sh0,ao_sh1)
                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=shls_slice, aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                tmp_xo = lib.einsum('pi,pqrs->iqrs', orbo, eri)
                tmp_xo = lib.einsum('iqrs,sl->ilqr', tmp_xo, orbo_seg)
                bg_save(count, tmp_xo)
                tmp_xo = None
                count += 1
                time1 = log.timer_debug1('partial ao2mo [%d:%d,%d:%d]' %
                                         (ish0,ish1,jsh0,jsh1), *time1)
    # Release the large integral buffer before pass 2 allocates its own.
    eri = eribuf = None
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)

    eris.ovov = eris.feri.create_dataset('ovov', (nocc,nvir,nocc_seg,nvir),
                                         'f8')
    occblk = int(min(nocc, max(BLKMIN,
                               max_memory*.9e6/8/(nao**2*nocc_seg+1)/5)))

    def load(i0, eri):
        # Nothing to prefetch past the last occupied block.
        if i0 < nocc:
            i1 = min(i0+occblk, nocc)
            for k, (p0,p1,q0,q1) in enumerate(jk_blk_slices):
                eri[:i1-i0,:,p0:p1,q0:q1] = ftmp[str(k)+'a'][i0:i1]
                if p0 != q0:
                    # Fill the transposed block that pass 1 skipped.
                    dat = numpy.asarray(ftmp[str(k)+'b'][:,i0:i1])
                    eri[:i1-i0,:,q0:q1,p0:p1] = dat.transpose(1,0,3,2)

    def save_ovov(i0, i1, dat):
        eris.ovov[i0:i1] = dat

    # Two read buffers and two write buffers are swapped every iteration so
    # the background load/save can overlap with the transformation.
    buf_prefetch = numpy.empty((occblk,nocc_seg,nao,nao))
    buf = numpy.empty_like(buf_prefetch)
    bufw = numpy.empty((occblk*nocc_seg,nvir**2))
    bufw1 = numpy.empty_like(bufw)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save_ovov) as bsave:
            load(0, buf_prefetch)
            for i0, i1 in lib.prange(0, nocc, occblk):
                buf, buf_prefetch = buf_prefetch, buf
                prefetch(i1, buf_prefetch)
                eri = buf[:i1-i0].reshape((i1-i0)*nocc_seg,nao,nao)

                dat = _ao2mo.nr_e2(eri, orbv, (0,nvir,0,nvir), 's1', 's1',
                                   out=bufw)
                bsave(i0, i1,
                      dat.reshape(i1-i0,nocc_seg,nvir,nvir)
                      .transpose(0,2,1,3))
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0,i1),
                                         *time1)

    log.timer('mp2 ao2mo_ovov pass2', *time0)
    mp._eris = eris
    return eris