def ecp_int(cell, kpts=None):
    """Evaluate the scalar ECP integral matrices, sharing shell pairs over MPI.

    Rank 0 broadcasts the serialized cell and every other rank rebuilds its
    own copy from it.  Each rank then fills the lower-triangular shell-pair
    blocks handed to it by ``mpi.work_stealing_partition`` and the partial
    matrices are combined with ``mpi.reduce``.

    Args:
        cell: Cell object providing ``_atm``, ``_env``, ``_ecpbas``, ``nbas``,
            ``ao_loc_nr()`` and ``dumps()``.  Note that two entries of
            ``cell._env`` (``AS_ECPBAS_OFFSET`` and ``AS_NECPBAS``) are
            overwritten in place.
        kpts: ``None`` (a single zero k-point is used) or anything that can be
            reshaped to ``(-1, 3)``.

    Returns:
        On rank 0: a list with one matrix per k-point, or the single matrix
        itself when ``kpts`` is ``None`` or has shape ``(3,)``.  Matrices of
        k-points whose absolute components sum to less than 1e-9 are returned
        as their real part.  On every other rank: the local, un-reduced
        ``(nkpts, nao, nao)`` array.
    """
    if rank != 0:
        cell = pgto.loads(comm.bcast(None))
    else:
        comm.bcast(cell.dumps())

    kpts_lst = (numpy.zeros((1, 3)) if kpts is None
                else numpy.reshape(kpts, (-1, 3)))

    # Append a fictitious s function to mimic the auxiliary index in
    # pbc.incore.  Its exponent pointer refers to the last "env" entry, which
    # forces the PBCnr3c_fill_* functions to copy the entire "env".
    last_env_ptr = len(cell._env) - 1
    fake_s_shell = [0, 0, 1, 1, 0, last_env_ptr, 0, 0]
    ecpcell = gto.Mole()
    ecpcell._atm = cell._atm
    ecpcell._bas = numpy.vstack([fake_s_shell, cell._ecpbas]).astype(numpy.int32)
    ecpcell._env = cell._env

    # In pbc.incore _ecpbas is appended to two sets of cell._bas and the
    # fictitious s function.
    cell._env[AS_ECPBAS_OFFSET] = cell.nbas * 2 + 1
    cell._env[AS_NECPBAS] = len(cell._ecpbas)

    kptij_lst = numpy.hstack((kpts_lst, kpts_lst)).reshape(-1, 2, 3)
    nkpts = len(kpts_lst)
    at_gamma = abs(kpts_lst).sum() < 1e-9
    dtype = numpy.double if at_gamma else numpy.complex128

    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    mat = numpy.zeros((nkpts, nao, nao), dtype=dtype)

    intor = cell._add_suffix('ECPscalar')
    int3c = incore.wrap_int3c(cell, ecpcell, intor, kptij_lst=kptij_lst)

    # One task per lower-triangular shell pair.  The auxiliary shell range
    # (0, 1) is the fictitious s function.
    tasks = []
    for ish in range(cell.nbas):
        for jsh in range(ish + 1):
            tasks.append((ish, ish + 1, jsh, jsh + 1, 0, 1))

    for shls_slice in mpi.work_stealing_partition(tasks):
        row0, row1 = ao_loc[shls_slice[0]], ao_loc[shls_slice[1]]
        col0, col1 = ao_loc[shls_slice[2]], ao_loc[shls_slice[3]]
        work = numpy.empty((nkpts, row1 - row0, col1 - col0), dtype=dtype)
        mat[:, row0:row1, col0:col1] = int3c(shls_slice, work)

    reduced = mpi.reduce(mat)
    if rank != 0:
        # Workers hand back their local partial array unchanged.
        return mat

    mat = []
    for k, kpt in enumerate(kpts_lst):
        # Only the lower triangle was computed; rebuild the Hermitian matrix.
        full = lib.unpack_tril(lib.pack_tril(reduced[k]), lib.HERMITIAN)
        if abs(kpt).sum() < 1e-9:  # gamma point
            full = full.real
        mat.append(full)
    if kpts is None or numpy.shape(kpts) == (3,):
        mat = mat[0]
    return mat
def _assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file, fswap,
              log):
    """Run the j3c jobs, then gather their chunks into ``cderi_file``.

    Pass 1 distributes ``j3c_jobs`` with ``mpi.work_stealing_partition``; for
    every job this rank obtains it calls ``gen_int3c(job_id, ish0, ish1)`` and
    then ``ft_fuse(job_id, k, ish0, ish1)`` for each unique ``kptj - kpti``
    index ``k``.  Pass 2 creates one ``'j3c/<k>'`` dataset per k-point pair in
    ``cderi_file``, holding this rank's row segment, and fills it from the
    ``'j3c-chunks/<job>/<k>'`` data in ``fswap`` exchanged through
    ``mpi.alltoall``.

    Args:
        mydf: Object providing ``cell`` and ``max_memory``.
        kptij_lst: Array of k-point pairs, indexed as ``[:, 0]`` / ``[:, 1]``.
        j3c_jobs: Sequence of ``(job_id, ish0, ish1)`` tuples.
        gen_int3c: Callable ``(job_id, ish0, ish1)``.
        ft_fuse: Callable ``(job_id, k, ish0, ish1)``.
        cderi_file: Path of the HDF5 output file; it is opened with mode
            ``'w'``, so any existing content is discarded.
        fswap: Opened HDF5-like container read for ``'j2c/<k>'``,
            ``'j3c-chunks/<job>/<k>'`` and, if a ``'j2c-'`` entry exists,
            ``'j3c-/<job>/<k>'``.
        log: Logger providing ``debug1``, ``debug2``, ``timer_debug1``,
            ``alltimer_debug1`` and ``alltimer_debug2``.

    Returns:
        None.  Results are written to ``cderi_file``.
    """
    # time.clock() was removed in Python 3.8; process_time()/perf_counter()
    # are the stdlib replacements.
    # NOTE(review): assumes the timers of ``log`` use the same clocks - confirm.
    t1 = (time.process_time(), time.perf_counter())
    cell = mydf.cell
    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    kptis = kptij_lst[:, 0]
    kptjs = kptij_lst[:, 1]
    uniq_kpts, _, uniq_inverse = unique(kptjs - kptis)
    nuniq = len(uniq_kpts)
    # True for the pairs with kpti == kptj, which are stored lower-triangular.
    aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9

    # Pass 1
    # Generate the chunks and remember which rank handled which job.
    t2 = t1
    j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int)
    for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs):
        gen_int3c(job_id, ish0, ish1)
        t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2)

        for k in range(nuniq):
            ft_fuse(job_id, k, ish0, ish1)
            t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2)

        j3c_workers[job_id] = rank
    # Every rank only filled its own entries (others are 0), so the
    # all-reduce yields the complete job -> rank table.
    j3c_workers = mpi.allreduce(j3c_workers)
    log.debug2('j3c_workers %s', j3c_workers)
    t1 = log.timer_debug1('int3c and fuse', *t1)

    # Pass 2
    # Transpose 3-index tensor and save data in cderi_file.  The context
    # manager guarantees the file is closed even if an exception is raised.
    with h5py.File(cderi_file, 'w') as feri:
        nauxs = [fswap['j2c/%d' % k].shape[0] for k in range(nuniq)]
        nproc = mpi.pool.size
        # Rows of the auxiliary dimension are split evenly over the ranks.
        segsize = (max(nauxs) + nproc - 1) // nproc
        naux0 = rank * segsize
        for k, kptij in enumerate(kptij_lst):
            naux1 = min(nauxs[uniq_inverse[k]], naux0 + segsize)
            nrow = max(0, naux1 - naux0)
            dtype = 'f8' if gamma_point(kptij) else 'c16'
            if aosym_s2[k]:
                nao_pair = nao * (nao + 1) // 2
            else:
                nao_pair = nao * nao
            feri.create_dataset('j3c/%d' % k, (nrow, nao_pair), dtype,
                                maxshape=(None, nao_pair))

        def get_segs_loc(aosym):
            """Return, per job, the (start, stop) column offsets of its data
            inside the rank-ordered stream received from alltoall."""
            off0 = numpy.asarray([ao_loc[i0] for _, i0, _ in j3c_jobs])
            off1 = numpy.asarray([ao_loc[i1] for _, _, i1 in j3c_jobs])
            if aosym:  # s2
                dims = off1 * (off1 + 1) // 2 - off0 * (off0 + 1) // 2
            else:
                dims = (off1 - off0) * nao
            # Reorder the widths the way the data arrives: grouped by rank.
            dims = numpy.hstack([dims[j3c_workers == w]
                                 for w in range(nproc)])
            job_idx = numpy.hstack([numpy.where(j3c_workers == w)[0]
                                    for w in range(nproc)])
            bounds = numpy.append(0, numpy.cumsum(dims))
            # argsort(job_idx) maps job id -> position in the stream.
            return [(bounds[j], bounds[j + 1])
                    for j in numpy.argsort(job_idx)]

        segs_loc_s1 = get_segs_loc(False)
        segs_loc_s2 = get_segs_loc(True)
        job_ids = numpy.where(rank == j3c_workers)[0]

        def load(k, p0, p1):
            """Collect, for every destination rank, the rows [p0, p1) of its
            segment from the chunks generated on this rank."""
            naux1 = nauxs[uniq_inverse[k]]
            segs = []
            for i in range(nproc):
                row0 = min(i * segsize + p0, naux1)
                row1 = min(i * segsize + p1, naux1)
                val = [fswap['j3c-chunks/%d/%d' % (job, k)][row0:row1].ravel()
                       for job in job_ids]
                # A rank without any job contributes an empty buffer.
                segs.append(numpy.hstack(val) if val else numpy.zeros(0))
            return segs

        def save(k, p0, p1, segs):
            """Exchange the row blocks and write this rank's rows to feri."""
            segs = mpi.alltoall(segs)
            naux1 = nauxs[uniq_inverse[k]]
            loc0 = min(p0, naux1 - naux0)
            loc1 = min(p1, naux1 - naux0)
            nL = loc1 - loc0
            if nL > 0:
                segs_loc = segs_loc_s2 if aosym_s2[k] else segs_loc_s1
                segs = numpy.hstack([segs[i0 * nL:i1 * nL].reshape(nL, -1)
                                     for i0, i1 in segs_loc])
                feri['j3c/%d' % k][loc0:loc1] = segs

        mem_now = max(comm.allgather(lib.current_memory()[0]))
        max_memory = max(2000, min(8000, mydf.max_memory - mem_now))
        if numpy.all(aosym_s2):
            if gamma_point(kptij_lst):
                blksize = max(16, int(max_memory * .5e6 / 8 / nao**2))
            else:
                blksize = max(16, int(max_memory * .5e6 / 16 / nao**2))
        else:
            blksize = max(16, int(max_memory * .5e6 / 16 / nao**2 / 2))
        log.debug1('max_momory %d MB (%d in use), blksize %d',
                   max_memory, mem_now, blksize)

        t2 = t1
        with lib.call_in_background(save) as async_write:
            for k in range(len(kptij_lst)):
                for p0, p1 in lib.prange(0, segsize, blksize):
                    segs = load(k, p0, p1)
                    async_write(k, p0, p1, segs)
                    t2 = log.timer_debug1(
                        'assemble k=%d %d:%d (in %d)' % (k, p0, p1, segsize),
                        *t2)

        if 'j2c-' in fswap:
            j2c_kpts_lists = [numpy.where(uniq_inverse == k)[0]
                              for k in range(nuniq)
                              if ('j2c-/%d' % k) in fswap]
            # Iterating the index arrays one after the other visits the same
            # k values as hstack would, but also works for an empty list.
            for adapted_ji_idx in j2c_kpts_lists:
                for k in adapted_ji_idx:
                    val = [numpy.asarray(fswap['j3c-/%d/%d' % (job, k)]).ravel()
                           for job in job_ids]
                    # Same empty-buffer fallback as load(): hstack([]) raises.
                    # NOTE(review): assumes mpi.gather accepts an empty
                    # float64 buffer from a rank without jobs - confirm.
                    val = mpi.gather(numpy.hstack(val) if val
                                     else numpy.zeros(0))
                    if rank == 0:
                        naux1 = fswap['j3c-/0/%d' % k].shape[0]
                        segs_loc = segs_loc_s2 if aosym_s2[k] else segs_loc_s1
                        v = [val[i0 * naux1:i1 * naux1].reshape(naux1, -1)
                             for i0, i1 in segs_loc]
                        feri['j3c-/%d' % k] = numpy.hstack(v)

        if 'j3c-kptij' in feri:
            del feri['j3c-kptij']
        feri['j3c-kptij'] = kptij_lst
        log.alltimer_debug1('assembling j3c', *t1)
def _make_j3c(mydf, cell, auxcell, kptij_lst, cderi_file):
    """Build the density-fitting 3-index tensor and store it in ``cderi_file``.

    The routine works in three stages:

    1. For every unique ``kptj - kpti`` the 2-centre metric is computed from
       ``fused_cell.pbc_intor('int2c2e_sph')`` minus a G-space contribution,
       decomposed (Cholesky, or an eigen-decomposition when Cholesky fails)
       and written to ``'j2c/<k>'``.
    2. The jobs returned by ``grids2d_int3c_jobs`` are distributed with
       ``mpi.work_stealing_partition``.  Each job writes its integrals to
       ``'j3c-chunks/<job>/<kptij>'`` and then applies the G-space
       correction, ``fuse`` and the decomposed metric to them.
    3. The chunks are exchanged with ``mpi.alltoall`` so that every rank
       stores its row segment of ``'j3c/<kptij>'``; the temporary
       ``'j3c-chunks'`` group is deleted and ``'j3c-kptij'`` is written.

    Args:
        mydf: DF object providing ``stdout``, ``verbose``, ``auxcell``, ``gs``,
            ``max_memory``, ``weighted_coulG``, ``auxbar`` and ``mpi_prange``.
        cell: Cell for the orbital basis.
        auxcell: Cell for the auxiliary basis.
        kptij_lst: Array of k-point pairs, indexed as ``[:, 0]`` / ``[:, 1]``.
        cderi_file: Path of the HDF5 file.  An existing HDF5 file is opened
            for read/write, anything else is created from scratch.

    Returns:
        None.  Results are written to ``cderi_file``.
    """
    log = logger.Logger(mydf.stdout, mydf.verbose)
    # time.clock() was removed in Python 3.8; process_time()/perf_counter()
    # are the stdlib replacements.
    # NOTE(review): assumes the timers of ``log`` use the same clocks - confirm.
    t1 = (time.process_time(), time.perf_counter())

    fused_cell, fuse = fuse_auxcell(mydf, mydf.auxcell)
    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    naux = auxcell.nao_nr()
    nkptij = len(kptij_lst)
    gs = mydf.gs
    Gv, Gvbase, kws = cell.get_Gv_weights(gs)
    b = cell.reciprocal_vectors()
    gxyz = lib.cartesian_prod([numpy.arange(len(x)) for x in Gvbase])
    ngs = gxyz.shape[0]

    kptis = kptij_lst[:, 0]
    kptjs = kptij_lst[:, 1]
    kpt_ji = kptjs - kptis
    uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
    log.debug('Num uniq kpts %d', len(uniq_kpts))
    log.debug2('uniq_kpts %s', uniq_kpts)
    # j2c ~ (-kpt_ji | kpt_ji)
    j2c = fused_cell.pbc_intor('int2c2e_sph', hermi=1, kpts=uniq_kpts)
    j2ctags = []
    nauxs = []
    t1 = log.timer_debug1('2c2e', *t1)

    # h5py >= 3 opens files read-only when no mode is given, so request
    # read/write explicitly for an existing HDF5 file.  The context manager
    # closes the file even when an exception is raised.
    file_mode = 'a' if h5py.is_hdf5(cderi_file) else 'w'
    with h5py.File(cderi_file, file_mode) as feri:
        for k, kpt in enumerate(uniq_kpts):
            aoaux = ft_ao.ft_ao(fused_cell, Gv, None, b, gxyz, Gvbase, kpt).T
            coulG = numpy.sqrt(mydf.weighted_coulG(kpt, False, gs))
            kLR = (aoaux.real * coulG).T
            kLI = (aoaux.imag * coulG).T
            if not kLR.flags.c_contiguous:
                kLR = lib.transpose(kLR.T)
            if not kLI.flags.c_contiguous:
                kLI = lib.transpose(kLI.T)
            aoaux = None

            kLR1 = numpy.asarray(kLR[:, naux:], order='C')
            kLI1 = numpy.asarray(kLI[:, naux:], order='C')
            if is_zero(kpt):  # kpti == kptj
                for p0, p1 in mydf.mpi_prange(0, ngs):
                    j2cR = lib.ddot(kLR1[p0:p1].T, kLR[p0:p1])
                    j2cR = lib.ddot(kLI1[p0:p1].T, kLI[p0:p1], 1, j2cR, 1)
                    j2c[k][naux:] -= mpi.allreduce(j2cR)
                    j2c[k][:naux, naux:] = j2c[k][naux:, :naux].T
            else:
                for p0, p1 in mydf.mpi_prange(0, ngs):
                    j2cR, j2cI = zdotCN(kLR1[p0:p1].T, kLI1[p0:p1].T,
                                        kLR[p0:p1], kLI[p0:p1])
                    j2cR = mpi.allreduce(j2cR)
                    j2cI = mpi.allreduce(j2cI)
                    j2c[k][naux:] -= j2cR + j2cI * 1j
                    j2c[k][:naux, naux:] = j2c[k][naux:, :naux].T.conj()
            j2c[k] = fuse(fuse(j2c[k]).T).T
            try:
                feri['j2c/%d' % k] = scipy.linalg.cholesky(j2c[k], lower=True)
                j2ctags.append('CD')
                nauxs.append(naux)
            except scipy.linalg.LinAlgError:
                # The metric is not positive definite: fall back to an
                # eigen-decomposition of this k-point's matrix and drop the
                # eigenvectors at or below LINEAR_DEP_THR.
                w, v = scipy.linalg.eigh(j2c[k])
                log.debug2('metric linear dependency for kpt %s', k)
                log.debug2('cond = %.4g, drop %d bfns',
                           w[0] / w[-1], numpy.count_nonzero(w < LINEAR_DEP_THR))
                v = v[:, w > LINEAR_DEP_THR].T.conj()
                v /= numpy.sqrt(w[w > LINEAR_DEP_THR]).reshape(-1, 1)
                feri['j2c/%d' % k] = v
                j2ctags.append('eig')
                nauxs.append(v.shape[0])
            # Release the large temporaries before the next k-point.
            kLR = kLI = kLR1 = kLI1 = coulG = None
        j2c = None

        # True for the pairs with kpti == kptj (stored lower-triangular).
        aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9
        j_only = numpy.all(aosym_s2)
        if gamma_point(kptij_lst):
            dtype = 'f8'
        else:
            dtype = 'c16'
        vbar = mydf.auxbar(fused_cell)
        vbar = fuse(vbar)
        ovlp = cell.pbc_intor('int1e_ovlp_sph', hermi=1, kpts=kptjs[aosym_s2])
        ovlp = [lib.pack_tril(s) for s in ovlp]
        t1 = log.timer_debug1('aoaux and int2c', *t1)

        # Estimates the buffer size based on the last contraction in G-space.
        # This contraction requires to hold nkptj copies of (naux,?) array
        # simultaneously in memory.
        mem_now = max(comm.allgather(lib.current_memory()[0]))
        max_memory = max(2000, mydf.max_memory - mem_now)
        nkptj_max = max((uniq_inverse == x).sum() for x in set(uniq_inverse))
        buflen = max(
            int(min(max_memory * .5e6 / 16 / naux / (nkptj_max + 2) / nao,
                    nao / 3 / mpi.pool.size)),
            1)
        chunks = (buflen, nao)

        j3c_jobs = grids2d_int3c_jobs(cell, auxcell, kptij_lst, chunks, j_only)
        log.debug1('max_memory = %d MB (%d in use) chunks %s',
                   max_memory, mem_now, chunks)
        log.debug2('j3c_jobs %s', j3c_jobs)

        if j_only:
            int3c = wrap_int3c(cell, fused_cell, 'int3c2e_sph', 's2', 1,
                               kptij_lst)
        else:
            int3c = wrap_int3c(cell, fused_cell, 'int3c2e_sph', 's1', 1,
                               kptij_lst)
            # Flat indices of the lower triangle inside an (nao, nao) matrix;
            # only needed to compress s1 data of kpti == kptj pairs.
            idxb = numpy.tril_indices(nao)
            idxb = (idxb[0] * nao + idxb[1]).astype('i')
        aux_loc = fused_cell.ao_loc_nr('ssc' in 'int3c2e_sph')

        def gen_int3c(job_id, ish0, ish1):
            """Write the 3-centre integrals of shells [ish0, ish1) to the
            datasets 'j3c-chunks/<job_id>/<kptij index>'."""
            dataname = 'j3c-chunks/%d' % job_id
            if dataname in feri:
                del feri[dataname]

            i0 = ao_loc[ish0]
            i1 = ao_loc[ish1]
            dii = i1 * (i1 + 1) // 2 - i0 * (i0 + 1) // 2
            dij = (i1 - i0) * nao
            if j_only:
                buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dii + dii)))
            else:
                buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dij + dij)))
            auxranges = balance_segs(aux_loc[1:] - aux_loc[:-1], buflen)
            buflen = max([x[2] for x in auxranges])
            buf = numpy.empty(nkptij * dij * buflen, dtype=dtype)
            buf1 = numpy.empty(dij * buflen, dtype=dtype)

            naux_fused = aux_loc[-1]
            for kpt_id, kptij in enumerate(kptij_lst):
                key = '%s/%d' % (dataname, kpt_id)
                if aosym_s2[kpt_id]:
                    shape = (naux_fused, dii)
                else:
                    shape = (naux_fused, dij)
                if gamma_point(kptij):
                    feri.create_dataset(key, shape, 'f8')
                else:
                    feri.create_dataset(key, shape, 'c16')

            row0 = 0
            for istep, auxrange in enumerate(auxranges):
                log.alldebug2("aux_e2 job_id %d step %d", job_id, istep)
                sh0, sh1, nrow = auxrange
                sub_slice = (ish0, ish1, 0, cell.nbas, sh0, sh1)
                if j_only:
                    mat = numpy.ndarray((nkptij, dii, nrow), dtype=dtype,
                                        buffer=buf)
                else:
                    mat = numpy.ndarray((nkptij, dij, nrow), dtype=dtype,
                                        buffer=buf)
                mat = int3c(sub_slice, mat)

                for k, kptij in enumerate(kptij_lst):
                    h5dat = feri['%s/%d' % (dataname, k)]
                    v = lib.transpose(mat[k], out=buf1)
                    if not j_only and aosym_s2[k]:
                        # Keep only the lower-triangular AO pairs.
                        idy = (idxb[i0 * (i0 + 1) // 2:i1 * (i1 + 1) // 2]
                               - i0 * nao)
                        out = numpy.ndarray((nrow, dii), dtype=v.dtype,
                                            buffer=mat[k])
                        v = numpy.take(v, idy, axis=1, out=out)
                    if gamma_point(kptij):
                        h5dat[row0:row0 + nrow] = v.real
                    else:
                        h5dat[row0:row0 + nrow] = v
                row0 += nrow

        def ft_fuse(job_id, uniq_kptji_id, sh0, sh1):
            """Apply the G-space correction, fuse() and the decomposed metric
            to the chunks of one job for one unique kptj - kpti."""
            kpt = uniq_kpts[uniq_kptji_id]  # kpt = kptj - kpti
            adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0]
            adapted_kptjs = kptjs[adapted_ji_idx]
            nkptj = len(adapted_kptjs)

            shls_slice = (auxcell.nbas, fused_cell.nbas)
            Gaux = ft_ao.ft_ao(fused_cell, Gv, shls_slice, b, gxyz, Gvbase, kpt)
            Gaux *= mydf.weighted_coulG(kpt, False, gs).reshape(-1, 1)
            kLR = Gaux.real.copy('C')
            kLI = Gaux.imag.copy('C')
            j2c = numpy.asarray(feri['j2c/%d' % uniq_kptji_id])
            j2ctag = j2ctags[uniq_kptji_id]
            naux0 = j2c.shape[0]

            if is_zero(kpt):
                aosym = 's2'
            else:
                aosym = 's1'

            j3cR = [None] * nkptj
            j3cI = [None] * nkptj
            i0 = ao_loc[sh0]
            i1 = ao_loc[sh1]
            for k, idx in enumerate(adapted_ji_idx):
                key = 'j3c-chunks/%d/%d' % (job_id, idx)
                v = numpy.asarray(feri[key])
                if is_zero(kpt):
                    for i, c in enumerate(vbar):
                        if c != 0:
                            v[i] -= c * ovlp[k][i0 * (i0 + 1) // 2:
                                                i1 * (i1 + 1) // 2].ravel()
                j3cR[k] = numpy.asarray(v.real, order='C')
                if v.dtype == numpy.complex128:
                    j3cI[k] = numpy.asarray(v.imag, order='C')
                v = None

            ncol = j3cR[0].shape[1]
            # +1 for pqkRbuf/pqkIbuf
            Gblksize = max(16, int(max_memory * 1e6 / 16 / ncol / (nkptj + 1)))
            Gblksize = min(Gblksize, ngs, 16384)
            pqkRbuf = numpy.empty(ncol * Gblksize)
            pqkIbuf = numpy.empty(ncol * Gblksize)
            buf = numpy.empty(nkptj * ncol * Gblksize, dtype=numpy.complex128)
            log.alldebug2(' blksize (%d,%d)', Gblksize, ncol)

            shls_slice = (sh0, sh1, 0, cell.nbas)
            for p0, p1 in lib.prange(0, ngs, Gblksize):
                dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym,
                                            b, gxyz[p0:p1], Gvbase, kpt,
                                            adapted_kptjs, out=buf)
                nG = p1 - p0
                for k, ji in enumerate(adapted_ji_idx):
                    aoao = dat[k].reshape(nG, ncol)
                    pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf)
                    pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf)
                    pqkR[:] = aoao.real.T
                    pqkI[:] = aoao.imag.T

                    # ``naux`` is the enclosing function's auxcell.nao_nr().
                    lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k][naux:], 1)
                    lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k][naux:], 1)
                    if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])):
                        lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k][naux:], 1)
                        lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k][naux:], 1)

            for k, idx in enumerate(adapted_ji_idx):
                if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                    v = fuse(j3cR[k])
                else:
                    v = fuse(j3cR[k] + j3cI[k] * 1j)
                if j2ctag == 'CD':
                    v = scipy.linalg.solve_triangular(j2c, v, lower=True,
                                                      overwrite_b=True)
                else:
                    v = lib.dot(j2c, v)
                feri['j3c-chunks/%d/%d' % (job_id, idx)][:naux0] = v

        t2 = t1
        j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int)
        for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs):
            gen_int3c(job_id, ish0, ish1)
            t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2)

            for k in range(len(uniq_kpts)):
                ft_fuse(job_id, k, ish0, ish1)
                t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2)

            j3c_workers[job_id] = rank
        # Every rank only filled its own entries (others are 0), so the
        # all-reduce yields the complete job -> rank table.
        j3c_workers = mpi.allreduce(j3c_workers)
        log.debug2('j3c_workers %s', j3c_workers)
        # Drop the references that are no longer needed.
        ovlp = vbar = fuse = gen_int3c = ft_fuse = None
        t1 = log.timer_debug1('int3c and fuse', *t1)

        nproc = mpi.pool.size

        def get_segs_loc(aosym):
            """Return, per job, the (start, stop) column offsets of its data
            inside the rank-ordered stream received from alltoall."""
            off0 = numpy.asarray([ao_loc[i0] for _, i0, _ in j3c_jobs])
            off1 = numpy.asarray([ao_loc[i1] for _, _, i1 in j3c_jobs])
            if aosym:  # s2
                dims = off1 * (off1 + 1) // 2 - off0 * (off0 + 1) // 2
            else:
                dims = (off1 - off0) * nao
            # Reorder the widths the way the data arrives: grouped by rank.
            dims = numpy.hstack([dims[j3c_workers == w]
                                 for w in range(nproc)])
            job_idx = numpy.hstack([numpy.where(j3c_workers == w)[0]
                                    for w in range(nproc)])
            bounds = numpy.append(0, numpy.cumsum(dims))
            # argsort(job_idx) maps job id -> position in the stream.
            return [(bounds[j], bounds[j + 1])
                    for j in numpy.argsort(job_idx)]

        segs_loc_s1 = get_segs_loc(False)
        segs_loc_s2 = get_segs_loc(True)

        if 'j3c' in feri:
            del feri['j3c']
        # Rows of the auxiliary dimension are split evenly over the ranks.
        segsize = (max(nauxs) + nproc - 1) // nproc
        naux0 = rank * segsize
        for k, kptij in enumerate(kptij_lst):
            naux1 = min(nauxs[uniq_inverse[k]], naux0 + segsize)
            nrow = max(0, naux1 - naux0)
            j3c_dtype = 'f8' if gamma_point(kptij) else 'c16'
            if aosym_s2[k]:
                nao_pair = nao * (nao + 1) // 2
            else:
                nao_pair = nao * nao
            feri.create_dataset('j3c/%d' % k, (nrow, nao_pair), j3c_dtype,
                                maxshape=(None, nao_pair))

        def load(k, p0, p1):
            """Collect, for every destination rank, the rows [p0, p1) of its
            segment from the chunks generated on this rank."""
            naux1 = nauxs[uniq_inverse[k]]
            segs = []
            for i in range(nproc):
                row0 = min(i * segsize + p0, naux1)
                row1 = min(i * segsize + p1, naux1)
                val = []
                for job_id, worker in enumerate(j3c_workers):
                    if rank == worker:
                        key = 'j3c-chunks/%d/%d' % (job_id, k)
                        val.append(feri[key][row0:row1].ravel())
                # A rank without any job contributes an empty buffer.
                segs.append(numpy.hstack(val) if val else numpy.zeros(0))
            return segs

        def save(k, p0, p1, segs):
            """Exchange the row blocks and write this rank's rows to feri."""
            segs = mpi.alltoall(segs)
            naux1 = nauxs[uniq_inverse[k]]
            loc0 = min(p0, naux1 - naux0)
            loc1 = min(p1, naux1 - naux0)
            nL = loc1 - loc0
            if nL > 0:
                segs_loc = segs_loc_s2 if aosym_s2[k] else segs_loc_s1
                segs = numpy.hstack([segs[i0 * nL:i1 * nL].reshape(nL, -1)
                                     for i0, i1 in segs_loc])
                feri['j3c/%d' % k][loc0:loc1] = segs

        mem_now = max(comm.allgather(lib.current_memory()[0]))
        max_memory = max(2000, min(8000, mydf.max_memory - mem_now))
        if numpy.all(aosym_s2):
            if gamma_point(kptij_lst):
                blksize = max(16, int(max_memory * .5e6 / 8 / nao**2))
            else:
                blksize = max(16, int(max_memory * .5e6 / 16 / nao**2))
        else:
            blksize = max(16, int(max_memory * .5e6 / 16 / nao**2 / 2))
        log.debug1('max_momory %d MB (%d in use), blksize %d',
                   max_memory, mem_now, blksize)

        t2 = t1
        with lib.call_in_background(save) as async_write:
            for k in range(len(kptij_lst)):
                for p0, p1 in lib.prange(0, segsize, blksize):
                    segs = load(k, p0, p1)
                    async_write(k, p0, p1, segs)
                    t2 = log.timer_debug1(
                        'assemble k=%d %d:%d (in %d)' % (k, p0, p1, segsize),
                        *t2)

        if 'j3c-chunks' in feri:
            del feri['j3c-chunks']
        if 'j3c-kptij' in feri:
            del feri['j3c-kptij']
        feri['j3c-kptij'] = kptij_lst
        log.alltimer_debug1('assembling j3c', *t1)
def _eval_jk(mf, dm, hermi, gen_jobs):
    """Evaluate J/K-type matrices by contracting blocks of ``dm`` in parallel.

    The basis is split into groups by ``_partition_bas``; ``gen_jobs`` turns
    the number of groups into a job list.  Each job is
    ``(group_ids, recipe_0, recipe_1, ...)`` and every recipe is a sequence of
    4-index tuples: the first two indices select the density-matrix block to
    contract, the last two the block of the output that receives the result.
    Jobs are distributed with ``mpi.work_stealing_partition`` and the partial
    results are summed with ``mpi.reduce``.

    Args:
        mf: Mean-field object providing ``mol``, ``opt`` and
            ``init_direct_scf``.
        dm: One or more density matrices; reshaped to ``(-1, nao, nao)``.
        hermi: Symmetry flag, passed to ``gen_jobs`` and ``lib.hermi_triu``.
        gen_jobs: Callable ``(n_groups, hermi) -> jobs``.

    Returns:
        Array of shape ``(n_recipes, n_dm, nao, nao)``.  On rank 0 it holds
        the reduced result (upper triangle filled in by ``lib.hermi_triu``
        when ``hermi`` is truthy); on the other ranks it is zeroed.
    """
    cpu0 = (logger.process_clock(), logger.perf_counter())
    mol = mf.mol
    ao_loc = mol.ao_loc_nr()
    nao = ao_loc[-1]

    bas_groups = _partition_bas(mol)
    jobs = gen_jobs(len(bas_groups), hermi)
    njobs = len(jobs)
    logger.debug1(mf, 'njobs %d', njobs)

    # Each job has multiple recipes.
    n_recipes = len(jobs[0][1:])
    dm = numpy.asarray(dm).reshape(-1, nao, nao)
    n_dm = dm.shape[0]
    vk = numpy.zeros((n_recipes, n_dm, nao, nao))

    vhfopt = mf.init_direct_scf(mol) if mf.opt is None else mf.opt
    # Assign the entire dm_cond to vhfopt.
    # The prescreen function CVHFnrs8_prescreen will index q_cond and dm_cond
    # over the entire basis. "set_dm" in function jk.get_jk/direct_bindm only
    # creates a subblock of dm_cond which is not compatible with
    # CVHFnrs8_prescreen.
    vhfopt.set_dm(dm, mol._atm, mol._bas, mol._env)
    # Then skip the "set_dm" initialization in function jk.get_jk/direct_bindm.
    vhfopt._dmcondname = None

    logger.timer_debug1(mf, 'get_jk initialization', *cpu0)
    for job_id in mpi.work_stealing_partition(range(njobs)):
        job = jobs[job_id]
        group_ids = job[0]
        recipes = job[1:]

        shls_slice = lib.flatten([bas_groups[g] for g in group_ids])
        # Row n of ``loc`` is the AO range of the n-th of the four groups.
        loc = ao_loc[shls_slice].reshape(4, 2)

        # Input blocks, ordered density matrix -> recipe -> recipe entry.
        dm_blks = []
        for i_dm in range(n_dm):
            for recipe in recipes:
                for rec in recipe:
                    rows = slice(*loc[rec[0]])
                    cols = slice(*loc[rec[1]])
                    dm_blks.append(dm[i_dm, rows, cols])

        # One contraction script per input block, in the same order.
        scripts_one_dm = []
        for recipe in recipes:
            for rec in recipe:
                labels = tuple(['ijkl'[x] for x in rec])
                scripts_one_dm.append('ijkl,%s%s->%s%s' % labels)
        scripts = scripts_one_dm * n_dm

        kparts = jk.get_jk(mol, dm_blks, scripts, shls_slice=shls_slice,
                           vhfopt=vhfopt)

        for i_dm in range(n_dm):
            for ir, recipe in enumerate(recipes):
                for i, rec in enumerate(recipe):
                    rows = slice(*loc[rec[2]])
                    cols = slice(*loc[rec[3]])
                    vk[ir, i_dm, rows, cols] += kparts[i]
                # Pop the results of one recipe
                kparts = kparts[i + 1:]

    vk = mpi.reduce(vk)
    if rank != 0:
        # Zero out vk on workers. If reduce(get_jk()) is called twice,
        # non-zero vk on workers can cause error.
        vk[:] = 0
    elif hermi:
        for ir in range(n_recipes):
            for i_dm in range(n_dm):
                lib.hermi_triu(vk[ir, i_dm], hermi, inplace=True)
    logger.timer(mf, 'get_jk', *cpu0)
    return vk
def _assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file, fswap,
              log):
    """Run the j3c jobs, then gather their chunks into ``cderi_file``.

    Pass 1 distributes ``j3c_jobs`` with ``mpi.work_stealing_partition``; for
    every job this rank obtains it calls ``gen_int3c(job_id, ish0, ish1)`` and
    then ``ft_fuse(job_id, k, ish0, ish1)`` for each unique ``kptj - kpti``
    index ``k``.  Pass 2 creates one ``'j3c/<k>'`` dataset per k-point pair in
    ``cderi_file``, holding this rank's row segment, and fills it from the
    ``'j3c-chunks/<job>/<k>'`` data in ``fswap`` exchanged through
    ``mpi.alltoall``.

    Args:
        mydf: Object providing ``cell`` and ``max_memory``.
        kptij_lst: Array of k-point pairs, indexed as ``[:, 0]`` / ``[:, 1]``.
        j3c_jobs: Sequence of ``(job_id, ish0, ish1)`` tuples.
        gen_int3c: Callable ``(job_id, ish0, ish1)``.
        ft_fuse: Callable ``(job_id, k, ish0, ish1)``.
        cderi_file: Path of the HDF5 output file; it is opened with mode
            ``'w'``, so any existing content is discarded.
        fswap: Opened HDF5-like container read for ``'j2c/<k>'``,
            ``'j3c-chunks/<job>/<k>'`` and, if a ``'j2c-'`` entry exists,
            ``'j3c-/<job>/<k>'``.
        log: Logger providing ``debug1``, ``debug2``, ``timer_debug1``,
            ``alltimer_debug1`` and ``alltimer_debug2``.

    Returns:
        None.  Results are written to ``cderi_file``.
    """
    # time.clock() was removed in Python 3.8; process_time()/perf_counter()
    # are the stdlib replacements.
    # NOTE(review): assumes the timers of ``log`` use the same clocks - confirm.
    t1 = (time.process_time(), time.perf_counter())
    cell = mydf.cell
    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    kptis = kptij_lst[:, 0]
    kptjs = kptij_lst[:, 1]
    uniq_kpts, _, uniq_inverse = unique(kptjs - kptis)
    nuniq = len(uniq_kpts)
    # True for the pairs with kpti == kptj, which are stored lower-triangular.
    aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9

    # Pass 1
    # Generate the chunks and remember which rank handled which job.
    t2 = t1
    j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int)
    for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs):
        gen_int3c(job_id, ish0, ish1)
        t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2)

        for k in range(nuniq):
            ft_fuse(job_id, k, ish0, ish1)
            t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2)

        j3c_workers[job_id] = rank
    # Every rank only filled its own entries (others are 0), so the
    # all-reduce yields the complete job -> rank table.
    j3c_workers = mpi.allreduce(j3c_workers)
    log.debug2('j3c_workers %s', j3c_workers)
    t1 = log.timer_debug1('int3c and fuse', *t1)

    # Pass 2
    # Transpose 3-index tensor and save data in cderi_file.  The context
    # manager guarantees the file is closed even if an exception is raised.
    with h5py.File(cderi_file, 'w') as feri:
        nauxs = [fswap['j2c/%d' % k].shape[0] for k in range(nuniq)]
        nproc = mpi.pool.size
        # Rows of the auxiliary dimension are split evenly over the ranks.
        segsize = (max(nauxs) + nproc - 1) // nproc
        naux0 = rank * segsize
        for k, kptij in enumerate(kptij_lst):
            naux1 = min(nauxs[uniq_inverse[k]], naux0 + segsize)
            nrow = max(0, naux1 - naux0)
            dtype = 'f8' if gamma_point(kptij) else 'c16'
            if aosym_s2[k]:
                nao_pair = nao * (nao + 1) // 2
            else:
                nao_pair = nao * nao
            feri.create_dataset('j3c/%d' % k, (nrow, nao_pair), dtype,
                                maxshape=(None, nao_pair))

        def get_segs_loc(aosym):
            """Return, per job, the (start, stop) column offsets of its data
            inside the rank-ordered stream received from alltoall."""
            off0 = numpy.asarray([ao_loc[i0] for _, i0, _ in j3c_jobs])
            off1 = numpy.asarray([ao_loc[i1] for _, _, i1 in j3c_jobs])
            if aosym:  # s2
                dims = off1 * (off1 + 1) // 2 - off0 * (off0 + 1) // 2
            else:
                dims = (off1 - off0) * nao
            # Reorder the widths the way the data arrives: grouped by rank.
            dims = numpy.hstack([dims[j3c_workers == w]
                                 for w in range(nproc)])
            job_idx = numpy.hstack([numpy.where(j3c_workers == w)[0]
                                    for w in range(nproc)])
            bounds = numpy.append(0, numpy.cumsum(dims))
            # argsort(job_idx) maps job id -> position in the stream.
            return [(bounds[j], bounds[j + 1])
                    for j in numpy.argsort(job_idx)]

        segs_loc_s1 = get_segs_loc(False)
        segs_loc_s2 = get_segs_loc(True)
        job_ids = numpy.where(rank == j3c_workers)[0]

        def load(k, p0, p1):
            """Collect, for every destination rank, the rows [p0, p1) of its
            segment from the chunks generated on this rank."""
            naux1 = nauxs[uniq_inverse[k]]
            segs = []
            for i in range(nproc):
                row0 = min(i * segsize + p0, naux1)
                row1 = min(i * segsize + p1, naux1)
                val = [fswap['j3c-chunks/%d/%d' % (job, k)][row0:row1].ravel()
                       for job in job_ids]
                # A rank without any job contributes an empty buffer.
                segs.append(numpy.hstack(val) if val else numpy.zeros(0))
            return segs

        def save(k, p0, p1, segs):
            """Exchange the row blocks and write this rank's rows to feri."""
            segs = mpi.alltoall(segs)
            naux1 = nauxs[uniq_inverse[k]]
            loc0 = min(p0, naux1 - naux0)
            loc1 = min(p1, naux1 - naux0)
            nL = loc1 - loc0
            if nL > 0:
                segs_loc = segs_loc_s2 if aosym_s2[k] else segs_loc_s1
                segs = numpy.hstack([segs[i0 * nL:i1 * nL].reshape(nL, -1)
                                     for i0, i1 in segs_loc])
                feri['j3c/%d' % k][loc0:loc1] = segs

        mem_now = max(comm.allgather(lib.current_memory()[0]))
        max_memory = max(2000, min(8000, mydf.max_memory - mem_now))
        if numpy.all(aosym_s2):
            if gamma_point(kptij_lst):
                blksize = max(16, int(max_memory * .5e6 / 8 / nao**2))
            else:
                blksize = max(16, int(max_memory * .5e6 / 16 / nao**2))
        else:
            blksize = max(16, int(max_memory * .5e6 / 16 / nao**2 / 2))
        log.debug1('max_momory %d MB (%d in use), blksize %d',
                   max_memory, mem_now, blksize)

        t2 = t1
        with lib.call_in_background(save) as async_write:
            for k in range(len(kptij_lst)):
                for p0, p1 in lib.prange(0, segsize, blksize):
                    segs = load(k, p0, p1)
                    async_write(k, p0, p1, segs)
                    t2 = log.timer_debug1(
                        'assemble k=%d %d:%d (in %d)' % (k, p0, p1, segsize),
                        *t2)

        if 'j2c-' in fswap:
            j2c_kpts_lists = [numpy.where(uniq_inverse == k)[0]
                              for k in range(nuniq)
                              if ('j2c-/%d' % k) in fswap]
            # Iterating the index arrays one after the other visits the same
            # k values as hstack would, but also works for an empty list.
            for adapted_ji_idx in j2c_kpts_lists:
                for k in adapted_ji_idx:
                    val = [numpy.asarray(fswap['j3c-/%d/%d' % (job, k)]).ravel()
                           for job in job_ids]
                    # Same empty-buffer fallback as load(): hstack([]) raises.
                    # NOTE(review): assumes mpi.gather accepts an empty
                    # float64 buffer from a rank without jobs - confirm.
                    val = mpi.gather(numpy.hstack(val) if val
                                     else numpy.zeros(0))
                    if rank == 0:
                        naux1 = fswap['j3c-/0/%d' % k].shape[0]
                        segs_loc = segs_loc_s2 if aosym_s2[k] else segs_loc_s1
                        v = [val[i0 * naux1:i1 * naux1].reshape(naux1, -1)
                             for i0, i1 in segs_loc]
                        feri['j3c-/%d' % k] = numpy.hstack(v)

        if 'j3c-kptij' in feri:
            del feri['j3c-kptij']
        feri['j3c-kptij'] = kptij_lst
        log.alltimer_debug1('assembling j3c', *t1)