Beispiel #1
0
    def loop(self, blksize=None):
        if self._cderi is None:
            self.build()
        if blksize is None:
            blksize = self.blockdim

        with addons.load(self._cderi, 'j3c') as feri:
            if isinstance(feri, numpy.ndarray):
                naoaux = feri.shape[0]
                for b0, b1 in self.prange(0, naoaux, blksize):
                    yield numpy.asarray(feri[b0:b1], order='C')

            else:
                if isinstance(feri, h5py.Group):
                    # starting from pyscf-1.7, DF tensor may be stored in
                    # block format
                    naoaux = feri['0'].shape[0]
                    def load(b0, b1, prefetch):
                        prefetch[0] = _load_from_h5g(feri, b0, b1)
                else:
                    naoaux = feri.shape[0]
                    def load(b0, b1, prefetch):
                        prefetch[0] = numpy.asarray(feri[b0:b1])

                dat = [None]
                prefetch = [None]
                with lib.call_in_background(load) as bload:
                    bload(0, min(blksize, naoaux), prefetch)
                    for b0, b1 in self.prange(blksize, naoaux, blksize):
                        dat, prefetch = prefetch, dat
                        bload(b0, b1, prefetch)
                        yield dat[0]
                yield prefetch[0]
Beispiel #2
0
        def bg_raise():
            def raise1():
                raise ValueError

            with lib.call_in_background(raise1) as f:
                f()

            raise IndexError
Beispiel #3
0
        def bg_raise():
            def raise1():
                raise ValueError

            with lib.call_in_background(raise1) as f:
                f()

            raise IndexError
Beispiel #4
0
def kernel(mycc, eris=None):
    cpu0 = (time.clock(), time.time())
    ccsd._sync_(mycc)
    log = logger.new_logger(mycc)

    eris = getattr(mycc, '_eris', None)
    if eris is None:
        mycc.ao2mo(mycc.mo_coeff)
        eris = mycc._eris

    t1T = numpy.asarray(mycc.t1.T, order='C')
    nvir, nocc = t1T.shape

    fvo = eris.fock[nocc:,:nocc].copy()
    mo_energy = eris.mo_energy.copy()
    et_sum = numpy.zeros(1, dtype=t1T.dtype)
    drv = _ccsd.libcc.MPICCsd_t_contract
    cpu2 = [time.clock(), time.time()]
    def contract(slices, data):
        #vvop_ab, vvop_ac, vvop_ba, vvop_bc, vvop_ca, vvop_cb, \
        #        vooo_a, vooo_b, vooo_c, t2T_a, t2T_b, t2T_c = data
        data_ptrs = [x.ctypes.data_as(ctypes.c_void_p) for x in data]
        data_ptrs = (ctypes.c_void_p*12)(*data_ptrs)
        drv(et_sum.ctypes.data_as(ctypes.c_void_p),
            mo_energy.ctypes.data_as(ctypes.c_void_p),
            t1T.ctypes.data_as(ctypes.c_void_p),
            fvo.ctypes.data_as(ctypes.c_void_p),
            ctypes.c_int(nocc), ctypes.c_int(nvir),
            (ctypes.c_int*6)(*slices), data_ptrs)
        cpu2[:] = log.alltimer_debug1('contract'+str(slices), *cpu2)

    with GlobalDataHandler(mycc) as daemon:
        v_seg_ranges = daemon.data_partition
        tasks = []
        for ka, (a0, a1) in enumerate(v_seg_ranges):
            for kb, (b0, b1) in enumerate(v_seg_ranges[:ka+1]):
                for c0, c1 in v_seg_ranges[:kb+1]:
                    tasks.append((a0, a1, b0, b1, c0, c1))
        log.debug('ntasks = %d', len(tasks))

        task_count = 0
        with lib.call_in_background(contract) as async_contract:
            #for task in mpi.static_partition(tasks):
            #for task in mpi.work_stealing_partition(tasks):
            for task in mpi.work_share_partition(tasks, loadmin=2):
                log.alldebug2('request for segment %s', task)
                data = [None] * 12
                daemon.request_(task, data)
                async_contract(task, data)
                task_count += 1
        log.alldebug1('task_count = %d', task_count)

    et = comm.allreduce(et_sum[0] * 2).real
    log.timer('CCSD(T)', *cpu0)
    log.note('CCSD(T) correction = %.15g', et)
    return et
Beispiel #5
0
    def start(self, interval=0.02):
        mycc = self._cc
        log = logger.new_logger(mycc)
        cpu1 = (logger.process_clock(), logger.perf_counter())
        eris = mycc._eris
        t2T = mycc.t2.transpose(2, 3, 0, 1)

        nocc, nvir = mycc.t1.shape
        nmo = nocc + nvir
        vloc0, vloc1 = self.vranges[rank]
        nvir_seg = vloc1 - vloc0

        max_memory = min(24000, mycc.max_memory - lib.current_memory()[0])
        blksize = min(
            nvir_seg // 4 + 1,
            max(16, int(max_memory * .3e6 / 8 / (nvir * nocc * nmo))))
        self.eri_tmp = lib.H5TmpFile()
        vvop = self.eri_tmp.create_dataset('vvop', (nvir_seg, nvir, nocc, nmo),
                                           'f8')

        def save_vvop(j0, j1, vvvo):
            buf = numpy.empty((j1 - j0, nvir, nocc, nmo), dtype=t2T.dtype)
            buf[:, :, :, :nocc] = eris.ovov[:, j0:j1].conj().transpose(
                1, 3, 0, 2)
            for k, (q0, q1) in enumerate(self.vranges):
                blk = vvvo[k].reshape(q1 - q0, nvir, j1 - j0, nocc)
                buf[:, q0:q1, :, nocc:] = blk.transpose(2, 0, 3, 1)
            vvop[j0:j1] = buf

        with lib.call_in_background(save_vvop) as save_vvop:
            for p0, p1 in mpi.prange(vloc0, vloc1, blksize):
                j0, j1 = p0 - vloc0, p1 - vloc0
                sub_locs = comm.allgather((p0, p1))
                vvvo = mpi.alltoall_new(
                    [eris.vvvo[:, :, q0:q1] for q0, q1 in sub_locs],
                    split_recvbuf=True)
                save_vvop(j0, j1, vvvo)
                cpu1 = log.timer_debug1('transpose %d:%d' % (p0, p1), *cpu1)

        def send_data():
            while True:
                while comm.Iprobe(source=MPI.ANY_SOURCE, tag=INQUIRY):
                    tensors, dest = comm.recv(source=MPI.ANY_SOURCE,
                                              tag=INQUIRY)
                    for task, slices in tensors:
                        if task == 'Done':
                            return
                        else:
                            mpi.send(self._get_tensor(task, slices),
                                     dest,
                                     tag=TRANSFER_DATA)
                time.sleep(interval)

        daemon = threading.Thread(target=send_data)
        daemon.start()
        return daemon
Beispiel #6
0
    def start(self, interval=0.02):
        mycc = self._cc
        log = logger.new_logger(mycc)
        cpu1 = (time.clock(), time.time())
        eris = mycc._eris
        t2T = mycc.t2.transpose(2,3,0,1)

        nocc, nvir = mycc.t1.shape
        nmo = nocc + nvir
        vloc0, vloc1 = self.vranges[rank]
        nvir_seg = vloc1 - vloc0

        max_memory = min(24000, mycc.max_memory - lib.current_memory()[0])
        blksize = min(nvir_seg//4+1, max(16, int(max_memory*.3e6/8/(nvir*nocc*nmo))))
        self.eri_tmp = lib.H5TmpFile()
        vvop = self.eri_tmp.create_dataset('vvop', (nvir_seg,nvir,nocc,nmo), 'f8')

        def save_vvop(j0, j1, vvvo):
            buf = numpy.empty((j1-j0,nvir,nocc,nmo), dtype=t2T.dtype)
            buf[:,:,:,:nocc] = eris.ovov[:,j0:j1].conj().transpose(1,3,0,2)
            for k, (q0, q1) in enumerate(self.vranges):
                blk = vvvo[k].reshape(q1-q0,nvir,j1-j0,nocc)
                buf[:,q0:q1,:,nocc:] = blk.transpose(2,0,3,1)
            vvop[j0:j1] = buf

        with lib.call_in_background(save_vvop) as save_vvop:
            for p0, p1 in mpi.prange(vloc0, vloc1, blksize):
                j0, j1 = p0 - vloc0, p1 - vloc0
                sub_locs = comm.allgather((p0,p1))
                vvvo = mpi.alltoall([eris.vvvo[:,:,q0:q1] for q0, q1 in sub_locs],
                                    split_recvbuf=True)
                save_vvop(j0, j1, vvvo)
                cpu1 = log.timer_debug1('transpose %d:%d'%(p0,p1), *cpu1)

        def send_data():
            while True:
                while comm.Iprobe(source=MPI.ANY_SOURCE, tag=INQUIRY):
                    tensors, dest = comm.recv(source=MPI.ANY_SOURCE, tag=INQUIRY)
                    for task, slices in tensors:
                        if task == 'Done':
                            return
                        else:
                            mpi.send(self._get_tensor(task, slices), dest,
                                     tag=TRANSFER_DATA)
                time.sleep(interval)

        daemon = threading.Thread(target=send_data)
        daemon.start()
        return daemon
Beispiel #7
0
def _sort_eri(mycc, eris, nocc, nvir, vvop, log):
    cpu1 = (time.clock(), time.time())
    mol = mycc.mol
    nmo = nocc + nvir

    if mol.symmetry:
        orbsym = symm.addons.label_orb_symm(mol,
                                            mol.irrep_id,
                                            mol.symm_orb,
                                            eris.mo_coeff,
                                            check=False)
        orbsym = numpy.asarray(orbsym, dtype=numpy.int32) % 10
    else:
        orbsym = numpy.zeros(nmo, dtype=numpy.int32)

    o_sorted = _irrep_argsort(orbsym[:nocc])
    v_sorted = _irrep_argsort(orbsym[nocc:])
    vrank = numpy.argsort(v_sorted)

    max_memory = max(0, mycc.max_memory - lib.current_memory()[0])
    max_memory = min(8000, max_memory * .9)
    blksize = min(nvir, max(16,
                            int(max_memory * 1e6 / 8 / (nvir * nocc * nmo))))
    log.debug1('_sort_eri max_memory %g  blksize %d', max_memory, blksize)
    dtype = vvop.dtype
    with lib.call_in_background(vvop.__setitem__,
                                sync=not mycc.async_io) as save:
        bufopv = numpy.empty((nocc, nmo, nvir), dtype=dtype)
        buf1 = numpy.empty_like(bufopv)
        buf = numpy.empty((nocc, nvir, nvir), dtype=dtype)
        for j0, j1 in lib.prange(0, nvir, blksize):
            ovov = numpy.asarray(eris.ovov[:, j0:j1])
            #ovvv = numpy.asarray(eris.ovvv[:,j0:j1])
            ovvv = eris.get_ovvv(slice(None), slice(j0, j1))
            for j in range(j0, j1):
                oov = ovov[o_sorted, j - j0]
                ovv = ovvv[o_sorted, j - j0]
                #if ovv.ndim == 2:
                #    ovv = lib.unpack_tril(ovv, out=buf)
                bufopv[:, :nocc, :] = oov[:, o_sorted][:, :, v_sorted].conj()
                bufopv[:, nocc:, :] = ovv[:, v_sorted][:, :, v_sorted].conj()
                save(vrank[j], bufopv.transpose(2, 0, 1))
                bufopv, buf1 = buf1, bufopv
            cpu1 = log.timer_debug1('transpose %d:%d' % (j0, j1), *cpu1)

    return orbsym
Beispiel #8
0
def _sort_eri(mycc, eris, nocc, nvir, vvop, log):
    cpu1 = (time.clock(), time.time())
    mol = mycc.mol
    nmo = nocc + nvir

    if mol.symmetry:
        orbsym = symm.addons.label_orb_symm(mol, mol.irrep_id, mol.symm_orb,
                                            eris.mo_coeff, check=False)
        orbsym = numpy.asarray(orbsym, dtype=numpy.int32) % 10
    else:
        orbsym = numpy.zeros(nmo, dtype=numpy.int32)

    o_sorted = _irrep_argsort(orbsym[:nocc])
    v_sorted = _irrep_argsort(orbsym[nocc:])
    vrank = numpy.argsort(v_sorted)

    max_memory = max(0, mycc.max_memory - lib.current_memory()[0])
    max_memory = min(8000, max_memory*.9)
    blksize = min(nvir, max(16, int(max_memory*1e6/8/(nvir*nocc*nmo))))
    log.debug1('_sort_eri max_memory %g  blksize %d', max_memory, blksize)
    dtype = vvop.dtype
    with lib.call_in_background(vvop.__setitem__, sync=not mycc.async_io) as save:
        bufopv = numpy.empty((nocc,nmo,nvir), dtype=dtype)
        buf1 = numpy.empty_like(bufopv)
        buf = numpy.empty((nocc,nvir,nvir), dtype=dtype)
        for j0, j1 in lib.prange(0, nvir, blksize):
            ovov = numpy.asarray(eris.ovov[:,j0:j1])
            #ovvv = numpy.asarray(eris.ovvv[:,j0:j1])
            ovvv = eris.get_ovvv(slice(None), slice(j0,j1))
            for j in range(j0,j1):
                oov = ovov[o_sorted,j-j0]
                ovv = ovvv[o_sorted,j-j0]
                #if ovv.ndim == 2:
                #    ovv = lib.unpack_tril(ovv, out=buf)
                bufopv[:,:nocc,:] = oov[:,o_sorted][:,:,v_sorted].conj()
                bufopv[:,nocc:,:] = ovv[:,v_sorted][:,:,v_sorted].conj()
                save(vrank[j], bufopv.transpose(2,0,1))
                bufopv, buf1 = buf1, bufopv
            cpu1 = log.timer_debug1('transpose %d:%d'%(j0,j1), *cpu1)

    return orbsym
Beispiel #9
0
def general(eri, mo_coeffs, erifile, dataname='eri_mo',
            ioblk_size=IOBLK_SIZE, compact=True, verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.
    Args:
        eri : 8-fold reduced eri vector
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.
    Kwargs
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By assigning
            different dataname, the existed integral file can be reused.  If
            the erifile contains the dataname, the new integrals data will
            overwrite the old one.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        compact : bool
            When compact is True, depending on the four oribital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals


    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        Assume eri's are 8-fold reduced.
        nij/nkl_pair = npair or i*j/k*l if only transforming a subset

        First half transform:
            Initialize half_eri of size (nij_pair,npair)
                For lo = 1 -> npair
                    Unpack row lo
                    Unpack row lo to matrix E_{uv}^{lo}
                    Transform C_ui^+*E*C_nj -> E_{ij}^{lo}
                    Ravel or pack E_{ij}^{lo}
                    Save E_{ij}^{lo} -> half_eri[:,lo]

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
                For ij = 1 -> nij_pair
                    Load and unpack half_eri[ij,:] -> E_{lo}^{ij}
                    Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                    Repack E_{kl}^{ij}
                    Save E_{kl}^{ij} -> h5d_eri[ij,:]

        Each matrix is indexed by the composite index ij x kl, where ij/kl is
        either npair or ixj/kxl, if only a subset of MOs are being transformed.
        Since entire rows or columns need to be read in, the arrays are chunked
        such that IOBLK_SIZE = row/col x chunking col/row. For example, for the
        first half transform, we would save in nij_pair x IOBLK_SIZE/nij_pair,
        then load in IOBLK_SIZE/nkl_pair x npair for the second half transform.

        ------ kl ----->
        |jxl
        |
        ij
        |
        |
        v

        As a first guess, the chunking size is jxl. If the super-rows/cols are
        larger than IOBLK_SIZE, then the chunk rectangle jxl is trimmed
        accordingly. The pathological limiting case is where the dimensions
        nao_pair, nij_pair, or nkl_pair are so large that the arrays are
        chunked 1x1, in which case IOBLK_SIZE needs to be increased.

    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    eri_ao = numpy.asarray(eri, order='C')
    nao, nmoi = mo_coeffs[0].shape
    nmoj = mo_coeffs[1].shape[1]
    nao_pair = nao*(nao+1)//2
    ijmosym, nij_pair, moij, ijshape = _conc_mos(mo_coeffs[0], mo_coeffs[1], compact)
    klmosym, nkl_pair, mokl, klshape = _conc_mos(mo_coeffs[2], mo_coeffs[3], compact)
    ijshape = (ijshape[0], ijshape[1]-ijshape[0],
               ijshape[2], ijshape[3]-ijshape[2])
    dtype = numpy.result_type(eri, *mo_coeffs)
    typesize = dtype.itemsize/1e6 # in MB

    if nij_pair == 0:
        return numpy.empty((nij_pair,nkl_pair))

    ij_red = ijmosym == 's1'
    kl_red = klmosym == 's1'

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            feri = h5py.File(erifile, 'a')
            if dataname in feri:
                del(feri[dataname])
        else:
            feri = h5py.File(erifile,'w',libver='latest')
    else:
        assert(isinstance(erifile, h5py.Group))
        feri = erifile

    h5d_eri = feri.create_dataset(dataname,(nij_pair,nkl_pair), dtype.char)
    feri_swap = lib.H5TmpFile(libver='latest')
    chunk_size = min(nao_pair, max(4, int(ioblk_size*1e6/8/nao_pair)))

    log.debug('Memory information:')
    log.debug('  IOBLK_SIZE (MB): {}  chunk_size: {}'
              .format(ioblk_size, chunk_size))
    log.debug('  Final disk eri size (MB): {:.3g}'
              .format(nij_pair*nkl_pair*typesize))
    log.debug('  Half transformed eri size (MB): {:.3g}'
              .format(nij_pair*nao_pair*typesize))
    log.debug('  RAM buffer (MB): {:.3g}'
             .format(nij_pair*IOBLK_SIZE*typesize*2))

    if eri_ao.size == nao_pair**2: # 4-fold symmetry
        # half_e1 first transforms the indices which are contiguous in memory
        # transpose the 4-fold integrals to make ij the contiguous indices
        eri_ao = lib.transpose(eri_ao)
        ftrans = _ao2mo.libao2mo.AO2MOtranse1_incore_s4
    elif eri_ao.size == nao_pair*(nao_pair+1)//2:
        ftrans = _ao2mo.libao2mo.AO2MOtranse1_incore_s8
    else:
        raise NotImplementedError

    if ijmosym == 's2':
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_s2
    elif nmoi <= nmoj:
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_iltj
    else:
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_igtj
    fdrv = getattr(_ao2mo.libao2mo, 'AO2MOnr_e1incore_drv')

    def save(piece, buf):
        feri_swap[str(piece)] = buf.T

    # transform \mu\nu -> ij
    cput0 = time.clock(), time.time()
    with lib.call_in_background(save) as async_write:
        for istep, (p0, p1) in enumerate(lib.prange(0, nao_pair, chunk_size)):
            if dtype == numpy.double:
                buf = numpy.empty((p1-p0, nij_pair))
                fdrv(ftrans, fmmm,
                     buf.ctypes.data_as(ctypes.c_void_p),
                     eri_ao.ctypes.data_as(ctypes.c_void_p),
                     moij.ctypes.data_as(ctypes.c_void_p),
                     ctypes.c_int(p0), ctypes.c_int(p1-p0),
                     ctypes.c_int(nao),
                     ctypes.c_int(ijshape[0]), ctypes.c_int(ijshape[1]),
                     ctypes.c_int(ijshape[2]), ctypes.c_int(ijshape[3]))
            else:  # complex
                tmp = numpy.empty((p1-p0, nao_pair))
                if eri_ao.size == nao_pair**2: # 4-fold symmetry
                    tmp = eri_ao[p0:p1]
                else: # 8-fold symmetry
                    for i in range(p0, p1):
                        tmp[i-p0] = lib.unpack_row(eri_ao, i)
                tmp = lib.unpack_tril(tmp, filltriu=lib.SYMMETRIC)
                buf = lib.einsum('xpq,pi,qj->xij', tmp, mo_coeffs[0].conj(), mo_coeffs[1])
                if ij_red:
                    buf = buf.reshape(p1-p0,-1) # grabs by row
                else:
                    buf = lib.pack_tril(buf)

            async_write(istep, buf)

    log.timer('(uv|lo) -> (ij|lo)', *cput0)

    # transform \lambda\sigma -> kl
    cput1 = time.clock(), time.time()
    Cklam = mo_coeffs[2].conj()
    buf_read = numpy.empty((chunk_size,nao_pair), dtype=dtype)
    buf_prefetch = numpy.empty_like(buf_read)

    def load(start, stop, buf):
        if start < stop:
            _load_from_h5g(feri_swap, start, stop, buf)

    def save(start, stop, buf):
        if start < stop:
            h5d_eri[start:stop] = buf[:stop-start]

    with lib.call_in_background(save,load) as (async_write, prefetch):
        for p0, p1 in lib.prange(0, nij_pair, chunk_size):
            if p0 == 0:
                load(p0, p1, buf_prefetch)

            buf_read, buf_prefetch = buf_prefetch, buf_read
            prefetch(p1, min(p1+chunk_size, nij_pair), buf_prefetch)

            lo = lib.unpack_tril(buf_read[:p1-p0], filltriu=lib.SYMMETRIC)
            lo = lib.einsum('xpq,pi,qj->xij', lo, Cklam, mo_coeffs[3])
            if kl_red:
                kl = lo.reshape(p1-p0,-1)
            else:
                kl = lib.pack_tril(lo)
            async_write(p0, p1, kl)

    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    if isinstance(erifile, str):
        feri.close()
    return erifile
Beispiel #10
0
def general(mol,
            mo_coeffs,
            erifile,
            auxbasis='weigend+etb',
            dataname='eri_mo',
            tmpdir=None,
            int3c='int3c2e',
            aosym='s2ij',
            int2c='int2c2e',
            comp=1,
            max_memory=MAX_MEMORY,
            verbose=0,
            compact=True):
    ''' Transform ij of (ij|L) to MOs.
    '''
    assert (aosym in ('s1', 's2ij'))
    time0 = (time.clock(), time.time())
    log = logger.new_logger(mol, verbose)

    if tmpdir is None:
        tmpdir = lib.param.TMPDIR
    swapfile = tempfile.NamedTemporaryFile(dir=tmpdir)
    cholesky_eri_b(mol,
                   swapfile.name,
                   auxbasis,
                   dataname,
                   int3c,
                   aosym,
                   int2c,
                   comp,
                   max_memory,
                   verbose=log)
    fswap = h5py.File(swapfile.name, 'r')
    time1 = log.timer('AO->MO eri transformation 1 pass', *time0)

    nao = mo_coeffs[0].shape[0]
    if aosym == 's1':
        nao_pair = nao * nao
        aosym_as_nr_e2 = 's1'
    else:
        nao_pair = nao * (nao + 1) // 2
        aosym_as_nr_e2 = 's2kl'

    ijmosym, nij_pair, moij, ijshape = \
            ao2mo.incore._conc_mos(mo_coeffs[0], mo_coeffs[1],
                                   compact and aosym != 's1')

    naoaux = fswap['%s/0' % dataname].shape[-2]
    feri = _create_h5file(erifile, dataname)
    if comp == 1:
        h5d_eri = feri.create_dataset(dataname, (naoaux, nij_pair), 'f8')
    else:
        h5d_eri = feri.create_dataset(dataname, (comp, naoaux, nij_pair), 'f8')

    def save(row0, row1, buf):
        if comp == 1:
            h5d_eri[row0:row1] = buf
        else:
            h5d_eri[:, row0:row1] = buf

    iolen = min(max(int(max_memory * .45e6 / 8 / (nao_pair + nij_pair)), 28),
                naoaux)
    totstep = (naoaux + iolen - 1) // iolen
    ti0 = time1
    with lib.call_in_background(save) as bsave:
        for istep, (row0, row1) in enumerate(lib.prange(0, naoaux, iolen)):
            nrow = row1 - row0
            log.debug('step 2 [%d/%d], [%d:%d], row = %d', istep + 1, totstep,
                      row0, row1, nrow)
            buf = _load_from_h5g(fswap[dataname], row0, row1)
            if comp == 1:
                buf = _ao2mo.nr_e2(buf, moij, ijshape, aosym_as_nr_e2, ijmosym)
                bsave(row0, row1, buf)
            else:
                buf = _ao2mo.nr_e2(buf.reshape(comp * nrow, nao_pair), moij,
                                   ijshape, aosym_as_nr_e2, ijmosym)
                bsave(row0, row1, buf.reshape(comp, nrow, nij_pair))
            buf = None
            ti0 = log.timer(
                'step 2 [%d/%d], [%d:%d], row = %d' %
                (istep + 1, totstep, row0, row1, nrow), *ti0)

    fswap.close()
    feri.close()
    log.timer('AO->MO CD eri transformation 2 pass', *time1)
    log.timer('AO->MO CD eri transformation', *time0)
    return erifile
Beispiel #11
0
def _ao2mo_ovov(mp, orbs, feri, max_memory=2000, verbose=None):
    time0 = (time.clock(), time.time())
    log = logger.new_logger(mp, verbose)
    orboa = numpy.asarray(orbs[0], order='F')
    orbva = numpy.asarray(orbs[1], order='F')
    orbob = numpy.asarray(orbs[2], order='F')
    orbvb = numpy.asarray(orbs[3], order='F')
    nao, nocca = orboa.shape
    noccb = orbob.shape[1]
    nvira = orbva.shape[1]
    nvirb = orbvb.shape[1]

    mol = mp.mol
    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nbas = mol.nbas
    assert (nvira <= nao)
    assert (nvirb <= nao)

    ao_loc = mol.ao_loc_nr()
    dmax = max(
        4, min(nao / 3, numpy.sqrt(max_memory * .95e6 / 8 / (nao + nocca)**2)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao, dmax, dmax, nao))
    ftmp = lib.H5TmpFile()
    disk = (nocca**2 * (nao * (nao + dmax) / 2 + nvira**2) + noccb**2 *
            (nao * (nao + dmax) / 2 + nvirb**2) + nocca * noccb *
            (nao**2 + nvira * nvirb))
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax, disk * 8 / 1e6)

    fint = gto.moleintor.getints4c
    aa_blk_slices = []
    ab_blk_slices = []
    count_ab = 0
    count_aa = 0
    time1 = time0
    with lib.call_in_background(ftmp.__setitem__) as save:
        for ish0, ish1, ni in sh_ranges:
            for jsh0, jsh1, nj in sh_ranges:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]

                eri = fint(int2e,
                           mol._atm,
                           mol._bas,
                           mol._env,
                           shls_slice=(0, nbas, ish0, ish1, jsh0, jsh1, 0,
                                       nbas),
                           aosym='s1',
                           ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt,
                           out=eribuf)
                tmp_i = lib.ddot(orboa.T,
                                 eri.reshape(nao, (i1 - i0) * (j1 - j0) * nao))
                tmp_li = lib.ddot(
                    orbob.T,
                    tmp_i.reshape(nocca * (i1 - i0) * (j1 - j0), nao).T)
                tmp_li = tmp_li.reshape(noccb, nocca, (i1 - i0), (j1 - j0))
                save('ab/%d' % count_ab, tmp_li.transpose(1, 0, 2, 3))
                ab_blk_slices.append((i0, i1, j0, j1))
                count_ab += 1

                if ish0 >= jsh0:
                    tmp_li = lib.ddot(
                        orboa.T,
                        tmp_i.reshape(nocca * (i1 - i0) * (j1 - j0), nao).T)
                    tmp_li = tmp_li.reshape(nocca, nocca, (i1 - i0), (j1 - j0))
                    save('aa/%d' % count_aa, tmp_li.transpose(1, 0, 2, 3))

                    tmp_i = lib.ddot(
                        orbob.T, eri.reshape(nao, (i1 - i0) * (j1 - j0) * nao))
                    tmp_li = lib.ddot(
                        orbob.T,
                        tmp_i.reshape(noccb * (i1 - i0) * (j1 - j0), nao).T)
                    tmp_li = tmp_li.reshape(noccb, noccb, (i1 - i0), (j1 - j0))
                    save('bb/%d' % count_aa, tmp_li.transpose(1, 0, 2, 3))
                    aa_blk_slices.append((i0, i1, j0, j1))
                    count_aa += 1

                time1 = log.timer_debug1(
                    'partial ao2mo [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                    *time1)
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)
    eri = eribuf = tmp_i = tmp_li = None

    fovov = feri.create_dataset('ovov', (nocca * nvira, nocca * nvira),
                                'f8',
                                chunks=(nvira, nvira))
    fovOV = feri.create_dataset('ovOV', (nocca * nvira, noccb * nvirb),
                                'f8',
                                chunks=(nvira, nvirb))
    fOVOV = feri.create_dataset('OVOV', (noccb * nvirb, noccb * nvirb),
                                'f8',
                                chunks=(nvirb, nvirb))
    occblk = int(
        min(max(nocca, noccb),
            max(4, 250 / nocca, max_memory * .9e6 / 8 / (nao**2 * nocca) / 5)))

    def load_aa(h5g, nocc, i0, eri):
        if i0 < nocc:
            i1 = min(i0 + occblk, nocc)
            for k, (p0, p1, q0, q1) in enumerate(aa_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = h5g[str(k)][i0:i1]
                if p0 != q0:
                    dat = numpy.asarray(h5g[str(k)][:, i0:i1])
                    eri[:i1 - i0, :, q0:q1, p0:p1] = dat.transpose(1, 0, 3, 2)

    def load_ab(h5g, nocca, i0, eri):
        if i0 < nocca:
            i1 = min(i0 + occblk, nocca)
            for k, (p0, p1, q0, q1) in enumerate(ab_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = h5g[str(k)][i0:i1]

    def save(h5dat, nvir, i0, i1, dat):
        for i in range(i0, i1):
            h5dat[i * nvir:(i + 1) * nvir] = dat[i - i0].reshape(nvir, -1)

    with lib.call_in_background(save) as bsave:
        with lib.call_in_background(load_aa) as prefetch:
            buf_prefecth = numpy.empty((occblk, nocca, nao, nao))
            buf = numpy.empty_like(buf_prefecth)
            load_aa(ftmp['aa'], nocca, 0, buf_prefecth)
            for i0, i1 in lib.prange(0, nocca, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(ftmp['aa'], nocca, i1, buf_prefecth)
                eri = buf[:i1 - i0].reshape((i1 - i0) * nocca, nao, nao)
                dat = _ao2mo.nr_e2(eri, orbva, (0, nvira, 0, nvira), 's1',
                                   's1')
                bsave(
                    fovov, nvira, i0, i1,
                    dat.reshape(i1 - i0, nocca, nvira,
                                nvira).transpose(0, 2, 1, 3))
                time1 = log.timer_debug1(
                    'pass2 ao2mo for aa [%d:%d]' % (i0, i1), *time1)

            buf_prefecth = numpy.empty((occblk, noccb, nao, nao))
            buf = numpy.empty_like(buf_prefecth)
            load_aa(ftmp['bb'], noccb, 0, buf_prefecth)
            for i0, i1 in lib.prange(0, noccb, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(ftmp['bb'], noccb, i1, buf_prefecth)
                eri = buf[:i1 - i0].reshape((i1 - i0) * noccb, nao, nao)
                dat = _ao2mo.nr_e2(eri, orbvb, (0, nvirb, 0, nvirb), 's1',
                                   's1')
                bsave(
                    fOVOV, nvirb, i0, i1,
                    dat.reshape(i1 - i0, noccb, nvirb,
                                nvirb).transpose(0, 2, 1, 3))
                time1 = log.timer_debug1(
                    'pass2 ao2mo for bb [%d:%d]' % (i0, i1), *time1)

        orbvab = numpy.asarray(numpy.hstack((orbva, orbvb)), order='F')
        with lib.call_in_background(load_ab) as prefetch:
            load_ab(ftmp['ab'], nocca, 0, buf_prefecth)
            for i0, i1 in lib.prange(0, nocca, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(ftmp['ab'], nocca, i1, buf_prefecth)
                eri = buf[:i1 - i0].reshape((i1 - i0) * noccb, nao, nao)
                dat = _ao2mo.nr_e2(eri, orbvab,
                                   (0, nvira, nvira, nvira + nvirb), 's1',
                                   's1')
                bsave(
                    fovOV, nvira, i0, i1,
                    dat.reshape(i1 - i0, noccb, nvira,
                                nvirb).transpose(0, 2, 1, 3))
                time1 = log.timer_debug1(
                    'pass2 ao2mo for ab [%d:%d]' % (i0, i1), *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
Beispiel #12
0
def kernel(mycc, eris, t1=None, t2=None, verbose=logger.NOTE):
    cpu1 = cpu0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mycc, verbose)
    if t1 is None: t1 = mycc.t1
    if t2 is None: t2 = mycc.t2

    nocc, nvir = t1.shape
    nmo = nocc + nvir

    dtype = numpy.result_type(t1, t2, eris.ovoo.dtype)
    if mycc.incore_complete:
        ftmp = None
        eris_vvop = numpy.zeros((nvir, nvir, nocc, nmo), dtype)
    else:
        ftmp = lib.H5TmpFile()
        eris_vvop = ftmp.create_dataset('vvop', (nvir, nvir, nocc, nmo), dtype)

    orbsym = _sort_eri(mycc, eris, nocc, nvir, eris_vvop, log)

    mo_energy, t1T, t2T, vooo, fvo, restore_t2_inplace = \
            _sort_t2_vooo_(mycc, orbsym, t1, t2, eris)
    cpu1 = log.timer_debug1('CCSD(T) sort_eri', *cpu1)

    cpu2 = list(cpu1)
    orbsym = numpy.hstack(
        (numpy.sort(orbsym[:nocc]), numpy.sort(orbsym[nocc:])))
    o_ir_loc = numpy.append(
        0, numpy.cumsum(numpy.bincount(orbsym[:nocc], minlength=8)))
    v_ir_loc = numpy.append(
        0, numpy.cumsum(numpy.bincount(orbsym[nocc:], minlength=8)))
    o_sym = orbsym[:nocc]
    oo_sym = (o_sym[:, None] ^ o_sym).ravel()
    oo_ir_loc = numpy.append(0,
                             numpy.cumsum(numpy.bincount(oo_sym, minlength=8)))
    nirrep = max(oo_sym) + 1

    orbsym = orbsym.astype(numpy.int32)
    o_ir_loc = o_ir_loc.astype(numpy.int32)
    v_ir_loc = v_ir_loc.astype(numpy.int32)
    oo_ir_loc = oo_ir_loc.astype(numpy.int32)
    if dtype == numpy.complex:
        drv = _ccsd.libcc.CCsd_t_zcontract
    else:
        drv = _ccsd.libcc.CCsd_t_contract
    et_sum = numpy.zeros(1, dtype=dtype)

    def contract(a0, a1, b0, b1, cache):
        cache_row_a, cache_col_a, cache_row_b, cache_col_b = cache
        drv(et_sum.ctypes.data_as(ctypes.c_void_p),
            mo_energy.ctypes.data_as(ctypes.c_void_p),
            t1T.ctypes.data_as(ctypes.c_void_p),
            t2T.ctypes.data_as(ctypes.c_void_p),
            vooo.ctypes.data_as(ctypes.c_void_p),
            fvo.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nocc),
            ctypes.c_int(nvir), ctypes.c_int(a0), ctypes.c_int(a1),
            ctypes.c_int(b0), ctypes.c_int(b1), ctypes.c_int(nirrep),
            o_ir_loc.ctypes.data_as(ctypes.c_void_p),
            v_ir_loc.ctypes.data_as(ctypes.c_void_p),
            oo_ir_loc.ctypes.data_as(ctypes.c_void_p),
            orbsym.ctypes.data_as(ctypes.c_void_p),
            cache_row_a.ctypes.data_as(ctypes.c_void_p),
            cache_col_a.ctypes.data_as(ctypes.c_void_p),
            cache_row_b.ctypes.data_as(ctypes.c_void_p),
            cache_col_b.ctypes.data_as(ctypes.c_void_p))
        cpu2[:] = log.timer_debug1('contract %d:%d,%d:%d' % (a0, a1, b0, b1),
                                   *cpu2)

    # The rest 20% memory for cache b
    mem_now = lib.current_memory()[0]
    max_memory = max(0, mycc.max_memory - mem_now)
    bufsize = (max_memory * .5e6 / 8 - nocc**3 * 3 * lib.num_threads()) / (
        nocc * nmo)  #*.5 for async_io
    bufsize *= .5  #*.5 upper triangular part is loaded
    bufsize *= .8  #*.8 for [a0:a1]/[b0:b1] partition
    bufsize = max(8, bufsize)
    log.debug('max_memory %d MB (%d MB in use)', max_memory, mem_now)
    with lib.call_in_background(contract,
                                sync=not mycc.async_io) as async_contract:
        for a0, a1 in reversed(list(lib.prange_tril(0, nvir, bufsize))):
            cache_row_a = numpy.asarray(eris_vvop[a0:a1, :a1], order='C')
            if a0 == 0:
                cache_col_a = cache_row_a
            else:
                cache_col_a = numpy.asarray(eris_vvop[:a0, a0:a1], order='C')
            async_contract(
                a0, a1, a0, a1,
                (cache_row_a, cache_col_a, cache_row_a, cache_col_a))

            for b0, b1 in lib.prange_tril(0, a0, bufsize / 8):
                cache_row_b = numpy.asarray(eris_vvop[b0:b1, :b1], order='C')
                if b0 == 0:
                    cache_col_b = cache_row_b
                else:
                    cache_col_b = numpy.asarray(eris_vvop[:b0, b0:b1],
                                                order='C')
                async_contract(
                    a0, a1, b0, b1,
                    (cache_row_a, cache_col_a, cache_row_b, cache_col_b))

    t2 = restore_t2_inplace(t2T)
    et_sum *= 2
    if abs(et_sum[0].imag) > 1e-4:
        logger.warn(mycc,
                    'Non-zero imaginary part of CCSD(T) energy was found %s',
                    et_sum[0])
    et = et_sum[0].real
    log.timer('CCSD(T)', *cpu0)
    log.note('CCSD(T) correction = %.15g', et)
    return et
Beispiel #13
0
def half_e1(mol, mo_coeffs, swapfile,
            intor='int2e', aosym='s4', comp=1,
            max_memory=MAX_MEMORY, ioblk_size=IOBLK_SIZE, verbose=logger.WARN,
            compact=True, ao2mopt=None):
    r'''Half transform arbitrary spherical AO integrals to MO integrals
    for the given two sets of orbitals

    Args:
        mol : :class:`Mole` object
            AO integrals will be generated in terms of mol._atm, mol._bas, mol._env
        mo_coeff : ndarray
            Transform (ij|kl) with the same set of orbitals.
        swapfile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.  The transformed
            integrals are saved in blocks.

    Kwargs
        intor : str
            Name of the 2-electron integral.  Ref to :func:`getints_by_shell`
            for the complete list of available 2-electron integral names
        aosym : int or str
            Permutation symmetry for the AO integrals

            | 4 or '4' or 's4': 4-fold symmetry (default)
            | '2ij' or 's2ij' : symmetry between i, j in (ij|kl)
            | '2kl' or 's2kl' : symmetry between k, l in (ij|kl)
            | 1 or '1' or 's1': no symmetry
            | 'a4ij' : 4-fold symmetry with anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a4kl' : 4-fold symmetry with anti-symmetry between k, l in (ij|kl) (TODO)
            | 'a2ij' : anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a2kl' : anti-symmetry between k, l in (ij|kl) (TODO)

        comp : int
            Components of the integrals, e.g. int2e_ip_sph has 3 components.
        verbose : int
            Print level
        max_memory : float or int
            The maximum size of cache to use (in MB), large cache may **not**
            improve performance.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        verbose : int
            Print level
        compact : bool
            When compact is True, depending on the four oribital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals
        ao2mopt : :class:`AO2MOpt` object
            Precomputed data to improve perfomance

    Returns:
        None

    '''
    if any(c.dtype == numpy.complex128 for c in mo_coeffs):
        raise NotImplementedError('Integral transformation for complex orbitals')

    intor = mol._add_suffix(intor)
    time0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mol, verbose)

    nao = mo_coeffs[0].shape[0]
    aosym = _stand_sym_code(aosym)
    if aosym in ('s4', 's2ij'):
        nao_pair = nao * (nao+1) // 2
    else:
        nao_pair = nao * nao

    ijmosym, nij_pair, moij, ijshape = \
            incore._conc_mos(mo_coeffs[0], mo_coeffs[1],
                             compact and aosym in ('s4', 's2ij'))

    e1buflen, mem_words, iobuf_words, ioblk_words = \
            guess_e1bufsize(max_memory, ioblk_size, nij_pair, nao_pair, comp)
    ioblk_size = ioblk_words * 8/1e6
# The buffer to hold AO integrals in C code, see line (@)
    aobuflen = max(int((mem_words - 2*comp*e1buflen*nij_pair) // (nao_pair*comp)),
                   IOBUF_ROW_MIN)
    ao_loc = mol.ao_loc_nr('_cart' in intor)
    shranges = guess_shell_ranges(mol, (aosym in ('s4', 's2kl')), e1buflen,
                                  aobuflen, ao_loc)
    if ao2mopt is None:
        if intor == 'int2e_cart' or intor == 'int2e_sph':
            ao2mopt = _ao2mo.AO2MOpt(mol, intor, 'CVHFnr_schwarz_cond',
                                     'CVHFsetnr_direct_scf')
        else:
            ao2mopt = _ao2mo.AO2MOpt(mol, intor)

    if isinstance(swapfile, h5py.Group):
        fswap = swapfile
    else:
        fswap = lib.H5TmpFile(swapfile)
    for icomp in range(comp):
        fswap.create_group(str(icomp)) # for h5py old version

    log.debug('step1: tmpfile %s  %.8g MB', fswap.filename, nij_pair*nao_pair*8/1e6)
    log.debug('step1: (ij,kl) = (%d,%d), mem cache %.8g MB, iobuf %.8g MB',
              nij_pair, nao_pair, mem_words*8/1e6, iobuf_words*8/1e6)
    nstep = len(shranges)
    e1buflen = max([x[2] for x in shranges])

    e2buflen, chunks = guess_e2bufsize(ioblk_size, nij_pair, e1buflen)
    def save(istep, iobuf):
        for icomp in range(comp):
            _transpose_to_h5g(fswap, '%d/%d'%(icomp,istep), iobuf[icomp],
                              e2buflen, None)

    # transform e1
    ti0 = log.timer('Initializing ao2mo.outcore.half_e1', *time0)
    with lib.call_in_background(save) as async_write:
        buf1 = numpy.empty((comp*e1buflen,nao_pair))
        buf2 = numpy.empty((comp*e1buflen,nij_pair))
        buf_write = numpy.empty_like(buf2)
        fill = _ao2mo.nr_e1fill
        f_e1 = _ao2mo.nr_e1
        for istep,sh_range in enumerate(shranges):
            log.debug1('step 1 [%d/%d], AO [%d:%d], len(buf) = %d',
                       istep+1, nstep, *(sh_range[:3]))
            buflen = sh_range[2]
            iobuf = numpy.ndarray((comp,buflen,nij_pair), buffer=buf2)
            nmic = len(sh_range[3])
            p1 = 0
            for imic, aoshs in enumerate(sh_range[3]):
                log.debug2('      fill iobuf micro [%d/%d], AO [%d:%d], len(aobuf) = %d',
                           imic+1, nmic, *aoshs)
                buf = fill(intor, aoshs, mol._atm, mol._bas, mol._env,
                           aosym, comp, ao2mopt, out=buf1).reshape(-1,nao_pair)
                buf = f_e1(buf, moij, ijshape, aosym, ijmosym)
                p0, p1 = p1, p1 + aoshs[2]
                iobuf[:,p0:p1] = buf.reshape(comp,aoshs[2],nij_pair)
            ti0 = log.timer_debug1('gen AO/transform MO [%d/%d]'%(istep+1,nstep), *ti0)

            async_write(istep, iobuf)
            buf2, buf_write = buf_write, buf2

    fswap = None
    return swapfile
Beispiel #14
0
    def make_kpt(uniq_kptji_id):  # kpt = kptj - kpti
        kpt = uniq_kpts[uniq_kptji_id]
        log.debug1('kpt = %s', kpt)
        adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0]
        adapted_kptjs = kptjs[adapted_ji_idx]
        nkptj = len(adapted_kptjs)
        log.debug1('adapted_ji_idx = %s', adapted_ji_idx)

        shls_slice = (auxcell.nbas, fused_cell.nbas)
        Gaux = ft_ao.ft_ao(fused_cell, Gv, shls_slice, b, gxyz, Gvbase, kpt)
        if (cell.dimension == 1 or cell.dimension == 2) and is_zero(kpt):
            G0idx, SI_on_z = pbcgto.cell._SI_for_uniform_model_charge(cell, Gv)
            s = plain_ints[-Gaux.shape[1]:]  # Only compensated Gaussians
            Gaux[G0idx] -= numpy.einsum('g,i->gi', SI_on_z, s)

        wcoulG = mydf.weighted_coulG(kpt, False, mesh)
        Gaux *= wcoulG.reshape(-1,1)
        kLR = Gaux.real.copy('C')
        kLI = Gaux.imag.copy('C')
        Gaux = None
        j2c = numpy.asarray(fswap['j2c/%d'%uniq_kptji_id])
        try:
            j2c = scipy.linalg.cholesky(j2c, lower=True)
            j2ctag = 'CD'
        except scipy.linalg.LinAlgError as e:
            #msg =('===================================\n'
            #      'J-metric not positive definite.\n'
            #      'It is likely that mesh is not enough.\n'
            #      '===================================')
            #log.error(msg)
            #raise scipy.linalg.LinAlgError('\n'.join([e.message, msg]))
            w, v = scipy.linalg.eigh(j2c)
            log.debug('DF metric linear dependency for kpt %s', uniq_kptji_id)
            log.debug('cond = %.4g, drop %d bfns',
                      w[-1]/w[0], numpy.count_nonzero(w<mydf.linear_dep_threshold))
            v = v[:,w>mydf.linear_dep_threshold].T.conj()
            v /= numpy.sqrt(w[w>mydf.linear_dep_threshold]).reshape(-1,1)
            j2c = v
            j2ctag = 'eig'
        naux0 = j2c.shape[0]

        if is_zero(kpt):  # kpti == kptj
            aosym = 's2'
            nao_pair = nao*(nao+1)//2

            vbar = mydf.auxbar(fused_cell)
            ovlp = cell.pbc_intor('int1e_ovlp', hermi=1, kpts=adapted_kptjs)
            ovlp = [lib.pack_tril(s) for s in ovlp]
        else:
            aosym = 's1'
            nao_pair = nao**2

        mem_now = lib.current_memory()[0]
        log.debug2('memory = %s', mem_now)
        max_memory = max(2000, mydf.max_memory-mem_now)
        # nkptj for 3c-coulomb arrays plus 1 Lpq array
        buflen = min(max(int(max_memory*.38e6/16/naux/(nkptj+1)), 1), nao_pair)
        shranges = _guess_shell_ranges(cell, buflen, aosym)
        buflen = max([x[2] for x in shranges])
        # +1 for a pqkbuf
        if aosym == 's2':
            Gblksize = max(16, int(max_memory*.1e6/16/buflen/(nkptj+1)))
        else:
            Gblksize = max(16, int(max_memory*.2e6/16/buflen/(nkptj+1)))
        Gblksize = min(Gblksize, ngrids, 16384)
        pqkRbuf = numpy.empty(buflen*Gblksize)
        pqkIbuf = numpy.empty(buflen*Gblksize)
        # buf for ft_aopair
        buf = numpy.empty(nkptj*buflen*Gblksize, dtype=numpy.complex128)
        def pw_contract(istep, sh_range, j3cR, j3cI):
            bstart, bend, ncol = sh_range
            if aosym == 's2':
                shls_slice = (bstart, bend, 0, bend)
            else:
                shls_slice = (bstart, bend, 0, cell.nbas)

            for p0, p1 in lib.prange(0, ngrids, Gblksize):
                dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym,
                                            b, gxyz[p0:p1], Gvbase, kpt,
                                            adapted_kptjs, out=buf)

                if (cell.dimension == 1 or cell.dimension == 2) and is_zero(kpt):
                    G0idx, SI_on_z = pbcgto.cell._SI_for_uniform_model_charge(cell, Gv[p0:p1])
                    if SI_on_z.size > 0:
                        for k, aoao in enumerate(dat):
                            aoao[G0idx] -= numpy.einsum('g,i->gi', SI_on_z, ovlp[k])
                            aux = fuse(ft_ao.ft_ao(fused_cell, Gv[p0:p1][G0idx]).T)
                            vG_mod = numpy.einsum('ig,g,g->i', aux.conj(),
                                                  wcoulG[p0:p1][G0idx], SI_on_z)
                            if gamma_point(adapted_kptjs[k]):
                                j3cR[k][:naux] -= vG_mod[:,None].real * ovlp[k]
                            else:
                                tmp = vG_mod[:,None] * ovlp[k]
                                j3cR[k][:naux] -= tmp.real
                                j3cI[k][:naux] -= tmp.imag
                            tmp = aux = vG_mod

                nG = p1 - p0
                for k, ji in enumerate(adapted_ji_idx):
                    aoao = dat[k].reshape(nG,ncol)
                    pqkR = numpy.ndarray((ncol,nG), buffer=pqkRbuf)
                    pqkI = numpy.ndarray((ncol,nG), buffer=pqkIbuf)
                    pqkR[:] = aoao.real.T
                    pqkI[:] = aoao.imag.T

                    lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k][naux:], 1)
                    lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k][naux:], 1)
                    if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])):
                        lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k][naux:], 1)
                        lib.dot(kLI[p0:p1].T, pqkR.T,  1, j3cI[k][naux:], 1)

            for k, ji in enumerate(adapted_ji_idx):
                if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                    v = fuse(j3cR[k])
                else:
                    v = fuse(j3cR[k] + j3cI[k] * 1j)
                if j2ctag == 'CD':
                    v = scipy.linalg.solve_triangular(j2c, v, lower=True, overwrite_b=True)
                else:
                    v = lib.dot(j2c, v)
                feri['j3c/%d/%d'%(ji,istep)] = v

        with lib.call_in_background(pw_contract) as compute:
            col1 = 0
            for istep, sh_range in enumerate(shranges):
                log.debug1('int3c2e [%d/%d], AO [%d:%d], ncol = %d', \
                           istep+1, len(shranges), *sh_range)
                bstart, bend, ncol = sh_range
                col0, col1 = col1, col1+ncol
                j3cR = []
                j3cI = []
                for k, idx in enumerate(adapted_ji_idx):
                    v = numpy.vstack([fswap['j3c-junk/%d/%d'%(idx,i)][0,col0:col1].T
                                      for i in range(nsegs)])
                    if is_zero(kpt) and cell.dimension == 3:
                        for i in numpy.where(vbar != 0)[0]:
                            v[i] -= vbar[i] * ovlp[k][col0:col1]
                    j3cR.append(numpy.asarray(v.real, order='C'))
                    if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                        j3cI.append(None)
                    else:
                        j3cI.append(numpy.asarray(v.imag, order='C'))
                v = None
                compute(istep, sh_range, j3cR, j3cI)
        for ji in adapted_ji_idx:
            del(fswap['j3c-junk/%d'%ji])
Beispiel #15
0
def _make_eris(mp, mo_coeff=None, verbose=None):
    log = logger.new_logger(mp, verbose)
    time0 = (time.clock(), time.time())

    log.debug('transform (ia|jb) outcore')
    mol = mp.mol
    nocc = mp.nocc
    nmo = mp.nmo
    nvir = nmo - nocc

    eris = mp2._ChemistsERIs(mp, mo_coeff)
    nao = eris.mo_coeff.shape[0]
    assert (nvir <= nao)
    orbo = eris.mo_coeff[:, :nocc]
    orbv = numpy.asarray(eris.mo_coeff[:, nocc:], order='F')
    eris.feri = lib.H5TmpFile()

    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    fint = gto.moleintor.getints4c

    ntasks = mpi.pool.size
    olocs = [_task_location(nocc, task_id) for task_id in range(ntasks)]
    oloc0, oloc1 = olocs[rank]
    nocc_seg = oloc1 - oloc0
    log.debug2('olocs %s', olocs)

    ao_loc = mol.ao_loc_nr()
    task_sh_locs = lib.misc._balanced_partition(ao_loc, ntasks)
    log.debug2('task_sh_locs %s', task_sh_locs)
    ao_sh0 = task_sh_locs[rank]
    ao_sh1 = task_sh_locs[rank + 1]
    ao_loc0 = ao_loc[ao_sh0]
    ao_loc1 = ao_loc[ao_sh1]
    nao_seg = ao_loc1 - ao_loc0
    orbo_seg = orbo[ao_loc0:ao_loc1]

    mem_now = lib.current_memory()[0]
    max_memory = max(0, mp.max_memory - mem_now)
    dmax = numpy.sqrt(max_memory * .9e6 / 8 / ((nao + nocc) *
                                               (nao_seg + nocc)))
    dmax = min(nao // 4 + 2, max(BLKMIN, min(comm.allgather(dmax))))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    sh_ranges = comm.bcast(sh_ranges)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao, dmax, dmax, nao_seg))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax,
              nocc * nocc_seg * (nao * (nao + dmax) / 2 + nvir**2) * 8 / 1e6)

    def save(count, tmp_xo):
        di, dj = tmp_xo.shape[2:4]
        tmp_xo = [tmp_xo[p0:p1] for p0, p1 in olocs]
        tmp_xo = mpi.alltoall(tmp_xo, split_recvbuf=True)
        tmp_xo = sum(tmp_xo).reshape(nocc_seg, nocc, di, dj)
        ftmp[str(count) + 'b'] = tmp_xo

        tmp_ox = mpi.alltoall([tmp_xo[:, p0:p1] for p0, p1 in olocs],
                              split_recvbuf=True)
        tmp_ox = [
            tmp_ox[i].reshape(p1 - p0, nocc_seg, di, dj)
            for i, (p0, p1) in enumerate(olocs)
        ]
        ftmp[str(count) + 'a'] = numpy.vstack(tmp_ox)

    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(save) as bg_save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            for jsh0, jsh1, nj in sh_ranges[:ip + 1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0, i1, j0, j1))

                shls_slice = (0, mol.nbas, ish0, ish1, jsh0, jsh1, ao_sh0,
                              ao_sh1)
                eri = fint(int2e,
                           mol._atm,
                           mol._bas,
                           mol._env,
                           shls_slice=shls_slice,
                           aosym='s1',
                           ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt,
                           out=eribuf)
                tmp_xo = lib.einsum('pi,pqrs->iqrs', orbo, eri)
                tmp_xo = lib.einsum('iqrs,sl->ilqr', tmp_xo, orbo_seg)
                bg_save(count, tmp_xo)
                tmp_xo = None
                count += 1
                time1 = log.timer_debug1(
                    'partial ao2mo [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                    *time1)
    eri = eribuf = None
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)

    eris.ovov = eris.feri.create_dataset('ovov', (nocc, nvir, nocc_seg, nvir),
                                         'f8')
    occblk = int(
        min(nocc,
            max(BLKMIN, max_memory * .9e6 / 8 / (nao**2 * nocc_seg + 1) / 5)))

    def load(i0, eri):
        if i0 < nocc:
            i1 = min(i0 + occblk, nocc)
            for k, (p0, p1, q0, q1) in enumerate(jk_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = ftmp[str(k) + 'a'][i0:i1]
                if p0 != q0:
                    dat = numpy.asarray(ftmp[str(k) + 'b'][:, i0:i1])
                    eri[:i1 - i0, :, q0:q1, p0:p1] = dat.transpose(1, 0, 3, 2)

    def save(i0, i1, dat):
        eris.ovov[i0:i1] = dat

    buf_prefecth = numpy.empty((occblk, nocc_seg, nao, nao))
    buf = numpy.empty_like(buf_prefecth)
    bufw = numpy.empty((occblk * nocc_seg, nvir**2))
    bufw1 = numpy.empty_like(bufw)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as bsave:
            load(0, buf_prefecth)
            for i0, i1 in lib.prange(0, nocc, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(i1, buf_prefecth)
                eri = buf[:i1 - i0].reshape((i1 - i0) * nocc_seg, nao, nao)

                dat = _ao2mo.nr_e2(eri,
                                   orbv, (0, nvir, 0, nvir),
                                   's1',
                                   's1',
                                   out=bufw)
                bsave(
                    i0, i1,
                    dat.reshape(i1 - i0, nocc_seg, nvir,
                                nvir).transpose(0, 2, 1, 3))
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0, i1),
                                         *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    mp._eris = eris
    return eris
Beispiel #16
0
    def make_kpt(uniq_kptji_id):  # kpt = kptj - kpti
        kpt = uniq_kpts[uniq_kptji_id]
        log.debug1('kpt = %s', kpt)
        adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0]
        adapted_kptjs = kptjs[adapted_ji_idx]
        nkptj = len(adapted_kptjs)
        log.debug1('adapted_ji_idx = %s', adapted_ji_idx)

        Gaux = ft_ao.ft_ao(fused_cell, Gv, None, b, gxyz, Gvbase, kpt).T
        Gaux = fuse(Gaux)
        Gaux *= mydf.weighted_coulG(kpt, False, mesh)
        kLR = Gaux.T.real.copy('C')
        kLI = Gaux.T.imag.copy('C')
        j2c = numpy.asarray(fswap['j2c/%d' % uniq_kptji_id])
        # Note large difference may be found in results between the CD/eig treatments.
        # In some systems, small integral errors can lead to different treatments of
        # linear dependency which can be observed in the total energy/orbital energy
        # around 4th decimal place.
        #        try:
        #            j2c = scipy.linalg.cholesky(j2c, lower=True)
        #            j2ctag = 'CD'
        #        except scipy.linalg.LinAlgError as e:
        #
        # Abandon CD treatment for better numerical stablity
        w, v = scipy.linalg.eigh(j2c)
        log.debug('MDF metric for kpt %s cond = %.4g, drop %d bfns',
                  uniq_kptji_id, w[-1] / w[0],
                  numpy.count_nonzero(w < mydf.linear_dep_threshold))
        v = v[:, w > mydf.linear_dep_threshold].T.conj()
        v /= numpy.sqrt(w[w > mydf.linear_dep_threshold]).reshape(-1, 1)
        j2c = v
        j2ctag = 'eig'
        naux0 = j2c.shape[0]

        if is_zero(kpt):  # kpti == kptj
            aosym = 's2'
            nao_pair = nao * (nao + 1) // 2

            vbar = fuse(mydf.auxbar(fused_cell))
            ovlp = cell.pbc_intor('int1e_ovlp', hermi=1, kpts=adapted_kptjs)
            for k, ji in enumerate(adapted_ji_idx):
                ovlp[k] = lib.pack_tril(ovlp[k])
        else:
            aosym = 's1'
            nao_pair = nao**2

        mem_now = lib.current_memory()[0]
        log.debug2('memory = %s', mem_now)
        max_memory = max(2000, mydf.max_memory - mem_now)
        # nkptj for 3c-coulomb arrays plus 1 Lpq array
        buflen = min(max(int(max_memory * .38e6 / 16 / naux / (nkptj + 1)), 1),
                     nao_pair)
        shranges = _guess_shell_ranges(cell, buflen, aosym)
        buflen = max([x[2] for x in shranges])
        # +1 for a pqkbuf
        if aosym == 's2':
            Gblksize = max(16,
                           int(max_memory * .1e6 / 16 / buflen / (nkptj + 1)))
        else:
            Gblksize = max(16,
                           int(max_memory * .2e6 / 16 / buflen / (nkptj + 1)))
        Gblksize = min(Gblksize, ngrids, 16384)
        pqkRbuf = numpy.empty(buflen * Gblksize)
        pqkIbuf = numpy.empty(buflen * Gblksize)
        # buf for ft_aopair
        buf = numpy.empty((nkptj, buflen * Gblksize), dtype=numpy.complex128)

        def pw_contract(istep, sh_range, j3cR, j3cI):
            bstart, bend, ncol = sh_range
            if aosym == 's2':
                shls_slice = (bstart, bend, 0, bend)
            else:
                shls_slice = (bstart, bend, 0, cell.nbas)

            for p0, p1 in lib.prange(0, ngrids, Gblksize):
                dat = ft_ao._ft_aopair_kpts(cell,
                                            Gv[p0:p1],
                                            shls_slice,
                                            aosym,
                                            b,
                                            gxyz[p0:p1],
                                            Gvbase,
                                            kpt,
                                            adapted_kptjs,
                                            out=buf)
                nG = p1 - p0
                for k, ji in enumerate(adapted_ji_idx):
                    aoao = dat[k].reshape(nG, ncol)
                    pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf)
                    pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf)
                    pqkR[:] = aoao.real.T
                    pqkI[:] = aoao.imag.T

                    lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k], 1)
                    lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k], 1)
                    if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])):
                        lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k], 1)
                        lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k], 1)

            for k, ji in enumerate(adapted_ji_idx):
                if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                    v = j3cR[k]
                else:
                    v = j3cR[k] + j3cI[k] * 1j
                if j2ctag == 'CD':
                    v = scipy.linalg.solve_triangular(j2c,
                                                      v,
                                                      lower=True,
                                                      overwrite_b=True)
                else:
                    v = lib.dot(j2c, v)
                feri['j3c/%d/%d' % (ji, istep)] = v

        with lib.call_in_background(pw_contract) as compute:
            col1 = 0
            for istep, sh_range in enumerate(shranges):
                log.debug1('int3c2e [%d/%d], AO [%d:%d], ncol = %d', \
                           istep+1, len(shranges), *sh_range)
                bstart, bend, ncol = sh_range
                col0, col1 = col1, col1 + ncol
                j3cR = []
                j3cI = []
                for k, idx in enumerate(adapted_ji_idx):
                    v = [
                        feri['j3c-junk/%d/%d' % (idx, i)][0, col0:col1].T
                        for i in range(nsegs)
                    ]
                    v = fuse(numpy.vstack(v))
                    if is_zero(kpt) and cell.dimension == 3:
                        for i, c in enumerate(vbar):
                            if c != 0:
                                v[i] -= c * ovlp[k][col0:col1]
                    j3cR.append(numpy.asarray(v.real, order='C'))
                    if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                        j3cI.append(None)
                    else:
                        j3cI.append(numpy.asarray(v.imag, order='C'))
                    v = None
                compute(istep, sh_range, j3cR, j3cI)
        for ji in adapted_ji_idx:
            del (feri['j3c-junk/%d' % ji])
Beispiel #17
0
def _aux_e2(cell, auxcell, erifile, intor='int3c2e', aosym='s2ij', comp=None,
            kptij_lst=None, dataname='eri_mo', shls_slice=None, max_memory=2000,
            verbose=0):
    r'''3-center AO integrals (ij|L) with double lattice sum:
    \sum_{lm} (i[l]j[m]|L[0]), where L is the auxiliary basis.
    Three-index integral tensor (kptij_idx, nao_pair, naux) or four-index
    integral tensor (kptij_idx, comp, nao_pair, naux) are stored on disk.

    **This function should be only used by df and mdf initialization function
    _make_j3c**

    Args:
        kptij_lst : (*,2,3) array
            A list of (kpti, kptj)
    '''
    intor, comp = gto.moleintor._get_intor_and_comp(cell._add_suffix(intor), comp)

    if isinstance(erifile, h5py.Group):
        feri = erifile
    elif h5py.is_hdf5(erifile):
        feri = h5py.File(erifile)
    else:
        feri = h5py.File(erifile, 'w')
    if dataname in feri:
        del(feri[dataname])
    if dataname+'-kptij' in feri:
        del(feri[dataname+'-kptij'])

    if kptij_lst is None:
        kptij_lst = numpy.zeros((1,2,3))
    feri[dataname+'-kptij'] = kptij_lst

    if shls_slice is None:
        shls_slice = (0, cell.nbas, 0, cell.nbas, 0, auxcell.nbas)

    ao_loc = cell.ao_loc_nr()
    aux_loc = auxcell.ao_loc_nr(auxcell.cart or 'ssc' in intor)[:shls_slice[5]+1]
    ni = ao_loc[shls_slice[1]] - ao_loc[shls_slice[0]]
    nj = ao_loc[shls_slice[3]] - ao_loc[shls_slice[2]]
    naux = aux_loc[shls_slice[5]] - aux_loc[shls_slice[4]]
    nkptij = len(kptij_lst)

    nii = (ao_loc[shls_slice[1]]*(ao_loc[shls_slice[1]]+1)//2 -
           ao_loc[shls_slice[0]]*(ao_loc[shls_slice[0]]+1)//2)
    nij = ni * nj

    kpti = kptij_lst[:,0]
    kptj = kptij_lst[:,1]
    aosym_ks2 = abs(kpti-kptj).sum(axis=1) < KPT_DIFF_TOL
    j_only = numpy.all(aosym_ks2)
    #aosym_ks2 &= (aosym[:2] == 's2' and shls_slice[:2] == shls_slice[2:4])
    aosym_ks2 &= aosym[:2] == 's2'

    if j_only and aosym[:2] == 's2':
        assert(shls_slice[2] == 0)
        nao_pair = nii
    else:
        nao_pair = nij

    if gamma_point(kptij_lst):
        dtype = numpy.double
    else:
        dtype = numpy.complex128

    buflen = max(8, int(max_memory*.47e6/16/(nkptij*ni*nj*comp)))
    auxdims = aux_loc[shls_slice[4]+1:shls_slice[5]+1] - aux_loc[shls_slice[4]:shls_slice[5]]
    auxranges = balance_segs(auxdims, buflen)
    buflen = max([x[2] for x in auxranges])
    buf = numpy.empty(nkptij*comp*ni*nj*buflen, dtype=dtype)
    buf1 = numpy.empty_like(buf)

    int3c = wrap_int3c(cell, auxcell, intor, aosym, comp, kptij_lst)

    kptis = kptij_lst[:,0]
    kptjs = kptij_lst[:,1]
    kpt_ji = kptjs - kptis
    uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
# sorted_ij_idx: Sort and group the kptij_lst according to the ordering in
# df._make_j3c to reduce the data fragment in the hdf5 file.  When datasets
# are written to hdf5, they are saved sequentially. If the integral data are
# saved as the order of kptij_lst, removing the datasets in df._make_j3c will
# lead to holes that can not be reused.
    sorted_ij_idx = numpy.hstack([numpy.where(uniq_inverse == k)[0]
                                  for k, kpt in enumerate(uniq_kpts)])
    tril_idx = numpy.tril_indices(ni)
    tril_idx = tril_idx[0] * ni + tril_idx[1]
    def save(istep, mat):
        for k in sorted_ij_idx:
            v = mat[k]
            if gamma_point(kptij_lst[k]):
                v = v.real
            if aosym_ks2[k] and nao_pair == ni**2:
                v = v[:,tril_idx]
            feri['%s/%d/%d' % (dataname,k,istep)] = v

    with lib.call_in_background(save) as bsave:
        for istep, auxrange in enumerate(auxranges):
            sh0, sh1, nrow = auxrange
            sub_slice = (shls_slice[0], shls_slice[1],
                         shls_slice[2], shls_slice[3],
                         shls_slice[4]+sh0, shls_slice[4]+sh1)
            mat = numpy.ndarray((nkptij,comp,nao_pair,nrow), dtype=dtype, buffer=buf)
            bsave(istep, int3c(sub_slice, mat))
            buf, buf1 = buf1, buf

    if not isinstance(erifile, h5py.Group):
        feri.close()
    return erifile
Beispiel #18
0
def update_amps(mycc, t1, t2, eris):
    time1 = time0 = time.clock(), time.time()
    log = logger.Logger(mycc.stdout, mycc.verbose)
    cpu1 = time0

    t1T = t1.T
    t2T = numpy.asarray(t2.transpose(2,3,0,1), order='C')
    nvir_seg, nvir, nocc = t2T.shape[:3]
    t1 = t2 = None
    ntasks = mpi.pool.size
    vlocs = [_task_location(nvir, task_id) for task_id in range(ntasks)]
    vloc0, vloc1 = vlocs[rank]
    log.debug2('vlocs %s', vlocs)
    assert(vloc1-vloc0 == nvir_seg)

    fock = eris.fock
    mo_e_o = eris.mo_energy[:nocc]
    mo_e_v = eris.mo_energy[nocc:] + mycc.level_shift

    def _rotate_vir_block(buf):
        for task_id, buf in _rotate_tensor_block(buf):
            loc0, loc1 = vlocs[task_id]
            yield task_id, buf, loc0, loc1

    fswap = lib.H5TmpFile()
    wVooV = numpy.zeros((nvir_seg,nocc,nocc,nvir))
    eris_voov = _cp(eris.ovvo).transpose(1,0,3,2)
    tau  = t2T * .5
    tau += numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
    for task_id, tau, p0, p1 in _rotate_vir_block(tau):
        wVooV += lib.einsum('bkic,cajk->bija', eris_voov[:,:,:,p0:p1], tau)
    fswap['wVooV1'] = wVooV
    wVooV = tau = None
    time1 = log.timer_debug1('wVooV', *time1)

    wVOov = eris_voov
    eris_VOov = eris_voov - eris_voov.transpose(0,2,1,3)*.5
    tau  = t2T.transpose(2,0,3,1) - t2T.transpose(3,0,2,1)*.5
    tau -= numpy.einsum('ai,bj->jaib', t1T[vloc0:vloc1], t1T)
    for task_id, tau, p0, p1 in _rotate_vir_block(tau):
        wVOov += lib.einsum('dlkc,kcjb->dljb', eris_VOov[:,:,:,p0:p1], tau)
    fswap['wVOov1'] = wVOov
    wVOov = tau = eris_VOov = eris_voov = None
    time1 = log.timer_debug1('wVOov', *time1)

    t1Tnew = numpy.zeros_like(t1T)
    t2Tnew = mycc._add_vvvv(t1T, t2T, eris, t2sym='jiba')
    time1 = log.timer_debug1('vvvv', *time1)

#** make_inter_F
    fov = fock[:nocc,nocc:].copy()
    t1Tnew += fock[nocc:,:nocc]

    foo = fock[:nocc,:nocc] - numpy.diag(mo_e_o)
    foo += .5 * numpy.einsum('ia,aj->ij', fock[:nocc,nocc:], t1T)

    fvv = fock[nocc:,nocc:] - numpy.diag(mo_e_v)
    fvv -= .5 * numpy.einsum('ai,ib->ab', t1T, fock[:nocc,nocc:])

    foo_priv = numpy.zeros_like(foo)
    fov_priv = numpy.zeros_like(fov)
    fvv_priv = numpy.zeros_like(fvv)
    t1T_priv = numpy.zeros_like(t1T)

    max_memory = mycc.max_memory - lib.current_memory()[0]
    unit = nocc*nvir**2*3 + nocc**2*nvir + 1
    blksize = min(nvir, max(BLKMIN, int((max_memory*.9e6/8-t2T.size)/unit)))
    log.debug1('pass 1, max_memory %d MB,  nocc,nvir = %d,%d  blksize = %d',
               max_memory, nocc, nvir, blksize)

    buf = numpy.empty((blksize,nvir,nvir,nocc))
    def load_vvvo(p0):
        p1 = min(nvir_seg, p0+blksize)
        if p0 < p1:
            buf[:p1-p0] = eris.vvvo[p0:p1]
    fswap.create_dataset('wVooV', (nvir_seg,nocc,nocc,nvir), 'f8')
    wVOov = []

    with lib.call_in_background(load_vvvo) as prefetch:
        load_vvvo(0)
        for p0, p1 in lib.prange(vloc0, vloc1, blksize):
            i0, i1 = p0 - vloc0, p1 - vloc0
            eris_vvvo, buf = buf[:p1-p0], numpy.empty_like(buf)
            prefetch(i1)

            fvv_priv[p0:p1] += 2*numpy.einsum('ck,abck->ab', t1T, eris_vvvo)
            fvv_priv -= numpy.einsum('ck,cabk->ab', t1T[p0:p1], eris_vvvo)

            if not mycc.direct:
                raise NotImplementedError
                tau = t2T[i0:i1] + numpy.einsum('ai,bj->abij', t1T[p0:p1], t1T)
                for task_id, tau, q0, q1 in _rotate_vir_block(tau):
                    tmp = lib.einsum('bdck,cdij->bkij', eris_vvvo[:,:,q0:q1], tau)
                    t2Tnew -= lib.einsum('ak,bkij->baji', t1T, tmp)
                tau = tmp = None

            fswap['wVooV'][i0:i1] = lib.einsum('cj,baci->bija', -t1T, eris_vvvo)

            theta  = t2T[i0:i1].transpose(0,2,1,3) * 2
            theta -= t2T[i0:i1].transpose(0,3,1,2)
            t1T_priv += lib.einsum('bicj,bacj->ai', theta, eris_vvvo)
            wVOov.append(lib.einsum('acbi,cj->abij', eris_vvvo, t1T))
            theta = eris_vvvo = None
            time1 = log.timer_debug1('vvvo [%d:%d]'%(p0, p1), *time1)

    wVOov = numpy.vstack(wVOov)
    wVOov = mpi.alltoall([wVOov[:,q0:q1] for q0,q1 in vlocs], split_recvbuf=True)
    wVOov = numpy.vstack([x.reshape(-1,nvir_seg,nocc,nocc) for x in wVOov])
    fswap['wVOov'] = wVOov.transpose(1,2,3,0)
    wVooV = None

    unit = nocc**2*nvir*7 + nocc**3 + nocc*nvir**2
    max_memory = max(0, mycc.max_memory - lib.current_memory()[0])
    blksize = min(nvir, max(BLKMIN, int((max_memory*.9e6/8-nocc**4)/unit)))
    log.debug1('pass 2, max_memory %d MB,  nocc,nvir = %d,%d  blksize = %d',
               max_memory, nocc, nvir, blksize)

    woooo = numpy.zeros((nocc,nocc,nocc,nocc))

    for p0, p1 in lib.prange(vloc0, vloc1, blksize):
        i0, i1 = p0 - vloc0, p1 - vloc0
        wVOov = fswap['wVOov'][i0:i1]
        wVooV = fswap['wVooV'][i0:i1]
        eris_ovoo = eris.ovoo[:,i0:i1]
        eris_oovv = numpy.empty((nocc,nocc,i1-i0,nvir))
        def load_oovv(p0, p1):
            eris_oovv[:] = eris.oovv[:,:,p0:p1]
        with lib.call_in_background(load_oovv) as prefetch_oovv:
            #:eris_oovv = eris.oovv[:,:,i0:i1]
            prefetch_oovv(i0, i1)
            foo_priv += numpy.einsum('ck,kcji->ij', 2*t1T[p0:p1], eris_ovoo)
            foo_priv += numpy.einsum('ck,icjk->ij',  -t1T[p0:p1], eris_ovoo)
            tmp = lib.einsum('al,jaik->lkji', t1T[p0:p1], eris_ovoo)
            woooo += tmp + tmp.transpose(1,0,3,2)
            tmp = None

            wVOov -= lib.einsum('jbik,ak->bjia', eris_ovoo, t1T)
            t2Tnew[i0:i1] += wVOov.transpose(0,3,1,2)

            wVooV += lib.einsum('kbij,ak->bija', eris_ovoo, t1T)
            eris_ovoo = None
        load_oovv = prefetch_oovv = None

        eris_ovvo = numpy.empty((nocc,i1-i0,nvir,nocc))
        def load_ovvo(p0, p1):
            eris_ovvo[:] = eris.ovvo[:,p0:p1]
        with lib.call_in_background(load_ovvo) as prefetch_ovvo:
            #:eris_ovvo = eris.ovvo[:,i0:i1]
            prefetch_ovvo(i0, i1)
            t1T_priv[p0:p1] -= numpy.einsum('bj,jiab->ai', t1T, eris_oovv)
            wVooV -= eris_oovv.transpose(2,0,1,3)
            wVOov += wVooV*.5  #: bjia + bija*.5
        eris_voov = eris_ovvo.transpose(1,0,3,2)
        eris_ovvo = None
        load_ovvo = prefetch_ovvo = None

        def update_wVooV(i0, i1):
            wVooV[:] += fswap['wVooV1'][i0:i1]
            fswap['wVooV1'][i0:i1] = wVooV
            wVOov[:] += fswap['wVOov1'][i0:i1]
            fswap['wVOov1'][i0:i1] = wVOov
        with lib.call_in_background(update_wVooV) as update_wVooV:
            update_wVooV(i0, i1)
            t2Tnew[i0:i1] += eris_voov.transpose(0,3,1,2) * .5
            t1T_priv[p0:p1] += 2*numpy.einsum('bj,aijb->ai', t1T, eris_voov)

            tmp  = lib.einsum('ci,kjbc->bijk', t1T, eris_oovv)
            tmp += lib.einsum('bjkc,ci->bjik', eris_voov, t1T)
            t2Tnew[i0:i1] -= lib.einsum('bjik,ak->baji', tmp, t1T)
            eris_oovv = tmp = None

            fov_priv[:,p0:p1] += numpy.einsum('ck,aikc->ia', t1T, eris_voov) * 2
            fov_priv[:,p0:p1] -= numpy.einsum('ck,akic->ia', t1T, eris_voov)

            tau  = numpy.einsum('ai,bj->abij', t1T[p0:p1]*.5, t1T)
            tau += t2T[i0:i1]
            theta  = tau.transpose(0,1,3,2) * 2
            theta -= tau
            fvv_priv -= lib.einsum('caij,cjib->ab', theta, eris_voov)
            foo_priv += lib.einsum('aikb,abkj->ij', eris_voov, theta)
            tau = theta = None

            tau = t2T[i0:i1] + numpy.einsum('ai,bj->abij', t1T[p0:p1], t1T)
            woooo += lib.einsum('abij,aklb->ijkl', tau, eris_voov)
            tau = None
        eris_VOov = wVOov = wVooV = update_wVooV = None
        time1 = log.timer_debug1('voov [%d:%d]'%(p0, p1), *time1)

    wVooV = _cp(fswap['wVooV1'])
    for task_id, wVooV, p0, p1 in _rotate_vir_block(wVooV):
        tmp = lib.einsum('ackj,ckib->ajbi', t2T[:,p0:p1], wVooV)
        t2Tnew += tmp.transpose(0,2,3,1)
        t2Tnew += tmp.transpose(0,2,1,3) * .5
    wVooV = tmp = None
    time1 = log.timer_debug1('contracting wVooV', *time1)

    wVOov = _cp(fswap['wVOov1'])
    theta  = t2T * 2
    theta -= t2T.transpose(0,1,3,2)
    for task_id, wVOov, p0, p1 in _rotate_vir_block(wVOov):
        t2Tnew += lib.einsum('acik,ckjb->abij', theta[:,p0:p1], wVOov)
    wVOov = theta = None
    fswap = None
    time1 = log.timer_debug1('contracting wVOov', *time1)

    foo += mpi.allreduce(foo_priv)
    fov += mpi.allreduce(fov_priv)
    fvv += mpi.allreduce(fvv_priv)

    theta = t2T.transpose(0,1,3,2) * 2 - t2T
    t1T_priv[vloc0:vloc1] += numpy.einsum('jb,abji->ai', fov, theta)
    ovoo = _cp(eris.ovoo)
    for task_id, ovoo, p0, p1 in _rotate_vir_block(ovoo):
        t1T_priv[vloc0:vloc1] -= lib.einsum('jbki,abjk->ai', ovoo, theta[:,p0:p1])
    theta = ovoo = None

    woooo = mpi.allreduce(woooo)
    woooo += _cp(eris.oooo).transpose(0,2,1,3)
    tau = t2T + numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
    t2Tnew += .5 * lib.einsum('abkl,ijkl->abij', tau, woooo)
    tau = woooo = None

    t1Tnew += mpi.allreduce(t1T_priv)

    ft_ij = foo + numpy.einsum('aj,ia->ij', .5*t1T, fov)
    ft_ab = fvv - numpy.einsum('ai,ib->ab', .5*t1T, fov)
    t2Tnew += lib.einsum('acij,bc->abij', t2T, ft_ab)
    t2Tnew -= lib.einsum('ki,abkj->abij', ft_ij, t2T)

    eia = mo_e_o[:,None] - mo_e_v
    t1Tnew += numpy.einsum('bi,ab->ai', t1T, fvv)
    t1Tnew -= numpy.einsum('aj,ji->ai', t1T, foo)
    t1Tnew /= eia.T

    t2tmp = mpi.alltoall([t2Tnew[:,p0:p1] for p0,p1 in vlocs],
                         split_recvbuf=True)
    for task_id, (p0, p1) in enumerate(vlocs):
        tmp = t2tmp[task_id].reshape(p1-p0,nvir_seg,nocc,nocc)
        t2Tnew[:,p0:p1] += tmp.transpose(1,0,3,2)

    for i in range(vloc0, vloc1):
        t2Tnew[i-vloc0] /= lib.direct_sum('i+jb->bij', eia[:,i], eia)

    time0 = log.timer_debug1('update t1 t2', *time0)
    return t1Tnew.T, t2Tnew.transpose(2,3,0,1)
Beispiel #19
0
def _make_eris_outcore(mycc, mo_coeff=None):
    cput0 = (time.clock(), time.time())
    log = logger.Logger(mycc.stdout, mycc.verbose)
    _sync_(mycc)
    eris = ccsd._ChemistsERIs()
    if rank == 0:
        eris._common_init_(mycc, mo_coeff)
        comm.bcast((eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy))
    else:
        eris.mol = mycc.mol
        eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy = comm.bcast(None)

    mol = mycc.mol
    mo_coeff = numpy.asarray(eris.mo_coeff, order='F')
    nocc = eris.nocc
    nao, nmo = mo_coeff.shape
    nvir = nmo - nocc
    orbo = mo_coeff[:,:nocc]
    orbv = mo_coeff[:,nocc:]
    nvpair = nvir * (nvir+1) // 2
    vlocs = [_task_location(nvir, task_id) for task_id in range(mpi.pool.size)]
    vloc0, vloc1 = vlocs[rank]
    vseg = vloc1 - vloc0

    eris.feri1 = lib.H5TmpFile()
    eris.oooo = eris.feri1.create_dataset('oooo', (nocc,nocc,nocc,nocc), 'f8')
    eris.oovv = eris.feri1.create_dataset('oovv', (nocc,nocc,vseg,nvir), 'f8', chunks=(nocc,nocc,1,nvir))
    eris.ovoo = eris.feri1.create_dataset('ovoo', (nocc,vseg,nocc,nocc), 'f8', chunks=(nocc,1,nocc,nocc))
    eris.ovvo = eris.feri1.create_dataset('ovvo', (nocc,vseg,nvir,nocc), 'f8', chunks=(nocc,1,nvir,nocc))
    eris.ovov = eris.feri1.create_dataset('ovov', (nocc,vseg,nocc,nvir), 'f8', chunks=(nocc,1,nocc,nvir))
#    eris.ovvv = eris.feri1.create_dataset('ovvv', (nocc,vseg,nvpair), 'f8', chunks=(nocc,1,nvpair))
    eris.vvvo = eris.feri1.create_dataset('vvvo', (vseg,nvir,nvir,nocc), 'f8', chunks=(1,nvir,1,nocc))
    assert(mycc.direct)

    def save_occ_frac(p0, p1, eri):
        eri = eri.reshape(p1-p0,nocc,nmo,nmo)
        eris.oooo[p0:p1] = eri[:,:,:nocc,:nocc]
        eris.oovv[p0:p1] = eri[:,:,nocc+vloc0:nocc+vloc1,nocc:]

    def save_vir_frac(p0, p1, eri):
        log.alldebug1('save_vir_frac %d %d %s', p0, p1, eri.shape)
        eri = eri.reshape(p1-p0,nocc,nmo,nmo)
        eris.ovoo[:,p0:p1] = eri[:,:,:nocc,:nocc].transpose(1,0,2,3)
        eris.ovvo[:,p0:p1] = eri[:,:,nocc:,:nocc].transpose(1,0,2,3)
        eris.ovov[:,p0:p1] = eri[:,:,:nocc,nocc:].transpose(1,0,2,3)
#        vvv = lib.pack_tril(eri[:,:,nocc:,nocc:].reshape((p1-p0)*nocc,nvir,nvir))
#        eris.ovvv[:,p0:p1] = vvv.reshape(p1-p0,nocc,nvpair).transpose(1,0,2)

        cput2 = time.clock(), time.time()
        ovvv_segs = [eri[:,:,nocc+q0:nocc+q1,nocc:].transpose(2,3,0,1) for q0,q1 in vlocs]
        ovvv_segs = mpi.alltoall(ovvv_segs, split_recvbuf=True)
        cput2 = log.timer_debug1('vvvo alltoall', *cput2)
        for task_id, (q0, q1) in enumerate(comm.allgather((p0,p1))):
            ip0 = q0 + vlocs[task_id][0]
            ip1 = q1 + vlocs[task_id][0]
            eris.vvvo[:,:,ip0:ip1] = ovvv_segs[task_id].reshape(vseg,nvir,q1-q0,nocc)

    fswap = lib.H5TmpFile()
    max_memory = max(MEMORYMIN, mycc.max_memory-lib.current_memory()[0])
    int2e = mol._add_suffix('int2e')
    orbov = numpy.hstack((orbo, orbv[:,vloc0:vloc1]))
    ao2mo.outcore.half_e1(mol, (orbov,orbo), fswap, int2e,
                          's4', 1, max_memory, verbose=log)

    ao_loc = mol.ao_loc_nr()
    nao_pair = nao * (nao+1) // 2
    blksize = int(min(8e9,max_memory*.5e6)/8/(nao_pair+nmo**2)/nocc)
    blksize = min(nvir, max(BLKMIN, blksize))
    fload = ao2mo.outcore._load_from_h5g

    buf = numpy.empty((blksize*nocc,nao_pair))
    buf_prefetch = numpy.empty_like(buf)
    def prefetch(p0, p1, rowmax):
        p0, p1 = p1, min(rowmax, p1+blksize)
        if p0 < p1:
            fload(fswap['0'], p0*nocc, p1*nocc, buf_prefetch)

    cput1 = time.clock(), time.time()
    outbuf = numpy.empty((blksize*nocc,nmo**2))
    with lib.call_in_background(prefetch) as bprefetch:
        fload(fswap['0'], 0, min(nocc,blksize)*nocc, buf_prefetch)
        for p0, p1 in lib.prange(0, nocc, blksize):
            nrow = (p1 - p0) * nocc
            buf, buf_prefetch = buf_prefetch, buf
            bprefetch(p0, p1, nocc)
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff, (0,nmo,0,nmo),
                                     's4', 's1', out=outbuf, ao_loc=ao_loc)
            save_occ_frac(p0, p1, dat)

        blksize = min(comm.allgather(blksize))
        norb_max = nocc + vseg
        fload(fswap['0'], nocc**2, min(nocc+blksize,norb_max)*nocc, buf_prefetch)
        for p0, p1 in mpi.prange(vloc0, vloc1, blksize):
            i0, i1 = p0 - vloc0, p1 - vloc0
            nrow = (p1 - p0) * nocc
            buf, buf_prefetch = buf_prefetch, buf
            bprefetch(nocc+i0, nocc+i1, norb_max)
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow], mo_coeff, (0,nmo,0,nmo),
                                     's4', 's1', out=outbuf, ao_loc=ao_loc)
            save_vir_frac(i0, i1, dat)
    buf = buf_prefecth = outbuf = None

    cput1 = log.timer_debug1('transforming oppp', *cput1)
    log.timer('CCSD integral transformation', *cput0)
    mycc._eris = eris
    return eris
Beispiel #20
0
def transform_integrals_outcore(myadc):

    cput0 = (time.clock(), time.time())
    log = logger.Logger(myadc.stdout, myadc.verbose)

    mol = myadc.mol
    mo_coeff = myadc.mo_coeff
    nao = mo_coeff.shape[0]
    nmo = myadc._nmo

    occ = myadc.mo_coeff[:, :myadc._nocc]
    vir = myadc.mo_coeff[:, myadc._nocc:]

    nocc = occ.shape[1]
    nvir = vir.shape[1]
    nvpair = nvir * (nvir + 1) // 2

    eris = lambda: None

    eris.feri1 = lib.H5TmpFile()
    eris.oooo = eris.feri1.create_dataset('oooo', (nocc, nocc, nocc, nocc),
                                          'f8')
    eris.oovv = eris.feri1.create_dataset('oovv', (nocc, nocc, nvir, nvir),
                                          'f8',
                                          chunks=(nocc, nocc, 1, nvir))
    eris.ovoo = eris.feri1.create_dataset('ovoo', (nocc, nvir, nocc, nocc),
                                          'f8',
                                          chunks=(nocc, 1, nocc, nocc))
    eris.ovvo = eris.feri1.create_dataset('ovvo', (nocc, nvir, nvir, nocc),
                                          'f8',
                                          chunks=(nocc, 1, nvir, nocc))
    eris.ovov = eris.feri1.create_dataset('ovov', (nocc, nvir, nocc, nvir),
                                          'f8',
                                          chunks=(nocc, 1, nocc, nvir))
    eris.ovvv = eris.feri1.create_dataset('ovvv', (nocc, nvir, nvpair), 'f8')

    def save_occ_frac(p0, p1, eri):
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.oooo[p0:p1] = eri[:, :, :nocc, :nocc]
        eris.oovv[p0:p1] = eri[:, :, nocc:, nocc:]

    def save_vir_frac(p0, p1, eri):
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.ovoo[:, p0:p1] = eri[:, :, :nocc, :nocc].transpose(1, 0, 2, 3)
        eris.ovvo[:, p0:p1] = eri[:, :, nocc:, :nocc].transpose(1, 0, 2, 3)
        eris.ovov[:, p0:p1] = eri[:, :, :nocc, nocc:].transpose(1, 0, 2, 3)
        vvv = lib.pack_tril(eri[:, :, nocc:, nocc:].reshape((p1 - p0) * nocc,
                                                            nvir, nvir))
        eris.ovvv[:, p0:p1] = vvv.reshape(p1 - p0, nocc,
                                          nvpair).transpose(1, 0, 2)

    cput1 = time.clock(), time.time()
    fswap = lib.H5TmpFile()
    max_memory = myadc.max_memory - lib.current_memory()[0]
    if max_memory <= 0:
        max_memory = myadc.memorymin
    int2e = mol._add_suffix('int2e')
    ao2mo.outcore.half_e1(mol, (mo_coeff, occ),
                          fswap,
                          int2e,
                          's4',
                          1,
                          max_memory=max_memory,
                          verbose=log)

    ao_loc = mol.ao_loc_nr()
    nao_pair = nao * (nao + 1) // 2
    blksize = int(min(8e9, max_memory * .5e6) / 8 / (nao_pair + nmo**2) / nocc)
    blksize = min(nmo, max(myadc.blkmin, blksize))

    log.debug1('blksize %d', blksize)
    cput2 = cput1

    fload = ao2mo.outcore._load_from_h5g
    buf = np.empty((blksize * nocc, nao_pair))
    buf_prefetch = np.empty_like(buf)

    def load(buf_prefetch, p0, rowmax):
        if p0 < rowmax:
            p1 = min(rowmax, p0 + blksize)
            fload(fswap['0'], p0 * nocc, p1 * nocc, buf_prefetch)

    outbuf = np.empty((blksize * nocc, nmo**2))
    with lib.call_in_background(load, sync=not myadc.async_io) as prefetch:
        prefetch(buf_prefetch, 0, nocc)
        for p0, p1 in lib.prange(0, nocc, blksize):
            buf, buf_prefetch = buf_prefetch, buf
            prefetch(buf_prefetch, p1, nocc)

            nrow = (p1 - p0) * nocc
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow],
                                     mo_coeff, (0, nmo, 0, nmo),
                                     's4',
                                     's1',
                                     out=outbuf,
                                     ao_loc=ao_loc)
            save_occ_frac(p0, p1, dat)
        cput2 = log.timer_debug1('transforming oopp', *cput2)

        prefetch(buf_prefetch, nocc, nmo)
        for p0, p1 in lib.prange(0, nvir, blksize):
            buf, buf_prefetch = buf_prefetch, buf
            prefetch(buf_prefetch, nocc + p1, nmo)

            nrow = (p1 - p0) * nocc
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow],
                                     mo_coeff, (0, nmo, 0, nmo),
                                     's4',
                                     's1',
                                     out=outbuf,
                                     ao_loc=ao_loc)
            save_vir_frac(p0, p1, dat)

            cput2 = log.timer_debug1('transforming ovpp [%d:%d]' % (p0, p1),
                                     *cput2)

    cput1 = log.timer_debug1('transforming oppp', *cput1)

    ############### forming eris_vvvv ########################################

    if (myadc.method == "adc(2)-x" or myadc.method == "adc(3)"):
        eris.vvvv = []

        cput3 = time.clock(), time.time()
        avail_mem = (myadc.max_memory - lib.current_memory()[0]) * 0.5
        chnk_size = calculate_chunk_size(myadc)

        for p in range(0, vir.shape[1], chnk_size):

            if chnk_size < vir.shape[1]:
                orb_slice = vir[:, p:p + chnk_size]
            else:
                orb_slice = vir[:, p:]

            _, tmp = tempfile.mkstemp()
            ao2mo.outcore.general(mol, (orb_slice, vir, vir, vir),
                                  tmp,
                                  max_memory=avail_mem,
                                  ioblk_size=100,
                                  compact=False)
            vvvv = read_dataset(tmp, 'eri_mo')
            vvvv = vvvv.reshape(orb_slice.shape[1], vir.shape[1], vir.shape[1],
                                vir.shape[1])
            vvvv = np.ascontiguousarray(vvvv.transpose(0, 2, 1, 3)).reshape(
                -1, nvir, nvir * nvir)

            vvvv_p = write_dataset(vvvv)
            del vvvv
            eris.vvvv.append(vvvv_p)
            cput3 = log.timer_debug1('transforming vvvv', *cput3)

    log.timer('ADC integral transformation', *cput0)

    return eris
Beispiel #21
0
def _sort_eri(mycc, eris, h5tmp, log):
    cpu1 = (logger.process_clock(), logger.perf_counter())
    nocca, noccb = mycc.nocc
    nmoa = eris.focka.shape[0]
    nmob = eris.fockb.shape[0]
    nvira = nmoa - nocca
    nvirb = nmob - noccb

    if mycc.t2 is None:
        dtype = eris.ovov.dtype
    else:
        dtype = numpy.result_type(mycc.t2[0], eris.ovov.dtype)

    if mycc.incore_complete or h5tmp is None:
        eris_vvop = numpy.empty((nvira, nvira, nocca, nmoa), dtype)
        eris_VVOP = numpy.empty((nvirb, nvirb, noccb, nmob), dtype)
        eris_vVoP = numpy.empty((nvira, nvirb, nocca, nmob), dtype)
        eris_VvOp = numpy.empty((nvirb, nvira, noccb, nmoa), dtype)
    else:
        eris_vvop = h5tmp.create_dataset('vvop', (nvira, nvira, nocca, nmoa),
                                         dtype)
        eris_VVOP = h5tmp.create_dataset('VVOP', (nvirb, nvirb, noccb, nmob),
                                         dtype)
        eris_vVoP = h5tmp.create_dataset('vVoP', (nvira, nvirb, nocca, nmob),
                                         dtype)
        eris_VvOp = h5tmp.create_dataset('VvOp', (nvirb, nvira, noccb, nmoa),
                                         dtype)

    max_memory = max(2000, mycc.max_memory - lib.current_memory()[0])
    max_memory = min(8000, max_memory * .9)

    blksize = min(nvira,
                  max(16, int(max_memory * 1e6 / 8 / (nvira * nocca * nmoa))))
    with lib.call_in_background(eris_vvop.__setitem__,
                                sync=not mycc.async_io) as save:
        bufopv = numpy.empty((nocca, nmoa, nvira), dtype=dtype)
        buf1 = numpy.empty_like(bufopv)
        for j0, j1 in lib.prange(0, nvira, blksize):
            ovov = numpy.asarray(eris.ovov[:, j0:j1])
            ovvv = eris.get_ovvv(slice(None), slice(j0, j1))
            for j in range(j0, j1):
                bufopv[:, :nocca, :] = ovov[:, j - j0].conj()
                bufopv[:, nocca:, :] = ovvv[:, j - j0].conj()
                save(j, bufopv.transpose(2, 0, 1))
                bufopv, buf1 = buf1, bufopv
            ovov = ovvv = None
            cpu1 = log.timer_debug1('transpose %d:%d' % (j0, j1), *cpu1)

    blksize = min(nvirb,
                  max(16, int(max_memory * 1e6 / 8 / (nvirb * noccb * nmob))))
    with lib.call_in_background(eris_VVOP.__setitem__,
                                sync=not mycc.async_io) as save:
        bufopv = numpy.empty((noccb, nmob, nvirb), dtype=dtype)
        buf1 = numpy.empty_like(bufopv)
        for j0, j1 in lib.prange(0, nvirb, blksize):
            ovov = numpy.asarray(eris.OVOV[:, j0:j1])
            ovvv = eris.get_OVVV(slice(None), slice(j0, j1))
            for j in range(j0, j1):
                bufopv[:, :noccb, :] = ovov[:, j - j0].conj()
                bufopv[:, noccb:, :] = ovvv[:, j - j0].conj()
                save(j, bufopv.transpose(2, 0, 1))
                bufopv, buf1 = buf1, bufopv
            ovov = ovvv = None
            cpu1 = log.timer_debug1('transpose %d:%d' % (j0, j1), *cpu1)

    blksize = min(nvira,
                  max(16, int(max_memory * 1e6 / 8 / (nvirb * nocca * nmob))))
    with lib.call_in_background(eris_vVoP.__setitem__,
                                sync=not mycc.async_io) as save:
        bufopv = numpy.empty((nocca, nmob, nvirb), dtype=dtype)
        buf1 = numpy.empty_like(bufopv)
        for j0, j1 in lib.prange(0, nvira, blksize):
            ovov = numpy.asarray(eris.ovOV[:, j0:j1])
            ovvv = eris.get_ovVV(slice(None), slice(j0, j1))
            for j in range(j0, j1):
                bufopv[:, :noccb, :] = ovov[:, j - j0].conj()
                bufopv[:, noccb:, :] = ovvv[:, j - j0].conj()
                save(j, bufopv.transpose(2, 0, 1))
                bufopv, buf1 = buf1, bufopv
            ovov = ovvv = None
            cpu1 = log.timer_debug1('transpose %d:%d' % (j0, j1), *cpu1)

    blksize = min(nvirb,
                  max(16, int(max_memory * 1e6 / 8 / (nvira * noccb * nmoa))))
    OVov = numpy.asarray(eris.ovOV).transpose(2, 3, 0, 1)
    with lib.call_in_background(eris_VvOp.__setitem__,
                                sync=not mycc.async_io) as save:
        bufopv = numpy.empty((noccb, nmoa, nvira), dtype=dtype)
        buf1 = numpy.empty_like(bufopv)
        for j0, j1 in lib.prange(0, nvirb, blksize):
            ovov = OVov[:, j0:j1]
            ovvv = eris.get_OVvv(slice(None), slice(j0, j1))
            for j in range(j0, j1):
                bufopv[:, :nocca, :] = ovov[:, j - j0].conj()
                bufopv[:, nocca:, :] = ovvv[:, j - j0].conj()
                save(j, bufopv.transpose(2, 0, 1))
                bufopv, buf1 = buf1, bufopv
            ovov = ovvv = None
            cpu1 = log.timer_debug1('transpose %d:%d' % (j0, j1), *cpu1)
    return eris_vvop, eris_VVOP, eris_vVoP, eris_VvOp
Beispiel #22
0
def kernel(mycc, eris, t1=None, t2=None, verbose=logger.NOTE):
    cpu1 = cpu0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mycc, verbose)
    if t1 is None: t1 = mycc.t1
    if t2 is None: t2 = mycc.t2
    t1a, t1b = t1
    t2aa, t2ab, t2bb = t2

    nocca, noccb = mycc.nocc
    nmoa = eris.focka.shape[0]
    nmob = eris.fockb.shape[0]
    nvira = nmoa - nocca
    nvirb = nmob - noccb

    if mycc.incore_complete:
        ftmp = None
    else:
        ftmp = lib.H5TmpFile()
    t1aT = t1a.T.copy()
    t1bT = t1b.T.copy()
    t2aaT = t2aa.transpose(2, 3, 0, 1).copy()
    t2bbT = t2bb.transpose(2, 3, 0, 1).copy()

    eris_vooo = numpy.asarray(eris.ovoo).transpose(1, 3, 0, 2).conj().copy()
    eris_VOOO = numpy.asarray(eris.OVOO).transpose(1, 3, 0, 2).conj().copy()
    eris_vOoO = numpy.asarray(eris.ovOO).transpose(1, 3, 0, 2).conj().copy()
    eris_VoOo = numpy.asarray(eris.OVoo).transpose(1, 3, 0, 2).conj().copy()

    eris_vvop, eris_VVOP, eris_vVoP, eris_VvOp = _sort_eri(
        mycc, eris, ftmp, log)
    cpu1 = log.timer_debug1('UCCSD(T) sort_eri', *cpu1)

    dtype = numpy.result_type(t1a.dtype, t2aa.dtype, eris_vooo.dtype)
    et_sum = numpy.zeros(1, dtype=dtype)
    mem_now = lib.current_memory()[0]
    max_memory = max(0, mycc.max_memory - mem_now)
    # aaa
    bufsize = max(
        8,
        int((max_memory * .5e6 / 8 - nocca**3 * 3 * lib.num_threads()) * .4 /
            (nocca * nmoa)))
    log.debug('max_memory %d MB (%d MB in use)', max_memory, mem_now)
    orbsym = numpy.zeros(nocca, dtype=int)
    contract = _gen_contract_aaa(t1aT, t2aaT, eris_vooo, eris.focka,
                                 eris.mo_energy[0], orbsym, log)
    with lib.call_in_background(contract, sync=not mycc.async_io) as ctr:
        for a0, a1 in reversed(list(lib.prange_tril(0, nvira, bufsize))):
            cache_row_a = numpy.asarray(eris_vvop[a0:a1, :a1], order='C')
            if a0 == 0:
                cache_col_a = cache_row_a
            else:
                cache_col_a = numpy.asarray(eris_vvop[:a0, a0:a1], order='C')
            ctr(et_sum, a0, a1, a0, a1,
                (cache_row_a, cache_col_a, cache_row_a, cache_col_a))

            for b0, b1 in lib.prange_tril(0, a0, bufsize / 8):
                cache_row_b = numpy.asarray(eris_vvop[b0:b1, :b1], order='C')
                if b0 == 0:
                    cache_col_b = cache_row_b
                else:
                    cache_col_b = numpy.asarray(eris_vvop[:b0, b0:b1],
                                                order='C')
                ctr(et_sum, a0, a1, b0, b1,
                    (cache_row_a, cache_col_a, cache_row_b, cache_col_b))
    cpu1 = log.timer_debug1('contract_aaa', *cpu1)

    # bbb
    bufsize = max(
        8,
        int((max_memory * .5e6 / 8 - noccb**3 * 3 * lib.num_threads()) * .4 /
            (noccb * nmob)))
    log.debug('max_memory %d MB (%d MB in use)', max_memory, mem_now)
    orbsym = numpy.zeros(noccb, dtype=int)
    contract = _gen_contract_aaa(t1bT, t2bbT, eris_VOOO, eris.fockb,
                                 eris.mo_energy[1], orbsym, log)
    with lib.call_in_background(contract, sync=not mycc.async_io) as ctr:
        for a0, a1 in reversed(list(lib.prange_tril(0, nvirb, bufsize))):
            cache_row_a = numpy.asarray(eris_VVOP[a0:a1, :a1], order='C')
            if a0 == 0:
                cache_col_a = cache_row_a
            else:
                cache_col_a = numpy.asarray(eris_VVOP[:a0, a0:a1], order='C')
            ctr(et_sum, a0, a1, a0, a1,
                (cache_row_a, cache_col_a, cache_row_a, cache_col_a))

            for b0, b1 in lib.prange_tril(0, a0, bufsize / 8):
                cache_row_b = numpy.asarray(eris_VVOP[b0:b1, :b1], order='C')
                if b0 == 0:
                    cache_col_b = cache_row_b
                else:
                    cache_col_b = numpy.asarray(eris_VVOP[:b0, b0:b1],
                                                order='C')
                ctr(et_sum, a0, a1, b0, b1,
                    (cache_row_a, cache_col_a, cache_row_b, cache_col_b))
    cpu1 = log.timer_debug1('contract_bbb', *cpu1)

    # Cache t2abT in t2ab to reduce memory footprint
    assert (t2ab.flags.c_contiguous)
    t2abT = lib.transpose(t2ab.copy().reshape(nocca * noccb, nvira * nvirb),
                          out=t2ab)
    t2abT = t2abT.reshape(nvira, nvirb, nocca, noccb)
    # baa
    bufsize = int(
        max(12, (max_memory * .5e6 / 8 - noccb * nocca**2 * 5) * .7 /
            (nocca * nmob)))
    ts = t1aT, t1bT, t2aaT, t2abT
    fock = (eris.focka, eris.fockb)
    vooo = (eris_vooo, eris_vOoO, eris_VoOo)
    contract = _gen_contract_baa(ts, vooo, fock, eris.mo_energy, orbsym, log)
    with lib.call_in_background(contract, sync=not mycc.async_io) as ctr:
        for a0, a1 in lib.prange(0, nvirb, int(bufsize / nvira + 1)):
            cache_row_a = numpy.asarray(eris_VvOp[a0:a1, :], order='C')
            cache_col_a = numpy.asarray(eris_vVoP[:, a0:a1], order='C')
            for b0, b1 in lib.prange_tril(0, nvira, bufsize / 6 / 2):
                cache_row_b = numpy.asarray(eris_vvop[b0:b1, :b1], order='C')
                cache_col_b = numpy.asarray(eris_vvop[:b0, b0:b1], order='C')
                ctr(et_sum, a0, a1, b0, b1,
                    (cache_row_a, cache_col_a, cache_row_b, cache_col_b))
    cpu1 = log.timer_debug1('contract_baa', *cpu1)

    t2baT = numpy.ndarray((nvirb, nvira, noccb, nocca),
                          buffer=t2abT,
                          dtype=t2abT.dtype)
    t2baT[:] = t2abT.copy().transpose(1, 0, 3, 2)
    # abb
    ts = t1bT, t1aT, t2bbT, t2baT
    fock = (eris.fockb, eris.focka)
    mo_energy = (eris.mo_energy[1], eris.mo_energy[0])
    vooo = (eris_VOOO, eris_VoOo, eris_vOoO)
    contract = _gen_contract_baa(ts, vooo, fock, mo_energy, orbsym, log)
    for a0, a1 in lib.prange(0, nvira, int(bufsize / nvirb + 1)):
        with lib.call_in_background(contract, sync=not mycc.async_io) as ctr:
            cache_row_a = numpy.asarray(eris_vVoP[a0:a1, :], order='C')
            cache_col_a = numpy.asarray(eris_VvOp[:, a0:a1], order='C')
            for b0, b1 in lib.prange_tril(0, nvirb, bufsize / 6 / 2):
                cache_row_b = numpy.asarray(eris_VVOP[b0:b1, :b1], order='C')
                cache_col_b = numpy.asarray(eris_VVOP[:b0, b0:b1], order='C')
                ctr(et_sum, a0, a1, b0, b1,
                    (cache_row_a, cache_col_a, cache_row_b, cache_col_b))
    cpu1 = log.timer_debug1('contract_abb', *cpu1)

    # Restore t2ab
    lib.transpose(t2baT.transpose(1, 0, 3,
                                  2).copy().reshape(nvira * nvirb,
                                                    nocca * noccb),
                  out=t2ab)
    et_sum *= .25
    if abs(et_sum[0].imag) > 1e-4:
        logger.warn(mycc,
                    'Non-zero imaginary part of UCCSD(T) energy was found %s',
                    et_sum[0])
    et = et_sum[0].real
    log.timer('UCCSD(T)', *cpu0)
    log.note('UCCSD(T) correction = %.15g', et)
    return et
Beispiel #23
0
def _assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file, fswap, log):
    t1 = (time.clock(), time.time())
    cell = mydf.cell
    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    kptis = kptij_lst[:,0]
    kptjs = kptij_lst[:,1]
    kpt_ji = kptjs - kptis
    uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
    aosym_s2 = numpy.einsum('ix->i', abs(kptis-kptjs)) < 1e-9

    t2 = t1
    j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int)
    #for job_id, ish0, ish1 in mpi.work_share_partition(j3c_jobs):
    for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs):
        gen_int3c(job_id, ish0, ish1)
        t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2)

        for k, kpt in enumerate(uniq_kpts):
            ft_fuse(job_id, k, ish0, ish1)
            t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2)

        j3c_workers[job_id] = rank
    j3c_workers = mpi.allreduce(j3c_workers)
    log.debug2('j3c_workers %s', j3c_workers)
    t1 = log.timer_debug1('int3c and fuse', *t1)

    # Pass 2
    # Transpose 3-index tensor and save data in cderi_file
    feri = h5py.File(cderi_file, 'w')
    nauxs = [fswap['j2c/%d'%k].shape[0] for k, kpt in enumerate(uniq_kpts)]
    segsize = (max(nauxs)+mpi.pool.size-1) // mpi.pool.size
    naux0 = rank * segsize
    for k, kptij in enumerate(kptij_lst):
        naux1 = min(nauxs[uniq_inverse[k]], naux0+segsize)
        nrow = max(0, naux1-naux0)
        if gamma_point(kptij):
            dtype = 'f8'
        else:
            dtype = 'c16'
        if aosym_s2[k]:
            nao_pair = nao * (nao+1) // 2
        else:
            nao_pair = nao * nao
        feri.create_dataset('j3c/%d'%k, (nrow,nao_pair), dtype, maxshape=(None,nao_pair))

    def get_segs_loc(aosym):
        off0 = numpy.asarray([ao_loc[i0] for x,i0,i1 in j3c_jobs])
        off1 = numpy.asarray([ao_loc[i1] for x,i0,i1 in j3c_jobs])
        if aosym:  # s2
            dims = off1*(off1+1)//2 - off0*(off0+1)//2
        else:
            dims = (off1-off0) * nao
        #dims = numpy.asarray([ao_loc[i1]-ao_loc[i0] for x,i0,i1 in j3c_jobs])
        dims = numpy.hstack([dims[j3c_workers==w] for w in range(mpi.pool.size)])
        job_idx = numpy.hstack([numpy.where(j3c_workers==w)[0]
                                for w in range(mpi.pool.size)])
        segs_loc = numpy.append(0, numpy.cumsum(dims))
        segs_loc = [(segs_loc[j], segs_loc[j+1]) for j in numpy.argsort(job_idx)]
        return segs_loc
    segs_loc_s1 = get_segs_loc(False)
    segs_loc_s2 = get_segs_loc(True)

    job_ids = numpy.where(rank == j3c_workers)[0]
    def load(k, p0, p1):
        naux1 = nauxs[uniq_inverse[k]]
        slices = [(min(i*segsize+p0,naux1), min(i*segsize+p1,naux1))
                  for i in range(mpi.pool.size)]
        segs = []
        for p0, p1 in slices:
            val = [fswap['j3c-chunks/%d/%d' % (job, k)][p0:p1].ravel()
                   for job in job_ids]
            if val:
                segs.append(numpy.hstack(val))
            else:
                segs.append(numpy.zeros(0))
        return segs

    def save(k, p0, p1, segs):
        segs = mpi.alltoall(segs)
        naux1 = nauxs[uniq_inverse[k]]
        loc0, loc1 = min(p0, naux1-naux0), min(p1, naux1-naux0)
        nL = loc1 - loc0
        if nL > 0:
            if aosym_s2[k]:
                segs = numpy.hstack([segs[i0*nL:i1*nL].reshape(nL,-1)
                                     for i0,i1 in segs_loc_s2])
            else:
                segs = numpy.hstack([segs[i0*nL:i1*nL].reshape(nL,-1)
                                     for i0,i1 in segs_loc_s1])
            feri['j3c/%d'%k][loc0:loc1] = segs

    mem_now = max(comm.allgather(lib.current_memory()[0]))
    max_memory = max(2000, min(8000, mydf.max_memory - mem_now))
    if numpy.all(aosym_s2):
        if gamma_point(kptij_lst):
            blksize = max(16, int(max_memory*.5e6/8/nao**2))
        else:
            blksize = max(16, int(max_memory*.5e6/16/nao**2))
    else:
        blksize = max(16, int(max_memory*.5e6/16/nao**2/2))
    log.debug1('max_momory %d MB (%d in use), blksize %d',
               max_memory, mem_now, blksize)

    t2 = t1
    with lib.call_in_background(save) as async_write:
        for k, kptji in enumerate(kptij_lst):
            for p0, p1 in lib.prange(0, segsize, blksize):
                segs = load(k, p0, p1)
                async_write(k, p0, p1, segs)
                t2 = log.timer_debug1('assemble k=%d %d:%d (in %d)' %
                                      (k, p0, p1, segsize), *t2)

    if 'j2c-' in fswap:
        j2c_kpts_lists = []
        for k, kpt in enumerate(uniq_kpts):
            if ('j2c-/%d' % k) in fswap:
                adapted_ji_idx = numpy.where(uniq_inverse == k)[0]
                j2c_kpts_lists.append(adapted_ji_idx)

        for k in numpy.hstack(j2c_kpts_lists):
            val = [numpy.asarray(fswap['j3c-/%d/%d' % (job, k)]).ravel()
                   for job in job_ids]
            val = mpi.gather(numpy.hstack(val))
            if rank == 0:
                naux1 = fswap['j3c-/0/%d'%k].shape[0]
                if aosym_s2[k]:
                    v = [val[i0*naux1:i1*naux1].reshape(naux1,-1)
                         for i0,i1 in segs_loc_s2]
                else:
                    v = [val[i0*naux1:i1*naux1].reshape(naux1,-1)
                         for i0,i1 in segs_loc_s1]
                feri['j3c-/%d'%k] = numpy.hstack(v)

    if 'j3c-kptij' in feri: del(feri['j3c-kptij'])
    feri['j3c-kptij'] = kptij_lst
    t1 = log.alltimer_debug1('assembling j3c', *t1)
    feri.close()
Beispiel #24
0
def _add_ovvv_(mycc, t1, t2, eris, fvv, t1new, t2new, fswap):
    time1 = logger.process_clock(), logger.perf_counter()
    log = logger.Logger(mycc.stdout, mycc.verbose)
    nocc, nvir = t1.shape
    nvir_pair = nvir * (nvir + 1) // 2

    if fswap is None:
        wVOov = numpy.zeros((nvir, nocc, nocc, nvir))
    else:
        wVOov = fswap.create_dataset('wVOov', (nvir, nocc, nocc, nvir), 'f8')
    wooVV = numpy.zeros((nocc, nocc * nvir_pair))

    max_memory = mycc.max_memory - lib.current_memory()[0]
    unit = nocc * nvir**2 * 3 + nocc**2 * nvir + 2
    blksize = min(
        nvir, max(BLKMIN, int((max_memory * .95e6 / 8 - wooVV.size) / unit)))
    if not mycc.direct:
        unit = nocc * nvir**2 * 3 + nocc**2 * nvir + 2 + nocc * nvir**2 + nocc * nvir
        blksize = min(
            nvir,
            max(
                BLKMIN,
                int((max_memory * .95e6 / 8 - wooVV.size - nocc**2 * nvir) /
                    unit)))
    log.debug1('max_memory %d MB,  nocc,nvir = %d,%d  blksize = %d',
               max_memory, nocc, nvir, blksize)

    def load_ovvv(buf, p0):
        if p0 < nvir:
            p1 = min(nvir, p0 + blksize)
            buf[:p1 - p0] = eris.ovvv[:, p0:p1].transpose(1, 0, 2)

    with lib.call_in_background(load_ovvv, sync=not mycc.async_io) as prefetch:
        buf = numpy.empty((blksize, nocc, nvir_pair))
        buf_prefetch = numpy.empty((blksize, nocc, nvir_pair))

        load_ovvv(buf_prefetch, 0)
        for p0, p1 in lib.prange(0, nvir, blksize):
            buf, buf_prefetch = buf_prefetch, buf
            prefetch(buf_prefetch, p1)

            eris_vovv = buf[:p1 - p0]
            eris_vovv = lib.unpack_tril(
                eris_vovv.reshape((p1 - p0) * nocc, nvir_pair))
            eris_vovv = eris_vovv.reshape(p1 - p0, nocc, nvir, nvir)

            wVOov[p0:p1] = lib.einsum('biac,jc->bija', eris_vovv, t1)

            theta = t2[:, :, p0:p1].transpose(1, 2, 0, 3) * 2
            theta -= t2[:, :, p0:p1].transpose(0, 2, 1, 3)
            t1new += lib.einsum('icjb,cjba->ia', theta, eris_vovv)
            theta = None
            time1 = log.timer_debug1('vovv [%d:%d]' % (p0, p1), *time1)

    if fswap is None:
        wooVV = lib.unpack_tril(wooVV.reshape(nocc**2, nvir_pair))
        return wVOov, wooVV.reshape(nocc, nocc, nvir,
                                    nvir).transpose(2, 1, 0, 3)
    else:
        fswap.create_dataset('wVooV', (nvir, nocc, nocc, nvir), 'f8')
        wooVV = wooVV.reshape(nocc, nocc, nvir_pair)
        tril2sq = lib.square_mat_in_trilu_indices(nvir)
        for p0, p1 in lib.prange(0, nvir, blksize):
            fswap['wVooV'][p0:p1] = wooVV[:, :, tril2sq[p0:p1]].transpose(
                2, 1, 0, 3)
        return fswap['wVOov'], fswap['wVooV']
Beispiel #25
0
def _ao2mo_ovov(mp, orbo, orbv, feri, max_memory=2000, verbose=None):
    time0 = (time.clock(), time.time())
    log = logger.new_logger(mp, verbose)

    mol = mp.mol
    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nao, nocc = orbo.shape
    nvir = orbv.shape[1]
    nbas = mol.nbas
    assert(nvir <= nao)

    ao_loc = mol.ao_loc_nr()
    dmax = max(4, min(nao/3, numpy.sqrt(max_memory*.95e6/8/(nao+nocc)**2)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao,dmax,dmax,nao))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax, nocc**2*(nao*(nao+dmax)/2+nvir**2)*8/1e6)

    buf_i = numpy.empty((nocc*dmax**2*nao))
    buf_li = numpy.empty((nocc**2*dmax**2))
    buf1 = numpy.empty_like(buf_li)

    fint = gto.moleintor.getints4c
    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(ftmp.__setitem__) as save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            for jsh0, jsh1, nj in sh_ranges[:ip+1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0,i1,j0,j1))

                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=(0,nbas,ish0,ish1, jsh0,jsh1,0,nbas),
                           aosym='s1', ao_loc=ao_loc, cintopt=ao2mopt._cintopt,
                           out=eribuf)
                tmp_i = numpy.ndarray((nocc,(i1-i0)*(j1-j0)*nao), buffer=buf_i)
                tmp_li = numpy.ndarray((nocc,nocc*(i1-i0)*(j1-j0)), buffer=buf_li)
                lib.ddot(orbo.T, eri.reshape(nao,(i1-i0)*(j1-j0)*nao), c=tmp_i)
                lib.ddot(orbo.T, tmp_i.reshape(nocc*(i1-i0)*(j1-j0),nao).T, c=tmp_li)
                tmp_li = tmp_li.reshape(nocc,nocc,(i1-i0),(j1-j0))
                save(str(count), tmp_li.transpose(1,0,2,3))
                buf_li, buf1 = buf1, buf_li
                count += 1
                time1 = log.timer_debug1('partial ao2mo [%d:%d,%d:%d]' %
                                         (ish0,ish1,jsh0,jsh1), *time1)
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)
    eri = eribuf = tmp_i = tmp_li = buf_i = buf_li = buf1 = None

    h5dat = feri.create_dataset('ovov', (nocc*nvir,nocc*nvir), 'f8',
                                chunks=(nvir,nvir))
    occblk = int(min(nocc, max(4, 250/nocc, max_memory*.9e6/8/(nao**2*nocc)/5)))
    def load(i0, eri):
        if i0 < nocc:
            i1 = min(i0+occblk, nocc)
            for k, (p0,p1,q0,q1) in enumerate(jk_blk_slices):
                eri[:i1-i0,:,p0:p1,q0:q1] = ftmp[str(k)][i0:i1]
                if p0 != q0:
                    dat = numpy.asarray(ftmp[str(k)][:,i0:i1])
                    eri[:i1-i0,:,q0:q1,p0:p1] = dat.transpose(1,0,3,2)

    def save(i0, i1, dat):
        for i in range(i0, i1):
            h5dat[i*nvir:(i+1)*nvir] = dat[i-i0].reshape(nvir,nocc*nvir)

    orbv = numpy.asarray(orbv, order='F')
    buf_prefecth = numpy.empty((occblk,nocc,nao,nao))
    buf = numpy.empty_like(buf_prefecth)
    bufw = numpy.empty((occblk*nocc,nvir**2))
    bufw1 = numpy.empty_like(bufw)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as bsave:
            load(0, buf_prefecth)
            for i0, i1 in lib.prange(0, nocc, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(i1, buf_prefecth)
                eri = buf[:i1-i0].reshape((i1-i0)*nocc,nao,nao)

                dat = _ao2mo.nr_e2(eri, orbv, (0,nvir,0,nvir), 's1', 's1', out=bufw)
                bsave(i0, i1, dat.reshape(i1-i0,nocc,nvir,nvir).transpose(0,2,1,3))
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0,i1), *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    return h5dat
Beispiel #26
0
def general(eri,
            mo_coeffs,
            erifile,
            dataname='eri_mo',
            ioblk_size=IOBLK_SIZE,
            compact=True,
            verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.
    Args:
        eri : 8-fold reduced eri vector
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.
    Kwargs
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By assigning
            different dataname, the existed integral file can be reused.  If
            the erifile contains the dataname, the new integrals data will
            overwrite the old one.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        compact : bool
            When compact is True, depending on the four oribital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals


    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        Assume eri's are 8-fold reduced.
        nij/nkl_pair = npair or i*j/k*l if only transforming a subset

        First half transform:
            Initialize half_eri of size (nij_pair,npair)
                For lo = 1 -> npair
                    Unpack row lo
                    Unpack row lo to matrix E_{uv}^{lo}
                    Transform C_ui^+*E*C_nj -> E_{ij}^{lo}
                    Ravel or pack E_{ij}^{lo}
                    Save E_{ij}^{lo} -> half_eri[:,lo]

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
                For ij = 1 -> nij_pair
                    Load and unpack half_eri[ij,:] -> E_{lo}^{ij}
                    Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                    Repack E_{kl}^{ij}
                    Save E_{kl}^{ij} -> h5d_eri[ij,:]

        Each matrix is indexed by the composite index ij x kl, where ij/kl is
        either npair or ixj/kxl, if only a subset of MOs are being transformed.
        Since entire rows or columns need to be read in, the arrays are chunked
        such that IOBLK_SIZE = row/col x chunking col/row. For example, for the
        first half transform, we would save in nij_pair x IOBLK_SIZE/nij_pair,
        then load in IOBLK_SIZE/nkl_pair x npair for the second half transform.

        ------ kl ----->
        |jxl
        |
        ij
        |
        |
        v

        As a first guess, the chunking size is jxl. If the super-rows/cols are
        larger than IOBLK_SIZE, then the chunk rectangle jxl is trimmed
        accordingly. The pathological limiting case is where the dimensions
        nao_pair, nij_pair, or nkl_pair are so large that the arrays are
        chunked 1x1, in which case IOBLK_SIZE needs to be increased.

    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    nmoi = mo_coeffs[0].shape[1]
    nmoj = mo_coeffs[1].shape[1]
    nmok = mo_coeffs[2].shape[1]
    nmol = mo_coeffs[3].shape[1]
    nao = mo_coeffs[0].shape[0]

    nao_pair = nao * (nao + 1) // 2
    if compact and iden_coeffs(mo_coeffs[0], mo_coeffs[1]):
        ij_red = False
        nij_pair = nmoi * (nmoi + 1) // 2
    else:
        ij_red = True
        nij_pair = nmoi * nmoj
    if compact and iden_coeffs(mo_coeffs[2], mo_coeffs[3]):
        kl_red = False
        nkl_pair = nmok * (nmok + 1) // 2
    else:
        kl_red = True
        nkl_pair = nmok * nmol

    chunks_half = (max(
        1, numpy.minimum(int(ioblk_size // (nao_pair * f8_size)), nmoj)),
                   max(
                       1,
                       numpy.minimum(int(ioblk_size // (nij_pair * f8_size)),
                                     nmol)))
    '''
    ideally, the final transformed eris should have a chunk of nmoj x nmol to
    optimize read operations. However, I'm chunking the row size so that the
    write operations during the transform can be done as fast as possible.
    '''
    chunks_full = (numpy.minimum(int(ioblk_size // (nkl_pair * f8_size)),
                                 nmoj), nmol)

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            feri = h5py.File(erifile)
            if dataname in feri:
                del (feri[dataname])
        else:
            feri = h5py.File(erifile, 'w', libver='latest')
    else:
        assert (isinstance(erifile, h5py.Group))
        feri = erifile
    h5d_eri = feri.create_dataset(dataname, (nij_pair, nkl_pair),
                                  'f8',
                                  chunks=chunks_full)

    feri_swap = lib.H5TmpFile(libver='latest')
    half_eri = feri_swap.create_dataset(dataname, (nij_pair, nao_pair),
                                        'f8',
                                        chunks=chunks_half)

    log.debug('Memory information:')
    log.debug('  IOBLK_SIZE (MB): {}'.format(ioblk_size))
    log.debug('  jxl {}x{}, half eri chunk dim  {}x{}'.format(
        nmoj, nmol, chunks_half[0], chunks_half[1]))
    log.debug('  jxl {}x{}, full eri chunk dim {}x{}'.format(
        nmoj, nmol, chunks_full[0], chunks_full[1]))
    log.debug('  Final disk eri size (MB): {:.3g}, chunked {:.3g}'.format(
        nij_pair * nkl_pair * f8_size,
        numpy.prod(chunks_full) * f8_size))
    log.debug(
        '  Half transformed eri size (MB): {:.3g}, chunked {:.3g}'.format(
            nij_pair * nao_pair * f8_size,
            numpy.prod(chunks_half) * f8_size))
    log.debug('  RAM buffer for half transform (MB): {:.3g}'.format(
        nij_pair * chunks_half[1] * f8_size * 2))
    log.debug('  RAM buffer for full transform (MB): {:.3g}'.format(
        f8_size * chunks_full[0] * nkl_pair * 2 +
        chunks_half[0] * nao_pair * f8_size * 2))

    def save1(piece, buf):
        start = piece * chunks_half[1]
        stop = (piece + 1) * chunks_half[1]
        if stop > nao_pair:
            stop = nao_pair
        half_eri[:, start:stop] = buf[:, :stop - start]
        return

    def load2(piece):
        start = piece * chunks_half[0]
        stop = (piece + 1) * chunks_half[0]
        if stop > nij_pair:
            stop = nij_pair
            if start >= nij_pair:
                start = stop - 1
        return half_eri[start:stop, :]

    def prefetch2(piece):
        start = piece * chunks_half[0]
        stop = (piece + 1) * chunks_half[0]
        if stop > nij_pair:
            stop = nij_pair
            if start >= nij_pair:
                start = stop - 1
        buf_prefetch[:stop - start, :] = half_eri[start:stop, :]
        return

    def save2(piece, buf):
        start = piece * chunks_full[0]
        stop = (piece + 1) * chunks_full[0]
        if stop > nij_pair:
            stop = nij_pair
        h5d_eri[start:stop, :] = buf[:stop - start, :]
        return

    # transform \mu\nu -> ij
    cput0 = time.clock(), time.time()
    Cimu = mo_coeffs[0].conj().transpose()
    buf_write = numpy.empty((nij_pair, chunks_half[1]))
    buf_out = numpy.empty_like(buf_write)
    wpiece = 0
    with lib.call_in_background(save1) as async_write:
        for lo in range(nao_pair):
            if lo % chunks_half[1] == 0 and lo > 0:
                #save1(wpiece,buf_write)
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece, buf_out)
                wpiece += 1
            buf = lib.unpack_row(eri, lo)
            uv = lib.unpack_tril(buf)
            uv = Cimu.dot(uv).dot(mo_coeffs[1])
            if ij_red:
                ij = numpy.ravel(uv)  # grabs by row
            else:
                ij = lib.pack_tril(uv)
            buf_write[:, lo % chunks_half[1]] = ij
    # final write operation & cleanup
    save1(wpiece, buf_write)
    log.timer('(uv|lo) -> (ij|lo)', *cput0)
    uv = None
    ij = None
    buf = None

    # transform \lambda\sigma -> kl
    cput1 = time.clock(), time.time()
    Cklam = mo_coeffs[2].conj().transpose()
    buf_write = numpy.empty((chunks_full[0], nkl_pair))
    buf_out = numpy.empty_like(buf_write)
    buf_read = numpy.empty((chunks_half[0], nao_pair))
    buf_prefetch = numpy.empty_like(buf_read)
    rpiece = 0
    wpiece = 0
    with lib.call_in_background(save2, prefetch2) as (async_write, prefetch):
        buf_read = load2(rpiece)
        prefetch(rpiece + 1)
        for ij in range(nij_pair):
            if ij % chunks_full[0] == 0 and ij > 0:
                #save2(wpiece,buf_write)
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece, buf_out)
                wpiece += 1
            if ij % chunks_half[0] == 0 and ij > 0:
                #buf_read = load2(rpiece)
                buf_read, buf_prefetch = buf_prefetch, buf_read
                rpiece += 1
                prefetch(rpiece + 1)
            lo = lib.unpack_tril(buf_read[ij % chunks_half[0], :])
            lo = Cklam.dot(lo).dot(mo_coeffs[3])
            if kl_red:
                kl = numpy.ravel(lo)
            else:
                kl = lib.pack_tril(lo)
            buf_write[ij % chunks_full[0], :] = kl
    save2(wpiece, buf_write)
    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    if isinstance(erifile, str):
        feri.close()
    return erifile
Beispiel #27
0
def general(eri, mo_coeffs, erifile, dataname='eri_mo',
            ioblk_size=IOBLK_SIZE, compact=True, verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.
    Args:
        eri : 8-fold reduced eri vector
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.
    Kwargs
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By assigning
            different dataname, the existed integral file can be reused.  If
            the erifile contains the dataname, the new integrals data will
            overwrite the old one.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        compact : bool
            When compact is True, depending on the four oribital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals


    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        Assume eri's are 8-fold reduced.
        nij/nkl_pair = npair or i*j/k*l if only transforming a subset

        First half transform:
            Initialize half_eri of size (nij_pair,npair)
                For lo = 1 -> npair
                    Unpack row lo
                    Unpack row lo to matrix E_{uv}^{lo}
                    Transform C_ui^+*E*C_nj -> E_{ij}^{lo}
                    Ravel or pack E_{ij}^{lo}
                    Save E_{ij}^{lo} -> half_eri[:,lo]

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
                For ij = 1 -> nij_pair
                    Load and unpack half_eri[ij,:] -> E_{lo}^{ij}
                    Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                    Repack E_{kl}^{ij}
                    Save E_{kl}^{ij} -> h5d_eri[ij,:]

        Each matrix is indexed by the composite index ij x kl, where ij/kl is
        either npair or ixj/kxl, if only a subset of MOs are being transformed.
        Since entire rows or columns need to be read in, the arrays are chunked
        such that IOBLK_SIZE = row/col x chunking col/row. For example, for the
        first half transform, we would save in nij_pair x IOBLK_SIZE/nij_pair,
        then load in IOBLK_SIZE/nkl_pair x npair for the second half transform.

        ------ kl ----->
        |jxl
        |
        ij
        |
        |
        v

        As a first guess, the chunking size is jxl. If the super-rows/cols are
        larger than IOBLK_SIZE, then the chunk rectangle jxl is trimmed
        accordingly. The pathological limiting case is where the dimensions
        nao_pair, nij_pair, or nkl_pair are so large that the arrays are
        chunked 1x1, in which case IOBLK_SIZE needs to be increased.

    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    nmoi = mo_coeffs[0].shape[1]
    nmoj = mo_coeffs[1].shape[1]
    nmok = mo_coeffs[2].shape[1]
    nmol = mo_coeffs[3].shape[1]
    nao = mo_coeffs[0].shape[0]

    nao_pair = nao*(nao+1) // 2
    if compact and iden_coeffs(mo_coeffs[0], mo_coeffs[1]):
        ij_red = False
        nij_pair = nmoi*(nmoi+1) // 2
    else:
        ij_red = True
        nij_pair = nmoi*nmoj
    if compact and iden_coeffs(mo_coeffs[2], mo_coeffs[3]):
        kl_red = False
        nkl_pair = nmok*(nmok+1) // 2
    else:
        kl_red = True
        nkl_pair = nmok*nmol

    chunks_half = (max(1, numpy.minimum(int(ioblk_size//(nao_pair*f8_size)),nmoj)),
                   max(1, numpy.minimum(int(ioblk_size//(nij_pair*f8_size)),nmol)))
    '''
    ideally, the final transformed eris should have a chunk of nmoj x nmol to
    optimize read operations. However, I'm chunking the row size so that the
    write operations during the transform can be done as fast as possible.
    '''
    chunks_full = (numpy.minimum(int(ioblk_size//(nkl_pair*f8_size)),nmoj),nmol)

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            feri = h5py.File(erifile)
            if dataname in feri:
                del(feri[dataname])
        else:
            feri = h5py.File(erifile,'w',libver='latest')
    else:
        assert(isinstance(erifile, h5py.Group))
        feri = erifile
    h5d_eri = feri.create_dataset(dataname,(nij_pair,nkl_pair),'f8',chunks=chunks_full)

    feri_swap = lib.H5TmpFile(libver='latest')
    half_eri = feri_swap.create_dataset(dataname,(nij_pair,nao_pair),'f8',chunks=chunks_half)

    log.debug('Memory information:')
    log.debug('  IOBLK_SIZE (MB): {}'.format(ioblk_size))
    log.debug('  jxl {}x{}, half eri chunk dim  {}x{}'.format(nmoj,nmol,chunks_half[0],chunks_half[1]))
    log.debug('  jxl {}x{}, full eri chunk dim {}x{}'.format(nmoj,nmol,chunks_full[0],chunks_full[1]))
    log.debug('  Final disk eri size (MB): {:.3g}, chunked {:.3g}'
              .format(nij_pair*nkl_pair*f8_size,numpy.prod(chunks_full)*f8_size))
    log.debug('  Half transformed eri size (MB): {:.3g}, chunked {:.3g}'
              .format(nij_pair*nao_pair*f8_size,numpy.prod(chunks_half)*f8_size))
    log.debug('  RAM buffer for half transform (MB): {:.3g}'
             .format(nij_pair*chunks_half[1]*f8_size*2))
    log.debug('  RAM buffer for full transform (MB): {:.3g}'
             .format(f8_size*chunks_full[0]*nkl_pair*2 + chunks_half[0]*nao_pair*f8_size*2))

    def save1(piece,buf):
        start = piece*chunks_half[1]
        stop = (piece+1)*chunks_half[1]
        if stop > nao_pair:
            stop = nao_pair
        half_eri[:,start:stop] = buf[:,:stop-start]
        return

    def load2(piece):
        start = piece*chunks_half[0]
        stop = (piece+1)*chunks_half[0]
        if stop > nij_pair:
            stop = nij_pair
            if start >= nij_pair:
                start = stop - 1
        return half_eri[start:stop,:]

    def prefetch2(piece):
        start = piece*chunks_half[0]
        stop = (piece+1)*chunks_half[0]
        if stop > nij_pair:
            stop = nij_pair
            if start >= nij_pair:
                start = stop - 1
        buf_prefetch[:stop-start,:] = half_eri[start:stop,:]
        return

    def save2(piece,buf):
        start = piece*chunks_full[0]
        stop = (piece+1)*chunks_full[0]
        if stop > nij_pair:
            stop = nij_pair
        h5d_eri[start:stop,:] = buf[:stop-start,:]
        return

    # transform \mu\nu -> ij
    cput0 = time.clock(), time.time()
    Cimu = mo_coeffs[0].conj().transpose()
    buf_write = numpy.empty((nij_pair,chunks_half[1]))
    buf_out = numpy.empty_like(buf_write)
    wpiece = 0
    with lib.call_in_background(save1) as async_write:
        for lo in range(nao_pair):
            if lo % chunks_half[1] == 0 and lo > 0:
                #save1(wpiece,buf_write)
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece,buf_out)
                wpiece += 1
            buf = lib.unpack_row(eri,lo)
            uv = lib.unpack_tril(buf)
            uv = Cimu.dot(uv).dot(mo_coeffs[1])
            if ij_red:
                ij = numpy.ravel(uv) # grabs by row
            else:
                ij = lib.pack_tril(uv)
            buf_write[:,lo % chunks_half[1]] = ij
    # final write operation & cleanup
    save1(wpiece,buf_write)
    log.timer('(uv|lo) -> (ij|lo)', *cput0)
    uv = None
    ij = None
    buf = None

    # transform \lambda\sigma -> kl
    cput1 = time.clock(), time.time()
    Cklam = mo_coeffs[2].conj().transpose()
    buf_write = numpy.empty((chunks_full[0],nkl_pair))
    buf_out = numpy.empty_like(buf_write)
    buf_read = numpy.empty((chunks_half[0],nao_pair))
    buf_prefetch = numpy.empty_like(buf_read)
    rpiece = 0
    wpiece = 0
    with lib.call_in_background(save2,prefetch2) as (async_write,prefetch):
        buf_read = load2(rpiece)
        prefetch(rpiece+1)
        for ij in range(nij_pair):
            if ij % chunks_full[0] == 0 and ij > 0:
                #save2(wpiece,buf_write)
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece,buf_out)
                wpiece += 1
            if ij % chunks_half[0] == 0 and ij > 0:
                #buf_read = load2(rpiece)
                buf_read, buf_prefetch = buf_prefetch, buf_read
                rpiece += 1
                prefetch(rpiece+1)
            lo = lib.unpack_tril(buf_read[ij % chunks_half[0],:])
            lo = Cklam.dot(lo).dot(mo_coeffs[3])
            if kl_red:
                kl = numpy.ravel(lo)
            else:
                kl = lib.pack_tril(lo)
            buf_write[ij % chunks_full[0],:] = kl
    save2(wpiece,buf_write)
    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    if isinstance(erifile, str):
        feri.close()
    return erifile
Beispiel #28
0
def kernel(mycc, eris, t1=None, t2=None, verbose=logger.NOTE):
    cpu1 = cpu0 = (time.clock(), time.time())
    log = logger.new_logger(mycc, verbose)
    if t1 is None: t1 = mycc.t1
    if t2 is None: t2 = mycc.t2

    nocc, nvir = t1.shape
    nmo = nocc + nvir

    dtype = numpy.result_type(t1, t2, eris.ovoo.dtype)
    if mycc.incore_complete:
        ftmp = None
        eris_vvop = numpy.zeros((nvir,nvir,nocc,nmo), dtype)
    else:
        ftmp = lib.H5TmpFile()
        eris_vvop = ftmp.create_dataset('vvop', (nvir,nvir,nocc,nmo), dtype)

    orbsym = _sort_eri(mycc, eris, nocc, nvir, eris_vvop, log)

    mo_energy, t1T, t2T, vooo, fvo, restore_t2_inplace = \
            _sort_t2_vooo_(mycc, orbsym, t1, t2, eris)
    cpu1 = log.timer_debug1('CCSD(T) sort_eri', *cpu1)

    cpu2 = list(cpu1)
    orbsym = numpy.hstack((numpy.sort(orbsym[:nocc]),numpy.sort(orbsym[nocc:])))
    o_ir_loc = numpy.append(0, numpy.cumsum(numpy.bincount(orbsym[:nocc], minlength=8)))
    v_ir_loc = numpy.append(0, numpy.cumsum(numpy.bincount(orbsym[nocc:], minlength=8)))
    o_sym = orbsym[:nocc]
    oo_sym = (o_sym[:,None] ^ o_sym).ravel()
    oo_ir_loc = numpy.append(0, numpy.cumsum(numpy.bincount(oo_sym, minlength=8)))
    nirrep = max(oo_sym) + 1

    orbsym   = orbsym.astype(numpy.int32)
    o_ir_loc = o_ir_loc.astype(numpy.int32)
    v_ir_loc = v_ir_loc.astype(numpy.int32)
    oo_ir_loc = oo_ir_loc.astype(numpy.int32)
    if dtype == numpy.complex:
        drv = _ccsd.libcc.CCsd_t_zcontract
    else:
        drv = _ccsd.libcc.CCsd_t_contract
    et_sum = numpy.zeros(1, dtype=dtype)
    def contract(a0, a1, b0, b1, cache):
        cache_row_a, cache_col_a, cache_row_b, cache_col_b = cache
        drv(et_sum.ctypes.data_as(ctypes.c_void_p),
            mo_energy.ctypes.data_as(ctypes.c_void_p),
            t1T.ctypes.data_as(ctypes.c_void_p),
            t2T.ctypes.data_as(ctypes.c_void_p),
            vooo.ctypes.data_as(ctypes.c_void_p),
            fvo.ctypes.data_as(ctypes.c_void_p),
            ctypes.c_int(nocc), ctypes.c_int(nvir),
            ctypes.c_int(a0), ctypes.c_int(a1),
            ctypes.c_int(b0), ctypes.c_int(b1),
            ctypes.c_int(nirrep),
            o_ir_loc.ctypes.data_as(ctypes.c_void_p),
            v_ir_loc.ctypes.data_as(ctypes.c_void_p),
            oo_ir_loc.ctypes.data_as(ctypes.c_void_p),
            orbsym.ctypes.data_as(ctypes.c_void_p),
            cache_row_a.ctypes.data_as(ctypes.c_void_p),
            cache_col_a.ctypes.data_as(ctypes.c_void_p),
            cache_row_b.ctypes.data_as(ctypes.c_void_p),
            cache_col_b.ctypes.data_as(ctypes.c_void_p))
        cpu2[:] = log.timer_debug1('contract %d:%d,%d:%d'%(a0,a1,b0,b1), *cpu2)

    # The rest 20% memory for cache b
    mem_now = lib.current_memory()[0]
    max_memory = max(0, mycc.max_memory - mem_now)
    bufsize = (max_memory*.5e6/8-nocc**3*3*lib.num_threads())/(nocc*nmo)  #*.5 for async_io
    bufsize *= .5  #*.5 upper triangular part is loaded
    bufsize *= .8  #*.8 for [a0:a1]/[b0:b1] partition
    bufsize = max(8, bufsize)
    log.debug('max_memory %d MB (%d MB in use)', max_memory, mem_now)
    with lib.call_in_background(contract, sync=not mycc.async_io) as async_contract:
        for a0, a1 in reversed(list(lib.prange_tril(0, nvir, bufsize))):
            cache_row_a = numpy.asarray(eris_vvop[a0:a1,:a1], order='C')
            if a0 == 0:
                cache_col_a = cache_row_a
            else:
                cache_col_a = numpy.asarray(eris_vvop[:a0,a0:a1], order='C')
            async_contract(a0, a1, a0, a1, (cache_row_a,cache_col_a,
                                            cache_row_a,cache_col_a))

            for b0, b1 in lib.prange_tril(0, a0, bufsize/8):
                cache_row_b = numpy.asarray(eris_vvop[b0:b1,:b1], order='C')
                if b0 == 0:
                    cache_col_b = cache_row_b
                else:
                    cache_col_b = numpy.asarray(eris_vvop[:b0,b0:b1], order='C')
                async_contract(a0, a1, b0, b1, (cache_row_a,cache_col_a,
                                                cache_row_b,cache_col_b))

    t2 = restore_t2_inplace(t2T)
    et_sum *= 2
    if abs(et_sum[0].imag) > 1e-4:
        logger.warn(mycc, 'Non-zero imaginary part of CCSD(T) energy was found %s',
                    et_sum[0])
    et = et_sum[0].real
    log.timer('CCSD(T)', *cpu0)
    log.note('CCSD(T) correction = %.15g', et)
    return et
'''
This example shows how to use the call_in_background macro
'''

from pyscf import lib
import time

def fa():
    print('a')
    time.sleep(0.5)

def fb():
    print('b')
    time.sleep(0.8)

print('type 1')
w0 = time.time()
with lib.call_in_background(fa) as afa, lib.call_in_background(fb) as afb:
    for i in range(3):
        afa()
        afb()
print('total time = %.1f s  = [fb]0.8 * 3 seconds' % (time.time() - w0))

print('type 2')
w0 = time.time()
with lib.call_in_background(fa, fb) as (afa, afb):
    for i in range(3):
        afa()
        afb()
print('total time = %.1f s  = ([fa]0.5 + [fb]0.8) * 3 seconds' % (time.time() - w0))
Beispiel #30
0
def _make_j3c(mydf, cell, auxcell, kptij_lst, cderi_file):
    log = logger.Logger(mydf.stdout, mydf.verbose)
    t1 = t0 = (time.clock(), time.time())

    fused_cell, fuse = fuse_auxcell(mydf, mydf.auxcell)
    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    naux = auxcell.nao_nr()
    nkptij = len(kptij_lst)
    gs = mydf.gs
    Gv, Gvbase, kws = cell.get_Gv_weights(gs)
    b = cell.reciprocal_vectors()
    gxyz = lib.cartesian_prod([numpy.arange(len(x)) for x in Gvbase])
    ngs = gxyz.shape[0]

    kptis = kptij_lst[:, 0]
    kptjs = kptij_lst[:, 1]
    kpt_ji = kptjs - kptis
    uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
    log.debug('Num uniq kpts %d', len(uniq_kpts))
    log.debug2('uniq_kpts %s', uniq_kpts)
    # j2c ~ (-kpt_ji | kpt_ji)
    j2c = fused_cell.pbc_intor('int2c2e_sph', hermi=1, kpts=uniq_kpts)
    j2ctags = []
    nauxs = []
    t1 = log.timer_debug1('2c2e', *t1)

    if h5py.is_hdf5(cderi_file):
        feri = h5py.File(cderi_file)
    else:
        feri = h5py.File(cderi_file, 'w')
    for k, kpt in enumerate(uniq_kpts):
        aoaux = ft_ao.ft_ao(fused_cell, Gv, None, b, gxyz, Gvbase, kpt).T
        coulG = numpy.sqrt(mydf.weighted_coulG(kpt, False, gs))
        kLR = (aoaux.real * coulG).T
        kLI = (aoaux.imag * coulG).T
        if not kLR.flags.c_contiguous: kLR = lib.transpose(kLR.T)
        if not kLI.flags.c_contiguous: kLI = lib.transpose(kLI.T)
        aoaux = None

        kLR1 = numpy.asarray(kLR[:, naux:], order='C')
        kLI1 = numpy.asarray(kLI[:, naux:], order='C')
        if is_zero(kpt):  # kpti == kptj
            for p0, p1 in mydf.mpi_prange(0, ngs):
                j2cR = lib.ddot(kLR1[p0:p1].T, kLR[p0:p1])
                j2cR = lib.ddot(kLI1[p0:p1].T, kLI[p0:p1], 1, j2cR, 1)
                j2c[k][naux:] -= mpi.allreduce(j2cR)
                j2c[k][:naux, naux:] = j2c[k][naux:, :naux].T
        else:
            for p0, p1 in mydf.mpi_prange(0, ngs):
                j2cR, j2cI = zdotCN(kLR1[p0:p1].T, kLI1[p0:p1].T, kLR[p0:p1],
                                    kLI[p0:p1])
                j2cR = mpi.allreduce(j2cR)
                j2cI = mpi.allreduce(j2cI)
                j2c[k][naux:] -= j2cR + j2cI * 1j
                j2c[k][:naux, naux:] = j2c[k][naux:, :naux].T.conj()
        j2c[k] = fuse(fuse(j2c[k]).T).T
        try:
            feri['j2c/%d' % k] = scipy.linalg.cholesky(j2c[k], lower=True)
            j2ctags.append('CD')
            nauxs.append(naux)
        except scipy.linalg.LinAlgError as e:
            #msg =('===================================\n'
            #      'J-metric not positive definite.\n'
            #      'It is likely that gs is not enough.\n'
            #      '===================================')
            #log.error(msg)
            #raise scipy.linalg.LinAlgError('\n'.join([e.message, msg]))
            w, v = scipy.linalg.eigh(j2c)
            log.debug2('metric linear dependency for kpt %s', uniq_kptji_id)
            log.debug2('cond = %.4g, drop %d bfns', w[0] / w[-1],
                       numpy.count_nonzero(w < LINEAR_DEP_THR))
            v = v[:, w > LINEAR_DEP_THR].T.conj()
            v /= numpy.sqrt(w[w > LINEAR_DEP_THR]).reshape(-1, 1)
            feri['j2c/%d' % k] = v
            j2ctags.append('eig')
            nauxs.append(v.shape[0])
        kLR = kLI = kLR1 = kLI1 = coulG = None
    j2c = None

    aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9
    j_only = numpy.all(aosym_s2)
    if gamma_point(kptij_lst):
        dtype = 'f8'
    else:
        dtype = 'c16'
    vbar = mydf.auxbar(fused_cell)
    vbar = fuse(vbar)
    ovlp = cell.pbc_intor('int1e_ovlp_sph', hermi=1, kpts=kptjs[aosym_s2])
    ovlp = [lib.pack_tril(s) for s in ovlp]
    t1 = log.timer_debug1('aoaux and int2c', *t1)

    # Estimates the buffer size based on the last contraction in G-space.
    # This contraction requires to hold nkptj copies of (naux,?) array
    # simultaneously in memory.
    mem_now = max(comm.allgather(lib.current_memory()[0]))
    max_memory = max(2000, mydf.max_memory - mem_now)
    nkptj_max = max((uniq_inverse == x).sum() for x in set(uniq_inverse))
    buflen = max(
        int(
            min(max_memory * .5e6 / 16 / naux / (nkptj_max + 2) / nao,
                nao / 3 / mpi.pool.size)), 1)
    chunks = (buflen, nao)

    j3c_jobs = grids2d_int3c_jobs(cell, auxcell, kptij_lst, chunks, j_only)
    log.debug1('max_memory = %d MB (%d in use)  chunks %s', max_memory,
               mem_now, chunks)
    log.debug2('j3c_jobs %s', j3c_jobs)

    if j_only:
        int3c = wrap_int3c(cell, fused_cell, 'int3c2e_sph', 's2', 1, kptij_lst)
    else:
        int3c = wrap_int3c(cell, fused_cell, 'int3c2e_sph', 's1', 1, kptij_lst)
        idxb = numpy.tril_indices(nao)
        idxb = (idxb[0] * nao + idxb[1]).astype('i')
    aux_loc = fused_cell.ao_loc_nr('ssc' in 'int3c2e_sph')

    def gen_int3c(auxcell, job_id, ish0, ish1):
        dataname = 'j3c-chunks/%d' % job_id
        if dataname in feri:
            del (feri[dataname])

        i0 = ao_loc[ish0]
        i1 = ao_loc[ish1]
        dii = i1 * (i1 + 1) // 2 - i0 * (i0 + 1) // 2
        dij = (i1 - i0) * nao
        if j_only:
            buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dii + dii)))
        else:
            buflen = max(8, int(max_memory * 1e6 / 16 / (nkptij * dij + dij)))
        auxranges = balance_segs(aux_loc[1:] - aux_loc[:-1], buflen)
        buflen = max([x[2] for x in auxranges])
        buf = numpy.empty(nkptij * dij * buflen, dtype=dtype)
        buf1 = numpy.empty(dij * buflen, dtype=dtype)

        naux = aux_loc[-1]
        for kpt_id, kptij in enumerate(kptij_lst):
            key = '%s/%d' % (dataname, kpt_id)
            if aosym_s2[kpt_id]:
                shape = (naux, dii)
            else:
                shape = (naux, dij)
            if gamma_point(kptij):
                feri.create_dataset(key, shape, 'f8')
            else:
                feri.create_dataset(key, shape, 'c16')

        naux0 = 0
        for istep, auxrange in enumerate(auxranges):
            log.alldebug2("aux_e2 job_id %d step %d", job_id, istep)
            sh0, sh1, nrow = auxrange
            sub_slice = (ish0, ish1, 0, cell.nbas, sh0, sh1)
            if j_only:
                mat = numpy.ndarray((nkptij, dii, nrow),
                                    dtype=dtype,
                                    buffer=buf)
            else:
                mat = numpy.ndarray((nkptij, dij, nrow),
                                    dtype=dtype,
                                    buffer=buf)
            mat = int3c(sub_slice, mat)

            for k, kptij in enumerate(kptij_lst):
                h5dat = feri['%s/%d' % (dataname, k)]
                v = lib.transpose(mat[k], out=buf1)
                if not j_only and aosym_s2[k]:
                    idy = idxb[i0 * (i0 + 1) // 2:i1 *
                               (i1 + 1) // 2] - i0 * nao
                    out = numpy.ndarray((nrow, dii),
                                        dtype=v.dtype,
                                        buffer=mat[k])
                    v = numpy.take(v, idy, axis=1, out=out)
                if gamma_point(kptij):
                    h5dat[naux0:naux0 + nrow] = v.real
                else:
                    h5dat[naux0:naux0 + nrow] = v
            naux0 += nrow

    def ft_fuse(job_id, uniq_kptji_id, sh0, sh1):
        kpt = uniq_kpts[uniq_kptji_id]  # kpt = kptj - kpti
        adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0]
        adapted_kptjs = kptjs[adapted_ji_idx]
        nkptj = len(adapted_kptjs)

        shls_slice = (auxcell.nbas, fused_cell.nbas)
        Gaux = ft_ao.ft_ao(fused_cell, Gv, shls_slice, b, gxyz, Gvbase, kpt)
        Gaux *= mydf.weighted_coulG(kpt, False, gs).reshape(-1, 1)
        kLR = Gaux.real.copy('C')
        kLI = Gaux.imag.copy('C')
        j2c = numpy.asarray(feri['j2c/%d' % uniq_kptji_id])
        j2ctag = j2ctags[uniq_kptji_id]
        naux0 = j2c.shape[0]

        if is_zero(kpt):
            aosym = 's2'
        else:
            aosym = 's1'

        j3cR = [None] * nkptj
        j3cI = [None] * nkptj
        i0 = ao_loc[sh0]
        i1 = ao_loc[sh1]
        for k, idx in enumerate(adapted_ji_idx):
            key = 'j3c-chunks/%d/%d' % (job_id, idx)
            v = numpy.asarray(feri[key])
            if is_zero(kpt):
                for i, c in enumerate(vbar):
                    if c != 0:
                        v[i] -= c * ovlp[k][i0 * (i0 + 1) // 2:i1 *
                                            (i1 + 1) // 2].ravel()
            j3cR[k] = numpy.asarray(v.real, order='C')
            if v.dtype == numpy.complex128:
                j3cI[k] = numpy.asarray(v.imag, order='C')
            v = None

        ncol = j3cR[0].shape[1]
        Gblksize = max(16, int(max_memory * 1e6 / 16 / ncol /
                               (nkptj + 1)))  # +1 for pqkRbuf/pqkIbuf
        Gblksize = min(Gblksize, ngs, 16384)
        pqkRbuf = numpy.empty(ncol * Gblksize)
        pqkIbuf = numpy.empty(ncol * Gblksize)
        buf = numpy.empty(nkptj * ncol * Gblksize, dtype=numpy.complex128)
        log.alldebug2('    blksize (%d,%d)', Gblksize, ncol)

        shls_slice = (sh0, sh1, 0, cell.nbas)
        for p0, p1 in lib.prange(0, ngs, Gblksize):
            dat = ft_ao._ft_aopair_kpts(cell,
                                        Gv[p0:p1],
                                        shls_slice,
                                        aosym,
                                        b,
                                        gxyz[p0:p1],
                                        Gvbase,
                                        kpt,
                                        adapted_kptjs,
                                        out=buf)
            nG = p1 - p0
            for k, ji in enumerate(adapted_ji_idx):
                aoao = dat[k].reshape(nG, ncol)
                pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf)
                pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf)
                pqkR[:] = aoao.real.T
                pqkI[:] = aoao.imag.T

                lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k][naux:], 1)
                lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k][naux:], 1)
                if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])):
                    lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k][naux:], 1)
                    lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k][naux:], 1)

        for k, idx in enumerate(adapted_ji_idx):
            if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                v = fuse(j3cR[k])
            else:
                v = fuse(j3cR[k] + j3cI[k] * 1j)
            if j2ctag == 'CD':
                v = scipy.linalg.solve_triangular(j2c,
                                                  v,
                                                  lower=True,
                                                  overwrite_b=True)
            else:
                v = lib.dot(j2c, v)
            feri['j3c-chunks/%d/%d' % (job_id, idx)][:naux0] = v

    t2 = t1
    j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int)
    #for job_id, ish0, ish1 in mpi.work_share_partition(j3c_jobs):
    for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs):
        gen_int3c(fused_cell, job_id, ish0, ish1)
        t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2)

        for k, kpt in enumerate(uniq_kpts):
            ft_fuse(job_id, k, ish0, ish1)
            t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2)

        j3c_workers[job_id] = rank
    j3c_workers = mpi.allreduce(j3c_workers)
    log.debug2('j3c_workers %s', j3c_workers)
    j2c = kLRs = kLIs = ovlp = vbar = fuse = gen_int3c = ft_fuse = None
    t1 = log.timer_debug1('int3c and fuse', *t1)

    def get_segs_loc(aosym):
        off0 = numpy.asarray([ao_loc[i0] for x, i0, i1 in j3c_jobs])
        off1 = numpy.asarray([ao_loc[i1] for x, i0, i1 in j3c_jobs])
        if aosym:  # s2
            dims = off1 * (off1 + 1) // 2 - off0 * (off0 + 1) // 2
        else:
            dims = (off1 - off0) * nao
        #dims = numpy.asarray([ao_loc[i1]-ao_loc[i0] for x,i0,i1 in j3c_jobs])
        dims = numpy.hstack(
            [dims[j3c_workers == w] for w in range(mpi.pool.size)])
        job_idx = numpy.hstack(
            [numpy.where(j3c_workers == w)[0] for w in range(mpi.pool.size)])
        segs_loc = numpy.append(0, numpy.cumsum(dims))
        segs_loc = [(segs_loc[j], segs_loc[j + 1])
                    for j in numpy.argsort(job_idx)]
        return segs_loc

    segs_loc_s1 = get_segs_loc(False)
    segs_loc_s2 = get_segs_loc(True)

    if 'j3c' in feri: del (feri['j3c'])
    segsize = (max(nauxs) + mpi.pool.size - 1) // mpi.pool.size
    naux0 = rank * segsize
    for k, kptij in enumerate(kptij_lst):
        naux1 = min(nauxs[uniq_inverse[k]], naux0 + segsize)
        nrow = max(0, naux1 - naux0)
        if gamma_point(kptij):
            dtype = 'f8'
        else:
            dtype = 'c16'
        if aosym_s2[k]:
            nao_pair = nao * (nao + 1) // 2
        else:
            nao_pair = nao * nao
        feri.create_dataset('j3c/%d' % k, (nrow, nao_pair),
                            dtype,
                            maxshape=(None, nao_pair))

    def load(k, p0, p1):
        naux1 = nauxs[uniq_inverse[k]]
        slices = [(min(i * segsize + p0, naux1), min(i * segsize + p1, naux1))
                  for i in range(mpi.pool.size)]
        segs = []
        for p0, p1 in slices:
            val = []
            for job_id, worker in enumerate(j3c_workers):
                if rank == worker:
                    key = 'j3c-chunks/%d/%d' % (job_id, k)
                    val.append(feri[key][p0:p1].ravel())
            if val:
                segs.append(numpy.hstack(val))
            else:
                segs.append(numpy.zeros(0))
        return segs

    def save(k, p0, p1, segs):
        segs = mpi.alltoall(segs)
        naux1 = nauxs[uniq_inverse[k]]
        loc0, loc1 = min(p0, naux1 - naux0), min(p1, naux1 - naux0)
        nL = loc1 - loc0
        if nL > 0:
            if aosym_s2[k]:
                segs = numpy.hstack([
                    segs[i0 * nL:i1 * nL].reshape(nL, -1)
                    for i0, i1 in segs_loc_s2
                ])
            else:
                segs = numpy.hstack([
                    segs[i0 * nL:i1 * nL].reshape(nL, -1)
                    for i0, i1 in segs_loc_s1
                ])
            feri['j3c/%d' % k][loc0:loc1] = segs

    mem_now = max(comm.allgather(lib.current_memory()[0]))
    max_memory = max(2000, min(8000, mydf.max_memory - mem_now))
    if numpy.all(aosym_s2):
        if gamma_point(kptij_lst):
            blksize = max(16, int(max_memory * .5e6 / 8 / nao**2))
        else:
            blksize = max(16, int(max_memory * .5e6 / 16 / nao**2))
    else:
        blksize = max(16, int(max_memory * .5e6 / 16 / nao**2 / 2))
    log.debug1('max_momory %d MB (%d in use), blksize %d', max_memory, mem_now,
               blksize)

    t2 = t1
    with lib.call_in_background(save) as async_write:
        for k, kptji in enumerate(kptij_lst):
            for p0, p1 in lib.prange(0, segsize, blksize):
                segs = load(k, p0, p1)
                async_write(k, p0, p1, segs)
                t2 = log.timer_debug1(
                    'assemble k=%d %d:%d (in %d)' % (k, p0, p1, segsize), *t2)

    if 'j3c-chunks' in feri: del (feri['j3c-chunks'])
    if 'j3c-kptij' in feri: del (feri['j3c-kptij'])
    feri['j3c-kptij'] = kptij_lst
    t1 = log.alltimer_debug1('assembling j3c', *t1)
    feri.close()
Beispiel #31
0
def general(mol, mo_coeffs, erifile, dataname='eri_mo',
            intor='int2e', aosym='s4', comp=None,
            max_memory=MAX_MEMORY, ioblk_size=IOBLK_SIZE, verbose=logger.WARN,
            compact=True):
    r'''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on the fly.

    Args:
        mol : :class:`Mole` object
            AO integrals will be generated in terms of mol._atm, mol._bas, mol._env
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.

    Kwargs
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By assigning
            different dataname, the existed integral file can be reused.  If
            the erifile contains the dataname, the new integrals data will
            overwrite the old one.
        intor : str
            Name of the 2-electron integral.  Ref to :func:`getints_by_shell`
            for the complete list of available 2-electron integral names
        aosym : int or str
            Permutation symmetry for the AO integrals

            | 4 or '4' or 's4': 4-fold symmetry (default)
            | '2ij' or 's2ij' : symmetry between i, j in (ij|kl)
            | '2kl' or 's2kl' : symmetry between k, l in (ij|kl)
            | 1 or '1' or 's1': no symmetry
            | 'a4ij' : 4-fold symmetry with anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a4kl' : 4-fold symmetry with anti-symmetry between k, l in (ij|kl) (TODO)
            | 'a2ij' : anti-symmetry between i, j in (ij|kl) (TODO)
            | 'a2kl' : anti-symmetry between k, l in (ij|kl) (TODO)

        comp : int
            Components of the integrals, e.g. int2e_ip_sph has 3 components.
        max_memory : float or int
            The maximum size of cache to use (in MB), large cache may **not**
            improve performance.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve performance
        verbose : int
            Print level
        compact : bool
            When compact is True, depending on the four oribital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation symmetry,
            and return the "plain" MO integrals

    Returns:
        None

    Examples:

    >>> from pyscf import gto
    >>> from pyscf import ao2mo
    >>> import h5py
    >>> def view(h5file, dataname='eri_mo'):
    ...     f5 = h5py.File(h5file, 'r')
    ...     print('dataset %s, shape %s' % (str(f5.keys()), str(f5[dataname].shape)))
    ...     f5.close()
    >>> mol = gto.M(atom='O 0 0 0; H 0 1 0; H 0 0 1', basis='sto3g')
    >>> mo1 = numpy.random.random((mol.nao_nr(), 10))
    >>> mo2 = numpy.random.random((mol.nao_nr(), 8))
    >>> mo3 = numpy.random.random((mol.nao_nr(), 6))
    >>> mo4 = numpy.random.random((mol.nao_nr(), 4))
    >>> ao2mo.outcore.general(mol, (mo1,mo2,mo3,mo4), 'oh2.h5')
    >>> view('oh2.h5')
    dataset ['eri_mo'], shape (80, 24)
    >>> ao2mo.outcore.general(mol, (mo1,mo2,mo3,mo3), 'oh2.h5')
    >>> view('oh2.h5')
    dataset ['eri_mo'], shape (80, 21)
    >>> ao2mo.outcore.general(mol, (mo1,mo2,mo3,mo3), 'oh2.h5', compact=False)
    >>> view('oh2.h5')
    dataset ['eri_mo'], shape (80, 36)
    >>> ao2mo.outcore.general(mol, (mo1,mo1,mo2,mo2), 'oh2.h5')
    >>> view('oh2.h5')
    dataset ['eri_mo'], shape (55, 36)
    >>> ao2mo.outcore.general(mol, (mo1,mo1,mo1,mo1), 'oh2.h5', dataname='new')
    >>> view('oh2.h5', 'new')
    dataset ['eri_mo', 'new'], shape (55, 55)
    >>> ao2mo.outcore.general(mol, (mo1,mo1,mo1,mo1), 'oh2.h5', intor='int2e_ip1_sph', aosym='s1', comp=3)
    >>> view('oh2.h5')
    dataset ['eri_mo', 'new'], shape (3, 100, 100)
    >>> ao2mo.outcore.general(mol, (mo1,mo1,mo1,mo1), 'oh2.h5', intor='int2e_ip1_sph', aosym='s2kl', comp=3)
    >>> view('oh2.h5')
    dataset ['eri_mo', 'new'], shape (3, 100, 55)
    '''
    if any(c.dtype == numpy.complex128 for c in mo_coeffs):
        raise NotImplementedError('Integral transformation for complex orbitals')

    time_0pass = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mol, verbose)

    nmoi = mo_coeffs[0].shape[1]
    nmoj = mo_coeffs[1].shape[1]
    nmol = mo_coeffs[3].shape[1]
    nao = mo_coeffs[0].shape[0]

    intor, comp = gto.moleintor._get_intor_and_comp(mol._add_suffix(intor), comp)
    assert(nao == mol.nao_nr('_cart' in intor))

    aosym = _stand_sym_code(aosym)
    if aosym in ('s4', 's2kl'):
        nao_pair = nao * (nao+1) // 2
    else:
        nao_pair = nao * nao

    if (compact and iden_coeffs(mo_coeffs[0], mo_coeffs[1]) and
        aosym in ('s4', 's2ij')):
        nij_pair = nmoi*(nmoi+1) // 2
    else:
        nij_pair = nmoi*nmoj

    klmosym, nkl_pair, mokl, klshape = \
            incore._conc_mos(mo_coeffs[2], mo_coeffs[3],
                             compact and aosym in ('s4', 's2kl'))

#    if nij_pair > nkl_pair:
#        log.warn('low efficiency for AO to MO trans!')

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            feri = h5py.File(erifile, 'a')
            if dataname in feri:
                del(feri[dataname])
        else:
            feri = h5py.File(erifile, 'w')
    else:
        assert(isinstance(erifile, h5py.Group))
        feri = erifile

    if comp == 1:
        chunks = (nmoj, nmol)
        shape = (nij_pair, nkl_pair)
    else:
        chunks = (1, nmoj, nmol)
        shape = (comp, nij_pair, nkl_pair)

    if nij_pair == 0 or nkl_pair == 0:
        feri.create_dataset(dataname, shape, 'f8')
        if isinstance(erifile, str):
            feri.close()
        return erifile
    else:
        h5d_eri = feri.create_dataset(dataname, shape, 'f8', chunks=chunks)

    log.debug('MO integrals %s are saved in %s/%s', intor, erifile, dataname)
    log.debug('num. MO ints = %.8g, required disk %.8g MB',
              float(nij_pair)*nkl_pair*comp, nij_pair*nkl_pair*comp*8/1e6)

# transform e1
    fswap = lib.H5TmpFile()
    half_e1(mol, mo_coeffs, fswap, intor, aosym, comp, max_memory, ioblk_size,
            log, compact)

    time_1pass = log.timer('AO->MO transformation for %s 1 pass'%intor,
                           *time_0pass)

    def load(icomp, row0, row1, buf):
        if icomp+1 < comp:
            icomp += 1
        else:  # move to next row-block
            row0, row1 = row1, min(nij_pair, row1+iobuflen)
            icomp = 0
        if row0 < row1:
            _load_from_h5g(fswap['%d'%icomp], row0, row1, buf)

    def save(icomp, row0, row1, buf):
        if comp == 1:
            h5d_eri[row0:row1] = buf[:row1-row0]
        else:
            h5d_eri[icomp,row0:row1] = buf[:row1-row0]

    ioblk_size = max(max_memory*.1, ioblk_size)
    iobuflen = guess_e2bufsize(ioblk_size, nij_pair, max(nao_pair,nkl_pair))[0]
    buf = numpy.empty((iobuflen,nao_pair))
    buf_prefetch = numpy.empty_like(buf)
    outbuf = numpy.empty((iobuflen,nkl_pair))
    buf_write = numpy.empty_like(outbuf)

    log.debug('step2: kl-pair (ao %d, mo %d), mem %.8g MB, ioblock %.8g MB',
              nao_pair, nkl_pair, iobuflen*nao_pair*8/1e6,
              iobuflen*nkl_pair*8/1e6)

    #klaoblks = len(fswap['0'])
    ijmoblks = int(numpy.ceil(float(nij_pair)/iobuflen)) * comp
    ao_loc = mol.ao_loc_nr('_cart' in intor)
    ti0 = time_1pass
    istep = 0
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as async_write:
            _load_from_h5g(fswap['0'], 0, min(nij_pair, iobuflen), buf_prefetch)

            for row0, row1 in prange(0, nij_pair, iobuflen):
                nrow = row1 - row0

                for icomp in range(comp):
                    istep += 1
                    log.debug1('step 2 [%d/%d], [%d,%d:%d], row = %d',
                               istep, ijmoblks, icomp, row0, row1, nrow)

                    buf, buf_prefetch = buf_prefetch, buf
                    prefetch(icomp, row0, row1, buf_prefetch)
                    _ao2mo.nr_e2(buf[:nrow], mokl, klshape, aosym, klmosym,
                                 ao_loc=ao_loc, out=outbuf)
                    async_write(icomp, row0, row1, outbuf)
                    outbuf, buf_write = buf_write, outbuf  # avoid flushing writing buffer

                    ti1 = (logger.process_clock(), logger.perf_counter())
                    log.debug1('step 2 [%d/%d] CPU time: %9.2f, Wall time: %9.2f',
                               istep, ijmoblks, ti1[0]-ti0[0], ti1[1]-ti0[1])
                    ti0 = ti1

    fswap = None
    if isinstance(erifile, str):
        feri.close()

    log.timer('AO->MO transformation for %s 2 pass'%intor, *time_1pass)
    log.timer('AO->MO transformation for %s '%intor, *time_0pass)
    return erifile
Beispiel #32
0
def _ao2mo_ovov(mp, orbo, orbv, feri, max_memory=2000, verbose=None):
    time0 = (time.clock(), time.time())
    log = logger.new_logger(mp, verbose)

    mol = mp.mol
    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nao, nocc = orbo.shape
    nvir = orbv.shape[1]
    nbas = mol.nbas
    assert (nvir <= nao)

    ao_loc = mol.ao_loc_nr()
    dmax = max(
        4, min(nao / 3, numpy.sqrt(max_memory * .95e6 / 8 / (nao + nocc)**2)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao, dmax, dmax, nao))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax,
              nocc**2 * (nao * (nao + dmax) / 2 + nvir**2) * 8 / 1e6)

    buf_i = numpy.empty((nocc * dmax**2 * nao))
    buf_li = numpy.empty((nocc**2 * dmax**2))
    buf1 = numpy.empty_like(buf_li)

    fint = gto.moleintor.getints4c
    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(ftmp.__setitem__) as save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            for jsh0, jsh1, nj in sh_ranges[:ip + 1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0, i1, j0, j1))

                eri = fint(int2e,
                           mol._atm,
                           mol._bas,
                           mol._env,
                           shls_slice=(0, nbas, ish0, ish1, jsh0, jsh1, 0,
                                       nbas),
                           aosym='s1',
                           ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt,
                           out=eribuf)
                tmp_i = numpy.ndarray((nocc, (i1 - i0) * (j1 - j0) * nao),
                                      buffer=buf_i)
                tmp_li = numpy.ndarray((nocc, nocc * (i1 - i0) * (j1 - j0)),
                                       buffer=buf_li)
                lib.ddot(orbo.T,
                         eri.reshape(nao, (i1 - i0) * (j1 - j0) * nao),
                         c=tmp_i)
                lib.ddot(orbo.T,
                         tmp_i.reshape(nocc * (i1 - i0) * (j1 - j0), nao).T,
                         c=tmp_li)
                tmp_li = tmp_li.reshape(nocc, nocc, (i1 - i0), (j1 - j0))
                save(str(count), tmp_li.transpose(1, 0, 2, 3))
                buf_li, buf1 = buf1, buf_li
                count += 1
                time1 = log.timer_debug1(
                    'partial ao2mo [%d:%d,%d:%d]' % (ish0, ish1, jsh0, jsh1),
                    *time1)
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)
    eri = eribuf = tmp_i = tmp_li = buf_i = buf_li = buf1 = None

    h5dat = feri.create_dataset('ovov', (nocc * nvir, nocc * nvir),
                                'f8',
                                chunks=(nvir, nvir))
    occblk = int(
        min(nocc,
            max(4, 250 / nocc, max_memory * .9e6 / 8 / (nao**2 * nocc) / 5)))

    def load(i0, eri):
        if i0 < nocc:
            i1 = min(i0 + occblk, nocc)
            for k, (p0, p1, q0, q1) in enumerate(jk_blk_slices):
                eri[:i1 - i0, :, p0:p1, q0:q1] = ftmp[str(k)][i0:i1]
                if p0 != q0:
                    dat = numpy.asarray(ftmp[str(k)][:, i0:i1])
                    eri[:i1 - i0, :, q0:q1, p0:p1] = dat.transpose(1, 0, 3, 2)

    def save(i0, i1, dat):
        for i in range(i0, i1):
            h5dat[i * nvir:(i + 1) * nvir] = dat[i - i0].reshape(
                nvir, nocc * nvir)

    orbv = numpy.asarray(orbv, order='F')
    buf_prefecth = numpy.empty((occblk, nocc, nao, nao))
    buf = numpy.empty_like(buf_prefecth)
    bufw = numpy.empty((occblk * nocc, nvir**2))
    bufw1 = numpy.empty_like(bufw)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as bsave:
            load(0, buf_prefecth)
            for i0, i1 in lib.prange(0, nocc, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(i1, buf_prefecth)
                eri = buf[:i1 - i0].reshape((i1 - i0) * nocc, nao, nao)

                dat = _ao2mo.nr_e2(eri,
                                   orbv, (0, nvir, 0, nvir),
                                   's1',
                                   's1',
                                   out=bufw)
                bsave(
                    i0, i1,
                    dat.reshape(i1 - i0, nocc, nvir,
                                nvir).transpose(0, 2, 1, 3))
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0, i1),
                                         *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    return h5dat
Beispiel #33
0
def cholesky_eri_b(mol,
                   erifile,
                   auxbasis='weigend+etb',
                   dataname='j3c',
                   int3c='int3c2e',
                   aosym='s2ij',
                   int2c='int2c2e',
                   comp=1,
                   max_memory=MAX_MEMORY,
                   auxmol=None,
                   verbose=logger.NOTE):
    '''3-center 2-electron DF tensor. Similar to cholesky_eri while this
    function stores DF tensor in blocks.
    '''
    assert (aosym in ('s1', 's2ij'))
    log = logger.new_logger(mol, verbose)
    time0 = (time.clock(), time.time())

    if auxmol is None:
        auxmol = make_auxmol(mol, auxbasis)
    j2c = auxmol.intor(int2c, hermi=1)
    log.debug('size of aux basis %d', j2c.shape[0])
    time1 = log.timer('2c2e', *time0)
    try:
        low = scipy.linalg.cholesky(j2c, lower=True)
        tag = 'cd'
    except scipy.linalg.LinAlgError:
        w, v = scipy.linalg.eigh(j2c)
        idx = w > LINEAR_DEP_THR
        low = (v[:, idx] / numpy.sqrt(w[idx]))
        v = None
        tag = 'eig'
    j2c = None
    naoaux, naux = low.shape
    time1 = log.timer('Cholesky 2c2e', *time1)

    int3c = gto.moleintor.ascint3(mol._add_suffix(int3c))
    atm, bas, env = gto.mole.conc_env(mol._atm, mol._bas, mol._env,
                                      auxmol._atm, auxmol._bas, auxmol._env)
    ao_loc = gto.moleintor.make_loc(bas, int3c)
    nao = ao_loc[mol.nbas]
    naoaux = ao_loc[-1] - nao
    if aosym == 's1':
        nao_pair = nao * nao
        buflen = min(max(int(max_memory * .24e6 / 8 / naoaux / comp), 1),
                     nao_pair)
        shranges = _guess_shell_ranges(mol, buflen, 's1')
    else:
        nao_pair = nao * (nao + 1) // 2
        buflen = min(max(int(max_memory * .24e6 / 8 / naoaux / comp), 1),
                     nao_pair)
        shranges = _guess_shell_ranges(mol, buflen, 's2ij')
    log.debug('erifile %.8g MB, IO buf size %.8g MB',
              naoaux * nao_pair * 8 / 1e6, comp * buflen * naoaux * 8 / 1e6)
    log.debug1('shranges = %s', shranges)
    # TODO: Libcint-3.14 and newer version support to compute int3c2e without
    # the opt for the 3rd index.
    #if '3c2e' in int3c:
    #    cintopt = gto.moleintor.make_cintopt(atm, mol._bas, env, int3c)
    #else:
    #    cintopt = gto.moleintor.make_cintopt(atm, bas, env, int3c)
    cintopt = gto.moleintor.make_cintopt(atm, bas, env, int3c)
    bufs1 = numpy.empty((comp * max([x[2] for x in shranges]), naoaux))

    feri = _create_h5file(erifile, dataname)

    def store(buf, label):
        if comp == 1:
            feri[label] = buf
        else:
            shape = (len(buf), ) + buf[0].shape
            fdat = feri.create_dataset(label, shape, buf[0].dtype.char)
            for i, b in enumerate(buf):
                fdat[i] = b

    def transform(b):
        if b.ndim == 3 and b.flags.f_contiguous:
            b = lib.transpose(b.T, axes=(0, 2, 1)).reshape(naoaux, -1)
        else:
            b = b.reshape((-1, naoaux)).T
        if tag == 'cd':
            if b.flags.c_contiguous:
                b = lib.transpose(b).T
            return scipy.linalg.solve_triangular(low,
                                                 b,
                                                 lower=True,
                                                 overwrite_b=True,
                                                 check_finite=False)
        else:
            return lib.dot(low.T, b)

    with lib.call_in_background(store) as bstore:
        for istep, sh_range in enumerate(shranges):
            log.debug('int3c2e [%d/%d], AO [%d:%d], nrow = %d', \
                      istep+1, len(shranges), *sh_range)
            bstart, bend, nrow = sh_range
            shls_slice = (bstart, bend, 0, mol.nbas, mol.nbas,
                          mol.nbas + auxmol.nbas)
            ints = gto.moleintor.getints3c(int3c,
                                           atm,
                                           bas,
                                           env,
                                           shls_slice,
                                           comp,
                                           aosym,
                                           ao_loc,
                                           cintopt,
                                           out=bufs1)
            if comp == 1:
                buf = transform(ints)
            else:
                buf = [transform(x) for x in ints]
            bstore(buf, '%s/%d' % (dataname, istep))
            buf = ints = None
            time1 = log.timer(
                'gen CD eri [%d/%d]' % (istep + 1, len(shranges)), *time1)
    bufs1 = None

    feri.close()
    return erifile
Beispiel #34
0
def _make_eris(mp, mo_coeff=None, verbose=None):
    log = logger.new_logger(mp, verbose)
    time0 = (time.clock(), time.time())

    log.debug('transform (ia|jb) outcore')
    mol = mp.mol
    nocc = mp.nocc
    nmo = mp.nmo
    nvir = nmo - nocc

    eris = mp2._ChemistsERIs(mp, mo_coeff)
    nao = eris.mo_coeff.shape[0]
    assert(nvir <= nao)
    orbo = eris.mo_coeff[:,:nocc]
    orbv = numpy.asarray(eris.mo_coeff[:,nocc:], order='F')
    eris.feri = lib.H5TmpFile()

    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    fint = gto.moleintor.getints4c

    ntasks = mpi.pool.size
    olocs = [_task_location(nocc, task_id) for task_id in range(ntasks)]
    oloc0, oloc1 = olocs[rank]
    nocc_seg = oloc1 - oloc0
    log.debug2('olocs %s', olocs)

    ao_loc = mol.ao_loc_nr()
    task_sh_locs = lib.misc._balanced_partition(ao_loc, ntasks)
    log.debug2('task_sh_locs %s', task_sh_locs)
    ao_sh0 = task_sh_locs[rank]
    ao_sh1 = task_sh_locs[rank+1]
    ao_loc0 = ao_loc[ao_sh0]
    ao_loc1 = ao_loc[ao_sh1]
    nao_seg = ao_loc1 - ao_loc0
    orbo_seg = orbo[ao_loc0:ao_loc1]

    mem_now = lib.current_memory()[0]
    max_memory = max(0, mp.max_memory - mem_now)
    dmax = numpy.sqrt(max_memory*.9e6/8/((nao+nocc)*(nao_seg+nocc)))
    dmax = min(nao//4+2, max(BLKMIN, min(comm.allgather(dmax))))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    sh_ranges = comm.bcast(sh_ranges)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao,dmax,dmax,nao_seg))
    ftmp = lib.H5TmpFile()
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax, nocc*nocc_seg*(nao*(nao+dmax)/2+nvir**2)*8/1e6)

    def save(count, tmp_xo):
        di, dj = tmp_xo.shape[2:4]
        tmp_xo = [tmp_xo[p0:p1] for p0, p1 in olocs]
        tmp_xo = mpi.alltoall(tmp_xo, split_recvbuf=True)
        tmp_xo = sum(tmp_xo).reshape(nocc_seg,nocc,di,dj)
        ftmp[str(count)+'b'] = tmp_xo

        tmp_ox = mpi.alltoall([tmp_xo[:,p0:p1] for p0, p1 in olocs],
                              split_recvbuf=True)
        tmp_ox = [tmp_ox[i].reshape(p1-p0,nocc_seg,di,dj)
                  for i, (p0,p1) in enumerate(olocs)]
        ftmp[str(count)+'a'] = numpy.vstack(tmp_ox)

    jk_blk_slices = []
    count = 0
    time1 = time0
    with lib.call_in_background(save) as bg_save:
        for ip, (ish0, ish1, ni) in enumerate(sh_ranges):
            for jsh0, jsh1, nj in sh_ranges[:ip+1]:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]
                jk_blk_slices.append((i0,i1,j0,j1))

                shls_slice = (0,mol.nbas,ish0,ish1, jsh0,jsh1,ao_sh0,ao_sh1)
                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=shls_slice, aosym='s1', ao_loc=ao_loc,
                           cintopt=ao2mopt._cintopt, out=eribuf)
                tmp_xo = lib.einsum('pi,pqrs->iqrs', orbo, eri)
                tmp_xo = lib.einsum('iqrs,sl->ilqr', tmp_xo, orbo_seg)
                bg_save(count, tmp_xo)
                tmp_xo = None
                count += 1
                time1 = log.timer_debug1('partial ao2mo [%d:%d,%d:%d]' %
                                         (ish0,ish1,jsh0,jsh1), *time1)
    eri = eribuf = None
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)

    eris.ovov = eris.feri.create_dataset('ovov', (nocc,nvir,nocc_seg,nvir), 'f8')
    occblk = int(min(nocc, max(BLKMIN, max_memory*.9e6/8/(nao**2*nocc_seg+1)/5)))
    def load(i0, eri):
        if i0 < nocc:
            i1 = min(i0+occblk, nocc)
            for k, (p0,p1,q0,q1) in enumerate(jk_blk_slices):
                eri[:i1-i0,:,p0:p1,q0:q1] = ftmp[str(k)+'a'][i0:i1]
                if p0 != q0:
                    dat = numpy.asarray(ftmp[str(k)+'b'][:,i0:i1])
                    eri[:i1-i0,:,q0:q1,p0:p1] = dat.transpose(1,0,3,2)

    def save(i0, i1, dat):
        eris.ovov[i0:i1] = dat

    buf_prefecth = numpy.empty((occblk,nocc_seg,nao,nao))
    buf = numpy.empty_like(buf_prefecth)
    bufw = numpy.empty((occblk*nocc_seg,nvir**2))
    bufw1 = numpy.empty_like(bufw)
    with lib.call_in_background(load) as prefetch:
        with lib.call_in_background(save) as bsave:
            load(0, buf_prefecth)
            for i0, i1 in lib.prange(0, nocc, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(i1, buf_prefecth)
                eri = buf[:i1-i0].reshape((i1-i0)*nocc_seg,nao,nao)

                dat = _ao2mo.nr_e2(eri, orbv, (0,nvir,0,nvir), 's1', 's1', out=bufw)
                bsave(i0, i1, dat.reshape(i1-i0,nocc_seg,nvir,nvir).transpose(0,2,1,3))
                bufw, bufw1 = bufw1, bufw
                time1 = log.timer_debug1('pass2 ao2mo [%d:%d]' % (i0,i1), *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)
    mp._eris = eris
    return eris
Beispiel #35
0
def cholesky_eri(mol,
                 erifile,
                 auxbasis='weigend+etb',
                 dataname='j3c',
                 tmpdir=None,
                 int3c='int3c2e',
                 aosym='s2ij',
                 int2c='int2c2e',
                 comp=1,
                 max_memory=MAX_MEMORY,
                 auxmol=None,
                 verbose=logger.NOTE):
    '''3-index density-fitting tensor.
    '''
    assert (aosym in ('s1', 's2ij'))
    assert (comp == 1)
    log = logger.new_logger(mol, verbose)
    time0 = (time.clock(), time.time())

    if auxmol is None:
        auxmol = make_auxmol(mol, auxbasis)

    if tmpdir is None:
        tmpdir = lib.param.TMPDIR
    swapfile = tempfile.NamedTemporaryFile(dir=tmpdir)
    cholesky_eri_b(mol,
                   swapfile.name,
                   auxbasis,
                   dataname,
                   int3c,
                   aosym,
                   int2c,
                   comp,
                   max_memory,
                   auxmol,
                   verbose=log)
    fswap = h5py.File(swapfile.name, 'r')
    time1 = log.timer('generate (ij|L) 1 pass', *time0)

    # Cannot let naoaux = auxmol.nao_nr() if auxbasis has linear dependence
    nao = mol.nao_nr()
    if aosym == 's1':
        nao_pair = nao * nao
    else:
        nao_pair = nao * (nao + 1) // 2

    feri = _create_h5file(erifile, dataname)
    if comp == 1:
        naoaux = fswap['%s/0' % dataname].shape[0]
        h5d_eri = feri.create_dataset(dataname, (naoaux, nao_pair), 'f8')
    else:
        naoaux = fswap['%s/0' % dataname].shape[1]
        h5d_eri = feri.create_dataset(dataname, (comp, naoaux, nao_pair), 'f8')

    def save(row0, row1, buf):
        if comp == 1:
            h5d_eri[row0:row1] = buf
        else:
            h5d_eri[:, row0:row1] = buf

    iolen = min(max(int(max_memory * .45e6 / 8 / nao_pair), 28), naoaux)
    totstep = (naoaux + iolen - 1) // iolen
    bufs1 = numpy.empty((iolen, nao_pair))
    bufs2 = numpy.empty_like(bufs1)
    ti0 = time1
    with lib.call_in_background(save) as bsave:
        for istep, (row0, row1) in enumerate(lib.prange(0, naoaux, iolen)):
            nrow = row1 - row0
            buf = _load_from_h5g(fswap[dataname], row0, row1, bufs1)
            bufs1, bufs2 = bufs2, bufs1
            bsave(row0, row1, buf)
            ti0 = log.timer(
                'step 2 [%d/%d], [%d:%d], row = %d' %
                (istep + 1, totstep, row0, row1, nrow), *ti0)

    fswap.close()
    feri.close()
    log.timer('cholesky_eri', *time0)
    return erifile
Beispiel #36
0
    def make_kpt(uniq_kptji_id, cholesky_j2c):
        kpt = uniq_kpts[uniq_kptji_id]  # kpt = kptj - kpti
        log.debug1('kpt = %s', kpt)
        adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0]
        adapted_kptjs = kptjs[adapted_ji_idx]
        nkptj = len(adapted_kptjs)
        log.debug1('adapted_ji_idx = %s', adapted_ji_idx)

        j2c, j2c_negative, j2ctag = cholesky_j2c

        shls_slice = (auxcell.nbas, fused_cell.nbas)
        Gaux = ft_ao.ft_ao(fused_cell, Gv, shls_slice, b, gxyz, Gvbase, kpt)
        wcoulG = mydf.weighted_coulG(kpt, False, mesh)
        Gaux *= wcoulG.reshape(-1,1)
        kLR = Gaux.real.copy('C')
        kLI = Gaux.imag.copy('C')
        Gaux = None

        if is_zero(kpt):  # kpti == kptj
            aosym = 's2'
            nao_pair = nao*(nao+1)//2

            if cell.dimension == 3:
                vbar = fuse(mydf.auxbar(fused_cell))
                ovlp = cell.pbc_intor('int1e_ovlp', hermi=1, kpts=adapted_kptjs)
                ovlp = [lib.pack_tril(s) for s in ovlp]
        else:
            aosym = 's1'
            nao_pair = nao**2

        mem_now = lib.current_memory()[0]
        log.debug2('memory = %s', mem_now)
        max_memory = max(2000, mydf.max_memory-mem_now)
        # nkptj for 3c-coulomb arrays plus 1 Lpq array
        buflen = min(max(int(max_memory*.38e6/16/naux/(nkptj+1)), 1), nao_pair)
        shranges = _guess_shell_ranges(cell, buflen, aosym)
        buflen = max([x[2] for x in shranges])
        # +1 for a pqkbuf
        if aosym == 's2':
            Gblksize = max(16, int(max_memory*.1e6/16/buflen/(nkptj+1)))
        else:
            Gblksize = max(16, int(max_memory*.2e6/16/buflen/(nkptj+1)))
        Gblksize = min(Gblksize, ngrids, 16384)
        pqkRbuf = numpy.empty(buflen*Gblksize)
        pqkIbuf = numpy.empty(buflen*Gblksize)
        # buf for ft_aopair
        buf = numpy.empty(nkptj*buflen*Gblksize, dtype=numpy.complex128)
        def pw_contract(istep, sh_range, j3cR, j3cI):
            bstart, bend, ncol = sh_range
            if aosym == 's2':
                shls_slice = (bstart, bend, 0, bend)
            else:
                shls_slice = (bstart, bend, 0, cell.nbas)

            for p0, p1 in lib.prange(0, ngrids, Gblksize):
                dat = ft_ao._ft_aopair_kpts(cell, Gv[p0:p1], shls_slice, aosym,
                                            b, gxyz[p0:p1], Gvbase, kpt,
                                            adapted_kptjs, out=buf)
                nG = p1 - p0
                for k, ji in enumerate(adapted_ji_idx):
                    aoao = dat[k].reshape(nG,ncol)
                    pqkR = numpy.ndarray((ncol,nG), buffer=pqkRbuf)
                    pqkI = numpy.ndarray((ncol,nG), buffer=pqkIbuf)
                    pqkR[:] = aoao.real.T
                    pqkI[:] = aoao.imag.T

                    lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k][naux:], 1)
                    lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k][naux:], 1)
                    if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])):
                        lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k][naux:], 1)
                        lib.dot(kLI[p0:p1].T, pqkR.T,  1, j3cI[k][naux:], 1)

            for k, ji in enumerate(adapted_ji_idx):
                if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                    v = fuse(j3cR[k])
                else:
                    v = fuse(j3cR[k] + j3cI[k] * 1j)
                if j2ctag == 'CD':
                    v = scipy.linalg.solve_triangular(j2c, v, lower=True, overwrite_b=True)
                    feri['j3c/%d/%d'%(ji,istep)] = v
                else:
                    feri['j3c/%d/%d'%(ji,istep)] = lib.dot(j2c, v)

                # low-dimension systems
                if j2c_negative is not None:
                    feri['j3c-/%d/%d'%(ji,istep)] = lib.dot(j2c_negative, v)

        with lib.call_in_background(pw_contract) as compute:
            col1 = 0
            for istep, sh_range in enumerate(shranges):
                log.debug1('int3c2e [%d/%d], AO [%d:%d], ncol = %d', \
                           istep+1, len(shranges), *sh_range)
                bstart, bend, ncol = sh_range
                col0, col1 = col1, col1+ncol
                j3cR = []
                j3cI = []
                for k, idx in enumerate(adapted_ji_idx):
                    v = numpy.vstack([fswap['j3c-junk/%d/%d'%(idx,i)][0,col0:col1].T
                                      for i in range(nsegs)])
                    # vbar is the interaction between the background charge
                    # and the auxiliary basis.  0D, 1D, 2D do not have vbar.
                    if is_zero(kpt) and cell.dimension == 3:
                        for i in numpy.where(vbar != 0)[0]:
                            v[i] -= vbar[i] * ovlp[k][col0:col1]
                    j3cR.append(numpy.asarray(v.real, order='C'))
                    if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                        j3cI.append(None)
                    else:
                        j3cI.append(numpy.asarray(v.imag, order='C'))
                v = None
                compute(istep, sh_range, j3cR, j3cI)
        for ji in adapted_ji_idx:
            del(fswap['j3c-junk/%d'%ji])
Beispiel #37
0
def update_amps(mycc, t1, t2, eris):
    time1 = time0 = time.clock(), time.time()
    log = logger.Logger(mycc.stdout, mycc.verbose)
    cpu1 = time0

    t1T = t1.T
    t2T = numpy.asarray(t2.transpose(2, 3, 0, 1), order='C')
    nvir_seg, nvir, nocc = t2T.shape[:3]
    t1 = t2 = None
    ntasks = mpi.pool.size
    vlocs = [_task_location(nvir, task_id) for task_id in range(ntasks)]
    vloc0, vloc1 = vlocs[rank]
    log.debug2('vlocs %s', vlocs)
    assert (vloc1 - vloc0 == nvir_seg)

    fock = eris.fock
    mo_e_o = eris.mo_energy[:nocc]
    mo_e_v = eris.mo_energy[nocc:] + mycc.level_shift

    def _rotate_vir_block(buf):
        for task_id, buf in _rotate_tensor_block(buf):
            loc0, loc1 = vlocs[task_id]
            yield task_id, buf, loc0, loc1

    fswap = lib.H5TmpFile()
    wVooV = numpy.zeros((nvir_seg, nocc, nocc, nvir))
    eris_voov = _cp(eris.ovvo).transpose(1, 0, 3, 2)
    tau = t2T * .5
    tau += numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
    for task_id, tau, p0, p1 in _rotate_vir_block(tau):
        wVooV += lib.einsum('bkic,cajk->bija', eris_voov[:, :, :, p0:p1], tau)
    fswap['wVooV1'] = wVooV
    wVooV = tau = None
    time1 = log.timer_debug1('wVooV', *time1)

    wVOov = eris_voov
    eris_VOov = eris_voov - eris_voov.transpose(0, 2, 1, 3) * .5
    tau = t2T.transpose(2, 0, 3, 1) - t2T.transpose(3, 0, 2, 1) * .5
    tau -= numpy.einsum('ai,bj->jaib', t1T[vloc0:vloc1], t1T)
    for task_id, tau, p0, p1 in _rotate_vir_block(tau):
        wVOov += lib.einsum('dlkc,kcjb->dljb', eris_VOov[:, :, :, p0:p1], tau)
    fswap['wVOov1'] = wVOov
    wVOov = tau = eris_VOov = eris_voov = None
    time1 = log.timer_debug1('wVOov', *time1)

    t1Tnew = numpy.zeros_like(t1T)
    t2Tnew = mycc._add_vvvv(t1T, t2T, eris, t2sym='jiba')
    time1 = log.timer_debug1('vvvv', *time1)

    #** make_inter_F
    fov = fock[:nocc, nocc:].copy()
    t1Tnew += fock[nocc:, :nocc]

    foo = fock[:nocc, :nocc] - numpy.diag(mo_e_o)
    foo += .5 * numpy.einsum('ia,aj->ij', fock[:nocc, nocc:], t1T)

    fvv = fock[nocc:, nocc:] - numpy.diag(mo_e_v)
    fvv -= .5 * numpy.einsum('ai,ib->ab', t1T, fock[:nocc, nocc:])

    foo_priv = numpy.zeros_like(foo)
    fov_priv = numpy.zeros_like(fov)
    fvv_priv = numpy.zeros_like(fvv)
    t1T_priv = numpy.zeros_like(t1T)

    max_memory = mycc.max_memory - lib.current_memory()[0]
    unit = nocc * nvir**2 * 3 + nocc**2 * nvir + 1
    blksize = min(nvir,
                  max(BLKMIN, int((max_memory * .9e6 / 8 - t2T.size) / unit)))
    log.debug1('pass 1, max_memory %d MB,  nocc,nvir = %d,%d  blksize = %d',
               max_memory, nocc, nvir, blksize)

    buf = numpy.empty((blksize, nvir, nvir, nocc))

    def load_vvvo(p0):
        p1 = min(nvir_seg, p0 + blksize)
        if p0 < p1:
            buf[:p1 - p0] = eris.vvvo[p0:p1]

    fswap.create_dataset('wVooV', (nvir_seg, nocc, nocc, nvir), 'f8')
    wVOov = []

    with lib.call_in_background(load_vvvo) as prefetch:
        load_vvvo(0)
        for p0, p1 in lib.prange(vloc0, vloc1, blksize):
            i0, i1 = p0 - vloc0, p1 - vloc0
            eris_vvvo, buf = buf[:p1 - p0], numpy.empty_like(buf)
            prefetch(i1)

            fvv_priv[p0:p1] += 2 * numpy.einsum('ck,abck->ab', t1T, eris_vvvo)
            fvv_priv -= numpy.einsum('ck,cabk->ab', t1T[p0:p1], eris_vvvo)

            if not mycc.direct:
                raise NotImplementedError
                tau = t2T[i0:i1] + numpy.einsum('ai,bj->abij', t1T[p0:p1], t1T)
                for task_id, tau, q0, q1 in _rotate_vir_block(tau):
                    tmp = lib.einsum('bdck,cdij->bkij', eris_vvvo[:, :, q0:q1],
                                     tau)
                    t2Tnew -= lib.einsum('ak,bkij->baji', t1T, tmp)
                tau = tmp = None

            fswap['wVooV'][i0:i1] = lib.einsum('cj,baci->bija', -t1T,
                                               eris_vvvo)

            theta = t2T[i0:i1].transpose(0, 2, 1, 3) * 2
            theta -= t2T[i0:i1].transpose(0, 3, 1, 2)
            t1T_priv += lib.einsum('bicj,bacj->ai', theta, eris_vvvo)
            wVOov.append(lib.einsum('acbi,cj->abij', eris_vvvo, t1T))
            theta = eris_vvvo = None
            time1 = log.timer_debug1('vvvo [%d:%d]' % (p0, p1), *time1)

    wVOov = numpy.vstack(wVOov)
    wVOov = mpi.alltoall([wVOov[:, q0:q1] for q0, q1 in vlocs],
                         split_recvbuf=True)
    wVOov = numpy.vstack([x.reshape(-1, nvir_seg, nocc, nocc) for x in wVOov])
    fswap['wVOov'] = wVOov.transpose(1, 2, 3, 0)
    wVooV = None

    unit = nocc**2 * nvir * 7 + nocc**3 + nocc * nvir**2
    max_memory = max(0, mycc.max_memory - lib.current_memory()[0])
    blksize = min(nvir,
                  max(BLKMIN, int((max_memory * .9e6 / 8 - nocc**4) / unit)))
    log.debug1('pass 2, max_memory %d MB,  nocc,nvir = %d,%d  blksize = %d',
               max_memory, nocc, nvir, blksize)

    woooo = numpy.zeros((nocc, nocc, nocc, nocc))

    for p0, p1 in lib.prange(vloc0, vloc1, blksize):
        i0, i1 = p0 - vloc0, p1 - vloc0
        wVOov = fswap['wVOov'][i0:i1]
        wVooV = fswap['wVooV'][i0:i1]
        eris_ovoo = eris.ovoo[:, i0:i1]
        eris_oovv = numpy.empty((nocc, nocc, i1 - i0, nvir))

        def load_oovv(p0, p1):
            eris_oovv[:] = eris.oovv[:, :, p0:p1]

        with lib.call_in_background(load_oovv) as prefetch_oovv:
            #:eris_oovv = eris.oovv[:,:,i0:i1]
            prefetch_oovv(i0, i1)
            foo_priv += numpy.einsum('ck,kcji->ij', 2 * t1T[p0:p1], eris_ovoo)
            foo_priv += numpy.einsum('ck,icjk->ij', -t1T[p0:p1], eris_ovoo)
            tmp = lib.einsum('al,jaik->lkji', t1T[p0:p1], eris_ovoo)
            woooo += tmp + tmp.transpose(1, 0, 3, 2)
            tmp = None

            wVOov -= lib.einsum('jbik,ak->bjia', eris_ovoo, t1T)
            t2Tnew[i0:i1] += wVOov.transpose(0, 3, 1, 2)

            wVooV += lib.einsum('kbij,ak->bija', eris_ovoo, t1T)
            eris_ovoo = None
        load_oovv = prefetch_oovv = None

        eris_ovvo = numpy.empty((nocc, i1 - i0, nvir, nocc))

        def load_ovvo(p0, p1):
            eris_ovvo[:] = eris.ovvo[:, p0:p1]

        with lib.call_in_background(load_ovvo) as prefetch_ovvo:
            #:eris_ovvo = eris.ovvo[:,i0:i1]
            prefetch_ovvo(i0, i1)
            t1T_priv[p0:p1] -= numpy.einsum('bj,jiab->ai', t1T, eris_oovv)
            wVooV -= eris_oovv.transpose(2, 0, 1, 3)
            wVOov += wVooV * .5  #: bjia + bija*.5
        eris_voov = eris_ovvo.transpose(1, 0, 3, 2)
        eris_ovvo = None
        load_ovvo = prefetch_ovvo = None

        def update_wVooV(i0, i1):
            wVooV[:] += fswap['wVooV1'][i0:i1]
            fswap['wVooV1'][i0:i1] = wVooV
            wVOov[:] += fswap['wVOov1'][i0:i1]
            fswap['wVOov1'][i0:i1] = wVOov

        with lib.call_in_background(update_wVooV) as update_wVooV:
            update_wVooV(i0, i1)
            t2Tnew[i0:i1] += eris_voov.transpose(0, 3, 1, 2) * .5
            t1T_priv[p0:p1] += 2 * numpy.einsum('bj,aijb->ai', t1T, eris_voov)

            tmp = lib.einsum('ci,kjbc->bijk', t1T, eris_oovv)
            tmp += lib.einsum('bjkc,ci->bjik', eris_voov, t1T)
            t2Tnew[i0:i1] -= lib.einsum('bjik,ak->baji', tmp, t1T)
            eris_oovv = tmp = None

            fov_priv[:,
                     p0:p1] += numpy.einsum('ck,aikc->ia', t1T, eris_voov) * 2
            fov_priv[:, p0:p1] -= numpy.einsum('ck,akic->ia', t1T, eris_voov)

            tau = numpy.einsum('ai,bj->abij', t1T[p0:p1] * .5, t1T)
            tau += t2T[i0:i1]
            theta = tau.transpose(0, 1, 3, 2) * 2
            theta -= tau
            fvv_priv -= lib.einsum('caij,cjib->ab', theta, eris_voov)
            foo_priv += lib.einsum('aikb,abkj->ij', eris_voov, theta)
            tau = theta = None

            tau = t2T[i0:i1] + numpy.einsum('ai,bj->abij', t1T[p0:p1], t1T)
            woooo += lib.einsum('abij,aklb->ijkl', tau, eris_voov)
            tau = None
        eris_VOov = wVOov = wVooV = update_wVooV = None
        time1 = log.timer_debug1('voov [%d:%d]' % (p0, p1), *time1)

    wVooV = _cp(fswap['wVooV1'])
    for task_id, wVooV, p0, p1 in _rotate_vir_block(wVooV):
        tmp = lib.einsum('ackj,ckib->ajbi', t2T[:, p0:p1], wVooV)
        t2Tnew += tmp.transpose(0, 2, 3, 1)
        t2Tnew += tmp.transpose(0, 2, 1, 3) * .5
    wVooV = tmp = None
    time1 = log.timer_debug1('contracting wVooV', *time1)

    wVOov = _cp(fswap['wVOov1'])
    theta = t2T * 2
    theta -= t2T.transpose(0, 1, 3, 2)
    for task_id, wVOov, p0, p1 in _rotate_vir_block(wVOov):
        t2Tnew += lib.einsum('acik,ckjb->abij', theta[:, p0:p1], wVOov)
    wVOov = theta = None
    fswap = None
    time1 = log.timer_debug1('contracting wVOov', *time1)

    foo += mpi.allreduce(foo_priv)
    fov += mpi.allreduce(fov_priv)
    fvv += mpi.allreduce(fvv_priv)

    theta = t2T.transpose(0, 1, 3, 2) * 2 - t2T
    t1T_priv[vloc0:vloc1] += numpy.einsum('jb,abji->ai', fov, theta)
    ovoo = _cp(eris.ovoo)
    for task_id, ovoo, p0, p1 in _rotate_vir_block(ovoo):
        t1T_priv[vloc0:vloc1] -= lib.einsum('jbki,abjk->ai', ovoo,
                                            theta[:, p0:p1])
    theta = ovoo = None

    woooo = mpi.allreduce(woooo)
    woooo += _cp(eris.oooo).transpose(0, 2, 1, 3)
    tau = t2T + numpy.einsum('ai,bj->abij', t1T[vloc0:vloc1], t1T)
    t2Tnew += .5 * lib.einsum('abkl,ijkl->abij', tau, woooo)
    tau = woooo = None

    t1Tnew += mpi.allreduce(t1T_priv)

    ft_ij = foo + numpy.einsum('aj,ia->ij', .5 * t1T, fov)
    ft_ab = fvv - numpy.einsum('ai,ib->ab', .5 * t1T, fov)
    t2Tnew += lib.einsum('acij,bc->abij', t2T, ft_ab)
    t2Tnew -= lib.einsum('ki,abkj->abij', ft_ij, t2T)

    eia = mo_e_o[:, None] - mo_e_v
    t1Tnew += numpy.einsum('bi,ab->ai', t1T, fvv)
    t1Tnew -= numpy.einsum('aj,ji->ai', t1T, foo)
    t1Tnew /= eia.T

    t2tmp = mpi.alltoall([t2Tnew[:, p0:p1] for p0, p1 in vlocs],
                         split_recvbuf=True)
    for task_id, (p0, p1) in enumerate(vlocs):
        tmp = t2tmp[task_id].reshape(p1 - p0, nvir_seg, nocc, nocc)
        t2Tnew[:, p0:p1] += tmp.transpose(1, 0, 3, 2)

    for i in range(vloc0, vloc1):
        t2Tnew[i - vloc0] /= lib.direct_sum('i+jb->bij', eia[:, i], eia)

    time0 = log.timer_debug1('update t1 t2', *time0)
    return t1Tnew.T, t2Tnew.transpose(2, 3, 0, 1)
Beispiel #38
0
def update_amps(mycc, t1, t2, eris):
    assert (isinstance(eris, _ChemistsERIs))

    time0 = logger.process_clock(), logger.perf_counter()
    log = logger.Logger(mycc.stdout, mycc.verbose)
    nocc, nvir = t1.shape
    fock = eris.fock
    mo_e_o = eris.mo_energy[:nocc]
    mo_e_v = eris.mo_energy[nocc:] + mycc.level_shift

    t1new = numpy.zeros_like(t1)
    t2new = _add_vvvv(mycc, 0 * t1, t2, eris, t2sym='jiba')
    t2new *= .5  # *.5 because t2+t2.transpose(1,0,3,2) in the end
    time1 = log.timer_debug1('vvvv', *time0)

    #** make_inter_F
    fov = fock[:nocc, nocc:].copy()
    t1new += fov

    foo = fock[:nocc, :nocc] - numpy.diag(mo_e_o)
    fvv = fock[nocc:, nocc:] - numpy.diag(mo_e_v)

    if mycc.incore_complete:
        fswap = None
    else:
        fswap = lib.H5TmpFile()
    fwVOov, fwVooV = _add_ovvv_(mycc, t1, t2, eris, fvv, t1new, t2new, fswap)
    time1 = log.timer_debug1('ovvv', *time1)

    woooo = numpy.asarray(eris.oooo).transpose(0, 2, 1, 3).copy()

    unit = nocc**2 * nvir * 7 + nocc**3 + nocc * nvir**2
    mem_now = lib.current_memory()[0]
    max_memory = max(0, mycc.max_memory - mem_now)
    blksize = min(nvir,
                  max(BLKMIN, int((max_memory * .9e6 / 8 - nocc**4) / unit)))
    log.debug1('max_memory %d MB,  nocc,nvir = %d,%d  blksize = %d',
               max_memory, nocc, nvir, blksize)

    for p0, p1 in lib.prange(0, nvir, blksize):
        wVOov = fwVOov[p0:p1]
        wVooV = fwVooV[p0:p1]
        eris_ovoo = eris.ovoo[:, p0:p1]
        eris_oovv = numpy.empty((nocc, nocc, p1 - p0, nvir))

        def load_oovv(p0, p1):
            eris_oovv[:] = eris.oovv[:, :, p0:p1]

        with lib.call_in_background(load_oovv,
                                    sync=not mycc.async_io) as prefetch_oovv:
            #:eris_oovv = eris.oovv[:,:,p0:p1]
            prefetch_oovv(p0, p1)
            wVOov -= lib.einsum('jbik,ka->bjia', eris_ovoo, t1)
            t2new[:, :, p0:p1] += wVOov.transpose(1, 2, 0, 3)
            eris_ovoo = None
        load_oovv = prefetch_oovv = None

        wVOov *= 0  # QCI

        eris_ovvo = numpy.empty((nocc, p1 - p0, nvir, nocc))

        def load_ovvo(p0, p1):
            eris_ovvo[:] = eris.ovvo[:, p0:p1]

        with lib.call_in_background(load_ovvo,
                                    sync=not mycc.async_io) as prefetch_ovvo:
            #:eris_ovvo = eris.ovvo[:,p0:p1]
            prefetch_ovvo(p0, p1)
            t1new[:, p0:p1] -= numpy.einsum('jb,jiab->ia', t1, eris_oovv)
            wVooV -= eris_oovv.transpose(2, 0, 1, 3)
            wVOov += wVooV * .5  #: bjia + bija*.5
        load_ovvo = prefetch_ovvo = None

        t2new[:, :, p0:p1] += (eris_ovvo * 0.5).transpose(0, 3, 1, 2)
        eris_voov = eris_ovvo.conj().transpose(1, 0, 3, 2)
        t1new[:, p0:p1] += 2 * numpy.einsum('jb,aijb->ia', t1, eris_voov)
        eris_ovvo = None
        eris_oovv = tmp = None

        fov[:, p0:p1] += numpy.einsum('kc,aikc->ia', t1, eris_voov) * 2
        fov[:, p0:p1] -= numpy.einsum('kc,akic->ia', t1, eris_voov)

        tau = t2[:, :, p0:p1]
        theta = tau.transpose(1, 0, 2, 3) * 2
        theta -= tau
        fvv -= lib.einsum('cjia,cjib->ab', theta.transpose(2, 1, 0, 3),
                          eris_voov)
        foo += lib.einsum('aikb,kjab->ij', eris_voov, theta)
        theta = None
        woooo += lib.einsum('ijab,aklb->ijkl', tau, eris_voov)
        tau = None

        def update_wVooV(q0, q1, tau):
            wVooV[:] += lib.einsum('bkic,jkca->bija', eris_voov[:, :, :,
                                                                q0:q1], tau)

        with lib.call_in_background(update_wVooV,
                                    sync=not mycc.async_io) as update_wVooV:
            for q0, q1 in lib.prange(0, nvir, blksize):
                tau = t2[:, :, q0:q1] * .5
                #:wVooV += lib.einsum('bkic,jkca->bija', eris_voov[:,:,:,q0:q1], tau)
                update_wVooV(q0, q1, tau)
        tau = update_wVooV = None

        def update_t2(q0, q1, tmp):
            t2new[:, :, q0:q1] += tmp.transpose(2, 0, 1, 3)
            tmp *= .5
            t2new[:, :, q0:q1] += tmp.transpose(0, 2, 1, 3)

        with lib.call_in_background(update_t2,
                                    sync=not mycc.async_io) as update_t2:
            for q0, q1 in lib.prange(0, nvir, blksize):
                tmp = lib.einsum('jkca,ckib->jaib', t2[:, :, p0:p1, q0:q1],
                                 wVooV)
                #:t2new[:,:,q0:q1] += tmp.transpose(2,0,1,3)
                #:tmp *= .5
                #:t2new[:,:,q0:q1] += tmp.transpose(0,2,1,3)
                update_t2(q0, q1, tmp)
                tmp = None

        wVOov += eris_voov
        eris_VOov = -.5 * eris_voov.transpose(0, 2, 1, 3)
        eris_VOov += eris_voov
        eris_voov = None

        def update_wVOov(q0, q1, tau):
            wVOov[:, :, :,
                  q0:q1] += .5 * lib.einsum('aikc,kcjb->aijb', eris_VOov, tau)

        with lib.call_in_background(update_wVOov,
                                    sync=not mycc.async_io) as update_wVOov:
            for q0, q1 in lib.prange(0, nvir, blksize):
                tau = t2[:, :, q0:q1].transpose(1, 3, 0, 2) * 2
                tau -= t2[:, :, q0:q1].transpose(0, 3, 1, 2)
                #:wVOov[:,:,:,q0:q1] += .5 * lib.einsum('aikc,kcjb->aijb', eris_VOov, tau)
                update_wVOov(q0, q1, tau)
                tau = None

        def update_t2(q0, q1, theta):
            t2new[:, :, q0:q1] += lib.einsum('kica,ckjb->ijab', theta, wVOov)

        with lib.call_in_background(update_t2,
                                    sync=not mycc.async_io) as update_t2:
            for q0, q1 in lib.prange(0, nvir, blksize):
                theta = t2[:, :, p0:p1, q0:q1] * 2
                theta -= t2[:, :, p0:p1, q0:q1].transpose(1, 0, 2, 3)
                #:t2new[:,:,q0:q1] += lib.einsum('kica,ckjb->ijab', theta, wVOov)
                update_t2(q0, q1, theta)
                theta = None
        eris_VOov = wVOov = wVooV = update_wVOov = None
        time1 = log.timer_debug1('voov [%d:%d]' % (p0, p1), *time1)
    fwVOov = fwVooV = fswap = None

    for p0, p1 in lib.prange(0, nvir, blksize):
        theta = t2[:, :, p0:p1].transpose(1, 0, 2, 3) * 2 - t2[:, :, p0:p1]
        t1new += numpy.einsum('jb,ijba->ia', fov[:, p0:p1], theta)
        t1new -= lib.einsum('jbki,kjba->ia', eris.ovoo[:, p0:p1], theta)

        tau = t2[:, :, p0:p1]
        t2new[:, :, p0:p1] += .5 * lib.einsum('ijkl,klab->ijab', woooo, tau)
        theta = tau = None

    t2new += lib.einsum('ijac,bc->ijab', t2, fvv)
    t2new -= lib.einsum('ki,kjab->ijab', foo, t2)

    eia = mo_e_o[:, None] - mo_e_v
    t1new += numpy.einsum('ib,ab->ia', t1, fvv)
    t1new -= numpy.einsum('ja,ji->ia', t1, foo)
    t1new /= eia

    #: t2new = t2new + t2new.transpose(1,0,3,2)
    for i in range(nocc):
        if i > 0:
            t2new[i, :i] += t2new[:i, i].transpose(0, 2, 1)
            t2new[i, :i] /= lib.direct_sum('a,jb->jab', eia[i], eia[:i])
            t2new[:i, i] = t2new[i, :i].transpose(0, 2, 1)
        t2new[i, i] = t2new[i, i] + t2new[i, i].T
        t2new[i, i] /= lib.direct_sum('a,b->ab', eia[i], eia[i])

    time0 = log.timer_debug1('update t1 t2', *time0)
    return t1new, t2new
Beispiel #39
0
    def make_kpt(uniq_kptji_id, cholesky_j2c):
        kpt = uniq_kpts[uniq_kptji_id]  # kpt = kptj - kpti
        log.debug1('kpt = %s', kpt)
        adapted_ji_idx = numpy.where(uniq_inverse == uniq_kptji_id)[0]
        adapted_kptjs = kptjs[adapted_ji_idx]
        nkptj = len(adapted_kptjs)
        log.debug1('adapted_ji_idx = %s', adapted_ji_idx)

        j2c, j2c_negative, j2ctag = cholesky_j2c

        shls_slice = (auxcell.nbas, fused_cell.nbas)
        Gaux = ft_ao.ft_ao(fused_cell, Gv, shls_slice, b, gxyz, Gvbase, kpt)
        wcoulG = mydf.weighted_coulG(kpt, False, mesh)
        Gaux *= wcoulG.reshape(-1, 1)
        kLR = Gaux.real.copy('C')
        kLI = Gaux.imag.copy('C')
        Gaux = None

        if is_zero(kpt):  # kpti == kptj
            aosym = 's2'
            nao_pair = nao * (nao + 1) // 2

            if cell.dimension == 3:
                vbar = fuse(mydf.auxbar(fused_cell))
                ovlp = cell.pbc_intor('int1e_ovlp',
                                      hermi=1,
                                      kpts=adapted_kptjs)
                ovlp = [lib.pack_tril(s) for s in ovlp]
        else:
            aosym = 's1'
            nao_pair = nao**2

        mem_now = lib.current_memory()[0]
        log.debug2('memory = %s', mem_now)
        max_memory = max(2000, mydf.max_memory - mem_now)
        # nkptj for 3c-coulomb arrays plus 1 Lpq array
        buflen = min(max(int(max_memory * .38e6 / 16 / naux / (nkptj + 1)), 1),
                     nao_pair)
        shranges = _guess_shell_ranges(cell, buflen, aosym)
        buflen = max([x[2] for x in shranges])
        # +1 for a pqkbuf
        if aosym == 's2':
            Gblksize = max(16,
                           int(max_memory * .1e6 / 16 / buflen / (nkptj + 1)))
        else:
            Gblksize = max(16,
                           int(max_memory * .2e6 / 16 / buflen / (nkptj + 1)))
        Gblksize = min(Gblksize, ngrids, 16384)
        pqkRbuf = numpy.empty(buflen * Gblksize)
        pqkIbuf = numpy.empty(buflen * Gblksize)
        # buf for ft_aopair
        buf = numpy.empty(nkptj * buflen * Gblksize, dtype=numpy.complex128)

        def pw_contract(istep, sh_range, j3cR, j3cI):
            bstart, bend, ncol = sh_range
            if aosym == 's2':
                shls_slice = (bstart, bend, 0, bend)
            else:
                shls_slice = (bstart, bend, 0, cell.nbas)

            for p0, p1 in lib.prange(0, ngrids, Gblksize):
                dat = ft_ao._ft_aopair_kpts(cell,
                                            Gv[p0:p1],
                                            shls_slice,
                                            aosym,
                                            b,
                                            gxyz[p0:p1],
                                            Gvbase,
                                            kpt,
                                            adapted_kptjs,
                                            out=buf)
                nG = p1 - p0
                for k, ji in enumerate(adapted_ji_idx):
                    aoao = dat[k].reshape(nG, ncol)
                    pqkR = numpy.ndarray((ncol, nG), buffer=pqkRbuf)
                    pqkI = numpy.ndarray((ncol, nG), buffer=pqkIbuf)
                    pqkR[:] = aoao.real.T
                    pqkI[:] = aoao.imag.T

                    lib.dot(kLR[p0:p1].T, pqkR.T, -1, j3cR[k][naux:], 1)
                    lib.dot(kLI[p0:p1].T, pqkI.T, -1, j3cR[k][naux:], 1)
                    if not (is_zero(kpt) and gamma_point(adapted_kptjs[k])):
                        lib.dot(kLR[p0:p1].T, pqkI.T, -1, j3cI[k][naux:], 1)
                        lib.dot(kLI[p0:p1].T, pqkR.T, 1, j3cI[k][naux:], 1)

            for k, ji in enumerate(adapted_ji_idx):
                if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                    v = fuse(j3cR[k])
                else:
                    v = fuse(j3cR[k] + j3cI[k] * 1j)
                if j2ctag == 'CD':
                    v = scipy.linalg.solve_triangular(j2c,
                                                      v,
                                                      lower=True,
                                                      overwrite_b=True)
                    feri['j3c/%d/%d' % (ji, istep)] = v
                else:
                    feri['j3c/%d/%d' % (ji, istep)] = lib.dot(j2c, v)

                # low-dimension systems
                if j2c_negative is not None:
                    feri['j3c-/%d/%d' % (ji, istep)] = lib.dot(j2c_negative, v)

        with lib.call_in_background(pw_contract) as compute:
            col1 = 0
            for istep, sh_range in enumerate(shranges):
                log.debug1('int3c2e [%d/%d], AO [%d:%d], ncol = %d', \
                           istep+1, len(shranges), *sh_range)
                bstart, bend, ncol = sh_range
                col0, col1 = col1, col1 + ncol
                j3cR = []
                j3cI = []
                for k, idx in enumerate(adapted_ji_idx):
                    v = numpy.vstack([
                        fswap['j3c-junk/%d/%d' % (idx, i)][0, col0:col1].T
                        for i in range(nsegs)
                    ])
                    # vbar is the interaction between the background charge
                    # and the auxiliary basis.  0D, 1D, 2D do not have vbar.
                    if is_zero(kpt) and cell.dimension == 3:
                        for i in numpy.where(vbar != 0)[0]:
                            v[i] -= vbar[i] * ovlp[k][col0:col1]
                    j3cR.append(numpy.asarray(v.real, order='C'))
                    if is_zero(kpt) and gamma_point(adapted_kptjs[k]):
                        j3cI.append(None)
                    else:
                        j3cI.append(numpy.asarray(v.imag, order='C'))
                v = None
                compute(istep, sh_range, j3cR, j3cI)
        for ji in adapted_ji_idx:
            del (fswap['j3c-junk/%d' % ji])
Beispiel #40
0
def _make_eris_outcore(mycc, mo_coeff=None):
    cput0 = (time.clock(), time.time())
    log = logger.Logger(mycc.stdout, mycc.verbose)
    _sync_(mycc)
    eris = ccsd._ChemistsERIs()
    if rank == 0:
        eris._common_init_(mycc, mo_coeff)
        comm.bcast((eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy))
    else:
        eris.mol = mycc.mol
        eris.mo_coeff, eris.fock, eris.nocc, eris.mo_energy = comm.bcast(None)

    mol = mycc.mol
    mo_coeff = numpy.asarray(eris.mo_coeff, order='F')
    nocc = eris.nocc
    nao, nmo = mo_coeff.shape
    nvir = nmo - nocc
    orbo = mo_coeff[:, :nocc]
    orbv = mo_coeff[:, nocc:]
    nvpair = nvir * (nvir + 1) // 2
    vlocs = [_task_location(nvir, task_id) for task_id in range(mpi.pool.size)]
    vloc0, vloc1 = vlocs[rank]
    vseg = vloc1 - vloc0

    eris.feri1 = lib.H5TmpFile()
    eris.oooo = eris.feri1.create_dataset('oooo', (nocc, nocc, nocc, nocc),
                                          'f8')
    eris.oovv = eris.feri1.create_dataset('oovv', (nocc, nocc, vseg, nvir),
                                          'f8',
                                          chunks=(nocc, nocc, 1, nvir))
    eris.ovoo = eris.feri1.create_dataset('ovoo', (nocc, vseg, nocc, nocc),
                                          'f8',
                                          chunks=(nocc, 1, nocc, nocc))
    eris.ovvo = eris.feri1.create_dataset('ovvo', (nocc, vseg, nvir, nocc),
                                          'f8',
                                          chunks=(nocc, 1, nvir, nocc))
    eris.ovov = eris.feri1.create_dataset('ovov', (nocc, vseg, nocc, nvir),
                                          'f8',
                                          chunks=(nocc, 1, nocc, nvir))
    #    eris.ovvv = eris.feri1.create_dataset('ovvv', (nocc,vseg,nvpair), 'f8', chunks=(nocc,1,nvpair))
    eris.vvvo = eris.feri1.create_dataset('vvvo', (vseg, nvir, nvir, nocc),
                                          'f8',
                                          chunks=(1, nvir, 1, nocc))
    assert (mycc.direct)

    def save_occ_frac(p0, p1, eri):
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.oooo[p0:p1] = eri[:, :, :nocc, :nocc]
        eris.oovv[p0:p1] = eri[:, :, nocc + vloc0:nocc + vloc1, nocc:]

    def save_vir_frac(p0, p1, eri):
        log.alldebug1('save_vir_frac %d %d %s', p0, p1, eri.shape)
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.ovoo[:, p0:p1] = eri[:, :, :nocc, :nocc].transpose(1, 0, 2, 3)
        eris.ovvo[:, p0:p1] = eri[:, :, nocc:, :nocc].transpose(1, 0, 2, 3)
        eris.ovov[:, p0:p1] = eri[:, :, :nocc, nocc:].transpose(1, 0, 2, 3)
        #        vvv = lib.pack_tril(eri[:,:,nocc:,nocc:].reshape((p1-p0)*nocc,nvir,nvir))
        #        eris.ovvv[:,p0:p1] = vvv.reshape(p1-p0,nocc,nvpair).transpose(1,0,2)

        cput2 = time.clock(), time.time()
        ovvv_segs = [
            eri[:, :, nocc + q0:nocc + q1, nocc:].transpose(2, 3, 0, 1)
            for q0, q1 in vlocs
        ]
        ovvv_segs = mpi.alltoall(ovvv_segs, split_recvbuf=True)
        cput2 = log.timer_debug1('vvvo alltoall', *cput2)
        for task_id, (q0, q1) in enumerate(comm.allgather((p0, p1))):
            ip0 = q0 + vlocs[task_id][0]
            ip1 = q1 + vlocs[task_id][0]
            eris.vvvo[:, :, ip0:ip1] = ovvv_segs[task_id].reshape(
                vseg, nvir, q1 - q0, nocc)

    fswap = lib.H5TmpFile()
    max_memory = max(MEMORYMIN, mycc.max_memory - lib.current_memory()[0])
    int2e = mol._add_suffix('int2e')
    orbov = numpy.hstack((orbo, orbv[:, vloc0:vloc1]))
    ao2mo.outcore.half_e1(mol, (orbov, orbo),
                          fswap,
                          int2e,
                          's4',
                          1,
                          max_memory,
                          verbose=log)

    ao_loc = mol.ao_loc_nr()
    nao_pair = nao * (nao + 1) // 2
    blksize = int(min(8e9, max_memory * .5e6) / 8 / (nao_pair + nmo**2) / nocc)
    blksize = min(nvir, max(BLKMIN, blksize))
    fload = ao2mo.outcore._load_from_h5g

    buf = numpy.empty((blksize * nocc, nao_pair))
    buf_prefetch = numpy.empty_like(buf)

    def prefetch(p0, p1, rowmax):
        p0, p1 = p1, min(rowmax, p1 + blksize)
        if p0 < p1:
            fload(fswap['0'], p0 * nocc, p1 * nocc, buf_prefetch)

    cput1 = time.clock(), time.time()
    outbuf = numpy.empty((blksize * nocc, nmo**2))
    with lib.call_in_background(prefetch) as bprefetch:
        fload(fswap['0'], 0, min(nocc, blksize) * nocc, buf_prefetch)
        for p0, p1 in lib.prange(0, nocc, blksize):
            nrow = (p1 - p0) * nocc
            buf, buf_prefetch = buf_prefetch, buf
            bprefetch(p0, p1, nocc)
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow],
                                     mo_coeff, (0, nmo, 0, nmo),
                                     's4',
                                     's1',
                                     out=outbuf,
                                     ao_loc=ao_loc)
            save_occ_frac(p0, p1, dat)

        blksize = min(comm.allgather(blksize))
        norb_max = nocc + vseg
        fload(fswap['0'], nocc**2,
              min(nocc + blksize, norb_max) * nocc, buf_prefetch)
        for p0, p1 in mpi.prange(vloc0, vloc1, blksize):
            i0, i1 = p0 - vloc0, p1 - vloc0
            nrow = (p1 - p0) * nocc
            buf, buf_prefetch = buf_prefetch, buf
            bprefetch(nocc + i0, nocc + i1, norb_max)
            dat = ao2mo._ao2mo.nr_e2(buf[:nrow],
                                     mo_coeff, (0, nmo, 0, nmo),
                                     's4',
                                     's1',
                                     out=outbuf,
                                     ao_loc=ao_loc)
            save_vir_frac(i0, i1, dat)
    buf = buf_prefecth = outbuf = None

    cput1 = log.timer_debug1('transforming oppp', *cput1)
    log.timer('CCSD integral transformation', *cput0)
    mycc._eris = eris
    return eris
Beispiel #41
0
def _aux_e2(cell,
            auxcell,
            erifile,
            intor='int3c2e',
            aosym='s2ij',
            comp=None,
            kptij_lst=None,
            dataname='eri_mo',
            shls_slice=None,
            max_memory=2000,
            verbose=0):
    r'''3-center AO integrals (ij|L) with double lattice sum:
    \sum_{lm} (i[l]j[m]|L[0]), where L is the auxiliary basis.
    Three-index integral tensor (kptij_idx, nao_pair, naux) or four-index
    integral tensor (kptij_idx, comp, nao_pair, naux) are stored on disk.

    **This function should be only used by df and mdf initialization function
    _make_j3c**

    Args:
        kptij_lst : (*,2,3) array
            A list of (kpti, kptj)
    '''
    intor, comp = gto.moleintor._get_intor_and_comp(cell._add_suffix(intor),
                                                    comp)

    if isinstance(erifile, h5py.Group):
        feri = erifile
    elif h5py.is_hdf5(erifile):
        feri = h5py.File(erifile, 'a')
    else:
        feri = h5py.File(erifile, 'w')
    if dataname in feri:
        del (feri[dataname])
    if dataname + '-kptij' in feri:
        del (feri[dataname + '-kptij'])

    if kptij_lst is None:
        kptij_lst = numpy.zeros((1, 2, 3))
    feri[dataname + '-kptij'] = kptij_lst

    if shls_slice is None:
        shls_slice = (0, cell.nbas, 0, cell.nbas, 0, auxcell.nbas)

    ao_loc = cell.ao_loc_nr()
    aux_loc = auxcell.ao_loc_nr(auxcell.cart
                                or 'ssc' in intor)[:shls_slice[5] + 1]
    ni = ao_loc[shls_slice[1]] - ao_loc[shls_slice[0]]
    nj = ao_loc[shls_slice[3]] - ao_loc[shls_slice[2]]
    naux = aux_loc[shls_slice[5]] - aux_loc[shls_slice[4]]
    nkptij = len(kptij_lst)

    nii = (ao_loc[shls_slice[1]] * (ao_loc[shls_slice[1]] + 1) // 2 -
           ao_loc[shls_slice[0]] * (ao_loc[shls_slice[0]] + 1) // 2)
    nij = ni * nj

    kpti = kptij_lst[:, 0]
    kptj = kptij_lst[:, 1]
    aosym_ks2 = abs(kpti - kptj).sum(axis=1) < KPT_DIFF_TOL
    j_only = numpy.all(aosym_ks2)
    #aosym_ks2 &= (aosym[:2] == 's2' and shls_slice[:2] == shls_slice[2:4])
    aosym_ks2 &= aosym[:2] == 's2'

    if j_only and aosym[:2] == 's2':
        assert (shls_slice[2] == 0)
        nao_pair = nii
    else:
        nao_pair = nij

    if gamma_point(kptij_lst):
        dtype = numpy.double
    else:
        dtype = numpy.complex128

    buflen = max(8, int(max_memory * .47e6 / 16 / (nkptij * ni * nj * comp)))
    auxdims = aux_loc[shls_slice[4] + 1:shls_slice[5] +
                      1] - aux_loc[shls_slice[4]:shls_slice[5]]
    auxranges = balance_segs(auxdims, buflen)
    buflen = max([x[2] for x in auxranges])
    buf = numpy.empty(nkptij * comp * ni * nj * buflen, dtype=dtype)
    buf1 = numpy.empty_like(buf)

    int3c = wrap_int3c(cell, auxcell, intor, aosym, comp, kptij_lst)

    kptis = kptij_lst[:, 0]
    kptjs = kptij_lst[:, 1]
    kpt_ji = kptjs - kptis
    uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
    # sorted_ij_idx: Sort and group the kptij_lst according to the ordering in
    # df._make_j3c to reduce the data fragment in the hdf5 file.  When datasets
    # are written to hdf5, they are saved sequentially. If the integral data are
    # saved as the order of kptij_lst, removing the datasets in df._make_j3c will
    # lead to holes that can not be reused.
    sorted_ij_idx = numpy.hstack(
        [numpy.where(uniq_inverse == k)[0] for k, kpt in enumerate(uniq_kpts)])
    tril_idx = numpy.tril_indices(ni)
    tril_idx = tril_idx[0] * ni + tril_idx[1]

    def save(istep, mat):
        for k in sorted_ij_idx:
            v = mat[k]
            if gamma_point(kptij_lst[k]):
                v = v.real
            if aosym_ks2[k] and nao_pair == ni**2:
                v = v[:, tril_idx]
            feri['%s/%d/%d' % (dataname, k, istep)] = v

    with lib.call_in_background(save) as bsave:
        for istep, auxrange in enumerate(auxranges):
            sh0, sh1, nrow = auxrange
            sub_slice = (shls_slice[0], shls_slice[1], shls_slice[2],
                         shls_slice[3], shls_slice[4] + sh0,
                         shls_slice[4] + sh1)
            mat = numpy.ndarray((nkptij, comp, nao_pair, nrow),
                                dtype=dtype,
                                buffer=buf)
            bsave(istep, int3c(sub_slice, mat))
            buf, buf1 = buf1, buf

    if not isinstance(erifile, h5py.Group):
        feri.close()
    return erifile
Beispiel #42
0
def _make_eris_outcore(myci, mo_coeff=None):
    cput0 = (time.clock(), time.time())
    log = logger.Logger(myci.stdout, myci.verbose)
    eris = _RCISD_ERIs(myci, mo_coeff)

    mol = myci.mol
    mo_coeff = eris.mo_coeff
    nocc = eris.nocc
    nao, nmo = mo_coeff.shape
    nvir = nmo - nocc
    orbo = mo_coeff[:, :nocc]
    orbv = mo_coeff[:, nocc:]
    nvpair = nvir * (nvir + 1) // 2
    eris.feri1 = lib.H5TmpFile()
    eris.oooo = eris.feri1.create_dataset('oooo', (nocc, nocc, nocc, nocc),
                                          'f8')
    eris.vvoo = eris.feri1.create_dataset('vvoo', (nvir, nvir, nocc, nocc),
                                          'f8')
    eris.vooo = eris.feri1.create_dataset('vooo', (nvir, nocc, nocc, nocc),
                                          'f8')
    eris.voov = eris.feri1.create_dataset('voov', (nvir, nocc, nocc, nvir),
                                          'f8')
    eris.vovv = eris.feri1.create_dataset('vovv', (nvir, nocc, nvpair), 'f8')

    nvir_pair = nvir * (nvir + 1) // 2
    oovv = numpy.empty((nocc, nocc, nvir, nvir))

    def save_occ_frac(p0, p1, eri):
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.oooo[p0:p1] = eri[:, :, :nocc, :nocc]
        oovv[p0:p1] = eri[:, :, nocc:, nocc:]

    def save_vir_frac(p0, p1, eri):
        eri = eri.reshape(p1 - p0, nocc, nmo, nmo)
        eris.vooo[p0:p1] = eri[:, :, :nocc, :nocc]
        eris.voov[p0:p1] = eri[:, :, :nocc, nocc:]
        vv = _cp(eri[:, :, nocc:, nocc:].reshape((p1 - p0) * nocc, nvir, nvir))
        eris.vovv[p0:p1] = lib.pack_tril(vv).reshape(p1 - p0, nocc, nvir_pair)

    cput1 = time.clock(), time.time()
    if not myci.direct:
        max_memory = max(2000, myci.max_memory - lib.current_memory()[0])
        eris.feri2 = lib.H5TmpFile()
        ao2mo.full(mol, orbv, eris.feri2, max_memory=max_memory, verbose=log)
        eris.vvvv = eris.feri2['eri_mo']
        cput1 = log.timer_debug1('transforming vvvv', *cput1)

    tmpfile3 = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
    with h5py.File(tmpfile3.name, 'w') as fswap:
        mo_coeff = numpy.asarray(mo_coeff, order='F')
        max_memory = max(2000, myci.max_memory - lib.current_memory()[0])
        int2e = mol._add_suffix('int2e')
        ao2mo.outcore.half_e1(mol, (mo_coeff, mo_coeff[:, :nocc]),
                              fswap,
                              int2e,
                              's4',
                              1,
                              max_memory,
                              verbose=log)

        ao_loc = mol.ao_loc_nr()
        nao_pair = nao * (nao + 1) // 2
        blksize = int(
            min(8e9, max_memory * .5e6) / 8 / (nao_pair + nmo**2) / nocc)
        blksize = max(1, min(nmo * nocc, blksize))
        fload = ao2mo.outcore._load_from_h5g

        def prefetch(p0, p1, rowmax, buf):
            p0, p1 = p1, min(rowmax, p1 + blksize)
            if p0 < p1:
                fload(fswap['0'], p0 * nocc, p1 * nocc, buf)

        buf = numpy.empty((blksize * nocc, nao_pair))
        buf_prefetch = numpy.empty_like(buf)
        outbuf = numpy.empty((blksize * nocc, nmo**2))
        with lib.call_in_background(prefetch) as bprefetch:
            fload(fswap['0'], 0, min(nocc, blksize) * nocc, buf_prefetch)
            for p0, p1 in lib.prange(0, nocc, blksize):
                nrow = (p1 - p0) * nocc
                buf, buf_prefetch = buf_prefetch, buf
                bprefetch(p0, p1, nocc, buf_prefetch)
                dat = ao2mo._ao2mo.nr_e2(buf[:nrow],
                                         mo_coeff, (0, nmo, 0, nmo),
                                         's4',
                                         's1',
                                         out=outbuf,
                                         ao_loc=ao_loc)
                save_occ_frac(p0, p1, dat)

            fload(fswap['0'], nocc**2,
                  min(nmo, nocc + blksize) * nocc, buf_prefetch)
            for p0, p1 in lib.prange(0, nvir, blksize):
                nrow = (p1 - p0) * nocc
                buf, buf_prefetch = buf_prefetch, buf
                bprefetch(nocc + p0, nocc + p1, nmo, buf_prefetch)
                dat = ao2mo._ao2mo.nr_e2(buf[:nrow],
                                         mo_coeff, (0, nmo, 0, nmo),
                                         's4',
                                         's1',
                                         out=outbuf,
                                         ao_loc=ao_loc)
                save_vir_frac(p0, p1, dat)

        cput1 = log.timer_debug1('transforming oppp', *cput1)
    eris.vvoo[:] = lib.transpose(oovv.reshape(nocc**2, -1)).reshape(
        nvir, nvir, nocc, nocc)
    log.timer('CISD integral transformation', *cput0)
    return eris
Beispiel #43
0
def _assemble(mydf, kptij_lst, j3c_jobs, gen_int3c, ft_fuse, cderi_file, fswap,
              log):
    t1 = (time.clock(), time.time())
    cell = mydf.cell
    ao_loc = cell.ao_loc_nr()
    nao = ao_loc[-1]
    kptis = kptij_lst[:, 0]
    kptjs = kptij_lst[:, 1]
    kpt_ji = kptjs - kptis
    uniq_kpts, uniq_index, uniq_inverse = unique(kpt_ji)
    aosym_s2 = numpy.einsum('ix->i', abs(kptis - kptjs)) < 1e-9

    t2 = t1
    j3c_workers = numpy.zeros(len(j3c_jobs), dtype=int)
    #for job_id, ish0, ish1 in mpi.work_share_partition(j3c_jobs):
    for job_id, ish0, ish1 in mpi.work_stealing_partition(j3c_jobs):
        gen_int3c(job_id, ish0, ish1)
        t2 = log.alltimer_debug2('int j3c %d' % job_id, *t2)

        for k, kpt in enumerate(uniq_kpts):
            ft_fuse(job_id, k, ish0, ish1)
            t2 = log.alltimer_debug2('ft-fuse %d k %d' % (job_id, k), *t2)

        j3c_workers[job_id] = rank
    j3c_workers = mpi.allreduce(j3c_workers)
    log.debug2('j3c_workers %s', j3c_workers)
    t1 = log.timer_debug1('int3c and fuse', *t1)

    # Pass 2
    # Transpose 3-index tensor and save data in cderi_file
    feri = h5py.File(cderi_file, 'w')
    nauxs = [fswap['j2c/%d' % k].shape[0] for k, kpt in enumerate(uniq_kpts)]
    segsize = (max(nauxs) + mpi.pool.size - 1) // mpi.pool.size
    naux0 = rank * segsize
    for k, kptij in enumerate(kptij_lst):
        naux1 = min(nauxs[uniq_inverse[k]], naux0 + segsize)
        nrow = max(0, naux1 - naux0)
        if gamma_point(kptij):
            dtype = 'f8'
        else:
            dtype = 'c16'
        if aosym_s2[k]:
            nao_pair = nao * (nao + 1) // 2
        else:
            nao_pair = nao * nao
        feri.create_dataset('j3c/%d' % k, (nrow, nao_pair),
                            dtype,
                            maxshape=(None, nao_pair))

    def get_segs_loc(aosym):
        off0 = numpy.asarray([ao_loc[i0] for x, i0, i1 in j3c_jobs])
        off1 = numpy.asarray([ao_loc[i1] for x, i0, i1 in j3c_jobs])
        if aosym:  # s2
            dims = off1 * (off1 + 1) // 2 - off0 * (off0 + 1) // 2
        else:
            dims = (off1 - off0) * nao
        #dims = numpy.asarray([ao_loc[i1]-ao_loc[i0] for x,i0,i1 in j3c_jobs])
        dims = numpy.hstack(
            [dims[j3c_workers == w] for w in range(mpi.pool.size)])
        job_idx = numpy.hstack(
            [numpy.where(j3c_workers == w)[0] for w in range(mpi.pool.size)])
        segs_loc = numpy.append(0, numpy.cumsum(dims))
        segs_loc = [(segs_loc[j], segs_loc[j + 1])
                    for j in numpy.argsort(job_idx)]
        return segs_loc

    segs_loc_s1 = get_segs_loc(False)
    segs_loc_s2 = get_segs_loc(True)

    job_ids = numpy.where(rank == j3c_workers)[0]

    def load(k, p0, p1):
        naux1 = nauxs[uniq_inverse[k]]
        slices = [(min(i * segsize + p0, naux1), min(i * segsize + p1, naux1))
                  for i in range(mpi.pool.size)]
        segs = []
        for p0, p1 in slices:
            val = [
                fswap['j3c-chunks/%d/%d' % (job, k)][p0:p1].ravel()
                for job in job_ids
            ]
            if val:
                segs.append(numpy.hstack(val))
            else:
                segs.append(numpy.zeros(0))
        return segs

    def save(k, p0, p1, segs):
        segs = mpi.alltoall(segs)
        naux1 = nauxs[uniq_inverse[k]]
        loc0, loc1 = min(p0, naux1 - naux0), min(p1, naux1 - naux0)
        nL = loc1 - loc0
        if nL > 0:
            if aosym_s2[k]:
                segs = numpy.hstack([
                    segs[i0 * nL:i1 * nL].reshape(nL, -1)
                    for i0, i1 in segs_loc_s2
                ])
            else:
                segs = numpy.hstack([
                    segs[i0 * nL:i1 * nL].reshape(nL, -1)
                    for i0, i1 in segs_loc_s1
                ])
            feri['j3c/%d' % k][loc0:loc1] = segs

    mem_now = max(comm.allgather(lib.current_memory()[0]))
    max_memory = max(2000, min(8000, mydf.max_memory - mem_now))
    if numpy.all(aosym_s2):
        if gamma_point(kptij_lst):
            blksize = max(16, int(max_memory * .5e6 / 8 / nao**2))
        else:
            blksize = max(16, int(max_memory * .5e6 / 16 / nao**2))
    else:
        blksize = max(16, int(max_memory * .5e6 / 16 / nao**2 / 2))
    log.debug1('max_momory %d MB (%d in use), blksize %d', max_memory, mem_now,
               blksize)

    t2 = t1
    with lib.call_in_background(save) as async_write:
        for k, kptji in enumerate(kptij_lst):
            for p0, p1 in lib.prange(0, segsize, blksize):
                segs = load(k, p0, p1)
                async_write(k, p0, p1, segs)
                t2 = log.timer_debug1(
                    'assemble k=%d %d:%d (in %d)' % (k, p0, p1, segsize), *t2)

    if 'j2c-' in fswap:
        j2c_kpts_lists = []
        for k, kpt in enumerate(uniq_kpts):
            if ('j2c-/%d' % k) in fswap:
                adapted_ji_idx = numpy.where(uniq_inverse == k)[0]
                j2c_kpts_lists.append(adapted_ji_idx)

        for k in numpy.hstack(j2c_kpts_lists):
            val = [
                numpy.asarray(fswap['j3c-/%d/%d' % (job, k)]).ravel()
                for job in job_ids
            ]
            val = mpi.gather(numpy.hstack(val))
            if rank == 0:
                naux1 = fswap['j3c-/0/%d' % k].shape[0]
                if aosym_s2[k]:
                    v = [
                        val[i0 * naux1:i1 * naux1].reshape(naux1, -1)
                        for i0, i1 in segs_loc_s2
                    ]
                else:
                    v = [
                        val[i0 * naux1:i1 * naux1].reshape(naux1, -1)
                        for i0, i1 in segs_loc_s1
                    ]
                feri['j3c-/%d' % k] = numpy.hstack(v)

    if 'j3c-kptij' in feri: del (feri['j3c-kptij'])
    feri['j3c-kptij'] = kptij_lst
    t1 = log.alltimer_debug1('assembling j3c', *t1)
    feri.close()
Beispiel #44
0
def _ao2mo_ovov(mp, orbs, feri, max_memory=2000, verbose=None):
    time0 = (time.clock(), time.time())
    log = logger.new_logger(mp, verbose)
    orboa = numpy.asarray(orbs[0], order='F')
    orbva = numpy.asarray(orbs[1], order='F')
    orbob = numpy.asarray(orbs[2], order='F')
    orbvb = numpy.asarray(orbs[3], order='F')
    nao, nocca = orboa.shape
    noccb = orbob.shape[1]
    nvira = orbva.shape[1]
    nvirb = orbvb.shape[1]

    mol = mp.mol
    int2e = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, int2e, 'CVHFnr_schwarz_cond',
                             'CVHFsetnr_direct_scf')
    nbas = mol.nbas
    assert(nvira <= nao)
    assert(nvirb <= nao)

    ao_loc = mol.ao_loc_nr()
    dmax = max(4, min(nao/3, numpy.sqrt(max_memory*.95e6/8/(nao+nocca)**2)))
    sh_ranges = ao2mo.outcore.balance_partition(ao_loc, dmax)
    dmax = max(x[2] for x in sh_ranges)
    eribuf = numpy.empty((nao,dmax,dmax,nao))
    ftmp = lib.H5TmpFile()
    disk = (nocca**2*(nao*(nao+dmax)/2+nvira**2) +
            noccb**2*(nao*(nao+dmax)/2+nvirb**2) +
            nocca*noccb*(nao**2+nvira*nvirb))
    log.debug('max_memory %s MB (dmax = %s) required disk space %g MB',
              max_memory, dmax, disk*8/1e6)

    fint = gto.moleintor.getints4c
    aa_blk_slices = []
    ab_blk_slices = []
    count_ab = 0
    count_aa = 0
    time1 = time0
    with lib.call_in_background(ftmp.__setitem__) as save:
        for ish0, ish1, ni in sh_ranges:
            for jsh0, jsh1, nj in sh_ranges:
                i0, i1 = ao_loc[ish0], ao_loc[ish1]
                j0, j1 = ao_loc[jsh0], ao_loc[jsh1]

                eri = fint(int2e, mol._atm, mol._bas, mol._env,
                           shls_slice=(0,nbas,ish0,ish1, jsh0,jsh1,0,nbas),
                           aosym='s1', ao_loc=ao_loc, cintopt=ao2mopt._cintopt,
                           out=eribuf)
                tmp_i = lib.ddot(orboa.T, eri.reshape(nao,(i1-i0)*(j1-j0)*nao))
                tmp_li = lib.ddot(orbob.T, tmp_i.reshape(nocca*(i1-i0)*(j1-j0),nao).T)
                tmp_li = tmp_li.reshape(noccb,nocca,(i1-i0),(j1-j0))
                save('ab/%d'%count_ab, tmp_li.transpose(1,0,2,3))
                ab_blk_slices.append((i0,i1,j0,j1))
                count_ab += 1

                if ish0 >= jsh0:
                    tmp_li = lib.ddot(orboa.T, tmp_i.reshape(nocca*(i1-i0)*(j1-j0),nao).T)
                    tmp_li = tmp_li.reshape(nocca,nocca,(i1-i0),(j1-j0))
                    save('aa/%d'%count_aa, tmp_li.transpose(1,0,2,3))

                    tmp_i = lib.ddot(orbob.T, eri.reshape(nao,(i1-i0)*(j1-j0)*nao))
                    tmp_li = lib.ddot(orbob.T, tmp_i.reshape(noccb*(i1-i0)*(j1-j0),nao).T)
                    tmp_li = tmp_li.reshape(noccb,noccb,(i1-i0),(j1-j0))
                    save('bb/%d'%count_aa, tmp_li.transpose(1,0,2,3))
                    aa_blk_slices.append((i0,i1,j0,j1))
                    count_aa += 1

                time1 = log.timer_debug1('partial ao2mo [%d:%d,%d:%d]' %
                                         (ish0,ish1,jsh0,jsh1), *time1)
    time1 = time0 = log.timer('mp2 ao2mo_ovov pass1', *time0)
    eri = eribuf = tmp_i = tmp_li = None

    fovov = feri.create_dataset('ovov', (nocca*nvira,nocca*nvira), 'f8',
                                chunks=(nvira,nvira))
    fovOV = feri.create_dataset('ovOV', (nocca*nvira,noccb*nvirb), 'f8',
                                chunks=(nvira,nvirb))
    fOVOV = feri.create_dataset('OVOV', (noccb*nvirb,noccb*nvirb), 'f8',
                                chunks=(nvirb,nvirb))
    occblk = int(min(max(nocca,noccb),
                     max(4, 250/nocca, max_memory*.9e6/8/(nao**2*nocca)/5)))

    def load_aa(h5g, nocc, i0, eri):
        if i0 < nocc:
            i1 = min(i0+occblk, nocc)
            for k, (p0,p1,q0,q1) in enumerate(aa_blk_slices):
                eri[:i1-i0,:,p0:p1,q0:q1] = h5g[str(k)][i0:i1]
                if p0 != q0:
                    dat = numpy.asarray(h5g[str(k)][:,i0:i1])
                    eri[:i1-i0,:,q0:q1,p0:p1] = dat.transpose(1,0,3,2)

    def load_ab(h5g, nocca, i0, eri):
        if i0 < nocca:
            i1 = min(i0+occblk, nocca)
            for k, (p0,p1,q0,q1) in enumerate(ab_blk_slices):
                eri[:i1-i0,:,p0:p1,q0:q1] = h5g[str(k)][i0:i1]

    def save(h5dat, nvir, i0, i1, dat):
        for i in range(i0, i1):
            h5dat[i*nvir:(i+1)*nvir] = dat[i-i0].reshape(nvir,-1)

    with lib.call_in_background(save) as bsave:
        with lib.call_in_background(load_aa) as prefetch:
            buf_prefecth = numpy.empty((occblk,nocca,nao,nao))
            buf = numpy.empty_like(buf_prefecth)
            load_aa(ftmp['aa'], nocca, 0, buf_prefecth)
            for i0, i1 in lib.prange(0, nocca, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(ftmp['aa'], nocca, i1, buf_prefecth)
                eri = buf[:i1-i0].reshape((i1-i0)*nocca,nao,nao)
                dat = _ao2mo.nr_e2(eri, orbva, (0,nvira,0,nvira), 's1', 's1')
                bsave(fovov, nvira, i0, i1,
                      dat.reshape(i1-i0,nocca,nvira,nvira).transpose(0,2,1,3))
                time1 = log.timer_debug1('pass2 ao2mo for aa [%d:%d]' % (i0,i1), *time1)

            buf_prefecth = numpy.empty((occblk,noccb,nao,nao))
            buf = numpy.empty_like(buf_prefecth)
            load_aa(ftmp['bb'], noccb, 0, buf_prefecth)
            for i0, i1 in lib.prange(0, noccb, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(ftmp['bb'], noccb, i1, buf_prefecth)
                eri = buf[:i1-i0].reshape((i1-i0)*noccb,nao,nao)
                dat = _ao2mo.nr_e2(eri, orbvb, (0,nvirb,0,nvirb), 's1', 's1')
                bsave(fOVOV, nvirb, i0, i1,
                      dat.reshape(i1-i0,noccb,nvirb,nvirb).transpose(0,2,1,3))
                time1 = log.timer_debug1('pass2 ao2mo for bb [%d:%d]' % (i0,i1), *time1)

        orbvab = numpy.asarray(numpy.hstack((orbva, orbvb)), order='F')
        with lib.call_in_background(load_ab) as prefetch:
            load_ab(ftmp['ab'], nocca, 0, buf_prefecth)
            for i0, i1 in lib.prange(0, nocca, occblk):
                buf, buf_prefecth = buf_prefecth, buf
                prefetch(ftmp['ab'], nocca, i1, buf_prefecth)
                eri = buf[:i1-i0].reshape((i1-i0)*noccb,nao,nao)
                dat = _ao2mo.nr_e2(eri, orbvab, (0,nvira,nvira,nvira+nvirb), 's1', 's1')
                bsave(fovOV, nvira, i0, i1,
                      dat.reshape(i1-i0,noccb,nvira,nvirb).transpose(0,2,1,3))
                time1 = log.timer_debug1('pass2 ao2mo for ab [%d:%d]' % (i0,i1), *time1)

    time0 = log.timer('mp2 ao2mo_ovov pass2', *time0)