def test_unpack_row(self):
    """Check lib.unpack_row against row 4 of lib.unpack_tril.

    A 28-element packed vector is used for both a float input and an
    int32 input; for the int32 input the result dtype is checked as well.
    """
    row_index = 4

    # Float input: only the values are compared.
    packed = numpy.arange(28.)
    expected = lib.unpack_tril(packed)[row_index]
    self.assertTrue(
        numpy.array_equal(expected, lib.unpack_row(packed, row_index)))

    # int32 input: values must match and the dtype must be preserved.
    packed = numpy.arange(28, dtype=numpy.int32)
    expected = lib.unpack_tril(packed)[row_index]
    unpacked = lib.unpack_row(packed, row_index)
    self.assertTrue(numpy.array_equal(expected, unpacked))
    self.assertTrue(unpacked.dtype == numpy.int32)
def general(eri, mo_coeffs, erifile, dataname='eri_mo',
            ioblk_size=IOBLK_SIZE, compact=True, verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.

    Args:
        eri : 8-fold reduced eri vector
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.  A str is
            treated as a file name: an existing HDF5 file is opened in
            append mode, anything else is (re)created.

    Kwargs:
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By
            assigning different dataname, the existing integral file can be
            reused.  If erifile is a file name and the file already contains
            the dataname, the new integrals data will overwrite the old one.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve
            performance
        compact : bool
            When compact is True, depending on the four orbital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation
            symmetry, and return the "plain" MO integrals

    Returns:
        The ``erifile`` argument, unchanged.  When it was given as a file
        name the file is closed before returning.

    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        Assume eri's are 8-fold reduced.
        nij/nkl_pair = npair or i*j/k*l if only transforming a subset

        First half transform:
            Initialize half_eri of size (nij_pair,npair)
                For lo = 1 -> npair
                    Unpack row lo
                    Unpack row lo to matrix E_{uv}^{lo}
                    Transform C_ui^+*E*C_nj -> E_{ij}^{lo}
                    Ravel or pack E_{ij}^{lo}
                    Save E_{ij}^{lo} -> half_eri[:,lo]

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
                For ij = 1 -> nij_pair
                    Load and unpack half_eri[ij,:] -> E_{lo}^{ij}
                    Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                    Repack E_{kl}^{ij}
                    Save E_{kl}^{ij} -> h5d_eri[ij,:]

        Each matrix is indexed by the composite index ij x kl, where ij/kl is
        either npair or ixj/kxl, if only a subset of MOs are being
        transformed.  Since entire rows or columns need to be read in, the
        arrays are chunked such that IOBLK_SIZE = row/col x chunking col/row.
        For example, for the first half transform, we would save in
        nij_pair x IOBLK_SIZE/nij_pair, then load in IOBLK_SIZE/nkl_pair x
        npair for the second half transform.

        ------ kl ----->
        |jxl
        |
        ij
        |
        |
        v

        As a first guess, the chunking size is jxl.  If the super-rows/cols
        are larger than IOBLK_SIZE, then the chunk rectangle jxl is trimmed
        accordingly.  The pathological limiting case is where the dimensions
        nao_pair, nij_pair, or nkl_pair are so large that the arrays are
        chunked 1x1, in which case IOBLK_SIZE needs to be increased.
    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    # time.clock() was removed in Python 3.8; prefer process_time() and fall
    # back to clock() only on interpreters that lack it.
    cpu_clock = getattr(time, 'process_time', None) or time.clock

    nmoi = mo_coeffs[0].shape[1]
    nmoj = mo_coeffs[1].shape[1]
    nmok = mo_coeffs[2].shape[1]
    nmol = mo_coeffs[3].shape[1]
    nao = mo_coeffs[0].shape[0]
    nao_pair = nao*(nao+1) // 2

    # *_red is True when the pair index is stored as a plain (ravelled)
    # product and False when it is stored as a packed lower triangle.
    if compact and iden_coeffs(mo_coeffs[0], mo_coeffs[1]):
        ij_red = False
        nij_pair = nmoi*(nmoi+1) // 2
    else:
        ij_red = True
        nij_pair = nmoi*nmoj
    if compact and iden_coeffs(mo_coeffs[2], mo_coeffs[3]):
        kl_red = False
        nkl_pair = nmok*(nmok+1) // 2
    else:
        kl_red = True
        nkl_pair = nmok*nmol

    # NOTE(review): f8_size is a module-level constant; presumably the size
    # of one float64 in the units of ioblk_size -- confirm.
    chunks_half = (max(1, min(int(ioblk_size//(nao_pair*f8_size)), nmoj)),
                   max(1, min(int(ioblk_size//(nij_pair*f8_size)), nmol)))
    # Ideally, the final transformed eris should have a chunk of nmoj x nmol
    # to optimize read operations.  However, the row size is chunked so that
    # the write operations during the transform can be done as fast as
    # possible.  The row count is clamped to >= 1: a zero would be an invalid
    # HDF5 chunk shape and would make the "ij % chunks_full[0]" below fail.
    chunks_full = (max(1, min(int(ioblk_size//(nkl_pair*f8_size)), nmoj)),
                   nmol)

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            # Explicit append mode: recent h5py opens read-only by default,
            # which would make the delete/create below fail.
            feri = h5py.File(erifile, 'a')
            if dataname in feri:
                del feri[dataname]
        else:
            feri = h5py.File(erifile, 'w', libver='latest')
    else:
        assert(isinstance(erifile, h5py.Group))
        feri = erifile
    h5d_eri = feri.create_dataset(dataname, (nij_pair,nkl_pair), 'f8',
                                  chunks=chunks_full)
    # Temporary file that holds the half-transformed integrals (ij|lo).
    feri_swap = lib.H5TmpFile(libver='latest')
    half_eri = feri_swap.create_dataset(dataname, (nij_pair,nao_pair), 'f8',
                                        chunks=chunks_half)

    log.debug('Memory information:')
    log.debug(' IOBLK_SIZE (MB): {}'.format(ioblk_size))
    log.debug(' jxl {}x{}, half eri chunk dim {}x{}'
              .format(nmoj, nmol, chunks_half[0], chunks_half[1]))
    log.debug(' jxl {}x{}, full eri chunk dim {}x{}'
              .format(nmoj, nmol, chunks_full[0], chunks_full[1]))
    log.debug(' Final disk eri size (MB): {:.3g}, chunked {:.3g}'
              .format(nij_pair*nkl_pair*f8_size,
                      numpy.prod(chunks_full)*f8_size))
    log.debug(' Half transformed eri size (MB): {:.3g}, chunked {:.3g}'
              .format(nij_pair*nao_pair*f8_size,
                      numpy.prod(chunks_half)*f8_size))
    log.debug(' RAM buffer for half transform (MB): {:.3g}'
              .format(nij_pair*chunks_half[1]*f8_size*2))
    log.debug(' RAM buffer for full transform (MB): {:.3g}'
              .format(f8_size*chunks_full[0]*nkl_pair*2
                      + chunks_half[0]*nao_pair*f8_size*2))

    def save1(piece, buf):
        # Write column block number `piece` of the half-transformed array.
        start = piece*chunks_half[1]
        stop = min(start + chunks_half[1], nao_pair)
        half_eri[:,start:stop] = buf[:,:stop-start]

    def load2(piece, dest):
        # Read row block number `piece` of the half-transformed array into
        # `dest`.  The destination is an explicit argument (not a closure
        # variable) so that a background read always fills the buffer that
        # was chosen when the read was scheduled, even if the caller swaps
        # its buffer names before the read completes.
        start = piece*chunks_half[0]
        stop = min(start + chunks_half[0], nij_pair)
        if start < stop:  # nothing to read past the last block
            dest[:stop-start] = half_eri[start:stop]

    def save2(piece, buf):
        # Write row block number `piece` of the fully transformed array.
        start = piece*chunks_full[0]
        stop = min(start + chunks_full[0], nij_pair)
        h5d_eri[start:stop,:] = buf[:stop-start,:]

    # transform \mu\nu -> ij
    cput0 = cpu_clock(), time.time()
    Cimu = mo_coeffs[0].conj().transpose()
    # Two buffers: one is filled while the other is written in background.
    buf_write = numpy.empty((nij_pair,chunks_half[1]))
    buf_out = numpy.empty_like(buf_write)
    wpiece = 0
    with lib.call_in_background(save1) as async_write:
        for lo in range(nao_pair):
            if lo % chunks_half[1] == 0 and lo > 0:
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece, buf_out)
                wpiece += 1
            buf = lib.unpack_row(eri, lo)
            uv = lib.unpack_tril(buf)
            uv = Cimu.dot(uv).dot(mo_coeffs[1])
            if ij_red:
                ij = numpy.ravel(uv)  # grabs by row
            else:
                ij = lib.pack_tril(uv)
            buf_write[:,lo % chunks_half[1]] = ij
    # final write operation & cleanup
    save1(wpiece, buf_write)
    log.timer('(uv|lo) -> (ij|lo)', *cput0)
    uv = None
    ij = None
    buf = None

    # transform \lambda\sigma -> kl
    cput1 = cpu_clock(), time.time()
    Cklam = mo_coeffs[2].conj().transpose()
    buf_write = numpy.empty((chunks_full[0],nkl_pair))
    buf_out = numpy.empty_like(buf_write)
    buf_read = numpy.empty((chunks_half[0],nao_pair))
    buf_prefetch = numpy.empty_like(buf_read)
    rpiece = 0
    wpiece = 0
    with lib.call_in_background(save2, load2) as (async_write, prefetch):
        load2(rpiece, buf_read)
        prefetch(rpiece+1, buf_prefetch)
        for ij in range(nij_pair):
            if ij % chunks_full[0] == 0 and ij > 0:
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece, buf_out)
                wpiece += 1
            if ij % chunks_half[0] == 0 and ij > 0:
                buf_read, buf_prefetch = buf_prefetch, buf_read
                rpiece += 1
                # NOTE(review): relies on lib.call_in_background waiting for
                # the previous background read before starting this one, so
                # buf_read is complete before it is used below -- confirm.
                prefetch(rpiece+1, buf_prefetch)
            lo = lib.unpack_tril(buf_read[ij % chunks_half[0],:])
            lo = Cklam.dot(lo).dot(mo_coeffs[3])
            if kl_red:
                kl = numpy.ravel(lo)
            else:
                kl = lib.pack_tril(lo)
            buf_write[ij % chunks_full[0],:] = kl
    # final write operation
    save2(wpiece, buf_write)
    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    if isinstance(erifile, str):
        feri.close()
    return erifile
def general(eri, mo_coeffs, erifile, dataname='eri_mo',
            ioblk_size=IOBLK_SIZE, compact=True, verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.

    Args:
        eri : 8-fold reduced eri vector
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.  A str is
            treated as a file name: an existing HDF5 file is opened in
            append mode, anything else is (re)created.

    Kwargs:
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By
            assigning different dataname, the existing integral file can be
            reused.  If erifile is a file name and the file already contains
            the dataname, the new integrals data will overwrite the old one.
        ioblk_size : float or int
            The block size for IO, large block size may **not** improve
            performance
        compact : bool
            When compact is True, depending on the four orbital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation
            symmetry, and return the "plain" MO integrals

    Returns:
        The ``erifile`` argument, unchanged.  When it was given as a file
        name the file is closed before returning.

    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        Assume eri's are 8-fold reduced.
        nij/nkl_pair = npair or i*j/k*l if only transforming a subset

        First half transform:
            Initialize half_eri of size (nij_pair,npair)
                For lo = 1 -> npair
                    Unpack row lo
                    Unpack row lo to matrix E_{uv}^{lo}
                    Transform C_ui^+*E*C_nj -> E_{ij}^{lo}
                    Ravel or pack E_{ij}^{lo}
                    Save E_{ij}^{lo} -> half_eri[:,lo]

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
                For ij = 1 -> nij_pair
                    Load and unpack half_eri[ij,:] -> E_{lo}^{ij}
                    Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                    Repack E_{kl}^{ij}
                    Save E_{kl}^{ij} -> h5d_eri[ij,:]

        Each matrix is indexed by the composite index ij x kl, where ij/kl is
        either npair or ixj/kxl, if only a subset of MOs are being
        transformed.  Since entire rows or columns need to be read in, the
        arrays are chunked such that IOBLK_SIZE = row/col x chunking col/row.
        For example, for the first half transform, we would save in
        nij_pair x IOBLK_SIZE/nij_pair, then load in IOBLK_SIZE/nkl_pair x
        npair for the second half transform.

        ------ kl ----->
        |jxl
        |
        ij
        |
        |
        v

        As a first guess, the chunking size is jxl.  If the super-rows/cols
        are larger than IOBLK_SIZE, then the chunk rectangle jxl is trimmed
        accordingly.  The pathological limiting case is where the dimensions
        nao_pair, nij_pair, or nkl_pair are so large that the arrays are
        chunked 1x1, in which case IOBLK_SIZE needs to be increased.
    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    # time.clock() was removed in Python 3.8; prefer process_time() and fall
    # back to clock() only on interpreters that lack it.
    cpu_clock = getattr(time, 'process_time', None) or time.clock

    nmoi = mo_coeffs[0].shape[1]
    nmoj = mo_coeffs[1].shape[1]
    nmok = mo_coeffs[2].shape[1]
    nmol = mo_coeffs[3].shape[1]
    nao = mo_coeffs[0].shape[0]
    nao_pair = nao * (nao + 1) // 2

    # *_red is True when the pair index is stored as a plain (ravelled)
    # product and False when it is stored as a packed lower triangle.
    if compact and iden_coeffs(mo_coeffs[0], mo_coeffs[1]):
        ij_red = False
        nij_pair = nmoi * (nmoi + 1) // 2
    else:
        ij_red = True
        nij_pair = nmoi * nmoj
    if compact and iden_coeffs(mo_coeffs[2], mo_coeffs[3]):
        kl_red = False
        nkl_pair = nmok * (nmok + 1) // 2
    else:
        kl_red = True
        nkl_pair = nmok * nmol

    # NOTE(review): f8_size is a module-level constant; presumably the size
    # of one float64 in the units of ioblk_size -- confirm.
    chunks_half = (
        max(1, min(int(ioblk_size // (nao_pair * f8_size)), nmoj)),
        max(1, min(int(ioblk_size // (nij_pair * f8_size)), nmol)))
    # Ideally, the final transformed eris should have a chunk of nmoj x nmol
    # to optimize read operations.  However, the row size is chunked so that
    # the write operations during the transform can be done as fast as
    # possible.  The row count is clamped to >= 1: a zero would be an invalid
    # HDF5 chunk shape and would make the "ij % chunks_full[0]" below fail.
    chunks_full = (
        max(1, min(int(ioblk_size // (nkl_pair * f8_size)), nmoj)),
        nmol)

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            # Explicit append mode: recent h5py opens read-only by default,
            # which would make the delete/create below fail.
            feri = h5py.File(erifile, 'a')
            if dataname in feri:
                del feri[dataname]
        else:
            feri = h5py.File(erifile, 'w', libver='latest')
    else:
        assert (isinstance(erifile, h5py.Group))
        feri = erifile
    h5d_eri = feri.create_dataset(dataname, (nij_pair, nkl_pair), 'f8',
                                  chunks=chunks_full)
    # Temporary file that holds the half-transformed integrals (ij|lo).
    feri_swap = lib.H5TmpFile(libver='latest')
    half_eri = feri_swap.create_dataset(dataname, (nij_pair, nao_pair), 'f8',
                                        chunks=chunks_half)

    log.debug('Memory information:')
    log.debug(' IOBLK_SIZE (MB): {}'.format(ioblk_size))
    log.debug(' jxl {}x{}, half eri chunk dim {}x{}'.format(
        nmoj, nmol, chunks_half[0], chunks_half[1]))
    log.debug(' jxl {}x{}, full eri chunk dim {}x{}'.format(
        nmoj, nmol, chunks_full[0], chunks_full[1]))
    log.debug(' Final disk eri size (MB): {:.3g}, chunked {:.3g}'.format(
        nij_pair * nkl_pair * f8_size, numpy.prod(chunks_full) * f8_size))
    log.debug(
        ' Half transformed eri size (MB): {:.3g}, chunked {:.3g}'.format(
            nij_pair * nao_pair * f8_size,
            numpy.prod(chunks_half) * f8_size))
    log.debug(' RAM buffer for half transform (MB): {:.3g}'.format(
        nij_pair * chunks_half[1] * f8_size * 2))
    log.debug(' RAM buffer for full transform (MB): {:.3g}'.format(
        f8_size * chunks_full[0] * nkl_pair * 2 +
        chunks_half[0] * nao_pair * f8_size * 2))

    def save1(piece, buf):
        # Write column block number `piece` of the half-transformed array.
        start = piece * chunks_half[1]
        stop = min(start + chunks_half[1], nao_pair)
        half_eri[:, start:stop] = buf[:, :stop - start]

    def load2(piece, dest):
        # Read row block number `piece` of the half-transformed array into
        # `dest`.  The destination is an explicit argument (not a closure
        # variable) so that a background read always fills the buffer that
        # was chosen when the read was scheduled, even if the caller swaps
        # its buffer names before the read completes.
        start = piece * chunks_half[0]
        stop = min(start + chunks_half[0], nij_pair)
        if start < stop:  # nothing to read past the last block
            dest[:stop - start] = half_eri[start:stop]

    def save2(piece, buf):
        # Write row block number `piece` of the fully transformed array.
        start = piece * chunks_full[0]
        stop = min(start + chunks_full[0], nij_pair)
        h5d_eri[start:stop, :] = buf[:stop - start, :]

    # transform \mu\nu -> ij
    cput0 = cpu_clock(), time.time()
    Cimu = mo_coeffs[0].conj().transpose()
    # Two buffers: one is filled while the other is written in background.
    buf_write = numpy.empty((nij_pair, chunks_half[1]))
    buf_out = numpy.empty_like(buf_write)
    wpiece = 0
    with lib.call_in_background(save1) as async_write:
        for lo in range(nao_pair):
            if lo % chunks_half[1] == 0 and lo > 0:
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece, buf_out)
                wpiece += 1
            buf = lib.unpack_row(eri, lo)
            uv = lib.unpack_tril(buf)
            uv = Cimu.dot(uv).dot(mo_coeffs[1])
            if ij_red:
                ij = numpy.ravel(uv)  # grabs by row
            else:
                ij = lib.pack_tril(uv)
            buf_write[:, lo % chunks_half[1]] = ij
    # final write operation & cleanup
    save1(wpiece, buf_write)
    log.timer('(uv|lo) -> (ij|lo)', *cput0)
    uv = None
    ij = None
    buf = None

    # transform \lambda\sigma -> kl
    cput1 = cpu_clock(), time.time()
    Cklam = mo_coeffs[2].conj().transpose()
    buf_write = numpy.empty((chunks_full[0], nkl_pair))
    buf_out = numpy.empty_like(buf_write)
    buf_read = numpy.empty((chunks_half[0], nao_pair))
    buf_prefetch = numpy.empty_like(buf_read)
    rpiece = 0
    wpiece = 0
    with lib.call_in_background(save2, load2) as (async_write, prefetch):
        load2(rpiece, buf_read)
        prefetch(rpiece + 1, buf_prefetch)
        for ij in range(nij_pair):
            if ij % chunks_full[0] == 0 and ij > 0:
                buf_out, buf_write = buf_write, buf_out
                async_write(wpiece, buf_out)
                wpiece += 1
            if ij % chunks_half[0] == 0 and ij > 0:
                buf_read, buf_prefetch = buf_prefetch, buf_read
                rpiece += 1
                # NOTE(review): relies on lib.call_in_background waiting for
                # the previous background read before starting this one, so
                # buf_read is complete before it is used below -- confirm.
                prefetch(rpiece + 1, buf_prefetch)
            lo = lib.unpack_tril(buf_read[ij % chunks_half[0], :])
            lo = Cklam.dot(lo).dot(mo_coeffs[3])
            if kl_red:
                kl = numpy.ravel(lo)
            else:
                kl = lib.pack_tril(lo)
            buf_write[ij % chunks_full[0], :] = kl
    # final write operation
    save2(wpiece, buf_write)
    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    if isinstance(erifile, str):
        feri.close()
    return erifile
def general(eri, mo_coeffs, erifile, dataname='eri_mo',
            ioblk_size=IOBLK_SIZE, compact=True, verbose=logger.NOTE):
    '''For the given four sets of orbitals, transfer arbitrary spherical AO
    integrals to MO integrals on disk.

    Args:
        eri : 8-fold or 4-fold reduced eri array
            The symmetry is detected from the number of elements:
            nao_pair**2 is handled as 4-fold, nao_pair*(nao_pair+1)//2 as
            8-fold.
        mo_coeffs : 4-item list of ndarray
            Four sets of orbital coefficients, corresponding to the four
            indices of (ij|kl)
        erifile : str or h5py File or h5py Group object
            To store the transformed integrals, in HDF5 format.  A str is
            treated as a file name: an existing HDF5 file is opened in
            append mode, anything else is (re)created.

    Kwargs:
        dataname : str
            The dataset name in the erifile (ref the hierarchy of HDF5 format
            http://www.hdfgroup.org/HDF5/doc1.6/UG/09_Groups.html).  By
            assigning different dataname, the existing integral file can be
            reused.  If erifile is a file name and the file already contains
            the dataname, the new integrals data will overwrite the old one.
        ioblk_size : float or int
            The block size for IO (MB), large block size may **not** improve
            performance
        compact : bool
            When compact is True, depending on the four orbital sets, the
            returned MO integrals has (up to 4-fold) permutation symmetry.
            If it's False, the function will abandon any permutation
            symmetry, and return the "plain" MO integrals

    Returns:
        The ``erifile`` argument, unchanged (a file given by name is closed
        before returning).  If there are no ij pairs, an empty ndarray of
        shape (nij_pair, nkl_pair) is returned instead and no file is
        touched.

    Raises:
        NotImplementedError: if the size of eri matches neither the 4-fold
            nor the 8-fold layout.

    Pseudocode / algorithm:
        u = mu
        v = nu
        l = lambda
        o = sigma

        chunk_size = number of rows handled per step, derived from
        ioblk_size (at least 4, at most npair)

        First half transform:
            For each block of chunk_size values of lo
                Transform C_ui^+*E*C_vj -> E_{ij}^{lo} for the block
                (C driver for real data, einsum for complex data)
                Save the transposed block as its own dataset, named by the
                block number, in a temporary HDF5 file

        Second half transform:
            Initialize h5d_eri of size (nij_pair,nkl_pair)
            For each block of chunk_size values of ij
                Load the block of half-transformed rows (prefetched in
                background) and unpack -> E_{lo}^{ij}
                Transform C_{lk}E_{lo}^{ij}C_{ol} -> E_{kl}^{ij}
                Ravel or pack E_{kl}^{ij}
                Save E_{kl}^{ij} -> h5d_eri[ij block,:]
    '''
    log = logger.new_logger(None, verbose)
    log.info('******** ao2mo disk, custom eri ********')

    # time.clock() was removed in Python 3.8; prefer process_time() and fall
    # back to clock() only on interpreters that lack it.
    cpu_clock = getattr(time, 'process_time', None) or time.clock

    eri_ao = numpy.asarray(eri, order='C')
    nao, nmoi = mo_coeffs[0].shape
    nmoj = mo_coeffs[1].shape[1]
    nao_pair = nao*(nao+1)//2
    ijmosym, nij_pair, moij, ijshape = _conc_mos(mo_coeffs[0], mo_coeffs[1],
                                                 compact)
    klmosym, nkl_pair = _conc_mos(mo_coeffs[2], mo_coeffs[3], compact)[:2]
    # Convert (start, stop, start, stop) into (start, count, start, count),
    # the form handed to the C driver below.
    ijshape = (ijshape[0], ijshape[1]-ijshape[0],
               ijshape[2], ijshape[3]-ijshape[2])
    dtype = numpy.result_type(eri, *mo_coeffs)
    typesize = dtype.itemsize/1e6  # in MB

    if nij_pair == 0:
        return numpy.empty((nij_pair,nkl_pair))

    # 's1' means no permutation symmetry: the pair index is a plain product.
    ij_red = ijmosym == 's1'
    kl_red = klmosym == 's1'

    if isinstance(erifile, str):
        if h5py.is_hdf5(erifile):
            feri = h5py.File(erifile, 'a')
            if dataname in feri:
                del feri[dataname]
        else:
            feri = h5py.File(erifile, 'w', libver='latest')
    else:
        assert(isinstance(erifile, h5py.Group))
        feri = erifile
    h5d_eri = feri.create_dataset(dataname, (nij_pair,nkl_pair), dtype.char)
    # Temporary file that holds the half-transformed integrals.
    feri_swap = lib.H5TmpFile(libver='latest')
    chunk_size = min(nao_pair, max(4, int(ioblk_size*1e6/8/nao_pair)))

    log.debug('Memory information:')
    log.debug(' IOBLK_SIZE (MB): {} chunk_size: {}'
              .format(ioblk_size, chunk_size))
    log.debug(' Final disk eri size (MB): {:.3g}'
              .format(nij_pair*nkl_pair*typesize))
    log.debug(' Half transformed eri size (MB): {:.3g}'
              .format(nij_pair*nao_pair*typesize))
    # Two (chunk_size, nij_pair) blocks can be alive at once: one being
    # computed and one being written in background.
    log.debug(' RAM buffer (MB): {:.3g}'
              .format(nij_pair*chunk_size*typesize*2))

    if eri_ao.size == nao_pair**2:  # 4-fold symmetry
        # half_e1 first transforms the indices which are contiguous in memory
        # transpose the 4-fold integrals to make ij the contiguous indices
        eri_ao = lib.transpose(eri_ao)
        ftrans = _ao2mo.libao2mo.AO2MOtranse1_incore_s4
    elif eri_ao.size == nao_pair*(nao_pair+1)//2:
        ftrans = _ao2mo.libao2mo.AO2MOtranse1_incore_s8
    else:
        raise NotImplementedError

    if ijmosym == 's2':
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_s2
    elif nmoi <= nmoj:
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_iltj
    else:
        fmmm = _ao2mo.libao2mo.AO2MOmmm_nr_s2_igtj
    fdrv = _ao2mo.libao2mo.AO2MOnr_e1incore_drv

    def save_half(piece, buf):
        # Each block becomes its own dataset, stored transposed.
        feri_swap[str(piece)] = buf.T

    # transform \mu\nu -> ij
    cput0 = cpu_clock(), time.time()
    with lib.call_in_background(save_half) as async_write:
        for istep, (p0, p1) in enumerate(lib.prange(0, nao_pair, chunk_size)):
            if dtype == numpy.double:
                buf = numpy.empty((p1-p0, nij_pair))
                fdrv(ftrans, fmmm,
                     buf.ctypes.data_as(ctypes.c_void_p),
                     eri_ao.ctypes.data_as(ctypes.c_void_p),
                     moij.ctypes.data_as(ctypes.c_void_p),
                     ctypes.c_int(p0), ctypes.c_int(p1-p0),
                     ctypes.c_int(nao),
                     ctypes.c_int(ijshape[0]), ctypes.c_int(ijshape[1]),
                     ctypes.c_int(ijshape[2]), ctypes.c_int(ijshape[3]))
            else:  # complex
                if eri_ao.size == nao_pair**2:  # 4-fold symmetry
                    tmp = eri_ao[p0:p1]
                else:  # 8-fold symmetry
                    # Allocate with the integrals' own dtype so that complex
                    # values are not truncated to their real part.
                    tmp = numpy.empty((p1-p0, nao_pair), dtype=eri_ao.dtype)
                    for i in range(p0, p1):
                        tmp[i-p0] = lib.unpack_row(eri_ao, i)
                tmp = lib.unpack_tril(tmp, filltriu=lib.SYMMETRIC)
                buf = lib.einsum('xpq,pi,qj->xij', tmp,
                                 mo_coeffs[0].conj(), mo_coeffs[1])
                if ij_red:
                    buf = buf.reshape(p1-p0,-1)  # grabs by row
                else:
                    buf = lib.pack_tril(buf)

            async_write(istep, buf)

    log.timer('(uv|lo) -> (ij|lo)', *cput0)

    # transform \lambda\sigma -> kl
    cput1 = cpu_clock(), time.time()
    Cklam = mo_coeffs[2].conj()
    buf_read = numpy.empty((chunk_size,nao_pair), dtype=dtype)
    buf_prefetch = numpy.empty_like(buf_read)

    def load_half(start, stop, buf):
        # NOTE(review): presumably _load_from_h5g fills buf with rows
        # start:stop gathered from the per-block datasets -- confirm.
        if start < stop:
            _load_from_h5g(feri_swap, start, stop, buf)

    def save_full(start, stop, buf):
        if start < stop:
            h5d_eri[start:stop] = buf[:stop-start]

    with lib.call_in_background(save_full, load_half) as (async_write,
                                                          prefetch):
        for p0, p1 in lib.prange(0, nij_pair, chunk_size):
            if p0 == 0:
                # Nothing has been prefetched yet: load the first block now.
                load_half(p0, p1, buf_prefetch)

            # The freshly loaded block becomes the read buffer; the other
            # buffer is handed to the background reader for the next block.
            buf_read, buf_prefetch = buf_prefetch, buf_read
            prefetch(p1, min(p1+chunk_size, nij_pair), buf_prefetch)

            lo = lib.unpack_tril(buf_read[:p1-p0], filltriu=lib.SYMMETRIC)
            lo = lib.einsum('xpq,pi,qj->xij', lo, Cklam, mo_coeffs[3])
            if kl_red:
                kl = lo.reshape(p1-p0,-1)
            else:
                kl = lib.pack_tril(lo)
            async_write(p0, p1, kl)

    log.timer('(ij|lo) -> (ij|kl)', *cput1)

    if isinstance(erifile, str):
        feri.close()
    return erifile