Code example #1
def run_index(args):
    phrase_path = os.path.join(args.dump_dir, 'phrase.hdf5')
    if os.path.exists(phrase_path):
        dump_paths = [phrase_path]
    else:
        dump_names = os.listdir(os.path.join(args.dump_dir, 'phrase'))
        dump_paths = [os.path.join(args.dump_dir, 'phrase', name) for name in dump_names if name.endswith('.hdf5')]

    data = None

    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            data, max_norm = sample_data(dump_paths, max_norm=args.max_norm, para=args.para,
                                         doc_sample_ratio=args.doc_sample_ratio, vec_sample_ratio=args.vec_sample_ratio,
                                         max_norm_cf=args.max_norm_cf, num_dummy_zeros=args.num_dummy_zeros,
                                         norm_th=args.norm_th)
            with open(args.max_norm_path, 'w') as fp:
                json.dump(max_norm, fp)
            train_coarse_quantizer(data, args.quantizer_path, args.num_clusters, cuda=args.cuda)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if data is None:
                data, _ = sample_data(dump_paths, max_norm=max_norm, para=args.para,
                                      doc_sample_ratio=args.doc_sample_ratio, vec_sample_ratio=args.vec_sample_ratio,
                                      num_dummy_zeros=args.num_dummy_zeros, norm_th=args.norm_th)
            train_index(data, args.quantizer_path, args.trained_index_path, fine_quant=args.fine_quant, cuda=args.cuda)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(dump_paths, args.trained_index_path, args.index_path, args.idx2id_path,
                         max_norm=max_norm, para=args.para, num_dummy_zeros=args.num_dummy_zeros, cuda=args.cuda,
                         num_docs_per_add=args.num_docs_per_add, offset=args.offset, norm_th=args.norm_th,
                         fine_quant=args.fine_quant)

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path, args.index_path, args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(
            index.nlist, index.code_size,
            args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)
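
All of these examples revolve around the same faiss pattern: create an OnDiskInvertedLists sized with the index's nlist and code_size, attach it with replace_invlists, and persist the small index header with write_index while the codes stay in the .ivfdata file. A minimal sketch of that pattern, with hypothetical file names and random data:

import faiss
import numpy as np

index = faiss.read_index("trained.index")            # a trained, still-empty IVF index
invlists = faiss.OnDiskInvertedLists(
    index.nlist, index.code_size, "vectors.ivfdata") # codes will be stored in this file
index.replace_invlists(invlists)
index.add(np.random.rand(10000, index.d).astype("float32"))
faiss.write_index(index, "vectors.index")            # small header; data stays on disk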
Code example #2
def merge_indexes(subindex_dir, trained_index_path, target_index_path, target_idx2id_path, target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.hdf5')]
    index_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.faiss')]

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                offset = str(in_.attrs['offset'])
                group = out.create_group(offset)
                group.create_dataset('doc', data=in_['doc'])
                group.create_dataset('para', data=in_['para'])
                group.create_dataset('word', data=in_['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        index = faiss.read_index(index_path,
                                 faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(
        index.nlist, index.code_size,
        target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path)
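
After a merge like the one above, target_index_path holds only a small header that references the codes in target_inv_path. A usage sketch for searching it (paths and query shapes are hypothetical):

import faiss
import numpy as np

index = faiss.read_index("target.index")  # must still find the .ivfdata at its recorded path
index.nprobe = 16                         # visit more inverted lists for better recall
queries = np.random.rand(4, index.d).astype("float32")
D, I = index.search(queries, 10)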
Code example #3
File: test_index_composite.py  Project: yaobh/faiss
    def test_rename(self):
        d = 10
        nb = 500
        nq = 100
        nt = 100

        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        quantizer = faiss.IndexFlatL2(d)

        index1 = faiss.IndexIVFFlat(quantizer, d, 20)
        index1.train(xt)

        dirname = tempfile.mkdtemp()

        try:

            # make an index with ondisk invlists
            invlists = faiss.OnDiskInvertedLists(
                index1.nlist, index1.code_size,
                dirname + '/aa.ondisk')
            index1.replace_invlists(invlists)
            index1.add(xb)
            D1, I1 = index1.search(xq, 10)
            faiss.write_index(index1, dirname + '/aa.ivf')

            # move the index elsewhere
            os.mkdir(dirname + '/1')
            for fname in 'aa.ondisk', 'aa.ivf':
                os.rename(dirname + '/' + fname,
                          dirname + '/1/' + fname)

            # try to read it: fails!
            try:
                index2 = faiss.read_index(dirname + '/1/aa.ivf')
            except RuntimeError:
                pass   # normal
            else:
                assert False

            # read it with magic flag
            index2 = faiss.read_index(dirname + '/1/aa.ivf',
                                      faiss.IO_FLAG_ONDISK_SAME_DIR)
            D2, I2 = index2.search(xq, 10)
            assert np.all(I1 == I2)

        finally:
            shutil.rmtree(dirname)
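
The failed first read is the point of this test: an index with OnDiskInvertedLists stores the .ondisk filename as it was given at creation time, so moving both files together breaks the reference. The IO_FLAG_ONDISK_SAME_DIR flag tells read_index to look for the inverted-list file in the same directory as the index file instead, which is why the second read succeeds.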
Code example #4
File: utils.py  Project: jeremyephron/forager
def merge_on_disk(trained_index: faiss.Index, shard_fnames: List[str],
                  ivfdata_fname: str) -> None:
    """
	Adds the contents of the indexes stored in shard_fnames into the index
    trained_index. The on-disk data is stored in ivfdata_fname.

    Args:
        trained_index: The trained index to add the data to.
        shard_fnames: A list of the partial index filenames.
        ivfdata_fname: The filename for the on-disk extracted data.

	"""

    # Load the inverted lists
    ivfs = []
    for fname in shard_fnames:
        # The IO_FLAG_MMAP is to avoid actually loading the data, and thus the
        # total size of the inverted lists can exceed the available RAM
        index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        index_ivf = faiss.extract_index_ivf(index)
        ivfs.append(index_ivf.invlists)

        # Avoid deallocating the invlists with the index
        index_ivf.own_invlists = False

    # Construct the output index
    index = trained_index
    index_ivf = faiss.extract_index_ivf(index)

    assert index.ntotal == 0, 'The trained index should be empty'

    # Prepare the output inverted lists, which are written to ivfdata_fname.
    invlists = faiss.OnDiskInvertedLists(index_ivf.nlist, index_ivf.code_size,
                                         ivfdata_fname)

    # Merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    n_total = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # Replace the inverted lists in the output index
    index.ntotal = index_ivf.ntotal = n_total
    index_ivf.replace_invlists(invlists, True)
    invlists.this.disown()
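
Note the ownership handling at the end: the second argument to replace_invlists(invlists, True) hands ownership of the lists to the index, and invlists.this.disown() tells the SWIG wrapper not to free the same object again from the Python side, avoiding a double free when both are garbage-collected.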
Code example #5
File: ondisk.py  Project: ifeherva/faiss
def merge_ondisk(trained_index: faiss.Index, shard_fnames: List[str],
                 ivfdata_fname: str) -> None:
    """Add the contents of the indexes stored in shard_fnames into the index
    trained_index. The on-disk data is stored in ivfdata_fname"""
    assert not isinstance(
        trained_index,
        faiss.IndexIVFPQR), "IndexIVFPQR is not supported as an on disk index."
    # merge the images into an on-disk index
    # first load the inverted lists
    ivfs = []
    for fname in shard_fnames:
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        LOG.info("read " + fname)
        index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        index_ivf = faiss.extract_index_ivf(index)
        ivfs.append(index_ivf.invlists)

        # avoid that the invlists get deallocated with the index
        index_ivf.own_invlists = False

    # construct the output index
    index = trained_index
    index_ivf = faiss.extract_index_ivf(index)

    assert index.ntotal == 0, "works only on empty index"

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(index_ivf.nlist, index_ivf.code_size,
                                         ivfdata_fname)

    # merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    LOG.info("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # now replace the inverted lists in the output index
    index.ntotal = index_ivf.ntotal = ntotal
    index_ivf.replace_invlists(invlists, True)
    invlists.this.disown()
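
A helper along these lines also ships with faiss itself (as faiss.contrib.ondisk.merge_ondisk in recent releases). A usage sketch with hypothetical shard file names:

import faiss
from faiss.contrib.ondisk import merge_ondisk

index = faiss.read_index("trained.index")
merge_ondisk(index, ["shard0.index", "shard1.index"], "merged.ivfdata")
faiss.write_index(index, "populated.index")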
Code example #6
def mergeIndexes(indexFolder, machineNumber, finalIndexFile):
    indexesToMerge = []
    indexFiles = os.listdir(os.path.dirname(indexFolder))
    for indexFile in indexFiles:
        parts = indexFile.split('_')
        if len(parts) > 2:
            mnum = int(parts[1][-3:])
            if mnum == machineNumber:
                fullFile = os.path.join(os.path.dirname(indexFolder),
                                        indexFile)
                print('mmaping ', fullFile)
                print('memory usage: ', psutil.virtual_memory().percent)
                index = faiss.read_index(fullFile, faiss.IO_FLAG_MMAP)
                indexesToMerge.append(index.invlists)
    print('adding final index')
    with open(finalIndexFile, 'rb') as mainIndexFile:
        mainIndexResource = Resource('indexparameters', mainIndexFile.read(),
                                     'application/octet-stream')
    mainIndex, emptyIndex, preproc, map, all_tmp_paths = deserializeIndex(
        mainIndexResource)
    indexesToMerge.append(mainIndex.invlists)
    print("Merging " + str(len(indexesToMerge)) +
          " index shards for final index")
    finalIndex = emptyIndex
    # write the merged lists next to the final index file (the original
    # referenced an undefined self.index_cachefile here)
    invlists = faiss.OnDiskInvertedLists(
        finalIndex.nlist, finalIndex.code_size,
        os.path.join(os.path.dirname(finalIndexFile),
                     'merged_index.ivfdata'))
    ivf_vector = faiss.InvertedListsPtrVector()
    bar = progressbar.ProgressBar()
    for ivf in bar(indexesToMerge):
        ivf_vector.push_back(ivf)
    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    finalIndex.ntotal = ntotal
    finalIndex.replace_invlists(invlists)
    print('ntotal: ', finalIndex.ntotal)
    outName = "finalIndex_%03d" % machineNumber
    outPath = os.path.join(os.path.dirname(finalIndexFile), outName)
    binaryIndex = serializeIndex(finalIndexFile, map, machineNumber,
                                 all_tmp_paths)
    with open(outPath, 'wb') as of:
        of.write(binaryIndex)
Code example #7
File: test_index_composite.py  Project: yaobh/faiss
    def do_merge_then_remove(self, ondisk):
        d = 10
        nb = 1000
        nq = 200
        nt = 200

        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        quantizer = faiss.IndexFlatL2(d)

        index1 = faiss.IndexIVFFlat(quantizer, d, 20)
        index1.train(xt)

        filename = None
        if ondisk:
            filename = tempfile.mkstemp()[1]
            invlists = faiss.OnDiskInvertedLists(
                index1.nlist, index1.code_size,
                filename)
            index1.replace_invlists(invlists)

        index1.add(xb[:int(nb / 2)])

        index2 = faiss.IndexIVFFlat(quantizer, d, 20)
        assert index2.is_trained
        index2.add(xb[int(nb / 2):])

        Dref, Iref = index1.search(xq, 10)
        index1.merge_from(index2, int(nb / 2))

        assert index1.ntotal == nb

        index1.remove_ids(faiss.IDSelectorRange(int(nb / 2), nb))

        assert index1.ntotal == int(nb / 2)
        Dnew, Inew = index1.search(xq, 10)

        assert np.all(Dnew == Dref)
        assert np.all(Inew == Iref)

        if filename is not None:
            os.unlink(filename)
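
The second argument of index1.merge_from(index2, int(nb / 2)) is an id offset added to every id coming from index2; since both halves were added without explicit ids, the offset makes the merged ids match a single index built from all of xb, which is what lets the remove_ids range selector recover index1's original contents afterwards.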
Code example #8
def mergeIndexList(indexList, emptyIndexPath, outPath, machineNum=""):
    # merge the images into an on-disk index
    # first load the inverted lists
    ivfs = []
    outDir = os.path.dirname(outPath)
    bar = progressbar.ProgressBar()
    for indexFile in bar(indexList):
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        # print("read " + indexFile)
        index = faiss.read_index(indexFile, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(emptyIndexPath)

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    ivfDataStr = os.path.join(outDir,
                              "merged_index_" + machineNum + "_.ivfdata")
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                         ivfDataStr)

    # merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print("write " + outPath)
    faiss.write_index(index, outPath)
    return ivfDataStr
Code example #9
    def merge_IVFs(self,
                   index_path: Union[str, Path],
                   ivfdata_path: Union[str, Path],
                   ivfindex_paths: List[Union[str, Path]] = None) -> int:
        """
        An on-disk index must be built from existing subindexes. The
        inverted file list (IVF) from each subindex is merged into one
        disk-searchable .ivfdata file referenced by the .index file.

        Note: Use self.mv_index_and_ivfdata() to move these files.

        :param index_path: Path to output.index file
        :param ivfdata_path: Path to output.ivfdata file (on-disk searchable data)
        :param ivfindex_paths: Paths to indexes to be merged
        :return: Number of vectors indexed
        """
        # Collect IVF data from subindexes
        ivfs = list()
        if not ivfindex_paths:
            ivfindex_paths = list(self.subindex_path_totals.keys())
        for subindex_path in ivfindex_paths:
            index = faiss.read_index(p.abspath(subindex_path),
                                     faiss.IO_FLAG_MMAP)
            ivfs.append(index.invlists)
            index.own_invlists = False  # Prevents de-allocation
            del index

        # Prepare .ivfdata file
        index = self.load_base_idx()
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                             ivfdata_path)
        ivf_vector = faiss.InvertedListsPtrVector()
        for ivf in ivfs:
            ivf_vector.push_back(ivf)

        # Merge IVF data
        ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
        index.ntotal = ntotal
        index.replace_invlists(invlists)
        faiss.write_index(index, str(index_path))  # faiss expects a str path
        return int(ntotal)
Code example #10
def dumpIndex(indexInMemory, indexOnDiskPath, shardCount):
    if shardCount == 0:
        faiss.write_index(indexInMemory, indexOnDiskPath)
    else:
        # the invlists are pushed into the ivf_vector below; just make sure
        # neither index deallocates them first
        indexInMemory.own_invlists = False
        diskIndex = faiss.read_index(indexOnDiskPath, faiss.IO_FLAG_MMAP)
        diskIndex.own_invlists = False

        invlists = faiss.OnDiskInvertedLists(diskIndex.nlist,
                                             diskIndex.code_size,
                                             'mergedIndex_tmp.ivfdata')
        ivf_vector = faiss.InvertedListsPtrVector()
        ivf_vector.push_back(indexInMemory.invlists)
        ivf_vector.push_back(diskIndex.invlists)
        ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
        indexInMemory.ntotal = ntotal
        indexInMemory.replace_invlists(invlists)
        print('Index on disk now has ', indexInMemory.ntotal, ' entries')
        faiss.write_index(indexInMemory, indexOnDiskPath)
Code example #11
File: index_builder.py  Project: Ljferrer/SimSent
    def make_mmap_index(self, base_index: BASE_INDEX, ids: np.ndarray,
                        embs: np.ndarray):
        # Get invlists
        index = faiss.clone_index(base_index)
        index.add_with_ids(embs, ids)
        ivf_vector = faiss.InvertedListsPtrVector()
        ivf_vector.push_back(index.invlists)
        index.own_invlists = False
        del index
        gc.collect()

        # Make MMAP ivfdata
        index_name = p.abspath(self.sub_dir / f'{self.seed_name}')
        invlists = faiss.OnDiskInvertedLists(base_index.nlist,
                                             base_index.code_size,
                                             f'{index_name}.ivfdata')
        ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

        # Link index to ivfdata and save
        index = faiss.clone_index(base_index)
        index.ntotal = ntotal
        index.replace_invlists(invlists)
        faiss.write_index(index, f'{index_name}.index')
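
The clone-twice pattern here keeps base_index itself untouched: the first clone exists only to populate inverted lists (released with own_invlists = False so they outlive the deleted index), and the second clone becomes the small .index header that references the .ivfdata file produced by merge_from.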
Code example #12
    ivfs = []
    for bno in range(n_blocks):  # loop header reconstructed, n_blocks is a
                                 # hypothetical name for the number of block
                                 # indexes; the scraped snippet began mid-loop
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        print("read " + tmpdir + "block_%d.index" % bno)
        index = faiss.read_index(tmpdir + "block_%d.index" % bno,
                                 faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(tmpdir + "trained.index")

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                         tmpdir + "merged_index.ivfdata")

    # merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print("write " + tmpdir + "populated.index")
    faiss.write_index(index, tmpdir + "populated.index")
Code example #13
File: merge_to_ondisk.py  Project: zuo934/faiss
    ils = faiss.InvertedListsPtrVector()
    ils_dont_dealloc = []
    index0 = None
    for fname in args.inputs:  # loop preamble reconstructed; the scraped snippet began mid-loop
        index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        index_ivf = faiss.extract_index_ivf(index)
        il = index_ivf.invlists
        index_ivf.own_invlists = False
        if args.l0 is not None:  # hypothetical guard; the original condition was truncated
            il.crop_invlists(args.l0, args.l1)
            ils_dont_dealloc.append(il)
        ils.push_back(il)

        if index0 is None:
            index0 = index

    print("loaded %d invlists" % ils.size())

    if not args.outputIL:
        args.outputIL = args.output + '_invlists'

    il0 = ils.at(0)

    il = faiss.OnDiskInvertedLists(il0.nlist, il0.code_size, args.outputIL)

    print("perform merge")

    ntotal = il.merge_from(ils.data(), ils.size(), True)

    print("swap into index0")

    index0_ivf = faiss.extract_index_ivf(index0)
    index0_ivf.nlist = il0.nlist
    index0_ivf.ntotal = index0.ntotal = ntotal
    index0_ivf.invlists = il
    index0_ivf.own_invlists = False

    print("write", args.output)
Code example #14
File: create_index.py  Project: woffett/DensePhrases
def run_index(args):
    dump_names = os.listdir(os.path.join(args.dump_dir, 'phrase'))
    dump_paths = sorted([
        os.path.join(args.dump_dir, 'phrase', name) for name in dump_names
        if name.endswith('.hdf5')
    ])

    start_data = None
    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            start_data = sample_data(dump_paths,
                                     doc_sample_ratio=args.doc_sample_ratio,
                                     vec_sample_ratio=args.vec_sample_ratio,
                                     norm_th=args.norm_th)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            if start_data is None:
                start_data = sample_data(
                    dump_paths,
                    doc_sample_ratio=args.doc_sample_ratio,
                    vec_sample_ratio=args.vec_sample_ratio,
                    norm_th=args.norm_th,
                    hnsw=args.hnsw)
            train_index(start_data,
                        args.quantizer_path,
                        args.trained_index_path,
                        args.num_clusters,
                        fine_quant=args.fine_quant,
                        cuda=args.cuda,
                        hnsw=args.hnsw)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(dump_paths,
                         args.trained_index_path,
                         args.index_path,
                         args.idx2id_path,
                         cuda=args.cuda,
                         num_docs_per_add=args.num_docs_per_add,
                         offset=args.offset,
                         norm_th=args.norm_th,
                         fine_quant=args.fine_quant)

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path,
                          args.index_path, args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                             args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)
Code example #15
def run_index(args):
    dump_names = os.listdir(os.path.join(args.dump_dir, args.phrase_dir))
    dump_paths = sorted([
        os.path.join(args.dump_dir, args.phrase_dir, name)
        for name in dump_names if name.endswith('.hdf5')
    ])

    start_data = None
    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            start_data, avg_vec, std_vec = sample_data(
                dump_paths,
                doc_sample_ratio=args.doc_sample_ratio,
                vec_sample_ratio=args.vec_sample_ratio,
                norm_th=args.norm_th)
            with open(os.path.join(args.index_dir, 'avg_vec.pkl'), 'wb') as fp:
                pickle.dump(avg_vec, fp)
            with open(os.path.join(args.index_dir, 'std_vec.pkl'), 'wb') as fp:
                pickle.dump(std_vec, fp)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            if start_data is None:
                start_data, avg_vec, std_vec = sample_data(
                    dump_paths,
                    doc_sample_ratio=args.doc_sample_ratio,
                    vec_sample_ratio=args.vec_sample_ratio,
                    norm_th=args.norm_th,
                    hnsw=args.hnsw)
            train_index(start_data,
                        args.quantizer_path,
                        args.trained_index_path,
                        args.num_clusters,
                        fine_quant=args.fine_quant,
                        cuda=args.cuda,
                        hnsw=args.hnsw)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            avg_vec = None
            std_vec = None
            # with open(os.path.join(args.index_dir, 'avg_vec.pkl'), 'rb') as fp:
            #     avg_vec = pickle.load(fp)
            # with open(os.path.join(args.index_dir, 'std_vec.pkl'), 'rb') as fp:
            #     std_vec = pickle.load(fp)

            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(
                dump_paths,
                args.trained_index_path,
                args.index_path,
                args.idx2id_path,
                cuda=args.cuda,
                num_docs_per_add=args.num_docs_per_add,
                offset=args.offset,
                norm_th=args.norm_th,
                fine_quant=args.fine_quant,
                avg_vec=avg_vec,
                std_vec=std_vec,
                first_passage=args.first_passage,
                index_filter=args.index_filter,
            )

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path,
                          args.index_path, args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                             args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)