    def __init__(self, name, approximator):
        super(FaissApproximateRetriever, self).__init__(
            name=name, approximator=approximator, algorithm="FAISS")
        self.index_path = str(approximator.index_path).replace('//', '/')
        self.ivfs = []
        self.ivf_vector = faiss.InvertedListsPtrVector()
        self.uuid = str(uuid.uuid4()).replace('-', '_')
        self.faiss_index = None
Example #2
    def test_slice_vstack(self):
        d = 10
        nb = 1000
        nq = 100
        nt = 200

        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        quantizer = faiss.IndexFlatL2(d)
        index = faiss.IndexIVFFlat(quantizer, d, 30)

        index.train(xt)
        index.add(xb)
        Dref, Iref = index.search(xq, 10)

        # faiss.wait()

        il0 = index.invlists
        ils = []
        ilv = faiss.InvertedListsPtrVector()
        for sl in 0, 1, 2:
            il = faiss.SliceInvertedLists(il0, sl * 10, sl * 10 + 10)
            ils.append(il)
            ilv.push_back(il)

        il2 = faiss.VStackInvertedLists(ilv.size(), ilv.data())

        index2 = faiss.IndexIVFFlat(quantizer, d, 30)
        index2.replace_invlists(il2)
        index2.ntotal = index.ntotal

        D, I = index2.search(xq, 10)
        assert np.all(D == Dref)
        assert np.all(I == Iref)
Example #3
    def __init__(self, name, approximator):
        self.name = name
        self.index_path = str(approximator.index_path).replace('//', '/')
        self.ivfs = []
        self.ivf_vector = faiss.InvertedListsPtrVector()
        self.uuid = str(uuid.uuid4()).replace('-', '_')
        self.faiss_index = None
        self.tree = IntervalTree()
        self.loaded_entries = set()
        self.findex = 0
Example #4
def merge_indexes(subindex_dir, trained_index_path, target_index_path, target_idx2id_path, target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.hdf5')]
    index_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.faiss')]

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                offset = str(in_.attrs['offset'])
                group = out.create_group(offset)
                group.create_dataset('doc', data=in_['doc'])
                group.create_dataset('para', data=in_['para'])
                group.create_dataset('word', data=in_['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        index = faiss.read_index(index_path,
                                 faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(
        index.nlist, index.code_size,
        target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path)
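A minimal call sketch for the function above; every path here is hypothetical, and the only assumption is that subindex_dir holds matching .hdf5 (idx2id) and .faiss shard files:

# Hypothetical paths; adjust to wherever the subindexes were written.
merge_indexes(subindex_dir='dump/subindexes',
              trained_index_path='dump/trained.faiss',
              target_index_path='dump/merged.faiss',
              target_idx2id_path='dump/merged_idx2id.hdf5',
              target_inv_path='dump/merged_index.ivfdata')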
Example #5
def merge_on_disk(trained_index: faiss.Index, shard_fnames: List[str],
                  ivfdata_fname: str) -> None:
    """
	Adds the contents of the indexes stored in shard_fnames into the index
    trained_index. The on-disk data is stored in ivfdata_fname.

    Args:
        trained_index: The trained index to add the data to.
        shard_fnames: A list of the partial index filenames.
        ivfdata_fname: The filename for the on-disk extracted data.

	"""

    # Load the inverted lists
    ivfs = []
    for fname in shard_fnames:
        # The IO_FLAG_MMAP is to avoid actually loading the data, and thus the
        # total size of the inverted lists can exceed the available RAM
        index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        index_ivf = faiss.extract_index_ivf(index)
        ivfs.append(index_ivf.invlists)

        # Avoid deallocating the invlists with the index
        index_ivf.own_invlists = False

    # Construct the output index
    index = trained_index
    index_ivf = faiss.extract_index_ivf(index)

    assert index.ntotal == 0, 'The trained index should be empty'

    # Prepare the output inverted lists, which are written to ivfdata_fname.
    invlists = faiss.OnDiskInvertedLists(index_ivf.nlist, index_ivf.code_size,
                                         ivfdata_fname)

    # Merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    n_total = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # Replace the inverted lists in the output index
    index.ntotal = index_ivf.ntotal = n_total
    index_ivf.replace_invlists(invlists, True)
    invlists.this.disown()
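A usage sketch under the function's stated precondition (the trained index is empty); all filenames are hypothetical:

# shard_0.index and shard_1.index are assumed to have been built
# from the same trained index
trained = faiss.read_index('trained.index')
merge_on_disk(trained, ['shard_0.index', 'shard_1.index'], 'merged.ivfdata')
faiss.write_index(trained, 'populated.index')  # refers to merged.ivfdata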
Example #6
def merge_ondisk(trained_index: faiss.Index, shard_fnames: List[str],
                 ivfdata_fname: str) -> None:
    """Add the contents of the indexes stored in shard_fnames into the index
    trained_index. The on-disk data is stored in ivfdata_fname"""
    assert not isinstance(
        trained_index,
        faiss.IndexIVFPQR), "IndexIVFPQR is not supported as an on disk index."
    # merge the images into an on-disk index
    # first load the inverted lists
    ivfs = []
    for fname in shard_fnames:
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        LOG.info("read " + fname)
        index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        index_ivf = faiss.extract_index_ivf(index)
        ivfs.append(index_ivf.invlists)

        # avoid that the invlists get deallocated with the index
        index_ivf.own_invlists = False

    # construct the output index
    index = trained_index
    index_ivf = faiss.extract_index_ivf(index)

    assert index.ntotal == 0, "works only on an empty index"

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(index_ivf.nlist, index_ivf.code_size,
                                         ivfdata_fname)

    # merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    LOG.info("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # now replace the inverted lists in the output index
    index.ntotal = index_ivf.ntotal = ntotal
    index_ivf.replace_invlists(invlists, True)
    invlists.this.disown()
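This appears to be the same helper distributed as faiss.contrib.ondisk.merge_ondisk; a sketch of the produce-then-merge flow, with hypothetical filenames:

from faiss.contrib.ondisk import merge_ondisk

index = faiss.read_index('trained.index')  # empty, trained, not IVFPQR
merge_ondisk(index, ['block_%d.index' % i for i in range(4)], 'merged.ivfdata')
faiss.write_index(index, 'populated.index')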
Example #7
def mergeIndexes(indexFolder, machineNumber, finalIndexFile):
    indexesToMerge = []
    indexFiles = os.listdir(os.path.dirname(indexFolder))
    for indexFile in indexFiles:
        parts = indexFile.split('_')
        if len(parts) > 2:
            mnum = int(parts[1][-3:])
            if mnum == machineNumber:
                fullFile = os.path.join(os.path.dirname(indexFolder),
                                        indexFile)
                print('mmaping ', fullFile)
                print('memory usage: ', psutil.virtual_memory().percent)
                index = faiss.read_index(fullFile, faiss.IO_FLAG_MMAP)
                indexesToMerge.append(index.invlists)
    print('adding final index')
    mainIndexFile = open(finalIndexFile, 'rb')
    mainIndexResource = Resource('indexparameters', mainIndexFile.read(),
                                 'application/octet-stream')
    mainIndex, emptyIndex, preproc, map, all_tmp_paths = deserializeIndex(
        mainIndexResource)
    indexesToMerge.append(mainIndex.invlists)
    print("Merging " + str(len(indexesToMerge)) +
          "Index Shards for final index")
    finalIndex = emptyIndex
    invlists = faiss.OnDiskInvertedLists(
        finalIndex.nlist, finalIndex.code_size,
        # `self` is not available in this module-level function; keep the
        # .ivfdata next to the final index instead
        os.path.join(os.path.dirname(finalIndexFile),
                     'merged_index.ivfdata'))
    ivf_vector = faiss.InvertedListsPtrVector()
    bar = progressbar.ProgressBar()
    for ivf in bar(indexesToMerge):
        ivf_vector.push_back(ivf)
    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    finalIndex.ntotal = ntotal
    finalIndex.replace_invlists(invlists)
    print('ntotal: ', finalIndex.ntotal)
    outName = "finalIndex_%03d" % machineNumber
    outPath = os.path.join(os.path.dirname(finalIndexFile), outName)
    binaryIndex = serializeIndex(finalIndexFile, map, machineNumber,
                                 all_tmp_paths)
    with open(outPath, 'wb') as of:
        of.write(binaryIndex)
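A hypothetical invocation of the function above; serializeIndex/deserializeIndex and the shard-naming scheme (machine number encoded in the second underscore-separated token of each filename) are project-specific assumptions:

mergeIndexes(indexFolder='shards/machine007/',
             machineNumber=7,
             finalIndexFile='indexes/final_bundle')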
Example #8
    def __init__(self,
                 invlist_fnames,
                 empty_index_fname,
                 masked_index_fname=None):

        self.indexes = indexes = []
        ilv = faiss.InvertedListsPtrVector()

        for fname in invlist_fnames:
            if os.path.exists(fname):
                print('reading', fname, end='\r', flush=True)
                index = faiss.read_index(fname)
                indexes.append(index)
                il = faiss.extract_index_ivf(index).invlists
            else:
                assert False, 'missing invlist file: ' + fname
            ilv.push_back(il)
        print()

        self.big_il = faiss.VStackInvertedLists(ilv.size(), ilv.data())
        if masked_index_fname:
            self.big_il_base = self.big_il
            print('loading', masked_index_fname)
            self.masked_index = faiss.read_index(
                masked_index_fname,
                faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
            self.big_il = faiss.MaskedInvertedLists(
                faiss.extract_index_ivf(self.masked_index).invlists,
                self.big_il_base)

        print('loading empty index', empty_index_fname)
        self.index = faiss.read_index(empty_index_fname)
        ntotal = self.big_il.compute_ntotal()

        print('replace invlists')
        index_ivf = faiss.extract_index_ivf(self.index)
        index_ivf.replace_invlists(self.big_il, False)
        index_ivf.ntotal = self.index.ntotal = ntotal
        index_ivf.parallel_mode = 1  # seems reasonable to do this all the time

        quantizer = faiss.downcast_index(index_ivf.quantizer)
        quantizer.hnsw.efSearch = 1024
Example #9
def mergeIndexList(indexList, emptyIndexPath, outPath, machineNum=""):
    # merge the shards into an on-disk index
    # first load the inverted lists
    ivfs = []
    outDir = os.path.dirname(outPath)
    bar = progressbar.ProgressBar()
    for indexFile in bar(indexList):
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        # print("read " + indexFile)
        index = faiss.read_index(indexFile, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(emptyIndexPath)

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    ivfDataStr = os.path.join(outDir, "merged_index_" + machineNum + "_.ivfdata")
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                         ivfDataStr)

    # merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print("write " + outPath)
    faiss.write_index(index, outPath)
    return ivfDataStr
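A sketch with hypothetical paths; the return value is the .ivfdata filename that the written index refers to, so it must travel with the index:

ivfdata = mergeIndexList(['shards/block_0.index', 'shards/block_1.index'],
                         'trained_empty.index', 'out/merged.index',
                         machineNum='007')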
Example #10
    def merge_IVFs(self,
                   index_path: Union[str, Path],
                   ivfdata_path: Union[str, Path],
                   ivfindex_paths: List[Union[str, Path]] = None) -> int:
        """
        An on-disk index must be built from existing subindexes. The
        inverted file list (IVF) from each subindex is merged into one
        disk-searchable .ivfdata file referenced by the .index file.

        Note: Use self.mv_index_and_ivfdata() to move these files.

        :param index_path: Path to output .index file
        :param ivfdata_path: Path to output .ivfdata file (on-disk searchable data)
        :param ivfindex_paths: Paths to indexes to be merged
        :return: Number of vectors indexed
        """
        # Collect IVF data from subindexes
        ivfs = list()
        if not ivfindex_paths:
            ivfindex_paths = list(self.subindex_path_totals.keys())
        for subindex_path in ivfindex_paths:
            index = faiss.read_index(p.abspath(subindex_path),
                                     faiss.IO_FLAG_MMAP)
            ivfs.append(index.invlists)
            index.own_invlists = False  # Prevents de-allocation
            del index

        # Prepare .ivfdata file
        index = self.load_base_idx()
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                             str(ivfdata_path))
        ivf_vector = faiss.InvertedListsPtrVector()
        for ivf in ivfs:
            ivf_vector.push_back(ivf)

        # Merge IVF data
        ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
        index.ntotal = ntotal
        index.replace_invlists(invlists)
        faiss.write_index(index, str(index_path))
        return int(ntotal)
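Hypothetical usage on an instance of the enclosing class, which is assumed to provide load_base_idx() and the subindex bookkeeping used above; with ivfindex_paths omitted, all known subindexes are merged:

# `builder` is a hypothetical instance of the enclosing class
n_indexed = builder.merge_IVFs('deploy/merged.index', 'deploy/merged.ivfdata')
print('indexed %d vectors' % n_indexed)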
Example #11
def dumpIndex(indexInMemory, indexOnDiskPath, shardCount):
    if shardCount == 0:
        faiss.write_index(indexInMemory, indexOnDiskPath)
    else:
        # keep both sets of invlists alive after their parent indexes go away
        indexInMemory.own_invlists = False
        diskIndex = faiss.read_index(indexOnDiskPath, faiss.IO_FLAG_MMAP)
        diskIndex.own_invlists = False

        invlists = faiss.OnDiskInvertedLists(diskIndex.nlist,
                                             diskIndex.code_size,
                                             'mergedIndex_tmp.ivfdata')
        ivf_vector = faiss.InvertedListsPtrVector()
        ivf_vector.push_back(indexInMemory.invlists)
        ivf_vector.push_back(diskIndex.invlists)
        ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
        indexInMemory.ntotal = ntotal
        indexInMemory.replace_invlists(invlists)
        print('Index on disk now has ', indexInMemory.ntotal, ' entries')
        faiss.write_index(indexInMemory, indexOnDiskPath)
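A hypothetical incremental loop over shards: shard 0 seeds the on-disk file, and each later shard is merged into it:

# `trained` and `batches` are hypothetical: a trained IVF index and
# an iterable of vector batches
for shard_no, vecs in enumerate(batches):
    shard = faiss.clone_index(trained)  # fresh trained IVF per shard
    shard.add(vecs)
    dumpIndex(shard, 'ondisk.index', shard_no)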
Example #12
    def make_mmap_index(self, base_index: BASE_INDEX, ids: np.ndarray,
                        embs: np.ndarray):
        # Get invlists
        index = faiss.clone_index(base_index)
        index.add_with_ids(embs, ids)
        ivf_vector = faiss.InvertedListsPtrVector()
        ivf_vector.push_back(index.invlists)
        index.own_invlists = False
        del index
        gc.collect()

        # Make MMAP ivfdata
        index_name = p.abspath(self.sub_dir / f'{self.seed_name}')
        invlists = faiss.OnDiskInvertedLists(base_index.nlist,
                                             base_index.code_size,
                                             f'{index_name}.ivfdata')
        ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

        # Link index to ivfdata and save
        index = faiss.clone_index(base_index)
        index.ntotal = ntotal
        index.replace_invlists(invlists)
        faiss.write_index(index, f'{index_name}.index')
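A hypothetical call on the owning object; self.sub_dir and self.seed_name come from the enclosing class, and ids/embs are parallel arrays of IDs and embeddings:

# `builder`, `base_index`, `ids`, and `embs` are all hypothetical here
builder.make_mmap_index(base_index, ids, embs)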
Example #13
if stage == 5:
    # merge the blocks into an on-disk index
    # first load the inverted lists (a shard count of 4 is assumed here;
    # the original snippet was truncated before this loop)
    ivfs = []
    for bno in range(4):
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        index = faiss.read_index(tmpdir + "block_%d.index" % bno,
                                 faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(tmpdir + "trained.index")

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                         tmpdir + "merged_index.ivfdata")

    # merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print("write " + tmpdir + "populated.index")
    faiss.write_index(index, tmpdir + "populated.index")

if stage == 6:
    # perform a search from disk
Example #14
    parser.add_argument('--l0', type=int, default=0)
    parser.add_argument('--l1', type=int, default=-1)

    parser.add_argument('--nt', default=-1, type=int, help='nb threads')

    parser.add_argument('--output',
                        required=True,
                        help='output index filename')
    parser.add_argument('--outputIL', help='output invfile filename')

    args = parser.parse_args()

    if args.nt != -1:
        print('set nb of threads to', args.nt)
        faiss.omp_set_num_threads(args.nt)

    ils = faiss.InvertedListsPtrVector()
    ils_dont_dealloc = []

    pool = ThreadPool(20)

    def load_index(fname):
        print("loading", fname)
        try:
            index = faiss.read_index(
                fname, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
        except RuntimeError as e:
            print('could not load %s: %s' % (fname, e))
            return fname, None

        print("  %d entries" % index.ntotal)
        return fname, index