def __init__(self, name, approximator):
    """Initialise a FAISS-backed approximate retriever.

    Forwards ``name`` and ``approximator`` to the parent initialiser with
    the algorithm label ``"FAISS"``, then creates the empty per-instance
    state.

    Args:
        name: Retriever name, passed through to the parent class.
        approximator: Object exposing an ``index_path`` attribute.
    """
    super(FaissApproximateRetriever, self).__init__(
        name=name,
        approximator=approximator,
        algorithm="FAISS",
    )

    # Collapse doubled slashes in the approximator's index path.
    raw_index_path = str(approximator.index_path)
    self.index_path = raw_index_path.replace('//', '/')

    # Inverted lists are tracked as a Python list and as a FAISS pointer
    # vector; both start empty.
    self.ivfs = []
    self.ivf_vector = faiss.InvertedListsPtrVector()

    # Random identifier with dashes replaced by underscores.
    fresh_id = str(uuid.uuid4())
    self.uuid = fresh_id.replace('-', '_')

    self.faiss_index = None
def test_slice_vstack(self):
    # Cut the inverted lists of an IVF index into three slices of ten lists,
    # stack them back together, and check that searching the rebuilt index
    # returns exactly the distances and ids of the original index.
    d, nb, nq, nt = 10, 1000, 100, 200
    xt, xb, xq = get_dataset_2(d, nt, nb, nq)

    quantizer = faiss.IndexFlatL2(d)
    reference = faiss.IndexIVFFlat(quantizer, d, 30)
    reference.train(xt)
    reference.add(xb)
    Dref, Iref = reference.search(xq, 10)

    source_lists = reference.invlists
    # The Python list is never read back; presumably it keeps the slice
    # wrappers referenced while the pointer vector is in use.
    slices = []
    slice_ptrs = faiss.InvertedListsPtrVector()
    for start in range(0, 30, 10):
        piece = faiss.SliceInvertedLists(source_lists, start, start + 10)
        slices.append(piece)
        slice_ptrs.push_back(piece)

    stacked = faiss.VStackInvertedLists(slice_ptrs.size(), slice_ptrs.data())

    rebuilt = faiss.IndexIVFFlat(quantizer, d, 30)
    rebuilt.replace_invlists(stacked)
    rebuilt.ntotal = reference.ntotal

    D, I = rebuilt.search(xq, 10)
    assert np.all(D == Dref)
    assert np.all(I == Iref)
def __init__(self, name, approximator):
    """Create the retriever's empty state.

    Args:
        name: Name stored on the instance.
        approximator: Object exposing an ``index_path`` attribute.
    """
    self.name = name

    # Collapse doubled slashes in the approximator's index path.
    raw_index_path = str(approximator.index_path)
    self.index_path = raw_index_path.replace('//', '/')

    # FAISS-side containers, all empty at construction time.
    self.ivfs = []
    self.ivf_vector = faiss.InvertedListsPtrVector()
    self.faiss_index = None

    # Random identifier with dashes replaced by underscores.
    fresh_id = str(uuid.uuid4())
    self.uuid = fresh_id.replace('-', '_')

    # Bookkeeping structures: an interval tree, a set of loaded entries
    # and a counter starting at zero.
    self.tree = IntervalTree()
    self.loaded_entries = set()
    self.findex = 0
def merge_indexes(subindex_dir, trained_index_path, target_index_path,
                  target_idx2id_path, target_inv_path):
    """Merge the sub-indexes found in ``subindex_dir`` into one on-disk index.

    Args:
        subindex_dir: Directory scanned for ``.hdf5`` and ``.faiss`` files.
        trained_index_path: Path of the index read as the output skeleton.
        target_index_path: Where the merged index is written.
        target_idx2id_path: HDF5 file receiving one group per input file.
        target_inv_path: File backing the merged on-disk inverted lists.
    """
    hdf5_files = []
    faiss_files = []
    for entry in os.listdir(subindex_dir):
        full_path = os.path.join(subindex_dir, entry)
        if entry.endswith('.hdf5'):
            hdf5_files.append(full_path)
        elif entry.endswith('.faiss'):
            faiss_files.append(full_path)

    # Copy every idx2id file into a group named after its 'offset' attribute.
    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as merged_h5:
        for hdf5_file in tqdm(hdf5_files, desc='copying idx2id'):
            with h5py.File(hdf5_file, 'r') as part_h5:
                group_name = str(part_h5.attrs['offset'])
                target_group = merged_h5.create_group(group_name)
                for key in ('doc', 'para', 'word'):
                    target_group.create_dataset(key, data=part_h5[key])

    print('loading invlists')
    shard_lists = []
    for faiss_file in tqdm(faiss_files, desc='loading invlists'):
        # IO_FLAG_MMAP avoids loading the data, so the combined inverted
        # lists may be larger than the available RAM.
        shard = faiss.read_index(faiss_file, faiss.IO_FLAG_MMAP)
        shard_lists.append(shard.invlists)
        # Keep the inverted lists alive once the shard object goes away.
        shard.own_invlists = False

    # The trained index is the skeleton of the merged result.
    merged = faiss.read_index(trained_index_path)

    # Output inverted lists, backed by target_inv_path.
    disk_lists = faiss.OnDiskInvertedLists(
        merged.nlist, merged.code_size, target_inv_path)

    print('merging')
    ptr_vector = faiss.InvertedListsPtrVector()
    for lists in tqdm(shard_lists):
        ptr_vector.push_back(lists)
    print("merge %d inverted lists " % ptr_vector.size())
    ntotal = disk_lists.merge_from(ptr_vector.data(), ptr_vector.size())
    print(ntotal)

    # Plug the merged lists into the output index and persist it.
    merged.ntotal = ntotal
    merged.replace_invlists(disk_lists)
    print('writing index')
    faiss.write_index(merged, target_index_path)
def merge_on_disk(trained_index: faiss.Index, shard_fnames: List[str], ivfdata_fname: str) -> None:
    """Fill ``trained_index`` with the contents of the given shard indexes.

    The merged inverted lists are stored on disk in ``ivfdata_fname``.

    Args:
        trained_index: Trained index that receives the data; must be empty.
        shard_fnames: Filenames of the partial indexes to merge.
        ivfdata_fname: Filename backing the on-disk inverted lists.
    """

    def _read_shard_invlists(fname):
        # Memory-map the shard so the data is not actually loaded; the sum
        # of all inverted lists may then exceed the available RAM.
        shard = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        shard_ivf = faiss.extract_index_ivf(shard)
        lists = shard_ivf.invlists
        # Stop the lists from being deallocated together with the shard.
        shard_ivf.own_invlists = False
        return lists

    shard_lists = [_read_shard_invlists(fname) for fname in shard_fnames]

    target_ivf = faiss.extract_index_ivf(trained_index)
    assert trained_index.ntotal == 0, 'The trained index should be empty'

    # Destination inverted lists, written to ivfdata_fname.
    disk_lists = faiss.OnDiskInvertedLists(
        target_ivf.nlist, target_ivf.code_size, ivfdata_fname)

    ptr_vector = faiss.InvertedListsPtrVector()
    for lists in shard_lists:
        ptr_vector.push_back(lists)
    n_total = disk_lists.merge_from(ptr_vector.data(), ptr_vector.size())

    # Update both the outer index and its IVF component, then install the
    # merged lists (second argument True) and release the Python-side
    # ownership of the wrapper.
    trained_index.ntotal = n_total
    target_ivf.ntotal = n_total
    target_ivf.replace_invlists(disk_lists, True)
    disk_lists.this.disown()
def merge_ondisk(trained_index: faiss.Index, shard_fnames: List[str], ivfdata_fname: str) -> None:
    """Merge the shard indexes into ``trained_index`` as an on-disk index.

    Args:
        trained_index: Empty, trained index that receives the merged data.
            ``faiss.IndexIVFPQR`` instances are rejected.
        shard_fnames: Filenames of the shard indexes to merge.
        ivfdata_fname: Filename in which the on-disk data is stored.
    """
    assert not isinstance(
        trained_index, faiss.IndexIVFPQR
    ), "IndexIVFPQR is not supported as an on disk index."

    # Step 1: gather the inverted lists of every shard.
    collected = []
    for shard_fname in shard_fnames:
        LOG.info("read " + shard_fname)
        # IO_FLAG_MMAP avoids actually loading the data, so the total size
        # of the inverted lists can exceed the available RAM.
        shard = faiss.read_index(shard_fname, faiss.IO_FLAG_MMAP)
        shard_ivf = faiss.extract_index_ivf(shard)
        collected.append(shard_ivf.invlists)
        # Do not let the lists be deallocated with the shard index.
        shard_ivf.own_invlists = False

    # Step 2: locate the IVF component of the output index.
    output_ivf = faiss.extract_index_ivf(trained_index)
    assert trained_index.ntotal == 0, "works only on empty index"

    # Step 3: create the output inverted lists backed by ivfdata_fname.
    ondisk = faiss.OnDiskInvertedLists(
        output_ivf.nlist, output_ivf.code_size, ivfdata_fname)

    # Step 4: merge everything into the on-disk lists.
    pointers = faiss.InvertedListsPtrVector()
    for lists in collected:
        pointers.push_back(lists)
    LOG.info("merge %d inverted lists " % pointers.size())
    ntotal = ondisk.merge_from(pointers.data(), pointers.size())

    # Step 5: install the merged lists in the output index.
    trained_index.ntotal = ntotal
    output_ivf.ntotal = ntotal
    output_ivf.replace_invlists(ondisk, True)
    ondisk.this.disown()
def mergeIndexes(indexFolder, machineNumber, finalIndexFile):
    """Merge one machine's index shards and the main index into an on-disk index.

    Shard files are looked up in ``os.path.dirname(indexFolder)``. A file is
    selected when its name has more than two ``_``-separated parts and the
    last three characters of the second part parse to ``machineNumber``.

    The merged inverted lists are written to ``merged_index.ivfdata`` next
    to ``finalIndexFile``, and the serialized result is written to
    ``finalIndex_<machineNumber as 3 digits>`` in that same directory.

    Args:
        indexFolder: Path whose parent directory contains the shard files.
        machineNumber: Integer machine id used to select shards.
        finalIndexFile: Path of the serialized main index.
    """
    shardDir = os.path.dirname(indexFolder)
    outDir = os.path.dirname(finalIndexFile)

    indexesToMerge = []
    for indexFile in os.listdir(shardDir):
        parts = indexFile.split('_')
        if len(parts) <= 2:
            continue
        mnum = int(parts[1][-3:])
        if mnum != machineNumber:
            continue
        fullFile = os.path.join(shardDir, indexFile)
        print('mmaping ', fullFile)
        print('memory usage: ', psutil.virtual_memory().percent)
        index = faiss.read_index(fullFile, faiss.IO_FLAG_MMAP)
        indexesToMerge.append(index.invlists)
        # Without this the inverted lists are freed as soon as `index` is
        # rebound on the next iteration, leaving dangling pointers in the
        # vector handed to merge_from().
        index.own_invlists = False

    print('adding final index')
    # Read through a context manager so the file handle is always closed.
    with open(finalIndexFile, 'rb') as mainIndexFile:
        mainIndexBytes = mainIndexFile.read()
    mainIndexResource = Resource('indexparameters', mainIndexBytes,
                                 'application/octet-stream')
    mainIndex, emptyIndex, _preproc, idMap, all_tmp_paths = deserializeIndex(
        mainIndexResource)
    # mainIndex stays referenced until the function returns, so its
    # inverted lists remain valid during the merge.
    indexesToMerge.append(mainIndex.invlists)

    print("Merging " + str(len(indexesToMerge)) + "Index Shards for final index")
    finalIndex = emptyIndex
    # code_size is taken from the output index itself rather than from the
    # shard loop variable, which is undefined when no shard matched.
    # The ivfdata file is placed beside finalIndexFile; the previous
    # expression referenced `self`, which does not exist in this function.
    invlists = faiss.OnDiskInvertedLists(
        finalIndex.nlist, finalIndex.code_size,
        os.path.join(outDir, 'merged_index.ivfdata'))

    ivf_vector = faiss.InvertedListsPtrVector()
    bar = progressbar.ProgressBar()
    for ivf in bar(indexesToMerge):
        ivf_vector.push_back(ivf)
    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    finalIndex.ntotal = ntotal
    finalIndex.replace_invlists(invlists)
    print('ntotal: ', finalIndex.ntotal)

    outName = "finalIndex_%03d" % machineNumber
    outPath = os.path.join(outDir, outName)
    # NOTE(review): the first argument is the input file path, not the
    # merged `finalIndex` object - confirm against serializeIndex's
    # signature that this is intended.
    binaryIndex = serializeIndex(finalIndexFile, idMap, machineNumber,
                                 all_tmp_paths)
    with open(outPath, 'wb') as of:
        of.write(binaryIndex)
def __init__(self, invlist_fnames, empty_index_fname,
             masked_index_fname=None):
    """Build one searchable index on top of several inverted-list files.

    Args:
        invlist_fnames: Index files whose inverted lists are stacked; every
            file must exist.
        empty_index_fname: Index file that receives the stacked lists.
        masked_index_fname: Optional index file whose inverted lists are
            combined with the stacked ones through ``MaskedInvertedLists``.
    """
    # The loaded indexes are kept on the instance alongside the stacked lists.
    self.indexes = loaded = []
    list_ptrs = faiss.InvertedListsPtrVector()

    for fname in invlist_fnames:
        if not os.path.exists(fname):
            assert False
        else:
            print('reading', fname, end='\r', flush=True)
            part = faiss.read_index(fname)
            loaded.append(part)
            il = faiss.extract_index_ivf(part).invlists
        list_ptrs.push_back(il)
    print()

    self.big_il = faiss.VStackInvertedLists(list_ptrs.size(), list_ptrs.data())

    if masked_index_fname:
        self.big_il_base = self.big_il
        print('loading', masked_index_fname)
        mmap_read_only = faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
        self.masked_index = faiss.read_index(masked_index_fname, mmap_read_only)
        masked_lists = faiss.extract_index_ivf(self.masked_index).invlists
        self.big_il = faiss.MaskedInvertedLists(masked_lists, self.big_il_base)

    print('loading empty index', empty_index_fname)
    self.index = faiss.read_index(empty_index_fname)
    total = self.big_il.compute_ntotal()

    print('replace invlists')
    ivf = faiss.extract_index_ivf(self.index)
    # Second argument False: the lists are installed without transferring
    # ownership to the index.
    ivf.replace_invlists(self.big_il, False)
    ivf.ntotal = self.index.ntotal = total
    # seems reasonable to do this all the time
    ivf.parallel_mode = 1

    coarse = faiss.downcast_index(ivf.quantizer)
    coarse.hnsw.efSearch = 1024
def mergeIndexList(indexList, emptyIndexPath, outPath, machineNum=""):
    """Merge several index files into one on-disk index.

    Args:
        indexList: Paths of the index files to merge.
        emptyIndexPath: Path of the index read as the output skeleton.
        outPath: Where the merged index is written.
        machineNum: Tag inserted into the ivfdata filename; converted with
            ``str()`` so integers are accepted too.

    Returns:
        Path of the ivfdata file holding the merged inverted lists, located
        in the same directory as ``outPath``.
    """
    outDir = os.path.dirname(outPath)

    # First load the inverted lists.
    ivfs = []
    bar = progressbar.ProgressBar()
    for indexFile in bar(indexList):
        # IO_FLAG_MMAP avoids actually loading the data, so the total size
        # of the inverted lists can exceed the available RAM.
        index = faiss.read_index(indexFile, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)
        # Avoid that the invlists get deallocated with the index.
        index.own_invlists = False

    # Construct the output index.
    index = faiss.read_index(emptyIndexPath)

    # Join with os.path.join: os.path.dirname() returns no trailing
    # separator, so plain concatenation would fuse the directory name into
    # the filename and place the file outside outDir.
    ivfDataStr = os.path.join(
        outDir, "merged_index_" + str(machineNum) + "_.ivfdata")
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                         ivfDataStr)

    # Merge all the inverted lists.
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)
    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # Now replace the inverted lists in the output index.
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print("write " + outPath)
    faiss.write_index(index, outPath)
    return ivfDataStr
def merge_IVFs(self, index_path: Union[str, Path], ivfdata_path: Union[str, Path],
               ivfindex_paths: List[Union[str, Path]] = None) -> int:
    """
    An on-disk index must be built from existing subindexes.
    The inverted file list (IVF) from each subindex is merged into one
    disk-searchable .ivfdata file referenced by the .index file.

    Note: Use self.mv_index_and_ivfdata() to move these files.

    :param index_path: Path to output.index file
    :param ivfdata_path: Path to output.ivfdata file (on-disk searchable data)
    :param ivfindex_paths: Paths to indexes to be merged; when empty or
        None, the keys of self.subindex_path_totals are used
    :return: Number of vectors indexed
    """
    # Collect IVF data from subindexes
    if not ivfindex_paths:
        ivfindex_paths = list(self.subindex_path_totals.keys())

    ivfs = []
    for subindex_path in ivfindex_paths:
        index = faiss.read_index(p.abspath(subindex_path), faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)
        index.own_invlists = False  # Prevents de-allocation
        del index

    # Prepare .ivfdata file
    index = self.load_base_idx()
    # The FAISS bindings take C strings, so Path arguments (allowed by the
    # signature) are converted explicitly before being handed over.
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                         str(ivfdata_path))
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    # Merge IVF data
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    index.ntotal = ntotal
    index.replace_invlists(invlists)
    faiss.write_index(index, str(index_path))

    return int(ntotal)
def dumpIndex(indexInMemory, indexOnDiskPath, shardCount):
    """Write ``indexInMemory`` to disk, merging with the stored index if needed.

    When ``shardCount`` is 0 the in-memory index is written as-is. Otherwise
    its inverted lists are merged with those of the index already stored at
    ``indexOnDiskPath`` into ``mergedIndex_tmp.ivfdata`` and the combined
    index replaces the stored one.

    NOTE(review): the ivfdata filename is fixed; if the stored index already
    refers to that same file from an earlier call, it is read and rewritten
    in the same merge - confirm this is safe for the callers.

    Args:
        indexInMemory: Index holding the new entries.
        indexOnDiskPath: Path of the index file to create or update.
        shardCount: 0 for a plain write; any other value triggers a merge.
    """
    if shardCount == 0:
        faiss.write_index(indexInMemory, indexOnDiskPath)
        return

    # Detach the in-memory lists from their index.
    ivfs = [indexInMemory.invlists]
    indexInMemory.own_invlists = False

    # Memory-map the stored index and detach its lists as well.
    diskIndex = faiss.read_index(indexOnDiskPath, faiss.IO_FLAG_MMAP)
    ivfs.append(diskIndex.invlists)
    diskIndex.own_invlists = False

    merged_lists = faiss.OnDiskInvertedLists(
        diskIndex.nlist, diskIndex.code_size, 'mergedIndex_tmp.ivfdata')

    ptr_vector = faiss.InvertedListsPtrVector()
    for source in (indexInMemory, diskIndex):
        ptr_vector.push_back(source.invlists)
    merged_total = merged_lists.merge_from(ptr_vector.data(), ptr_vector.size())

    indexInMemory.ntotal = merged_total
    indexInMemory.replace_invlists(merged_lists)
    print('Index on disk now has ', indexInMemory.ntotal, ' entries')
    faiss.write_index(indexInMemory, indexOnDiskPath)
def make_mmap_index(self, base_index: BASE_INDEX, ids: np.array, embs: np.array):
    """Build an index with on-disk inverted lists from ``embs`` and save it.

    Two files are produced under ``self.sub_dir``, both named after
    ``self.seed_name``: ``<name>.ivfdata`` with the inverted lists and
    ``<name>.index`` with the index that refers to them.

    Args:
        base_index: Index cloned both for filling and for the final output.
        ids: Ids passed to ``add_with_ids``.
        embs: Vectors passed to ``add_with_ids``.
    """
    # Fill a throw-away clone and keep only its inverted lists.
    filled = faiss.clone_index(base_index)
    filled.add_with_ids(embs, ids)
    ptr_vector = faiss.InvertedListsPtrVector()
    ptr_vector.push_back(filled.invlists)
    # The lists must survive the clone being deleted below.
    filled.own_invlists = False
    del filled
    gc.collect()

    # Merge the lists into an ivfdata file.
    index_name = p.abspath(self.sub_dir / f'{self.seed_name}')
    ivfdata_file = f'{index_name}.ivfdata'
    disk_lists = faiss.OnDiskInvertedLists(
        base_index.nlist, base_index.code_size, ivfdata_file)
    ntotal = disk_lists.merge_from(ptr_vector.data(), ptr_vector.size())

    # Attach the on-disk lists to a fresh clone and write it out.
    linked = faiss.clone_index(base_index)
    linked.ntotal = ntotal
    linked.replace_invlists(disk_lists)
    index_file = f'{index_name}.index'
    faiss.write_index(linked, index_file)
faiss.IO_FLAG_MMAP) ivfs.append(index.invlists) # avoid that the invlists get deallocated with the index index.own_invlists = False # construct the output index index = faiss.read_index(tmpdir + "trained.index") # prepare the output inverted lists. They will be written # to merged_index.ivfdata invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size, tmpdir + "merged_index.ivfdata") # merge all the inverted lists ivf_vector = faiss.InvertedListsPtrVector() for ivf in ivfs: ivf_vector.push_back(ivf) print("merge %d inverted lists " % ivf_vector.size()) ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size()) # now replace the inverted lists in the output index index.ntotal = ntotal index.replace_invlists(invlists) print("write " + tmpdir + "populated.index") faiss.write_index(index, tmpdir + "populated.index") if stage == 6: # perform a search from disk
# Remaining command-line options; `parser` is created before this span.
parser.add_argument('--l0', type=int, default=0)
parser.add_argument('--l1', type=int, default=-1)
# NOTE(review): no type= is given here, so a user-supplied value arrives as
# a string while the default stays the integer -1.
parser.add_argument('--nt', default=-1, help='nb threads')
parser.add_argument('--output', required=True, help='output index filename')
parser.add_argument('--outputIL', help='output invfile filename')

args = parser.parse_args()

if args.nt != -1:
    # NOTE(review): only a message is printed in the visible code; nothing
    # here applies the thread count - confirm whether that happens elsewhere.
    print('set nb of threads to', args.nt)

# Pointer vector for the inverted lists, plus a Python list which, judging
# by its name, keeps objects referenced so they are not deallocated.
ils = faiss.InvertedListsPtrVector()
ils_dont_dealloc = []

# Pool of 20 threads; presumably used to run load_index over many files.
pool = ThreadPool(20)


def load_index(fname):
    """Read the index stored in ``fname`` memory-mapped and read-only.

    Returns a ``(fname, index)`` tuple. If reading raises ``RuntimeError``
    the error is printed and ``(fname, None)`` is returned instead.
    """
    print("loading", fname)
    try:
        index = faiss.read_index(
            fname, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
    except RuntimeError as e:
        # Best effort: report the failure and let the caller skip this file.
        print('could not load %s: %s' % (fname, e))
        return fname, None
    print(" %d entries" % index.ntotal)
    return fname, index