def main():
    parser = argparse.ArgumentParser(
        description='make index for a subset of the data')

    def aa(*args, **kwargs):
        group.add_argument(*args, **kwargs)

    group = parser.add_argument_group('index type')
    aa('--inputindex', default=workdir + 'trained.faissindex',
       help='empty input index to fill in')
    aa('--nt', default=-1, type=int, help='nb of openmp threads to use')

    group = parser.add_argument_group('db options')
    aa('--input', default=deep1bdir + "base.fvecs")
    aa('--bs', default=2**18, type=int, help='batch size for db access')
    aa('--i0', default=0, type=int, help='lower bound to index')
    aa('--i1', default=-1, type=int, help='upper bound of vectors to index')

    group = parser.add_argument_group('output')
    aa('-o', default='/tmp/x', help='output index')
    aa('--keepquantizer', default=False, action='store_true',
       help='by default we remove the data from the quantizer to save space')

    args = parser.parse_args()
    print('args=', args)

    print('start accessing data')
    src = produce_batches(args)

    print('loading index', args.inputindex)
    index = faiss.read_index(args.inputindex)

    if args.nt != -1:
        faiss.omp_set_num_threads(args.nt)

    t0 = time.time()
    ntot = 0
    for ids, x in rate_limited_iter(src):
        print('add %d:%d (%.3f s)' % (ntot, ntot + ids.size, time.time() - t0))
        index.add_with_ids(np.ascontiguousarray(x, dtype='float32'), ids)
        ntot += ids.size

    index_ivf = faiss.extract_index_ivf(index)
    print('invlists stats: imbalance %.3f' % index_ivf.invlists.imbalance_factor())
    index_ivf.invlists.print_stats()

    if not args.keepquantizer:
        print('resetting quantizer content')
        index_ivf = faiss.extract_index_ivf(index)
        index_ivf.quantizer.reset()

    print('store output', args.o)
    faiss.write_index(index, args.o)
def get_cluster_ids(self, list_num: int) -> np.ndarray:
    """Returns the vector IDs stored in inverted list `list_num`."""
    # TODO: assert IVF
    assert self.is_trained
    # This fixes problem with SWIG and numpy int
    list_num = int(list_num)
    index = faiss.read_index(str(self.tempdir / self.MERGED_INDEX_NAME))
    # Get the IVF from potentially opaque index
    invlists = faiss.extract_index_ivf(index).invlists
    list_size = invlists.list_size(list_num)
    list_ids = np.zeros(list_size, dtype=np.int64)
    temp_ids = invlists.get_ids(list_num)
    # Need to copy since memory will be deallocated along with the invlist.
    faiss.memcpy(faiss.swig_ptr(list_ids), temp_ids, list_ids.nbytes)
    invlists.release_ids(list_num, temp_ids)
    if self.multi_id:
        list_ids = self._invert_cantor_pairing_vec(list_ids)
    return list_ids
def __init__(self, index, sub_indexes):
    self.index = index
    self.code_size = faiss.extract_index_ivf(index.index).code_size
    self.sub_indexes = sub_indexes
    self.ni = len(self.sub_indexes)
    # pool of threads. Each thread manages one sub-index.
    self.pool = ThreadPool(self.ni)
    self.verbose = False
def merge_on_disk(trained_index: faiss.Index, shard_fnames: List[str],
                  ivfdata_fname: str) -> None:
    """
    Adds the contents of the indexes stored in shard_fnames into the index
    trained_index. The on-disk data is stored in ivfdata_fname.

    Args:
        trained_index: The trained index to add the data to.
        shard_fnames: A list of the partial index filenames.
        ivfdata_fname: The filename for the on-disk extracted data.
    """
    # Load the inverted lists
    ivfs = []
    for fname in shard_fnames:
        # The IO_FLAG_MMAP is to avoid actually loading the data, and thus the
        # total size of the inverted lists can exceed the available RAM
        index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        index_ivf = faiss.extract_index_ivf(index)
        ivfs.append(index_ivf.invlists)
        # Avoid deallocating the invlists with the index
        index_ivf.own_invlists = False

    # Construct the output index
    index = trained_index
    index_ivf = faiss.extract_index_ivf(index)

    assert index.ntotal == 0, 'The trained index should be empty'

    # Prepare the output inverted lists, which are written to ivfdata_fname.
    invlists = faiss.OnDiskInvertedLists(index_ivf.nlist, index_ivf.code_size,
                                         ivfdata_fname)

    # Merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    n_total = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # Replace the inverted lists in the output index
    index.ntotal = index_ivf.ntotal = n_total
    index_ivf.replace_invlists(invlists, True)
    invlists.this.disown()
def merge_ondisk(trained_index: faiss.Index, shard_fnames: List[str],
                 ivfdata_fname: str) -> None:
    """Add the contents of the indexes stored in shard_fnames into the index
    trained_index. The on-disk data is stored in ivfdata_fname."""
    assert not isinstance(
        trained_index, faiss.IndexIVFPQR
    ), "IndexIVFPQR is not supported as an on disk index."
    # merge the images into an on-disk index
    # first load the inverted lists
    ivfs = []
    for fname in shard_fnames:
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        LOG.info("read " + fname)
        index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        index_ivf = faiss.extract_index_ivf(index)
        ivfs.append(index_ivf.invlists)
        # avoid that the invlists get deallocated with the index
        index_ivf.own_invlists = False

    # construct the output index
    index = trained_index
    index_ivf = faiss.extract_index_ivf(index)

    assert index.ntotal == 0, "works only on empty index"

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(index_ivf.nlist, index_ivf.code_size,
                                         ivfdata_fname)

    # merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    LOG.info("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # now replace the inverted lists in the output index
    index.ntotal = index_ivf.ntotal = ntotal
    index_ivf.replace_invlists(invlists, True)
    invlists.this.disown()
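# Hedged usage sketch for merge_ondisk (not part of the original snippets).
# The filenames below are hypothetical placeholders; the shard indexes are
# assumed to have been built from copies of the same empty, trained IVF index.
# After the merge, the populated index references the on-disk inverted lists
# in merged_index.ivfdata, so it is written next to that file and can later be
# reopened with faiss.IO_FLAG_ONDISK_SAME_DIR.
def example_merge_ondisk_usage():
    trained_index = faiss.read_index("trained.index")    # empty, trained IVF index
    shard_fnames = ["block_0.index", "block_1.index"]    # shards with vectors added
    merge_ondisk(trained_index, shard_fnames, "merged_index.ivfdata")
    faiss.write_index(trained_index, "populated.index")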
def train_index(start_data, quantizer_path, trained_index_path, num_clusters,
                fine_quant='SQ4', cuda=False, hnsw=False):
    ds = start_data.shape[1]
    quantizer = faiss.IndexFlatIP(ds)

    # Used only for reimplementation
    if fine_quant == 'SQ4':
        start_index = faiss.IndexIVFScalarQuantizer(
            quantizer, ds, num_clusters, faiss.ScalarQuantizer.QT_4bit,
            faiss.METRIC_INNER_PRODUCT)

    # Default index type
    elif 'OPQ' in fine_quant:
        code_size = int(fine_quant[fine_quant.index('OPQ') + 3:])
        if hnsw:
            start_index = faiss.IndexHNSWPQ(ds, "HNSW32,PQ96",
                                            faiss.METRIC_INNER_PRODUCT)
        else:
            opq_matrix = faiss.OPQMatrix(ds, code_size)
            opq_matrix.niter = 10
            sub_index = faiss.IndexIVFPQ(quantizer, ds, num_clusters,
                                         code_size, 8,
                                         faiss.METRIC_INNER_PRODUCT)
            start_index = faiss.IndexPreTransform(opq_matrix, sub_index)
    elif 'none' in fine_quant:
        start_index = faiss.IndexFlatIP(ds)
    else:
        raise ValueError(fine_quant)

    start_index.verbose = False
    if cuda:
        # Convert to GPU index
        res = faiss.StandardGpuResources()
        co = faiss.GpuClonerOptions()
        co.useFloat16 = True
        gpu_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)
        gpu_index.verbose = False

        # Train on GPU and back to CPU
        gpu_index.train(start_data)
        start_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        start_index.train(start_data)

    # Make sure to set direct map again
    if 'none' not in fine_quant:
        index_ivf = faiss.extract_index_ivf(start_index)
        index_ivf.make_direct_map()
        index_ivf.set_direct_map_type(faiss.DirectMap.Hashtable)

    faiss.write_index(start_index, trained_index_path)
def __init__(self, invlist_fnames, empty_index_fname,
             masked_index_fname=None):
    self.indexes = indexes = []
    ilv = faiss.InvertedListsPtrVector()

    for fname in invlist_fnames:
        if os.path.exists(fname):
            print('reading', fname, end='\r', flush=True)
            index = faiss.read_index(fname)
            indexes.append(index)
            il = faiss.extract_index_ivf(index).invlists
        else:
            assert False
        ilv.push_back(il)
    print()

    self.big_il = faiss.VStackInvertedLists(ilv.size(), ilv.data())
    if masked_index_fname:
        self.big_il_base = self.big_il
        print('loading', masked_index_fname)
        self.masked_index = faiss.read_index(
            masked_index_fname,
            faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
        self.big_il = faiss.MaskedInvertedLists(
            faiss.extract_index_ivf(self.masked_index).invlists,
            self.big_il_base)

    print('loading empty index', empty_index_fname)
    self.index = faiss.read_index(empty_index_fname)
    ntotal = self.big_il.compute_ntotal()

    print('replace invlists')
    index_ivf = faiss.extract_index_ivf(self.index)
    index_ivf.replace_invlists(self.big_il, False)
    index_ivf.ntotal = self.index.ntotal = ntotal
    index_ivf.parallel_mode = 1   # seems reasonable to do this all the time

    quantizer = faiss.downcast_index(index_ivf.quantizer)
    quantizer.hnsw.efSearch = 1024
def test_ivf_train_2level(self):
    " check 2-level clustering with IVF training "
    ds = datasets.SyntheticDataset(32, 10000, 1000, 200)

    index = faiss.index_factory(ds.d, "PCA16,IVF100,SQ8")
    faiss.extract_index_ivf(index).nprobe = 10
    index.train(ds.get_train())
    index.add(ds.get_database())
    Dref, Iref = index.search(ds.get_queries(), 1)

    index = faiss.index_factory(ds.d, "PCA16,IVF100,SQ8")
    faiss.extract_index_ivf(index).nprobe = 10
    clustering.train_ivf_index_with_2level(index, ds.get_train(), verbose=True)
    index.add(ds.get_database())
    Dnew, Inew = index.search(ds.get_queries(), 1)

    # normally 47 / 200 differences
    ndiff = (Iref != Inew).sum()
    self.assertLess(ndiff, 50)
def get_cluster_sizes(self) -> List[int]:
    """Returns the number of vectors assigned to each cluster."""
    # TODO: assert IVF
    assert self.is_trained
    index = faiss.read_index(str(self.tempdir / self.MERGED_INDEX_NAME))
    # Get the IVF from potentially opaque index
    invlists = faiss.extract_index_ivf(index).invlists
    list_sizes = [invlists.list_size(i) for i in range(invlists.nlist)]
    return list_sizes
def __init__(self, phrase_dump_dir, index_path, idx2id_path, cuda=False,
             logging_level=logging.INFO):
    self.phrase_dump_dir = phrase_dump_dir

    # Read index
    self.index = {}
    logger.info(f'Reading {index_path}')
    self.index = faiss.read_index(index_path, faiss.IO_FLAG_ONDISK_SAME_DIR)
    self.max_idx = 1e8 if 'PQ' not in index_path else 1e9
    logger.info(
        f'index ntotal: {self.index.ntotal} | PQ: {"PQ" in index_path}')

    # Read idx2id
    self.idx_f = {}
    logger.info('Load idx2id on memory')
    self.idx_f = self.load_idx_f(idx2id_path)
    self.offset = None
    self.scale = None
    self.doc_groups = None

    # Options
    logger.setLevel(logging_level)
    self.num_docs_list = []
    self.cuda = cuda
    if self.cuda:
        assert torch.cuda.is_available(), f"Cuda availability {torch.cuda.is_available()}"
        self.device = torch.device('cuda')
        logger.info("Load IVF on GPU")
        index_ivf = faiss.extract_index_ivf(self.index)
        quantizer = index_ivf.quantizer
        quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
        index_ivf.quantizer = quantizer_gpu
    else:
        self.device = torch.device("cpu")

    # Load metadata on RAM if possible
    doc_group_path = os.path.join(
        self.phrase_dump_dir[:self.phrase_dump_dir.index('/phrase')],
        'dph_meta_compressed.pkl')  # 1 min
    if os.path.exists(doc_group_path) and ('PQ' in index_path):
        logger.info(
            f"Loading metadata on RAM from {doc_group_path} (for PQ only)")
        self.doc_groups = pickle.load(open(doc_group_path, 'rb'))
    else:
        logger.info(
            f"Will read metadata directly from hdf5 files (requires SSDs for faster inference)"
        )
def training_initialize(self, index, quantizer):
    """
    The index and quantizer should be owned by caller.
    """
    assert self.ngpu > 0
    s = time.time()
    self.index_ivf = faiss.extract_index_ivf(index)
    self.clustering_index = faiss.index_cpu_to_all_gpus(quantizer)
    self.index_ivf.clustering_index = self.clustering_index
    print(time.time() - s)
def get_centroids(self) -> np.ndarray:
    """Returns the IVF centroids."""
    # TODO: assert IVF
    assert self.is_trained
    index = faiss.read_index(str(self.tempdir / self.MERGED_INDEX_NAME))
    # Get the IVF from potentially opaque index
    index_ivf = faiss.extract_index_ivf(index)
    centroids = index_ivf.quantizer.reconstruct_n(0, index_ivf.nlist)
    return centroids
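# Hedged sketch (not from the original code) tying together the IVF inspection
# helpers defined above: get_cluster_sizes, get_cluster_ids and get_centroids.
# `inspector` is a placeholder name for whatever object exposes these methods.
def example_inspect_ivf(inspector):
    sizes = inspector.get_cluster_sizes()        # vectors per inverted list
    centroids = inspector.get_centroids()        # (nlist, d) coarse centroids
    largest = int(np.argmax(sizes))
    ids = inspector.get_cluster_ids(largest)     # IDs stored in the biggest list
    print(f"largest list {largest}: {len(ids)} vectors, "
          f"centroid norm {np.linalg.norm(centroids[largest]):.3f}")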
def train_quantizer(self, data):
    db_ids = [t[0] for t in data]
    # [print("CHANGE BACK TO 60 * 65536") for _ in range(1000)]
    data = random.sample(data, 60 * 65536)
    vectors = [np.reshape(t[1], (1, -1)) for t in data]
    vectors = np.concatenate(vectors, axis=0)
    if not self.index.is_trained:
        print("training product quantizer")
        index_ivf = faiss.extract_index_ivf(self.index)
        clustering_index = faiss.index_cpu_to_all_gpus(
            faiss.IndexFlatL2(768))
        index_ivf.clustering_index = clustering_index
        self.index.train(vectors)
def transform_and_assign(self, xq):
    index = self.index

    if isinstance(index, faiss.IndexPreTransform):
        assert index.chain.size() == 1
        vt = index.chain.at(0)
        xq = vt.apply_py(xq)

    # perform quantization
    index_ivf = faiss.extract_index_ivf(index)
    quantizer = index_ivf.quantizer
    coarse_dis, list_nos = quantizer.search(xq, index_ivf.nprobe)
    return xq, list_nos, coarse_dis
def ivf_search_preassigned(self, xq, list_nos, coarse_dis, k):
    index_ivf = faiss.extract_index_ivf(self.index)
    n, d = xq.shape
    assert d == index_ivf.d
    n2, d2 = list_nos.shape
    assert list_nos.shape == coarse_dis.shape
    assert n2 == n
    assert d2 == index_ivf.nprobe
    D = np.empty((n, k), dtype='float32')
    I = np.empty((n, k), dtype='int64')
    index_ivf.search_preassigned(
        n, faiss.swig_ptr(xq), k,
        faiss.swig_ptr(list_nos), faiss.swig_ptr(coarse_dis),
        faiss.swig_ptr(D), faiss.swig_ptr(I),
        False)
    return D, I
def ivf_range_search_preassigned(self, xq, list_nos, coarse_dis, radius):
    index_ivf = faiss.extract_index_ivf(self.index)
    n, d = xq.shape
    assert d == index_ivf.d
    n2, d2 = list_nos.shape
    assert list_nos.shape == coarse_dis.shape
    assert n2 == n
    assert d2 == index_ivf.nprobe
    res = faiss.RangeSearchResult(n)
    index_ivf.range_search_preassigned(
        n, faiss.swig_ptr(xq), radius,
        faiss.swig_ptr(list_nos), faiss.swig_ptr(coarse_dis),
        res)
    lims = faiss.rev_swig_ptr(res.lims, n + 1).copy()
    nd = int(lims[-1])
    D = faiss.rev_swig_ptr(res.distances, nd).copy()
    I = faiss.rev_swig_ptr(res.labels, nd).copy()
    return lims, D, I
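# Hedged sketch (not part of the original code) showing how the three helpers
# above chain together: transform the queries once, then reuse the same coarse
# assignment for both k-NN and range search. `searcher` is a placeholder for
# an object exposing transform_and_assign, ivf_search_preassigned and
# ivf_range_search_preassigned; xq is a float32 array of shape (nq, d).
def example_preassigned_search(searcher, xq, k=10, radius=1.5):
    xq_t, list_nos, coarse_dis = searcher.transform_and_assign(xq)
    D, I = searcher.ivf_search_preassigned(xq_t, list_nos, coarse_dis, k)
    lims, Dr, Ir = searcher.ivf_range_search_preassigned(
        xq_t, list_nos, coarse_dis, radius)
    return (D, I), (lims, Dr, Ir)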
def build(self, use_gpu=False):
    self.vectors = np.array(self.vectors)
    faiss.normalize_L2(self.vectors)
    logging.info('Indexing {} vectors'.format(self.vectors.shape[0]))

    if self.vectors.shape[0] > 50000:
        num_centroids = 8 * int(
            math.sqrt(math.pow(2, int(math.log(self.vectors.shape[0], 2)))))
        logging.info('Using {} centroids'.format(num_centroids))
        self.index = faiss.index_factory(
            self.d, "IVF{}_HNSW32,Flat".format(num_centroids))

        ngpu = faiss.get_num_gpus()
        if ngpu > 0 and use_gpu:
            logging.info('Using {} GPUs'.format(ngpu))
            index_ivf = faiss.extract_index_ivf(self.index)
            clustering_index = faiss.index_cpu_to_all_gpus(
                faiss.IndexFlatL2(self.d))
            index_ivf.clustering_index = clustering_index

        logging.info('Training index...')
        self.index.train(self.vectors)
    else:
        self.index = faiss.IndexFlatL2(self.d)
        if faiss.get_num_gpus() > 0 and use_gpu:
            self.index = faiss.index_cpu_to_all_gpus(self.index)

    logging.info('Adding vectors to index...')
    self.index.add(self.vectors)
def simulate_mee_runtime(n_videos=1000000, d=256, n_query=100,
                         max_neighbors=100, n_runs=5, n_warmup_runs=10):
    """ Search over a database of shape [n_videos, d] with query of shape
    [n_query, d]. For each query, return max_neighbors results.
    """
    import faiss
    torch.cuda.synchronize()
    st_time = time.time()
    fake_database = faiss.rand((n_videos, d))
    fake_query = faiss.rand((n_query, d))
    torch.cuda.synchronize()
    logger.info("Construct fake database + query time {}".format(time.time() - st_time))

    torch.cuda.synchronize()
    st_time = time.time()
    index = faiss.index_factory(d, "IVF4096,Flat", faiss.METRIC_L2)
    index_ivf = faiss.extract_index_ivf(index)
    clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
    index_ivf.clustering_index = clustering_index
    torch.cuda.synchronize()
    logger.info("Build/Move to GPU? index time {}".format(time.time() - st_time))

    st_time = time.time()
    torch.cuda.synchronize()
    index_ivf.train(fake_database)
    torch.cuda.synchronize()
    logger.info("Train index time {}".format(time.time() - st_time))

    times = []
    for _ in range(n_warmup_runs + n_runs):
        torch.cuda.synchronize()
        st_time = time.time()
        D, I = index_ivf.search(fake_query, max_neighbors)
        torch.cuda.synchronize()
        times.append(time.time() - st_time)

    avg_time = np.mean(times[n_warmup_runs:]) * 2  # video + sub
    logger.info("Avg searching time ({} runs) {}".format(n_runs, avg_time))
    return avg_time
    return parser.parse_args()


if __name__ == '__main__':
    args = arguments()
    reader = MemoryMappedDatasetReader(args.input_database, start=True)
    n, d = reader.shape
    start = datetime.now()
    print(f"Starting at {start}", flush=True)
    if args.size == 'large':
        outputfile = args.input_database / "trained.faiss.index"
        index = faiss.index_factory(d, "OPQ64_128,IVF262144_HNSW32,PQ64")
        #index = faiss.index_factory(d, "OPQ64_128,IVF16384_HNSW32,PQ64")
        #index = faiss.index_factory(d, "OPQ64_128,IVF8192_HNSW32,PQ64")
        ivf = faiss.extract_index_ivf(index)
        clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(64))
        ivf.clustering_index = clustering_index
        print(f"Sent clustering index to GPU", flush=True)
        print(f"Clustering ({n}, {d}) matrix ({args.size} mode)", flush=True)
        index.train(reader.embedding_matrix)
        print("Finished training", flush=True)
        faiss.write_index(index, str(outputfile))
    else:
        outputfile = args.input_database / "trained.kdtree.index"
        print(f"Clustering ({n}, {d}) matrix ({args.size} mode)", flush=True)
        kdt = KDTree(reader.embedding_matrix, metric='euclidean')
        print("Finished training")
        joblib.dump(kdt, str(outputfile))
def test_parenthesis_2(self):
    index = faiss.index_factory(50, "PCA30,IVF32(PQ15),Flat")
    index_ivf = faiss.extract_index_ivf(index)
    quantizer = faiss.downcast_index(index_ivf.quantizer)
    self.assertEqual(quantizer.pq.M, 15)
    self.assertEqual(quantizer.d, 30)
def __init__(self, s: int, index: faiss.Index):
    rpc.Server.__init__(self, s)
    self.index = index
    self.index_ivf = faiss.extract_index_ivf(index)
def add_to_index(dump_paths, trained_index_path, target_index_path,
                 idx2id_path, num_docs_per_add=1000, cuda=False,
                 fine_quant='SQ4', offset=0, norm_th=999, ignore_ids=None):
    sidx2doc_id = []
    sidx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]

    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)
    start_index.make_direct_map()
    start_index.set_direct_map_type(faiss.DirectMap.Hashtable)

    if cuda:
        if 'PQ' in fine_quant:
            index_ivf = faiss.extract_index_ivf(start_index)
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
        else:
            res = faiss.StandardGpuResources()
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)

    start_total = 0
    start_total_prev = 0
    cnt = 0
    for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
        starts = []
        start_valids = []
        dump_length = len(phrase_dump)
        for i, (doc_idx, doc_group) in enumerate(
                tqdm(phrase_dump.items(), desc='adding %d' % di)):
            if ignore_ids is not None and doc_idx in ignore_ids:
                continue
            num_start = doc_group['start'].shape[0]
            if num_start == 0:
                continue
            cnt += 1

            start = int8_to_float(doc_group['start'][:],
                                  doc_group.attrs['offset'],
                                  doc_group.attrs['scale'])
            start_valid = np.linalg.norm(start, axis=1) <= norm_th

            starts.append(start)
            start_valids.append(start_valid)
            sidx2doc_id.extend([int(doc_idx)] * num_start)
            sidx2word_id.extend(range(num_start))
            start_total += num_start

            if len(starts) > 0 and ((i % num_docs_per_add == 0) or
                                    (i == dump_length - 1)):
                print('adding at %d' % (i + 1))
                add_with_offset(start_index, concat_vectors(starts),
                                concat_vectors(start_valids),
                                start_total_prev, offset)
                start_total_prev = start_total
                starts = []
                start_valids = []
        if len(starts) > 0:
            print('final adding at %d' % (i + 1))
            add_with_offset(start_index, concat_vectors(starts),
                            concat_vectors(start_valids),
                            start_total_prev, offset)
            start_total_prev = start_total
    print('number of docs', cnt)

    for dump in dumps:
        dump.close()

    if cuda:
        print('moving back to cpu')
        if 'PQ' in fine_quant:
            index_ivf.quantizer = quantizer
            del quantizer_gpu
        else:
            start_index = faiss.index_gpu_to_cpu(start_index)

    print('start_index ntotal: %d' % start_index.ntotal)
    print(start_total)

    sidx2doc_id = np.array(sidx2doc_id, dtype=np.int32)
    sidx2word_id = np.array(sidx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=sidx2doc_id)
        g.create_dataset('word', data=sidx2word_id)
        g.attrs['offset'] = offset

    faiss.write_index(start_index, target_index_path)
    print('done')
def set_prefetch_nthread(self, nt):
    for idx in self.indexes:
        il = faiss.downcast_InvertedLists(
            faiss.extract_index_ivf(idx).invlists)
        il.prefetch_nthread
        il.prefetch_nthread = nt
def set_parallel_mode(self, pm):
    index_ivf = faiss.extract_index_ivf(self.index)
    index_ivf.parallel_mode = pm
def add_to_index(dump_paths, trained_index_path, target_index_path,
                 idx2id_path, num_docs_per_add=1000, cuda=False,
                 fine_quant='SQ4', offset=0, norm_th=999, ignore_ids=None,
                 avg_vec=None, std_vec=None, first_passage=False,
                 index_filter=-1e8):
    sidx2doc_id = []
    sidx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]

    # filter dumps
    if index_filter != -1e8:
        f_dumps = [
            h5py.File(dump_path.replace('/phrase/', '/filter/'), 'r')
            for dump_path in dump_paths
        ]

    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)
    if 'none' not in fine_quant:
        index_ivf = faiss.extract_index_ivf(start_index)
        index_ivf.make_direct_map()
        index_ivf.set_direct_map_type(faiss.DirectMap.Hashtable)

    if cuda:
        if 'PQ' in fine_quant:
            index_ivf = faiss.extract_index_ivf(start_index)
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
        else:
            res = faiss.StandardGpuResources()
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)

    start_total = 0
    start_total_prev = 0
    cnt = 0
    for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
        starts = []
        start_valids = []
        dump_length = len(phrase_dump)
        for i, (doc_idx, doc_group) in enumerate(
                tqdm(phrase_dump.items(), desc='adding %d' % di)):
            if ignore_ids is not None and doc_idx in ignore_ids:
                continue
            num_start = doc_group['start'].shape[0]
            if num_start == 0:
                continue
            cnt += 1

            # First passage only
            if first_passage:
                f2o_start = doc_group['f2o_start'][:]
                cut = sum(f2o_start < doc_group['len_per_para'][0])
                start = int8_to_float(doc_group['start'][:cut],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            # Apply index filter
            elif index_filter != -1e8:
                o2f_start = {
                    orig: ft
                    for ft, orig in enumerate(doc_group['f2o_start'][:])
                }
                filter_start = f_dumps[di][doc_idx]['filter_start'][:]
                filter_end = f_dumps[di][doc_idx]['filter_end'][:]
                start_idxs, = np.where(filter_start > index_filter)
                end_idxs, = np.where(filter_end > index_filter)
                save_idx = set(np.concatenate([start_idxs, end_idxs]))
                save_idx = sorted(
                    [o2f_start[si] for si in save_idx if si in o2f_start])
                start = int8_to_float(doc_group['start'][save_idx],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            else:
                start = int8_to_float(doc_group['start'][:],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
            start_valid = np.linalg.norm(start, axis=1) <= norm_th

            starts.append(start)
            start_valids.append(start_valid)
            sidx2doc_id.extend([int(doc_idx)] * num_start)
            if index_filter == -1e8:
                sidx2word_id.extend(range(num_start))
            else:
                sidx2word_id.extend(save_idx)
            start_total += num_start

            if len(starts) > 0 and ((i % num_docs_per_add == 0) or
                                    (i == dump_length - 1)):
                print('adding at %d' % (i + 1))
                add_with_offset(
                    start_index,
                    concat_vectors(starts),
                    concat_vectors(start_valids),
                    start_total_prev,
                    offset,
                    fine_quant,
                )
                start_total_prev = start_total
                starts = []
                start_valids = []
        if len(starts) > 0:
            print('final adding at %d' % (i + 1))
            add_with_offset(
                start_index,
                concat_vectors(starts),
                concat_vectors(start_valids),
                start_total_prev,
                offset,
                fine_quant,
            )
            start_total_prev = start_total
    print('number of docs', cnt)

    for dump in dumps:
        dump.close()

    if cuda:
        print('moving back to cpu')
        if 'PQ' in fine_quant:
            index_ivf.quantizer = quantizer
            del quantizer_gpu
        else:
            start_index = faiss.index_gpu_to_cpu(start_index)

    print('start_index ntotal: %d' % start_index.ntotal)
    print(start_total)

    sidx2doc_id = np.array(sidx2doc_id, dtype=np.int32)
    sidx2word_id = np.array(sidx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=sidx2doc_id)
        g.create_dataset('word', data=sidx2word_id)
        g.attrs['offset'] = offset

    faiss.write_index(start_index, target_index_path)
    print('done')
def set_nprobe(self, nprobe):
    index_ivf = faiss.extract_index_ivf(self.index)
    index_ivf.nprobe = nprobe
def __init__(self, phrase_dump_dir, index_path, idx2id_path, cuda=False,
             logging_level=logging.INFO):
    self.phrase_dump_dir = phrase_dump_dir

    # Read index
    self.index = {}
    logger.info(
        f'Reading {index_path} - could take up to 15 mins depending on the file reading speed of HDD/SSD'
    )
    self.index = faiss.read_index(index_path, faiss.IO_FLAG_ONDISK_SAME_DIR)
    self.reconst_fn = faiss.downcast_index(self.index.index).reconstruct
    self.R = torch.FloatTensor(
        faiss.vector_to_array(
            faiss.downcast_VectorTransform(
                self.index.chain.at(0)).A).reshape(self.index.d,
                                                   self.index.d))
    self.max_idx = 1e8 if 'PQ' not in index_path else 1e9
    logger.info(
        f'index ntotal: {self.index.ntotal} | PQ: {"PQ" in index_path}')

    # Read idx2id
    self.idx_f = {}
    logger.info('Load idx2id on memory')
    self.idx_f = self.load_idx_f(idx2id_path)
    self.offset = None
    self.scale = None
    self.doc_groups = None

    # Options
    logger.setLevel(logging_level)
    self.num_docs_list = []
    self.cuda = cuda
    if self.cuda:
        assert torch.cuda.is_available(), f"Cuda availability {torch.cuda.is_available()}"
        self.device = torch.device('cuda')
        logger.info("Load IVF on GPU")
        index_ivf = faiss.extract_index_ivf(self.index)
        index_ivf.nprobe = 256
        quantizer = index_ivf.quantizer
        quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
        index_ivf.quantizer = quantizer_gpu
        self.R = self.R.to(self.device)
        logger.info(f"N probe: {index_ivf.nprobe}")
    else:
        self.device = torch.device("cpu")
        index_ivf = faiss.extract_index_ivf(self.index)
        index_ivf.nprobe = 256

    # For sentence split
    self.sentencizer = English()
    self.sentencizer.add_pipe(self.sentencizer.create_pipe('sentencizer'))

    # Load metadata on RAM if possible
    doc_group_path = os.path.join(
        self.phrase_dump_dir[:self.phrase_dump_dir.index('/phrase')],
        'meta_compressed.pkl')
    if os.path.exists(doc_group_path) and ('PQ' in index_path):
        logger.info(
            f"Loading metadata on RAM from {doc_group_path} (for PQ only)")
        self.doc_groups = pickle.load(open(doc_group_path, 'rb'))
    else:
        logger.info(
            f"Will read metadata directly from hdf5 files (requires SSDs for faster inference)"
        )