def create_faiss_index(vecs, method, n_gpu):
    """
    Create a FAISS index on GPU(s).

    To create a GPU index with FAISS, one first creates it on CPU and then copies it
    to GPU. Note that a "flat" index means brute-force search, with no approximation
    techniques.
    """
    # Build a flat CPU index given the chosen method.
    if method == 'l2':
        index = faiss.IndexFlatL2(vecs.shape[1])  # Exact search with L2 distance.
    elif method == 'ip':
        index = faiss.IndexFlatIP(vecs.shape[1])  # Exact search with inner product.
    elif method == 'cos':
        # Cosine similarity comes down to L2-normalizing the embeddings beforehand
        # and then applying inner product.
        vecs = preprocessing.normalize(vecs, norm='l2')
        index = faiss.IndexFlatIP(vecs.shape[1])
    else:
        print("Error: Please choose between L2 distance ('l2'), inner product ('ip') "
              "or cosine distance ('cos') as the brute-force method for exact search. Exiting...")
        sys.exit(0)

    # Convert to a flat GPU index.
    if n_gpu > 0:
        co = faiss.GpuMultipleClonerOptions()
        # If using multiple GPUs, enable sharding so that the dataset is divided
        # across the GPUs rather than replicated.
        co.shard = True
        index = faiss.index_cpu_to_all_gpus(index, co=co, ngpu=n_gpu)  # Convert CPU index to GPU index.

    # Add vectors to the index.
    index.add(vecs)

    # Convert back to a CPU index (needed for saving it to disk).
    index = faiss.index_gpu_to_cpu(index)
    return index
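# Hedged usage sketch for create_faiss_index above (not part of the original snippet):
# random data, an 'l2' flat index, and a save plus search round trip. Assumes faiss-gpu,
# numpy and sklearn are installed; shapes and the file name are illustrative.
import numpy as np
import faiss

xb = np.random.rand(10000, 128).astype('float32')   # database vectors
xq = np.random.rand(5, 128).astype('float32')       # query vectors

index = create_faiss_index(xb, method='l2', n_gpu=faiss.get_num_gpus())
faiss.write_index(index, 'flat_l2.index')            # possible because the index was moved back to CPU
D, I = index.search(xq, 10)                          # D: squared L2 distances, I: neighbor ids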
def __init__(self, xt_path="/home/wenqingfu/sift1b/bigann_learn.bvecs",
             xb_path="/home/wenqingfu/sift1b/bigann_base.bvecs", ngpu=3):
    self.xt = self.mmap_bvecs(xt_path)
    self.xb = self.mmap_bvecs(xb_path)
    self.xt = self.sanitize(self.xt[:1000000])
    self.xb = self.sanitize(self.xb[self.db_start * 1000 * 1000:self.db_end * 1000 * 1000])

    self.gpu_resources = []
    for i in range(ngpu):
        res = faiss.StandardGpuResources()
        if tempmem >= 0:
            res.setTempMemory(tempmem)
            print("set tempmem to %d" % tempmem)
        self.gpu_resources.append(res)

    self.vres = faiss.GpuResourcesVector()
    self.vdev = faiss.IntVector()
    for i in range(ngpu):
        self.vdev.push_back(i)
        self.vres.push_back(self.gpu_resources[i])

    self.co = faiss.GpuMultipleClonerOptions()
    self.co.useFloat16 = True
    self.co.useFloat16CoarseQuantizer = False
    self.co.usePrecomputed = False
    self.co.indicesOptions = 0
    self.co.verbose = True
    self.co.shard = True

    self.ps = faiss.GpuParameterSpace()
def _set_mips_index(self):
    """
    Create a FAISS Flat index with inner product as the metric to search against.
    """
    try:
        import faiss
    except ImportError:
        raise Exception("Error: Please install faiss to use FaissMIPSIndex")

    if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
        print("\n> Building index", flush=True)

    cpu_index = faiss.IndexFlatIP(self.embed_size)

    if self.use_gpu:
        # Create resources and config for the GPU index.
        config = faiss.GpuMultipleClonerOptions()
        config.shard = True
        config.useFloat16 = True
        gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=config)
        self.mips_index = faiss.IndexIDMap(gpu_index)
        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print(">> Initialized index on GPU", flush=True)
    else:
        # The CPU index supports IDs, so wrap it with an IDMap.
        self.mips_index = faiss.IndexIDMap(cpu_index)
        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print(">> Initialized index on CPU", flush=True)

    # If we were constructed with a BlockData, then automatically load it
    # when the FAISS structure is built.
    if self.embed_data is not None:
        self.add_embed_data(self.embed_data)
def convert_index_to_gpu(index, faiss_gpu_index, useFloat16=False):
    if type(faiss_gpu_index) == list and len(faiss_gpu_index) == 1:
        faiss_gpu_index = faiss_gpu_index[0]

    if isinstance(faiss_gpu_index, int):
        res = faiss.StandardGpuResources()
        res.setTempMemory(512 * 1024 * 1024)
        co = faiss.GpuClonerOptions()
        co.useFloat16 = useFloat16
        index = faiss.index_cpu_to_gpu(res, faiss_gpu_index, index, co)
    else:
        global gpu_resources
        if len(gpu_resources) == 0:
            import torch
            for i in range(torch.cuda.device_count()):
                res = faiss.StandardGpuResources()
                res.setTempMemory(256 * 1024 * 1024)
                gpu_resources.append(res)

        assert isinstance(faiss_gpu_index, list)
        vres = faiss.GpuResourcesVector()
        vdev = faiss.IntVector()
        co = faiss.GpuMultipleClonerOptions()
        co.shard = True
        co.useFloat16 = useFloat16
        for i in faiss_gpu_index:
            vdev.push_back(i)
            vres.push_back(gpu_resources[i])
        index = faiss.index_cpu_to_gpu_multiple(vres, vdev, index, co)

    return index
def get_gpu_index(cpu_index):
    gpu_resources = []
    ngpu = faiss.get_num_gpus()
    tempmem = -1
    for i in range(ngpu):
        res = faiss.StandardGpuResources()
        if tempmem >= 0:
            res.setTempMemory(tempmem)
        gpu_resources.append(res)

    def make_vres_vdev(i0=0, i1=-1):
        "Return vectors of device ids and resources, useful for gpu_multiple."
        vres = faiss.GpuResourcesVector()
        vdev = faiss.IntVector()
        if i1 == -1:
            i1 = ngpu
        for i in range(i0, i1):
            vdev.push_back(i)
            vres.push_back(gpu_resources[i])
        return vres, vdev

    co = faiss.GpuMultipleClonerOptions()
    co.shard = True
    gpu_vector_resources, gpu_devices_vector = make_vres_vdev(0, ngpu)
    gpu_index = faiss.index_cpu_to_gpu_multiple(
        gpu_vector_resources, gpu_devices_vector, cpu_index, co)
    return gpu_index
def faiss_knn(feats_train, targets_train, feats_val, targets_val, k):
    feats_train = feats_train.numpy()
    targets_train = targets_train.numpy()
    feats_val = feats_val.numpy()
    targets_val = targets_val.numpy()

    d = feats_train.shape[-1]

    index = faiss.IndexFlatL2(d)  # build the index
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = True
    co.shard = True

    gpu_index = faiss.index_cpu_to_all_gpus(index, co)
    gpu_index.add(feats_train)

    D, I = gpu_index.search(feats_val, k)

    pred = np.zeros(I.shape[0])
    for i in range(I.shape[0]):
        votes = list(Counter(targets_train[I[i]]).items())
        shuffle(votes)
        pred[i] = max(votes, key=lambda x: x[1])[0]

    acc = 100.0 * (pred == targets_val).mean()
    return acc
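# Hypothetical call sketch for the k-NN evaluation above (not part of the original
# snippet): inputs are CPU torch tensors, k is the number of neighbours; sizes and
# the class count are illustrative. Assumes at least one GPU is visible to FAISS.
import torch

feats_train = torch.randn(5000, 256)
targets_train = torch.randint(0, 10, (5000,))
feats_val = torch.randn(500, 256)
targets_val = torch.randint(0, 10, (500,))

top1 = faiss_knn(feats_train, targets_train, feats_val, targets_val, k=20)
print('k-NN top-1 accuracy: %.2f%%' % top1)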
def compute_populated_index(preproc):
    """Add elements to a sharded index. Return the index and, if available,
    a sharded gpu_index that contains the same data."""

    indexall = prepare_trained_index(preproc)

    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = use_float16
    co.useFloat16CoarseQuantizer = False
    co.usePrecomputed = use_precomputed_tables
    co.indicesOptions = faiss.INDICES_CPU
    co.verbose = True
    co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
    co.shard = True
    assert co.shard_type in (0, 1, 2)
    vres, vdev = make_vres_vdev()
    gpu_index = faiss.index_cpu_to_gpu_multiple(vres, vdev, indexall, co)

    print("add...")
    t0 = time.time()
    nb = xb.shape[0]
    for i0, xs in dataset_iterator(xb, preproc, add_batch_size):
        i1 = i0 + xs.shape[0]
        gpu_index.add_with_ids(xs, np.arange(i0, i1))
        if max_add > 0 and gpu_index.ntotal > max_add:
            print("Flush indexes to CPU")
            for i in range(ngpu):
                index_src_gpu = faiss.downcast_index(gpu_index.at(i))
                index_src = faiss.index_gpu_to_cpu(index_src_gpu)
                print("  index %d size %d" % (i, index_src.ntotal))
                index_src.copy_subset_to(indexall, 0, 0, nb)
                index_src_gpu.reset()
                index_src_gpu.reserveMemory(max_add)
            gpu_index.sync_with_shard_indexes()
        print('\r%d/%d (%.3f s)  ' % (i0, nb, time.time() - t0), end=' ')
        sys.stdout.flush()
    print("Add time: %.3f s" % (time.time() - t0))

    print("Aggregate indexes to CPU")
    t0 = time.time()

    if hasattr(gpu_index, 'at'):
        # it is a sharded index
        for i in range(ngpu):
            index_src = faiss.index_gpu_to_cpu(gpu_index.at(i))
            print("  index %d size %d" % (i, index_src.ntotal))
            index_src.copy_subset_to(indexall, 0, 0, nb)
    else:
        # simple index
        index_src = faiss.index_gpu_to_cpu(gpu_index)
        index_src.copy_subset_to(indexall, 0, 0, nb)

    print("  done in %.3f s" % (time.time() - t0))

    if max_add > 0:
        # it does not contain all the vectors
        gpu_index = None

    return gpu_index, indexall
def __init__(self, target, nprobe=128, index_factory_str=None,
             verbose=False, mode='proxy', using_gpu=True):
    self._res_list = []

    num_gpu = faiss.get_num_gpus()
    print('[faiss gpu] #GPU: {}'.format(num_gpu))

    size, dim = target.shape
    assert size > 0, "size: {}".format(size)
    index_factory_str = "IVF{},PQ{}".format(
        min(8192, 16 * round(np.sqrt(size))),
        32) if index_factory_str is None else index_factory_str
    cpu_index = faiss.index_factory(dim, index_factory_str)
    cpu_index.nprobe = nprobe

    if mode == 'proxy':
        co = faiss.GpuClonerOptions()
        co.useFloat16 = True
        co.usePrecomputed = False

        index = faiss.IndexProxy()
        for i in range(num_gpu):
            res = faiss.StandardGpuResources()
            self._res_list.append(res)
            sub_index = faiss.index_cpu_to_gpu(res, i, cpu_index, co) if using_gpu else cpu_index
            index.addIndex(sub_index)
    elif mode == 'shard':
        co = faiss.GpuMultipleClonerOptions()
        co.useFloat16 = True
        co.usePrecomputed = False
        co.shard = True
        index = faiss.index_cpu_to_all_gpus(cpu_index, co, ngpu=num_gpu)
    else:
        raise KeyError("Unknown index mode")

    index = faiss.IndexIDMap(index)
    index.verbose = verbose

    # get nlist to decide how many samples to use for training
    nlist = int(float([item for item in index_factory_str.split(",")
                       if 'IVF' in item][0].replace("IVF", "")))

    # training
    if not index.is_trained:
        indexes_sample_for_train = np.random.randint(0, size, nlist * 256)
        index.train(target[indexes_sample_for_train])

    # add with ids
    target_ids = np.arange(0, size)
    index.add_with_ids(target, target_ids)
    self.index = index
def range_ground_truth(xq, db_iterator, threshold, metric_type=faiss.METRIC_L2,
                       shard=False, ngpu=-1):
    """Computes the range-search results for a dataset that possibly does not fit
    in RAM but for which we have an iterator that returns it block by block.
    """
    nq, d = xq.shape
    t0 = time.time()
    xq = np.ascontiguousarray(xq, dtype='float32')

    index = faiss.IndexFlat(d, metric_type)
    if ngpu == -1:
        ngpu = faiss.get_num_gpus()
    if ngpu:
        LOG.info('running on %d GPUs' % ngpu)
        co = faiss.GpuMultipleClonerOptions()
        co.shard = shard
        index_gpu = faiss.index_cpu_to_all_gpus(index, co=co, ngpu=ngpu)

    # compute ground truth by blocks
    i0 = 0
    D = [[] for _i in range(nq)]
    I = [[] for _i in range(nq)]
    all_lims = []
    for xbi in db_iterator:
        ni = xbi.shape[0]
        if ngpu > 0:
            index_gpu.add(xbi)
            lims_i, Di, Ii = range_search_gpu(xq, threshold, index_gpu, xbi)
            index_gpu.reset()
        else:
            index.add(xbi)
            lims_i, Di, Ii = index.range_search(xq, threshold)
            index.reset()
        Ii += i0
        for j in range(nq):
            l0, l1 = lims_i[j], lims_i[j + 1]
            if l1 > l0:
                D[j].append(Di[l0:l1])
                I[j].append(Ii[l0:l1])
        i0 += ni
        LOG.info("%d db elements, %.3f s" % (i0, time.time() - t0))

    empty_I = np.zeros(0, dtype='int64')
    empty_D = np.zeros(0, dtype='float32')
    D = [(np.hstack(i) if i != [] else empty_D) for i in D]
    I = [(np.hstack(i) if i != [] else empty_I) for i in I]
    sizes = [len(i) for i in I]
    assert len(sizes) == nq
    lims = np.zeros(nq + 1, dtype="uint64")
    lims[1:] = np.cumsum(sizes)
    return lims, np.hstack(D), np.hstack(I)
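# Minimal CPU sketch of the range-search call that range_ground_truth wraps
# (illustrative, not part of the snippet above): IndexFlat.range_search returns
# CSR-style limits plus flat distance and id arrays. Data sizes and the radius
# are arbitrary example values.
import numpy as np
import faiss

d = 32
xb = np.random.rand(1000, d).astype('float32')
xq = np.random.rand(10, d).astype('float32')

index = faiss.IndexFlat(d, faiss.METRIC_L2)
index.add(xb)
lims, D, I = index.range_search(xq, 0.5)
# results for query j are D[lims[j]:lims[j+1]] and I[lims[j]:lims[j+1]]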
def do_cpu_to_gpu(self, index_key):
    ts = []
    ts.append(time.time())
    (xt, xb, xq) = self.get_dataset(small_one=True)
    nb, d = xb.shape

    index = faiss.index_factory(d, index_key)
    if index.__class__ == faiss.IndexIVFPQ:
        # speed up test
        index.pq.cp.niter = 2
        index.do_polysemous_training = False
    ts.append(time.time())

    index.train(xt)
    ts.append(time.time())

    # adding some ids because there was a bug in this case
    index.add_with_ids(xb, np.arange(nb).astype(np.int64) * 3 + 12345)
    ts.append(time.time())

    index.nprobe = 4
    D, Iref = index.search(xq, 10)
    ts.append(time.time())

    res = faiss.StandardGpuResources()
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
    ts.append(time.time())

    gpu_index.setNumProbes(4)

    D, Inew = gpu_index.search(xq, 10)
    ts.append(time.time())
    print('times:', [t - ts[0] for t in ts])

    self.assertGreaterEqual((Iref == Inew).sum(), Iref.size)

    if faiss.get_num_gpus() == 1:
        return

    for shard in False, True:
        # test on just 2 GPUs
        res = [faiss.StandardGpuResources() for i in range(2)]
        co = faiss.GpuMultipleClonerOptions()
        co.shard = shard

        gpu_index = faiss.index_cpu_to_gpu_multiple_py(res, index, co)

        faiss.GpuParameterSpace().set_index_parameter(gpu_index, 'nprobe', 4)
        D, Inew = gpu_index.search(xq, 10)

        # 0.99: allow some tolerance in results, otherwise the test
        # fails occasionally (not reproducible)
        self.assertGreaterEqual((Iref == Inew).sum(), Iref.size * 0.99)
def get_populated_index(preproc):

    if not index_cachefile or not os.path.exists(index_cachefile):
        if not altadd:
            gpu_index, indexall = compute_populated_index(preproc)
        else:
            gpu_index, indexall = compute_populated_index_2(preproc)
        if index_cachefile:
            print("store", index_cachefile)
            faiss.write_index(indexall, index_cachefile)
    else:
        print("load", index_cachefile)
        indexall = faiss.read_index(index_cachefile)
        gpu_index = None

    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = use_float16
    co.useFloat16CoarseQuantizer = False
    co.usePrecomputed = use_precomputed_tables
    co.indicesOptions = 0
    co.verbose = True
    co.shard = True  # the replicas will be made "manually"
    t0 = time.time()
    print("CPU index contains %d vectors, move to GPU" % indexall.ntotal)
    if replicas == 1:

        if not gpu_index:
            print("copying loaded index to GPUs")
            vres, vdev = make_vres_vdev()
            index = faiss.index_cpu_to_gpu_multiple(vres, vdev, indexall, co)
        else:
            index = gpu_index

    else:
        del gpu_index  # We override the GPU index

        print("Copy CPU index to %d sharded GPU indexes" % replicas)

        index = faiss.IndexProxy()

        for i in range(replicas):
            gpu0 = ngpu * i // replicas
            gpu1 = ngpu * (i + 1) // replicas
            vres, vdev = make_vres_vdev(gpu0, gpu1)

            print("   dispatch to GPUs %d:%d" % (gpu0, gpu1))

            index1 = faiss.index_cpu_to_gpu_multiple(vres, vdev, indexall, co)
            index1.this.disown()
            index.addIndex(index1)
        index.own_fields = True

    del indexall
    print("move to GPU done in %.3f s" % (time.time() - t0))
    return index
def faiss_knn(feats_train, targets_train, feats_val, targets_val,
              feats_val_poisoned, targets_val_poisoned, k):
    feats_train = feats_train.numpy()
    targets_train = targets_train.numpy()
    feats_val = feats_val.numpy()
    targets_val = targets_val.numpy()

    d = feats_train.shape[-1]

    index = faiss.IndexFlatL2(d)  # build the index
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = True
    co.shard = True

    gpu_index = faiss.index_cpu_to_all_gpus(index, co)
    gpu_index.add(feats_train)

    # Val clean
    D, I = gpu_index.search(feats_val, k)

    # confusion matrix: rows = ground truth, columns = predictions
    conf_matrix_clean = np.zeros(
        (int(targets_val.max()) + 1, int(targets_val.max()) + 1))

    pred = np.zeros(I.shape[0])
    for i in range(I.shape[0]):
        votes = list(Counter(targets_train[I[i]]).items())
        shuffle(votes)
        pred[i] = max(votes, key=lambda x: x[1])[0]
        # update confusion matrix
        conf_matrix_clean[targets_val[i], int(pred[i])] += 1
    acc = 100.0 * (pred == targets_val).mean()

    # Val poisoned
    feats_val_poisoned = feats_val_poisoned.numpy()
    targets_val_poisoned = targets_val_poisoned.numpy()

    D, I = gpu_index.search(feats_val_poisoned, k)

    # confusion matrix: rows = ground truth, columns = predictions
    conf_matrix_poisoned = np.zeros((int(targets_val_poisoned.max()) + 1,
                                     int(targets_val_poisoned.max()) + 1))

    pred_poisoned = np.zeros(I.shape[0])
    for i in range(I.shape[0]):
        votes = list(Counter(targets_train[I[i]]).items())
        shuffle(votes)
        pred_poisoned[i] = max(votes, key=lambda x: x[1])[0]
        # update confusion matrix
        conf_matrix_poisoned[targets_val_poisoned[i], int(pred_poisoned[i])] += 1
    acc_poisoned = 100.0 * (pred_poisoned == targets_val_poisoned).mean()

    return acc, conf_matrix_clean, acc_poisoned, conf_matrix_poisoned
def to_gpu(self):
    if faiss.get_num_gpus() == 1:
        res = faiss.StandardGpuResources()
        self.index = faiss.index_cpu_to_gpu(res, 0, self.index)
    else:
        cloner_options = faiss.GpuMultipleClonerOptions()
        cloner_options.shard = True
        self.index = faiss.index_cpu_to_all_gpus(self.index, co=cloner_options)
    return self.index
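# Standalone sketch of the same single-vs-multi-GPU pattern used in to_gpu above
# (illustrative, not tied to the surrounding class): with exactly one GPU use
# index_cpu_to_gpu, otherwise clone to all GPUs with sharding enabled. Assumes
# faiss-gpu is installed and at least one GPU is visible.
import numpy as np
import faiss

d = 64
cpu_index = faiss.IndexFlatIP(d)

if faiss.get_num_gpus() == 1:
    res = faiss.StandardGpuResources()
    gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)
else:
    co = faiss.GpuMultipleClonerOptions()
    co.shard = True  # split the database across GPUs instead of replicating it
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co)

gpu_index.add(np.random.rand(1000, d).astype('float32'))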
def _create_co(self, use_float16, use_float16_quantizer,
               use_precomputed_codes) -> 'faiss.GpuMultipleClonerOptions':
    """Build the GpuMultipleClonerOptions used when cloning the index to GPU(s)."""
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = self.use_float16
    co.useFloat16CoarseQuantizer = self.use_float16_quantizer
    co.usePrecomputed = use_precomputed_codes
    return co
def moveCPUtoGPU(self):
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = self.use_float16
    co.useFloat16CoarseQuantizer = False
    co.usePrecomputed = self.use_precomputed_tables
    co.indicesOptions = faiss.INDICES_CPU
    co.verbose = True
    co.reserveVecs = self.max_add
    co.shard = True

    vres, vdev = indexfunctions.make_vres_vdev(self.gpu_resources, ngpu=self.ngpu)
    self.gpu_index = faiss.index_cpu_to_gpu_multiple(vres, vdev, self.index, co)
def make_index(sx, preproc=ident):
    N, p = sx.shape
    ngpu = faiss.get_num_gpus()

    if N < 1000:
        indextype = 'Flat'
    elif N < 10**6:
        indextype = 'GPUFlat'
    elif N < 100000:
        indextype = 'GPUIVFFlat'
    else:
        indextype = 'GPUIVFFlatShards'

    if (indextype == 'IVFFlat' or indextype == 'GPUIVFFlat' or
            indextype == 'GPUIVFFlatShards'):
        ncentroids = int(4 * np.floor(np.sqrt(N)))
        nprobe = 256
        print("using IndexIVFFlat with %d/%d centroids" % (nprobe, ncentroids))
        q = faiss.IndexFlatL2(p)
        index = faiss.IndexIVFFlat(q, p, ncentroids, faiss.METRIC_INNER_PRODUCT)
        if nprobe >= ncentroids * 3 / 4:
            nprobe = int(ncentroids * 3 / 4)
            print("  forcing nprobe to %d" % nprobe)
        index.nprobe = nprobe
        index.quantizer_no_dealloc = q
        if indextype.startswith('GPU') and ngpu > 0:
            index = move_index_to_gpu(index, indextype == 'GPUIVFFlatShards')
        ntrain = min(ncentroids * 100, N)
        print("prepare train set, size=%d" % ntrain)
        trainset = sx[:ntrain]
        trainset.max()  # force move to RAM
        print("train")
        index.train(trainset)
    elif indextype == 'GPUFlat' or indextype == 'Flat':
        index = faiss.IndexFlatIP(p)
        if indextype.startswith('GPU') and ngpu > 0:
            co = faiss.GpuMultipleClonerOptions()
            co.useFloat16 = True
            index = faiss.index_cpu_to_all_gpus(index, co)
    else:
        assert False

    bs = 16384
    for i0, i1, block in dataset_iterator(sx, preproc, bs):
        print("   add %d:%d / %d\r" % (i0, i1, N), end=' ')
        sys.stdout.flush()
        index.add(block)

    return index
def copyToGpu(index_cpu):
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = useFloat16
    co.useFloat16CoarseQuantizer = False
    co.usePrecomputed = usePrecomputed
    co.indicesOptions = faiss.INDICES_CPU
    co.verbose = True
    co.reserveVecs = N
    co.shard = True
    assert co.shard_type in (0, 1, 2)

    vres, vdev = make_vres_vdev()
    index_gpu = faiss.index_cpu_to_gpu_multiple(vres, vdev, index_cpu, co)

    return index_gpu
def move_index_to_gpu(index, shard=False):
    ngpu = faiss.get_num_gpus()
    gpu_resources = [faiss.StandardGpuResources() for i in range(ngpu)]

    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = True
    co.shard = shard
    co.shard_type = 1

    print("   moving to %d GPUs" % ngpu)
    t0 = time.time()
    index = faiss.index_cpu_to_gpu_multiple_py(gpu_resources, index, co)
    index.dont_dealloc_me = gpu_resources
    print("   done in %.3f s" % (time.time() - t0))
    return index
def enrich_vcr_with_omcs(args):
    omcs_embs, omcs_index = load_omcs(args)
    LOG.info('Loaded faiss index with OMCS embeddings, ntotal={}'.format(
        omcs_index.ntotal))

    co = faiss.GpuMultipleClonerOptions()
    co.shard = False  # Replica mode (data-parallel) instead of shard mode
    omcs_index = faiss.index_cpu_to_all_gpus(omcs_index, co)

    vcr_h5 = h5py.File(args.vcr_h5, 'r')
    LOG.info('Loaded VCR embeddings from {}, found {} entities'.format(
        args.vcr_h5, len(vcr_h5)))

    outfile = os.path.basename(args.vcr_h5).split('.')[0] + '_omcs.h5'
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    outfile = os.path.join(args.outdir, outfile)
    LOG.info('Writing output OMCS embeddings to {}'.format(outfile))

    output_h5 = h5py.File(outfile, 'w')
    for i in range(len(vcr_h5)):
        output_h5.create_group(f'{i}')

    for i in tqdm(range(len(vcr_h5))):
        grp = vcr_h5[str(i)]
        out_grp = output_h5[str(i)]
        # Each key has embeddings of dim (num_words, d). Batch over all keys.
        items = {k: np.array(v, dtype=np.float32) for k, v in grp.items()}
        vcr_embs = np.vstack(items.values())
        vcr_omcs_embs = get_omcs_embeddings_for_vcr(omcs_embs, omcs_index, vcr_embs, args)
        assert vcr_embs.shape == vcr_omcs_embs.shape
        # Convert back to float16 to match the BERT VCR format
        vcr_omcs_embs = vcr_omcs_embs.numpy().astype(np.float16)
        # Unbatch based on word counts
        word_counts = [v.shape[0] for v in items.values()]
        vcr_omcs_embs = np.split(vcr_omcs_embs, np.cumsum(word_counts)[:-1])
        assert len(vcr_omcs_embs) == len(items)
        # Write in the same format as the vcr_h5 file
        for key, data in zip(items.keys(), vcr_omcs_embs):
            out_grp.create_dataset(key, data=data)
    LOG.info('Success!')
def adding_initialize(self, index):
    """
    The index should be owned by the caller.
    """
    assert self.ngpu > 0

    print_message('Adding initialize...')
    self.co = faiss.GpuMultipleClonerOptions()
    self.co.useFloat16 = True
    self.co.useFloat16CoarseQuantizer = False
    self.co.usePrecomputed = False
    self.co.indicesOptions = faiss.INDICES_CPU
    self.co.verbose = True
    self.co.reserveVecs = self.max_add
    self.co.shard = True
    assert self.co.shard_type in (0, 1, 2)

    self.vres, self.vdev = self._make_vres_vdev()
    self.gpu_index = faiss.index_cpu_to_gpu_multiple(
        self.vres, self.vdev, index, self.co)
def __loadIndex(self):
    assert self.dbs != [], "You should load the db before loading the index; use self.loadDB() ..."
    d = self.dbs[0].shape[-1]
    ngpu = faiss.get_num_gpus()

    index = faiss.IndexFlatL2(d)

    vres = faiss.GpuResourcesVector()
    vdev = faiss.IntVector()
    gpu_resources = []
    for i in range(ngpu):
        res = faiss.StandardGpuResources()
        gpu_resources.append(res)
        vdev.push_back(i)
        vres.push_back(res)

    co = faiss.GpuMultipleClonerOptions()
    co.shard = True
    self.gpu_index = faiss.index_cpu_to_gpu_multiple(vres, vdev, index, co)
    self.gpu_index.referenced_objects = gpu_resources
    self.gpu_index.add(self.dbs)
def gpux4_allpair_similarity(ds, prefix):
    # Use cache
    cache_data = load_cached_result(prefix)
    if cache_data is not None:
        return cache_data

    # Search with GpuMultiple
    co = faiss.GpuMultipleClonerOptions()
    co.shard = True
    vres = []
    for _ in range(4):
        res = faiss.StandardGpuResources()
        vres.append(res)

    cpu_index = faiss.IndexFlatIP(ds.feats_index.shape[1])
    gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
    gpu_index.add(ds.feats_index)

    # 177 sec
    with timer('Prepare all-pair similarity on index dataset'):
        ii_sims, ii_ids = gpu_index.search(x=ds.feats_index, k=100)

    with timer('Save results (index-index)'):
        fn_out = Path(prefix) / "index19_vs_index19_ids.npy"
        fn_out.parent.mkdir(parents=True, exist_ok=True)
        np.save(str(fn_out), ii_ids)
        np.save(str(Path(prefix) / "index19_vs_index19_sims.npy"), ii_sims)

    with timer('Prepare all-pair similarity on test-index dataset'):
        ti_sims, ti_ids = gpu_index.search(x=ds.feats_test, k=100)

    with timer('Save results (test-index)'):
        np.save(str(Path(prefix) / "test19_vs_index19_ids.npy"), ti_ids)
        np.save(str(Path(prefix) / "test19_vs_index19_sims.npy"), ti_sims)

    return edict({
        'ti_sims': ti_sims,
        'ti_ids': ti_ids,
        'ii_sims': ii_sims,
        'ii_ids': ii_ids,
    })
def faiss_kmeans(train_feats, val_feats, nmb_clusters):
    train_feats = train_feats.numpy()
    val_feats = val_feats.numpy()

    d = train_feats.shape[-1]

    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    index = faiss.IndexFlatL2(d)
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = True
    co.shard = True
    index = faiss.index_cpu_to_all_gpus(index, co)

    clus.train(train_feats, index)
    _, train_a = index.search(train_feats, 1)
    _, val_a = index.search(val_feats, 1)

    return list(train_a[:, 0]), list(val_a[:, 0])
def voronoi_gpu():
    test_index = tools.load_vector('../data/adamskij/test_index.bin', 'L')
    nlist = 100
    quantizer = faiss.IndexFlatL2(ncols)
    cpu_index = faiss.IndexIVFFlat(quantizer, ncols, nlist)
    xb = tools.load_2d_vec(fout, ncols, typecode='f')
    xq = np.copy(xb[:test_size])
    cpu_index.train(xb)

    ngpus = faiss.get_num_gpus()
    print("number of GPUs:", ngpus)

    ress = []
    for i in range(ngpus):
        res = faiss.StandardGpuResources()
        if i in (2, 3, 4, 5):
            res.noTempMemory()
        res.initializeForDevice(i)
        ress.append(res)

    co = faiss.GpuMultipleClonerOptions()
    co.shard = True
    gpu_index = faiss.index_cpu_to_gpu_multiple_py(ress, cpu_index, co)
    # gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co)

    gpu_index.add(xb[:20_000_000])
    # for xb in it:
    #     gpu_index.add(xb)

    for i in range(20):
        gpu_index.nprobe = i + 1  # default nprobe is 1, try a few more
        start_time = time.time()
        D, I = gpu_index.search(xq, 2)
        secs = time.time() - start_time
        # acc = (I[:, 1] == test_index).sum()
        print(i + 1, secs)
def test_sharded(self):
    d = 32
    nb = 1000
    nq = 200
    k = 10
    rs = np.random.RandomState(123)
    xb = rs.rand(nb, d).astype('float32')
    xq = rs.rand(nq, d).astype('float32')

    index_cpu = faiss.IndexFlatL2(d)

    assert faiss.get_num_gpus() > 1

    co = faiss.GpuMultipleClonerOptions()
    co.shard = True
    index = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2)

    index.add(xb)
    D, I = index.search(xq, k)

    index_cpu.add(xb)
    D_ref, I_ref = index_cpu.search(xq, k)

    assert np.all(I == I_ref)

    del index
    index2 = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2)
    D2, I2 = index2.search(xq, k)

    assert np.all(I2 == I_ref)

    try:
        index2.add(xb)
    except RuntimeError:
        pass
    else:
        assert False, "this call should fail!"
def load_index(passage_embeddings, index_path, faiss_gpu_index, use_gpu):
    dim = passage_embeddings.shape[1]
    if index_path is None:
        index = faiss.index_factory(dim, "Flat", faiss.METRIC_INNER_PRODUCT)
        index.add(passage_embeddings)
    else:
        index = faiss.read_index(index_path)

    if faiss_gpu_index and use_gpu:
        if len(faiss_gpu_index) == 1:
            res = faiss.StandardGpuResources()
            res.setTempMemory(1024 * 1024 * 1024)
            co = faiss.GpuClonerOptions()
            if index_path:
                co.useFloat16 = True
            else:
                co.useFloat16 = False
            # index_cpu_to_gpu expects a single device id
            index = faiss.index_cpu_to_gpu(res, faiss_gpu_index[0], index, co)
        else:
            assert not index_path  # Only need one GPU for a compressed index
            global gpu_resources
            import torch
            for i in range(torch.cuda.device_count()):
                res = faiss.StandardGpuResources()
                res.setTempMemory(128 * 1024 * 1024)
                gpu_resources.append(res)

            assert isinstance(faiss_gpu_index, list)
            vres = faiss.GpuResourcesVector()
            vdev = faiss.IntVector()
            co = faiss.GpuMultipleClonerOptions()
            co.shard = True
            for i in faiss_gpu_index:
                vdev.push_back(i)
                vres.push_back(gpu_resources[i])
            index = faiss.index_cpu_to_gpu_multiple(vres, vdev, index, co)

    return index
def faiss_knn(feats_train, targets_train, feats_val, targets_val, k):
    feats_train = feats_train.numpy()
    targets_train = targets_train.numpy()
    feats_val = feats_val.numpy()
    targets_val = targets_val.numpy()

    d = feats_train.shape[-1]

    index = faiss.IndexFlatL2(d)  # build the index
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = True
    co.shard = True

    gpu_index = faiss.index_cpu_to_all_gpus(index, co)
    gpu_index.add(feats_train)

    D, I = gpu_index.search(feats_val, k)

    pred = np.zeros(I.shape[0], dtype=np.int64)
    conf_mat = np.zeros((1000, 1000), dtype=np.int64)
    for i in range(I.shape[0]):
        votes = list(Counter(targets_train[I[i]]).items())
        shuffle(votes)
        pred[i] = max(votes, key=lambda x: x[1])[0]
        conf_mat[targets_val[i], pred[i]] += 1

    acc = 100.0 * (pred == targets_val).mean()
    assert acc == (100.0 * (np.trace(conf_mat) / np.sum(conf_mat)))

    # per_cat_acc = 100.0 * (np.diag(conf_mat) / np.sum(conf_mat, axis=1))
    # sparse_cats = [58, 155, 356, 747, 865, 234, 268, 384, 385, 491, 498, 538, 646, 650, 726, 860, 887, 15, 170, 231]
    # s = ' '.join('{}'.format(c) for c in sparse_cats)
    # print('==> cats: {}'.format(s))
    # s = ' '.join('{:.1f}'.format(a) for a in per_cat_acc[sparse_cats])
    # print('==> acc/cat: {}'.format(s))
    # print('==> mean acc: {}'.format(per_cat_acc[sparse_cats].mean()))

    return acc
def gpux4_euclidsearch_from_dataset(ds, fn_npy, lhs='test', rhs='index', topk=100):
    # Search with GpuMultiple
    co = faiss.GpuMultipleClonerOptions()
    co.shard = True
    vres = []
    for _ in range(4):
        res = faiss.StandardGpuResources()
        vres.append(res)

    cpu_index = faiss.IndexFlatL2(ds[f'feats_{rhs}'].shape[1])
    gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
    gpu_index.add(ds[f'feats_{rhs}'])
    _, all_ranks = gpu_index.search(x=ds[f'feats_{lhs}'], k=topk)

    Path(fn_npy).parent.mkdir(parents=True, exist_ok=True)
    np.save(fn_npy, all_ranks)

    if lhs == 'test' and rhs == 'index':
        # Retrieval task
        fn_sub = fn_npy.rstrip('.npy') + '.csv.gz'
        save_sub_from_top100ranks(ds, all_ranks, fn_sub, topk=topk)
def load_omcs(self, use_sentence_embs=True):
    omcs_h5_file = os.path.join(VCR_ANNOTS_DIR, 'omcs', 'bert_da_omcs.h5')
    # Embeddings are stored as float16, but faiss requires float32
    _, sentence_embs, word_embs = load_omcs_embeddings(omcs_h5_file, dtype=np.float32)

    if use_sentence_embs:
        embs = np.vstack(sentence_embs)
        index_file = 'bert_da_omcs_sentences.faissindex'
    else:
        embs = np.vstack(word_embs)
        index_file = 'bert_da_omcs_words.faissindex'

    index_file = os.path.join(VCR_ANNOTS_DIR, 'omcs', index_file)
    index = faiss.read_index(index_file)
    assert len(embs) == index.ntotal
    assert embs.shape[1] == index.d
    LOG.info('Loaded faiss index with OMCS embeddings from {}, ntotal={}'.format(
        index_file, index.ntotal))

    self.co = faiss.GpuMultipleClonerOptions()
    self.co.shard = False  # Replica mode (data-parallel) instead of shard mode
    index = faiss.index_cpu_to_all_gpus(index, self.co)
    return torch.from_numpy(embs), index
codes = faiss.vector_to_array(rfn.codes)
np.save(args.neigh_recons_codes, codes)

######################################################
# Exhaustive evaluation
######################################################

if args.exhaustive:
    print("exhaustive evaluation")
    xq_tr = vec_transform(sanitize(xq))
    index2 = faiss.IndexFlatL2(index_hnsw.d)
    accu_recons_error = 0.0

    if faiss.get_num_gpus() > 0:
        print("do eval on GPU")
        co = faiss.GpuMultipleClonerOptions()
        co.shard = False
        index2 = faiss.index_cpu_to_all_gpus(index2, co)

    # process in batches in case the dataset does not fit in RAM
    rh = datasets.ResultHeap(xq_tr.shape[0], 100)
    t0 = time.time()
    bs = 500000

    for i0 in range(0, nb, bs):
        i1 = min(nb, i0 + bs)
        print('  handling batch %d:%d' % (i0, i1))

        xb_recons = np.empty((i1 - i0, index_hnsw.d), dtype='float32')
        rfn.reconstruct_n(i0, i1 - i0, faiss.swig_ptr(xb_recons))

        # accumulate the squared reconstruction error on this batch
        # (the original snippet was truncated here; the closing of this
        # expression is reconstructed from context)
        accu_recons_error += ((vec_transform(sanitize(xb[i0:i1])) -
                               xb_recons) ** 2).sum()